Repository: rust-lang-nursery/packed_simd Branch: master Commit: d938e39bee9b Files: 363 Total size: 939.2 KB Directory structure: gitextract_ltzo2pap/ ├── .appveyor.yml ├── .github/ │ └── workflows/ │ ├── benchmarks.yml │ ├── ci.yml │ ├── docs.yml │ └── run-ci-script.yml ├── .gitignore ├── .travis.yml ├── Cargo.toml ├── LICENSE-APACHE ├── LICENSE-MIT ├── README.md ├── bors.toml ├── build.rs ├── ci/ │ ├── all.sh │ ├── android-install-ndk.sh │ ├── android-install-sdk.sh │ ├── android-sysimage.sh │ ├── benchmark.sh │ ├── deploy_and_run_on_ios_simulator.rs │ ├── docker/ │ │ ├── aarch64-linux-android/ │ │ │ └── Dockerfile │ │ ├── aarch64-unknown-linux-gnu/ │ │ │ └── Dockerfile │ │ ├── arm-unknown-linux-gnueabi/ │ │ │ └── Dockerfile │ │ ├── arm-unknown-linux-gnueabihf/ │ │ │ └── Dockerfile │ │ ├── armv7-linux-androideabi/ │ │ │ └── Dockerfile │ │ ├── armv7-unknown-linux-gnueabihf/ │ │ │ └── Dockerfile │ │ ├── i586-unknown-linux-gnu/ │ │ │ └── Dockerfile │ │ ├── i686-unknown-linux-gnu/ │ │ │ └── Dockerfile │ │ ├── mips-unknown-linux-gnu/ │ │ │ └── Dockerfile │ │ ├── mips64-unknown-linux-gnuabi64/ │ │ │ └── Dockerfile │ │ ├── mips64el-unknown-linux-gnuabi64/ │ │ │ └── Dockerfile │ │ ├── mipsel-unknown-linux-musl/ │ │ │ └── Dockerfile │ │ ├── powerpc-unknown-linux-gnu/ │ │ │ └── Dockerfile │ │ ├── powerpc64-unknown-linux-gnu/ │ │ │ └── Dockerfile │ │ ├── powerpc64le-unknown-linux-gnu/ │ │ │ └── Dockerfile │ │ ├── s390x-unknown-linux-gnu/ │ │ │ └── Dockerfile │ │ ├── sparc64-unknown-linux-gnu/ │ │ │ └── Dockerfile │ │ ├── thumbv7neon-linux-androideabi/ │ │ │ └── Dockerfile │ │ ├── thumbv7neon-unknown-linux-gnueabihf/ │ │ │ └── Dockerfile │ │ ├── wasm32-unknown-unknown/ │ │ │ └── Dockerfile │ │ ├── x86_64-linux-android/ │ │ │ └── Dockerfile │ │ ├── x86_64-unknown-linux-gnu/ │ │ │ └── Dockerfile │ │ └── x86_64-unknown-linux-gnu-emulated/ │ │ └── Dockerfile │ ├── dox.sh │ ├── linux-s390x.sh │ ├── linux-sparc64.sh │ ├── lld-shim.rs │ ├── max_line_width.sh │ ├── run-docker.sh │ ├── run.sh │ ├── run_examples.sh │ ├── runtest-android.rs │ ├── setup_benchmarks.sh │ └── test-runner-linux ├── contributing.md ├── examples/ │ ├── Cargo.toml │ ├── aobench/ │ │ ├── Cargo.toml │ │ ├── benches/ │ │ │ ├── ambient_occlusion.rs │ │ │ ├── isec_plane.rs │ │ │ ├── isec_sphere.rs │ │ │ ├── random.rs │ │ │ └── scanlines.rs │ │ ├── benchmark.sh │ │ ├── build.rs │ │ ├── readme.md │ │ ├── rustfmt.toml │ │ ├── src/ │ │ │ ├── ambient_occlusion.rs │ │ │ ├── geometry/ │ │ │ │ ├── mod.rs │ │ │ │ ├── plane.rs │ │ │ │ ├── ray.rs │ │ │ │ ├── rayxN.rs │ │ │ │ ├── sphere.rs │ │ │ │ ├── vec.rs │ │ │ │ └── vecxN.rs │ │ │ ├── image.rs │ │ │ ├── intersection/ │ │ │ │ ├── mod.rs │ │ │ │ ├── packet.rs │ │ │ │ ├── ray_plane.rs │ │ │ │ ├── ray_sphere.rs │ │ │ │ └── single.rs │ │ │ ├── ispc_.rs │ │ │ ├── lib.rs │ │ │ ├── main.rs │ │ │ ├── random.rs │ │ │ ├── scalar.rs │ │ │ ├── scalar_parallel.rs │ │ │ ├── scene/ │ │ │ │ ├── mod.rs │ │ │ │ ├── random.rs │ │ │ │ └── test.rs │ │ │ ├── tiled.rs │ │ │ ├── tiled_parallel.rs │ │ │ ├── vector.rs │ │ │ └── vector_parallel.rs │ │ └── volta/ │ │ ├── .gitignore │ │ └── ao.ispc │ ├── dot_product/ │ │ ├── Cargo.toml │ │ ├── readme.md │ │ └── src/ │ │ ├── lib.rs │ │ ├── scalar.rs │ │ └── simd.rs │ ├── fannkuch_redux/ │ │ ├── Cargo.toml │ │ ├── readme.md │ │ └── src/ │ │ ├── fannkuchredux-output.txt │ │ ├── lib.rs │ │ ├── main.rs │ │ ├── scalar.rs │ │ └── simd.rs │ ├── mandelbrot/ │ │ ├── Cargo.toml │ │ ├── benchmark.sh │ │ ├── build.rs │ │ ├── readme.md │ │ ├── src/ │ │ │ ├── ispc_tasks.rs │ │ │ ├── lib.rs 
│ │ │ ├── main.rs │ │ │ ├── mandelbrot-output.txt │ │ │ ├── scalar_par.rs │ │ │ └── simd_par.rs │ │ └── volta/ │ │ └── mandelbrot.ispc │ ├── matrix_inverse/ │ │ ├── Cargo.toml │ │ ├── readme.md │ │ └── src/ │ │ ├── lib.rs │ │ ├── scalar.rs │ │ └── simd.rs │ ├── nbody/ │ │ ├── Cargo.toml │ │ ├── benches/ │ │ │ └── algs.rs │ │ ├── readme.md │ │ └── src/ │ │ ├── lib.rs │ │ ├── main.rs │ │ ├── nbody-output.txt │ │ ├── scalar.rs │ │ └── simd.rs │ ├── options_pricing/ │ │ ├── Cargo.toml │ │ ├── benchmark.sh │ │ ├── build.rs │ │ ├── readme.md │ │ ├── src/ │ │ │ ├── ispc_.rs │ │ │ ├── lib.rs │ │ │ ├── main.rs │ │ │ ├── scalar.rs │ │ │ ├── simd.rs │ │ │ ├── simd_kernels.rs │ │ │ ├── simd_par.rs │ │ │ └── sum.rs │ │ └── volta/ │ │ ├── options.ispc │ │ └── options_defs.h │ ├── rust-toolchain │ ├── slice_sum/ │ │ ├── Cargo.toml │ │ ├── readme.md │ │ └── src/ │ │ └── main.rs │ ├── spectral_norm/ │ │ ├── Cargo.toml │ │ ├── readme.md │ │ └── src/ │ │ ├── lib.rs │ │ ├── main.rs │ │ ├── scalar.rs │ │ ├── simd.rs │ │ └── spectralnorm-output.txt │ ├── stencil/ │ │ ├── Cargo.toml │ │ ├── benchmark.sh │ │ ├── build.rs │ │ ├── readme.md │ │ ├── src/ │ │ │ ├── ispc_loops.rs │ │ │ ├── lib.rs │ │ │ ├── main.rs │ │ │ ├── scalar.rs │ │ │ ├── simd.rs │ │ │ └── simd_par.rs │ │ └── volta/ │ │ ├── .gitignore │ │ ├── Makefile │ │ ├── common.mk │ │ ├── stencil.cpp │ │ ├── stencil.ispc │ │ ├── stencil_serial.cpp │ │ ├── tasksys.cpp │ │ └── timing.h │ └── triangle_xform/ │ ├── Cargo.toml │ ├── readme.md │ └── src/ │ ├── lib.rs │ ├── scalar.rs │ └── simd.rs ├── micro_benchmarks/ │ ├── Cargo.toml │ ├── benches/ │ │ └── mask_reductions.rs │ └── rust-toolchain ├── perf-guide/ │ ├── .gitignore │ ├── book.toml │ └── src/ │ ├── SUMMARY.md │ ├── ascii.css │ ├── bound_checks.md │ ├── float-math/ │ │ ├── approx.md │ │ ├── fma.md │ │ ├── fp.md │ │ └── svml.md │ ├── introduction.md │ ├── prof/ │ │ ├── linux.md │ │ ├── mca.md │ │ └── profiling.md │ ├── target-feature/ │ │ ├── attribute.md │ │ ├── features.md │ │ ├── inlining.md │ │ ├── practice.md │ │ ├── runtime.md │ │ └── rustflags.md │ └── vert-hor-ops.md ├── rust-toolchain ├── rustfmt.toml ├── src/ │ ├── api/ │ │ ├── bit_manip.rs │ │ ├── bitmask.rs │ │ ├── cast/ │ │ │ ├── macros.rs │ │ │ ├── v128.rs │ │ │ ├── v16.rs │ │ │ ├── v256.rs │ │ │ ├── v32.rs │ │ │ ├── v512.rs │ │ │ └── v64.rs │ │ ├── cast.rs │ │ ├── cmp/ │ │ │ ├── eq.rs │ │ │ ├── ord.rs │ │ │ ├── partial_eq.rs │ │ │ ├── partial_ord.rs │ │ │ └── vertical.rs │ │ ├── cmp.rs │ │ ├── default.rs │ │ ├── fmt/ │ │ │ ├── binary.rs │ │ │ ├── debug.rs │ │ │ ├── lower_hex.rs │ │ │ ├── octal.rs │ │ │ └── upper_hex.rs │ │ ├── fmt.rs │ │ ├── from/ │ │ │ ├── from_array.rs │ │ │ └── from_vector.rs │ │ ├── from.rs │ │ ├── hash.rs │ │ ├── into_bits/ │ │ │ ├── arch_specific.rs │ │ │ ├── macros.rs │ │ │ ├── v128.rs │ │ │ ├── v16.rs │ │ │ ├── v256.rs │ │ │ ├── v32.rs │ │ │ ├── v512.rs │ │ │ └── v64.rs │ │ ├── into_bits.rs │ │ ├── math/ │ │ │ ├── float/ │ │ │ │ ├── abs.rs │ │ │ │ ├── consts.rs │ │ │ │ ├── cos.rs │ │ │ │ ├── exp.rs │ │ │ │ ├── ln.rs │ │ │ │ ├── mul_add.rs │ │ │ │ ├── mul_adde.rs │ │ │ │ ├── powf.rs │ │ │ │ ├── recpre.rs │ │ │ │ ├── rsqrte.rs │ │ │ │ ├── sin.rs │ │ │ │ ├── sqrt.rs │ │ │ │ ├── sqrte.rs │ │ │ │ └── tanh.rs │ │ │ └── float.rs │ │ ├── math.rs │ │ ├── minimal/ │ │ │ ├── iuf.rs │ │ │ ├── mask.rs │ │ │ └── ptr.rs │ │ ├── minimal.rs │ │ ├── ops/ │ │ │ ├── scalar_arithmetic.rs │ │ │ ├── scalar_bitwise.rs │ │ │ ├── scalar_mask_bitwise.rs │ │ │ ├── scalar_shifts.rs │ │ │ ├── vector_arithmetic.rs │ │ │ ├── vector_bitwise.rs │ 
│ │ ├── vector_float_min_max.rs │ │ │ ├── vector_int_min_max.rs │ │ │ ├── vector_mask_bitwise.rs │ │ │ ├── vector_neg.rs │ │ │ ├── vector_rotates.rs │ │ │ └── vector_shifts.rs │ │ ├── ops.rs │ │ ├── ptr/ │ │ │ └── gather_scatter.rs │ │ ├── ptr.rs │ │ ├── reductions/ │ │ │ ├── bitwise.rs │ │ │ ├── float_arithmetic.rs │ │ │ ├── integer_arithmetic.rs │ │ │ ├── mask.rs │ │ │ └── min_max.rs │ │ ├── reductions.rs │ │ ├── select.rs │ │ ├── shuffle.rs │ │ ├── shuffle1_dyn.rs │ │ ├── slice/ │ │ │ ├── from_slice.rs │ │ │ └── write_to_slice.rs │ │ ├── slice.rs │ │ └── swap_bytes.rs │ ├── api.rs │ ├── codegen/ │ │ ├── bit_manip.rs │ │ ├── llvm.rs │ │ ├── math/ │ │ │ ├── float/ │ │ │ │ ├── abs.rs │ │ │ │ ├── cos.rs │ │ │ │ ├── cos_pi.rs │ │ │ │ ├── exp.rs │ │ │ │ ├── ln.rs │ │ │ │ ├── macros.rs │ │ │ │ ├── mul_add.rs │ │ │ │ ├── mul_adde.rs │ │ │ │ ├── powf.rs │ │ │ │ ├── sin.rs │ │ │ │ ├── sin_cos_pi.rs │ │ │ │ ├── sin_pi.rs │ │ │ │ ├── sqrt.rs │ │ │ │ ├── sqrte.rs │ │ │ │ └── tanh.rs │ │ │ └── float.rs │ │ ├── math.rs │ │ ├── pointer_sized_int.rs │ │ ├── reductions/ │ │ │ ├── mask/ │ │ │ │ ├── aarch64.rs │ │ │ │ ├── arm.rs │ │ │ │ ├── fallback.rs │ │ │ │ ├── fallback_impl.rs │ │ │ │ ├── x86/ │ │ │ │ │ ├── avx.rs │ │ │ │ │ ├── avx2.rs │ │ │ │ │ ├── sse.rs │ │ │ │ │ └── sse2.rs │ │ │ │ └── x86.rs │ │ │ └── mask.rs │ │ ├── reductions.rs │ │ ├── shuffle.rs │ │ ├── shuffle1_dyn.rs │ │ ├── swap_bytes.rs │ │ ├── v128.rs │ │ ├── v16.rs │ │ ├── v256.rs │ │ ├── v32.rs │ │ ├── v512.rs │ │ ├── v64.rs │ │ ├── vPtr.rs │ │ └── vSize.rs │ ├── codegen.rs │ ├── lib.rs │ ├── masks.rs │ ├── sealed.rs │ ├── testing/ │ │ ├── macros.rs │ │ └── utils.rs │ ├── testing.rs │ ├── v128.rs │ ├── v16.rs │ ├── v256.rs │ ├── v32.rs │ ├── v512.rs │ ├── v64.rs │ ├── vPtr.rs │ └── vSize.rs ├── tests/ │ └── endianness.rs └── verify/ └── verify/ ├── Cargo.toml ├── readme.md ├── rust-toolchain └── src/ ├── api/ │ ├── math/ │ │ └── float/ │ │ ├── mod.rs │ │ └── mul_add.rs │ ├── math.rs │ ├── ops/ │ │ ├── vector_rotates/ │ │ │ └── x86.rs │ │ └── vector_rotates.rs │ ├── ops.rs │ ├── reductions/ │ │ ├── mask/ │ │ │ ├── avx.rs │ │ │ ├── avx2.rs │ │ │ ├── sse.rs │ │ │ └── sse2.rs │ │ └── mask.rs │ └── reductions.rs ├── api.rs └── lib.rs ================================================ FILE CONTENTS ================================================ ================================================ FILE: .appveyor.yml ================================================ matrix: allow_failures: # FIXME: https://github.com/rust-lang-nursery/packed_simd/issues/72 - TARGET: i686-pc-windows-msvc - TARGET: i686-pc-windows-gnu - TARGET: x86_64-pc-windows-gnu fast_finish: true environment: matrix: - TARGET: x86_64-pc-windows-msvc MSYSTEM: MINGW64 NOVERIFY: "1" - TARGET: x86_64-pc-windows-msvc MSYSTEM: MINGW64 RUSTFLAGS: "-C target-feature=+sse4.2" NOVERIFY: "1" - TARGET: x86_64-pc-windows-msvc MSYSTEM: MINGW64 RUSTFLAGS: "-C target-feature=+avx" NOVERIFY: "1" - TARGET: x86_64-pc-windows-msvc MSYSTEM: MINGW64 RUSTFLAGS: "-C target-feature=+avx2" NOVERIFY: "1" - TARGET: i686-pc-windows-msvc MSYSTEM: MINGW32 NOVERIFY: "1" - TARGET: i686-pc-windows-msvc MSYSTEM: MINGW32 RUSTFLAGS: "-C target-feature=+sse4.2" NOVERIFY: "1" - TARGET: i686-pc-windows-msvc MSYSTEM: MINGW32 RUSTFLAGS: "-C target-feature=+avx" NOVERIFY: "1" - TARGET: i686-pc-windows-msvc MSYSTEM: MINGW32 RUSTFLAGS: "-C target-feature=+avx2" NOVERIFY: "1" - TARGET: x86_64-pc-windows-gnu MSYSTEM: MINGW64 - TARGET: i686-pc-windows-gnu MSYSTEM: MINGW32 - TARGET: x86_64-pc-windows-gnu MSYSTEM: MINGW64 
install: - ps: if (ls -r . -fi "*.rs" | sls "`t") { throw "Found tab character" } - ps: Start-FileDownload "https://static.rust-lang.org/dist/rust-nightly-${env:TARGET}.exe" -FileName "rust-install.exe" - ps: .\rust-install.exe /VERYSILENT /NORESTART /DIR="C:\rust" | Out-Null - ps: $env:PATH="$env:PATH;C:\rust\bin" - set PATH=c:\msys64\%MSYSTEM%\bin;c:\msys64\usr\bin;%PATH% - rustc -vV - cargo -vV build: false test_script: bash -c "ci/run.sh" ================================================ FILE: .github/workflows/benchmarks.yml ================================================ name: benchmarks on: push: branches: - master pull_request: workflow_dispatch: jobs: x86_64-unknown-linux-gnu: uses: ./.github/workflows/run-ci-script.yml with: target: x86_64-unknown-linux-gnu setup_script: ci/setup_benchmarks.sh script: ci/benchmark.sh norun: 1 verify: 1 # FIXME: figure out how to add downloaded ispc to PATH # features: ispc x86_64-apple-darwin: uses: ./.github/workflows/run-ci-script.yml with: target: x86_64-apple-darwin runner: macos-latest setup_script: ci/setup_benchmarks.sh script: ci/benchmark.sh norun: 1 verify: 1 # FIXME: figure out how to add downloaded ispc to PATH # features: ispc ================================================ FILE: .github/workflows/ci.yml ================================================ name: ci # trigger for all PRs and changes to master on: push: branches: - master pull_request: jobs: rustfmt: uses: ./.github/workflows/run-ci-script.yml with: script: ci/all.sh check_fmt || true x86_64-unknown-linux-android: uses: ./.github/workflows/run-ci-script.yml strategy: fail-fast: true with: target: x86_64-linux-android armv7-linux-androideabi: uses: ./.github/workflows/run-ci-script.yml strategy: fail-fast: true with: target: armv7-linux-androideabi aarch64-unknown-linux-android-NEON: uses: ./.github/workflows/run-ci-script.yml strategy: fail-fast: true with: target: aarch64-linux-android rustflags: -Ctarget-feature=+neon thumbv7neon-linux-androideabi: uses: ./.github/workflows/run-ci-script.yml strategy: fail-fast: false with: target: thumbv7neon-linux-androideabi i586-unknown-linux-gnu: uses: ./.github/workflows/run-ci-script.yml strategy: fail-fast: false with: target: i586-unknown-linux-gnu rustflags: -Crelocation-model=static i586-unknown-linux-gnu-SSE: uses: ./.github/workflows/run-ci-script.yml strategy: fail-fast: false with: target: i586-unknown-linux-gnu rustflags: -Crelocation-model=static -Ctarget-feature=+sse i586-unknown-linux-gnu-SSE2: uses: ./.github/workflows/run-ci-script.yml strategy: fail-fast: false with: target: i586-unknown-linux-gnu rustflags: -Crelocation-model=static -Ctarget-feature=+sse2 i686-unknown-linux-gnu: uses: ./.github/workflows/run-ci-script.yml strategy: fail-fast: false with: target: i686-unknown-linux-gnu rustflags: -Crelocation-model=static i686-unknown-linux-gnu-SSE4_2: uses: ./.github/workflows/run-ci-script.yml strategy: fail-fast: false with: target: i686-unknown-linux-gnu rustflags: -Crelocation-model=static -Ctarget-feature=+sse4.2 i686-unknown-linux-gnu-AVX2: uses: ./.github/workflows/run-ci-script.yml strategy: fail-fast: false with: target: i686-unknown-linux-gnu rustflags: -Crelocation-model=static -Ctarget-feature=+avx2 x86_64-unknown-linux-gnu: uses: ./.github/workflows/run-ci-script.yml strategy: fail-fast: true with: target: x86_64-unknown-linux-gnu x86_64-unknown-linux-gnu-SSE4_2: uses: ./.github/workflows/run-ci-script.yml strategy: fail-fast: true with: target: x86_64-unknown-linux-gnu rustflags: 
-Ctarget-feature=+sse4.2 x86_64-unknown-linux-gnu-AVX2: uses: ./.github/workflows/run-ci-script.yml strategy: fail-fast: true with: target: x86_64-unknown-linux-gnu rustflags: -Ctarget-feature=+avx2 arm-unknown-linux-gnueabihf: uses: ./.github/workflows/run-ci-script.yml strategy: fail-fast: true with: target: arm-unknown-linux-gnueabihf armv7-unknown-linux-gnueabihf: uses: ./.github/workflows/run-ci-script.yml strategy: fail-fast: true with: target: armv7-unknown-linux-gnueabihf armv7-unknown-linux-gnueabihf-NEON: uses: ./.github/workflows/run-ci-script.yml strategy: fail-fast: true with: target: armv7-unknown-linux-gnueabihf rustflags: -Ctarget-feature=+neon thumbv7neon-unknown-linux-gnueabihf: uses: ./.github/workflows/run-ci-script.yml strategy: fail-fast: false with: target: thumbv7neon-unknown-linux-gnueabihf aarch64-unknown-linux-gnu-NEON: uses: ./.github/workflows/run-ci-script.yml strategy: fail-fast: true with: target: aarch64-unknown-linux-gnu rustflags: -Ctarget-feature=+neon powerpc-unknown-linux-gnu: uses: ./.github/workflows/run-ci-script.yml strategy: fail-fast: false with: target: powerpc-unknown-linux-gnu powerpc64-unknown-linux-gnu: uses: ./.github/workflows/run-ci-script.yml strategy: fail-fast: false with: target: powerpc64-unknown-linux-gnu powerpc64le-unknown-linux-gnu: uses: ./.github/workflows/run-ci-script.yml strategy: fail-fast: true with: target: powerpc64le-unknown-linux-gnu powerpc64le-unknown-linux-gnu-ALTIVEC: uses: ./.github/workflows/run-ci-script.yml strategy: fail-fast: true with: target: powerpc64le-unknown-linux-gnu rustflags: -Ctarget-feature=+altivec powerpc64le-unknown-linux-gnu-VSX: uses: ./.github/workflows/run-ci-script.yml strategy: fail-fast: true with: target: powerpc64le-unknown-linux-gnu rustflags: -Ctarget-feature=+vsx s390x-unknown-linux-gnu: uses: ./.github/workflows/run-ci-script.yml strategy: fail-fast: false with: target: s390x-unknown-linux-gnu sparc64-unknown-linux-gnu: uses: ./.github/workflows/run-ci-script.yml strategy: fail-fast: false with: target: sparc64-unknown-linux-gnu wasm32-unknown-unknown: uses: ./.github/workflows/run-ci-script.yml strategy: fail-fast: false with: target: wasm32-unknown-unknown x86_64-apple-darwin-SSE4_2: uses: ./.github/workflows/run-ci-script.yml strategy: fail-fast: true with: runner: macos-latest script: ci/run.sh target: x86_64-apple-darwin rustflags: -Ctarget-feature=+sse4.2 x86_64-apple-darwin-AVX: uses: ./.github/workflows/run-ci-script.yml strategy: fail-fast: true with: runner: macos-latest script: ci/run.sh target: x86_64-apple-darwin rustflags: -Ctarget-feature=+avx x86_64-apple-ios: uses: ./.github/workflows/run-ci-script.yml strategy: fail-fast: true with: runner: macos-latest script: ci/run.sh target: x86_64-apple-ios aarch64-apple-ios: uses: ./.github/workflows/run-ci-script.yml strategy: fail-fast: true with: runner: macos-latest script: ci/run.sh target: aarch64-apple-ios rustflags: -Ctarget-feature=+neon ================================================ FILE: .github/workflows/docs.yml ================================================ name: docs on: push: branches: - master jobs: docs: uses: ./.github/workflows/run-ci-script.yml with: setup_script: cargo install mdbook script: ci/dox.sh ================================================ FILE: .github/workflows/run-ci-script.yml ================================================ name: run-ci-script on: workflow_call: inputs: runner: required: false type: string default: ubuntu-latest target: required: false type: string default: '' 
rustflags: required: false type: string default: '' script: required: false type: string default: ci/run-docker.sh setup_script: required: false type: string norun: required: false type: string default: '' verify: required: false type: string default: '' features: required: false type: string default: '' jobs: run-ci-script: runs-on: ${{ inputs.runner }} steps: - name: Checkout uses: actions/checkout@v2 - name: Init Rustup Cache uses: actions/cache@v2 with: path: | ~/.rustup/toolchains key: ${{ runner.os }}-cargo-${{ hashFiles('**/rust-toolchain') }} - name: Install Toolchain uses: dtolnay/rust-toolchain@nightly with: # FIXME: change to nightly once https://github.com/rust-lang/packed_simd/pull/350 is merged # needs to be kept in sync with the toolchain files targets: ${{ inputs.target }} components: rustfmt - name: Generate Lockfile run: cargo generate-lockfile - name: Init Cargo Cache uses: actions/cache@v2 with: path: | ~/.cargo/bin/ ~/.cargo/registry/index/ ~/.cargo/registry/cache/ ~/.cargo/git/db/ target/ key: ${{ runner.os }}-cargo-${{ hashFiles('**/Cargo.lock') }} - name: Setup if: ${{ inputs.setup_script != '' }} run: ${{ inputs.setup_script }} env: TARGET: ${{ inputs.target }} RUSTFLAGS: ${{ inputs.rustflags }} NORUN: ${{ inputs.norun }} VERIFY: ${{ inputs.verify }} FEATURES: ${{ inputs.features }} - name: Run CI Script timeout-minutes: 30 run: ${{ inputs.script }} env: TARGET: ${{ inputs.target }} RUSTFLAGS: ${{ inputs.rustflags }} NORUN: ${{ inputs.norun }} VERIFY: ${{ inputs.verify }} FEATURES: ${{ inputs.features }} ================================================ FILE: .gitignore ================================================ Cargo.lock target/ # llvm-ir and assembly *.ll *.d # png files output by benchmarks *.png # -*- mode: gitignore; -*- *~ \#*\# /.emacs.desktop /.emacs.desktop.lock *.elc auto-save-list tramp .\#* # Org-mode .org-id-locations *_archive # flymake-mode *_flymake.* # eshell files /eshell/history /eshell/lastdir # elpa packages /elpa/ # reftex files *.rel # AUCTeX auto folder /auto/ # cask packages .cask/ dist/ # Flycheck flycheck_*.el # server auth directory /server/ # projectiles files .projectile # directory configuration .dir-locals.el ================================================ FILE: .travis.yml ================================================ language: rust rust: nightly os: linux dist: focal stages: - tools - build-test-verify # Passes full test suite, permit no regressions (unless it's rustup :/) - 32bit-tier1 - 64bit-tier2 - 32bit-tier2 jobs: fast_finish: true include: # Android: - env: TARGET=x86_64-linux-android name: "x86_64-unknown-linux-android + SSE2" stage: build-test-verify - env: TARGET=arm-linux-androideabi name: "arm-linux-androideabi" stage: build-test-verify - name: "aarch64-unknown-linux-android + NEON" env: TARGET=aarch64-linux-android RUSTFLAGS="-C target-feature=+neon" stage: build-test-verify - env: TARGET="thumbv7neon-linux-androideabi" name: "thumbv7neon-linux-androideabi" stage: 32bit-tier2 # Linux: - env: TARGET=i586-unknown-linux-gnu name: "i586-unknown-linux-gnu" stage: 32bit-tier2 - env: TARGET=i586-unknown-linux-gnu RUSTFLAGS="-C target-feature=+sse" name: "i586-unknown-linux-gnu + SSE" stage: 32bit-tier2 - env: TARGET=i586-unknown-linux-gnu RUSTFLAGS="-C target-feature=+sse2" name: "i586-unknown-linux-gnu + SSE2" stage: 32bit-tier2 - env: TARGET=i686-unknown-linux-gnu name: "i686-unknown-linux-gnu + SSE2" stage: 32bit-tier1 - env: TARGET=i686-unknown-linux-gnu RUSTFLAGS="-C target-feature=+sse4.2" name: 
"i686-unknown-linux-gnu + SSE4.2" stage: 32bit-tier1 - env: TARGET=i686-unknown-linux-gnu RUSTFLAGS="-C target-feature=+avx2" name: "i686-unknown-linux-gnu + AVX2" stage: 32bit-tier1 - env: TARGET=x86_64-unknown-linux-gnu RUSTFLAGS="-C target-feature=+sse4.2" name: "x86_64-unknown-linux-gnu + SSE4.2" stage: build-test-verify - env: TARGET=x86_64-unknown-linux-gnu RUSTFLAGS="-C target-feature=+avx2" name: "x86_64-unknown-linux-gnu + AVX2" stage: build-test-verify - env: TARGET=arm-unknown-linux-gnueabihf name: "arm-unknown-linux-gnueabihf" stage: build-test-verify - env: TARGET=armv7-unknown-linux-gnueabihf name: "armv7-unknown-linux-gnueabihf" stage: build-test-verify - env: TARGET=armv7-unknown-linux-gnueabihf RUSTFLAGS="-C target-feature=+neon" name: "armv7-unknown-linux-gnueabihf + NEON" stage: build-test-verify - env: TARGET="thumbv7neon-unknown-linux-gnueabihf" name: "thumbv7neon-unknown-linux-gnueabihf" stage: 32bit-tier2 - name: "aarch64-unknown-linux-gnu + NEON" env: TARGET=aarch64-unknown-linux-gnu RUSTFLAGS="-C target-feature=+neon" stage: build-test-verify - env: TARGET=mips-unknown-linux-gnu name: "mips-unknown-linux-gnu" stage: 32bit-tier2 - env: TARGET=mipsel-unknown-linux-musl name: "mipsel-unknown-linux-musl" stage: 32bit-tier2 - env: TARGET=mips64-unknown-linux-gnuabi64 name: "mips64-unknown-linux-gnuabi64" stage: 64bit-tier2 - env: TARGET=mips64el-unknown-linux-gnuabi64 name: "mips64el-unknown-linux-gnuabi64" stage: 64bit-tier2 # FIXME: https://github.com/rust-lang-nursery/packed_simd/issues/18 # env: TARGET=mips64el-unknown-linux-gnuabi64 RUSTFLAGS="-C target-feature=+msa -C target-cpu=mips64r6" - env: TARGET=powerpc-unknown-linux-gnu name: "powerpc-unknown-linux-gnu" stage: 32bit-tier2 - env: TARGET=powerpc64-unknown-linux-gnu name: "powerpc64-unknown-linux-gnu" stage: 64bit-tier2 - name: "powerpc64le-unknown-linux-gnu" env: TARGET=powerpc64le-unknown-linux-gnu stage: build-test-verify - name: "powerpc64le-unknown-linux-gnu + ALTIVEC" env: TARGET=powerpc64le-unknown-linux-gnu RUSTFLAGS="-C target-feature=+altivec" stage: build-test-verify - name: "powerpc64le-unknown-linux-gnu + VSX" env: TARGET=powerpc64le-unknown-linux-gnu RUSTFLAGS="-C target-feature=+vsx" stage: build-test-verify - name: "s390x-unknown-linux-gnu" env: TARGET=s390x-unknown-linux-gnu stage: 64bit-tier2 - env: TARGET=sparc64-unknown-linux-gnu name: "sparc64-unknown-linux-gnu" stage: 64bit-tier2 # WebAssembly: - env: TARGET=wasm32-unknown-unknown name: "wasm32-unknown-unknown" stage: 32bit-tier2 # MacOSX: - os: osx env: TARGET=x86_64-apple-darwin RUSTFLAGS="-C target-feature=+sse4.2" name: "x86_64-apple-darwin + SSE4.2" install: true script: ci/run.sh osx_image: xcode10 stage: build-test-verify # Travis-CI OSX build bots do not support AVX2: - os: osx env: TARGET=x86_64-apple-darwin RUSTFLAGS="-C target-feature=+avx" name: "x86_64-apple-darwin + AVX" install: true script: ci/run.sh osx_image: xcode10 stage: build-test-verify # *BSDs: #- env: TARGET=i686-unknown-freebsd NORUN=1 # script: ci/run.sh #- env: TARGET=x86_64-unknown-freebsd NORUN=1 # script: ci/run.sh #- env: TARGET=x86_64-unknown-netbsd NORUN=1 # script: ci/run.sh # Solaris: #- env: TARGET=x86_64-sun-solaris NORUN=1 # script: ci/run.sh # iOS: - os: osx env: TARGET=x86_64-apple-ios name: "x86_64-apple-ios + SSE2" script: ci/run.sh osx_image: xcode9.4 stage: 64bit-tier2 - name: "aarch64-apple-ios + NEON" env: TARGET=aarch64-apple-ios RUSTFLAGS="-C target-feature=+neon" os: osx osx_image: xcode9.4 script: ci/run.sh stage: 64bit-tier2 # 
BENCHMARKS: - name: "Benchmarks - x86_64-unknown-linux-gnu" install: TARGET=x86_64-unknown-linux-gnu ./ci/setup_benchmarks.sh # FIXME: Use `core_arch,sleef-sys` features once they works again script: PATH=$(pwd):$PATH NORUN=1 VERIFY=1 FEATURES=ispc ci/benchmark.sh stage: tools - name: "Benchmarks - x86_64-apple-darwin" install: TARGET=x86_64-apple-darwin ./ci/setup_benchmarks.sh # FIXME: Use `core_arch,sleef-sys` features once they works again script: PATH=$(pwd):$PATH NORUN=1 VERIFY=1 FEATURES=ispc ci/benchmark.sh os: osx osx_image: xcode9.4 stage: tools # TOOLS: - name: "Documentation" before_install: - sudo add-apt-repository -y ppa:deadsnakes/ppa - sudo apt-get update -y - sudo apt-get install -y python3.9 install: - cargo install mdbook script: ci/dox.sh stage: tools - name: "rustfmt" install: true script: | rustup toolchain install nightly -c rustfmt --allow-downgrade ci/all.sh check_fmt || true stage: tools allow_failures: # FIXME: ISPC cannot be found? - name: "Benchmarks - x86_64-apple-darwin" # FIXME: i686 fails in inlining, apparently - stage: 32bit-tier1 #- env: TARGET=i686-unknown-freebsd NORUN=1 #- env: TARGET=x86_64-unknown-freebsd NORUN=1 #- env: TARGET=x86_64-unknown-netbsd NORUN=1 #- env: TARGET=x86_64-sun-solaris NORUN=1 # FIXME: TBD - stage: 64bit-tier2 - stage: 32bit-tier2 # FIXME: iOS # https://github.com/rust-lang-nursery/packed_simd/issues/26 - env: TARGET=x86_64-apple-ios # Is this related to the above? Mysterious test failure - name: "aarch64-apple-ios + NEON" install: travis_retry rustup target add $TARGET before_script: cargo generate-lockfile script: travis_wait 50 ci/run-docker.sh after_script: sleep 5 env: global: secure: "lPHv7s6+AxQYNaFncycVFQt++Y1asQmMhOikQU1ztlP8CK7+hn2m98cg/euOJyzIOb2iJ3ZX4cGZkzw4lc59MQBByb1GtDbazQoUOzVDbVfe9BDD2f8JVoIFh1CMfjPKQ7Gg/rJqWlwrUlSd5GNxPCutKjY7qZhJuR6SQbJjlWaGN2Vd4fVCzKXz8fHRXgMEZS+d+CR4Nsrkb83J3Z4s5kSdJmhYxJ61AWjuzJVwUh4l3/HEYlSL5XXpuh5R2i7W16h1PlNdaTUgkZli1lHzO8+6Q8LzX9+XiLIEVX9lw3A2NdIKGz8E/+7Qs5oYOkwYhjROsDQxIK7xkSM30bQuN7cwMBybAVIyOPJkqXQ1dQyp83KSdsOj7JMyDDRvcEDLI6ehRlm5EcdH7YrReuboN81iUo0Sa7VsuUmgj5hjERCt9r30f9aWuitABai7vKRtjglg7Sp5CrEVPA4PQs6PqKCCRogoggbXJ/Z5Dyw/RZaXPeNR9+qIKN1Vjm9Gew1sRN2JK/3+vXTKtyJXH/uBxgJt4jQlbuShOJuF+BSfTF88sMe67a/357SSOIb4JkaCyd0flDCWYE8576kaHPlVVMT2peXee0LeRXm1e13nG3Na0t3LS/orJLPHOShNQGoDj7qAP5aEKggRya896JGwtvlaBHHTmSQh65G7cyNErZo=" branches: only: - staging # bors r+ - trying # bors try - master notifications: email: on_success: never ================================================ FILE: Cargo.toml ================================================ [package] name = "packed_simd" version = "0.3.9" description = "Portable Packed SIMD vectors" documentation = "https://docs.rs/crate/packed_simd/" homepage = "https://github.com/rust-lang/packed_simd" repository = "https://github.com/rust-lang/packed_simd" keywords = ["simd", "vector", "portability"] categories = ["hardware-support", "concurrency", "no-std", "data-structures"] license = "MIT OR Apache-2.0" build = "build.rs" edition = "2018" [package.metadata.docs.rs] features = ["into_bits"] rustdoc-args = ["--cfg", "doc_cfg"] # To build locally: # RUSTDOCFLAGS="--cfg doc_cfg" cargo +nightly doc --features into_bits --no-deps --open [badges] is-it-maintained-issue-resolution = { repository = "rust-lang/packed_simd" } is-it-maintained-open-issues = { repository = "rust-lang/packed_simd" } maintenance = { status = "experimental" } [dependencies] cfg-if = "1.0.0" core_arch = { version = "0.1.5", optional = true } num-traits = { version = "0.2.14", 
default-features = false, features = ["libm"] } [features] default = [] into_bits = [] libcore_neon = [] [dev-dependencies] paste = "^1" arrayvec = { version = "^0.5", default-features = false } [target.'cfg(target_arch = "x86_64")'.dependencies.sleef-sys] version = "0.1.2" optional = true [target.wasm32-unknown-unknown.dev-dependencies] # Keep in sync with the version on Dockerfile. wasm-bindgen = "=0.2.87" wasm-bindgen-test = "=0.3.37" ================================================ FILE: LICENSE-APACHE ================================================ Apache License Version 2.0, January 2004 http://www.apache.org/licenses/ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 1. Definitions. "License" shall mean the terms and conditions for use, reproduction, and distribution as defined by Sections 1 through 9 of this document. "Licensor" shall mean the copyright owner or entity authorized by the copyright owner that is granting the License. "Legal Entity" shall mean the union of the acting entity and all other entities that control, are controlled by, or are under common control with that entity. For the purposes of this definition, "control" means (i) the power, direct or indirect, to cause the direction or management of such entity, whether by contract or otherwise, or (ii) ownership of fifty percent (50%) or more of the outstanding shares, or (iii) beneficial ownership of such entity. "You" (or "Your") shall mean an individual or Legal Entity exercising permissions granted by this License. "Source" form shall mean the preferred form for making modifications, including but not limited to software source code, documentation source, and configuration files. "Object" form shall mean any form resulting from mechanical transformation or translation of a Source form, including but not limited to compiled object code, generated documentation, and conversions to other media types. "Work" shall mean the work of authorship, whether in Source or Object form, made available under the License, as indicated by a copyright notice that is included in or attached to the work (an example is provided in the Appendix below). "Derivative Works" shall mean any work, whether in Source or Object form, that is based on (or derived from) the Work and for which the editorial revisions, annotations, elaborations, or other modifications represent, as a whole, an original work of authorship. For the purposes of this License, Derivative Works shall not include works that remain separable from, or merely link (or bind by name) to the interfaces of, the Work and Derivative Works thereof. "Contribution" shall mean any work of authorship, including the original version of the Work and any modifications or additions to that Work or Derivative Works thereof, that is intentionally submitted to Licensor for inclusion in the Work by the copyright owner or by an individual or Legal Entity authorized to submit on behalf of the copyright owner. For the purposes of this definition, "submitted" means any form of electronic, verbal, or written communication sent to the Licensor or its representatives, including but not limited to communication on electronic mailing lists, source code control systems, and issue tracking systems that are managed by, or on behalf of, the Licensor for the purpose of discussing and improving the Work, but excluding communication that is conspicuously marked or otherwise designated in writing by the copyright owner as "Not a Contribution." 
"Contributor" shall mean Licensor and any individual or Legal Entity on behalf of whom a Contribution has been received by Licensor and subsequently incorporated within the Work. 2. Grant of Copyright License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable copyright license to reproduce, prepare Derivative Works of, publicly display, publicly perform, sublicense, and distribute the Work and such Derivative Works in Source or Object form. 3. Grant of Patent License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable (except as stated in this section) patent license to make, have made, use, offer to sell, sell, import, and otherwise transfer the Work, where such license applies only to those patent claims licensable by such Contributor that are necessarily infringed by their Contribution(s) alone or by combination of their Contribution(s) with the Work to which such Contribution(s) was submitted. If You institute patent litigation against any entity (including a cross-claim or counterclaim in a lawsuit) alleging that the Work or a Contribution incorporated within the Work constitutes direct or contributory patent infringement, then any patent licenses granted to You under this License for that Work shall terminate as of the date such litigation is filed. 4. Redistribution. You may reproduce and distribute copies of the Work or Derivative Works thereof in any medium, with or without modifications, and in Source or Object form, provided that You meet the following conditions: (a) You must give any other recipients of the Work or Derivative Works a copy of this License; and (b) You must cause any modified files to carry prominent notices stating that You changed the files; and (c) You must retain, in the Source form of any Derivative Works that You distribute, all copyright, patent, trademark, and attribution notices from the Source form of the Work, excluding those notices that do not pertain to any part of the Derivative Works; and (d) If the Work includes a "NOTICE" text file as part of its distribution, then any Derivative Works that You distribute must include a readable copy of the attribution notices contained within such NOTICE file, excluding those notices that do not pertain to any part of the Derivative Works, in at least one of the following places: within a NOTICE text file distributed as part of the Derivative Works; within the Source form or documentation, if provided along with the Derivative Works; or, within a display generated by the Derivative Works, if and wherever such third-party notices normally appear. The contents of the NOTICE file are for informational purposes only and do not modify the License. You may add Your own attribution notices within Derivative Works that You distribute, alongside or as an addendum to the NOTICE text from the Work, provided that such additional attribution notices cannot be construed as modifying the License. You may add Your own copyright statement to Your modifications and may provide additional or different license terms and conditions for use, reproduction, or distribution of Your modifications, or for any such Derivative Works as a whole, provided Your use, reproduction, and distribution of the Work otherwise complies with the conditions stated in this License. 5. Submission of Contributions. 
Unless You explicitly state otherwise, any Contribution intentionally submitted for inclusion in the Work by You to the Licensor shall be under the terms and conditions of this License, without any additional terms or conditions. Notwithstanding the above, nothing herein shall supersede or modify the terms of any separate license agreement you may have executed with Licensor regarding such Contributions. 6. Trademarks. This License does not grant permission to use the trade names, trademarks, service marks, or product names of the Licensor, except as required for reasonable and customary use in describing the origin of the Work and reproducing the content of the NOTICE file. 7. Disclaimer of Warranty. Unless required by applicable law or agreed to in writing, Licensor provides the Work (and each Contributor provides its Contributions) on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied, including, without limitation, any warranties or conditions of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A PARTICULAR PURPOSE. You are solely responsible for determining the appropriateness of using or redistributing the Work and assume any risks associated with Your exercise of permissions under this License. 8. Limitation of Liability. In no event and under no legal theory, whether in tort (including negligence), contract, or otherwise, unless required by applicable law (such as deliberate and grossly negligent acts) or agreed to in writing, shall any Contributor be liable to You for damages, including any direct, indirect, special, incidental, or consequential damages of any character arising as a result of this License or out of the use or inability to use the Work (including but not limited to damages for loss of goodwill, work stoppage, computer failure or malfunction, or any and all other commercial damages or losses), even if such Contributor has been advised of the possibility of such damages. 9. Accepting Warranty or Additional Liability. While redistributing the Work or Derivative Works thereof, You may choose to offer, and charge a fee for, acceptance of support, warranty, indemnity, or other liability obligations and/or rights consistent with this License. However, in accepting such obligations, You may act only on Your own behalf and on Your sole responsibility, not on behalf of any other Contributor, and only if You agree to indemnify, defend, and hold each Contributor harmless for any liability incurred by, or claims asserted against, such Contributor by reason of your accepting any such warranty or additional liability. END OF TERMS AND CONDITIONS APPENDIX: How to apply the Apache License to your work. To apply the Apache License to your work, attach the following boilerplate notice, with the fields enclosed by brackets "[]" replaced with your own identifying information. (Don't include the brackets!) The text should be enclosed in the appropriate comment syntax for the file format. We also recommend that a file or class name and description of purpose be included on the same "printed page" as the copyright notice for easier identification within third-party archives. Copyright [yyyy] [name of copyright owner] Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. ================================================ FILE: LICENSE-MIT ================================================ Copyright (c) 2014 The Rust Project Developers Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. ================================================ FILE: README.md ================================================ # `Simd<[T; N]>` ## Implementation of [Rust RFC #2366: `std::simd`][rfc2366] [![Latest Version]][crates.io] [![docs]][master_docs] **WARNING**: this crate only supports the most recent nightly Rust toolchain and will be superseded by [`#![feature(portable_simd)]`](https://github.com/rust-lang/portable-simd). ## Documentation * [API docs (`master` branch)][master_docs] * [Performance guide][perf_guide] * [API docs (`docs.rs`)][docs.rs] * [RFC2366 `std::simd`][rfc2366]: - contains motivation, design rationale, discussion, etc. ## Examples Most of the examples come with both a scalar and a vectorized implementation. * [`aobench`](https://github.com/rust-lang-nursery/packed_simd/tree/master/examples/aobench) * [`fannkuch_redux`](https://github.com/rust-lang-nursery/packed_simd/tree/master/examples/fannkuch_redux) * [`matrix inverse`](https://github.com/rust-lang-nursery/packed_simd/tree/master/examples/matrix_inverse) * [`mandelbrot`](https://github.com/rust-lang-nursery/packed_simd/tree/master/examples/mandelbrot) * [`n-body`](https://github.com/rust-lang-nursery/packed_simd/tree/master/examples/nbody) * [`options_pricing`](https://github.com/rust-lang-nursery/packed_simd/tree/master/examples/options_pricing) * [`spectral_norm`](https://github.com/rust-lang-nursery/packed_simd/tree/master/examples/spectral_norm) * [`triangle transform`](https://github.com/rust-lang-nursery/packed_simd/tree/master/examples/triangle_xform) * [`stencil`](https://github.com/rust-lang-nursery/packed_simd/tree/master/examples/stencil) * [`vector dot product`](https://github.com/rust-lang-nursery/packed_simd/tree/master/examples/dot_product) ## Cargo features * `into_bits` (default: disabled): enables `FromBits`/`IntoBits` trait implementations for the vector types. 
These allow reinterpreting the bits of a vector type as those of another vector
type safely by just using the `.into_bits()` method (see the usage sketch at the
end of this README).

## Performance

The following [ISPC] examples are also part of `packed_simd`'s
[`examples/`](https://github.com/rust-lang-nursery/packed_simd/tree/master/examples/)
directory, where `packed_simd`+[`rayon`][rayon] are used to emulate [ISPC]'s
Single-Program-Multiple-Data (SPMD) programming model. The performance results
on different hardware are shown in the `readme.md` of each example. The
following table summarizes the performance ranges, where `+` means speed-up and
`-` means slowdown:

* `aobench`: `[-1.02x, +1.53x]`,
* `stencil`: `[+1.06x, +1.72x]`,
* `mandelbrot`: `[-1.74x, +1.2x]`,
* `options_pricing`:
  * `black_scholes`: `+1.0x`
  * `binomial_put`: `+1.4x`

While SPMD is not the intended use case for `packed_simd`, it is possible to
combine the library with [`rayon`][rayon] to poorly emulate [ISPC]'s SPMD
programming model in Rust. Writing performant code is not as straightforward as
with [ISPC], but with some care (e.g. see the [Performance Guide][perf_guide])
one can easily match and often out-perform [ISPC]'s "default performance".

## Platform support

The following table describes the supported platforms: `build` shows whether
the library compiles without issues for a given target, while `run` shows
whether the test suite passes for a given target.

| **Linux**                              | **build** | **run** |
|----------------------------------------|-----------|---------|
| `i586-unknown-linux-gnu`               | ✓         | ✗       |
| `i686-unknown-linux-gnu`               | ✓         | ✗       |
| `x86_64-unknown-linux-gnu`             | ✓         | ✓       |
| `arm-unknown-linux-gnueabihf`          | ✓         | ✓       |
| `armv7-unknown-linux-gnueabi`          | ✓         | ✓       |
| `aarch64-unknown-linux-gnu`            | ✓         | ✓       |
| `powerpc-unknown-linux-gnu`            | ✓         | ✗       |
| `powerpc64-unknown-linux-gnu`          | ✓         | ✗       |
| `powerpc64le-unknown-linux-gnu`        | ✓         | ✓       |
| `s390x-unknown-linux-gnu`              | ✓         | ✗       |
| `sparc64-unknown-linux-gnu`            | ✓         | ✗       |
| `thumbv7neon-unknown-linux-gnueabihf`  | ✓         | ✓       |
| **MacOSX**                             | **build** | **run** |
| `x86_64-apple-darwin`                  | ✓         | ✓       |
| **Android**                            | **build** | **run** |
| `x86_64-linux-android`                 | ✓         | ✓       |
| `armv7-linux-androideabi`              | ✓         | ✗       |
| `aarch64-linux-android`                | ✓         | ✗       |
| `thumbv7neon-linux-androideabi`        | ✓         | ✗       |
| **iOS**                                | **build** | **run** |
| `x86_64-apple-ios`                     | ✗         | ✗       |
| `aarch64-apple-ios`                    | ✗         | ✗       |

## Machine code verification

The [`verify/`](https://github.com/rust-lang-nursery/packed_simd/tree/master/verify)
crate's tests disassemble the portable packed vector APIs at run-time and
compare the generated machine code against the desired one to make sure that
this crate remains efficient.

## License

This project is licensed under either of

* [Apache License, Version 2.0](http://www.apache.org/licenses/LICENSE-2.0)
  ([LICENSE-APACHE](LICENSE-APACHE))
* [MIT License](http://opensource.org/licenses/MIT)
  ([LICENSE-MIT](LICENSE-MIT))

at your option.

## Contributing

We welcome all people who want to contribute. Please see the
[contributing instructions] for more information.

Contributions in any form (issues, pull requests, etc.) to this project must
adhere to Rust's [Code of Conduct].

Unless you explicitly state otherwise, any contribution intentionally submitted
for inclusion in `packed_simd` by you, as defined in the Apache-2.0 license,
shall be dual licensed as above, without any additional terms or conditions.
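As an illustration of the `into_bits` Cargo feature described above, here is a
minimal usage sketch (not part of the repository): it assumes the feature is
enabled, e.g. `packed_simd = { version = "0.3", features = ["into_bits"] }`,
and a recent nightly toolchain.

```rust
// Minimal sketch: reinterpret the 128 bits of an `f32x4` as a `u32x4`.
// Requires the optional `into_bits` cargo feature and a nightly toolchain.
use packed_simd::{f32x4, u32x4, IntoBits};

fn main() {
    let x = f32x4::new(1.0, 2.0, 3.0, 4.0);
    // Both vectors are 4 x 32 bits wide, so this is a plain bit reinterpretation.
    let bits: u32x4 = x.into_bits();
    // Lane 0 holds the bit pattern of 1.0f32 (0x3F80_0000).
    assert_eq!(bits.extract(0), 1.0f32.to_bits());
    println!("{:?}", bits);
}
```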
[travis]: https://travis-ci.com/rust-lang/packed_simd [Travis-CI Status]: https://travis-ci.com/rust-lang/packed_simd.svg?branch=master [appveyor]: https://ci.appveyor.com/project/gnzlbg/packed-simd [Appveyor Status]: https://ci.appveyor.com/api/projects/status/hd7v9dvr442hgdix?svg=true [Latest Version]: https://img.shields.io/crates/v/packed_simd.svg [crates.io]: https://crates.io/crates/packed_simd [docs]: https://docs.rs/packed_simd/badge.svg [docs.rs]: https://docs.rs/packed_simd [master_docs]: https://rust-lang-nursery.github.io/packed_simd/packed_simd/ [perf_guide]: https://rust-lang-nursery.github.io/packed_simd/perf-guide/ [rfc2366]: https://github.com/rust-lang/rfcs/pull/2366 [ISPC]: https://ispc.github.io/ [rayon]: https://crates.io/crates/rayon [boost_license]: https://www.boost.org/LICENSE_1_0.txt [SLEEF]: https://sleef.org/ [sleef_sys]: https://crates.io/crates/sleef-sys [contributing instructions]: contributing.md [Code of Conduct]: https://www.rust-lang.org/en-US/conduct.html ================================================ FILE: bors.toml ================================================ status = [ "continuous-integration/travis-ci/push" ] ================================================ FILE: build.rs ================================================ fn main() { let target = std::env::var("TARGET").expect("TARGET environment variable not defined"); if target.contains("neon") { println!("cargo:rustc-cfg=libcore_neon"); } } ================================================ FILE: ci/all.sh ================================================ #!/usr/bin/env bash # # Performs an operation on all targets set -ex : "${1?The all.sh script requires one argument.}" op=$1 cargo_clean() { cargo clean } cargo_check_fmt() { cargo fmt --all -- --check } cargo_fmt() { cargo fmt --all } cargo_clippy() { cargo clippy --all -- -D clippy::perf } CMD="-1" case $op in clean*) CMD=cargo_clean ;; check_fmt*) CMD=cargo_check_fmt ;; fmt*) CMD=cargo_fmt ;; clippy) CMD=cargo_clippy ;; *) echo "Unknown operation: \"${op}\"" exit 1 ;; esac echo "Operation is: ${CMD}" # On src/ $CMD # Check examples/ for dir in examples/*/ do dir=${dir%*/} ( cd "${dir%*/}" $CMD ) done ( cd verify/verify $CMD ) ( cd micro_benchmarks $CMD ) ================================================ FILE: ci/android-install-ndk.sh ================================================ #!/usr/bin/env sh # Copyright 2016 The Rust Project Developers. See the COPYRIGHT # file at the top-level directory of this distribution and at # http://rust-lang.org/COPYRIGHT. # # Licensed under the Apache License, Version 2.0 or the MIT license # , at your # option. This file may not be copied, modified, or distributed # except according to those terms. set -ex ANDROID_NDK_URL=https://dl.google.com/android/repository ANDROID_NDK_ARCHIVE=android-ndk-r25b-linux.zip curl -fO "$ANDROID_NDK_URL/$ANDROID_NDK_ARCHIVE" unzip -q $ANDROID_NDK_ARCHIVE rm $ANDROID_NDK_ARCHIVE mv android-ndk-* ndk rm -rf android-ndk-* ================================================ FILE: ci/android-install-sdk.sh ================================================ #!/usr/bin/env sh # Copyright 2016 The Rust Project Developers. See the COPYRIGHT # file at the top-level directory of this distribution and at # http://rust-lang.org/COPYRIGHT. # # Licensed under the Apache License, Version 2.0 or the MIT license # , at your # option. This file may not be copied, modified, or distributed # except according to those terms. 
set -ex # Prep the SDK and emulator # # Note that the update process requires that we accept a bunch of licenses, and # we can't just pipe `yes` into it for some reason, so we take the same strategy # located in https://github.com/appunite/docker by just wrapping it in a script # which apparently magically accepts the licenses. mkdir sdk curl --retry 5 https://dl.google.com/android/repository/sdk-tools-linux-3859397.zip -O unzip -d sdk sdk-tools-linux-3859397.zip case "$1" in arm | armv7) abi=armeabi-v7a ;; aarch64) abi=arm64-v8a ;; i686) abi=x86 ;; x86_64) abi=x86_64 ;; *) echo "invalid arch: $1" exit 1 ;; esac; # --no_https avoids # javax.net.ssl.SSLHandshakeException: sun.security.validator.ValidatorException: No trusted certificate found yes | ./sdk/tools/bin/sdkmanager --licenses --no_https yes | ./sdk/tools/bin/sdkmanager --no_https \ "emulator" \ "platform-tools" \ "platforms;android-24" \ "system-images;android-24;default;$abi" echo "no" | ./sdk/tools/bin/avdmanager create avd \ --name "${1}" \ --package "system-images;android-24;default;$abi" ================================================ FILE: ci/android-sysimage.sh ================================================ #!/usr/bin/env bash # Copyright 2017 The Rust Project Developers. See the COPYRIGHT # file at the top-level directory of this distribution and at # http://rust-lang.org/COPYRIGHT. # # Licensed under the Apache License, Version 2.0 or the MIT license # , at your # option. This file may not be copied, modified, or distributed # except according to those terms. set -ex URL=https://dl.google.com/android/repository/sys-img/android main() { local arch="${1}" local name="${2}" local dest=/system local td td="$(mktemp -d)" apt-get install --no-install-recommends e2tools pushd "${td}" curl --retry 5 -O "${URL}/${name}" unzip -q "${name}" local system system="$(find . -name system.img)" mkdir -p ${dest}/{bin,lib,lib64} # Extract android linker and libraries to /system # This allows android executables to be run directly (or with qemu) if [ "${arch}" = "x86_64" ] || [ "${arch}" = "arm64" ]; then e2cp -p "${system}:/bin/linker64" "${dest}/bin/" e2cp -p "${system}:/lib64/libdl.so" "${dest}/lib64/" e2cp -p "${system}:/lib64/libc.so" "${dest}/lib64/" e2cp -p "${system}:/lib64/libm.so" "${dest}/lib64/" else e2cp -p "${system}:/bin/linker" "${dest}/bin/" e2cp -p "${system}:/lib/libdl.so" "${dest}/lib/" e2cp -p "${system}:/lib/libc.so" "${dest}/lib/" e2cp -p "${system}:/lib/libm.so" "${dest}/lib/" fi # clean up apt-get purge --auto-remove -y e2tools popd rm -rf "${td}" } main "${@}" ================================================ FILE: ci/benchmark.sh ================================================ #!/usr/bin/env bash # # Runs all benchmarks. Controlled by the following environment variables: # # FEATURES={} - cargo features to pass to all benchmarks (e.g. core_arch,sleef-sys,ispc) # NORUN={1} - only builds the benchmarks set -ex if [[ ${NORUN} != 1 ]]; then # Most benchmarks require hyperfine; require it upfront. hash hyperfine 2>/dev/null || { echo >&2 "hyperfine is not in PATH."; exit 1; } fi # If the ispc benchmark feature is enabled, ispc must be in the path of the # benchmarks. 
if echo "$FEATURES" | grep -q "ispc"; then hash ispc 2>/dev/null || { echo >&2 "ispc is not in PATH."; exit 1; } fi # An example with a benchmark.sh is a benchmark: for dir in examples/*/ do dir=${dir%*/} cd ${dir%*/} if [ -f "benchmark.sh" ]; then ./benchmark.sh fi cd - done ================================================ FILE: ci/deploy_and_run_on_ios_simulator.rs ================================================ // Copyright 2017 The Rust Project Developers. See the COPYRIGHT // file at the top-level directory of this distribution and at // http://rust-lang.org/COPYRIGHT. // // Licensed under the Apache License, Version 2.0 or the MIT license // , at your // option. This file may not be copied, modified, or distributed // except according to those terms. // This is a script to deploy and execute a binary on an iOS simulator. // The primary use of this is to be able to run unit tests on the simulator and // retrieve the results. // // To do this through Cargo instead, use Dinghy // (https://github.com/snipsco/dinghy): cargo dinghy install, then cargo dinghy // test. use std::env; use std::fs::{self, File}; use std::io::Write; use std::path::Path; use std::process; use std::process::Command; macro_rules! t { ($e:expr) => (match $e { Ok(e) => e, Err(e) => panic!("{} failed with: {}", stringify!($e), e), }) } // Step one: Wrap as an app fn package_as_simulator_app(crate_name: &str, test_binary_path: &Path) { println!("Packaging simulator app"); drop(fs::remove_dir_all("ios_simulator_app")); t!(fs::create_dir("ios_simulator_app")); t!(fs::copy(test_binary_path, Path::new("ios_simulator_app").join(crate_name))); let mut f = t!(File::create("ios_simulator_app/Info.plist")); t!(f.write_all(format!(r#" CFBundleExecutable {} CFBundleIdentifier com.rust.unittests "#, crate_name).as_bytes())); } // Step two: Start the iOS simulator fn start_simulator() { println!("Looking for iOS simulator"); let output = t!(Command::new("xcrun").arg("simctl").arg("list").output()); assert!(output.status.success()); let mut simulator_exists = false; let mut simulator_booted = false; let mut found_rust_sim = false; let stdout = t!(String::from_utf8(output.stdout)); for line in stdout.lines() { if line.contains("rust_ios") { if found_rust_sim { panic!("Duplicate rust_ios simulators found. Please \ double-check xcrun simctl list."); } simulator_exists = true; simulator_booted = line.contains("(Booted)"); found_rust_sim = true; } } if simulator_exists == false { println!("Creating iOS simulator"); Command::new("xcrun") .arg("simctl") .arg("create") .arg("rust_ios") .arg("com.apple.CoreSimulator.SimDeviceType.iPhone-SE") .arg("com.apple.CoreSimulator.SimRuntime.iOS-10-2") .check_status(); } else if simulator_booted == true { println!("Shutting down already-booted simulator"); Command::new("xcrun") .arg("simctl") .arg("shutdown") .arg("rust_ios") .check_status(); } println!("Starting iOS simulator"); // We can't uninstall the app (if present) as that will hang if the // simulator isn't completely booted; just erase the simulator instead. 
Command::new("xcrun").arg("simctl").arg("erase").arg("rust_ios").check_status(); Command::new("xcrun").arg("simctl").arg("boot").arg("rust_ios").check_status(); } // Step three: Install the app fn install_app_to_simulator() { println!("Installing app to simulator"); Command::new("xcrun") .arg("simctl") .arg("install") .arg("booted") .arg("ios_simulator_app/") .check_status(); } // Step four: Run the app fn run_app_on_simulator() { println!("Running app"); let output = t!(Command::new("xcrun") .arg("simctl") .arg("launch") .arg("--console") .arg("booted") .arg("com.rust.unittests") .output()); println!("stdout --\n{}\n", String::from_utf8_lossy(&output.stdout)); println!("stderr --\n{}\n", String::from_utf8_lossy(&output.stderr)); let stdout = String::from_utf8_lossy(&output.stdout); let failed = stdout.lines() .find(|l| l.contains("FAILED")) .map(|l| l.contains("FAILED")) .unwrap_or(false); let passed = stdout.lines() .find(|l| l.contains("test result: ok")) .map(|l| l.contains("test result: ok")) .unwrap_or(false); println!("Shutting down simulator"); Command::new("xcrun") .arg("simctl") .arg("shutdown") .arg("rust_ios") .check_status(); if !(passed && !failed) { panic!("tests didn't pass"); } } trait CheckStatus { fn check_status(&mut self); } impl CheckStatus for Command { fn check_status(&mut self) { println!("\trunning: {:?}", self); assert!(t!(self.status()).success()); } } fn main() { let args: Vec = env::args().collect(); if args.len() != 2 { println!("Usage: {} ", args[0]); process::exit(-1); } let test_binary_path = Path::new(&args[1]); let crate_name = test_binary_path.file_name().unwrap(); package_as_simulator_app(crate_name.to_str().unwrap(), test_binary_path); start_simulator(); install_app_to_simulator(); run_app_on_simulator(); } ================================================ FILE: ci/docker/aarch64-linux-android/Dockerfile ================================================ FROM ubuntu:16.04 RUN dpkg --add-architecture i386 && \ apt-get update && \ apt-get install -y --no-install-recommends \ file \ make \ curl \ ca-certificates \ python \ unzip \ expect \ openjdk-9-jre \ libstdc++6:i386 \ libpulse0 \ gcc \ libc6-dev WORKDIR /android/ COPY android* /android/ ENV ANDROID_ARCH=aarch64 ENV PATH=$PATH:/android/ndk-$ANDROID_ARCH/bin:/android/sdk/tools:/android/sdk/platform-tools RUN sh /android/android-install-ndk.sh $ANDROID_ARCH RUN sh /android/android-install-sdk.sh $ANDROID_ARCH RUN mv /root/.android /tmp RUN chmod 777 -R /tmp/.android RUN chmod 755 /android/sdk/tools/* /android/sdk/emulator/qemu/linux-x86_64/* ENV PATH=$PATH:/rust/bin \ CARGO_TARGET_AARCH64_LINUX_ANDROID_LINKER=aarch64-linux-android-gcc \ CARGO_TARGET_AARCH64_LINUX_ANDROID_RUNNER=/tmp/runtest \ OBJDUMP=aarch64-linux-android-objdump \ HOME=/tmp ADD runtest-android.rs /tmp/runtest.rs ENTRYPOINT [ \ "bash", \ "-c", \ # set SHELL so android can detect a 64bits system, see # http://stackoverflow.com/a/41789144 "SHELL=/bin/dash /android/sdk/emulator/emulator @aarch64 -no-window & \ rustc /tmp/runtest.rs -o /tmp/runtest && \ exec \"$@\"", \ "--" \ ] ================================================ FILE: ci/docker/aarch64-unknown-linux-gnu/Dockerfile ================================================ FROM ubuntu:18.04 RUN apt-get update && apt-get install -y --no-install-recommends \ gcc \ ca-certificates \ libc6-dev \ gcc-aarch64-linux-gnu \ libc6-dev-arm64-cross \ qemu-user \ make \ file ENV CARGO_TARGET_AARCH64_UNKNOWN_LINUX_GNU_LINKER=aarch64-linux-gnu-gcc \ 
CARGO_TARGET_AARCH64_UNKNOWN_LINUX_GNU_RUNNER="qemu-aarch64 -L /usr/aarch64-linux-gnu" \ OBJDUMP=aarch64-linux-gnu-objdump ================================================ FILE: ci/docker/arm-unknown-linux-gnueabi/Dockerfile ================================================ FROM ubuntu:18.04 RUN apt-get update && apt-get install -y --no-install-recommends \ gcc \ ca-certificates \ libc6-dev \ libc6-armel-cross \ libc6-dev-armel-cross \ binutils-arm-linux-gnueabi \ gcc-arm-linux-gnueabi \ qemu-user \ make \ file ENV CARGO_TARGET_ARM_UNKNOWN_LINUX_GNUEABI_LINKER=arm-linux-gnueabi-gcc \ CARGO_TARGET_ARM_UNKNOWN_LINUX_GNUEABI_RUNNER="qemu-arm -L /usr/arm-linux-gnueabi" \ OBJDUMP=arm-linux-gnueabi-objdump ================================================ FILE: ci/docker/arm-unknown-linux-gnueabihf/Dockerfile ================================================ FROM ubuntu:18.04 RUN apt-get update && apt-get install -y --no-install-recommends \ gcc \ ca-certificates \ libc6-dev \ gcc-arm-linux-gnueabihf \ libc6-dev-armhf-cross \ qemu-user \ make \ file ENV CARGO_TARGET_ARM_UNKNOWN_LINUX_GNUEABIHF_LINKER=arm-linux-gnueabihf-gcc \ CARGO_TARGET_ARM_UNKNOWN_LINUX_GNUEABIHF_RUNNER="qemu-arm -L /usr/arm-linux-gnueabihf" \ OBJDUMP=arm-linux-gnueabihf-objdump ================================================ FILE: ci/docker/armv7-linux-androideabi/Dockerfile ================================================ FROM ubuntu:16.04 RUN dpkg --add-architecture i386 && \ apt-get update && \ apt-get install -y --no-install-recommends \ file \ make \ curl \ ca-certificates \ python \ unzip \ expect \ openjdk-9-jre \ libstdc++6:i386 \ libpulse0 \ gcc \ libc6-dev WORKDIR /android/ COPY android* /android/ ENV ANDROID_ARCH=arm ENV PATH=$PATH:/android/ndk-$ANDROID_ARCH/bin:/android/sdk/tools:/android/sdk/platform-tools RUN sh /android/android-install-ndk.sh $ANDROID_ARCH RUN sh /android/android-install-sdk.sh $ANDROID_ARCH RUN mv /root/.android /tmp RUN chmod 777 -R /tmp/.android RUN chmod 755 /android/sdk/tools/* /android/sdk/emulator/qemu/linux-x86_64/* ENV PATH=$PATH:/rust/bin \ CARGO_TARGET_ARM_LINUX_ANDROIDEABI_LINKER=arm-linux-androideabi-gcc \ CARGO_TARGET_ARM_LINUX_ANDROIDEABI_RUNNER=/tmp/runtest \ OBJDUMP=arm-linux-androideabi-objdump \ HOME=/tmp ADD runtest-android.rs /tmp/runtest.rs ENTRYPOINT [ \ "bash", \ "-c", \ # set SHELL so android can detect a 64bits system, see # http://stackoverflow.com/a/41789144 "SHELL=/bin/dash /android/sdk/emulator/emulator @arm -no-window & \ rustc /tmp/runtest.rs -o /tmp/runtest && \ exec \"$@\"", \ "--" \ ] ================================================ FILE: ci/docker/armv7-unknown-linux-gnueabihf/Dockerfile ================================================ FROM ubuntu:18.04 RUN apt-get update && apt-get install -y --no-install-recommends \ gcc \ ca-certificates \ libc6-dev \ gcc-arm-linux-gnueabihf \ libc6-dev-armhf-cross \ qemu-user \ make \ file ENV CARGO_TARGET_ARMV7_UNKNOWN_LINUX_GNUEABIHF_LINKER=arm-linux-gnueabihf-gcc \ CARGO_TARGET_ARMV7_UNKNOWN_LINUX_GNUEABIHF_RUNNER="qemu-arm -L /usr/arm-linux-gnueabihf" \ OBJDUMP=arm-linux-gnueabihf-objdump ================================================ FILE: ci/docker/i586-unknown-linux-gnu/Dockerfile ================================================ FROM ubuntu:18.04 RUN apt-get update && apt-get install -y --no-install-recommends \ gcc-multilib \ libc6-dev \ file \ make \ ca-certificates ================================================ FILE: ci/docker/i686-unknown-linux-gnu/Dockerfile ================================================ 
FROM ubuntu:18.04 RUN apt-get update && apt-get install -y --no-install-recommends \ gcc-multilib \ libc6-dev \ file \ make \ ca-certificates ================================================ FILE: ci/docker/mips-unknown-linux-gnu/Dockerfile ================================================ FROM ubuntu:18.04 RUN apt-get update && apt-get install -y --no-install-recommends \ gcc libc6-dev qemu-user ca-certificates \ gcc-mips-linux-gnu libc6-dev-mips-cross \ qemu-system-mips \ qemu-user \ make \ file ENV CARGO_TARGET_MIPS_UNKNOWN_LINUX_GNU_LINKER=mips-linux-gnu-gcc \ CARGO_TARGET_MIPS_UNKNOWN_LINUX_GNU_RUNNER="qemu-mips -L /usr/mips-linux-gnu" \ OBJDUMP=mips-linux-gnu-objdump ================================================ FILE: ci/docker/mips64-unknown-linux-gnuabi64/Dockerfile ================================================ FROM ubuntu:18.04 RUN apt-get update && apt-get install -y --no-install-recommends \ gcc libc6-dev qemu-user ca-certificates \ gcc-mips64-linux-gnuabi64 libc6-dev-mips64-cross \ qemu-system-mips64 qemu-user ENV CARGO_TARGET_MIPS64_UNKNOWN_LINUX_GNUABI64_LINKER=mips64-linux-gnuabi64-gcc \ CARGO_TARGET_MIPS64_UNKNOWN_LINUX_GNUABI64_RUNNER="qemu-mips64 -L /usr/mips64-linux-gnuabi64" \ OBJDUMP=mips64-linux-gnuabi64-objdump ================================================ FILE: ci/docker/mips64el-unknown-linux-gnuabi64/Dockerfile ================================================ FROM ubuntu:18.04 RUN apt-get update && apt-get install -y --no-install-recommends \ gcc libc6-dev qemu-user ca-certificates \ gcc-mips64el-linux-gnuabi64 libc6-dev-mips64el-cross \ qemu-system-mips64el ENV CARGO_TARGET_MIPS64EL_UNKNOWN_LINUX_GNUABI64_LINKER=mips64el-linux-gnuabi64-gcc \ CARGO_TARGET_MIPS64EL_UNKNOWN_LINUX_GNUABI64_RUNNER="qemu-mips64el -L /usr/mips64el-linux-gnuabi64" \ OBJDUMP=mips64el-linux-gnuabi64-objdump ================================================ FILE: ci/docker/mipsel-unknown-linux-musl/Dockerfile ================================================ FROM ubuntu:18.10 RUN apt-get update && \ apt-get install -y --no-install-recommends \ ca-certificates \ gcc \ libc6-dev \ make \ qemu-user \ qemu-system-mips \ bzip2 \ curl \ file RUN mkdir /toolchain # Note that this originally came from: # https://downloads.openwrt.org/snapshots/trunk/malta/generic/OpenWrt-Toolchain-malta-le_gcc-5.3.0_musl-1.1.15.Linux-x86_64.tar.bz2 RUN curl -L https://ci-mirrors.rust-lang.org/libc/OpenWrt-Toolchain-malta-le_gcc-5.3.0_musl-1.1.15.Linux-x86_64.tar.bz2 | \ tar xjf - -C /toolchain --strip-components=2 ENV PATH=$PATH:/rust/bin:/toolchain/bin \ CC_mipsel_unknown_linux_musl=mipsel-openwrt-linux-gcc \ CARGO_TARGET_MIPSEL_UNKNOWN_LINUX_MUSL_LINKER=mipsel-openwrt-linux-gcc \ CARGO_TARGET_MIPSEL_UNKNOWN_LINUX_MUSL_RUNNER="qemu-mipsel -L /toolchain" ================================================ FILE: ci/docker/powerpc-unknown-linux-gnu/Dockerfile ================================================ FROM ubuntu:22.04 RUN apt-get update && apt-get install -y --no-install-recommends \ gcc libc6-dev qemu-user ca-certificates \ gcc-powerpc-linux-gnu libc6-dev-powerpc-cross \ qemu-system-ppc \ make \ file ENV CARGO_TARGET_POWERPC_UNKNOWN_LINUX_GNU_LINKER=powerpc-linux-gnu-gcc \ CARGO_TARGET_POWERPC_UNKNOWN_LINUX_GNU_RUNNER="qemu-ppc -cpu Vger -L /usr/powerpc-linux-gnu" \ CC=powerpc-linux-gnu-gcc \ OBJDUMP=powerpc-linux-gnu-objdump ================================================ FILE: ci/docker/powerpc64-unknown-linux-gnu/Dockerfile ================================================ FROM ubuntu:22.04 RUN apt-get 
update && apt-get install -y --no-install-recommends \ gcc \ ca-certificates \ libc6-dev \ gcc-powerpc64-linux-gnu \ libc6-dev-ppc64-cross \ qemu-user \ qemu-system-ppc \ make \ file ENV CARGO_TARGET_POWERPC64_UNKNOWN_LINUX_GNU_LINKER=powerpc64-linux-gnu-gcc \ CARGO_TARGET_POWERPC64_UNKNOWN_LINUX_GNU_RUNNER="qemu-ppc64 -L /usr/powerpc64-linux-gnu" \ CC=powerpc64-linux-gnu-gcc \ OBJDUMP=powerpc64-linux-gnu-objdump ================================================ FILE: ci/docker/powerpc64le-unknown-linux-gnu/Dockerfile ================================================ FROM ubuntu:22.04 RUN apt-get update && apt-get install -y --no-install-recommends \ gcc libc6-dev qemu-user ca-certificates \ gcc-powerpc64le-linux-gnu libc6-dev-ppc64el-cross \ qemu-system-ppc file make ENV CARGO_TARGET_POWERPC64LE_UNKNOWN_LINUX_GNU_LINKER=powerpc64le-linux-gnu-gcc \ CARGO_TARGET_POWERPC64LE_UNKNOWN_LINUX_GNU_RUNNER="qemu-ppc64le -L /usr/powerpc64le-linux-gnu" \ CC=powerpc64le-linux-gnu-gcc \ OBJDUMP=powerpc64le-linux-gnu-objdump ================================================ FILE: ci/docker/s390x-unknown-linux-gnu/Dockerfile ================================================ FROM ubuntu:22.04 RUN apt-get update && \ apt-get install -y --no-install-recommends \ ca-certificates \ curl \ cmake \ gcc \ libc6-dev \ g++-s390x-linux-gnu \ libc6-dev-s390x-cross \ qemu-user \ make \ file ENV CARGO_TARGET_S390X_UNKNOWN_LINUX_GNU_LINKER=s390x-linux-gnu-gcc \ CARGO_TARGET_S390X_UNKNOWN_LINUX_GNU_RUNNER="qemu-s390x -L /usr/s390x-linux-gnu" \ CC_s390x_unknown_linux_gnu=s390x-linux-gnu-gcc \ CXX_s390x_unknown_linux_gnu=s390x-linux-gnu-g++ \ OBJDUMP=s390x-linux-gnu-objdump ================================================ FILE: ci/docker/sparc64-unknown-linux-gnu/Dockerfile ================================================ FROM debian:bookworm RUN apt-get update && apt-get install -y --no-install-recommends \ curl ca-certificates \ gcc libc6-dev \ gcc-sparc64-linux-gnu libc6-dev-sparc64-cross \ qemu-system-sparc64 openbios-sparc seabios ipxe-qemu \ p7zip-full cpio COPY linux-sparc64.sh / RUN bash /linux-sparc64.sh COPY test-runner-linux / ENV CARGO_TARGET_SPARC64_UNKNOWN_LINUX_GNU_LINKER=sparc64-linux-gnu-gcc \ CARGO_TARGET_SPARC64_UNKNOWN_LINUX_GNU_RUNNER="/test-runner-linux sparc64" \ CC_sparc64_unknown_linux_gnu=sparc64-linux-gnu-gcc \ PATH=$PATH:/rust/bin ================================================ FILE: ci/docker/thumbv7neon-linux-androideabi/Dockerfile ================================================ FROM ubuntu:16.04 RUN dpkg --add-architecture i386 && \ apt-get update && \ apt-get install -y --no-install-recommends \ file \ make \ curl \ ca-certificates \ python \ unzip \ expect \ openjdk-9-jre \ libstdc++6:i386 \ libpulse0 \ gcc \ libc6-dev WORKDIR /android/ COPY android* /android/ ENV ANDROID_ARCH=arm ENV PATH=$PATH:/android/ndk-$ANDROID_ARCH/bin:/android/sdk/tools:/android/sdk/platform-tools RUN sh /android/android-install-ndk.sh $ANDROID_ARCH RUN sh /android/android-install-sdk.sh $ANDROID_ARCH RUN mv /root/.android /tmp RUN chmod 777 -R /tmp/.android RUN chmod 755 /android/sdk/tools/* /android/sdk/emulator/qemu/linux-x86_64/* ENV PATH=$PATH:/rust/bin \ CARGO_TARGET_THUMBV7NEON_LINUX_ANDROIDEABI_LINKER=arm-linux-androideabi-gcc \ CARGO_TARGET_THUMBV7NEON_LINUX_ANDROIDEABI_RUNNER=/tmp/runtest \ OBJDUMP=arm-linux-androideabi-objdump \ HOME=/tmp ADD runtest-android.rs /tmp/runtest.rs ENTRYPOINT [ \ "bash", \ "-c", \ # set SHELL so android can detect a 64bits system, see # http://stackoverflow.com/a/41789144 
"SHELL=/bin/dash /android/sdk/emulator/emulator @arm -no-window & \ rustc /tmp/runtest.rs -o /tmp/runtest && \ exec \"$@\"", \ "--" \ ] ================================================ FILE: ci/docker/thumbv7neon-unknown-linux-gnueabihf/Dockerfile ================================================ FROM ubuntu:18.04 RUN apt-get update && apt-get install -y --no-install-recommends \ gcc \ ca-certificates \ libc6-dev \ gcc-arm-linux-gnueabihf \ libc6-dev-armhf-cross \ qemu-user \ make \ file ENV CARGO_TARGET_THUMBV7NEON_UNKNOWN_LINUX_GNUEABIHF_LINKER=arm-linux-gnueabihf-gcc \ CARGO_TARGET_THUMBV7NEON_UNKNOWN_LINUX_GNUEABIHF_RUNNER="qemu-arm -L /usr/arm-linux-gnueabihf" \ OBJDUMP=arm-linux-gnueabihf-objdump ================================================ FILE: ci/docker/wasm32-unknown-unknown/Dockerfile ================================================ FROM ubuntu:22.04 RUN apt-get update -y && apt-get install -y --no-install-recommends \ ca-certificates \ clang \ cmake \ curl \ git \ libc6-dev \ make \ ninja-build \ python-is-python3 \ xz-utils # Install `wasm2wat` RUN git clone --recursive https://github.com/WebAssembly/wabt RUN make -C wabt -j$(nproc) ENV PATH=$PATH:/wabt/bin # Install `wasm-bindgen-test-runner` RUN curl -L https://github.com/rustwasm/wasm-bindgen/releases/download/0.2.87/wasm-bindgen-0.2.87-x86_64-unknown-linux-musl.tar.gz \ | tar xzf - # Keep in sync with the version on Cargo.toml. ENV PATH=$PATH:/wasm-bindgen-0.2.87-x86_64-unknown-linux-musl ENV CARGO_TARGET_WASM32_UNKNOWN_UNKNOWN_RUNNER=wasm-bindgen-test-runner # Install `node` RUN curl https://nodejs.org/dist/v14.16.0/node-v14.16.0-linux-x64.tar.xz | tar xJf - ENV PATH=$PATH:/node-v14.16.0-linux-x64/bin # We use a shim linker that removes `--strip-debug` when passed to LLD. While # this typically results in invalid debug information in release mode it doesn't # result in an invalid names section which is what we're interested in. COPY lld-shim.rs / ENV CARGO_TARGET_WASM32_UNKNOWN_UNKNOWN_LINKER=/tmp/lld-shim # Rustc isn't available until this container starts, so defer compilation of the # shim. ENTRYPOINT /rust/bin/rustc /lld-shim.rs -o /tmp/lld-shim && exec bash "$@" ================================================ FILE: ci/docker/x86_64-linux-android/Dockerfile ================================================ FROM ubuntu:20.04 RUN apt-get update && \ apt-get install -y --no-install-recommends \ ca-certificates \ curl \ gcc \ libc-dev \ python \ unzip \ file \ make WORKDIR /android/ ENV ANDROID_ARCH=x86_64 COPY android-install-ndk.sh /android/ RUN sh /android/android-install-ndk.sh ENV STDARCH_ASSERT_INSTR_LIMIT=30 # We do not run x86_64-linux-android tests on an android emulator. # See ci/android-sysimage.sh for informations about how tests are run. 
COPY android-sysimage.sh /android/ RUN bash /android/android-sysimage.sh x86_64 x86_64-24_r07.zip ENV PATH=$PATH:/rust/bin:/android/ndk/toolchains/llvm/prebuilt/linux-x86_64/bin \ CARGO_TARGET_X86_64_LINUX_ANDROID_LINKER=x86_64-linux-android21-clang \ CC_x86_64_linux_android=x86_64-linux-android21-clang \ CXX_x86_64_linux_android=x86_64-linux-android21-clang++ \ OBJDUMP=llvm-objdump \ HOME=/tmp ================================================ FILE: ci/docker/x86_64-unknown-linux-gnu/Dockerfile ================================================ FROM ubuntu:18.04 RUN apt-get update && apt-get install -y --no-install-recommends \ gcc \ libc6-dev \ file \ make \ ca-certificates \ cmake \ libclang-dev \ clang ================================================ FILE: ci/docker/x86_64-unknown-linux-gnu-emulated/Dockerfile ================================================ FROM ubuntu:18.04 RUN apt-get update && apt-get install -y --no-install-recommends \ gcc \ libc6-dev \ file \ make \ ca-certificates \ wget \ bzip2 \ cmake \ libclang-dev \ clang RUN wget https://github.com/gnzlbg/intel_sde/raw/master/sde-external-8.16.0-2018-01-30-lin.tar.bz2 RUN tar -xjf sde-external-8.16.0-2018-01-30-lin.tar.bz2 ENV CARGO_TARGET_X86_64_UNKNOWN_LINUX_GNU_RUNNER="/sde-external-8.16.0-2018-01-30-lin/sde64 --" ================================================ FILE: ci/dox.sh ================================================ #!/bin/sh set -ex rm -rf target/doc mkdir -p target/doc # Build API documentation cargo doc --features=into_bits # Build Performance Guide # FIXME: https://github.com/rust-lang-nursery/mdBook/issues/780 # mdbook build perf-guide -d target/doc/perf-guide cd perf-guide mdbook build cd - cp -r perf-guide/book target/doc/perf-guide # If we're on travis, not a PR, and on the right branch, publish! if [ "$TRAVIS_PULL_REQUEST" = "false" ] && [ "$TRAVIS_BRANCH" = "master" ]; then python3 -vV pip -vV python3.9 -vV pip install ghp_import --user ghp-import -n target/doc git push -qf https://${GH_PAGES}@github.com/${TRAVIS_REPO_SLUG}.git gh-pages fi ================================================ FILE: ci/linux-s390x.sh ================================================ set -ex mkdir -m 777 /qemu cd /qemu curl -LO https://github.com/qemu/qemu/raw/master/pc-bios/s390-ccw.img curl -LO http://ftp.debian.org/debian/dists/testing/main/installer-s390x/20170828/images/generic/kernel.debian curl -LO http://ftp.debian.org/debian/dists/testing/main/installer-s390x/20170828/images/generic/initrd.debian mv kernel.debian kernel mv initrd.debian initrd.gz mkdir init cd init gunzip -c ../initrd.gz | cpio -id rm ../initrd.gz cp /usr/s390x-linux-gnu/lib/libgcc_s.so.1 usr/lib/ chmod a+w . ================================================ FILE: ci/linux-sparc64.sh ================================================ set -ex mkdir -m 777 /qemu cd /qemu curl -LO https://cdimage.debian.org/cdimage/ports/9.0/sparc64/iso-cd/debian-9.0-sparc64-NETINST-1.iso 7z e debian-9.0-sparc64-NETINST-1.iso boot/initrd.gz 7z e debian-9.0-sparc64-NETINST-1.iso boot/sparc64 mv sparc64 kernel rm debian-9.0-sparc64-NETINST-1.iso mkdir init cd init gunzip -c ../initrd.gz | cpio -id rm ../initrd.gz cp /usr/sparc64-linux-gnu/lib/libgcc_s.so.1 usr/lib/ chmod a+w . 
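The Dockerfiles above wire Cargo to custom test runners through `CARGO_TARGET_<TRIPLE>_RUNNER` (a `qemu-user` invocation for most Linux targets, `/tmp/runtest` for the Android images, `/test-runner-linux` for the s390x/sparc64 system-emulation images prepared by the two scripts above). Cargo's contract is simple: it invokes the runner with the path of the freshly built test binary, followed by any test-harness arguments. A minimal sketch of such a runner in Rust, assuming a `qemu-aarch64` user-mode setup (the target, sysroot path, and names are illustrative, not part of this repository):

```rust
// Hypothetical runner sketch: Cargo calls it as `<runner> <test-binary> [args...]`.
// It re-executes the cross-compiled binary under user-mode qemu and propagates
// the exit status back to Cargo.
use std::env;
use std::process::{exit, Command};

fn main() {
    let mut args = env::args_os().skip(1);
    let binary = args.next().expect("usage: runner <test-binary> [args...]");

    let status = Command::new("qemu-aarch64")
        .arg("-L")
        .arg("/usr/aarch64-linux-gnu") // assumed sysroot, as in the Dockerfiles above
        .arg(&binary)
        .args(args) // forward harness flags such as `--quiet`
        .status()
        .expect("failed to spawn qemu-aarch64");

    exit(status.code().unwrap_or(1));
}
```

`ci/runtest-android.rs` and `ci/test-runner-linux` below follow the same contract, but push the binary onto an emulator or boot it inside a qemu VM instead of running it directly.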
================================================ FILE: ci/lld-shim.rs ================================================ use std::os::unix::prelude::*; use std::process::Command; use std::env; fn main() { let args = env::args() .skip(1) .filter(|s| s != "--strip-debug") .collect::>(); panic!("failed to exec: {}", Command::new("rust-lld").args(&args).exec()); } ================================================ FILE: ci/max_line_width.sh ================================================ #!/usr/bin/env sh set -x export success=true find . -iname '*.rs' | while read -r file; do result=$(grep '.\{79\}' "${file}" | grep --invert 'http') if [ "${result}" = "" ] then : else echo "file \"${file}\": $result" exit 1 fi done ================================================ FILE: ci/run-docker.sh ================================================ # Small script to run tests for a target (or all targets) inside all the # respective docker images. set -ex run() { echo "Building docker container for TARGET=${TARGET} RUSTFLAGS=${RUSTFLAGS}" docker build -t packed_simd -f ci/docker/${TARGET}/Dockerfile ci/ mkdir -p target target=$(echo "${TARGET}" | sed 's/-emulated//') echo "Running docker" docker run \ --user `id -u`:`id -g` \ --rm \ --init \ --volume $HOME/.cargo:/cargo \ --env CARGO_HOME=/cargo \ --volume `rustc --print sysroot`:/rust:ro \ --env TARGET=$target \ --env NORUN \ --env NOVERIFY \ --env RUSTFLAGS \ --volume `pwd`:/checkout:ro \ --volume `pwd`/target:/checkout/target \ --workdir /checkout \ --privileged \ packed_simd \ bash \ -c 'PATH=$PATH:/rust/bin exec ci/run.sh' } if [ -z "${TARGET}" ]; then for d in `ls ci/docker/`; do run $d done else run ${TARGET} fi ================================================ FILE: ci/run.sh ================================================ #!/usr/bin/env bash set -ex : ${TARGET?"The TARGET environment variable must be set."} # Tests are all super fast anyway, and they fault often enough on travis that # having only one thread increases debuggability to be worth it. #export RUST_TEST_THREADS=1 #export RUST_BACKTRACE=full #export RUST_TEST_NOCAPTURE=1 # Some appveyor builds run out-of-memory; this attempts to mitigate that: # https://github.com/rust-lang-nursery/packed_simd/issues/39 # export RUSTFLAGS="${RUSTFLAGS} -C codegen-units=1" # export CARGO_BUILD_JOBS=1 export CARGO_SUBCMD=test if [[ "${NORUN}" == "1" ]]; then export CARGO_SUBCMD=build fi if [[ ${TARGET} == "x86_64-apple-ios" ]] || [[ ${TARGET} == "i386-apple-ios" ]]; then export RUSTFLAGS="${RUSTFLAGS} -Clink-arg=-mios-simulator-version-min=7.0" rustc ./ci/deploy_and_run_on_ios_simulator.rs -o $HOME/runtest export CARGO_TARGET_X86_64_APPLE_IOS_RUNNER=$HOME/runtest export CARGO_TARGET_I386_APPLE_IOS_RUNNER=$HOME/runtest fi # The source directory is read-only. Need to copy internal crates to the target # directory for their Cargo.lock to be properly written. 
mkdir target || true rustc --version cargo --version echo "TARGET=${TARGET}" echo "HOST=${HOST}" echo "RUSTFLAGS=${RUSTFLAGS}" echo "NORUN=${NORUN}" echo "NOVERIFY=${NOVERIFY}" echo "CARGO_SUBCMD=${CARGO_SUBCMD}" echo "CARGO_BUILD_JOBS=${CARGO_BUILD_JOBS}" echo "CARGO_INCREMENTAL=${CARGO_INCREMENTAL}" echo "RUST_TEST_THREADS=${RUST_TEST_THREADS}" echo "RUST_BACKTRACE=${RUST_BACKTRACE}" echo "RUST_TEST_NOCAPTURE=${RUST_TEST_NOCAPTURE}" cargo_test() { cmd="cargo ${CARGO_SUBCMD} --verbose --target=${TARGET} ${@}" if [ "${NORUN}" != "1" ] then if [ "$TARGET" != "wasm32-unknown-unknown" ] then cmd="$cmd -- --quiet" fi fi mkdir target || true ${cmd} 2>&1 | tee > target/output if [[ ${PIPESTATUS[0]} != 0 ]]; then cat target/output return 1 fi } cargo_test_impl() { ORIGINAL_RUSTFLAGS=${RUSTFLAGS} RUSTFLAGS="${ORIGINAL_RUSTFLAGS} --cfg test_v16 --cfg test_v32 --cfg test_v64" cargo_test ${@} RUSTFLAGS="${ORIGINAL_RUSTFLAGS} --cfg test_v128 --cfg test_v256" cargo_test ${@} RUSTFLAGS="${ORIGINAL_RUSTFLAGS} --cfg test_v512" cargo_test ${@} RUSTFLAGS=${ORIGINAL_RUSTFLAGS} } # Debug run: if [[ "${TARGET}" != "wasm32-unknown-unknown" ]]; then # Run wasm32-unknown-unknown in release mode only cargo_test_impl fi if [[ "${TARGET}" == "x86_64-unknown-linux-gnu" ]] || [[ "${TARGET}" == "x86_64-pc-windows-msvc" ]]; then # use sleef on linux and windows x86_64 builds # FIXME: Use `core_arch,sleef-sys` features once they works again cargo_test_impl --release --features=into_bits else # FIXME: Use `core_arch` feature once it works again cargo_test_impl --release --features=into_bits fi # Verify code generation if [[ "${NOVERIFY}" != "1" ]]; then cp -r verify/verify target/verify export STDSIMD_ASSERT_INSTR_LIMIT=30 if [[ "${TARGET}" == "i586-unknown-linux-gnu" ]]; then export STDSIMD_ASSERT_INSTR_LIMIT=50 fi cargo_test --release --manifest-path=target/verify/Cargo.toml fi # FIXME: Figure out which examples take too long to run and ignore or adjust those #. ci/run_examples.sh ================================================ FILE: ci/run_examples.sh ================================================ # Runs all examples. # FIXME: https://github.com/rust-lang-nursery/packed_simd/issues/55 # All examples fail to build for `armv7-apple-ios`. 
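Stepping back to `ci/run.sh` above: `cargo_test_impl` runs the suite three times, each time injecting a different set of `--cfg` flags (`test_v16`/`test_v32`/`test_v64`, then `test_v128`/`test_v256`, then `test_v512`) through `RUSTFLAGS`, so each invocation only compiles the tests for a subset of vector widths. A minimal sketch, assuming a conventional cfg-gated test module (the module and assertions are illustrative, not the crate's actual test layout):

```rust
// Only compiled when RUSTFLAGS contains `--cfg test_v256`,
// e.g. the second batch run by cargo_test_impl in ci/run.sh.
#[cfg(all(test, test_v256))]
mod test_v256_widths {
    use packed_simd::f32x8; // 256-bit wide vector: 8 x f32

    #[test]
    fn splat_and_sum() {
        let v = f32x8::splat(2.0);
        assert_eq!(v.sum(), 16.0);
    }
}
```

Splitting the widths across separate compilations presumably keeps compile times and test binaries manageable, which matters most on the slower emulated targets driven by the Dockerfiles above.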
if [[ ${TARGET} == "armv7-apple-ios" ]]; then exit 0 fi # FIXME: travis exceeds 50 minutes on these targets # Skipping the examples is an attempt at preventing travis from timing-out if [[ ${TARGET} == "arm-linux-androidabi" ]] || [[ ${TARGET} == "aarch64-linux-androidabi" ]] \ || [[ ${TARGET} == "sparc64-unknown-linux-gnu" ]]; then exit 0 fi if [[ ${TARGET} == "wasm32-unknown-unknown" ]]; then exit 0 fi cp -r examples/aobench target/aobench cargo_test --manifest-path=target/aobench/Cargo.toml --release --no-default-features cargo_test --manifest-path=target/aobench/Cargo.toml --release --features=256bit cp -r examples/dot_product target/dot_product cargo_test --manifest-path=target/dot_product/Cargo.toml --release cp -r examples/fannkuch_redux target/fannkuch_redux cargo_test --manifest-path=target/fannkuch_redux/Cargo.toml --release # FIXME: https://github.com/rust-lang-nursery/packed_simd/issues/56 if [[ ${TARGET} != "i586-unknown-linux-gnu" ]]; then cp -r examples/mandelbrot target/mandelbrot cargo_test --manifest-path=target/mandelbrot/Cargo.toml --release fi cp -r examples/matrix_inverse target/matrix_inverse cargo_test --manifest-path=target/matrix_inverse/Cargo.toml --release cp -r examples/nbody target/nbody cargo_test --manifest-path=target/nbody/Cargo.toml --release cp -r examples/spectral_norm target/spectral_norm cargo_test --manifest-path=target/spectral_norm/Cargo.toml --release if [[ ${TARGET} != "i586-unknown-linux-gnu" ]]; then cp -r examples/stencil target/stencil cargo_test --manifest-path=target/stencil/Cargo.toml --release fi cp -r examples/triangle_xform target/triangle_xform cargo_test --manifest-path=target/triangle_xform/Cargo.toml --release ================================================ FILE: ci/runtest-android.rs ================================================ use std::env; use std::process::Command; use std::path::{Path, PathBuf}; fn main() { let args = env::args_os() .skip(1) .filter(|arg| arg != "--quiet") .collect::>(); assert_eq!(args.len(), 1); let test = PathBuf::from(&args[0]); let dst = Path::new("/data/local/tmp").join(test.file_name().unwrap()); let status = Command::new("adb") .arg("wait-for-device") .status() .expect("failed to run: adb wait-for-device"); assert!(status.success()); let status = Command::new("adb") .arg("push") .arg(&test) .arg(&dst) .status() .expect("failed to run: adb pushr"); assert!(status.success()); let output = Command::new("adb") .arg("shell") .arg(&dst) .output() .expect("failed to run: adb shell"); assert!(status.success()); println!("status: {}\nstdout ---\n{}\nstderr ---\n{}", output.status, String::from_utf8_lossy(&output.stdout), String::from_utf8_lossy(&output.stderr)); let stdout = String::from_utf8_lossy(&output.stdout); let mut lines = stdout.lines().filter(|l| l.starts_with("test result")); if !lines.all(|l| l.contains("test result: ok") && l.contains("0 failed")) { panic!("failed to find successful test run"); } } ================================================ FILE: ci/setup_benchmarks.sh ================================================ #!/usr/bin/env bash set -ex # Get latest ISPC binary for the target and put it in the path git clone https://github.com/gnzlbg/ispc-binaries cp ispc-binaries/ispc-${TARGET} ispc ================================================ FILE: ci/test-runner-linux ================================================ #!/bin/sh set -e arch=$1 prog=$2 cd /qemu/init cp -f $2 prog find . | cpio --create --format='newc' --quiet | gzip > ../initrd.gz cd .. 
timeout 30s qemu-system-$arch \ -m 1024 \ -nographic \ -kernel kernel \ -initrd initrd.gz \ -append init=/prog > output || true # remove kernel messages tr -d '\r' < output | egrep -v '^\[' # if the output contains a failure, return error ! grep FAILED output > /dev/null ================================================ FILE: contributing.md ================================================ # Contributing to `packed_simd` Welcome! If you are reading this document, it means you are interested in contributing to the `packed_simd` crate. ## Reporting issues All issues with this crate are tracked using GitHub's [Issue Tracker]. You can use issues to bring bugs to the attention of the maintainers, to discuss certain problems encountered with the crate, or to request new features (although feature requests should be limited to things mentioned in the [RFC]). One thing to keep in mind is to always use the **latest** nightly toolchain when working on this crate. Due to the nature of this project, we use a lot of unstable features, meaning breakage happens often. [Issue Tracker]: https://github.com/rust-lang-nursery/packed_simd/issues [RFC]: https://github.com/rust-lang/rfcs/pull/2366 ### LLVM issues The Rust compiler relies on [LLVM](https://llvm.org/) for machine code generation, and quite a few LLVM bugs have been discovered during the development of this project. If you encounter issues with incorrect/suboptimal codegen, which you do not encounter when using the [SIMD vendor intrinsics](https://doc.rust-lang.org/nightly/std/arch/), it is likely the issue is with LLVM, or this crate's interaction with it. You should first open an issue **in this repo** to help us track the problem, and we will help determine what is the exact cause of the problem. If LLVM is indeed the cause, the issue will be reported upstream to the [LLVM bugtracker](https://bugs.llvm.org/). ## Submitting Pull Requests New code is submitted to the crate using GitHub's [pull request] mechanism. You should first fork this repository, make your changes (preferably in a new branch), then use GitHub's web UI to create a new PR. [pull request]: https://help.github.com/articles/about-pull-requests/ ### Examples The `examples` directory contains code showcasing SIMD code written with this crate, usually in comparison to scalar or ISPC code. If you have a project / idea which uses SIMD, we'd love to add it to the examples list. Every example should include a small `README`, describing the example code's purpose. If your example could potentially work as a benchmark, then add a `benchmark.sh` script to allow running the example benchmark code in CI. See an existing example's [`benchmark.sh`](examples/aobench/benchmark.sh) for a sample. Don't forget to update the crate's top-level `README` with a link to your example. ### Perf guide The objective of the [performance guide][perf-guide] is to be a comprehensive resource detailing the process of optimizing Rust code with SIMD support. If you believe a certain section could be reworded, or if you have any tips & tricks related to SIMD which you'd like to share, please open a PR. [mdBook] is used to manage the formatting of the guide as a book. 
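Returning to the LLVM-issues guidance in `contributing.md` above: a practical way to compare this crate's codegen against the vendor intrinsics is to compile two functionally equivalent kernels and inspect their assembly. A minimal, hypothetical sketch (the function names and the choice of operation are illustrative):

```rust
// Compare the generated code of these two kernels, e.g. with
// `cargo rustc --release -- --emit=asm` or `objdump -d`.
use packed_simd::f32x4;

#[inline(never)]
pub fn add_packed(a: f32x4, b: f32x4) -> f32x4 {
    a + b // expected to lower to an `addps` on x86_64
}

#[cfg(target_arch = "x86_64")]
#[inline(never)]
pub fn add_intrinsic(
    a: std::arch::x86_64::__m128,
    b: std::arch::x86_64::__m128,
) -> std::arch::x86_64::__m128 {
    // `_mm_add_ps` maps directly to `addps`; SSE is part of the x86_64 baseline.
    unsafe { std::arch::x86_64::_mm_add_ps(a, b) }
}
```

If the `packed_simd` version produces materially worse code than the intrinsic version for the same operation, that is the kind of discrepancy worth reporting in an issue here first.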
[perf-guide]: https://rust-lang-nursery.github.io/packed_simd/perf-guide/ [mdBook]: https://github.com/rust-lang-nursery/mdBook ================================================ FILE: examples/Cargo.toml ================================================ # FIXME: Many members of this workspace, including aobench, mandelbrot, and stencil, # currently trigger a "null pointer deref" warning. # This is likely due to unsoundness inside packed_simd. [workspace] members = [ "aobench", "dot_product", "fannkuch_redux", "mandelbrot", "matrix_inverse", "nbody", "options_pricing", "slice_sum", "spectral_norm", "stencil", "triangle_xform", ] [profile.release] # Remember to uncomment this when profiling # debug = 2 # You can set the following to lto = 'thin' and 'codegen-units=16' # for better compile times at the cost of performance lto = 'fat' codegen-units = 1 incremental = false panic = 'abort' [profile.bench] # Same as above lto = 'fat' codegen-units = 1 incremental = false ================================================ FILE: examples/aobench/Cargo.toml ================================================ [package] name = "aobench" version = "0.1.0" authors = ["gnzlbg "] autobenches = false edition = "2018" [[bin]] name = "aobench" path = "src/main.rs" [lib] name = "aobench_lib" path = "src/lib.rs" [dependencies] structopt = "^0.3" failure = "^0.1" png = "^0.15" packed_simd = { package = "packed_simd", path = "../.." } rayon = "^1.0" time = "^0.1" cfg-if = "^0.1" ispc = { version = "^1.0.4", optional = true } [build-dependencies] ispc = { version = "^1.0.4", optional = true } [dev-dependencies] criterion = { version = '^0.3', features=['real_blackbox'] } [features] default = [ "256bit" ] 256bit = [] sleef-sys = [ "packed_simd/sleef-sys" ] core_arch = [ "packed_simd/core_arch" ] [[bench]] name = "isec_sphere" path = "benches/isec_sphere.rs" harness = false [[bench]] name = "isec_plane" path = "benches/isec_plane.rs" harness = false [[bench]] name = "ambient_occlusion" path = "benches/ambient_occlusion.rs" harness = false [[bench]] name = "random" path = "benches/random.rs" harness = false ================================================ FILE: examples/aobench/benches/ambient_occlusion.rs ================================================ //! Benchmarks intersection between rays and planes #![feature(stdsimd)] use aobench_lib::*; use criterion::*; use intersection::Isect; use aobench_lib::scene::Test; fn hit_scalar(c: &mut Criterion) { let mut scene = Test::default(); c.bench( "scalar", Benchmark::new("ao_hit", move |b| { b.iter(|| { let mut isect = Isect::default(); let isect = black_box(&mut isect); let s = black_box(&mut scene); let mut v = ambient_occlusion::scalar(s, isect); black_box(&mut v); }) }) .throughput(Throughput::Elements(1)), ); } fn hit_vector(c: &mut Criterion) { let mut scene = Test::default(); c.bench( "vector", Benchmark::new("ao_hit", move |b| { b.iter(|| { let mut isect = Isect::default(); let isect = black_box(&mut isect); let s = black_box(&mut scene); let mut v = ambient_occlusion::vector(s, isect); black_box(&mut v); }) }) .throughput(Throughput::Elements(1)), ); } criterion_group!(benches, hit_scalar, hit_vector); criterion_main!(benches); ================================================ FILE: examples/aobench/benches/isec_plane.rs ================================================ //! 
Benchmarks intersection between rays and planes #![feature(stdsimd)] use criterion::*; use crate::geometry::{f32xN, Plane, Ray, RayxN, V3DxN, V3D}; use crate::intersection::{Intersect, Isect, IsectxN}; use aobench_lib::*; fn hit_scalar(c: &mut Criterion) { let mut s = Plane { p: V3D { x: 0., y: 0., z: 10., }, n: V3D { x: 0., y: 0., z: 1., }, }; let mut r = Ray { origin: V3D { x: 0., y: 0., z: 0., }, dir: V3D { x: 0., y: 0., z: 1., }, }; c.bench( "scalar", Benchmark::new("isec_plane_hit", move |b| { b.iter(|| { let mut isect = Isect::default(); let isect = black_box(&mut isect); let s = black_box(&mut s); let r = black_box(&mut r); let mut v = r.intersect(s, *isect); black_box(&mut v); assert_eq!(v.hit, true); }) }) .throughput(Throughput::Elements(1)), ); } fn miss_scalar(c: &mut Criterion) { let mut s = Plane { p: V3D { x: 0., y: 0., z: -10., }, n: V3D { x: 0., y: 0., z: 1., }, }; let mut r = Ray { origin: V3D { x: 0., y: 0., z: 0., }, dir: V3D { x: 0., y: 0., z: 1., }, }; c.bench( "scalar", Benchmark::new("isec_plane_miss", move |b| { b.iter(|| { let mut isect = Isect::default(); let isect = black_box(&mut isect); let s = black_box(&mut s); let r = black_box(&mut r); let mut v = r.intersect(s, *isect); black_box(&mut v); assert_eq!(v.hit, false); }) }) .throughput(Throughput::Elements(1)), ); } fn hit_vector(c: &mut Criterion) { let mut s = Plane { p: V3D { x: 0., y: 0., z: 10., }, n: V3D { x: 0., y: 0., z: 1., }, }; let mut r = RayxN { origin: V3DxN { x: f32xN::splat(0.), y: f32xN::splat(0.), z: f32xN::splat(0.), }, dir: V3DxN { x: f32xN::splat(0.), y: f32xN::splat(0.), z: f32xN::splat(1.), }, }; c.bench( "vector", Benchmark::new("isec_plane_hit", move |b| { b.iter(|| { let mut isect = IsectxN::default(); let isect = black_box(&mut isect); let s = black_box(&mut s); let r = black_box(&mut r); let mut v = r.intersect(s, *isect); black_box(&mut v); assert_eq!(v.hit.all(), true); }) }) .throughput(Throughput::Elements(f32xN::lanes() as u64)), ); } fn miss_vector(c: &mut Criterion) { let mut s = Plane { p: V3D { x: 0., y: 0., z: -10., }, n: V3D { x: 0., y: 0., z: 1., }, }; let mut r = RayxN { origin: V3DxN { x: f32xN::splat(0.), y: f32xN::splat(0.), z: f32xN::splat(0.), }, dir: V3DxN { x: f32xN::splat(0.), y: f32xN::splat(0.), z: f32xN::splat(1.), }, }; c.bench( "vector", Benchmark::new("isec_plane_miss", move |b| { b.iter(|| { let mut isect = IsectxN::default(); let isect = black_box(&mut isect); let s = black_box(&mut s); let r = black_box(&mut r); let mut v = r.intersect(s, *isect); black_box(&mut v); assert_eq!(v.hit.any(), false); }) }) .throughput(Throughput::Elements(f32xN::lanes() as u64)), ); } criterion_group!(benches, hit_scalar, miss_scalar, hit_vector, miss_vector); criterion_main!(benches); ================================================ FILE: examples/aobench/benches/isec_sphere.rs ================================================ //! 
Benchmarks intersection between rays and spheres #![feature(stdsimd)] use crate::geometry::{f32xN, Ray, RayxN, Sphere, V3DxN, V3D}; use crate::intersection::{Intersect, Isect, IsectxN}; use aobench_lib::*; use criterion::*; fn hit_scalar(c: &mut Criterion) { let mut s = Sphere { center: V3D { x: 0., y: 0., z: 10., }, radius: 1., }; let mut r = Ray { origin: V3D { x: 0., y: 0., z: 0., }, dir: V3D { x: 0., y: 0., z: 1., }, }; c.bench( "scalar", Benchmark::new("isec_sphere_hit", move |b| { b.iter(|| { let mut isect = Isect::default(); let isect = black_box(&mut isect); let s = black_box(&mut s); let r = black_box(&mut r); let mut v = r.intersect(s, *isect); black_box(&mut v); assert_eq!(v.hit, true); }) }) .throughput(Throughput::Elements(1)), ); } fn miss_scalar(c: &mut Criterion) { let mut s = Sphere { center: V3D { x: 0., y: 0., z: -10., }, radius: 1., }; let mut r = Ray { origin: V3D { x: 0., y: 0., z: 0., }, dir: V3D { x: 0., y: 0., z: 1., }, }; c.bench( "scalar", Benchmark::new("isec_sphere_miss", move |b| { b.iter(|| { let mut isect = Isect::default(); let isect = black_box(&mut isect); let s = black_box(&mut s); let r = black_box(&mut r); let mut v = r.intersect(s, *isect); black_box(&mut v); assert_eq!(v.hit, false); }) }) .throughput(Throughput::Elements(1)), ); } fn hit_vector(c: &mut Criterion) { let mut s = Sphere { center: V3D { x: 0., y: 0., z: 10., }, radius: 1., }; let mut r = RayxN { origin: V3DxN { x: f32xN::splat(0.), y: f32xN::splat(0.), z: f32xN::splat(0.), }, dir: V3DxN { x: f32xN::splat(0.), y: f32xN::splat(0.), z: f32xN::splat(1.), }, }; c.bench( "vector", Benchmark::new("isec_sphere_hit", move |b| { b.iter(|| { let mut isect = IsectxN::default(); let isect = black_box(&mut isect); let s = black_box(&mut s); let r = black_box(&mut r); let mut v = r.intersect(s, *isect); black_box(&mut v); assert_eq!(v.hit.all(), true); }) }) .throughput(Throughput::Elements(f32xN::lanes() as u64)), ); } fn miss_vector(c: &mut Criterion) { let mut s = Sphere { center: V3D { x: 0., y: 0., z: -10., }, radius: 1., }; let mut r = RayxN { origin: V3DxN { x: f32xN::splat(0.), y: f32xN::splat(0.), z: f32xN::splat(0.), }, dir: V3DxN { x: f32xN::splat(0.), y: f32xN::splat(0.), z: f32xN::splat(1.), }, }; c.bench( "vector", Benchmark::new("isec_sphere_miss", move |b| { b.iter(|| { let mut isect = IsectxN::default(); let isect = black_box(&mut isect); let s = black_box(&mut s); let r = black_box(&mut r); let mut v = r.intersect(s, *isect); black_box(&mut v); assert_eq!(v.hit.any(), false); }) }) .throughput(Throughput::Elements(f32xN::lanes() as u64)), ); } criterion_group!(benches, hit_scalar, miss_scalar, hit_vector, miss_vector); criterion_main!(benches); ================================================ FILE: examples/aobench/benches/random.rs ================================================ //! 
Benchmarks PNRG #![feature(stdsimd)] use aobench_lib::geometry::f32xN; use aobench_lib::random; use criterion::*; fn random_scalar(c: &mut Criterion) { c.bench( "scalar", Benchmark::new("random", move |b| { let mut rng = random::scalar::thread_rng(); b.iter(|| { black_box(rng.gen()); }) }) .throughput(Throughput::Elements(1)), ); } fn random_vector(c: &mut Criterion) { c.bench( "vector", Benchmark::new("random", move |b| { let mut rng = random::vector::thread_rng(); b.iter(|| { black_box(rng.gen()); }) }) .throughput(Throughput::Elements(f32xN::lanes() as u64)), ); } criterion_group!(benches, random_scalar, random_vector); criterion_main!(benches); ================================================ FILE: examples/aobench/benches/scanlines.rs ================================================ #![feature(test)] use test::{black_box, Bencher}; #[bench] fn scanlines_scalar(b: &mut Bencher) { let width = 50; let height = 50; let width = black_box(width); let height = black_box(height); let mut fdata = Vec::new(); fdata.resize(width * height * 3, 0.); fdata = black_box(fdata); b.iter(|| { black_box(&mut fdata); aobench_lib::scalar::scanlines(0, height, width, height, 2, &mut fdata); }); } #[bench] fn scanlines_vector(b: &mut Bencher) { let width = 50; let height = 50; let width = black_box(width); let height = black_box(height); let mut fdata = Vec::new(); fdata.resize(width * height * 3, 0.); fdata = black_box(fdata); b.iter(|| { black_box(&mut fdata); aobench_lib::vector::scanlines(0, height, width, height, 2, &mut fdata); }); } ================================================ FILE: examples/aobench/benchmark.sh ================================================ #!/usr/bin/env bash # # Runs aobench benchmarks set -ex export WIDTH=800 export HEIGHT=600 if [[ ${NORUN} != 1 ]]; then hash hyperfine 2>/dev/null || { echo >&2 "hyperfine is not in PATH."; exit 1; } fi ALGS=("scalar" "scalar_par" "vector" "vector_par" "tiled" "tiled_par") if echo "$FEATURES" | grep -q "ispc"; then hash ispc 2>/dev/null || { echo >&2 "ispc is not in PATH."; exit 1; } ALGS+=("ispc" "ispc_tasks") fi echo "Benchmark 256-bit wide vectors" RUSTFLAGS="-C target-cpu=native ${RUSTFLAGS}" \ cargo build --release --no-default-features \ --features="${FEATURES},256bit" if [[ "${VERIFY}" == "1" ]]; then RUSTFLAGS="-C target-cpu=native ${RUSTFLAGS}" \ cargo test --release --no-default-features \ --features="${FEATURES},256bit" fi if [[ "${NORUN}" == "1" ]]; then exit 0 fi for alg in "${ALGS[@]}" do hyperfine "../target/release/aobench ${WIDTH} ${HEIGHT} --algo ${alg}" done echo "Benchmark 128-bit wide vectors" RUSTFLAGS="-C target-cpu=native ${RUSTFLAGS}" \ cargo build --release --no-default-features \ --features="${FEATURES}" for alg in "${ALGS[@]}" do hyperfine "../target/release/aobench ${WIDTH} ${HEIGHT} --algo ${alg}" done ================================================ FILE: examples/aobench/build.rs ================================================ fn main() { println!("cargo:rerun-if-changed=build.rs"); #[cfg(feature = "ispc")] { if std::env::var("CARGO_FEATURE_ISPC").is_ok() { let mut cfg = ispc::Config::new(); if cfg!(windows) { cfg.debug(false); } let ispc_files = vec!["volta/ao.ispc"]; for s in &ispc_files[..] 
{ cfg.file(*s); } cfg.target_isas(vec![ ispc::opt::TargetISA::SSE2i32x4, ispc::opt::TargetISA::SSE4i32x4, ispc::opt::TargetISA::AVX1i32x8, ispc::opt::TargetISA::AVX2i32x8, ispc::opt::TargetISA::AVX512KNLi32x16, ]); cfg.compile("aobench"); } } }
================================================ FILE: examples/aobench/readme.md ================================================

# Ambient Occlusion Benchmark

> Originally written by Syoyo Fujita: https://github.com/syoyo/aobench

`aobench` is a small ambient occlusion renderer for benchmarking real-world floating point performance in various languages.

![image_vector_par](https://user-images.githubusercontent.com/904614/41043073-653aa5be-69a3-11e8-8a9d-007def8516cc.png)

## Instructions

To run it with the default target options (replace `${NAME}` with an algorithm name):

```
> cargo run --release -- 800 600 --algo ${NAME}
```

Use `RUSTFLAGS` to set the target CPU, for example:

```
> RUSTFLAGS="-C target-cpu=native" cargo run --release -- 800 600 --algo ${NAME}
```

## Results

```
./benchmark.sh
```

On a dual core AVX1 i5 @1.8 GHz:

| 800 x 600    | time [ms] (Rust) | speedup vs `scalar` [-] |
|--------------|------------------|-------------------------|
| `scalar`     | 5884             | 1.0x                    |
| `scalar_par` | 2206             | 2.7x                    |
| `vector`     | 1458             | 4.0x                    |
| `vector_par` | 622              | 9.5x                    |
| `tiled`      | 1328             | 4.4x                    |
| `tiled_par`  | 578              | 10.2x                   |
| `ispc`       | 1158             | 5.1x                    |
| `ispc_tasks` | 567              | 10.4x                   |

`tiled_par` is 1.02x slower than `ispc_tasks`.

On a 28 core Xeon CPU E5-2690 v4 @ 2.60GHz:

| 800 x 600    | time [ms] (Rust) | speedup vs `scalar` [-] |
|--------------|------------------|-------------------------|
| `scalar`     | 2981             | 1.0x                    |
| `scalar_par` | 163              | 18.2x                   |
| `vector`     | 692              | 4.3x                    |
| `vector_par` | 98               | 30.4x                   |
| `tiled`      | 640              | 4.7x                    |
| `tiled_par`  | 98               | 30.4x                   |
| `ispc`       | 576              | 5.2x                    |
| `ispc_tasks` | 150              | 19.9x                   |

`tiled_par` is 1.53x faster than `ispc_tasks`.

On a 40 core Xeon Gold 6148 CPU @ 2.40GHz:

| 800 x 600 | time [ms]
Rust | speedup vs `scalar` [-] | |--------------|---------------------|-------------------------| | `scalar` | 3215 | 1.0x | | `scalar_par` | 186 | 17.0x | | `vector` | 802 | 4.0x | | `vector_par` | 106 | 30.3x | | `tiled` | 770 | 4.2x | | `tiled_par` | 102 | 32.1x | | `ispc` | 491 | 6.5x | | `ispc_tasks` | 153 | 21.7x | `tiled_par` is 1.5x faster than `ispc_tasks`. ## Overview There are 4 main pieces in the `aobench` benchmark: * ray-plane intersection algorithm: [source](https://github.com/rust-lang-nursery/packed_simd/tree/master/examples/aobench/src/intersection/ray_plane.rs) * ray-sphere intersection algorithm: [source](https://github.com/rust-lang-nursery/packed_simd/tree/master/examples/aobench/src/intersection/ray_sphere.rs) * ambient occlusion algorithm: [source](https://github.com/rust-lang-nursery/packed_simd/tree/master/examples/aobench/src/ambient_occlusion.rs) * ray-casting the pixels: * scalar serial: [source](https://github.com/rust-lang-nursery/packed_simd/tree/master/examples/aobench/src/scalar.rs) * scalar parallel: [source](https://github.com/rust-lang-nursery/packed_simd/tree/master/examples/aobench/src/scalar_parallel.rs) * vector serial: [source](https://github.com/rust-lang-nursery/packed_simd/tree/master/examples/aobench/src/vector.rs) * vector parallel: [source](https://github.com/rust-lang-nursery/packed_simd/tree/master/examples/aobench/src/vector_parallel.rs) The scalar and vectorized implementations of the intersection and ao algorithms are in the same file so that they can be easily compared. As a comparison, the ISPC sources of the same benchmark are [here](https://github.com/ispc/ispc/tree/master/examples/aobench). ================================================ FILE: examples/aobench/rustfmt.toml ================================================ max_width = 79 ================================================ FILE: examples/aobench/src/ambient_occlusion.rs ================================================ //! Ambient Occlusion implementations use crate::geometry::{f32xN, Ray, RayxN, Selectable, V3DxN, V3D}; use crate::intersection::{Intersect, Isect, IsectxN}; use crate::scene::Scene; use std::f32::consts::PI; /// Scalar ambient occlusion algorithm #[inline(always)] pub fn scalar(scene: &mut S, isect: &Isect) -> f32 { let mut occlusion: f32 = 0.0; let basis = isect.n.ortho_basis(); let eps: f32 = 0.0001; let origin = isect.p + eps * isect.n; let ntheta: usize = S::NAO_SAMPLES; let nphi: usize = S::NAO_SAMPLES; for _i in 0..ntheta { for _j in 0..nphi { let theta = scene.rand().sqrt(); let phi = 2. * PI * scene.rand(); let n = V3D { x: phi.cos() * theta, y: phi.sin() * theta, z: (1.0 - theta * theta).sqrt(), }; let dir = basis * n; let ray = Ray { origin, dir }; let mut occ_isect = Isect::default(); for s in scene.spheres() { occ_isect = ray.intersect(s, occ_isect); } occ_isect = ray.intersect(scene.plane(), occ_isect); if occ_isect.hit { occlusion += 1.; } } } 1. 
- occlusion / (ntheta * nphi) as f32 } /// Vectorized ambient occlusion algorithm using ray packets #[inline(always)] pub fn vector(scene: &mut S, isect: &Isect) -> f32 { let mut occlusion = f32xN::splat(0.0); let basis = isect.n.ortho_basis(); let eps: f32 = 0.0001; let origin = isect.p + eps * isect.n; let origin = V3DxN { x: f32xN::splat(origin.x), y: f32xN::splat(origin.y), z: f32xN::splat(origin.z), }; let ntheta: usize = S::NAO_SAMPLES; let nphi: usize = S::NAO_SAMPLES; for _i in 0..ntheta { for _j in (0..nphi).step_by(f32xN::lanes()) { let (theta, phi) = scene.rand_f32xN(); let theta = theta.sqrte(); let (sin, cos) = (2. * phi).sin_cos_pi(); let n = V3DxN { x: cos * theta, y: sin * theta, z: (f32xN::splat(1.0) - theta * theta).sqrt(), }; let dir = basis * n; let ray = RayxN { origin, dir }; let mut occ_isect = IsectxN::default(); for s in scene.spheres() { occ_isect = ray.intersect(s, occ_isect); } occ_isect = ray.intersect(scene.plane(), occ_isect); occlusion += occ_isect.hit.sel(f32xN::splat(1.), f32xN::splat(0.)); } } 1. - occlusion.sum() / (ntheta * nphi) as f32 } /// Vectorized ambient occlusion algorithm using ray packets #[inline(always)] pub fn vector_tiled(scene: &mut S, isect: &IsectxN) -> f32xN { let mut occlusion = f32xN::splat(0.0); let basis = isect.n.ortho_basis(); let eps = f32xN::splat(0.0001); let origin = isect.p + eps * isect.n; let ntheta: usize = S::NAO_SAMPLES; let nphi: usize = S::NAO_SAMPLES; for _i in 0..ntheta { for _j in 0..nphi { let (theta, phi) = scene.rand_f32xN(); let theta = theta.sqrte(); let (sin, cos) = (2. * phi).sin_cos_pi(); let n = V3DxN { x: cos * theta, y: sin * theta, z: (1.0 - theta * theta).sqrt(), }; let dir = basis * n; let ray = RayxN { origin, dir }; let mut occ_isect = IsectxN::default(); for s in scene.spheres() { occ_isect = ray.intersect(s, occ_isect); } occ_isect = ray.intersect(scene.plane(), occ_isect); occlusion += occ_isect.hit.sel(f32xN::splat(1.), f32xN::splat(0.)); } } f32xN::splat(1.) - occlusion / (ntheta * nphi) as f32 } #[cfg(test)] mod tests { use super::*; use crate::geometry::V3D; #[test] fn sanity_hit() { let scene = crate::scene::Test::default(); let mut scene_scalar = scene.clone(); let mut scene_vector = scene.clone(); let ray = Ray { origin: V3D::default(), dir: V3D { x: -0.2, y: -0.2, z: -0.2, }, }; let mut isect = Isect::default(); for s in scene.spheres() { isect = ray.intersect(s, isect); } isect = ray.intersect(scene.plane(), isect); assert!(isect.hit); let ao_scalar = scalar(&mut scene_scalar, &isect); let ao_vector = vector(&mut scene_vector, &isect); assert_eq!(ao_scalar, ao_vector); } #[test] fn sanity_miss() { let scene = crate::scene::Test::default(); let mut scene_scalar = scene.clone(); let mut scene_vector = scene.clone(); let ray = Ray { origin: V3D::default(), dir: V3D { x: 0.2, y: 0.2, z: 0.2, }, }; let mut isect = Isect::default(); for s in scene.spheres() { isect = ray.intersect(s, isect); } isect = ray.intersect(scene.plane(), isect); assert!(!isect.hit); let ao_scalar = scalar(&mut scene_scalar, &isect); let ao_vector = vector(&mut scene_vector, &isect); assert_eq!(ao_scalar, ao_vector); } } ================================================ FILE: examples/aobench/src/geometry/mod.rs ================================================ //! 
Geometry utilities use packed_simd::*; mod plane; mod ray; mod sphere; mod vec; mod rayxN; mod vecxN; pub use self::plane::Plane; pub use self::ray::Ray; pub use self::sphere::Sphere; pub use self::vec::{Dot, M3x3, V3D}; pub use self::rayxN::RayxN; pub use self::vecxN::{Selectable, V3DxN}; #[cfg(feature = "256bit")] pub type f32xN = f32x8; #[cfg(feature = "256bit")] pub type u32xN = u32x8; #[cfg(feature = "256bit")] pub type usizexN = usizex8; #[cfg(feature = "256bit")] pub type m32xN = m32x8; #[cfg(feature = "256bit")] pub type pf32xN = Simd<[*mut f32; 8]>; #[cfg(not(feature = "256bit"))] pub type f32xN = f32x4; #[cfg(not(feature = "256bit"))] pub type u32xN = u32x4; #[cfg(not(feature = "256bit"))] pub type usizexN = usizex4; #[cfg(not(feature = "256bit"))] pub type m32xN = m32x4; #[cfg(not(feature = "256bit"))] pub type pf32xN = Simd<[*mut f32; 4]>; pub trait IncrV { type Element; fn incr(x: Self::Element, step: Self::Element) -> Self; } impl IncrV for f32xN { type Element = f32; #[inline(always)] fn incr(x: f32, step: f32) -> Self { #[cfg(feature = "256bit")] { Self::new( x + 0. * step, x + 1. * step, x + 2. * step, x + 3. * step, x + 4. * step, x + 5. * step, x + 6. * step, x + 7. * step, ) } #[cfg(not(feature = "256bit"))] { Self::new( x + 0. * step, x + 1. * step, x + 2. * step, x + 3. * step, ) } } } impl IncrV for u32xN { type Element = u32; #[inline(always)] fn incr(x: u32, step: u32) -> Self { #[cfg(feature = "256bit")] { Self::new( x + 0 * step, x + 1 * step, x + 2 * step, x + 3 * step, x + 4 * step, x + 5 * step, x + 6 * step, x + 7 * step, ) } #[cfg(not(feature = "256bit"))] { Self::new(x + 0 * step, x + 1 * step, x + 2 * step, x + 3 * step) } } } impl IncrV for usizexN { type Element = usize; #[inline(always)] fn incr(x: usize, step: usize) -> Self { #[cfg(feature = "256bit")] { Self::new( x + 0 * step, x + 1 * step, x + 2 * step, x + 3 * step, x + 4 * step, x + 5 * step, x + 6 * step, x + 7 * step, ) } #[cfg(not(feature = "256bit"))] { Self::new(x + 0 * step, x + 1 * step, x + 2 * step, x + 3 * step) } } } ================================================ FILE: examples/aobench/src/geometry/plane.rs ================================================ //! Plane use crate::geometry::V3D; #[derive(Copy, Clone, Debug)] pub struct Plane { pub p: V3D, pub n: V3D, } ================================================ FILE: examples/aobench/src/geometry/ray.rs ================================================ //! A ray use crate::geometry::V3D; /// Ray starting at `origin` in `dir` direction. #[derive(Copy, Clone, Debug)] pub struct Ray { pub origin: V3D, pub dir: V3D, } ================================================ FILE: examples/aobench/src/geometry/rayxN.rs ================================================ //! Four packed rays use crate::geometry::{Ray, V3DxN}; /// Four packed rays starting at `origin` in `dir` direction. #[derive(Copy, Clone, Debug)] pub struct RayxN { pub origin: V3DxN, pub dir: V3DxN, } impl RayxN { pub fn get(&self, idx: usize) -> Ray { Ray { origin: self.origin.get(idx), dir: self.dir.get(idx), } } } ================================================ FILE: examples/aobench/src/geometry/sphere.rs ================================================ //! Sphere use crate::geometry::V3D; #[derive(Copy, Clone, Debug)] pub struct Sphere { pub center: V3D, pub radius: f32, } ================================================ FILE: examples/aobench/src/geometry/vec.rs ================================================ //! 
A simple vector type use std::ops::*; #[derive(Copy, Clone, Debug, PartialEq)] pub struct V3D { pub x: f32, pub y: f32, pub z: f32, } impl Default for V3D { #[inline(always)] #[must_use] fn default() -> Self { Self { x: 0., y: 0., z: 0., } } } pub type M3x3 = [V3D; 3]; impl V3D { #[inline(always)] #[must_use] pub fn cross(self, o: Self) -> Self { Self { x: self.y * o.z - self.z * o.y, y: self.z * o.x - self.x * o.z, z: self.x * o.y - self.y * o.x, } } #[inline(always)] #[must_use] pub fn normalized(self) -> Self { let len2 = self.dot(self); let invlen = len2.sqrt().recip(); invlen * self } #[inline(always)] #[must_use] pub fn ortho_basis(self) -> M3x3 { let n = self; let mut basis = [Self::default(), Self::default(), n]; if n.x < 0.6 && n.x > -0.6 { basis[1].x = 1.0; } else if n.y < 0.6 && n.y > -0.6 { basis[1].y = 1.0; } else if n.z < 0.6 && n.z > -0.6 { basis[1].z = 1.0; } else { basis[1].x = 1.0; } basis[0] = basis[1].cross(basis[2]).normalized(); basis[1] = basis[2].cross(basis[0]).normalized(); basis } // Fuzzy float comparison between vectors #[inline(always)] #[must_use] pub fn almost_eq(&self, rhs: &Self) -> bool { const EPSILON: f32 = 1E-3; (self.x - rhs.x).abs() < EPSILON && (self.y - rhs.y).abs() < EPSILON && (self.z - rhs.z).abs() < EPSILON } } impl Add for V3D { type Output = Self; #[inline(always)] fn add(self, o: Self) -> Self::Output { Self { x: self.x + o.x, y: self.y + o.y, z: self.z + o.z, } } } impl Sub for V3D { type Output = Self; #[inline(always)] fn sub(self, o: Self) -> Self::Output { Self { x: self.x - o.x, y: self.y - o.y, z: self.z - o.z, } } } impl Mul for V3D { type Output = Self; fn mul(self, o: Self) -> Self::Output { Self { x: self.x * o.x, y: self.y * o.y, z: self.z * o.z, } } } impl Mul for V3D { type Output = Self; #[inline(always)] fn mul(self, o: f32) -> Self::Output { Self { x: self.x * o, y: self.y * o, z: self.z * o, } } } impl Mul for f32 { type Output = V3D; #[inline(always)] fn mul(self, o: V3D) -> Self::Output { o * self } } impl Mul for M3x3 { type Output = V3D; #[inline(always)] fn mul(self, o: V3D) -> Self::Output { V3D { x: o.dot(V3D { x: self[0].x, y: self[1].x, z: self[2].x, }), y: o.dot(V3D { x: self[0].y, y: self[1].y, z: self[2].y, }), z: o.dot(V3D { x: self[0].z, y: self[1].z, z: self[2].z, }), } } } /// Vector dot product pub trait Dot { type Output; fn dot(self, _: O) -> Self::Output; } impl Dot for V3D { type Output = f32; #[inline(always)] fn dot(self, o: Self) -> Self::Output { self.x * o.x + self.y * o.y + self.z * o.z } } ================================================ FILE: examples/aobench/src/geometry/vecxN.rs ================================================ //! 
A simple vector type use std::ops::*; use crate::geometry::{f32xN, m32xN, Dot, M3x3, V3D}; #[derive(Copy, Clone, Debug)] pub struct V3DxN { pub x: f32xN, pub y: f32xN, pub z: f32xN, } impl Default for V3DxN { #[inline(always)] #[must_use] fn default() -> Self { Self { x: f32xN::splat(0.), y: f32xN::splat(0.), z: f32xN::splat(0.), } } } impl V3DxN { #[inline(always)] #[must_use] pub fn normalized(self) -> Self { let len2 = self.dot(self); let invlen = len2.rsqrte(); invlen * self } pub fn get(&self, idx: usize) -> V3D { V3D { x: self.x.extract(idx), y: self.y.extract(idx), z: self.z.extract(idx), } } #[must_use] #[inline(always)] pub fn ortho_basis(self) -> [Self; 3] { let n = self; let mut basis = [Self::default(), Self::default(), n]; let max = f32xN::splat(0.6); let min = f32xN::splat(-0.6); let one = f32xN::splat(1.0); let mx = n.x.lt(max) & n.x.gt(min); let my = n.y.lt(max) & n.y.gt(min); let mz = n.z.lt(max) & n.z.gt(min); basis[1].x = (mx | (!mx & !my & !mz)).select(one, basis[1].x); basis[1].y = (!mx & my).select(one, basis[1].y); basis[1].z = (!mx & !my & mz).select(one, basis[1].z); basis[0] = basis[1].cross(basis[2]).normalized(); basis[1] = basis[2].cross(basis[0]).normalized(); basis } #[inline(always)] #[must_use] pub fn cross(self, o: Self) -> Self { Self { x: self.y * o.z - self.z * o.y, y: self.z * o.x - self.x * o.z, z: self.x * o.y - self.y * o.x, } } } impl Add for V3DxN { type Output = Self; #[inline(always)] fn add(self, o: Self) -> Self::Output { Self { x: self.x + o.x, y: self.y + o.y, z: self.z + o.z, } } } impl Mul for V3DxN { type Output = Self; #[inline(always)] fn mul(self, o: Self) -> Self::Output { Self { x: self.x * o.x, y: self.y * o.y, z: self.z * o.z, } } } impl Mul for f32xN { type Output = V3DxN; #[inline(always)] fn mul(self, o: V3DxN) -> Self::Output { V3DxN { x: self * o.x, y: self * o.y, z: self * o.z, } } } impl Mul for [V3DxN; 3] { type Output = V3DxN; #[inline(always)] fn mul(self, o: V3DxN) -> Self::Output { V3DxN { x: o.dot(V3DxN { x: self[0].x, y: self[1].x, z: self[2].x, }), y: o.dot(V3DxN { x: self[0].y, y: self[1].y, z: self[2].y, }), z: o.dot(V3DxN { x: self[0].z, y: self[1].z, z: self[2].z, }), } } } impl Sub for V3DxN { type Output = Self; #[inline(always)] fn sub(self, o: V3D) -> Self::Output { Self { x: self.x - f32xN::splat(o.x), y: self.y - f32xN::splat(o.y), z: self.z - f32xN::splat(o.z), } } } impl Dot for V3DxN { type Output = f32xN; #[inline(always)] fn dot(self, o: Self) -> Self::Output { self.x.mul_adde(o.x, self.y.mul_adde(o.y, self.z * o.z)) } } impl Dot for V3DxN { type Output = f32xN; #[inline(always)] fn dot(self, o: V3D) -> Self::Output { self.x.mul_adde( f32xN::splat(o.x), self.y.mul_adde(f32xN::splat(o.y), self.z * o.z), ) } } pub trait Selectable { type Output; fn sel(self, a: O, b: P) -> Self::Output; } impl Selectable for m32xN { type Output = f32xN; #[inline(always)] fn sel(self, a: f32xN, b: f32xN) -> f32xN { self.select(a, b) } } impl Selectable for m32xN { type Output = V3DxN; #[inline(always)] fn sel(self, a: V3DxN, b: V3DxN) -> V3DxN { V3DxN { x: self.select(a.x, b.x), y: self.select(a.y, b.y), z: self.select(a.z, b.z), } } } impl Selectable for m32xN { type Output = V3DxN; #[inline(always)] fn sel(self, a: V3D, b: V3DxN) -> V3DxN { V3DxN { x: self.select(f32xN::splat(a.x), b.x), y: self.select(f32xN::splat(a.y), b.y), z: self.select(f32xN::splat(a.z), b.z), } } } impl Mul for M3x3 { type Output = V3DxN; #[inline(always)] fn mul(self, o: V3DxN) -> Self::Output { V3DxN { x: o.x.mul_adde( f32xN::splat(self[0].x), 
o.y.mul_adde( f32xN::splat(self[1].x), o.z * f32xN::splat(self[2].x), ), ), y: o.x.mul_adde( f32xN::splat(self[0].y), o.y.mul_adde( f32xN::splat(self[1].y), o.z * f32xN::splat(self[2].y), ), ), z: o.x.mul_adde( f32xN::splat(self[0].z), o.y.mul_adde( f32xN::splat(self[1].z), o.z * f32xN::splat(self[2].z), ), ), } } } ================================================ FILE: examples/aobench/src/image.rs ================================================ //! Image utilities use failure::Error; #[allow(unused)] use png::{BitDepth, ColorType, Encoder}; use std::path::Path; /// PNG image in RGB format pub struct Image { width: usize, height: usize, data: Vec, pub fdata: Vec, } impl Image { pub fn new(width: usize, height: usize) -> Self { Self { width, height, data: vec![0_u8; width * height * 3 /* RGBA */], fdata: vec![0_f32; width * height * 3 /* RGBA */], } } /// Image's `(width, height)` pub fn size(&self) -> (usize, usize) { (self.width, self.height) } /// Writes the pixels into a png image at `output`. /// /// `soa` specifies whether the bytes in `fdata` are in a Struct of Arrays (rrr...ggg...bbb...) /// or Array of Structs (rgbrgbrgb...) format. pub fn write_png( &mut self, output: &Path, soa: bool, ) -> Result<(), Error> { fn clamp(x: f32) -> u8 { let mut i = (x * 255.5) as isize; if i < 0 { i = 0 }; if i > 255 { i = 255 }; i as u8 } use std::fs::File; use std::io::BufWriter; let file = File::create(output)?; let buf_writer = &mut BufWriter::new(file); let mut encoder = Encoder::new( buf_writer, self.width as u32, self.height as u32, ); encoder.set_color(ColorType::RGB); encoder.set_depth(BitDepth::Eight); let mut writer = encoder.write_header().unwrap(); if soa { let len = (self.width * self.height) as usize; let (r, tail) = self.fdata.split_at(len); let (g, b) = tail.split_at(len); assert!(r.len() == len); assert!(g.len() == len); assert!(b.len() == len); for i in 0..len { self.data[3 * i + 0] = clamp(r[i]); self.data[3 * i + 1] = clamp(g[i]); self.data[3 * i + 2] = clamp(b[i]); } } else { for (&fp, up) in self.fdata.iter().zip(self.data.iter_mut()) { (*up) = clamp(fp); } } writer.write_image_data(&self.data)?; Ok(()) } } ================================================ FILE: examples/aobench/src/intersection/mod.rs ================================================ //! Intersection functions /// Intersection of `I` with `Self` pub trait Intersect { type Isect; fn intersect(&self, other: &I, isect: Self::Isect) -> Self::Isect; } mod packet; mod ray_plane; mod ray_sphere; mod single; pub use self::packet::IsectxN; pub use self::single::Isect; ================================================ FILE: examples/aobench/src/intersection/packet.rs ================================================ //! SIMD intersection result use crate::geometry::{f32xN, m32xN, V3DxN}; use crate::intersection::Isect; /// Intersection result #[derive(Copy, Clone, Debug)] pub struct IsectxN { pub t: f32xN, pub p: V3DxN, pub n: V3DxN, pub hit: m32xN, } impl Default for IsectxN { #[inline] fn default() -> Self { Self { t: f32xN::splat(1e17), hit: m32xN::splat(false), p: V3DxN::default(), n: V3DxN::default(), } } } impl IsectxN { pub fn get(&self, idx: usize) -> Isect { Isect { t: self.t.extract(idx), p: self.p.get(idx), n: self.n.get(idx), hit: self.hit.extract(idx), } } } ================================================ FILE: examples/aobench/src/intersection/ray_plane.rs ================================================ //! 
Intersection of a ray with a plane use crate::geometry::{f32xN, Dot, Plane, Ray, RayxN, Selectable}; use crate::intersection::{Intersect, Isect, IsectxN}; // Scalar ray-plane intersection impl Intersect for Ray { type Isect = Isect; #[inline(always)] fn intersect(&self, plane: &Plane, mut isect: Isect) -> Isect { let ray = self; let d = -plane.p.dot(plane.n); let v = ray.dir.dot(plane.n); if v.abs() < 1e-17 { return isect; } let t = -(ray.origin.dot(plane.n) + d) / v; if t > 0. && t < isect.t { isect.t = t; isect.hit = true; isect.p = ray.origin + t * ray.dir; isect.n = plane.n; } isect } } // Vector ray-plane intersection for a packet of rays impl Intersect for RayxN { type Isect = IsectxN; #[inline(always)] fn intersect(&self, plane: &Plane, mut isect: IsectxN) -> IsectxN { let ray = self; let d = -plane.p.dot(plane.n); let v = ray.dir.dot(plane.n); let _old_isect = isect; let m = v.abs().ge(f32xN::splat(1e-17)); if m.any() { let t = m.sel(-(ray.origin.dot(plane.n) + d) / v, isect.t); let m = m & t.gt(f32xN::splat(0.)) & t.lt(isect.t); if m.any() { isect.t = m.sel(t, isect.t); isect.hit |= m; isect.p = m.sel(ray.origin + t * ray.dir, isect.p); isect.n = m.sel(plane.n, isect.n); } } #[cfg(debug_assertions)] { // Check that the vector and the scalar version produce the same results // for the same inputs in debug builds for i in 0..f32xN::lanes() { let old_isect_i = _old_isect.get(i); let ray_i = self.get(i); let isect_i = ray_i.intersect(plane, old_isect_i); assert!(isect_i.almost_eq(&isect.get(i)), "{:?} !~= {:?}\n\nplane: {:?}\n\nold_isect: {:?}\n\nrays: {:?}\n\ni: {:?}\nold_isect_i: {:?}\nray_i: {:?}\n\n", isect_i, isect.get(i), plane, _old_isect, self, i, old_isect_i, ray_i); } } isect } } #[cfg(test)] mod tests { use super::*; use crate::geometry::{m32xN, V3DxN, V3D}; #[test] fn sanity() { let plane = Plane { p: V3D { x: 0., y: 0., z: -10., }, n: V3D { x: 0., y: 0., z: 1., }, }; let ray_hit = Ray { origin: V3D::default(), dir: V3D { x: 0.01, y: 0.01, z: -1., }, }; let ray_miss = Ray { origin: V3D::default(), dir: V3D { x: 0., y: 0., z: 1., }, }; let isect_hit = ray_hit.intersect(&plane, Isect::default()); assert!(isect_hit.hit); let isect_miss = ray_miss.intersect(&plane, Isect::default()); assert!(!isect_miss.hit); // hit, miss, hit, miss #[cfg(feature = "256bit")] let z_val = f32xN::new(-1., 1., -1., 1., -1., 1., -1., 1.); #[cfg(not(feature = "256bit"))] let z_val = f32xN::new(-1., 1., -1., 1.); let rays = RayxN { origin: V3DxN::default(), dir: V3DxN { x: f32xN::splat(0.01), y: f32xN::splat(0.01), z: z_val, }, }; let isectxN = rays.intersect(&plane, IsectxN::default()); #[cfg(feature = "256bit")] let expected = m32xN::new(true, false, true, false, true, false, true, false); #[cfg(not(feature = "256bit"))] let expected = m32xN::new(true, false, true, false); assert_eq!(isectxN.hit, expected); assert_eq!(isect_hit.t, isectxN.t.extract(0)); assert_eq!(isect_hit.t, isectxN.t.extract(2)); assert_eq!(isect_miss.t, isectxN.t.extract(1)); assert_eq!(isect_miss.t, isectxN.t.extract(3)); assert_eq!(isect_hit.p.x, isectxN.p.x.extract(0)); assert_eq!(isect_hit.p.y, isectxN.p.y.extract(0)); assert_eq!(isect_hit.p.z, isectxN.p.z.extract(0)); assert_eq!(isect_hit.p.x, isectxN.p.x.extract(2)); assert_eq!(isect_hit.p.y, isectxN.p.y.extract(2)); assert_eq!(isect_hit.p.z, isectxN.p.z.extract(2)); assert_eq!(isect_miss.p.x, isectxN.p.x.extract(1)); assert_eq!(isect_miss.p.y, isectxN.p.y.extract(1)); assert_eq!(isect_miss.p.z, isectxN.p.z.extract(1)); assert_eq!(isect_miss.p.x, isectxN.p.x.extract(3)); 
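// The remaining checks below are a sketch of the invariant this test relies on:
// lanes 1 and 3 hold rays that miss the plane, so their intersection point and
// normal must still be the `IsectxN::default()` values, i.e. identical to the
// untouched scalar miss result.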
assert_eq!(isect_miss.p.y, isectxN.p.y.extract(3)); assert_eq!(isect_miss.p.z, isectxN.p.z.extract(3)); assert_eq!(isect_hit.n.x, isectxN.n.x.extract(0)); assert_eq!(isect_hit.n.y, isectxN.n.y.extract(0)); assert_eq!(isect_hit.n.z, isectxN.n.z.extract(0)); assert_eq!(isect_hit.n.x, isectxN.n.x.extract(2)); assert_eq!(isect_hit.n.y, isectxN.n.y.extract(2)); assert_eq!(isect_hit.n.z, isectxN.n.z.extract(2)); assert_eq!(isect_miss.n.x, isectxN.n.x.extract(1)); assert_eq!(isect_miss.n.y, isectxN.n.y.extract(1)); assert_eq!(isect_miss.n.z, isectxN.n.z.extract(1)); assert_eq!(isect_miss.n.x, isectxN.n.x.extract(3)); assert_eq!(isect_miss.n.y, isectxN.n.y.extract(3)); assert_eq!(isect_miss.n.z, isectxN.n.z.extract(3)); } #[test] fn bug() { let plane = Plane { p: V3D { x: 0., y: -0.5, z: 0., }, n: V3D { x: 0., y: 1., z: 0., }, }; let isect = IsectxN { t: f32xN::splat(2.1931846), p: V3DxN { x: f32xN::splat(-0.2608384), y: f32xN::splat(-0.28958648), z: f32xN::splat(-2.6699374), }, n: V3DxN { x: f32xN::splat(0.47832328), y: f32xN::splat(-0.579173), z: f32xN::splat(0.6601253), }, hit: m32xN::splat(true), }; let rays = RayxN { origin: V3DxN { x: f32xN::splat(-0.5), y: f32xN::splat(-0.4999), z: f32xN::splat(-0.5), }, dir: V3DxN { x: f32xN::splat(0.10904764), y: f32xN::splat(0.095894136), z: f32xN::splat(-0.98940027), }, }; let r = rays.intersect(&plane, isect); assert_eq!(r.hit, m32xN::splat(true)); } } ================================================ FILE: examples/aobench/src/intersection/ray_sphere.rs ================================================ //! Intersection of a ray with a sphere. use crate::geometry::{f32xN, Dot, Ray, RayxN, Selectable, Sphere}; use crate::intersection::{Intersect, Isect, IsectxN}; // Scalar ray-sphere intersection impl Intersect for Ray { type Isect = Isect; #[inline(always)] fn intersect(&self, sphere: &Sphere, mut isect: Isect) -> Isect { let ray = self; let rs = ray.origin - sphere.center; let b = rs.dot(ray.dir); let c = rs.dot(rs) - sphere.radius * sphere.radius; let d = b * b - c; if d > 0. { let t = -b - d.sqrt(); if t > 0. 
&& t < isect.t { isect.t = t; isect.hit = true; isect.p = ray.origin + t * ray.dir; isect.n = (isect.p - sphere.center).normalized(); } } isect } } // Vector ray-sphere intersection for a packet of rays impl Intersect for RayxN { type Isect = IsectxN; #[inline(always)] fn intersect(&self, sphere: &Sphere, mut isect: IsectxN) -> IsectxN { let ray = self; let rs = ray.origin - sphere.center; let b = rs.dot(ray.dir); let radius = f32xN::splat(sphere.radius); let c = radius.mul_adde(-radius, rs.dot(rs)); let d = b.mul_adde(b, -c); let _old_isect = isect; let m = d.gt(f32xN::splat(0.)); if m.any() { let t = m.sel(-b - d.sqrt(), isect.t); let m = m & t.gt(f32xN::splat(0.)) & t.lt(isect.t); if m.any() { isect.t = m.sel(t, isect.t); isect.hit |= m; isect.p = m.sel(ray.origin + t * ray.dir, isect.p); isect.n = m.sel((isect.p - sphere.center).normalized(), isect.n); } } #[cfg(debug_assertions)] { // Check that the vector and the scalar version produce the same results // for the same inputs in debug builds for i in 0..f32xN::lanes() { let old_isect_i = _old_isect.get(i); let ray_i = self.get(i); let isect_i = ray_i.intersect(sphere, old_isect_i); assert!(isect_i.almost_eq(&isect.get(i)), "{:?} !~= {:?}\n\nsphere: {:?}\n\nold_isect: {:?}\n\nrays: {:?}\n\ni: {:?}\nold_isect_i: {:?}\nray_i: {:?}\n\n", isect_i, isect.get(i), sphere, _old_isect, self, i, old_isect_i, ray_i); } } isect } } #[cfg(test)] mod tests { use super::*; use crate::geometry::{m32xN, V3DxN, V3D}; #[test] fn sanity() { let sphere = Sphere { center: V3D { x: 0., y: 0., z: -10., }, radius: 1., }; let ray_hit = Ray { origin: V3D::default(), dir: V3D { x: 0.01, y: 0.01, z: -1., }, }; let ray_miss = Ray { origin: V3D::default(), dir: V3D { x: 0., y: 0., z: 1., }, }; let isect_hit = ray_hit.intersect(&sphere, Isect::default()); assert!(isect_hit.hit); let isect_miss = ray_miss.intersect(&sphere, Isect::default()); assert!(!isect_miss.hit); // hit, miss, hit, miss #[cfg(feature = "256bit")] let z_val = f32xN::new(-1., 1., -1., 1., -1., 1., -1., 1.); #[cfg(not(feature = "256bit"))] let z_val = f32xN::new(-1., 1., -1., 1.); let rays = RayxN { origin: V3DxN::default(), dir: V3DxN { x: f32xN::splat(0.01), y: f32xN::splat(0.01), z: z_val, }, }; let isectxN = rays.intersect(&sphere, IsectxN::default()); #[cfg(feature = "256bit")] let expected = m32xN::new(true, false, true, false, true, false, true, false); #[cfg(not(feature = "256bit"))] let expected = m32xN::new(true, false, true, false); assert_eq!(isectxN.hit, expected); assert_eq!(isect_hit.t, isectxN.t.extract(0)); assert_eq!(isect_hit.t, isectxN.t.extract(2)); assert_eq!(isect_miss.t, isectxN.t.extract(1)); assert_eq!(isect_miss.t, isectxN.t.extract(3)); assert_eq!(isect_hit.p.x, isectxN.p.x.extract(0)); assert_eq!(isect_hit.p.y, isectxN.p.y.extract(0)); assert_eq!(isect_hit.p.z, isectxN.p.z.extract(0)); assert_eq!(isect_hit.p.x, isectxN.p.x.extract(2)); assert_eq!(isect_hit.p.y, isectxN.p.y.extract(2)); assert_eq!(isect_hit.p.z, isectxN.p.z.extract(2)); assert_eq!(isect_miss.p.x, isectxN.p.x.extract(1)); assert_eq!(isect_miss.p.y, isectxN.p.y.extract(1)); assert_eq!(isect_miss.p.z, isectxN.p.z.extract(1)); assert_eq!(isect_miss.p.x, isectxN.p.x.extract(3)); assert_eq!(isect_miss.p.y, isectxN.p.y.extract(3)); assert_eq!(isect_miss.p.z, isectxN.p.z.extract(3)); assert_eq!(isect_hit.n.x, isectxN.n.x.extract(0)); assert_eq!(isect_hit.n.y, isectxN.n.y.extract(0)); assert_eq!(isect_hit.n.z, isectxN.n.z.extract(0)); assert_eq!(isect_hit.n.x, isectxN.n.x.extract(2)); assert_eq!(isect_hit.n.y, 
isectxN.n.y.extract(2)); assert_eq!(isect_hit.n.z, isectxN.n.z.extract(2)); assert_eq!(isect_miss.n.x, isectxN.n.x.extract(1)); assert_eq!(isect_miss.n.y, isectxN.n.y.extract(1)); assert_eq!(isect_miss.n.z, isectxN.n.z.extract(1)); assert_eq!(isect_miss.n.x, isectxN.n.x.extract(3)); assert_eq!(isect_miss.n.y, isectxN.n.y.extract(3)); assert_eq!(isect_miss.n.z, isectxN.n.z.extract(3)); } } ================================================ FILE: examples/aobench/src/intersection/single.rs ================================================ //! Scalar intersection result use crate::geometry::V3D; /// Intersection result #[derive(Copy, Clone, Debug)] pub struct Isect { pub t: f32, pub p: V3D, pub n: V3D, pub hit: bool, } impl Default for Isect { #[inline] fn default() -> Self { Self { t: 1e17, hit: false, p: V3D::default(), n: V3D::default(), } } } impl Isect { #[inline(always)] #[must_use] pub fn almost_eq(&self, rhs: &Self) -> bool { const EPSILON: f32 = 1E-3; (self.t - rhs.t).abs() < EPSILON && self.p.almost_eq(&rhs.p) && self.n.almost_eq(&rhs.n) && self.hit == rhs.hit } } ================================================ FILE: examples/aobench/src/ispc_.rs ================================================ //! Includes the ISPC implementations. use crate::*; use ispc::*; ispc_module!(aobench); pub fn ao( _scene: &mut S, nsubsamples: usize, img: &mut crate::Image, ) { let (w, h) = img.size(); unsafe { self::aobench::ao_ispc( w as i32, h as i32, nsubsamples as i32, img.fdata.as_mut_ptr(), ) } } pub fn ao_tasks( _scene: &mut S, nsubsamples: usize, img: &mut crate::Image, ) { let (w, h) = img.size(); unsafe { self::aobench::ao_ispc_tasks( w as i32, h as i32, nsubsamples as i32, img.fdata.as_mut_ptr(), ) } } ================================================ FILE: examples/aobench/src/lib.rs ================================================ //! aobench: Ambient Occlusion Renderer benchmark. //! //! Based on [aobench](https://code.google.com/archive/p/aobench/) by Syoyo //! Fujita. // FIXME: Null pointer deref warning triggered in this example, // likely inside a macro expansion deriving from packed_simd. #![deny(rust_2018_idioms)] #![allow(non_snake_case, non_camel_case_types)] #![allow( clippy::many_single_char_names, clippy::similar_names, clippy::cast_precision_loss, clippy::inline_always, clippy::cast_possible_truncation, clippy::cast_sign_loss, clippy::identity_op, clippy::erasing_op, clippy::must_use_candidate, clippy::float_cmp )] pub mod ambient_occlusion; pub mod geometry; pub mod image; pub mod intersection; pub mod random; pub mod scene; #[cfg(feature = "ispc")] pub mod ispc_; pub mod scalar; pub mod scalar_parallel; pub mod tiled; pub mod tiled_parallel; pub mod vector; pub mod vector_parallel; pub use self::image::Image; pub use self::scene::Scene; ================================================ FILE: examples/aobench/src/main.rs ================================================ //! aobench: Ambient Occlusion Renderer benchmark. //! //! Based on [aobench](https://code.google.com/archive/p/aobench/) by Syoyo //! Fujita. #![deny(rust_2018_idioms)] use aobench_lib::*; use std::path::PathBuf; use structopt::StructOpt; /// Command-line arguments. #[derive(StructOpt, Debug)] struct Opt { /// Image width. width: usize, /// Image height. height: usize, /// Algorithm #[structopt(short = "a", long = "algo")] algo: String, /// Output file. 
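    /// Defaults to `image_<algorithm>.png` in the current directory when not specified.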
#[structopt(short = "o", long = "output", parse(from_os_str))] output: Option, } const ALGORITHMS: &[&str] = &[ "scalar", "scalar_par", "vector", "vector_par", "tiled", "tiled_par", "ispc", "ispc_tasks", ]; fn main() { let opt = Opt::from_args(); let mut scene = aobench_lib::scene::Random::default(); let mut img = Image::new(opt.width, opt.height); let algorithm_name = opt.algo.as_str(); if let Some(algorithm) = ALGORITHMS.iter().find(|&&a| a == algorithm_name) { let d = time::Duration::span(|| match *algorithm { "scalar" => scalar::ao(&mut scene, 2, &mut img), "scalar_par" => scalar_parallel::ao(&mut scene, 2, &mut img), "vector" => vector::ao(&mut scene, 2, &mut img), "vector_par" => vector_parallel::ao(&mut scene, 2, &mut img), "tiled" => tiled::ao(&mut scene, 2, &mut img), "tiled_par" => tiled_parallel::ao(&mut scene, 2, &mut img), "ispc" => { #[cfg(feature = "ispc")] { ispc_::ao(&mut scene, 2, &mut img) } #[cfg(not(feature = "ispc"))] { panic!("the `ispc` algorithm requires building with --features=ispc"); } } "ispc_tasks" => { #[cfg(feature = "ispc")] { ispc_::ao_tasks(&mut scene, 2, &mut img) } #[cfg(not(feature = "ispc"))] { panic!("the `ispc_task` algorithm requires building with --features=ispc"); } } _ => unreachable!(), }); let image_path = opt.output.unwrap_or_else(|| { PathBuf::from(format!("image_{}.png", algorithm)) }); img.write_png(&image_path, false) .expect("failed to write image"); println!("time: {} ms", d.num_milliseconds()); } else { let mut error = format!( "unknown algorithm: \"{}\"\nAvailable algorithms:", algorithm_name ); for a in ALGORITHMS { error.push_str(&format!("\n- {}", a)); } panic!("{}", error); } } ================================================ FILE: examples/aobench/src/random.rs ================================================ //! Pseudo random number generators. //! //! Currently only `LFSR113` is implemented, since that is what ISPC uses, and it //! allows us to compare Rust's codegen with that of ISPC for the same //! algorithms. //! //! Use `{scalar,vector}::thread_rng()` to get a handle to the thread-local //! random number generator, and call `.gen()` to generate an `f32` or an //! `f32xN`. /// Scalar pseudo random number generator pub mod scalar { use std::cell::UnsafeCell; use std::rc::Rc; // Note: This implementation could be vectorized using an `u32x4`. struct RngT(u32, u32, u32, u32); impl RngT { fn from_seed(x: u32) -> Self { let z0 = x; let z1 = x ^ 0xbeef_f00d; let z2 = ((x & 0xffff_u32) << 16) | (x >> 16); let z3 = ((x & 0xff_u32) << 24) | ((x & 0xff00_u32) << 8) | ((x & 0x00ff_0000_u32) >> 8) | (x & 0xff00_0000_u32) >> 24; Self(z0, z1, z2, z3) } pub fn gen_u32(&mut self) -> u32 { let mut b = ((self.0 << 6) ^ self.0) >> 13; self.0 = ((self.0 & 4_294_967_294_u32) << 18) ^ b; b = ((self.1 << 2) ^ self.1) >> 27; self.1 = ((self.1 & 4_294_967_288_u32) << 2) ^ b; b = ((self.2 << 13) ^ self.2) >> 21; self.2 = ((self.2 & 4_294_967_280_u32) << 7) ^ b; b = ((self.3 << 3) ^ self.3) >> 12; self.3 = ((self.3 & 4_294_967_168_u32) << 13) ^ b; self.0 ^ self.1 ^ self.2 ^ self.3 } pub fn gen(&mut self) -> f32 { let mut v = self.gen_u32(); v &= (1_u32 << 23) - 1; let v = f32::from_bits(0x3F80_0000 | v); v - 1. 
} } #[derive(Clone)] pub struct RngH { rng: Rc>, } impl RngH { pub fn gen(&mut self) -> f32 { unsafe { (*self.rng.get()).gen() } } } thread_local!( static THREAD_RNG_KEY: Rc> = { Rc::new(UnsafeCell::new(RngT::from_seed(1))) } ); pub fn thread_rng() -> RngH { RngH { rng: THREAD_RNG_KEY.with(Clone::clone), } } } /// Vector pseudo random number generator pub mod vector { use crate::geometry::{f32xN, u32xN, IncrV}; use std::cell::UnsafeCell; use std::rc::Rc; struct RngT(u32xN, u32xN, u32xN, u32xN); impl RngT { fn from_seed(x: u32xN) -> Self { let z0 = x; let z1 = x ^ u32xN::splat(0xbeef_f00d); let z2 = ((x & u32xN::splat(0xffff)) << 16) | (x >> 16); let z3 = ((x & u32xN::splat(0xff)) << 24) | ((x & u32xN::splat(0xff00)) << 8) | ((x & u32xN::splat(0x00ff_0000)) >> 8) | (x & u32xN::splat(0xff00_0000)) >> 24; Self(z0, z1, z2, z3) } #[inline(always)] pub fn gen_u32(&mut self) -> u32xN { let mut b = ((self.0 << 6) ^ self.0) >> 13; self.0 = ((self.0 & u32xN::splat(4_294_967_294)) << 18) ^ b; b = ((self.1 << 2) ^ self.1) >> 27; self.1 = ((self.1 & u32xN::splat(4_294_967_288)) << 2) ^ b; b = ((self.2 << 13) ^ self.2) >> 21; self.2 = ((self.2 & u32xN::splat(4_294_967_280)) << 7) ^ b; b = ((self.3 << 3) ^ self.3) >> 12; self.3 = ((self.3 & u32xN::splat(4_294_967_168)) << 13) ^ b; self.0 ^ self.1 ^ self.2 ^ self.3 } #[inline(always)] pub fn gen(&mut self) -> f32xN { let mut v = self.gen_u32(); v &= u32xN::splat((1_u32 << 23) - 1); let v: f32xN = unsafe { std::mem::transmute(u32xN::splat(0x3F80_0000) | v) }; v - f32xN::splat(1.) } } #[derive(Clone)] pub struct RngH { rng: Rc>, } impl RngH { #[inline(always)] pub fn gen(&mut self) -> f32xN { unsafe { (*self.rng.get()).gen() } } } thread_local!( static THREAD_RNG_KEY: Rc> = { Rc::new(UnsafeCell::new(RngT::from_seed(::incr(0, 1)))) } ); pub fn thread_rng() -> RngH { RngH { rng: THREAD_RNG_KEY.with(Clone::clone), } } } ================================================ FILE: examples/aobench/src/scalar.rs ================================================ //! Scalar serial aobench use crate::ambient_occlusion; use crate::geometry::{Ray, V3D}; use crate::intersection::{Intersect, Isect}; use crate::scene::Scene; pub fn ao( scene: &mut S, nsubsamples: usize, img: &mut crate::Image, ) { let (w, h) = img.size(); let image = &mut img.fdata; let ns = nsubsamples; for y in 0..h { for x in 0..w { let offset = 3 * (y * w + x); for u in 0..ns { for v in 0..ns { let (x, y, u, v, h, w, ns) = ( x as f32, y as f32, u as f32, v as f32, h as f32, w as f32, ns as f32, ); let dir: V3D = V3D { x: (x + u / ns - w / 2.) / (w / 2.) * w / h, y: -(y + v / ns - h / 2.) / (h / 2.), z: -1., }; let dir = dir.normalized(); let ray = Ray { origin: V3D::default(), dir, }; let mut isect = Isect::default(); for s in scene.spheres() { isect = ray.intersect(s, isect); } isect = ray.intersect(scene.plane(), isect); let ret = if isect.hit { ambient_occlusion::scalar(scene, &isect) } else { 0. }; // Update image for AO for this ray image[offset + 0] += ret; image[offset + 1] += ret; image[offset + 2] += ret; } } // Normalize image pixels by number of samples taken per pixel let ns = (ns * ns) as f32; image[offset + 0] /= ns; image[offset + 1] /= ns; image[offset + 2] /= ns; } } } ================================================ FILE: examples/aobench/src/scalar_parallel.rs ================================================ //! 
Scalar parallel aobench use crate::ambient_occlusion; use crate::geometry::{Ray, V3D}; use crate::intersection::{Intersect, Isect}; use crate::scene::Scene; use rayon::prelude::*; pub fn ao(_: &mut S, nsubsamples: usize, img: &mut crate::Image) { let (w, h) = img.size(); let ns = nsubsamples; img.fdata .par_chunks_mut(3 * w) .enumerate() .for_each(|(y, image)| { assert!(image.len() == 3 * w); let mut scene = S::default(); for x in 0..w { let offset = 3 * x; for u in 0..ns { for v in 0..ns { let (x, y, u, v, h, w, ns) = ( x as f32, y as f32, u as f32, v as f32, h as f32, w as f32, ns as f32, ); let dir: V3D = V3D { x: (x + u / ns - w / 2.) / (w / 2.) * w / h, y: -(y + v / ns - h / 2.) / (h / 2.), z: -1., }; let dir = dir.normalized(); let ray = Ray { origin: V3D::default(), dir, }; let mut isect = Isect::default(); for s in scene.spheres() { isect = ray.intersect(s, isect); } isect = ray.intersect(scene.plane(), isect); let ret = if isect.hit { ambient_occlusion::scalar(&mut scene, &isect) } else { 0. }; // Update image for AO for this ray image[offset + 0] += ret; image[offset + 1] += ret; image[offset + 2] += ret; } } // Normalize image pixels by number of samples taken per pixel let ns = (ns * ns) as f32; image[offset + 0] /= ns; image[offset + 1] /= ns; image[offset + 2] /= ns; } }); } ================================================ FILE: examples/aobench/src/scene/mod.rs ================================================ /// Scene interface use crate::geometry::{f32xN, Plane, Sphere}; pub trait Scene: Send + Sync + Default { const NAO_SAMPLES: usize; fn rand(&mut self) -> f32; fn plane(&self) -> &Plane; fn spheres(&self) -> &[Sphere]; fn rand_f32xN(&mut self) -> (f32xN, f32xN) { #[cfg(feature = "256bit")] { let r = [ self.rand(), self.rand(), self.rand(), self.rand(), self.rand(), self.rand(), self.rand(), self.rand(), self.rand(), self.rand(), self.rand(), self.rand(), self.rand(), self.rand(), self.rand(), self.rand(), ]; ( f32xN::new(r[0], r[2], r[4], r[6], r[8], r[10], r[12], r[14]), f32xN::new(r[1], r[3], r[5], r[7], r[9], r[11], r[13], r[15]), ) } #[cfg(not(feature = "256bit"))] { let r = [ self.rand(), self.rand(), self.rand(), self.rand(), self.rand(), self.rand(), self.rand(), self.rand(), ]; ( f32xN::new(r[0], r[2], r[4], r[6]), f32xN::new(r[1], r[3], r[5], r[7]), ) } } } mod random; pub use self::random::Random; mod test; pub use self::test::Test; ================================================ FILE: examples/aobench/src/scene/random.rs ================================================ //! 
Aobench scene: 3 spheres and a plane using a random number generator use crate::geometry::{f32xN, Plane, Sphere, V3D}; use crate::scene::Scene; #[derive(Clone)] pub struct Random { pub plane: Plane, pub spheres: [Sphere; 3], } impl Default for Random { fn default() -> Self { let plane = Plane { p: V3D { x: 0., y: -0.5, z: 0., }, n: V3D { x: 0., y: 1., z: 0., }, }; let spheres = [ Sphere { center: V3D { x: -2., y: 0., z: -3.5, }, radius: 0.5, }, Sphere { center: V3D { x: -0.5, y: 0., z: -3., }, radius: 0.5, }, Sphere { center: V3D { x: 1., y: 0., z: -2.2, }, radius: 0.5, }, ]; Self { plane, spheres } } } impl Scene for Random { const NAO_SAMPLES: usize = 8; #[inline(always)] fn rand(&mut self) -> f32 { crate::random::scalar::thread_rng().gen() } #[inline(always)] fn plane(&self) -> &Plane { &self.plane } #[inline(always)] fn spheres(&self) -> &[Sphere] { &self.spheres } #[inline(always)] fn rand_f32xN(&mut self) -> (f32xN, f32xN) { let mut rng = crate::random::vector::thread_rng(); (rng.gen(), rng.gen()) } } ================================================ FILE: examples/aobench/src/scene/test.rs ================================================ //! Aobench scene: 3 spheres and a plane using a random number generator use crate::geometry::{Plane, Sphere, V3D}; use crate::scene::Scene; use std::num::Wrapping; #[derive(Clone)] pub struct Test { pub plane: Plane, pub spheres: [Sphere; 3], rands: Vec, rand_step: Wrapping, } impl Default for Test { fn default() -> Self { let plane = Plane { p: V3D { x: 0., y: -0.5, z: 0., }, n: V3D { x: 0., y: 1., z: 0., }, }; let spheres = [ Sphere { center: V3D { x: -2., y: 0., z: -3.5, }, radius: 0.5, }, Sphere { center: V3D { x: -0.5, y: 0., z: -3., }, radius: 0.5, }, Sphere { center: V3D { x: 1., y: 0., z: -2.2, }, radius: 0.5, }, ]; let mut rands = Vec::new(); let mut rng = crate::random::scalar::thread_rng(); for _ in 0..2 * Self::NAO_SAMPLES * Self::NAO_SAMPLES { rands.push(rng.gen()); } let rand_step = Wrapping(0); Self { plane, spheres, rands, rand_step, } } } impl Scene for Test { const NAO_SAMPLES: usize = 8; fn rand(&mut self) -> f32 { let v = self.rands[self.rand_step.0]; self.rand_step += Wrapping(1); if self.rand_step >= Wrapping(2 * Self::NAO_SAMPLES * Self::NAO_SAMPLES) { self.rand_step = Wrapping(0); } v } fn plane(&self) -> &Plane { &self.plane } fn spheres(&self) -> &[Sphere] { &self.spheres } } ================================================ FILE: examples/aobench/src/tiled.rs ================================================ //! SIMD serial aobench use crate::ambient_occlusion; use crate::geometry::{f32xN, pf32xN, usizexN, IncrV, RayxN, V3DxN}; use crate::intersection::{Intersect, IsectxN}; use crate::scene::Scene; use cfg_if::cfg_if; #[inline(always)] fn ao_impl( scene: &mut S, nsubsamples: usize, img: &mut crate::Image, ) { let (w, h) = img.size(); assert_eq!(w % f32xN::lanes(), 0); let image = &mut img.fdata; let ns = nsubsamples; let inv_ns = 1. 
/ (ns as f32); let ptr = pf32xN::splat(image.as_mut_ptr()); for y in 0..h { let yf = f32xN::splat(y as f32); for x in (0..w).step_by(f32xN::lanes()) { let xf = f32xN::incr(x as f32, 1.); let offset = usizexN::splat(3 * (y * w + x)); let r_ptr = unsafe { ptr.add(offset + usizexN::incr(0, 3)) }; let g_ptr = unsafe { ptr.add(offset + usizexN::incr(1, 3)) }; let b_ptr = unsafe { ptr.add(offset + usizexN::incr(2, 3)) }; for u in 0..ns { for v in 0..ns { let du = (u as f32) * inv_ns; let dv = (v as f32) * inv_ns; let (hf, wf) = (h as f32, w as f32); let dir = V3DxN { x: (xf + f32xN::splat(du - (wf / 2.))) / f32xN::splat((wf / 2.) * hf / wf), y: -(yf + f32xN::splat(dv - (hf / 2.))) / f32xN::splat(hf / 2.), z: f32xN::splat(-1.), }; let dir = dir.normalized(); let ray = RayxN { origin: V3DxN::default(), dir, }; let mut isect = IsectxN::default(); for s in scene.spheres() { isect = ray.intersect(s, isect); } isect = ray.intersect(scene.plane(), isect); if isect.hit.any() { let ret = ambient_occlusion::vector_tiled(scene, &isect) * f32xN::splat(inv_ns * inv_ns); unsafe { let img_r = r_ptr.read(isect.hit, f32xN::splat(0.)); let img_g = g_ptr.read(isect.hit, f32xN::splat(0.)); let img_b = b_ptr.read(isect.hit, f32xN::splat(0.)); r_ptr.write(isect.hit, img_r + ret); g_ptr.write(isect.hit, img_g + ret); b_ptr.write(isect.hit, img_b + ret); } } } } } } } cfg_if! { if #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] { #[target_feature(enable = "sse4.2")] unsafe fn ao_sse42(scene: &mut S, nsubsamples: usize, img: &mut crate::Image) { ao_impl(scene, nsubsamples, img); } #[target_feature(enable = "avx")] unsafe fn ao_avx(scene: &mut S, nsubsamples: usize, img: &mut crate::Image) { ao_impl(scene, nsubsamples, img); } #[target_feature(enable = "avx,fma")] unsafe fn ao_avx_fma(scene: &mut S, nsubsamples: usize, img: &mut crate::Image) { ao_impl(scene, nsubsamples, img); } #[target_feature(enable = "avx2,fma")] unsafe fn ao_avx2(scene: &mut S, nsubsamples: usize, img: &mut crate::Image) { ao_impl(scene, nsubsamples, img); } pub fn ao(scene: &mut S, nsubsamples: usize, img: &mut crate::Image) { unsafe { if is_x86_feature_detected!("avx2") && is_x86_feature_detected!("fma") { ao_avx2(scene, nsubsamples, img); } else if is_x86_feature_detected!("avx") { if is_x86_feature_detected!("fma") { ao_avx_fma(scene, nsubsamples, img); } else { ao_avx(scene, nsubsamples, img); } } else if is_x86_feature_detected!("sse4.2") { ao_sse42(scene, nsubsamples, img); } else { ao_impl(scene, nsubsamples, img); } } } } else { pub fn ao(scene: &mut S, nsubsamples: usize, img: &mut crate::Image) { ao_impl(scene, nsubsamples, img); } } } ================================================ FILE: examples/aobench/src/tiled_parallel.rs ================================================ //! SIMD tiled parallel aobench use crate::ambient_occlusion; use crate::geometry::{f32xN, pf32xN, usizexN, IncrV, RayxN, V3DxN}; use crate::intersection::{Intersect, IsectxN}; use crate::scene::Scene; use rayon::prelude::*; pub fn ao(_: &mut S, nsubsamples: usize, img: &mut crate::Image) { let (w, h) = img.size(); assert_eq!(w % f32xN::lanes(), 0); let ns = nsubsamples; let inv_ns = 1. 
/ (ns as f32); let ptr = usizexN::splat(img.fdata.as_mut_ptr() as usize); img.fdata .par_chunks_mut(3 * w) .enumerate() .for_each(|(y, image)| { assert!(image.len() == 3 * w); let mut scene = S::default(); let yf = f32xN::splat(y as f32); let ptr: pf32xN = unsafe { std::mem::transmute(ptr) }; for x in (0..w).step_by(f32xN::lanes()) { let xf = f32xN::incr(x as f32, 1.); let offset = usizexN::splat(3 * (y * w + x)); let r_ptr = unsafe { ptr.add(offset + usizexN::incr(0, 3)) }; let g_ptr = unsafe { ptr.add(offset + usizexN::incr(1, 3)) }; let b_ptr = unsafe { ptr.add(offset + usizexN::incr(2, 3)) }; for u in 0..ns { for v in 0..ns { let du = (u as f32) * inv_ns; let dv = (v as f32) * inv_ns; let (hf, wf) = (h as f32, w as f32); let dir = V3DxN { x: (xf + f32xN::splat(du - (wf / 2.))) / f32xN::splat((wf / 2.) * hf / wf), y: -(yf + f32xN::splat(dv - (hf / 2.))) / f32xN::splat(hf / 2.), z: f32xN::splat(-1.), }; let dir = dir.normalized(); let ray = RayxN { origin: V3DxN::default(), dir, }; let mut isect = IsectxN::default(); for s in scene.spheres() { isect = ray.intersect(s, isect); } isect = ray.intersect(scene.plane(), isect); if isect.hit.any() { let ret = ambient_occlusion::vector_tiled( &mut scene, &isect, ) * f32xN::splat(inv_ns * inv_ns); unsafe { let img_r = r_ptr.read(isect.hit, f32xN::splat(0.)); let img_g = g_ptr.read(isect.hit, f32xN::splat(0.)); let img_b = b_ptr.read(isect.hit, f32xN::splat(0.)); r_ptr.write(isect.hit, img_r + ret); g_ptr.write(isect.hit, img_g + ret); b_ptr.write(isect.hit, img_b + ret); } } } } } }); } ================================================ FILE: examples/aobench/src/vector.rs ================================================ //! SIMD serial aobench use crate::ambient_occlusion; use crate::geometry::{Ray, V3D}; use crate::intersection::{Intersect, Isect}; use crate::scene::Scene; use cfg_if::cfg_if; #[inline(always)] fn ao_impl( scene: &mut S, nsubsamples: usize, img: &mut crate::Image, ) { let (w, h) = img.size(); let image = &mut img.fdata; let ns = nsubsamples; let inv_ns = 1. / (ns as f32); for y in 0..h { for x in 0..w { let offset = 3 * (y * w + x); for u in 0..ns { for v in 0..ns { let du = (u as f32) * inv_ns; let dv = (v as f32) * inv_ns; let (x, y, h, w) = (x as f32, y as f32, h as f32, w as f32); let dir = V3D { x: (x + du - (w * 0.5)) / (w * 0.5) * w / h, y: -(y + dv - (h * 0.5)) / (h * 0.5), z: -1., }; let dir = dir.normalized(); let ray = Ray { origin: V3D::default(), dir, }; let mut isect = Isect::default(); for s in scene.spheres() { isect = ray.intersect(s, isect); } isect = ray.intersect(scene.plane(), isect); let ret = if isect.hit { ambient_occlusion::vector(scene, &isect) } else { 0. }; let ret = ret * inv_ns * inv_ns; // Update image for AO for this ray // (already normalized) image[offset + 0] += ret; image[offset + 1] += ret; image[offset + 2] += ret; } } } } } cfg_if! 
{ if #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] { #[target_feature(enable = "sse4.2")] unsafe fn ao_sse42(scene: &mut S, nsubsamples: usize, img: &mut crate::Image) { ao_impl(scene, nsubsamples, img); } #[target_feature(enable = "avx")] unsafe fn ao_avx(scene: &mut S, nsubsamples: usize, img: &mut crate::Image) { ao_impl(scene, nsubsamples, img); } #[target_feature(enable = "avx,fma")] unsafe fn ao_avx_fma(scene: &mut S, nsubsamples: usize, img: &mut crate::Image) { ao_impl(scene, nsubsamples, img); } #[target_feature(enable = "avx2,fma")] unsafe fn ao_avx2(scene: &mut S, nsubsamples: usize, img: &mut crate::Image) { ao_impl(scene, nsubsamples, img); } pub fn ao(scene: &mut S, nsubsamples: usize, img: &mut crate::Image) { unsafe { if is_x86_feature_detected!("avx2") && is_x86_feature_detected!("fma") { ao_avx2(scene, nsubsamples, img); } else if is_x86_feature_detected!("avx") { if is_x86_feature_detected!("fma") { ao_avx_fma(scene, nsubsamples, img); } else { ao_avx(scene, nsubsamples, img); } } else if is_x86_feature_detected!("sse4.2") { ao_sse42(scene, nsubsamples, img); } else { ao_impl(scene, nsubsamples, img); } } } } else { pub fn ao(scene: &mut S, nsubsamples: usize, img: &mut crate::Image) { ao_impl(scene, nsubsamples, img); } } } ================================================ FILE: examples/aobench/src/vector_parallel.rs ================================================ //! SIMD parallel aobench use crate::ambient_occlusion; use crate::geometry::{Ray, V3D}; use crate::intersection::{Intersect, Isect}; use crate::scene::Scene; use rayon::prelude::*; pub fn ao(_: &mut S, nsubsamples: usize, img: &mut crate::Image) { let (w, h) = img.size(); let ns = nsubsamples; let inv_ns = 1. / (ns as f32); img.fdata .par_chunks_mut(3 * w) .enumerate() .for_each(|(y, image)| { assert!(image.len() == 3 * w); let mut scene = S::default(); for x in 0..w { let offset = 3 * x; for u in 0..ns { for v in 0..ns { let du = (u as f32) * inv_ns; let dv = (v as f32) * inv_ns; let (x, y, h, w) = (x as f32, y as f32, h as f32, w as f32); let dir = V3D { x: (x + du - (w / 2.)) / (w / 2.) * w / h, y: -(y + dv - (h / 2.)) / (h / 2.), z: -1., }; let dir = dir.normalized(); let ray = Ray { origin: V3D::default(), dir, }; let mut isect = Isect::default(); for s in scene.spheres() { isect = ray.intersect(s, isect); } isect = ray.intersect(scene.plane(), isect); let ret = if isect.hit { ambient_occlusion::vector(&mut scene, &isect) } else { 0. }; let ret = ret * inv_ns * inv_ns; // Update image for AO for this ray // (already normalized) image[offset + 0] += ret; image[offset + 1] += ret; image[offset + 2] += ret; } } } }); } ================================================ FILE: examples/aobench/volta/.gitignore ================================================ ao *.ppm objs/ ================================================ FILE: examples/aobench/volta/ao.ispc ================================================ // -*- mode: c++ -*- /* Copyright (c) 2010-2011, Intel Corporation All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 
* Neither the name of Intel Corporation nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ /* Based on Syoyo Fujita's aobench: http://code.google.com/p/aobench */ #define NAO_SAMPLES 8 #define M_PI 3.1415926535f typedef float<3> vec; struct Isect { float t; vec p; vec n; int hit; }; struct Sphere { vec center; float radius; }; struct Plane { vec p; vec n; }; struct Ray { vec org; vec dir; }; static inline float dot(vec a, vec b) { return a.x * b.x + a.y * b.y + a.z * b.z; } static inline vec vcross(vec v0, vec v1) { vec ret; ret.x = v0.y * v1.z - v0.z * v1.y; ret.y = v0.z * v1.x - v0.x * v1.z; ret.z = v0.x * v1.y - v0.y * v1.x; return ret; } static inline void vnormalize(vec &v) { float len2 = dot(v, v); float invlen = rsqrt(len2); v *= invlen; } static void ray_plane_intersect(Isect &isect, Ray &ray, uniform Plane &plane) { float d = -dot(plane.p, plane.n); float v = dot(ray.dir, plane.n); cif (abs(v) < 1.0e-17) return; else { float t = -(dot(ray.org, plane.n) + d) / v; cif ((t > 0.0) && (t < isect.t)) { isect.t = t; isect.hit = 1; isect.p = ray.org + ray.dir * t; isect.n = plane.n; } } } static inline void ray_sphere_intersect(Isect &isect, Ray &ray, uniform Sphere &sphere) { vec rs = ray.org - sphere.center; float B = dot(rs, ray.dir); float C = dot(rs, rs) - sphere.radius * sphere.radius; float D = B * B - C; cif (D > 0.) { float t = -B - sqrt(D); cif ((t > 0.0) && (t < isect.t)) { isect.t = t; isect.hit = 1; isect.p = ray.org + t * ray.dir; isect.n = isect.p - sphere.center; vnormalize(isect.n); } } } static void orthoBasis(vec basis[3], vec n) { basis[2] = n; basis[1].x = 0.0; basis[1].y = 0.0; basis[1].z = 0.0; if ((n.x < 0.6) && (n.x > -0.6)) { basis[1].x = 1.0; } else if ((n.y < 0.6) && (n.y > -0.6)) { basis[1].y = 1.0; } else if ((n.z < 0.6) && (n.z > -0.6)) { basis[1].z = 1.0; } else { basis[1].x = 1.0; } basis[0] = vcross(basis[1], basis[2]); vnormalize(basis[0]); basis[1] = vcross(basis[2], basis[0]); vnormalize(basis[1]); } static float ambient_occlusion(Isect &isect, uniform Plane &plane, uniform Sphere spheres[3], RNGState &rngstate) { float eps = 0.0001f; vec p, n; vec basis[3]; float occlusion = 0.0; p = isect.p + eps * isect.n; orthoBasis(basis, isect.n); static const uniform int ntheta = NAO_SAMPLES; static const uniform int nphi = NAO_SAMPLES; for (uniform int j = 0; j < ntheta; j++) { for (uniform int i = 0; i < nphi; i++) { Ray ray; Isect occIsect; float theta = sqrt(frandom(&rngstate)); float phi = 2.0f * M_PI * frandom(&rngstate); float x = cos(phi) * theta; float y = sin(phi) * theta; float z = sqrt(1.0 - theta * theta); // local . 
global float rx = x * basis[0].x + y * basis[1].x + z * basis[2].x; float ry = x * basis[0].y + y * basis[1].y + z * basis[2].y; float rz = x * basis[0].z + y * basis[1].z + z * basis[2].z; ray.org = p; ray.dir.x = rx; ray.dir.y = ry; ray.dir.z = rz; occIsect.t = 1.0e+17; occIsect.hit = 0; for (uniform int snum = 0; snum < 3; ++snum) ray_sphere_intersect(occIsect, ray, spheres[snum]); ray_plane_intersect (occIsect, ray, plane); if (occIsect.hit) occlusion += 1.0; } } occlusion = (ntheta * nphi - occlusion) / (float)(ntheta * nphi); return occlusion; } /* Compute the image for the scanlines from [y0,y1), for an overall image of width w and height h. */ static void ao_scanlines(uniform int y0, uniform int y1, uniform int w, uniform int h, uniform int nsubsamples, uniform float image[]) { static uniform Plane plane = { { 0.0f, -0.5f, 0.0f }, { 0.f, 1.f, 0.f } }; static uniform Sphere spheres[3] = { { { -2.0f, 0.0f, -3.5f }, 0.5f }, { { -0.5f, 0.0f, -3.0f }, 0.5f }, { { 1.0f, 0.0f, -2.2f }, 0.5f } }; RNGState rngstate; seed_rng(&rngstate, programIndex + (y0 << (programIndex & 15))); float invSamples = 1.f / nsubsamples; foreach_tiled(y = y0 ... y1, x = 0 ... w, u = 0 ... nsubsamples, v = 0 ... nsubsamples) { float du = (float)u * invSamples, dv = (float)v * invSamples; // Figure out x,y pixel in NDC float px = (x + du - (w / 2.0f)) / (w / 2.0f); float py = -(y + dv - (h / 2.0f)) / (h / 2.0f); // Scale NDC based on width/height ratio, supporting non-square image output px *= (float)w / (float)h; float ret = 0.f; Ray ray; Isect isect; ray.org = 0.f; // Poor man's perspective projection ray.dir.x = px; ray.dir.y = py; ray.dir.z = -1.0; vnormalize(ray.dir); isect.t = 1.0e+17; isect.hit = 0; for (uniform int snum = 0; snum < 3; ++snum) ray_sphere_intersect(isect, ray, spheres[snum]); ray_plane_intersect(isect, ray, plane); // Note use of 'coherent' if statement; the set of rays we // trace will often all hit or all miss the scene cif (isect.hit) { ret = ambient_occlusion(isect, plane, spheres, rngstate); ret *= invSamples * invSamples; int offset = 3 * (y * w + x); atomic_add_local(&image[offset], ret); atomic_add_local(&image[offset+1], ret); atomic_add_local(&image[offset+2], ret); } } } export void ao_ispc(uniform int w, uniform int h, uniform int nsubsamples, uniform float image[]) { ao_scanlines(0, h, w, h, nsubsamples, image); } static void task ao_task(uniform int width, uniform int height, uniform int nsubsamples, uniform float image[]) { ao_scanlines(taskIndex, taskIndex+1, width, height, nsubsamples, image); } export void ao_ispc_tasks(uniform int w, uniform int h, uniform int nsubsamples, uniform float image[]) { launch[h] ao_task(w, h, nsubsamples, image); } ================================================ FILE: examples/dot_product/Cargo.toml ================================================ [package] name = "dot_product" version = "0.1.0" authors = ["Gonzalo Brito Gadeschi "] edition = "2018" [dependencies] packed_simd = { package = "packed_simd", path = "../.." } [lib] name = "dot_product_lib" path = "src/lib.rs" ================================================ FILE: examples/dot_product/readme.md ================================================ # Vector dot product ================================================ FILE: examples/dot_product/src/lib.rs ================================================ //! 
Vector dot product #![deny(rust_2018_idioms)] #![feature(custom_inner_attributes)] #![allow(clippy::must_use_candidate, clippy::float_cmp)] pub mod scalar; pub mod simd; #[cfg(test)] #[rustfmt::skip] fn test f32>(f: F) { let tests: &[(&[f32], &[f32], f32)] = &[ (&[0_f32, 0., 0., 0.], &[0_f32, 0., 0., 0.], 0_f32), (&[0_f32, 0., 0., 1.], &[0_f32, 0., 0., 1.], 1_f32), (&[1_f32, 2., 3., 4.], &[0_f32, 0., 0., 0.], 0_f32), (&[1_f32, 2., 3., 4.], &[1_f32, 2., 3., 4.], 30_f32), (&[1_f32, 2., 3., 4., 1., 2., 3., 4.], &[1_f32, 1., 1., 1., 1., 1., 1., 1.], 20_f32), ]; for &(a, b, output) in tests { assert_eq!(f(a, b), output); } } ================================================ FILE: examples/dot_product/src/scalar.rs ================================================ //! Scalar implementation pub fn dot_prod(a: &[f32], b: &[f32]) -> f32 { assert_eq!(a.len(), b.len()); a.iter().zip(b.iter()).map(|v| v.0 * v.1).sum() } #[cfg(test)] #[test] fn test() { crate::test(dot_prod) } ================================================ FILE: examples/dot_product/src/simd.rs ================================================ //! Scalar implementation use packed_simd::f32x4; pub fn dot_prod(a: &[f32], b: &[f32]) -> f32 { assert_eq!(a.len(), b.len()); assert!(a.len() % 4 == 0); a.chunks_exact(4) .map(f32x4::from_slice_unaligned) .zip(b.chunks_exact(4).map(f32x4::from_slice_unaligned)) .map(|(a, b)| a * b) .sum::() .sum() } #[cfg(test)] #[test] fn test() { crate::test(dot_prod) } ================================================ FILE: examples/fannkuch_redux/Cargo.toml ================================================ [package] name = "fannkuch_redux" version = "0.1.0" authors = ["gnzlbg "] edition = "2018" [dependencies] packed_simd = { package = "packed_simd", path = "../.." } [[bin]] name = "fannkuch_redux" path = "src/main.rs" [lib] name = "fannkuch_redux_lib" path = "src/lib.rs" ================================================ FILE: examples/fannkuch_redux/readme.md ================================================ # Fannkuch redux This is the [`fannkuch redux` benchmark from the benchmarksgame][bg]. ## Background and description The fannkuch benchmark is defined by programs in [Performing Lisp Analysis of the FANNKUCH Benchmark](http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.35.5124), Kenneth R. Anderson and Duane Rettig. FANNKUCH is an abbreviation for the German word __Pfannkuchen_, or pancakes, in analogy to flipping pancakes. The conjecture is that the maximum count is approximated by `n*log(n)` when `n` goes to infinity. Each program should: * Take a permutation of `{1,...,n}`, for example: `{4,2,1,5,3}`. * Take the first element, here `4`, and reverse the order of the first `4` elements: `{5,1,2,4,3}`. * Repeat this until the first element is a `1`, so flipping won't change anything more: `{3,4,2,1,5}`, `{2,4,3,1,5}`, `{4,2,3,1,5}`, `{1,3,2,4,5}`. * Count the number of flips, here `5`. * Keep a checksum * `checksum = checksum + (if permutation_index is even then flips_count else -flips_count)` * `checksum = checksum + (toggle_sign_-1_1 * flips_count)` * Do this for all `n!` permutations, and record the maximum number of flips needed for any permutation. ## Usage It takes two arguments in this order: * `n`: the input sequence length: `{1, ..., n}` * (optional) `algorithm`: the algorithm to use - defaults to the fastest one. 
* `0`: scalar algorithm * `1`: SIMD algorithm [bg]: https://benchmarksgame-team.pages.debian.net/benchmarksgame/description/fannkuchredux.html#fannkuchredux ================================================ FILE: examples/fannkuch_redux/src/fannkuchredux-output.txt ================================================ 228 Pfannkuchen(7) = 16 ================================================ FILE: examples/fannkuch_redux/src/lib.rs ================================================ //! Fannkuch redux #![deny(warnings, rust_2018_idioms)] #![allow(non_snake_case, non_camel_case_types)] #![allow( clippy::similar_names, clippy::many_single_char_names, clippy::cast_possible_truncation, clippy::cast_sign_loss, clippy::cast_possible_wrap, clippy::must_use_candidate, clippy::float_cmp )] pub mod scalar; pub mod simd; pub fn fannkuch_redux(n: usize, alg: usize) -> (i32, i32) { match alg { 0 => simd::fannkuch_redux(n), 1 => scalar::fannkuch_redux(n), v => panic!("unknown algorithm value: {}", v), } } ================================================ FILE: examples/fannkuch_redux/src/main.rs ================================================ #![deny(rust_2018_idioms)] use fannkuch_redux_lib::*; fn run(o: &mut O, n: usize, alg: usize) { let (checksum, maxflips) = fannkuch_redux(n, alg); writeln!(o, "{}\nPfannkuchen({}) = {}", checksum, n, maxflips).unwrap(); } fn main() { let n: usize = std::env::args().nth(1).expect("need one arg").parse().unwrap(); assert!((3..=14).contains(&n), "n = {} is out-of-range [3, 14]", n); let alg = if let Some(v) = std::env::args().nth(2) { v.parse().unwrap() } else { 0 }; run(&mut std::io::stdout(), n, alg); } #[cfg(test)] mod tests { use super::*; static OUTPUT: &[u8] = include_bytes!("fannkuchredux-output.txt"); #[test] fn verify_output_simd() { let mut out: Vec = Vec::new(); run(&mut out, 7, 0); assert_eq!(out.len(), OUTPUT.len()); if out != OUTPUT { for i in 0..out.len() { assert_eq!( out[i], OUTPUT[i], "byte {} differs - is: {:#08b} - should: {:#08b}", i, out[i], OUTPUT[i] ); } } } #[test] fn verify_output_scalar() { let mut out: Vec = Vec::new(); run(&mut out, 7, 1); assert_eq!(out.len(), OUTPUT.len()); if out != OUTPUT { for i in 0..out.len() { assert_eq!( out[i], OUTPUT[i], "byte {} differs - is: {:#08b} - should: {:#08b}", i, out[i], OUTPUT[i] ); } } } } ================================================ FILE: examples/fannkuch_redux/src/scalar.rs ================================================ //! 
Scalar fannkuch redux implementation use std::{cmp, mem, thread}; // FIXME: replace with slice rotate fn rotate(x: &mut [i32]) { let mut prev = x[0]; for place in x.iter_mut().rev() { prev = mem::replace(place, prev) } } fn next_permutation(perm: &mut [i32], count: &mut [i32]) { for i in 1..perm.len() { rotate(&mut perm[..=i]); let count_i = &mut count[i]; if *count_i >= i as i32 { *count_i = 0; } else { *count_i += 1; break; } } } #[derive(Clone, Copy)] struct P { p: [i32; 16], } #[derive(Clone, Copy)] struct Perm { cnt: [i32; 16], fact: [u32; 16], n: u32, permcount: u32, perm: P, } impl Perm { fn new(n: u32) -> Self { let mut fact = [1; 16]; for i in 1..=n as usize { fact[i] = fact[i - 1] * i as u32; } Self { cnt: [0; 16], fact, n, permcount: 0, perm: P { p: [0; 16] } } } fn get(&mut self, mut idx: i32) -> P { let mut pp = [0_u8; 16]; self.permcount = idx as u32; for (i, place) in self.perm.p.iter_mut().enumerate() { *place = i as i32 + 1; } for i in (1..self.n as usize).rev() { let d = idx / self.fact[i] as i32; self.cnt[i] = d; idx %= self.fact[i] as i32; for (place, val) in pp.iter_mut().zip(self.perm.p[..=i].iter()) { *place = (*val) as u8 } let d = d as usize; for j in 0..=i { self.perm.p[j] = i32::from(if j + d <= i { pp[j + d] } else { pp[j + d - i - 1] }); } } self.perm } fn count(&self) -> u32 { self.permcount } fn max(&self) -> u32 { self.fact[self.n as usize] } fn next(&mut self) -> P { next_permutation(&mut self.perm.p, &mut self.cnt); self.permcount += 1; self.perm } } fn reverse(tperm: &mut [i32], k: usize) { tperm[..k].reverse() } fn work(mut perm: Perm, n: usize, max: usize) -> (i32, i32) { let mut checksum = 0; let mut maxflips = 0; let mut p = perm.get(n as i32); while perm.count() < max as u32 { let mut flips = 0; while p.p[0] != 1 { let k = p.p[0] as usize; reverse(&mut p.p, k); flips += 1; } checksum += if perm.count() % 2 == 0 { flips } else { -flips }; maxflips = cmp::max(maxflips, flips); p = perm.next(); } (checksum, maxflips) } pub fn fannkuch_redux(n: usize) -> (i32, i32) { let perm = Perm::new(n as u32); let m = 1; let mut futures = vec![]; let k = perm.max() / m; for j in (0..).map(|x| x * k).take_while(|&j| j < k * m) { let max = cmp::min(j + k, perm.max()); futures .push(thread::spawn(move || work(perm, j as usize, max as usize))) } let mut checksum = 0; let mut maxflips = 0; for fut in futures { let (cs, mf) = fut.join().unwrap(); checksum += cs; maxflips = cmp::max(maxflips, mf); } (checksum, maxflips) } #[cfg(test)] #[test] fn test() { assert_eq!(fannkuch_redux(7), (228, 16)); } ================================================ FILE: examples/fannkuch_redux/src/simd.rs ================================================ //! 
Vectorized fannkuch redux implementation use packed_simd::*; struct State { s: [u8; 16], flip_masks: [u8x16; 16], rotate_masks: [u8x16; 16], maxflips: i32, odd: u16, checksum: i32, } impl Default for State { fn default() -> Self { Self { s: [0; 16], flip_masks: [u8x16::splat(0); 16], rotate_masks: [u8x16::splat(0); 16], maxflips: 0, odd: 0, checksum: 0, } } } impl State { fn rotate_sisd(&mut self, n: usize) { let c = self.s[0]; for i in 1..=n { self.s[i - 1] = self.s[i]; } self.s[n] = c; } fn popmasks(&mut self) { let mut mask = [0_u8; 16]; for i in 0..16 { for (j, m) in mask.iter_mut().enumerate() { *m = j as u8; } for x in 0..(i + 1) / 2 { mask.swap(x, i - x); } self.flip_masks[i] = u8x16::from_slice_unaligned(&mask); for (j, s) in self.s.iter_mut().enumerate() { *s = j as u8; } self.rotate_sisd(i); self.rotate_masks[i] = self.load_s(); } } fn rotate(&mut self, n: usize) { self.load_s() .shuffle1_dyn(self.rotate_masks[n]) .write_to_slice_unaligned(&mut self.s) } fn load_s(&self) -> u8x16 { u8x16::from_slice_unaligned(&self.s) } fn tk(&mut self, n: usize) { #[derive(Copy, Clone, Debug)] struct Perm { perm: u8x16, start: u8, odd: u16, } let mut perms = [Perm { perm: u8x16::splat(0), start: 0, odd: 0 }; 60]; let mut i = 0; let mut c = [0_u8; 16]; let mut perm_max = 0; // Cache this locally outside the loop, since the compiler // can't optimize accesses to it otherwise. let mut odd = self.odd; while i < n { while i < n && perm_max < 60 { self.rotate(i); if c[i] as usize >= i { c[i] = 0; i += 1; continue; } c[i] += 1; i = 1; odd = !odd; if self.s[0] != 0 { if self.s[self.s[0] as usize] == 0 { if self.maxflips == 0 { self.maxflips = 1 } self.checksum += if odd == 0 { 1 } else { -1 }; } else { perms[perm_max].perm = self.load_s(); perms[perm_max].start = self.s[0]; perms[perm_max].odd = odd; perm_max += 1; } } } let mut k = 0; while k < std::cmp::max(1, perm_max) - 1 { let pk = &perms[k]; let pk1 = &perms[k + 1]; let mut perm1 = pk.perm; let mut perm2 = pk1.perm; let mut f1 = 0; let mut f2 = 0; let mut toterm1 = pk.start; let mut toterm2 = pk1.start; while toterm1 != 0 && toterm2 != 0 { perm1 = perm1.shuffle1_dyn(self.flip_masks[toterm1 as usize]); perm2 = perm2.shuffle1_dyn(self.flip_masks[toterm2 as usize]); toterm1 = perm1.extract(0); toterm2 = perm2.extract(0); f1 += 1; f2 += 1; } while toterm1 != 0 { perm1 = perm1.shuffle1_dyn(self.flip_masks[toterm1 as usize]); toterm1 = perm1.extract(0); f1 += 1; } while toterm2 != 0 { perm2 = perm2.shuffle1_dyn(self.flip_masks[toterm2 as usize]); toterm2 = perm2.extract(0); f2 += 1; } if f1 > self.maxflips { self.maxflips = f1 } if f2 > self.maxflips { self.maxflips = f2 } self.checksum += if pk.odd == 0 { f1 } else { -f1 }; self.checksum += if pk1.odd == 0 { f2 } else { -f2 }; k += 2; } while k < perm_max { let pk = &perms[k]; let mut perm = pk.perm; let mut f = 0; let mut toterm = pk.start; while toterm != 0 { perm = perm.shuffle1_dyn(self.flip_masks[toterm as usize]); toterm = perm.extract(0); f += 1; } if f > self.maxflips { self.maxflips = f } self.checksum += if pk.odd == 0 { f } else { -f }; k += 1 } perm_max = 0; } } } pub fn fannkuch_redux(n: usize) -> (i32, i32) { let mut state = State::default(); state.popmasks(); for i in 0..n { state.s[i] = i as u8 } state.tk(n); (state.checksum, state.maxflips) } #[cfg(test)] #[test] fn test() { assert_eq!(fannkuch_redux(7), (228, 16)); } ================================================ FILE: examples/mandelbrot/Cargo.toml ================================================ [package] name = "mandelbrot" 
version = "0.1.0" authors = ["gnzlbg "] build = "build.rs" edition = "2018" [dependencies] packed_simd = { package = "packed_simd", path = "../.." } rayon = "^1.0" ispc = { version = "^1.0.4", optional = true } structopt = { version = "0.3.0", features = ["color"] } [build-dependencies] ispc = { version = "^1.0.4", optional = true } [[bin]] name = "mandelbrot" path = "src/main.rs" [lib] name = "mandelbrot_lib" path = "src/lib.rs" [features] default = [] sleef-sys = ["packed_simd/sleef-sys"] core_arch = ["packed_simd/core_arch"] ================================================ FILE: examples/mandelbrot/benchmark.sh ================================================ #!/usr/bin/env bash # # Runs mandelbrot benchmarks set -ex WIDTH=800 HEIGHT=800 if [[ ${NORUN} != 1 ]]; then hash hyperfine 2>/dev/null || { echo >&2 "hyperfine is not in PATH."; exit 1; } fi if echo "$FEATURES" | grep -q "ispc"; then hash ispc 2>/dev/null || { echo >&2 "ispc is not in PATH."; exit 1; } fi RUSTFLAGS="-C target-cpu=native ${RUSTFLAGS}" \ cargo build --release --features="${FEATURES}" if [[ "${VERIFY}" == "1" ]]; then RUSTFLAGS="-C target-cpu=native ${RUSTFLAGS}" \ cargo test --release --features="${FEATURES}" fi if [[ "${NORUN}" == "1" ]]; then exit 0 fi hyperfine "../target/release/mandelbrot ${WIDTH} ${HEIGHT} --algo scalar" hyperfine "../target/release/mandelbrot ${WIDTH} ${HEIGHT} --algo simd" if echo "$FEATURES" | grep -q "ispc"; then hyperfine "../target/release/mandelbrot ${WIDTH} ${HEIGHT} --algo ispc" fi ================================================ FILE: examples/mandelbrot/build.rs ================================================ fn main() { println!("cargo:rerun-if-changed=build.rs"); #[cfg(feature = "ispc")] { if std::env::var("CARGO_FEATURE_ISPC").is_ok() { let mut cfg = ispc::Config::new(); if cfg!(windows) { cfg.debug(false); } let ispc_files = vec!["volta/mandelbrot.ispc"]; for s in &ispc_files[..] { cfg.file(*s); } cfg.target_isas(vec![ ispc::opt::TargetISA::SSE2i32x4, ispc::opt::TargetISA::SSE4i32x4, ispc::opt::TargetISA::AVX1i32x8, ispc::opt::TargetISA::AVX2i32x8, ispc::opt::TargetISA::AVX512KNLi32x16, ]); cfg.compile("mandelbrot"); } } } ================================================ FILE: examples/mandelbrot/readme.md ================================================ # Mandelbrot This is the [`mandelbrot` benchmark from the benchmarksgame][bg]. ## Background http://mathworld.wolfram.com/MandelbrotSet.html ## Usage It takes four arguments in this order: * `width`: width of the image to render * `height`: height of the image to render * `algorithm`: algorithm to use: * `scalar`: scalar algorithm * `simd`: parallelized SIMD algorithm * `ispc`: ISPC + tasks algorithm * `--color` (optional): enables colorized output, which also determines the image format. * disabled (default): PBM: Portable BitMap format (black & white output) * enabled: PPM: Portable PixMap format (colored output) The resulting image is piped to `stdout`. `cargo run --release -- 400 400 --algo simd > output.ppm` outputs: ![run_400_png](https://user-images.githubusercontent.com/904614/43190942-72bdb834-8ffa-11e8-9dcf-a9a9632ae907.png) `cargo run --release -- 400 400 --algo simd --color > output.ppm` outputs: ![run_400_400_1_1_png](https://user-images.githubusercontent.com/904614/43190948-759969a4-8ffa-11e8-81a9-35e5baef3e86.png) ## Performance ``` ./benchmark.sh ``` On a dual core AVX1 i5 @1.8 GHz: | 800 x 800 | time [ms]
Rust | speedup vs `scalar` [-] | |------------|---------------------|-------------| | `scalar` | 86.6 | 1.0x | | `simd` | 21.0 | 4.1x | | `ispc` | 25.7 | 3.4x | `simd` algorithm is ~1.2x faster than `ispc`. On a 28 core Xeon CPU E5-2690 v4 @ 2.60GHz: | 800 x 800 | time [ms]
Rust | speedup vs `scalar` [-] | |------------|---------------------|-------------------------| | `scalar` | 50.8 | 1.0x | | `simd` | 25.1 | 2x | | `ispc` | 14.4 | 3.52x | `simd` algorithm is ~1.74x slower than `ispc`. On a 40 core Xeon Gold 6148 CPU @ 2.40GHz: | 800 x 800 | time [ms]
Rust | speedup vs `scalar` [-] | |------------|---------------------|-------------| | `scalar` | 59.9 | 1.0x | | `simd` | 29.9 | 2.0x | | `ispc` | 30.3 | 2.0x | `simd` algorithm is as fast as `ispc`. [bg]: https://benchmarksgame-team.pages.debian.net/benchmarksgame/description/mandelbrot.html#mandelbrot ================================================ FILE: examples/mandelbrot/src/ispc_tasks.rs ================================================ //! Includes the ISPC implementations. use crate::*; use ispc::*; ispc_module!(mandelbrot); pub fn generate(dims: Dimensions, xr: Range, yr: Range) -> Vec { let (width, height) = dims; let Range { start: left, end: right } = xr; let Range { start: top, end: bottom } = yr; let len = width * height; let mut out = Vec::with_capacity(len); unsafe { mandelbrot::mandelbrot_ispc( left, bottom, right, top, height as i32, width as i32, ITER_LIMIT as i32, out.as_mut_ptr() as *mut i32, ); out.set_len(len); } out } ================================================ FILE: examples/mandelbrot/src/lib.rs ================================================ //! The mandelbrot benchmark from the [benchmarks game][bg]. //! //! [bg]: https://benchmarksgame-team.pages.debian.net/benchmarksgame/description/mandelbrot.html#mandelbrot // FIXME: Null pointer deref warning triggered in this example, // likely inside a macro expansion deriving from packed_simd. #![deny(rust_2018_idioms)] #![allow( clippy::cast_precision_loss, clippy::cast_sign_loss, clippy::cast_possible_truncation, clippy::must_use_candidate )] use rayon::prelude::*; use std::{io, ops}; // Each algorithm implementation must expose a single public function, // `generate`: fn generate(dimensions: Dimensions, xr: Range, yr: Range) -> // Vec; // // Generates the Mandelbrot fractal for a region of Cartesian space, // where X is bounded by `xr.begin..xr.end` and Y by `yr.begin..yr.end`. // // Returns a vector of dimensions `width * height`, where each byte is // the number of iterations the corresponding point reached before diverging. #[cfg(feature = "ispc")] mod ispc_tasks; mod scalar_par; mod simd_par; type Range = ops::Range; type Region = (Range, Range); /// The width and height of a generated image pub type Dimensions = (usize, usize); /// The Mandelbrot algorithms supported by this crate. #[derive(Debug, Copy, Clone)] pub enum Algorithm { /// Scalar parallel algorithm Scalar, /// Parallel SIMD algorithm using Rayon Simd, /// ISPC SIMD + parallel tasks algorithm Ispc, } pub struct Mandelbrot { dims: Dimensions, data: Vec, } impl Mandelbrot { /// Generates a new image of the Mandelbrot fractal. pub fn generate(dims: Dimensions, algo: Algorithm) -> Self { Self::generate_region(dims, DEFAULT_REGION, algo) } /// Generates a new image containing a certain region of the Mandelbrot /// fractal. pub fn generate_region( dims: Dimensions, region: Region, algo: Algorithm, ) -> Self { let data = match algo { Algorithm::Scalar => { scalar_par::generate(dims, region.0, region.1) } Algorithm::Simd => simd_par::generate(dims, region.0, region.1), #[cfg(feature = "ispc")] Algorithm::Ispc => ispc_tasks::generate(dims, region.0, region.1), #[cfg(not(feature = "ispc"))] Algorithm::Ispc => unimplemented!( "This crate was built with the `ispc` feature disabled" ), }; Self { dims, data } } /// Writes the PBM / PPM header to the output. 
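    // The Netpbm header emitted by this function is, e.g.,
    //   "P4\n<width> <height>\n"      for black & white output (PBM), or
    //   "P6\n<width> <height> 255\n"  for color output (PPM),
    // where "P4"/"P6" is the binary-format magic number and 255 is the
    // maximum value per color channel; the raster data follows the header.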
fn write_header( &self, f: &mut dyn io::Write, color: bool, ) -> io::Result<()> { writeln!(f, "P{}", if color { 6 } else { 4 })?; write!(f, "{} {}", self.dims.0, self.dims.1)?; if color { write!(f, " 255")?; } writeln!(f) } /// Outputs a black/white PBM bitmap to the given writer. pub fn output_pbm(&self, f: &mut dyn io::Write) -> io::Result<()> { self.write_header(f, false)?; assert_eq!( self.data.len() % 8, 0, "Output data must be a multiple of 8" ); let buf = self .data .par_chunks(8) .map(|ch| { let mut result = 0; ch.iter().enumerate().for_each(|(i, &count)| { let undiverged = count == ITER_LIMIT; result |= (undiverged as u8) << (7 - i); }); result }) .collect::>(); f.write_all(&buf) } /// Outputs a color PPM image to the given writer. pub fn output_ppm(&self, f: &mut dyn io::Write) -> io::Result<()> { self.write_header(f, true)?; let buf = self .data .par_iter() .flat_map(|&val| { const COLORS: &[(f32, f32, f32)] = &[ (0.0, 7.0, 100.0), (32.0, 107.0, 203.0), (237.0, 255.0, 255.0), (255.0, 170.0, 0.0), (0.0, 2.0, 0.0), ]; const SCALE: u32 = 12; let color_count = COLORS.len() as u32; let color = if val == ITER_LIMIT { vec![0, 0, 0] } else { let val = (val % SCALE) * color_count / SCALE; let left = val % color_count; let right = (left + 1) % color_count; let alpha = (val - left) as f32; let (r1, g1, b1) = COLORS[left as usize]; let (r2, g2, b2) = COLORS[right as usize]; vec![ (r1 + (r2 - r1) * alpha) as u8, (g1 + (g2 - g1) * alpha) as u8, (b1 + (b2 - b1) * alpha) as u8, ] }; color.into_par_iter() }) .collect::>(); f.write_all(&buf) } } /// Returns the default region of space to generate an image for. /// /// This is the region containing the fractal most people think of when they /// think of Mandelbrot, since values outside definitely diverge. const DEFAULT_REGION: (Range, Range) = (-1.5..0.5, -1.0..1.0); /// Threshold for Mandelbrot sequence divergence /// /// Complex numbers which have a modulus squared greater than this are /// considered to be diverging. const THRESHOLD: f64 = 4.0; /// Maximum amount of iterations to perform /// /// Increasing this will make more features to be visible in the image, /// assuming the resolution is large enoguh. 
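// Points whose count reaches `ITER_LIMIT` without diverging are treated as
// members of the set: they become set bits in the PBM output and black
// pixels in the PPM output.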
const ITER_LIMIT: u32 = 50; #[cfg(test)] mod tests { use super::*; #[test] #[cfg_attr(windows, ignore)] fn verify_all() { let width = 400; let height = 800; let dims = (width, height); let verify = |actual: &[u32], expected: &[u32]| { if actual != expected { for row in 0..height { for column in 0..width { let idx = row * width + column; assert_eq!( actual[idx], expected[idx], "difference at ({}, {})", row, column, ); } } } }; eprintln!("Generating Mandelbrot with scalar algorithm"); let scalar = scalar_par::generate(dims, DEFAULT_REGION.0, DEFAULT_REGION.1); assert_eq!(scalar.len(), width * height); eprintln!("Generating Mandelbrot with SIMD algorithm"); let simd = simd_par::generate(dims, DEFAULT_REGION.0, DEFAULT_REGION.1); verify(&simd[..], &scalar[..]); } fn verify_algo(algo: Algorithm) { static OUTPUT: &[u8] = include_bytes!("mandelbrot-output.txt"); let (width, height) = (200, 200); let dims = (width, height); let mb = Mandelbrot::generate(dims, algo); let out = { let mut out = Vec::with_capacity(width * height); mb.output_pbm(&mut out).unwrap(); out }; assert_eq!(out.len(), OUTPUT.len()); if out != OUTPUT { out.into_iter().zip(OUTPUT.iter()).enumerate().for_each( |(i, (a, &b))| { assert_eq!( a, b, "byte {} differs - {:#08b} != {:#08b} (expected)", i, a, b, ); }, ); } } #[test] fn verify_output_scalar() { verify_algo(Algorithm::Scalar); } #[test] #[cfg_attr(windows, ignore)] fn verify_output_simd() { verify_algo(Algorithm::Simd); } } ================================================ FILE: examples/mandelbrot/src/main.rs ================================================ //! The Mandelbrot benchmark from the [benchmarksgame][bg] //! //! [bg]: https://benchmarksgame-team.pages.debian.net/benchmarksgame/description/mandelbrot.html#mandelbrot #![deny(rust_2018_idioms)] use mandelbrot_lib::*; use std::io; use structopt::StructOpt; /// Mandelbrot image generator. /// /// Output is printed to `stdout`. #[derive(StructOpt)] struct Opt { /// Image width. width: usize, /// Image height. height: usize, /// Enable this to output a color image. #[structopt(short = "c", long = "color")] color: bool, /// Algorithm #[structopt(short = "a", long = "algo")] algo: String, } const ALGORITHMS: &[&str] = &["scalar", "simd", "ispc"]; fn main() { let opt = Opt::from_args(); let algo = match opt.algo.as_str() { "scalar" => Algorithm::Scalar, "simd" => Algorithm::Simd, "ispc" => Algorithm::Ispc, algo => panic!( "Unknown algorithm: {:?}\nAvailable algorithms: {:?}", algo, ALGORITHMS ), }; let mb = Mandelbrot::generate((opt.width, opt.height), algo); let mut stdout = io::stdout(); if opt.color { mb.output_ppm(&mut stdout).unwrap(); } else { mb.output_pbm(&mut stdout).unwrap(); } } ================================================ FILE: examples/mandelbrot/src/scalar_par.rs ================================================ //! 
Scalar mandelbrot implementation use crate::*; /// Complex number #[repr(align(16))] #[derive(Copy, Clone)] struct Complex { real: f64, imag: f64, } impl Complex { /// Returns true if this member of the Mandelbrot sequence is diverging #[inline] fn diverged(&self) -> bool { let Self { real: x, imag: y } = self; let xx = x * x; let yy = y * y; let sum = xx + yy; sum > THRESHOLD } } /// An iterator yielding the infinite Mandelbrot sequence struct MandelbrotIter { /// Initial value which generated this sequence start: Complex, /// Current iteration value current: Complex, } impl MandelbrotIter { /// Creates a new Mandelbrot sequence iterator for a given starting point fn new(start: Complex) -> Self { Self { start, current: start } } /// Returns the number of iterations it takes for the Mandelbrot sequence /// to diverge at this point, or `ITER_LIMIT` if it doesn't diverge. fn count(mut self) -> u32 { let mut z = self.start; for i in 0..ITER_LIMIT { if z.diverged() { return i; } z = self.next().unwrap(); } ITER_LIMIT } } impl Iterator for MandelbrotIter { type Item = Complex; /// Generates the next value in the sequence #[inline] fn next(&mut self) -> Option { let Complex { real: c_x, imag: c_y } = self.start; let Complex { real: x, imag: y } = self.current; let xx = x * x; let yy = y * y; let xy = x * y; let new_x = c_x + (xx - yy); let new_y = c_y + (xy + xy); self.current = Complex { real: new_x, imag: new_y }; Some(self.current) } } pub fn generate(dims: Dimensions, xr: Range, yr: Range) -> Vec { let (width, height) = dims; let xs = { let dx = (xr.end - xr.start) / (width as f64); let mut buf = Vec::new(); (0..width) .into_par_iter() .map(|j| xr.start + dx * (j as f64)) .collect_into_vec(&mut buf); buf }; let dy = (yr.end - yr.start) / (height as f64); let len = width * height; let mut out = Vec::with_capacity(len); unsafe { out.set_len(len); } out.par_chunks_mut(width).enumerate().for_each(|(i, row)| { let y = yr.start + dy * (i as f64); row.iter_mut().enumerate().for_each(|(j, count)| { let x = xs[j]; let z = Complex { real: x, imag: y }; *count = MandelbrotIter::new(z).count() as u32; }); }); out } ================================================ FILE: examples/mandelbrot/src/simd_par.rs ================================================ //! Vectorized parallel Mandelbrot implementation #![allow(non_camel_case_types)] use crate::*; use packed_simd::*; type u64s = u64x8; type u32s = u32x8; type f64s = f64x8; type m64s = m64x8; /// Storage for complex numbers in SIMD format. /// The real and imaginary parts are kept in separate registers. #[derive(Copy, Clone)] struct Complex { real: f64s, imag: f64s, } impl Complex { /// Returns a mask describing which members of the Mandelbrot sequence /// haven't diverged yet #[inline] fn undiverged(&self) -> m64s { let Self { real: x, imag: y } = *self; let xx = x * x; let yy = y * y; let sum = xx + yy; sum.le(f64s::splat(THRESHOLD)) } } /// Mandelbrot sequence iterator using SIMD. struct MandelbrotIter { /// Initial value which generated this sequence start: Complex, /// Current iteration value current: Complex, } impl MandelbrotIter { /// Creates a new Mandelbrot sequence iterator for a given starting point fn new(start: Complex) -> Self { Self { start, current: start } } /// Returns the number of iterations it takes for each member of the /// Mandelbrot sequence to diverge at this point, or `ITER_LIMIT` if /// they don't diverge. 
/// /// This function will operate on N complex numbers at once, where N is the /// number of lanes in a SIMD vector of doubles. fn count(mut self) -> u32s { let mut z = self.start; let mut count = u64s::splat(0); for _ in 0..ITER_LIMIT { // Keep track of those lanes which haven't diverged yet. The other // ones will be masked off. let undiverged = z.undiverged(); // Stop the iteration if they all diverged. Note that we don't do // this check every iteration, since a branch // misprediction can hurt more than doing some extra // calculations. if undiverged.none() { break; } count += undiverged.select(u64s::splat(1), u64s::splat(0)); z = self.next().unwrap(); } count.cast() } } impl Iterator for MandelbrotIter { type Item = Complex; /// Generates the next values in the sequence #[inline] fn next(&mut self) -> Option { let Complex { real: c_x, imag: c_y } = self.start; let Complex { real: x, imag: y } = self.current; let xx = x * x; let yy = y * y; let xy = x * y; let new_x = c_x + (xx - yy); let new_y = c_y + (xy + xy); self.current = Complex { real: new_x, imag: new_y }; Some(self.current) } } pub fn generate(dims: Dimensions, xr: Range, yr: Range) -> Vec { let (width, height) = dims; let block_size = f64s::lanes(); assert_eq!( width % block_size, 0, "image width = {} is not divisible by the number of vector lanes = {}", width, block_size, ); let width_in_blocks = width / block_size; // The initial X values are the same for every row. let xs = unsafe { let dx = (xr.end - xr.start) / (width as f64); let mut buf: Vec = vec![f64s::splat(0.); width_in_blocks]; std::slice::from_raw_parts_mut(buf.as_mut_ptr() as *mut f64, width) .iter_mut() .enumerate() .for_each(|(j, x)| { *x = xr.start + dx * (j as f64); }); buf }; let dy = (yr.end - yr.start) / (height as f64); let len = width_in_blocks * height; let mut out = Vec::with_capacity(len); unsafe { out.set_len(len); } out.par_chunks_mut(width_in_blocks).enumerate().for_each(|(i, row)| { let y = f64s::splat(yr.start + dy * (i as f64)); row.iter_mut().enumerate().for_each(|(j, count)| { let x = xs[j]; let z = Complex { real: x, imag: y }; *count = MandelbrotIter::new(z).count(); }); }); // This is safe, we're transmuting from a more-aligned type to a // less-aligned one. #[allow(clippy::unsound_collection_transmute)] unsafe { let mut out: Vec = std::mem::transmute(out); out.set_len(width * height); out } } ================================================ FILE: examples/mandelbrot/volta/mandelbrot.ispc ================================================ /* Copyright (c) 2010-2012, Intel Corporation All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * Neither the name of Intel Corporation nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ static inline int mandel(double c_re, double c_im, int count) { double z_re = c_re, z_im = c_im; int i; for (i = 0; i < count; ++i) { if (z_re * z_re + z_im * z_im > 4.) break; double new_re = z_re*z_re - z_im*z_im; double new_im = 2.f * z_re * z_im; unmasked { z_re = c_re + new_re; z_im = c_im + new_im; } } return i; } export void mandelbrot_ispc(uniform double x0, uniform double y0, uniform double x1, uniform double y1, uniform int width, uniform int height, uniform int maxIterations, uniform int output[]) { double dx = (x1 - x0) / width; double dy = (y1 - y0) / height; for (uniform int j = 0; j < height; j++) { // Note that we'll be doing programCount computations in parallel, // so increment i by that much. This assumes that width evenly // divides programCount. foreach (i = 0 ... width) { // Figure out the position on the complex plane to compute the // number of iterations at. Note that the x values are // different across different program instances, since its // initializer incorporates the value of the programIndex // variable. double x = x0 + i * dx; double y = y0 + j * dy; int index = j * width + i; output[index] = mandel(x, y, maxIterations); } } } ================================================ FILE: examples/matrix_inverse/Cargo.toml ================================================ [package] name = "matrix_inverse" version = "0.1.0" authors = ["Gonzalo Brito Gadeschi "] edition = "2018" [dependencies] packed_simd = { package = "packed_simd", path = "../.." } [lib] name = "matrix_inverse_lib" path = "src/lib.rs" ================================================ FILE: examples/matrix_inverse/readme.md ================================================ # 4x4 matrix inverse ================================================ FILE: examples/matrix_inverse/src/lib.rs ================================================ //! 4x4 matrix inverse #![feature(custom_inner_attributes)] #![deny(rust_2018_idioms)] #![allow(clippy::must_use_candidate)] pub mod scalar; pub mod simd; #[derive(Copy, Clone, Debug, PartialEq, PartialOrd)] pub struct Matrix4x4([[f32; 4]; 4]); #[cfg(test)] #[rustfmt::skip] fn test Option>(f: F) { let tests: &[(Matrix4x4, Option)] = &[ // Identity: (Matrix4x4([ [1., 0., 0., 0.], [0., 1., 0., 0.], [0., 0., 1., 0.], [0., 0., 0., 1.], ]), Some(Matrix4x4([ [1., 0., 0., 0.], [0., 1., 0., 0.], [0., 0., 1., 0.], [0., 0., 0., 1.], ])) ), // None: (Matrix4x4([ [1., 2., 3., 4.], [12., 11., 10., 9.], [5., 6., 7., 8.], [16., 15., 14., 13.], ]), None ), // Other: (Matrix4x4([ [1., 1., 1., 0.], [0., 3., 1., 2.], [2., 3., 1., 0.], [1., 0., 2., 1.], ]), Some(Matrix4x4([ [-3., -0.5, 1.5, 1.0], [ 1., 0.25, -0.25, -0.5], [ 3., 0.25, -1.25, -0.5], [-3., 0.0, 1.0, 1.0], ])) ), ]; for &(input, output) in tests { assert_eq!(f(input), output); } } ================================================ FILE: examples/matrix_inverse/src/scalar.rs ================================================ //! 
Scalar implementation #[rustfmt::skip] use crate::*; #[allow(clippy::too_many_lines)] pub fn inv4x4(m: Matrix4x4) -> Option { let m = m.0; let mut inv = [ [ // row 0: // 0,0: m[1][1] * m[2][2] * m[3][3] - m[1][1] * m[2][3] * m[3][2] - m[2][1] * m[1][2] * m[3][3] + m[2][1] * m[1][3] * m[3][2] + m[3][1] * m[1][2] * m[2][3] - m[3][1] * m[1][3] * m[2][2], // 0,1: -m[0][1] * m[2][2] * m[3][3] + m[0][1] * m[2][3] * m[3][2] + m[2][1] * m[0][2] * m[3][3] - m[2][1] * m[0][3] * m[3][2] - m[3][1] * m[0][2] * m[2][3] + m[3][1] * m[0][3] * m[2][2], // 0,2: m[0][1] * m[1][2] * m[3][3] - m[0][1] * m[1][3] * m[3][2] - m[1][1] * m[0][2] * m[3][3] + m[1][1] * m[0][3] * m[3][2] + m[3][1] * m[0][2] * m[1][3] - m[3][1] * m[0][3] * m[1][2], // 0,3: -m[0][1] * m[1][2] * m[2][3] + m[0][1] * m[1][3] * m[2][2] + m[1][1] * m[0][2] * m[2][3] - m[1][1] * m[0][3] * m[2][2] - m[2][1] * m[0][2] * m[1][3] + m[2][1] * m[0][3] * m[1][2], ], [ // row 1 // 1,0: -m[1][0] * m[2][2] * m[3][3] + m[1][0] * m[2][3] * m[3][2] + m[2][0] * m[1][2] * m[3][3] - m[2][0] * m[1][3] * m[3][2] - m[3][0] * m[1][2] * m[2][3] + m[3][0] * m[1][3] * m[2][2], // 1,1: m[0][0] * m[2][2] * m[3][3] - m[0][0] * m[2][3] * m[3][2] - m[2][0] * m[0][2] * m[3][3] + m[2][0] * m[0][3] * m[3][2] + m[3][0] * m[0][2] * m[2][3] - m[3][0] * m[0][3] * m[2][2], // 1,2: -m[0][0] * m[1][2] * m[3][3] + m[0][0] * m[1][3] * m[3][2] + m[1][0] * m[0][2] * m[3][3] - m[1][0] * m[0][3] * m[3][2] - m[3][0] * m[0][2] * m[1][3] + m[3][0] * m[0][3] * m[1][2], // 1,3: m[0][0] * m[1][2] * m[2][3] - m[0][0] * m[1][3] * m[2][2] - m[1][0] * m[0][2] * m[2][3] + m[1][0] * m[0][3] * m[2][2] + m[2][0] * m[0][2] * m[1][3] - m[2][0] * m[0][3] * m[1][2], ], [ // row 2 // 2,0: m[1][0] * m[2][1] * m[3][3] - m[1][0] * m[2][3] * m[3][1] - m[2][0] * m[1][1] * m[3][3] + m[2][0] * m[1][3] * m[3][1] + m[3][0] * m[1][1] * m[2][3] - m[3][0] * m[1][3] * m[2][1], // 2,1: -m[0][0] * m[2][1] * m[3][3] + m[0][0] * m[2][3] * m[3][1] + m[2][0] * m[0][1] * m[3][3] - m[2][0] * m[0][3] * m[3][1] - m[3][0] * m[0][1] * m[2][3] + m[3][0] * m[0][3] * m[2][1], // 2,2: m[0][0] * m[1][1] * m[3][3] - m[0][0] * m[1][3] * m[3][1] - m[1][0] * m[0][1] * m[3][3] + m[1][0] * m[0][3] * m[3][1] + m[3][0] * m[0][1] * m[1][3] - m[3][0] * m[0][3] * m[1][1], // 2,3: -m[0][0] * m[1][1] * m[2][3] + m[0][0] * m[1][3] * m[2][1] + m[1][0] * m[0][1] * m[2][3] - m[1][0] * m[0][3] * m[2][1] - m[2][0] * m[0][1] * m[1][3] + m[2][0] * m[0][3] * m[1][1], ], [ // row 3 // 3,0: -m[1][0] * m[2][1] * m[3][2] + m[1][0] * m[2][2] * m[3][1] + m[2][0] * m[1][1] * m[3][2] - m[2][0] * m[1][2] * m[3][1] - m[3][0] * m[1][1] * m[2][2] + m[3][0] * m[1][2] * m[2][1], // 3,1: m[0][0] * m[2][1] * m[3][2] - m[0][0] * m[2][2] * m[3][1] - m[2][0] * m[0][1] * m[3][2] + m[2][0] * m[0][2] * m[3][1] + m[3][0] * m[0][1] * m[2][2] - m[3][0] * m[0][2] * m[2][1], // 3,2: -m[0][0] * m[1][1] * m[3][2] + m[0][0] * m[1][2] * m[3][1] + m[1][0] * m[0][1] * m[3][2] - m[1][0] * m[0][2] * m[3][1] - m[3][0] * m[0][1] * m[1][2] + m[3][0] * m[0][2] * m[1][1], // 3,3: m[0][0] * m[1][1] * m[2][2] - m[0][0] * m[1][2] * m[2][1] - m[1][0] * m[0][1] * m[2][2] + m[1][0] * m[0][2] * m[2][1] + m[2][0] * m[0][1] * m[1][2] - m[2][0] * m[0][2] * m[1][1], ], ]; let det = m[0][0] * inv[0][0] + m[0][1] * inv[1][0] + m[0][2] * inv[2][0] + m[0][3] * inv[3][0]; if det == 0. { return None; } let det_inv = 1. 
/ det; for row in &mut inv { for elem in row.iter_mut() { *elem *= det_inv; } } Some(Matrix4x4(inv)) } #[cfg(test)] #[test] fn test() { crate::test(inv4x4) } ================================================ FILE: examples/matrix_inverse/src/simd.rs ================================================ //! 4x4 matrix inverse using SIMD use crate::*; use packed_simd::shuffle; use packed_simd::f32x4; pub fn inv4x4(m: Matrix4x4) -> Option { let m = m.0; let m_0 = f32x4::from_slice_unaligned(&m[0]); let m_1 = f32x4::from_slice_unaligned(&m[1]); let m_2 = f32x4::from_slice_unaligned(&m[2]); let m_3 = f32x4::from_slice_unaligned(&m[3]); let tmp1: f32x4 = shuffle!(m_0, m_1, [0, 1, 4, 5]); let row1: f32x4 = shuffle!(m_2, m_3, [0, 1, 4, 5]); let row0 = shuffle!(tmp1, row1, [0, 2, 4, 6]); let row1: f32x4 = shuffle!(row1, tmp1, [1, 3, 5, 7]); let tmp1: f32x4 = shuffle!(m_0, m_1, [2, 3, 6, 7]); let row3: f32x4 = shuffle!(m_2, m_3, [2, 3, 6, 7]); let row2 = shuffle!(tmp1, row3, [0, 2, 4, 6]); let row3 = shuffle!(row3, tmp1, [1, 3, 5, 7]); let tmp1: f32x4 = row2 * row3; let tmp1 = shuffle!(tmp1, [1, 0, 3, 2]); let minor0 = row1 * tmp1; let minor1 = row0 * tmp1; let tmp1 = shuffle!(tmp1, [2, 3, 0, 1]); let minor0 = (row1 * tmp1) - minor0; let minor1 = (row0 * tmp1) - minor1; let minor1 = shuffle!(minor1, [2, 3, 0, 1]); let tmp1 = row1 * row2; let tmp1 = shuffle!(tmp1, [1, 0, 3, 2]); let minor0 = (row3 * tmp1) + minor0; let minor3 = row0 * tmp1; let tmp1 = shuffle!(tmp1, [2, 3, 0, 1]); let minor0 = minor0 - row3 * tmp1; let minor3 = row0 * tmp1 - minor3; let minor3 = shuffle!(minor3, [2, 3, 0, 1]); let tmp1 = row3 * shuffle!(row1, [2, 3, 0, 1]); let tmp1 = shuffle!(tmp1, [1, 0, 3, 2]); let row2 = shuffle!(row2, [2, 3, 0, 1]); let minor0 = row2 * tmp1 + minor0; let minor2 = row0 * tmp1; let tmp1 = shuffle!(tmp1, [2, 3, 0, 1]); let minor0 = minor0 - row2 * tmp1; let minor2 = row0 * tmp1 - minor2; let minor2 = shuffle!(minor2, [2, 3, 0, 1]); let tmp1 = row0 * row1; let tmp1 = shuffle!(tmp1, [1, 0, 3, 2]); let minor2 = minor2 + row3 * tmp1; let minor3 = row2 * tmp1 - minor3; let tmp1 = shuffle!(tmp1, [2, 3, 0, 1]); let minor2 = row3 * tmp1 - minor2; let minor3 = minor3 - row2 * tmp1; let tmp1 = row0 * row3; let tmp1 = shuffle!(tmp1, [1, 0, 3, 2]); let minor1 = minor1 - row2 * tmp1; let minor2 = row1 * tmp1 + minor2; let tmp1 = shuffle!(tmp1, [2, 3, 0, 1]); let minor1 = row2 * tmp1 + minor1; let minor2 = minor2 - row1 * tmp1; let tmp1 = row0 * row2; let tmp1 = shuffle!(tmp1, [1, 0, 3, 2]); let minor1 = row3 * tmp1 + minor1; let minor3 = minor3 - row1 * tmp1; let tmp1 = shuffle!(tmp1, [2, 3, 0, 1]); let minor1 = minor1 - row3 * tmp1; let minor3 = row1 * tmp1 + minor3; let det = row0 * minor0; let det = shuffle!(det, [2, 3, 0, 1]) + det; let det = shuffle!(det, [1, 0, 3, 2]) + det; if det.sum() == 0. 
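    // A zero determinant means the matrix is singular, so there is no inverse.
    // Otherwise, `recpre()` below yields an approximate reciprocal of the
    // determinant, which is then refined with one Newton-Raphson step:
    //   x1 = 2*x0 - det*x0*x0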
{ return None; } let tmp1 = det.recpre(); let det = tmp1 + tmp1 - det * tmp1 * tmp1; let res0 = minor0 * det; let res1 = minor1 * det; let res2 = minor2 * det; let res3 = minor3 * det; let mut m = m; res0.write_to_slice_unaligned(&mut m[0]); res1.write_to_slice_unaligned(&mut m[1]); res2.write_to_slice_unaligned(&mut m[2]); res3.write_to_slice_unaligned(&mut m[3]); Some(Matrix4x4(m)) } #[cfg(test)] #[test] fn test() { crate::test(inv4x4) } ================================================ FILE: examples/nbody/Cargo.toml ================================================ [package] name = "nbody" version = "0.1.0" authors = ["Gonzalo Brito Gadeschi "] edition = "2018" [dependencies] packed_simd = { package = "packed_simd", path = "../.." } [[bin]] name = "nbody" path = "src/main.rs" [lib] name = "nbody_lib" path = "src/lib.rs" [features] default = [ ] sleef-sys = [ "packed_simd/sleef-sys" ] core_arch = [ "packed_simd/core_arch" ] ================================================ FILE: examples/nbody/benches/algs.rs ================================================ //! n-body benchmarks #![feature(test)] extern crate nbody_lib; extern crate test; use test::{black_box, Bencher}; #[bench] fn simd(b: &mut Bencher) { b.iter(|| black_box(nbody_lib::simd::run(black_box(10_000)))) } #[bench] fn scalar(b: &mut Bencher) { b.iter(|| black_box(nbody_lib::scalar::run(black_box(10_000)))) } ================================================ FILE: examples/nbody/readme.md ================================================ # N-Body This is the [`n-body` benchmark from the benchmarksgame][bg]. It models the orbits of Jovian planets, using the same simple symplectic-integrator. ## Usage It takes two arguments in this order: * `n`: the number of iterations to perform * (optional) `algorithm`: the algorithm to use - defaults to the fastest one. * `0`: scalar algorithm * `1`: SIMD algorithm ## Implementation There are three kernels, two of which are only run twice independently of the number of iterations (`offset_momentum` and `energy`). The `advance` kernel is run once per iterations and uses 100% of the running time. [bg]: https://benchmarksgame-team.pages.debian.net/benchmarksgame/description/nbody.html#nbody ================================================ FILE: examples/nbody/src/lib.rs ================================================ //! The N-body benchmark from the [benchmarks game][bg]. //! //! [bg]: https://benchmarksgame-team.pages.debian.net/benchmarksgame/description/nbody.html#nbody #![deny(rust_2018_idioms)] #![allow( clippy::similar_names, clippy::excessive_precision, clippy::must_use_candidate )] pub mod scalar; pub mod simd; pub fn run(n: usize, alg: usize) -> (f64, f64) { match alg { 0 => scalar::run(n), 1 => simd::run(n), v => panic!("unknown algorithm value: {}", v), } } #[cfg(test)] const RESULTS: &[(usize, &str, &str)] = &[(1_000_usize, "-0.169075164", "-0.169087605")]; ================================================ FILE: examples/nbody/src/main.rs ================================================ //! The N-body benchmark from the [benchmarks game][bg]. //! //! 
[bg]: https://benchmarksgame-team.pages.debian.net/benchmarksgame/description/nbody.html#nbody #![deny(rust_2018_idioms)] fn run(o: &mut O, n: usize, alg: usize) { let (energy_before, energy_after) = nbody_lib::run(n, alg); writeln!(o, "{:.9}", energy_before).unwrap(); writeln!(o, "{:.9}", energy_after).unwrap(); } fn main() { let n: usize = std::env::args() .nth(1) .expect("need one arg") .parse() .expect("argument should be a usize"); let alg: usize = if let Some(v) = std::env::args().nth(2) { v.parse().expect("second argument must be a usize") } else { 1 // SIMD algorithm }; run(&mut std::io::stdout(), n, alg); } #[cfg(test)] mod tests { use super::*; static OUTPUT: &[u8] = include_bytes!("nbody-output.txt"); #[test] fn verify_output_simd() { let mut out: Vec = Vec::new(); run(&mut out, 1000, 0); assert_eq!(out.len(), OUTPUT.len()); if out != OUTPUT { for i in 0..out.len() { assert_eq!( out[i], OUTPUT[i], "byte {} differs - is: {:#08b} - should: {:#08b}", i, out[i], OUTPUT[i] ); } } } #[test] fn verify_output_scalar() { let mut out: Vec = Vec::new(); run(&mut out, 1000, 1); assert_eq!(out.len(), OUTPUT.len()); if out != OUTPUT { for i in 0..out.len() { assert_eq!( out[i], OUTPUT[i], "byte {} differs - is: {:#08b} - should: {:#08b}", i, out[i], OUTPUT[i] ); } } } } ================================================ FILE: examples/nbody/src/nbody-output.txt ================================================ -0.169075164 -0.169087605 ================================================ FILE: examples/nbody/src/scalar.rs ================================================ // The Computer Language Benchmarks Game // https://benchmarksgame-team.pages.debian.net // // contributed by the Rust Project Developers // contributed by TeXitoi use std::f64::consts::PI; const SOLAR_MASS: f64 = 4.0 * PI * PI; const DAYS_PER_YEAR: f64 = 365.24; struct Body { x: [f64; 3], v: [f64; 3], mass: f64, } const N_BODIES: usize = 5; #[allow(clippy::unreadable_literal)] const BODIES: [Body; N_BODIES] = [ // Sun Body { x: [0., 0., 0.], v: [0., 0., 0.], mass: SOLAR_MASS }, // Jupiter Body { x: [ 4.84143144246472090e+00, -1.16032004402742839e+00, -1.03622044471123109e-01, ], v: [ 1.66007664274403694e-03 * DAYS_PER_YEAR, 7.69901118419740425e-03 * DAYS_PER_YEAR, -6.90460016972063023e-05 * DAYS_PER_YEAR, ], mass: 9.54791938424326609e-04 * SOLAR_MASS, }, // Saturn Body { x: [ 8.34336671824457987e+00, 4.12479856412430479e+00, -4.03523417114321381e-01, ], v: [ -2.76742510726862411e-03 * DAYS_PER_YEAR, 4.99852801234917238e-03 * DAYS_PER_YEAR, 2.30417297573763929e-05 * DAYS_PER_YEAR, ], mass: 2.85885980666130812e-04 * SOLAR_MASS, }, // Uranus Body { x: [ 1.28943695621391310e+01, -1.51111514016986312e+01, -2.23307578892655734e-01, ], v: [ 2.96460137564761618e-03 * DAYS_PER_YEAR, 2.37847173959480950e-03 * DAYS_PER_YEAR, -2.96589568540237556e-05 * DAYS_PER_YEAR, ], mass: 4.36624404335156298e-05 * SOLAR_MASS, }, // Neptune Body { x: [ 1.53796971148509165e+01, -2.59193146099879641e+01, 1.79258772950371181e-01, ], v: [ 2.68067772490389322e-03 * DAYS_PER_YEAR, 1.62824170038242295e-03 * DAYS_PER_YEAR, -9.51592254519715870e-05 * DAYS_PER_YEAR, ], mass: 5.15138902046611451e-05 * SOLAR_MASS, }, ]; fn advance(bodies: &mut [Body; N_BODIES], dt: f64) { let mut b_slice: &mut [_] = bodies; while let Some(bi) = shift_mut_ref(&mut b_slice) { for bj in b_slice.iter_mut() { let mut dx = [0.; 3]; for (dx, (x_i, x_j)) in dx.iter_mut().zip(bi.x.iter().zip(bj.x.iter())) { *dx = x_i - x_j; } let mut d2: f64 = 0.; for dx in &dx { d2 += dx * dx; } let mag = dt / 
(d2 * d2.sqrt()); let massi_mag = bi.mass * mag; let massj_mag = bj.mass * mag; for (v_j, (v_i, dx)) in bj.v.iter_mut().zip(bi.v.iter_mut().zip(dx.iter())) { *v_j += dx * massi_mag; *v_i -= dx * massj_mag; } } for (x, v) in bi.x.iter_mut().zip(bi.v.iter()) { *x += dt * v; } } } fn energy(bodies: &[Body; N_BODIES]) -> f64 { let mut e = 0.0; let mut bodies = bodies.iter(); while let Some(bi) = bodies.next() { let mut e_l = 0.; for v in &bi.v { e_l += v * v; } e += e_l * bi.mass / 2.0; for bj in bodies.clone() { let mut dist = 0.; for (xi, xj) in bi.x.iter().zip(bj.x.iter()) { let dx = xi - xj; dist += dx * dx; } e -= bi.mass * bj.mass / dist.sqrt(); } } e } fn offset_momentum(bodies: &mut [Body; N_BODIES]) { let mut p = [0.; 3]; for bi in bodies.iter() { for (p, v) in p.iter_mut().zip(bi.v.iter()) { *p += v * bi.mass; } } let sun = &mut bodies[0]; for (v, p) in sun.v.iter_mut().zip(p.iter()) { *v = -p / SOLAR_MASS; } } /// Pop a mutable reference off the head of a slice, mutating the slice to no /// longer contain the mutable reference. #[allow(clippy::mut_mut)] fn shift_mut_ref<'a, T>(r: &mut &'a mut [T]) -> Option<&'a mut T> { if r.is_empty() { return None; } let tmp = std::mem::replace(r, &mut []); let (h, t) = tmp.split_at_mut(1); *r = t; Some(&mut h[0]) } pub fn run(n: usize) -> (f64, f64) { let mut bodies = BODIES; offset_momentum(&mut bodies); let a = energy(&bodies); for _ in 0..n { advance(&mut bodies, 0.01); } let b = energy(&bodies); (a, b) } #[cfg(test)] mod tests { #[test] fn test() { for &(size, a_e, b_e) in crate::RESULTS { let (a, b) = super::run(size); assert_eq!(format!("{:.9}", a), a_e); assert_eq!(format!("{:.9}", b), b_e); } } } ================================================ FILE: examples/nbody/src/simd.rs ================================================ #![deny(warnings)] use packed_simd::*; use std::f64::consts::PI; const SOLAR_MASS: f64 = 4.0 * PI * PI; const DAYS_PER_YEAR: f64 = 365.24; pub struct Body { pub x: f64x4, pub v: f64x4, pub mass: f64, } const N_BODIES: usize = 5; #[allow(clippy::unreadable_literal)] const BODIES: [Body; N_BODIES] = [ // sun: Body { x: f64x4::new(0., 0., 0., 0.), v: f64x4::new(0., 0., 0., 0.), mass: SOLAR_MASS, }, // jupiter: Body { x: f64x4::new( 4.84143144246472090e+00, -1.16032004402742839e+00, -1.03622044471123109e-01, 0., ), v: f64x4::new( 1.66007664274403694e-03 * DAYS_PER_YEAR, 7.69901118419740425e-03 * DAYS_PER_YEAR, -6.90460016972063023e-05 * DAYS_PER_YEAR, 0., ), mass: 9.54791938424326609e-04 * SOLAR_MASS, }, // saturn: Body { x: f64x4::new( 8.34336671824457987e+00, 4.12479856412430479e+00, -4.03523417114321381e-01, 0., ), v: f64x4::new( -2.76742510726862411e-03 * DAYS_PER_YEAR, 4.99852801234917238e-03 * DAYS_PER_YEAR, 2.30417297573763929e-05 * DAYS_PER_YEAR, 0., ), mass: 2.85885980666130812e-04 * SOLAR_MASS, }, // uranus: Body { x: f64x4::new( 1.28943695621391310e+01, -1.51111514016986312e+01, -2.23307578892655734e-01, 0., ), v: f64x4::new( 2.96460137564761618e-03 * DAYS_PER_YEAR, 2.37847173959480950e-03 * DAYS_PER_YEAR, -2.96589568540237556e-05 * DAYS_PER_YEAR, 0., ), mass: 4.36624404335156298e-05 * SOLAR_MASS, }, // neptune: Body { x: f64x4::new( 1.53796971148509165e+01, -2.59193146099879641e+01, 1.79258772950371181e-01, 0., ), v: f64x4::new( 2.68067772490389322e-03 * DAYS_PER_YEAR, 1.62824170038242295e-03 * DAYS_PER_YEAR, -9.51592254519715870e-05 * DAYS_PER_YEAR, 0., ), mass: 5.15138902046611451e-05 * SOLAR_MASS, }, ]; pub fn offset_momentum(bodies: &mut [Body; N_BODIES]) { let (sun, rest) = bodies.split_at_mut(1); let sun 
= &mut sun[0]; for body in rest { let m_ratio = body.mass / SOLAR_MASS; sun.v -= body.v * m_ratio; } } pub fn energy(bodies: &[Body; N_BODIES]) -> f64 { let mut e = 0.; for i in 0..N_BODIES { let bi = &bodies[i]; e += bi.mass * (bi.v * bi.v).sum() * 0.5; for bj in bodies.iter().take(N_BODIES).skip(i + 1) { let dx = bi.x - bj.x; e -= bi.mass * bj.mass / (dx * dx).sum().sqrt() } } e } pub fn advance(bodies: &mut [Body; N_BODIES], dt: f64) { const N: usize = N_BODIES * (N_BODIES - 1) / 2; // compute distance between bodies: let mut r = [f64x4::splat(0.); N]; { let mut i = 0; for j in 0..N_BODIES { for k in j + 1..N_BODIES { r[i] = bodies[j].x - bodies[k].x; i += 1; } } } let mut mag = [0.0; N]; let mut i = 0; while i < N { let d2s = f64x2::new((r[i] * r[i]).sum(), (r[i + 1] * r[i + 1]).sum()); let dmags = f64x2::splat(dt) / (d2s * d2s.sqrte()); dmags.write_to_slice_unaligned(&mut mag[i..]); i += 2; } i = 0; for j in 0..N_BODIES { for k in j + 1..N_BODIES { let f = r[i] * mag[i]; bodies[j].v -= f * bodies[k].mass; bodies[k].v += f * bodies[j].mass; i += 1 } } for body in bodies { body.x += dt * body.v } } pub fn run_k(n: usize, k: K) -> (f64, f64) where K: Fn(&mut [Body; N_BODIES], f64), { let mut bodies = BODIES; offset_momentum(&mut bodies); let energy_before = energy(&bodies); for _ in 0..n { k(&mut bodies, 0.01); } let energy_after = energy(&bodies); (energy_before, energy_after) } pub fn run(n: usize) -> (f64, f64) { run_k(n, advance) } #[cfg(test)] mod tests { #[test] fn test() { for &(size, a_e, b_e) in crate::RESULTS { let (a, b) = super::run(size); assert_eq!(format!("{:.9}", a), a_e); assert_eq!(format!("{:.9}", b), b_e); } } } ================================================ FILE: examples/options_pricing/Cargo.toml ================================================ [package] name = "options_pricing" version = "0.1.0" authors = ["gnzlbg "] edition = "2018" [dependencies] packed_simd = { package = "packed_simd", path = "../.." 
} time = "^0.1" rayon = "^1.0" ispc = { version = "^1.0.4", optional = true } [build-dependencies] ispc = { version = "^1.0.4", optional = true } [[bin]] name = "options_pricing" path = "src/main.rs" [lib] name = "options_pricing_lib" path = "src/lib.rs" [features] default = [] core_arch = [ "packed_simd/core_arch" ] sleef-sys = [ "packed_simd/sleef-sys" ] ispc_libm = [ "ispc" ] ================================================ FILE: examples/options_pricing/benchmark.sh ================================================ #!/usr/bin/env bash # # Runs options_pricing benchmarks set -ex NUM_OPTIONS_BLACK_SCHOLES=10000000 if [[ ${NORUN} != 1 ]]; then hash hyperfine 2>/dev/null || { echo >&2 "hyperfine is not in PATH."; exit 1; } fi # Black-Scholes: ALGS=("black_scholes_scalar" "black_scholes_simd" "black_scholes_simd_par") if echo "$FEATURES" | grep -q "ispc"; then hash ispc 2>/dev/null || { echo >&2 "ispc is not in PATH."; exit 1; } ALGS+=("black_scholes_ispc" "black_scholes_ispc_tasks") fi RUSTFLAGS="-C target-cpu=native ${RUSTFLAGS}" \ cargo build --release --features="${FEATURES}" if [[ "${NORUN}" == "1" ]]; then exit 0 fi #for alg in "${ALGS[@]}" #do # hyperfine "../target/release/options_pricing ${NUM_OPTIONS_BLACK_SCHOLES} ${alg}" #done # Binomial put: ALGS=("binomial_put_scalar" "binomial_put_simd" "binomial_put_simd_par") if echo "$FEATURES" | grep -q "ispc"; then ALGS+=("binomial_put_ispc" "binomial_put_ispc_tasks") fi NUM_OPTIONS_BINOMIAL_PUT=500000 for alg in "${ALGS[@]}" do hyperfine "../target/release/options_pricing ${NUM_OPTIONS_BINOMIAL_PUT} ${alg}" done ================================================ FILE: examples/options_pricing/build.rs ================================================ fn main() { println!("cargo:rerun-if-changed=build.rs"); #[cfg(feature = "ispc")] { if std::env::var("CARGO_FEATURE_ISPC").is_ok() { let mut cfg = ispc::Config::new(); if cfg!(windows) { cfg.debug(false); } let ispc_files = vec!["volta/options.ispc"]; for s in &ispc_files[..] { cfg.file(*s); } cfg.target_isas(vec![ ispc::opt::TargetISA::SSE2i32x4, ispc::opt::TargetISA::SSE4i32x4, ispc::opt::TargetISA::AVX1i32x8, ispc::opt::TargetISA::AVX2i32x8, ispc::opt::TargetISA::AVX512KNLi32x16, ]); #[cfg(feature = "ispc_libm")] { // Use the system's libm cfg.math_lib(ispc::opt::MathLib::System); } cfg.compile("options"); } } } ================================================ FILE: examples/options_pricing/readme.md ================================================ # Options Pricing ISPC example This is the [`options` ISPC benchmark][ispc]: > This program implements both the Black-Scholes and > Binomial options pricing models. ## Usage ``` cargo run --release --features=ispc -- ${SIZE} ${ALGORITHM} ``` ## Results ``` ./benchmark.sh ``` ## Black-Scholes On a dual core AVX1 i5 @1.8 GHz: | 800 x 800 | time [ms]
Rust | speedup vs `scalar` [-] | |--------------|---------------------|-------------------------| | `scalar` | 998 | 1.0x | | `simd` | 367 | 2.7x | | `par_simd` | 246 | 4.1x | | `ispc` | 360 | 2.8x | | `ispc+tasks` | 248 | 4.0x | `par_simd` and `ispc+tasks` algorithms are on par. ## Binomial put On a dual core AVX1 i5 @1.8 GHz: | 800 x 800 | time [ms]
Rust | speedup vs `scalar` [-] | |--------------|---------------------|-------------------------| | `scalar` | 2057 | 1.0x | | `simd` | 651 | 3.2x | | `par_simd` | 279 | 4.3x | | `ispc` | 805 | 7.4x | | `ispc+tasks` | 404 | 5.1x | `par_simd` algorithm is ~1.4x faster than `ispc+tasks`. [ispc]: https://github.com/ispc/ispc/tree/master/examples/options ================================================ FILE: examples/options_pricing/src/ispc_.rs ================================================ //! Includes the ISPC implementations. use ispc::*; ispc_module!(options); pub mod black_scholes { use super::*; pub fn serial( sa: &[f32], xa: &[f32], ta: &[f32], ra: &[f32], va: &[f32], result: &mut [f32], count: usize, ) -> f64 { unsafe { self::options::black_scholes_ispc( sa.as_ptr() as *mut f32, xa.as_ptr() as *mut f32, ta.as_ptr() as *mut f32, ra.as_ptr() as *mut f32, va.as_ptr() as *mut f32, result.as_mut_ptr(), count as i32, ) } } pub fn tasks( sa: &[f32], xa: &[f32], ta: &[f32], ra: &[f32], va: &[f32], result: &mut [f32], count: usize, ) -> f64 { unsafe { self::options::black_scholes_ispc_tasks( sa.as_ptr() as *mut f32, xa.as_ptr() as *mut f32, ta.as_ptr() as *mut f32, ra.as_ptr() as *mut f32, va.as_ptr() as *mut f32, result.as_mut_ptr(), count as i32, ) } } } pub mod binomial_put { use super::*; pub fn serial( sa: &[f32], xa: &[f32], ta: &[f32], ra: &[f32], va: &[f32], result: &mut [f32], count: usize, ) -> f64 { unsafe { self::options::binomial_put_ispc( sa.as_ptr() as *mut f32, xa.as_ptr() as *mut f32, ta.as_ptr() as *mut f32, ra.as_ptr() as *mut f32, va.as_ptr() as *mut f32, result.as_mut_ptr(), count as i32, ) } } pub fn tasks( sa: &[f32], xa: &[f32], ta: &[f32], ra: &[f32], va: &[f32], result: &mut [f32], count: usize, ) -> f64 { unsafe { self::options::binomial_put_ispc_tasks( sa.as_ptr() as *mut f32, xa.as_ptr() as *mut f32, ta.as_ptr() as *mut f32, ra.as_ptr() as *mut f32, va.as_ptr() as *mut f32, result.as_mut_ptr(), count as i32, ) } } } #[cfg(test)] mod tests { use super::*; #[test] fn black_scholes() { const NOPTS: usize = 1_000_000; let mut serial = crate::State::new(NOPTS); let mut tasks = crate::State::new(NOPTS); let serial_sum = serial.exec(black_scholes::serial); let tasks_sum = tasks.exec(black_scholes::tasks); assert_eq!(serial, tasks); assert_eq!(serial_sum, tasks_sum); } #[test] fn binomial_put() { const NOPTS: usize = 1_000_000; let mut serial = crate::State::new(NOPTS); let mut tasks = crate::State::new(NOPTS); let serial_sum = serial.exec(binomial_put::serial); let tasks_sum = tasks.exec(binomial_put::tasks); assert_eq!(serial, tasks); assert_eq!(serial_sum, tasks_sum); } } ================================================ FILE: examples/options_pricing/src/lib.rs ================================================ #![deny(rust_2018_idioms)] #![allow( clippy::inline_always, clippy::many_single_char_names, clippy::excessive_precision, clippy::cast_precision_loss, clippy::cast_possible_truncation, clippy::cast_possible_wrap, clippy::must_use_candidate, clippy::too_many_arguments, clippy::float_cmp )] use packed_simd::f32x8 as f32s; use packed_simd::f64x8 as f64s; const BINOMIAL_NUM: usize = 64; #[cfg(feature = "ispc")] pub mod ispc_; pub mod scalar; pub mod simd; pub mod simd_kernels; pub mod simd_par; pub mod sum; #[derive(PartialEq, Debug)] pub struct State { s: Vec, x: Vec, t: Vec, r: Vec, v: Vec, result: Vec, count: usize, } impl State { pub fn new(count: usize) -> Self { Self { s: vec![100.; count], x: vec![98.; count], t: vec![2.; count], r: vec![0.02; count], v: 
vec![5.; count], result: vec![0.0; count], count, } } pub fn exec(&mut self, model: F) -> f64 where F: Fn( &[f32], &[f32], &[f32], &[f32], &[f32], &mut [f32], usize, ) -> f64, { model( &self.s, &self.x, &self.t, &self.r, &self.v, &mut self.result, self.count, ) } } #[cfg(test)] fn almost_equal(a: f64, b: f64, max_rel_diff: f64) -> bool { let diff = (a - b).abs(); let a = a.abs(); let b = b.abs(); let largest = a.max(b); diff <= largest * max_rel_diff } ================================================ FILE: examples/options_pricing/src/main.rs ================================================ #![deny(warnings, rust_2018_idioms)] #![feature(custom_inner_attributes)] use options_pricing_lib::*; #[rustfmt::skip] fn run(name: &str, count: usize, f: F) where F: Fn(&[f32], &[f32], &[f32], &[f32], &[f32], &mut [f32], usize) -> f64, { let mut d = State::new(count); let t = time::Duration::span(move || { d.exec(f); } ); println!("{}: {} ms", name, t.num_milliseconds()); } macro_rules! ispc_alg { ($name:tt, $count:ident, $fun:path) => {{ #[cfg(feature = "ispc")] { run($name, $count, $fun); } #[cfg(not(feature = "ispc"))] { panic!("algorithm {} requires --feature=ispc", $name); } }}; } fn main() { let mut args = std::env::args(); args.next(); let num_options: usize = args .next() .unwrap() .parse() .expect("expected argument 1 of type usize: num_options"); let algorithm: String = args .next() .unwrap() .parse() .expect("expected argument 2 of type String: algorithm"); match algorithm.as_str() { "black_scholes_ispc_tasks" => ispc_alg!( "black_scholes_ispc_tasks", num_options, ispc_::black_scholes::tasks ), "black_scholes_ispc" => ispc_alg!( "black_scholes_ispc", num_options, ispc_::black_scholes::serial ), "binomial_put_ispc_tasks" => ispc_alg!( "binomial_put_ispc_tasks", num_options, ispc_::binomial_put::tasks ), "binomial_put_ispc" => ispc_alg!( "binomial_put_ispc", num_options, ispc_::binomial_put::serial ), "black_scholes_scalar" => { run("black_scholes_scalar", num_options, scalar::black_scholes) } "binomial_put_scalar" => { run("binomial_put_scalar", num_options, scalar::binomial_put) } "black_scholes_simd" => { run("black_scholes_simd", num_options, simd::black_scholes) } "binomial_put_simd" => { run("binomial_put_simd", num_options, simd::binomial_put) } "black_scholes_simd_par" => { run("black_scholes_simd_par", num_options, simd_par::black_scholes) } "binomial_put_simd_par" => { run("binomial_put_simd_par", num_options, simd_par::binomial_put) } _ => panic!("unknown algorithm: {}", algorithm), } } ================================================ FILE: examples/options_pricing/src/scalar.rs ================================================ //! Scalar implementation // Cumulative normal distribution function #[inline(always)] fn cnd(x: f32) -> f32 { const INV_SQRT_2PI: f32 = 0.398_942_280_40; let l = x.abs(); let k = 1. / (1. + 0.231_641_9 * l); let k2 = k * k; let k3 = k2 * k; let k4 = k2 * k2; let k5 = k3 * k2; let w: f32 = 0.319_381_53 * k - 0.356_563_782 * k2 + 1.781_477_937 * k3 + -1.821_255_978 * k4 + 1.330_274_429 * k5; let w = w * INV_SQRT_2PI * (-l * l * 0.5).exp(); if x > 0. { 1. 
- w } else { w } } pub fn black_scholes( sa: &[f32], xa: &[f32], ta: &[f32], ra: &[f32], va: &[f32], result: &mut [f32], count: usize, ) -> f64 { for i in 0..count { let s = sa[i]; let x = xa[i]; let t = ta[i]; let r = ra[i]; let v = va[i]; let d1 = ((s / x).ln() + (r + v * v * 0.5) * t) / (v * t.sqrt()); let d2 = d1 - v * t.sqrt(); result[i] = s * cnd(d1) - x * (-r * t).exp() * cnd(d2); } crate::sum::slice_scalar(&result) } pub fn binomial_put( sa: &[f32], xa: &[f32], ta: &[f32], ra: &[f32], va: &[f32], result: &mut [f32], count: usize, ) -> f64 { use crate::BINOMIAL_NUM; for i in 0..count { let s = sa[i]; let x = xa[i]; let t = ta[i]; let r = ra[i]; let v = va[i]; let dt = t / BINOMIAL_NUM as f32; let u = (v * dt.sqrt()).exp(); let d = 1. / u; let disc = (r * dt).exp(); let pu = (disc - d) / (u - d); let mut vs = [0_f32; BINOMIAL_NUM]; for (j, v) in vs.iter_mut().enumerate() { let e = (2_i32 * (j as i32)).wrapping_sub(BINOMIAL_NUM as i32); let upow = u.powf(e as f32); *v = 0_f32.max(x - s * upow); } for j in (0..BINOMIAL_NUM).rev() { for k in 0..j { vs[k] = ((1. - pu) * vs[k] + pu * vs[k + 1]) / disc; } } result[i] = vs[0]; } crate::sum::slice_scalar(&result) } #[cfg(feature = "ispc")] #[cfg(test)] mod tests { use super::*; use crate::almost_equal; #[test] fn black_scholes_ispc() { const NOPTS: usize = 1_000_000; let mut scalar = crate::State::new(NOPTS); let mut ispc = crate::State::new(NOPTS); let scalar_sum = scalar.exec(black_scholes); let ispc_sum = ispc.exec(crate::ispc_::black_scholes::serial); assert_eq!(scalar, ispc); assert_eq!(scalar_sum, ispc_sum); } #[test] fn binomial_put_ispc() { const NOPTS: usize = 1_000_000; let mut scalar = crate::State::new(NOPTS); let mut ispc = crate::State::new(NOPTS); let scalar_sum = scalar.exec(binomial_put); let ispc_sum = ispc.exec(crate::ispc_::binomial_put::serial); // FIXME: results differ slightly for each value of the result vector // need to figure out why // assert_eq!(scalar, ispc); assert!(almost_equal(scalar_sum, ispc_sum, 1e-5)); } } ================================================ FILE: examples/options_pricing/src/simd.rs ================================================ //! 
SIMD implementation use crate::f32s; pub fn serial( sa: &[f32], xa: &[f32], ta: &[f32], ra: &[f32], va: &[f32], result: &mut [f32], count: usize, kernel: K, ) -> f64 where K: Fn(f32s, f32s, f32s, f32s, f32s) -> f32s, { assert_eq!(count % f32s::lanes(), 0); for i in (0..count).step_by(f32s::lanes()) { unsafe { let s = f32s::from_slice_unaligned_unchecked(&sa[i..]); let x = f32s::from_slice_unaligned_unchecked(&xa[i..]); let t = f32s::from_slice_unaligned_unchecked(&ta[i..]); let r = f32s::from_slice_unaligned_unchecked(&ra[i..]); let v = f32s::from_slice_unaligned_unchecked(&va[i..]); let r = kernel(s, x, t, r, v); r.write_to_slice_unaligned_unchecked(&mut result[i..]); } } crate::sum::slice(&result) } pub fn black_scholes( sa: &[f32], xa: &[f32], ta: &[f32], ra: &[f32], va: &[f32], result: &mut [f32], count: usize, ) -> f64 { serial( sa, xa, ta, ra, va, result, count, crate::simd_kernels::black_scholes, ) } pub fn binomial_put( sa: &[f32], xa: &[f32], ta: &[f32], ra: &[f32], va: &[f32], result: &mut [f32], count: usize, ) -> f64 { serial( sa, xa, ta, ra, va, result, count, crate::simd_kernels::binomial_put, ) } #[cfg(test)] mod tests { use super::*; use crate::almost_equal; #[test] fn black_scholes_scalar() { const NOPTS: usize = 1_000_000; let mut simd = crate::State::new(NOPTS); let mut scalar = crate::State::new(NOPTS); let simd_sum = simd.exec(black_scholes); let scalar_sum = scalar.exec(crate::scalar::black_scholes); assert_eq!(simd, scalar); assert_eq!(simd_sum, scalar_sum); } #[test] fn binomial_put_scalar() { const NOPTS: usize = 1_000_000; let mut simd = crate::State::new(NOPTS); let mut scalar = crate::State::new(NOPTS); let simd_sum = simd.exec(binomial_put); let scalar_sum = scalar.exec(crate::scalar::binomial_put); // assert_eq!(simd, scalar); // assert_eq!(simd_sum, scalar_sum); assert!(almost_equal(simd_sum, scalar_sum, 1e-5)); } } ================================================ FILE: examples/options_pricing/src/simd_kernels.rs ================================================ use crate::f32s; // Cumulative normal distribution function #[inline(always)] pub fn cnd(x: f32s) -> f32s { const INV_SQRT_2PI: f32s = f32s::splat(0.398_942_280_40); let l = x.abs(); let k = 1. / (1. + 0.231_641_9 * l); let k2 = k * k; let k3 = k2 * k; let k4 = k2 * k2; let k5 = k3 * k2; let w: f32s = 0.319_381_53 * k - 0.356_563_782 * k2 + 1.781_477_937 * k3 + -1.821_255_978 * k4 + 1.330_274_429 * k5; let w = w * INV_SQRT_2PI * (-l * l * 0.5).exp(); x.gt(f32s::splat(0.)).select(1. - w, w) } #[inline(always)] pub fn black_scholes(s: f32s, x: f32s, t: f32s, r: f32s, v: f32s) -> f32s { let d1 = ((s / x).ln() + (r + v * v * 0.5) * t) / (v * t.sqrt()); let d2 = d1 - v * t.sqrt(); s * cnd(d1) - x * (-r * t).exp() * cnd(d2) } #[inline(always)] pub fn binomial_put(s: f32s, x: f32s, t: f32s, r: f32s, v: f32s) -> f32s { use crate::BINOMIAL_NUM; let dt = t / BINOMIAL_NUM as f32; let u = (v * dt.sqrt()).exp(); let d = 1. / u; let disc = (r * dt).exp(); let inv_disc = 1. / disc; let pu = (disc - d) / (u - d); let o_m_pu = 1. 
- pu; let mut vs = [f32s::splat(0.); BINOMIAL_NUM]; for (j, v) in vs.iter_mut().enumerate() { let e = (2_i32 * (j as i32)).wrapping_sub(BINOMIAL_NUM as i32); let upow = u.powf(f32s::splat(e as f32)); *v = f32s::splat(0.).max(x - s * upow); } for j in (0..BINOMIAL_NUM).rev() { for k in 0..j { vs[k] = (o_m_pu * vs[k] + pu * vs[k + 1]) * inv_disc; } } vs[0] } ================================================ FILE: examples/options_pricing/src/simd_par.rs ================================================ //! SIMD implementation use crate::f32s; pub fn parallel( sa: &[f32], xa: &[f32], ta: &[f32], ra: &[f32], va: &[f32], result: &mut [f32], count: usize, kernel: K, ) -> f64 where K: Fn(f32s, f32s, f32s, f32s, f32s) -> f32s + Sync + Send, { use rayon::prelude::*; assert_eq!(count % f32s::lanes(), 0); result.par_chunks_mut(f32s::lanes()).enumerate().for_each( |(i, result)| { debug_assert!(result.len() == 8); unsafe { let s = f32s::from_slice_unaligned_unchecked(&sa[i..]); let x = f32s::from_slice_unaligned_unchecked(&xa[i..]); let t = f32s::from_slice_unaligned_unchecked(&ta[i..]); let r = f32s::from_slice_unaligned_unchecked(&ra[i..]); let v = f32s::from_slice_unaligned_unchecked(&va[i..]); let r = kernel(s, x, t, r, v); r.write_to_slice_unaligned_unchecked(result); } }, ); crate::sum::slice(&result) } pub fn black_scholes( sa: &[f32], xa: &[f32], ta: &[f32], ra: &[f32], va: &[f32], result: &mut [f32], count: usize, ) -> f64 { parallel( sa, xa, ta, ra, va, result, count, crate::simd_kernels::black_scholes, ) } pub fn binomial_put( sa: &[f32], xa: &[f32], ta: &[f32], ra: &[f32], va: &[f32], result: &mut [f32], count: usize, ) -> f64 { parallel( sa, xa, ta, ra, va, result, count, crate::simd_kernels::binomial_put, ) } #[cfg(test)] mod tests { use super::*; use crate::almost_equal; #[test] fn black_scholes_scalar() { const NOPTS: usize = 1_000_000; let mut simd_par = crate::State::new(NOPTS); let mut scalar = crate::State::new(NOPTS); let simd_par_sum = simd_par.exec(black_scholes); let scalar_sum = scalar.exec(crate::scalar::black_scholes); assert_eq!(simd_par, scalar); assert_eq!(simd_par_sum, scalar_sum); } #[test] fn binomial_put_scalar() { const NOPTS: usize = 1_000_000; let mut simd_par = crate::State::new(NOPTS); let mut scalar = crate::State::new(NOPTS); let simd_par_sum = simd_par.exec(binomial_put); let scalar_sum = scalar.exec(crate::scalar::binomial_put); // assert_eq!(simd_par, scalar); // assert_eq!(simd_par_sum, scalar_sum); assert!(almost_equal(simd_par_sum, scalar_sum, 1e-5)); } } ================================================ FILE: examples/options_pricing/src/sum.rs ================================================ //! Implements different algorithms for summing a slice of `f32`s use super::{f32s, f64s}; pub fn slice(x: &[f32]) -> f64 { assert_eq!(f32s::lanes(), f64s::lanes()); assert_eq!(x.len() % f32s::lanes(), 0); let mut sum = f64s::splat(0.); for i in (0..x.len()).step_by(f32s::lanes()) { unsafe { use packed_simd::Cast; let v: f64s = f32s::from_slice_unaligned_unchecked(&x[i..]).cast(); sum += v; } } sum.sum() } pub fn slice_scalar(x: &[f32]) -> f64 { let mut sum = 0_f64; for &x in x { sum += f64::from(x); } sum } ================================================ FILE: examples/options_pricing/volta/options.ispc ================================================ // -*- mode: c++ -*- /* Copyright (c) 2010-2011, Intel Corporation All rights reserved. 
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * Neither the name of Intel Corporation nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "options_defs.h" // Cumulative normal distribution function static inline float CND(float X) { float L = abs(X); float k = 1.0 / (1.0 + 0.2316419 * L); float k2 = k*k; float k3 = k2*k; float k4 = k2*k2; float k5 = k3*k2; const float invSqrt2Pi = 0.39894228040f; float w = (0.31938153f * k - 0.356563782f * k2 + 1.781477937f * k3 + -1.821255978f * k4 + 1.330274429f * k5); w *= invSqrt2Pi * exp(-L * L * .5f); if (X > 0.f) w = 1.0 - w; return w; } static inline uniform double sum(const uniform float result[], uniform int count) { double s = 0.0; foreach (i = 0 ... count) { s += (double)result[i]; } return reduce_add(s); } task void bs_task(uniform float Sa[], uniform float Xa[], uniform float Ta[], uniform float ra[], uniform float va[], uniform float result[], uniform int count) { uniform int first = taskIndex * (count/taskCount); uniform int last = min(count, (int)((taskIndex+1) * (count/taskCount))); foreach (i = first ... last) { float S = Sa[i], X = Xa[i], T = Ta[i], r = ra[i], v = va[i]; float d1 = (log(S/X) + (r + v * v * .5f) * T) / (v * sqrt(T)); float d2 = d1 - v * sqrt(T); result[i] = S * CND(d1) - X * exp(-r * T) * CND(d2); } } export uniform double black_scholes_ispc_tasks(uniform float Sa[], uniform float Xa[], uniform float Ta[], uniform float ra[], uniform float va[], uniform float result[], uniform int count) { uniform int nTasks = max((int)64, (int)count/16384); launch[nTasks] bs_task(Sa, Xa, Ta, ra, va, result, count); sync; return sum(result, count); } export uniform double black_scholes_ispc(uniform float Sa[], uniform float Xa[], uniform float Ta[], uniform float ra[], uniform float va[], uniform float result[], uniform int count) { foreach (i = 0 ... count) { float S = Sa[i], X = Xa[i], T = Ta[i], r = ra[i], v = va[i]; float d1 = (log(S/X) + (r + v * v * .5f) * T) / (v * sqrt(T)); float d2 = d1 - v * sqrt(T); result[i] = S * CND(d1) - X * exp(-r * T) * CND(d2); } return sum(result, count); } static inline float binomial_put(float S, float X, float T, float r, float v) { float V[BINOMIAL_NUM]; float dt = T / BINOMIAL_NUM; float u = exp(v * sqrt(dt)); float d = 1. 
/ u; float disc = exp(r * dt); float Pu = (disc - d) / (u - d); for (uniform int j = 0; j < BINOMIAL_NUM; ++j) { float upow = pow(u, (float)(2*j-BINOMIAL_NUM)); V[j] = max(0., X - S * upow); } for (uniform int j = BINOMIAL_NUM-1; j >= 0; --j) for (uniform int k = 0; k < j; ++k) V[k] = ((1 - Pu) * V[k] + Pu * V[k + 1]) / disc; return V[0]; } export uniform double binomial_put_ispc(uniform float Sa[], uniform float Xa[], uniform float Ta[], uniform float ra[], uniform float va[], uniform float result[], uniform int count) { foreach (i = 0 ... count) { float S = Sa[i], X = Xa[i], T = Ta[i], r = ra[i], v = va[i]; result[i] = binomial_put(S, X, T, r, v); } return sum(result, count); } task void binomial_task(uniform float Sa[], uniform float Xa[], uniform float Ta[], uniform float ra[], uniform float va[], uniform float result[], uniform int count) { uniform int first = taskIndex * (count/taskCount); uniform int last = min(count, (int)((taskIndex+1) * (count/taskCount))); foreach (i = first ... last) { float S = Sa[i], X = Xa[i], T = Ta[i], r = ra[i], v = va[i]; result[i] = binomial_put(S, X, T, r, v); } } export uniform double binomial_put_ispc_tasks(uniform float Sa[], uniform float Xa[], uniform float Ta[], uniform float ra[], uniform float va[], uniform float result[], uniform int count) { uniform int nTasks = max((int)64, (int)count/16384); launch[nTasks] binomial_task(Sa, Xa, Ta, ra, va, result, count); sync; return sum(result, count); } ================================================ FILE: examples/options_pricing/volta/options_defs.h ================================================ /* Copyright (c) 2010-2011, Intel Corporation All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * Neither the name of Intel Corporation nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #ifndef OPTIONS_DEFS_H #define OPTIONS_DEFS_H 1 #define BINOMIAL_NUM 64 #endif // OPTIONS_DEFS_H ================================================ FILE: examples/rust-toolchain ================================================ nightly ================================================ FILE: examples/slice_sum/Cargo.toml ================================================ [package] name = "slice_sum" version = "0.1.0" authors = ["gnzlbg "] edition = "2018" [[bin]] name = "slice_sum" path = "src/main.rs" [dependencies] packed_simd = { package = "packed_simd", path = "../.." } rayon = "^1.0" time = "^0.1" rand = "0.7.0" ================================================ FILE: examples/slice_sum/readme.md ================================================ # Computes the sum of a slice of floating-point numbers This example show-cases the performance difference of computing the sum of a `&[f32]` slice using horizontal or vertical operations. To run it: ``` RUSTFLAGS="-C target-cpu=native" cargo run --release ``` On my machine it prints: ``` vertical: 155 ms horizontal: 424 ms ``` that is, on my particular the slice sum algorithm using horizontal vector additions operation is ~2.7x slower than the one using vertical vector operations. ================================================ FILE: examples/slice_sum/src/main.rs ================================================ #![deny(rust_2018_idioms)] use packed_simd::f32x8 as f32s; use std::{mem, slice}; fn init(n: usize) -> Vec { use rand::distributions::Standard; use rand::prelude::*; thread_rng().sample_iter(&Standard).take(n).collect() } fn sum_ver(x: &[f32]) -> f32 { assert_eq!(x.len() % f32s::lanes(), 0); x.chunks_exact(f32s::lanes()) .map(f32s::from_slice_unaligned) .sum::() .sum() } fn sum_hor(x: &[f32]) -> f32 { assert_eq!(x.len() % f32s::lanes(), 0); x.chunks_exact(f32s::lanes()) .map(f32s::from_slice_unaligned) .map(f32s::sum) .sum() } fn sum_ver_par(x: &[f32]) -> f32 { use rayon::prelude::*; let len: usize = x.len(); assert_eq!(len % 8, 0); // find the first properly aligned element let (i, _): (usize, _) = x .iter() .enumerate() .find(|&(_, y): &(usize, &f32)| { (y as *const f32) as usize % mem::align_of::() == 0 }) .unwrap(); let (head, tail) = x.split_at(i); let head_sum: f32 = head.iter().sum(); #[allow(clippy::cast_ptr_alignment)] let tail: &[f32s] = unsafe { slice::from_raw_parts( tail.as_ptr() as *const f32s, tail.len() / f32s::lanes(), ) }; let tail_sum: f32s = tail.into_par_iter().sum(); head_sum + tail_sum.sum() } fn main() { let n: usize = std::env::args() .nth(1) .unwrap_or_else(|| "1000000000".to_string()) .parse() .expect("argument should be a usize"); assert_eq!(n % 8, 0, "argument should be a multiple of 8"); let s: &[f32] = &init(n); let iter = time::Duration::span(|| { let v: f32 = s.iter().sum(); assert!(!v.is_nan()); }); println!("std::iter::sum: {} ms", iter.num_milliseconds()); let rayon = time::Duration::span(|| { use rayon::prelude::*; let v: f32 = s.par_iter().sum(); assert!(!v.is_nan()); }); println!("rayon::sum: {} ms", rayon.num_milliseconds()); let ver = time::Duration::span(|| { assert!(!sum_ver(s).is_nan()); }); println!("vertical: {} ms", ver.num_milliseconds()); let hor = time::Duration::span(|| { assert!(!sum_hor(s).is_nan()); }); println!("horizontal: {} ms", hor.num_milliseconds()); let ver_par = time::Duration::span(|| { assert!(!sum_ver_par(s).is_nan()); }); println!("vertical_par: {} ms", ver_par.num_milliseconds()); } ================================================ FILE: examples/spectral_norm/Cargo.toml 
================================================ [package] name = "spectral_norm" version = "0.1.0" authors = ["gnzlbg "] edition = "2018" [dependencies] packed_simd = { package = "packed_simd", path = "../.." } [[bin]] name = "spectral_norm" path = "src/main.rs" [lib] name = "spectral_norm_lib" path = "src/lib.rs" ================================================ FILE: examples/spectral_norm/readme.md ================================================ # Spectral norm This is the [`spectral-norm` benchmark from the benchmarksgame][bg]. ## Background and description MathWorld: ["Hundred-Dollar, Hundred-Digit Challenge Problems"](http://mathworld.wolfram.com/Hundred-DollarHundred-DigitChallengeProblems.html), [Challenge #3](http://mathworld.wolfram.com/SpectralNorm.html). Each program should: * calculate the spectral norm of an infinite matrix `A`, with entries `a11=1`, `a12=1/2`, `a21=1/3`, `a13=1/4`, `a22=1/5`, `a31=1/6`, etc. * implement 4 separate functions / procedures / methods like the [C# program](https://benchmarksgame-team.pages.debian.net/benchmarksgame/program/spectralnorm-csharpcore-1.html) ## Usage It takes two arguments in this order: * `n`: the size of the matrix `A` (n-times-n) * (optional) `algorithm`: the algorithm to use - defaults to the fastest one. * `0`: scalar algorithm * `1`: SIMD algorithm [bg]: https://benchmarksgame-team.pages.debian.net/benchmarksgame/description/spectralnorm.html#spectralnorm ================================================ FILE: examples/spectral_norm/src/lib.rs ================================================ //! Spectral Norm #![deny(rust_2018_idioms)] #![allow(non_snake_case, non_camel_case_types)] #![allow( clippy::cast_precision_loss, clippy::must_use_candidate )] pub mod scalar; pub mod simd; fn A(i: usize, j: usize) -> f64 { ((i + j) * (i + j + 1) / 2 + i + 1) as f64 } pub fn spectral_norm(n: usize, alg: usize) -> f64 { match alg { 0 => simd::spectral_norm(n), 1 => scalar::spectral_norm(n), v => panic!("unknown algorithm value: {}", v), } } ================================================ FILE: examples/spectral_norm/src/main.rs ================================================ extern crate spectral_norm_lib; use spectral_norm_lib::*; fn run(o: &mut O, n: usize, alg: usize) { let answer = spectral_norm(n, alg); writeln!(o, "{:.9}", answer).unwrap(); } fn main() { let n: usize = std::env::args().nth(1).expect("need one arg").parse().unwrap(); let alg = if let Some(v) = std::env::args().nth(2) { v.parse().unwrap() } else { 0 }; run(&mut std::io::stdout(), n, alg); } #[cfg(test)] mod tests { use super::*; static OUTPUT: &[u8] = include_bytes!("spectralnorm-output.txt"); #[test] fn verify_output_simd() { let mut out: Vec = Vec::new(); run(&mut out, 100, 0); assert_eq!(out.len(), OUTPUT.len()); if out != OUTPUT { for i in 0..out.len() { assert_eq!( out[i], OUTPUT[i], "byte {} differs - is: {:#08b} - should: {:#08b}", i, out[i], OUTPUT[i] ); } } } #[test] fn verify_output_scalar() { let mut out: Vec = Vec::new(); run(&mut out, 100, 1); assert_eq!(out.len(), OUTPUT.len()); if out != OUTPUT { for i in 0..out.len() { assert_eq!( out[i], OUTPUT[i], "byte {} differs - is: {:#08b} - should: {:#08b}", i, out[i], OUTPUT[i] ); } } } } ================================================ FILE: examples/spectral_norm/src/scalar.rs ================================================ //! 
Scalar spectral norm implementation use crate::*; use std::{ iter::*, ops::{Add, Div}, }; struct f64x2(f64, f64); impl Add for f64x2 { type Output = Self; fn add(self, rhs: Self) -> Self { Self(self.0 + rhs.0, self.1 + rhs.1) } } impl Div for f64x2 { type Output = Self; fn div(self, rhs: Self) -> Self { Self(self.0 / rhs.0, self.1 / rhs.1) } } pub fn spectral_norm(n: usize) -> f64 { assert!(n % 2 == 0, "only even lengths are accepted"); let mut u = vec![1.0; n]; let mut v = u.clone(); let mut tmp = v.clone(); for _ in 0..10 { mult_AtAv(&u, &mut v, &mut tmp); mult_AtAv(&v, &mut u, &mut tmp); } (dot(&u, &v) / dot(&v, &v)).sqrt() } fn mult_AtAv(v: &[f64], out: &mut [f64], tmp: &mut [f64]) { mult_Av(v, tmp); mult_Atv(tmp, out); } fn mult_Av(v: &[f64], out: &mut [f64]) { mult(v, out, 0, A); } fn mult_Atv(v: &[f64], out: &mut [f64]) { mult(v, out, 0, |i, j| A(j, i)); } fn mult(v: &[f64], out: &mut [f64], start: usize, a: F) where F: Fn(usize, usize) -> f64, { for (i, slot) in out.iter_mut().enumerate().map(|(i, s)| (i + start, s)) { let mut sum = f64x2(0.0, 0.0); for (j, chunk) in v.chunks(2).enumerate().map(|(j, s)| (2 * j, s)) { let top = f64x2(chunk[0], chunk[1]); let bot = f64x2(a(i, j), a(i, j + 1)); sum = sum + top / bot; } let f64x2(a, b) = sum; *slot = a + b; } } fn dot(x: &[f64], y: &[f64]) -> f64 { x.iter().zip(y).map(|(&x, &y)| x * y).fold(0.0, |a, b| a + b) } #[cfg(test)] #[test] fn test() { assert_eq!(&format!("{:.9}", spectral_norm(100)), "1.274219991"); } ================================================ FILE: examples/spectral_norm/src/simd.rs ================================================ //! Vectorized spectral norm implementation use crate::*; use packed_simd::*; fn mult_Av(v: &[f64], out: &mut [f64]) { assert!(v.len() == out.len()); assert!(v.len() % 2 == 0); for (i, out) in out.iter_mut().enumerate() { let mut sum = f64x2::splat(0.0); let mut j = 0; while j < v.len() { let b = f64x2::from_slice_unaligned(&v[j..]); let a = f64x2::new(A(i, j), A(i, j + 1)); sum += b / a; j += 2 } *out = sum.sum(); } } fn mult_Atv(v: &[f64], out: &mut [f64]) { assert!(v.len() == out.len()); assert!(v.len() % 2 == 0); for (i, out) in out.iter_mut().enumerate() { let mut sum = f64x2::splat(0.0); let mut j = 0; while j < v.len() { let b = f64x2::from_slice_unaligned(&v[j..]); let a = f64x2::new(A(j, i), A(j + 1, i)); sum += b / a; j += 2 } *out = sum.sum(); } } fn mult_AtAv(v: &[f64], out: &mut [f64], tmp: &mut [f64]) { mult_Av(v, tmp); mult_Atv(tmp, out); } pub fn spectral_norm(n: usize) -> f64 { assert!(n % 2 == 0, "only even lengths are accepted"); let mut u = vec![1.0; n]; let mut v = u.clone(); let mut tmp = u.clone(); for _ in 0..10 { mult_AtAv(&u, &mut v, &mut tmp); mult_AtAv(&v, &mut u, &mut tmp); } (dot(&u, &v) / dot(&v, &v)).sqrt() } fn dot(x: &[f64], y: &[f64]) -> f64 { // This is auto-vectorized: x.iter().zip(y).map(|(&x, &y)| x * y).fold(0.0, |a, b| a + b) } #[cfg(test)] #[test] fn test() { assert_eq!(&format!("{:.9}", spectral_norm(100)), "1.274219991"); } ================================================ FILE: examples/spectral_norm/src/spectralnorm-output.txt ================================================ 1.274219991 ================================================ FILE: examples/stencil/Cargo.toml ================================================ [package] name = "stencil" version = "0.1.0" authors = ["gnzlbg "] edition = "2018" [dependencies] packed_simd = { package = "packed_simd", path = "../.." 
} time = "^0.1" rayon = "^1.0" ispc = { version = "^1.0.4", optional = true } [build-dependencies] ispc = { version = "^1.0.4", optional = true } [[bin]] name = "stencil" path = "src/main.rs" [lib] name = "stencil_lib" path = "src/lib.rs" [features] default = [] core_arch = ["packed_simd/core_arch"] sleef-sys = ["packed_simd/sleef-sys"] ================================================ FILE: examples/stencil/benchmark.sh ================================================ #!/usr/bin/env bash # # Runs aobench benchmarks set -ex if [[ ${NORUN} != 1 ]]; then hash hyperfine 2>/dev/null || { echo >&2 "hyperfine is not in PATH."; exit 1; } fi algs=("0" "1" "2") if echo "$FEATURES" | grep -q "ispc"; then hash ispc 2>/dev/null || { echo >&2 "ispc is not in PATH."; exit 1; } algs+=( "3" "4" ) fi RUSTFLAGS="-C target-cpu=native ${RUSTFLAGS}" \ cargo build --release --no-default-features \ --features="${FEATURES}" if [[ "${VERIFY}" == "1" ]]; then RUSTFLAGS="-C target-cpu=native ${RUSTFLAGS}" \ cargo test --release --no-default-features \ --features="${FEATURES}" fi if [[ "${NORUN}" == "1" ]]; then exit 0 fi for alg in "${algs[@]}" do hyperfine "../target/release/stencil ${alg}" done ================================================ FILE: examples/stencil/build.rs ================================================ fn main() { println!("cargo:rerun-if-changed=build.rs"); #[cfg(feature = "ispc")] { if std::env::var("CARGO_FEATURE_ISPC").is_ok() { let mut cfg = ispc::Config::new(); if cfg!(windows) { cfg.debug(false); } let ispc_files = vec!["volta/stencil.ispc"]; for s in &ispc_files[..] { cfg.file(*s); } cfg.target_isas(vec![ ispc::opt::TargetISA::SSE2i32x4, ispc::opt::TargetISA::SSE4i32x4, ispc::opt::TargetISA::AVX1i32x8, ispc::opt::TargetISA::AVX2i32x8, ispc::opt::TargetISA::AVX512KNLi32x16, ]); cfg.compile("stencil"); } } } ================================================ FILE: examples/stencil/readme.md ================================================ # Stencil This is the generic [`stencil` ISPC benchmark][ispc]. ## Usage ``` cargo run --release --features=ispc ``` will run all benchmarks including the ISPC ones. ## Results ``` ./benchmark.sh ``` On a dual core AVX1 i5 @1.8 GHz: | 800 x 600 | time [ms]
Rust | speedup vs `scalar` [-] |
|--------------|---------------------|-------------------------|
| `scalar`     | 2842                | 1.0x                    |
| `vector`     | 630                 | 4.5x                    |
| `vector_par` | 444                 | 6.4x                    |
| `ispc`       | 558                 | 5.0x                    |
| `ispc_tasks` | 470                 | 6.0x                    |

`vector_par` is 1.06x faster than `ispc_tasks`.

On a 28 core Xeon CPU E5-2690 v4 @ 2.60GHz:

| 800 x 600 | time [ms]
Rust | speedup vs `scalar` [-] |
|--------------|---------------------|-------------------------|
| `scalar`     | 1499                | 1.0x                    |
| `vector`     | 276                 | 5.4x                    |
| `vector_par` | 167                 | 9.0x                    |
| `ispc`       | 287                 | 5.2x                    |
| `ispc_tasks` | 395                 | 3.8x                    |

`vector_par` is 1.72x faster than `ispc`.

On a 40 core Xeon Gold 6148 CPU @ 2.40GHz:

| 800 x 600 | time [ms]
Rust | speedup vs `scalar` [-] | |--------------|---------------------|-------------------------| | `scalar` | 1654 | 1.0x | | `vector` | 278 | 6.0x | | `vector_par` | 148 | 11.2x | | `ispc` | 185 | 9.0x | | `ispc_tasks` | 401 | 4.1x | `vector_par` is 1.25x faster than `ispc`. [ispc]: https://github.com/ispc/ispc/tree/master/examples/stencil ================================================ FILE: examples/stencil/src/ispc_loops.rs ================================================ //! Includes the ISPC implementations. use ispc::*; ispc_module!(stencil); pub fn serial( t0: i32, t1: i32, x0: i32, x1: i32, y0: i32, y1: i32, z0: i32, z1: i32, n_x: i32, n_y: i32, n_z: i32, coef: &[f32; 4], vsq: &[f32], a_even: &mut [f32], a_odd: &mut [f32], ) { unsafe { self::stencil::loop_stencil_ispc( t0, t1, x0, x1, y0, y1, z0, z1, n_x, n_y, n_z, coef.as_ptr(), vsq.as_ptr(), a_even.as_mut_ptr(), a_odd.as_mut_ptr(), ); } } pub fn tasks( t0: i32, t1: i32, x0: i32, x1: i32, y0: i32, y1: i32, z0: i32, z1: i32, n_x: i32, n_y: i32, n_z: i32, coef: &[f32; 4], vsq: &[f32], a_even: &mut [f32], a_odd: &mut [f32], ) { unsafe { self::stencil::loop_stencil_ispc_tasks( t0, t1, x0, x1, y0, y1, z0, z1, n_x, n_y, n_z, coef.as_ptr(), vsq.as_ptr(), a_even.as_mut_ptr(), a_odd.as_mut_ptr(), ); } } ================================================ FILE: examples/stencil/src/lib.rs ================================================ #![feature(custom_inner_attributes, stmt_expr_attributes)] // FIXME: Null pointer deref warning triggered in this example, // likely inside a macro expansion deriving from packed_simd. #![deny(rust_2018_idioms)] #![allow( clippy::similar_names, clippy::cast_precision_loss, clippy::cast_sign_loss, clippy::too_many_arguments, clippy::cast_possible_wrap, clippy::cast_possible_truncation, clippy::inline_always, clippy::must_use_candidate )] #[cfg(feature = "ispc")] pub mod ispc_loops; pub mod scalar; pub mod simd; pub mod simd_par; #[derive(Clone, PartialEq, Debug)] pub struct Data { a: (Vec, Vec), vsq: Vec, coeff: [f32; 4], n: (i32, i32, i32), t: (i32, i32), x: (i32, i32), y: (i32, i32), z: (i32, i32), } impl Data { pub fn default() -> Self { Self::from_bounds(6, 4, 128, 128, 128) } pub fn benchmark() -> Self { Self::from_bounds(6, 4, 256, 256, 256) } pub fn from_bounds( max_t: i32, width: i32, n_x: i32, n_y: i32, n_z: i32, ) -> Self { #[rustfmt::skip] Self::new( 0, max_t, width, n_x - width, width, n_y - width, width, n_z - width, n_x, n_y, n_z, ) } /// Initializes data pub fn new( t0: i32, t1: i32, x0: i32, x1: i32, y0: i32, y1: i32, z0: i32, z1: i32, n_x: i32, n_y: i32, n_z: i32, ) -> Self { let n = (n_x * n_y * n_z) as usize; let mut data = Self { a: (vec![0_f32; n], vec![0_f32; n]), vsq: vec![0_f32; n], coeff: [0.5, -0.25, 0.125, -0.0625], n: (n_x, n_y, n_z), t: (t0, t1), x: (x0, x1), y: (y0, y1), z: (z0, z1), }; data.reinit(); data } pub fn reinit(&mut self) { let mut offset: usize = 0; for z in 0..self.n.2 { for y in 0..self.n.1 { for x in 0..self.n.0 { unsafe { *self.a.0.get_unchecked_mut(offset) = if x < self.n.0 / 2 { x as f32 / self.n.0 as f32 } else { y as f32 / self.n.1 as f32 }; *self.a.1.get_unchecked_mut(offset) = 0.; *self.vsq.get_unchecked_mut(offset) = (x * y * z) as f32 / (self.n.0 * self.n.1 * self.n.2) as f32; offset += 1; } } } } } #[rustfmt::skip] pub fn exec(&mut self, f: F) where F: Fn(i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, &[f32; 4], &[f32], &mut [f32], &mut [f32]), { f( self.t.0, self.t.1, self.x.0, self.x.1, self.y.0, self.y.1, self.z.0, self.z.1, self.n.0, self.n.1, 
self.n.2, &self.coeff, &self.vsq, &mut self.a.0, &mut self.a.1, ); } } #[cfg(test)] fn assert_data_eq(a: &Data, b: &Data) { if a == b { return; } assert_eq!(a.coeff, b.coeff, "coeffs differ"); assert_eq!(a.n, b.n, "n differ"); assert_eq!(a.t, b.t, "t differ"); assert_eq!(a.x, b.x, "x differ"); assert_eq!(a.y, b.y, "y differ"); assert_eq!(a.z, b.z, "z differ"); for z in 0..a.n.2 { for y in 0..a.n.1 { for x in 0..a.n.0 { let idx = (x + y * a.n.1 + z * a.n.1 * a.n.0) as usize; const EPSILON: f32 = 1E-4; assert!( (a.vsq[idx] - b.vsq[idx]).abs() < EPSILON, "vsq diff at idx = {} ({}, {}, {})", idx, x, y, z, ); assert!( (a.a.0[idx] - b.a.0[idx]).abs() < EPSILON, "a.0 diff at idx = {} ({}, {}, {})", idx, x, y, z, ); assert!( (a.a.1[idx] - b.a.1[idx]).abs() < EPSILON, "a.1 diff at idx = {} ({}, {}, {})", idx, x, y, z, ); } } } } ================================================ FILE: examples/stencil/src/main.rs ================================================ #![feature(custom_inner_attributes)] use stencil_lib::*; use std::env; #[rustfmt::skip] fn run(name: &str, f: F) where F: Fn(i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, &[f32; 4], &[f32], &mut [f32], &mut [f32]), { let mut d = Data::benchmark(); let t = time::Duration::span(move || d.exec(f)); println!("{}: {} ms", name, t.num_milliseconds()); } fn main() { let mut args = env::args(); args.next(); let alg: usize = args.next().unwrap().parse().unwrap(); match alg { 0 => run("scalar", self::scalar::scalar), 1 => run("vector", self::simd::x8), 2 => run("vector_par", self::simd_par::x8_par), 3 => { #[cfg(feature = "ispc")] { run("ispc", self::ispc_loops::serial); } #[cfg(not(feature = "ispc"))] { panic!("error: algorithm requires binary to be compiled with the ispc feature") } } 4 => { #[cfg(feature = "ispc")] { run("ispc+tasks", self::ispc_loops::tasks); } #[cfg(not(feature = "ispc"))] { panic!("error: algorithm requires binary to be compiled with the ispc feature") } } _ => panic!("unknown algorithm"), } } ================================================ FILE: examples/stencil/src/scalar.rs ================================================ //! Scalar implementation pub fn step( x0: i32, x1: i32, y0: i32, y1: i32, z0: i32, z1: i32, n_x: i32, n_y: i32, _n_z: i32, coef: &[f32; 4], vsq: &[f32], a_in: &[f32], a_out: &mut [f32], ) { let n_xy = n_x * n_y; for z in z0..z1 { for y in y0..y1 { for x in x0..x1 { let index = (z * n_xy) + (y * n_x) + x; macro_rules! a_cur { ($x:expr, $y:expr, $z:expr) => { a_in[(index + $x + $y * n_x + $z * n_xy) as usize] }; } macro_rules! a_next { ($x:expr, $y:expr, $z:expr) => { a_out[(index + $x + $y * n_x + $z * n_xy) as usize] }; } let mut div: f32 = coef[0] * a_cur!(0, 0, 0); for i in 1..4 { div += coef[i as usize] * (a_cur!(i, 0, 0) + a_cur!(-i, 0, 0) + a_cur!(0, i, 0) + a_cur!(0, -i, 0) + a_cur!(0, 0, i) + a_cur!(0, 0, -i)); } a_next!(0, 0, 0) = 2. 
* a_cur!(0, 0, 0) - a_next!(0, 0, 0) + vsq[index as usize] * div; } } } } pub fn scalar( t0: i32, t1: i32, x0: i32, x1: i32, y0: i32, y1: i32, z0: i32, z1: i32, n_x: i32, n_y: i32, n_z: i32, coef: &[f32; 4], vsq: &[f32], a_even: &mut [f32], a_odd: &mut [f32], ) { for t in t0..t1 { if t & 1 == 0 { step( x0, x1, y0, y1, z0, z1, n_x, n_y, n_z, coef, vsq, a_even, a_odd, ); } else { step( x0, x1, y0, y1, z0, z1, n_x, n_y, n_z, coef, vsq, a_odd, a_even, ); } } } #[cfg(all(test, feature = "ispc"))] mod tests { use super::scalar; use crate::ispc_loops::serial; use crate::{assert_data_eq, Data}; #[test] fn scalar_ispc_verify() { let mut data_scalar = Data::default(); data_scalar.exec(scalar); let mut data_ispc = Data::default(); data_ispc.exec(serial); assert_data_eq(&data_scalar, &data_ispc); } } ================================================ FILE: examples/stencil/src/simd.rs ================================================ //! SIMD implementation use packed_simd::*; #[inline(always)] pub(crate) fn step_x8( x0: i32, x1: i32, y0: i32, y1: i32, z0: i32, z1: i32, n_x: i32, n_y: i32, _n_z: i32, coef: &[f32; 4], vsq: &[f32], a_in: &[f32], a_out: &mut [f32], ) { assert!((x1 - x0) % f32x8::lanes() as i32 == 0); let n_xy = n_x * n_y; for z in z0..z1 { let z_idx = z * n_xy; for y in y0..y1 { let y_idx = y * n_x; for x in (x0..x1).step_by(f32x8::lanes()) { unsafe { let out_idx = x + y_idx; let index: i32 = z_idx + out_idx; macro_rules! a_cur { ($x:expr, $y:expr, $z:expr) => { f32x8::from_slice_unaligned_unchecked( &a_in.get_unchecked( (index + $x + $y * n_x + $z * n_xy) as usize.., ), ) }; } let cur_0 = a_cur!(0, 0, 0); let mut div: f32x8 = *coef.get_unchecked(0) * cur_0; for i in 1..4 { let coef = f32x8::splat(*coef.get_unchecked(i)); let sum = { let i = i as i32; a_cur!(i, 0, 0) + a_cur!(-i, 0, 0) + a_cur!(0, i, 0) + a_cur!(0, -i, 0) + a_cur!(0, 0, i) + a_cur!(0, 0, -i) }; div = coef.mul_adde(sum, div); } let vsq = f32x8::from_slice_unaligned_unchecked( vsq.get_unchecked(index as usize..), ); let sum = cur_0.mul_adde( f32x8::splat(2.), -f32x8::from_slice_unaligned_unchecked( a_out.get_unchecked(out_idx as usize..), ), ); let r = vsq.mul_adde(div, sum); r.write_to_slice_unaligned_unchecked( &mut a_out.get_unchecked_mut(out_idx as usize..), ); } } } } } #[inline(always)] fn x8_impl( t0: i32, t1: i32, x0: i32, x1: i32, y0: i32, y1: i32, z0: i32, z1: i32, n_x: i32, n_y: i32, n_z: i32, coef: &[f32; 4], vsq: &[f32], a_even: &mut [f32], a_odd: &mut [f32], ) { for t in t0..t1 { if t & 1 == 0 { a_odd .chunks_mut((n_x * n_y) as usize) .enumerate() .skip(z0 as usize) .take((z1 - z0) as usize) .for_each(|(z, a_odd)| { let z = z as i32; #[rustfmt::skip] step_x8(x0, x1, y0, y1, z, z + 1, n_x, n_y, n_z, coef, vsq, a_even, a_odd, ); }); } else { a_even .chunks_mut((n_x * n_y) as usize) .enumerate() .skip(z0 as usize) .take((z1 - z0) as usize) .for_each(|(z, a_even)| { let z = z as i32; #[rustfmt::skip] step_x8(x0, x1, y0, y1, z, z + 1, n_x, n_y, n_z, coef, vsq, a_odd, a_even, ); }); } } } #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] #[target_feature(enable = "avx2,fma")] unsafe fn x8_impl_avx2( t0: i32, t1: i32, x0: i32, x1: i32, y0: i32, y1: i32, z0: i32, z1: i32, n_x: i32, n_y: i32, n_z: i32, coef: &[f32; 4], vsq: &[f32], a_even: &mut [f32], a_odd: &mut [f32], ) { #[rustfmt::skip] x8_impl(t0, t1, x0, x1, y0, y1, z0, z1, n_x, n_y, n_z, coef, vsq, a_even, a_odd) } #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] #[target_feature(enable = "avx")] unsafe fn x8_impl_avx( t0: i32, t1: i32, x0: i32, x1: 
i32, y0: i32, y1: i32, z0: i32, z1: i32, n_x: i32, n_y: i32, n_z: i32, coef: &[f32; 4], vsq: &[f32], a_even: &mut [f32], a_odd: &mut [f32], ) { #[rustfmt::skip] x8_impl(t0, t1, x0, x1, y0, y1, z0, z1, n_x, n_y, n_z, coef, vsq, a_even, a_odd) } #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] #[target_feature(enable = "sse4.2")] unsafe fn x8_impl_sse42( t0: i32, t1: i32, x0: i32, x1: i32, y0: i32, y1: i32, z0: i32, z1: i32, n_x: i32, n_y: i32, n_z: i32, coef: &[f32; 4], vsq: &[f32], a_even: &mut [f32], a_odd: &mut [f32], ) { #[rustfmt::skip] x8_impl(t0, t1, x0, x1, y0, y1, z0, z1, n_x, n_y, n_z, coef, vsq, a_even, a_odd) } #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] #[target_feature(enable = "sse2")] unsafe fn x8_impl_sse2( t0: i32, t1: i32, x0: i32, x1: i32, y0: i32, y1: i32, z0: i32, z1: i32, n_x: i32, n_y: i32, n_z: i32, coef: &[f32; 4], vsq: &[f32], a_even: &mut [f32], a_odd: &mut [f32], ) { #[rustfmt::skip] x8_impl(t0, t1, x0, x1, y0, y1, z0, z1, n_x, n_y, n_z, coef, vsq, a_even, a_odd) } unsafe fn x8_impl_def( t0: i32, t1: i32, x0: i32, x1: i32, y0: i32, y1: i32, z0: i32, z1: i32, n_x: i32, n_y: i32, n_z: i32, coef: &[f32; 4], vsq: &[f32], a_even: &mut [f32], a_odd: &mut [f32], ) { #[rustfmt::skip] x8_impl(t0, t1, x0, x1, y0, y1, z0, z1, n_x, n_y, n_z, coef, vsq, a_even, a_odd) } pub fn x8( t0: i32, t1: i32, x0: i32, x1: i32, y0: i32, y1: i32, z0: i32, z1: i32, n_x: i32, n_y: i32, n_z: i32, coef: &[f32; 4], vsq: &[f32], a_even: &mut [f32], a_odd: &mut [f32], ) { #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] unsafe { if is_x86_feature_detected!("avx2") && is_x86_feature_detected!("fma") { #[rustfmt::skip] x8_impl_avx2(t0, t1, x0, x1, y0, y1, z0, z1, n_x, n_y, n_z, coef, vsq, a_even, a_odd) } else if is_x86_feature_detected!("avx") { #[rustfmt::skip] x8_impl_avx(t0, t1, x0, x1, y0, y1, z0, z1, n_x, n_y, n_z, coef, vsq, a_even, a_odd) } else if is_x86_feature_detected!("sse4.2") { #[rustfmt::skip] x8_impl_sse42(t0, t1, x0, x1, y0, y1, z0, z1, n_x, n_y, n_z, coef, vsq, a_even, a_odd) } else if is_x86_feature_detected!("sse2") { #[rustfmt::skip] x8_impl_sse2(t0, t1, x0, x1, y0, y1, z0, z1, n_x, n_y, n_z, coef, vsq, a_even, a_odd) } else { #[rustfmt::skip] x8_impl_def(t0, t1, x0, x1, y0, y1, z0, z1, n_x, n_y, n_z, coef, vsq, a_even, a_odd) } } #[cfg(not(any(target_arch = "x86", target_arch = "x86_64")))] unsafe { #[rustfmt::skip] x8_impl_def(t0, t1, x0, x1, y0, y1, z0, z1, n_x, n_y, n_z, coef, vsq, a_even, a_odd) } } #[cfg(test)] mod tests { use super::x8; use crate::scalar::scalar; use crate::{assert_data_eq, Data}; #[test] fn simd_scalar_verify() { let mut data_simd = Data::default(); data_simd.exec(x8); let mut data_scalar = Data::default(); data_scalar.exec(scalar); assert_data_eq(&data_simd, &data_scalar); } #[cfg(feature = "ispc")] #[test] fn simd_ispc_verify() { use crate::ispc_loops::serial; let mut data_simd = Data::default(); data_simd.exec(x8); let mut data_ispc = Data::default(); data_ispc.exec(serial); assert_data_eq(&data_simd, &data_ispc); } } ================================================ FILE: examples/stencil/src/simd_par.rs ================================================ //! SIMD+Rayon implementation. 
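As a point of reference, a minimal usage sketch for this module: it drives the same 15-argument kernel signature through the `Data::exec` harness from `src/lib.rs`, mirroring what `src/main.rs` does for the `vector_par` algorithm, except that it uses the smaller `Data::default()` grid instead of `Data::benchmark()`. The `main` function below is illustrative only and not part of the example crate.

```rust
// Minimal sketch: run the SIMD+Rayon stencil kernel over the default grid.
use stencil_lib::{simd_par, Data};

fn main() {
    // Data::default() builds a 128x128x128 volume with a halo width of 4
    // and 6 time steps (t0 = 0, t1 = 6).
    let mut d = Data::default();
    // x8_par feature-detects AVX2/AVX/SSE4.2/SSE2 at runtime and processes
    // the volume in parallel, one z-slice per rayon task.
    d.exec(simd_par::x8_par);
}
```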
use crate::simd::step_x8; use rayon::prelude::*; #[inline(always)] fn x8_par_impl( t0: i32, t1: i32, x0: i32, x1: i32, y0: i32, y1: i32, z0: i32, z1: i32, n_x: i32, n_y: i32, n_z: i32, coef: &[f32; 4], vsq: &[f32], a_even: &mut [f32], a_odd: &mut [f32], ) { assert!((z1 - z0) <= n_z); for t in t0..t1 { if t & 1 == 0 { a_odd .par_chunks_mut((n_x * n_y) as usize) .enumerate() .skip(z0 as usize) .take((z1 - z0) as usize) .for_each(|(z, a_odd)| { let z = z as i32; #[rustfmt::skip] step_x8( x0, x1, y0, y1, z, z + 1, n_x, n_y, n_z, coef, vsq, a_even, a_odd, ); }); } else { a_even .par_chunks_mut((n_x * n_y) as usize) .enumerate() .skip(z0 as usize) .take((z1 - z0) as usize) .for_each(|(z, a_even)| { let z = z as i32; #[rustfmt::skip] step_x8( x0, x1, y0, y1, z, z + 1, n_x, n_y, n_z, coef, vsq, a_odd, a_even, ); }); } } } #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] #[target_feature(enable = "avx2")] unsafe fn x8_par_impl_avx2( t0: i32, t1: i32, x0: i32, x1: i32, y0: i32, y1: i32, z0: i32, z1: i32, n_x: i32, n_y: i32, n_z: i32, coef: &[f32; 4], vsq: &[f32], a_even: &mut [f32], a_odd: &mut [f32], ) { x8_par_impl( t0, t1, x0, x1, y0, y1, z0, z1, n_x, n_y, n_z, coef, vsq, a_even, a_odd, ) } #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] #[target_feature(enable = "avx")] unsafe fn x8_par_impl_avx( t0: i32, t1: i32, x0: i32, x1: i32, y0: i32, y1: i32, z0: i32, z1: i32, n_x: i32, n_y: i32, n_z: i32, coef: &[f32; 4], vsq: &[f32], a_even: &mut [f32], a_odd: &mut [f32], ) { x8_par_impl( t0, t1, x0, x1, y0, y1, z0, z1, n_x, n_y, n_z, coef, vsq, a_even, a_odd, ) } #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] #[target_feature(enable = "sse4.2")] unsafe fn x8_par_impl_sse42( t0: i32, t1: i32, x0: i32, x1: i32, y0: i32, y1: i32, z0: i32, z1: i32, n_x: i32, n_y: i32, n_z: i32, coef: &[f32; 4], vsq: &[f32], a_even: &mut [f32], a_odd: &mut [f32], ) { x8_par_impl( t0, t1, x0, x1, y0, y1, z0, z1, n_x, n_y, n_z, coef, vsq, a_even, a_odd, ) } #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] #[target_feature(enable = "sse2")] unsafe fn x8_par_impl_sse2( t0: i32, t1: i32, x0: i32, x1: i32, y0: i32, y1: i32, z0: i32, z1: i32, n_x: i32, n_y: i32, n_z: i32, coef: &[f32; 4], vsq: &[f32], a_even: &mut [f32], a_odd: &mut [f32], ) { x8_par_impl( t0, t1, x0, x1, y0, y1, z0, z1, n_x, n_y, n_z, coef, vsq, a_even, a_odd, ) } unsafe fn x8_par_impl_def( t0: i32, t1: i32, x0: i32, x1: i32, y0: i32, y1: i32, z0: i32, z1: i32, n_x: i32, n_y: i32, n_z: i32, coef: &[f32; 4], vsq: &[f32], a_even: &mut [f32], a_odd: &mut [f32], ) { x8_par_impl( t0, t1, x0, x1, y0, y1, z0, z1, n_x, n_y, n_z, coef, vsq, a_even, a_odd, ) } pub fn x8_par( t0: i32, t1: i32, x0: i32, x1: i32, y0: i32, y1: i32, z0: i32, z1: i32, n_x: i32, n_y: i32, n_z: i32, coef: &[f32; 4], vsq: &[f32], a_even: &mut [f32], a_odd: &mut [f32], ) { #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] unsafe { if is_x86_feature_detected!("avx2") { #[rustfmt::skip] x8_par_impl_avx2(t0, t1, x0, x1, y0, y1, z0, z1, n_x, n_y, n_z, coef, vsq, a_even, a_odd) } else if is_x86_feature_detected!("avx") { #[rustfmt::skip] x8_par_impl_avx(t0, t1, x0, x1, y0, y1, z0, z1, n_x, n_y, n_z, coef, vsq, a_even, a_odd) } else if is_x86_feature_detected!("sse4.2") { #[rustfmt::skip] x8_par_impl_sse42(t0, t1, x0, x1, y0, y1, z0, z1, n_x, n_y, n_z, coef, vsq, a_even, a_odd) } else if is_x86_feature_detected!("sse2") { #[rustfmt::skip] x8_par_impl_sse2(t0, t1, x0, x1, y0, y1, z0, z1, n_x, n_y, n_z, coef, vsq, a_even, a_odd) } else { #[rustfmt::skip] 
x8_par_impl_def(t0, t1, x0, x1, y0, y1, z0, z1, n_x, n_y, n_z, coef, vsq, a_even, a_odd) } } #[cfg(not(any(target_arch = "x86", target_arch = "x86_64")))] unsafe { #[rustfmt::skip] x8_par_impl_def(t0, t1, x0, x1, y0, y1, z0, z1, n_x, n_y, n_z, coef, vsq, a_even, a_odd) } } #[cfg(test)] mod tests { use super::x8_par; use crate::scalar::scalar; use crate::{assert_data_eq, Data}; #[test] fn simd_par_verify() { let mut data_simd_par = Data::default(); data_simd_par.exec(x8_par); let mut data_scalar = Data::default(); data_scalar.exec(scalar); assert_data_eq(&data_simd_par, &data_scalar); } } ================================================ FILE: examples/stencil/volta/.gitignore ================================================ # Files built by ISPC /objs/ /stencil ================================================ FILE: examples/stencil/volta/Makefile ================================================ EXAMPLE=stencil CPP_SRC=stencil.cpp stencil_serial.cpp ISPC_SRC=stencil.ispc ISPC_IA_TARGETS=sse2-i32x4,sse4-i32x4,avx1-i32x8,avx2-i32x8,avx512knl-i32x16,avx512skx-i32x16 ISPC_ARM_TARGETS=neon include common.mk ================================================ FILE: examples/stencil/volta/common.mk ================================================ TASK_CXX=tasksys.cpp TASK_LIB=-lpthread TASK_OBJ=objs/tasksys.o CXX=clang++ CXXFLAGS+=-Iobjs/ -O3 -march=native CC=clang CCFLAGS+=-Iobjs/ -O3 -march=native LIBS=-lm $(TASK_LIB) -lstdc++ ISPC=ispc ISPC_FLAGS+=-O3 ISPC_HEADER=objs/$(ISPC_SRC:.ispc=_ispc.h) ARCH:=$(shell uname -m | sed -e s/x86_64/x86/ -e s/i686/x86/ -e s/arm.*/arm/ -e s/sa110/arm/) ifeq ($(ARCH),x86) ISPC_OBJS=$(addprefix objs/, $(ISPC_SRC:.ispc=)_ispc.o) COMMA=, ifneq (,$(findstring $(COMMA),$(ISPC_IA_TARGETS))) #$(info multi-target detected: $(ISPC_IA_TARGETS)) ifneq (,$(findstring sse2,$(ISPC_IA_TARGETS))) ISPC_OBJS+=$(addprefix objs/, $(ISPC_SRC:.ispc=)_ispc_sse2.o) endif ifneq (,$(findstring sse4,$(ISPC_IA_TARGETS))) ISPC_OBJS+=$(addprefix objs/, $(ISPC_SRC:.ispc=)_ispc_sse4.o) endif ifneq (,$(findstring avx1-,$(ISPC_IA_TARGETS))) ISPC_OBJS+=$(addprefix objs/, $(ISPC_SRC:.ispc=)_ispc_avx.o) endif ifneq (,$(findstring avx1.1,$(ISPC_IA_TARGETS))) ISPC_OBJS+=$(addprefix objs/, $(ISPC_SRC:.ispc=)_ispc_avx11.o) endif ifneq (,$(findstring avx2,$(ISPC_IA_TARGETS))) ISPC_OBJS+=$(addprefix objs/, $(ISPC_SRC:.ispc=)_ispc_avx2.o) endif ifneq (,$(findstring avx512knl,$(ISPC_IA_TARGETS))) ISPC_OBJS+=$(addprefix objs/, $(ISPC_SRC:.ispc=)_ispc_avx512knl.o) endif ifneq (,$(findstring avx512skx,$(ISPC_IA_TARGETS))) ISPC_OBJS+=$(addprefix objs/, $(ISPC_SRC:.ispc=)_ispc_avx512skx.o) endif endif ISPC_TARGETS=$(ISPC_IA_TARGETS) ARCH_BIT:=$(shell getconf LONG_BIT) ifeq ($(ARCH_BIT),32) ISPC_FLAGS += --arch=x86 CXXFLAGS += -m32 CCFLAGS += -m32 else ISPC_FLAGS += --arch=x86-64 CXXFLAGS += -m64 CCFLAGS += -m64 endif else ifeq ($(ARCH),arm) ISPC_OBJS=$(addprefix objs/, $(ISPC_SRC:.ispc=_ispc.o)) ISPC_TARGETS=$(ISPC_ARM_TARGETS) else $(error Unknown architecture $(ARCH) from uname -m) endif CPP_OBJS=$(addprefix objs/, $(CPP_SRC:.cpp=.o)) CC_OBJS=$(addprefix objs/, $(CC_SRC:.c=.o)) OBJS=$(CPP_OBJS) $(CC_OBJS) $(TASK_OBJ) $(ISPC_OBJS) default: $(EXAMPLE) all: $(EXAMPLE) $(EXAMPLE)-sse4 $(EXAMPLE)-generic16 $(EXAMPLE)-scalar .PHONY: dirs clean dirs: /bin/mkdir -p objs/ objs/%.cpp objs/%.o objs/%.h: dirs clean: /bin/rm -rf objs *~ $(EXAMPLE) $(EXAMPLE)-sse4 $(EXAMPLE)-generic16 ref test $(EXAMPLE): $(OBJS) $(CXX) $(CXXFLAGS) -o $@ $^ $(LIBS) objs/%.o: %.cpp dirs $(ISPC_HEADER) $(CXX) $< $(CXXFLAGS) -c -o $@ objs/%.o: 
%.c dirs $(ISPC_HEADER) $(CC) $< $(CCFLAGS) -c -o $@ objs/%.o: ../%.cpp dirs $(CXX) $< $(CXXFLAGS) -c -o $@ objs/$(EXAMPLE).o: objs/$(EXAMPLE)_ispc.h dirs objs/%_ispc.h objs/%_ispc.o objs/%_ispc_sse2.o objs/%_ispc_sse4.o objs/%_ispc_avx.o objs/%_ispc_avx11.o objs/%_ispc_avx2.o objs/%_ispc_avx512knl.o objs/%_ispc_avx512skx.o : %.ispc dirs $(ISPC) $(ISPC_FLAGS) --target=$(ISPC_TARGETS) $< -o objs/$*_ispc.o -h objs/$*_ispc.h objs/$(ISPC_SRC:.ispc=)_sse4.cpp: $(ISPC_SRC) $(ISPC) $(ISPC_FLAGS) $< -o $@ --target=generic-4 --emit-c++ --c++-include-file=sse4.h objs/$(ISPC_SRC:.ispc=)_sse4.o: objs/$(ISPC_SRC:.ispc=)_sse4.cpp $(CXX) -I../intrinsics -msse4.2 $< $(CXXFLAGS) -c -o $@ $(EXAMPLE)-sse4: $(CPP_OBJS) objs/$(ISPC_SRC:.ispc=)_sse4.o $(CXX) $(CXXFLAGS) -o $@ $^ $(LIBS) objs/$(ISPC_SRC:.ispc=)_generic16.cpp: $(ISPC_SRC) $(ISPC) $(ISPC_FLAGS) $< -o $@ --target=generic-16 --emit-c++ --c++-include-file=generic-16.h objs/$(ISPC_SRC:.ispc=)_generic16.o: objs/$(ISPC_SRC:.ispc=)_generic16.cpp $(CXX) -I../intrinsics $< $(CXXFLAGS) -c -o $@ $(EXAMPLE)-generic16: $(CPP_OBJS) objs/$(ISPC_SRC:.ispc=)_generic16.o $(CXX) $(CXXFLAGS) -o $@ $^ $(LIBS) objs/$(ISPC_SRC:.ispc=)_scalar.o: $(ISPC_SRC) $(ISPC) $(ISPC_FLAGS) $< -o $@ --target=generic-1 $(EXAMPLE)-scalar: $(CPP_OBJS) objs/$(ISPC_SRC:.ispc=)_scalar.o $(CXX) $(CXXFLAGS) -o $@ $^ $(LIBS) ================================================ FILE: examples/stencil/volta/stencil.cpp ================================================ /* Copyright (c) 2010-2014, Intel Corporation All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * Neither the name of Intel Corporation nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #ifdef _MSC_VER #define _CRT_SECURE_NO_WARNINGS #define NOMINMAX #pragma warning (disable: 4244) #pragma warning (disable: 4305) #endif #include #include #include #include #include #include "../timing.h" #include "stencil_ispc.h" using namespace ispc; extern void loop_stencil_serial(int t0, int t1, int x0, int x1, int y0, int y1, int z0, int z1, int Nx, int Ny, int Nz, const float coef[5], const float vsq[], float Aeven[], float Aodd[]); void InitData(int Nx, int Ny, int Nz, float *A[2], float *vsq) { int offset = 0; for (int z = 0; z < Nz; ++z) for (int y = 0; y < Ny; ++y) for (int x = 0; x < Nx; ++x, ++offset) { A[0][offset] = (x < Nx / 2) ? x / float(Nx) : y / float(Ny); A[1][offset] = 0; vsq[offset] = x*y*z / float(Nx * Ny * Nz); } } int main(int argc, char *argv[]) { static unsigned int test_iterations[] = {3, 3, 3};//the last two numbers must be equal here int Nx = 256, Ny = 256, Nz = 256; int width = 4; if (argc > 1) { if (strncmp(argv[1], "--scale=", 8) == 0) { float scale = atof(argv[1] + 8); Nx *= scale; Ny *= scale; Nz *= scale; } } if ((argc == 4) || (argc == 5)) { for (int i = 0; i < 3; i++) { test_iterations[i] = atoi(argv[argc - 3 + i]); } } float *Aserial[2], *Aispc[2]; Aserial[0] = new float [Nx * Ny * Nz]; Aserial[1] = new float [Nx * Ny * Nz]; Aispc[0] = new float [Nx * Ny * Nz]; Aispc[1] = new float [Nx * Ny * Nz]; float *vsq = new float [Nx * Ny * Nz]; float coeff[4] = { 0.5, -.25, .125, -.0625 }; InitData(Nx, Ny, Nz, Aispc, vsq); // // Compute the image using the ispc implementation on one core; report // the minimum time of three runs. // double minTimeISPC = 1e30; for (unsigned int i = 0; i < test_iterations[0]; ++i) { reset_and_start_timer(); loop_stencil_ispc(0, 6, width, Nx - width, width, Ny - width, width, Nz - width, Nx, Ny, Nz, coeff, vsq, Aispc[0], Aispc[1]); double dt = get_elapsed_mcycles(); printf("@time of ISPC run:\t\t\t[%.3f] million cycles\n", dt); minTimeISPC = std::min(minTimeISPC, dt); } printf("[stencil ispc 1 core]:\t\t[%.3f] million cycles\n", minTimeISPC); InitData(Nx, Ny, Nz, Aispc, vsq); // // Compute the image using the ispc implementation with tasks; report // the minimum time of three runs. // double minTimeISPCTasks = 1e30; for (unsigned int i = 0; i < test_iterations[1]; ++i) { reset_and_start_timer(); loop_stencil_ispc_tasks(0, 6, width, Nx - width, width, Ny - width, width, Nz - width, Nx, Ny, Nz, coeff, vsq, Aispc[0], Aispc[1]); double dt = get_elapsed_mcycles(); printf("@time of ISPC + TASKS run:\t\t\t[%.3f] million cycles\n", dt); minTimeISPCTasks = std::min(minTimeISPCTasks, dt); } printf("[stencil ispc + tasks]:\t\t[%.3f] million cycles\n", minTimeISPCTasks); InitData(Nx, Ny, Nz, Aserial, vsq); // // And run the serial implementation 3 times, again reporting the // minimum time. 
// double minTimeSerial = 1e30; for (unsigned int i = 0; i < test_iterations[2]; ++i) { reset_and_start_timer(); loop_stencil_serial(0, 6, width, Nx-width, width, Ny - width, width, Nz - width, Nx, Ny, Nz, coeff, vsq, Aserial[0], Aserial[1]); double dt = get_elapsed_mcycles(); printf("@time of serial run:\t\t\t[%.3f] million cycles\n", dt); minTimeSerial = std::min(minTimeSerial, dt); } printf("[stencil serial]:\t\t[%.3f] million cycles\n", minTimeSerial); printf("\t\t\t\t(%.2fx speedup from ISPC, %.2fx speedup from ISPC + tasks)\n", minTimeSerial / minTimeISPC, minTimeSerial / minTimeISPCTasks); // Check for agreement int offset = 0; for (int z = 0; z < Nz; ++z) for (int y = 0; y < Ny; ++y) for (int x = 0; x < Nx; ++x, ++offset) { float error = fabsf((Aserial[1][offset] - Aispc[1][offset]) / Aserial[1][offset]); if (error > 1e-4) printf("Error @ (%d,%d,%d): ispc = %f, serial = %f\n", x, y, z, Aispc[1][offset], Aserial[1][offset]); } return 0; } ================================================ FILE: examples/stencil/volta/stencil.ispc ================================================ /* Copyright (c) 2010-2011, Intel Corporation All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * Neither the name of Intel Corporation nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ static void stencil_step(uniform int x0, uniform int x1, uniform int y0, uniform int y1, uniform int z0, uniform int z1, uniform int Nx, uniform int Ny, uniform int Nz, uniform const float coef[4], uniform const float vsq[], uniform const float Ain[], uniform float Aout[]) { const uniform int Nxy = Nx * Ny; foreach (z = z0 ... z1, y = y0 ... y1, x = x0 ... 
x1) { int index = (z * Nxy) + (y * Nx) + x; #define A_cur(x, y, z) Ain[index + (x) + ((y) * Nx) + ((z) * Nxy)] #define A_next(x, y, z) Aout[index + (x) + ((y) * Nx) + ((z) * Nxy)] float div = coef[0] * A_cur(0, 0, 0) + coef[1] * (A_cur(+1, 0, 0) + A_cur(-1, 0, 0) + A_cur(0, +1, 0) + A_cur(0, -1, 0) + A_cur(0, 0, +1) + A_cur(0, 0, -1)) + coef[2] * (A_cur(+2, 0, 0) + A_cur(-2, 0, 0) + A_cur(0, +2, 0) + A_cur(0, -2, 0) + A_cur(0, 0, +2) + A_cur(0, 0, -2)) + coef[3] * (A_cur(+3, 0, 0) + A_cur(-3, 0, 0) + A_cur(0, +3, 0) + A_cur(0, -3, 0) + A_cur(0, 0, +3) + A_cur(0, 0, -3)); A_next(0, 0, 0) = 2 * A_cur(0, 0, 0) - A_next(0, 0, 0) + vsq[index] * div; } } static task void stencil_step_task(uniform int x0, uniform int x1, uniform int y0, uniform int y1, uniform int z0, uniform int Nx, uniform int Ny, uniform int Nz, uniform const float coef[4], uniform const float vsq[], uniform const float Ain[], uniform float Aout[]) { stencil_step(x0, x1, y0, y1, z0+taskIndex, z0+taskIndex+1, Nx, Ny, Nz, coef, vsq, Ain, Aout); } export void loop_stencil_ispc_tasks(uniform int t0, uniform int t1, uniform int x0, uniform int x1, uniform int y0, uniform int y1, uniform int z0, uniform int z1, uniform int Nx, uniform int Ny, uniform int Nz, uniform const float coef[4], uniform const float vsq[], uniform float Aeven[], uniform float Aodd[]) { for (uniform int t = t0; t < t1; ++t) { // Parallelize across cores as well: each task will work on a slice // of 1 in the z extent of the volume. if ((t & 1) == 0) launch[z1-z0] stencil_step_task(x0, x1, y0, y1, z0, Nx, Ny, Nz, coef, vsq, Aeven, Aodd); else launch[z1-z0] stencil_step_task(x0, x1, y0, y1, z0, Nx, Ny, Nz, coef, vsq, Aodd, Aeven); // We need to wait for all of the launched tasks to finish before // starting the next iteration. sync; } } export void loop_stencil_ispc(uniform int t0, uniform int t1, uniform int x0, uniform int x1, uniform int y0, uniform int y1, uniform int z0, uniform int z1, uniform int Nx, uniform int Ny, uniform int Nz, uniform const float coef[4], uniform const float vsq[], uniform float Aeven[], uniform float Aodd[]) { for (uniform int t = t0; t < t1; ++t) { if ((t & 1) == 0) stencil_step(x0, x1, y0, y1, z0, z1, Nx, Ny, Nz, coef, vsq, Aeven, Aodd); else stencil_step(x0, x1, y0, y1, z0, z1, Nx, Ny, Nz, coef, vsq, Aodd, Aeven); } } ================================================ FILE: examples/stencil/volta/stencil_serial.cpp ================================================ /* Copyright (c) 2010-2011, Intel Corporation All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * Neither the name of Intel Corporation nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ static void stencil_step_serial(int x0, int x1, int y0, int y1, int z0, int z1, int Nx, int Ny, int Nz, const float coef[4], const float vsq[], const float Ain[], float Aout[]) { int Nxy = Nx * Ny; for (int z = z0; z < z1; ++z) { for (int y = y0; y < y1; ++y) { for (int x = x0; x < x1; ++x) { int index = (z * Nxy) + (y * Nx) + x; #define A_cur(x, y, z) Ain[index + (x) + ((y) * Nx) + ((z) * Nxy)] #define A_next(x, y, z) Aout[index + (x) + ((y) * Nx) + ((z) * Nxy)] float div = coef[0] * A_cur(0, 0, 0) + coef[1] * (A_cur(+1, 0, 0) + A_cur(-1, 0, 0) + A_cur(0, +1, 0) + A_cur(0, -1, 0) + A_cur(0, 0, +1) + A_cur(0, 0, -1)) + coef[2] * (A_cur(+2, 0, 0) + A_cur(-2, 0, 0) + A_cur(0, +2, 0) + A_cur(0, -2, 0) + A_cur(0, 0, +2) + A_cur(0, 0, -2)) + coef[3] * (A_cur(+3, 0, 0) + A_cur(-3, 0, 0) + A_cur(0, +3, 0) + A_cur(0, -3, 0) + A_cur(0, 0, +3) + A_cur(0, 0, -3)); A_next(0, 0, 0) = 2 * A_cur(0, 0, 0) - A_next(0, 0, 0) + vsq[index] * div; } } } } void loop_stencil_serial(int t0, int t1, int x0, int x1, int y0, int y1, int z0, int z1, int Nx, int Ny, int Nz, const float coef[4], const float vsq[], float Aeven[], float Aodd[]) { for (int t = t0; t < t1; ++t) { if ((t & 1) == 0) stencil_step_serial(x0, x1, y0, y1, z0, z1, Nx, Ny, Nz, coef, vsq, Aeven, Aodd); else stencil_step_serial(x0, x1, y0, y1, z0, z1, Nx, Ny, Nz, coef, vsq, Aodd, Aeven); } } ================================================ FILE: examples/stencil/volta/tasksys.cpp ================================================ /* Copyright (c) 2011-2012, Intel Corporation All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * Neither the name of Intel Corporation nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ /* This file implements simple task systems that provide the three entrypoints used by ispc-generated to code to handle 'launch' and 'sync' statements in ispc programs. See the section "Task Parallelism: Language Syntax" in the ispc documentation for information about using task parallelism in ispc programs, and see the section "Task Parallelism: Runtime Requirements" for information about the task-related entrypoints that are implemented here. There are several task systems in this file, built using: - Microsoft's Concurrency Runtime (ISPC_USE_CONCRT) - Apple's Grand Central Dispatch (ISPC_USE_GCD) - bare pthreads (ISPC_USE_PTHREADS, ISPC_USE_PTHREADS_FULLY_SUBSCRIBED) - Cilk Plus (ISPC_USE_CILK) - TBB (ISPC_USE_TBB_TASK_GROUP, ISPC_USE_TBB_PARALLEL_FOR) - OpenMP (ISPC_USE_OMP) - HPX (ISPC_USE_HPX) The task system implementation can be selected at compile time, by defining the appropriate preprocessor symbol on the command line (for e.g.: -D ISPC_USE_TBB). Not all combinations of platform and task system are meaningful. If no task system is requested, a reasonable default task system for the platform is selected. Here are the task systems that can be selected: #define ISPC_USE_GCD #define ISPC_USE_CONCRT #define ISPC_USE_PTHREADS #define ISPC_USE_PTHREADS_FULLY_SUBSCRIBED #define ISPC_USE_CILK #define ISPC_USE_OMP #define ISPC_USE_TBB_TASK_GROUP #define ISPC_USE_TBB_PARALLEL_FOR The ISPC_USE_PTHREADS_FULLY_SUBSCRIBED model essentially takes over the machine by assigning one pthread to each hyper-thread, and then uses spinlocks and atomics for task management. This model is useful for KNC where tasks can take over the machine, but less so when there are other tasks that need running on the machine. #define ISPC_USE_CREW #define ISPC_USE_HPX The HPX model requires the HPX runtime environment to be set up. This can be done manually, e.g. with hpx::init, or by including hpx/hpx_main.hpp which uses the main() function as entry point and sets up the runtime system. Number of threads can be specified as commandline parameter with --hpx:threads, use "all" to spawn one thread per processing unit. 
*/ #if !(defined ISPC_USE_CONCRT || defined ISPC_USE_GCD || \ defined ISPC_USE_PTHREADS || defined ISPC_USE_PTHREADS_FULLY_SUBSCRIBED || \ defined ISPC_USE_TBB_TASK_GROUP || defined ISPC_USE_TBB_PARALLEL_FOR || \ defined ISPC_USE_OMP || defined ISPC_USE_CILK || \ defined ISPC_USE_HPX) // If no task model chosen from the compiler cmdline, pick a reasonable default #if defined(_WIN32) || defined(_WIN64) #define ISPC_USE_CONCRT #elif defined(__linux__) #define ISPC_USE_PTHREADS #elif defined(__APPLE__) #define ISPC_USE_GCD #endif #if defined(__KNC__) #define ISPC_USE_PTHREADS #endif #endif // No task model specified on compiler cmdline #if defined(_WIN32) || defined(_WIN64) #define ISPC_IS_WINDOWS #elif defined(__linux__) #define ISPC_IS_LINUX #elif defined(__APPLE__) #define ISPC_IS_APPLE #endif #if defined(__KNC__) #define ISPC_IS_KNC #endif #define DBG(x) #ifdef ISPC_IS_WINDOWS #define NOMINMAX #include #endif // ISPC_IS_WINDOWS #ifdef ISPC_USE_CONCRT #include using namespace Concurrency; #endif // ISPC_USE_CONCRT #ifdef ISPC_USE_GCD #include #include #endif // ISPC_USE_GCD #ifdef ISPC_USE_PTHREADS #include #include #include #include #include #include #include #include #include #include #include #endif // ISPC_USE_PTHREADS #ifdef ISPC_USE_PTHREADS_FULLY_SUBSCRIBED #include #include #include #include #include #include #include #include #include #include #include //#include #include #endif // ISPC_USE_PTHREADS_FULLY_SUBSCRIBED #ifdef ISPC_USE_TBB_PARALLEL_FOR #include #endif // ISPC_USE_TBB_PARALLEL_FOR #ifdef ISPC_USE_TBB_TASK_GROUP #include #endif // ISPC_USE_TBB_TASK_GROUP #ifdef ISPC_USE_CILK #include #endif // ISPC_USE_TBB #ifdef ISPC_USE_OMP #include #endif // ISPC_USE_OMP #ifdef ISPC_USE_HPX #include #include #endif // ISPC_USE_HPX #ifdef ISPC_IS_LINUX #include #endif // ISPC_IS_LINUX #include #include #include #include #include #include // Signature of ispc-generated 'task' functions typedef void (*TaskFuncType)(void *data, int threadIndex, int threadCount, int taskIndex, int taskCount, int taskIndex0, int taskIndex1, int taskIndex2, int taskCount0, int taskCount1, int taskCount2); // Small structure used to hold the data for each task #ifdef _MSC_VER __declspec(align(16)) #endif struct TaskInfo { TaskFuncType func; void *data; int taskIndex; int taskCount3d[3]; #if defined( ISPC_USE_CONCRT) event taskEvent; #endif int taskCount() const { return taskCount3d[0]*taskCount3d[1]*taskCount3d[2]; } int taskIndex0() const { return taskIndex % taskCount3d[0]; } int taskIndex1() const { return ( taskIndex / taskCount3d[0] ) % taskCount3d[1]; } int taskIndex2() const { return taskIndex / ( taskCount3d[0]*taskCount3d[1] ); } int taskCount0() const { return taskCount3d[0]; } int taskCount1() const { return taskCount3d[1]; } int taskCount2() const { return taskCount3d[2]; } TaskInfo() { assert(sizeof(TaskInfo) % 32 == 0); } } #ifndef _MSC_VER __attribute__((aligned(32))); #endif ; // ispc expects these functions to have C linkage / not be mangled extern "C" { void ISPCLaunch(void **handlePtr, void *f, void *data, int countx, int county, int countz); void *ISPCAlloc(void **handlePtr, int64_t size, int32_t alignment); void ISPCSync(void *handle); } /////////////////////////////////////////////////////////////////////////// // TaskGroupBase #define LOG_TASK_QUEUE_CHUNK_SIZE 14 #define MAX_TASK_QUEUE_CHUNKS 8 #define TASK_QUEUE_CHUNK_SIZE (1<> LOG_TASK_QUEUE_CHUNK_SIZE); int offset = index & (TASK_QUEUE_CHUNK_SIZE-1); if (chunk == MAX_TASK_QUEUE_CHUNKS) { fprintf(stderr, "A total of %d tasks have 
been launched from the " "current function--the simple built-in task system can handle " "no more. You can increase the values of TASK_QUEUE_CHUNK_SIZE " "and LOG_TASK_QUEUE_CHUNK_SIZE to work around this limitation. " "Sorry! Exiting.\n", index); exit(1); } if (taskInfo[chunk] == NULL) taskInfo[chunk] = new TaskInfo[TASK_QUEUE_CHUNK_SIZE]; return &taskInfo[chunk][offset]; } inline void * TaskGroupBase::AllocMemory(int64_t size, int32_t alignment) { char *basePtr = memBuffers[curMemBuffer]; intptr_t iptr = (intptr_t)(basePtr + curMemBufferOffset); iptr = (iptr + (alignment-1)) & ~(alignment-1); int newOffset = int(iptr - (intptr_t)basePtr + size); if (newOffset < memBufferSize[curMemBuffer]) { curMemBufferOffset = newOffset; return (char *)iptr; } ++curMemBuffer; curMemBufferOffset = 0; assert(curMemBuffer < NUM_MEM_BUFFERS); int allocSize = 1 << (12 + curMemBuffer); allocSize = std::max(int(size+alignment), allocSize); char *newBuf = new char[allocSize]; memBufferSize[curMemBuffer] = allocSize; memBuffers[curMemBuffer] = newBuf; return AllocMemory(size, alignment); } /////////////////////////////////////////////////////////////////////////// // Atomics and the like static inline void lMemFence() { // Windows atomic functions already contain the fence // KNC doesn't need the memory barrier #if !defined ISPC_IS_KNC && !defined ISPC_IS_WINDOWS __sync_synchronize(); #endif } static void * lAtomicCompareAndSwapPointer(void **v, void *newValue, void *oldValue) { #ifdef ISPC_IS_WINDOWS return InterlockedCompareExchangePointer(v, newValue, oldValue); #else void *result = __sync_val_compare_and_swap(v, oldValue, newValue); lMemFence(); return result; #endif // ISPC_IS_WINDOWS } static int32_t lAtomicCompareAndSwap32(volatile int32_t *v, int32_t newValue, int32_t oldValue) { #ifdef ISPC_IS_WINDOWS return InterlockedCompareExchange((volatile LONG *)v, newValue, oldValue); #else int32_t result = __sync_val_compare_and_swap(v, oldValue, newValue); lMemFence(); return result; #endif // ISPC_IS_WINDOWS } static inline int32_t lAtomicAdd(volatile int32_t *v, int32_t delta) { #ifdef ISPC_IS_WINDOWS return InterlockedExchangeAdd((volatile LONG *)v, delta)+delta; #else return __sync_fetch_and_add(v, delta); #endif } /////////////////////////////////////////////////////////////////////////// #ifdef ISPC_USE_CONCRT // With ConcRT, we don't need to extend TaskGroupBase at all. class TaskGroup : public TaskGroupBase { public: void Launch(int baseIndex, int count); void Sync(); }; #endif // ISPC_USE_CONCRT #ifdef ISPC_USE_GCD /* With Grand Central Dispatch, we associate a GCD dispatch group with each task group. (We'll later wait on this dispatch group when we need to wait on all of the tasks in the group to finish.) 
*/ class TaskGroup : public TaskGroupBase { public: TaskGroup() { gcdGroup = dispatch_group_create(); } void Launch(int baseIndex, int count); void Sync(); private: dispatch_group_t gcdGroup; }; #endif // ISPC_USE_GCD #ifdef ISPC_USE_PTHREADS static void *lTaskEntry(void *arg); class TaskGroup : public TaskGroupBase { public: TaskGroup() { numUnfinishedTasks = 0; waitingTasks.reserve(128); inActiveList = false; } void Reset() { TaskGroupBase::Reset(); numUnfinishedTasks = 0; assert(inActiveList == false); lMemFence(); } void Launch(int baseIndex, int count); void Sync(); private: friend void *lTaskEntry(void *arg); int32_t numUnfinishedTasks; int32_t pad[3]; std::vector waitingTasks; bool inActiveList; }; #endif // ISPC_USE_PTHREADS #ifdef ISPC_USE_CILK class TaskGroup : public TaskGroupBase { public: void Launch(int baseIndex, int count); void Sync(); }; #endif // ISPC_USE_CILK #ifdef ISPC_USE_OMP class TaskGroup : public TaskGroupBase { public: void Launch(int baseIndex, int count); void Sync(); }; #endif // ISPC_USE_OMP #ifdef ISPC_USE_TBB_PARALLEL_FOR class TaskGroup : public TaskGroupBase { public: void Launch(int baseIndex, int count); void Sync(); }; #endif // ISPC_USE_TBB_PARALLEL_FOR #ifdef ISPC_USE_TBB_TASK_GROUP class TaskGroup : public TaskGroupBase { public: void Launch(int baseIndex, int count); void Sync(); private: tbb::task_group tbbTaskGroup; }; #endif // ISPC_USE_TBB_TASK_GROUP #ifdef ISPC_USE_HPX class TaskGroup : public TaskGroupBase { public: void Launch(int baseIndex, int count); void Sync(); private: std::vector> futures; }; #endif // ISPC_USE_HPX /////////////////////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////////////////// // Grand Central Dispatch #ifdef ISPC_USE_GCD /* A simple task system for ispc programs based on Apple's Grand Central Dispatch. */ static dispatch_queue_t gcdQueue; static volatile int32_t lock = 0; static void InitTaskSystem() { if (gcdQueue != NULL) return; while (1) { if (lAtomicCompareAndSwap32(&lock, 1, 0) == 0) { if (gcdQueue == NULL) { gcdQueue = dispatch_get_global_queue(DISPATCH_QUEUE_PRIORITY_DEFAULT, 0); assert(gcdQueue != NULL); lMemFence(); } lock = 0; break; } } } static void lRunTask(void *ti) { TaskInfo *taskInfo = (TaskInfo *)ti; // FIXME: these are bogus values; may cause bugs in code that depends // on them having unique values in different threads. int threadIndex = 0; int threadCount = 1; // Actually run the task taskInfo->func(taskInfo->data, threadIndex, threadCount, taskInfo->taskIndex, taskInfo->taskCount(), taskInfo->taskIndex0(), taskInfo->taskIndex1(), taskInfo->taskIndex2(), taskInfo->taskCount0(), taskInfo->taskCount1(), taskInfo->taskCount2()); } inline void TaskGroup::Launch(int baseIndex, int count) { for (int i = 0; i < count; ++i) { TaskInfo *ti = GetTaskInfo(baseIndex + i); dispatch_group_async_f(gcdGroup, gcdQueue, ti, lRunTask); } } inline void TaskGroup::Sync() { dispatch_group_wait(gcdGroup, DISPATCH_TIME_FOREVER); } #endif // ISPC_USE_GCD /////////////////////////////////////////////////////////////////////////// // Concurrency Runtime #ifdef ISPC_USE_CONCRT static void InitTaskSystem() { // No initialization needed } static void __cdecl lRunTask(LPVOID param) { TaskInfo *ti = (TaskInfo *)param; // Actually run the task. // FIXME: like the GCD implementation for OS X, this is passing bogus // values for the threadIndex and threadCount builtins, which in turn // will cause bugs in code that uses those. 
int threadIndex = 0; int threadCount = 1; ti->func(ti->data, threadIndex, threadCount, ti->taskIndex, ti->taskCount(), ti->taskIndex0(), ti->taskIndex1(), ti->taskIndex2(), ti->taskCount0(), ti->taskCount1(), ti->taskCount2()); // Signal the event that this task is done ti->taskEvent.set(); } inline void TaskGroup::Launch(int baseIndex, int count) { for (int i = 0; i < count; ++i) CurrentScheduler::ScheduleTask(lRunTask, GetTaskInfo(baseIndex + i)); } inline void TaskGroup::Sync() { for (int i = 0; i < nextTaskInfoIndex; ++i) { TaskInfo *ti = GetTaskInfo(i); ti->taskEvent.wait(); ti->taskEvent.reset(); } } #endif // ISPC_USE_CONCRT /////////////////////////////////////////////////////////////////////////// // pthreads #ifdef ISPC_USE_PTHREADS static volatile int32_t lock = 0; static int nThreads; static pthread_t *threads = NULL; static pthread_mutex_t taskSysMutex; static std::vector activeTaskGroups; static sem_t *workerSemaphore; static void * lTaskEntry(void *arg) { int threadIndex = (int)((int64_t)arg); int threadCount = nThreads; while (1) { int err; // // Wait on the semaphore until we're woken up due to the arrival of // more work. // if ((err = sem_wait(workerSemaphore)) != 0) { fprintf(stderr, "Error from sem_wait: %s\n", strerror(err)); exit(1); } // // Acquire the mutex // if ((err = pthread_mutex_lock(&taskSysMutex)) != 0) { fprintf(stderr, "Error from pthread_mutex_lock: %s\n", strerror(err)); exit(1); } if (activeTaskGroups.size() == 0) { // // Task queue is empty, go back and wait on the semaphore // if ((err = pthread_mutex_unlock(&taskSysMutex)) != 0) { fprintf(stderr, "Error from pthread_mutex_unlock: %s\n", strerror(err)); exit(1); } continue; } // // Get the last task group on the active list and the last task // from its waiting tasks list. // TaskGroup *tg = activeTaskGroups.back(); assert(tg->waitingTasks.size() > 0); int taskNumber = tg->waitingTasks.back(); tg->waitingTasks.pop_back(); if (tg->waitingTasks.size() == 0) { // We just took the last task from this task group, so remove // it from the active list. activeTaskGroups.pop_back(); tg->inActiveList = false; } if ((err = pthread_mutex_unlock(&taskSysMutex)) != 0) { fprintf(stderr, "Error from pthread_mutex_unlock: %s\n", strerror(err)); exit(1); } // // And now actually run the task // DBG(fprintf(stderr, "running task %d from group %p\n", taskNumber, tg)); TaskInfo *myTask = tg->GetTaskInfo(taskNumber); myTask->func(myTask->data, threadIndex, threadCount, myTask->taskIndex, myTask->taskCount(), myTask->taskIndex0(), myTask->taskIndex1(), myTask->taskIndex2(), myTask->taskCount0(), myTask->taskCount1(), myTask->taskCount2()); // // Decrement the "number of unfinished tasks" counter in the task // group. // lMemFence(); lAtomicAdd(&tg->numUnfinishedTasks, -1); } pthread_exit(NULL); return 0; } static void InitTaskSystem() { if (threads == NULL) { while (1) { if (lAtomicCompareAndSwap32(&lock, 1, 0) == 0) { if (threads == NULL) { // We launch one fewer thread than there are cores, // since the main thread here will also grab jobs from // the task queue itself. 
nThreads = sysconf(_SC_NPROCESSORS_ONLN) - 1; int err; if ((err = pthread_mutex_init(&taskSysMutex, NULL)) != 0) { fprintf(stderr, "Error creating mutex: %s\n", strerror(err)); exit(1); } char name[32]; bool success = false; srand(time(NULL)); for (int i = 0; i < 10; i++) { sprintf(name, "ispc_task.%d.%d", (int)getpid(), (int)rand()); workerSemaphore = sem_open(name, O_CREAT, S_IRUSR|S_IWUSR, 0); if (workerSemaphore != SEM_FAILED) { success = true; break; } fprintf(stderr, "Failed to create %s\n", name); } if (!success) { fprintf(stderr, "Error creating semaphore (%s): %s\n", name, strerror(errno)); exit(1); } threads = (pthread_t *)malloc(nThreads * sizeof(pthread_t)); for (int i = 0; i < nThreads; ++i) { err = pthread_create(&threads[i], NULL, &lTaskEntry, (void *)((long long)i)); if (err != 0) { fprintf(stderr, "Error creating pthread %d: %s\n", i, strerror(err)); exit(1); } } activeTaskGroups.reserve(64); } // Make sure all of the above goes to memory before we // clear the lock. lMemFence(); lock = 0; break; } } } } inline void TaskGroup::Launch(int baseCoord, int count) { // // Acquire mutex, add task // int err; if ((err = pthread_mutex_lock(&taskSysMutex)) != 0) { fprintf(stderr, "Error from pthread_mutex_lock: %s\n", strerror(err)); exit(1); } // Add the corresponding set of tasks to the waiting-to-be-run list for // this task group. // // FIXME: it's a little ugly to hold a global mutex for this when we // only need to make sure no one else is accessing this task group's // waitingTasks list. (But a small experiment in switching to a // per-TaskGroup mutex showed worse performance!) for (int i = 0; i < count; ++i) waitingTasks.push_back(baseCoord + i); // Add the task group to the global active list if it isn't there // already. if (inActiveList == false) { activeTaskGroups.push_back(this); inActiveList = true; } if ((err = pthread_mutex_unlock(&taskSysMutex)) != 0) { fprintf(stderr, "Error from pthread_mutex_unlock: %s\n", strerror(err)); exit(1); } // // Update the count of the number of tasks left to run in this task // group. // lMemFence(); lAtomicAdd(&numUnfinishedTasks, count); // // Post to the worker semaphore to wake up worker threads that are // sleeping waiting for tasks to show up // for (int i = 0; i < count; ++i) if ((err = sem_post(workerSemaphore)) != 0) { fprintf(stderr, "Error from sem_post: %s\n", strerror(err)); exit(1); } } inline void TaskGroup::Sync() { DBG(fprintf(stderr, "syncing %p - %d unfinished\n", tg, numUnfinishedTasks)); while (numUnfinishedTasks > 0) { // All of the tasks in this group aren't finished yet. We'll try // to help out here since we don't have anything else to do... DBG(fprintf(stderr, "while syncing %p - %d unfinished\n", tg, numUnfinishedTasks)); // // Acquire the global task system mutex to grab a task to work on // int err; if ((err = pthread_mutex_lock(&taskSysMutex)) != 0) { fprintf(stderr, "Error from pthread_mutex_lock: %s\n", strerror(err)); exit(1); } TaskInfo *myTask = NULL; TaskGroup *runtg = this; if (waitingTasks.size() > 0) { int taskNumber = waitingTasks.back(); waitingTasks.pop_back(); if (waitingTasks.size() == 0) { // There's nothing left to start running from this group, // so remove it from the active task list. 
activeTaskGroups.erase(std::find(activeTaskGroups.begin(), activeTaskGroups.end(), this)); inActiveList = false; } myTask = GetTaskInfo(taskNumber); DBG(fprintf(stderr, "running task %d from group %p in sync\n", taskNumber, tg)); } else { // Other threads are already working on all of the tasks in // this group, so we can't help out by running one ourself. // We'll try to run one from another group to make ourselves // useful here. if (activeTaskGroups.size() == 0) { // No active task groups left--there's nothing for us to do. if ((err = pthread_mutex_unlock(&taskSysMutex)) != 0) { fprintf(stderr, "Error from pthread_mutex_unlock: %s\n", strerror(err)); exit(1); } // FIXME: We basically end up busy-waiting here, which is // extra wasteful in a world with hyper-threading. It would // be much better to put this thread to sleep on a // condition variable that was signaled when the last task // in this group was finished. #ifndef ISPC_IS_KNC usleep(1); #else _mm_delay_32(8); #endif continue; } // Get a task to run from another task group. runtg = activeTaskGroups.back(); assert(runtg->waitingTasks.size() > 0); int taskNumber = runtg->waitingTasks.back(); runtg->waitingTasks.pop_back(); if (runtg->waitingTasks.size() == 0) { // There's left to start running from this group, so remove // it from the active task list. activeTaskGroups.pop_back(); runtg->inActiveList = false; } myTask = runtg->GetTaskInfo(taskNumber); DBG(fprintf(stderr, "running task %d from other group %p in sync\n", taskNumber, runtg)); } if ((err = pthread_mutex_unlock(&taskSysMutex)) != 0) { fprintf(stderr, "Error from pthread_mutex_unlock: %s\n", strerror(err)); exit(1); } // // Do work for _myTask_ // // FIXME: bogus values for thread index/thread count here as well.. myTask->func(myTask->data, 0, 1, myTask->taskIndex, myTask->taskCount(), myTask->taskIndex0(), myTask->taskIndex1(), myTask->taskIndex2(), myTask->taskCount0(), myTask->taskCount1(), myTask->taskCount2()); // // Decrement the number of unfinished tasks counter // lMemFence(); lAtomicAdd(&runtg->numUnfinishedTasks, -1); } DBG(fprintf(stderr, "sync for %p done!n", tg)); } #endif // ISPC_USE_PTHREADS /////////////////////////////////////////////////////////////////////////// // Cilk Plus #ifdef ISPC_USE_CILK static void InitTaskSystem() { // No initialization needed } inline void TaskGroup::Launch(int baseIndex, int count) { cilk_for(int i = 0; i < count; i++) { TaskInfo *ti = GetTaskInfo(baseIndex + i); // Actually run the task. // Cilk does not expose the task -> thread mapping so we pretend it's 1:1 ti->func(ti->data, ti->taskIndex, ti->taskCount(), ti->taskIndex0(), ti->taskIndex1(), ti->taskIndex2(), ti->taskCount0(), ti->taskCount1(), ti->taskCount2()); } } inline void TaskGroup::Sync() { } #endif // ISPC_USE_CILK /////////////////////////////////////////////////////////////////////////// // OpenMP #ifdef ISPC_USE_OMP static void InitTaskSystem() { // No initialization needed } inline void TaskGroup::Launch(int baseIndex, int count) { #pragma omp parallel { const int threadIndex = omp_get_thread_num(); const int threadCount = omp_get_num_threads(); #pragma omp for schedule(runtime) for(int i = 0; i < count; i++) { TaskInfo *ti = GetTaskInfo(baseIndex + i); // Actually run the task. 
ti->func(ti->data, threadIndex, threadCount, ti->taskIndex, ti->taskCount(), ti->taskIndex0(), ti->taskIndex1(), ti->taskIndex2(), ti->taskCount0(), ti->taskCount1(), ti->taskCount2()); } } } inline void TaskGroup::Sync() { } #endif // ISPC_USE_OMP /////////////////////////////////////////////////////////////////////////// // Thread Building Blocks #ifdef ISPC_USE_TBB_PARALLEL_FOR static void InitTaskSystem() { // No initialization needed by default //tbb::task_scheduler_init(); } inline void TaskGroup::Launch(int baseIndex, int count) { tbb::parallel_for(0, count, [=](int i) { TaskInfo *ti = GetTaskInfo(baseIndex + i); // Actually run the task. // TBB does not expose the task -> thread mapping so we pretend it's 1:1 int threadIndex = ti->taskIndex; int threadCount = ti->taskCount(); ti->func(ti->data, threadIndex, threadCount, ti->taskIndex, ti->taskCount(), ti->taskIndex0(), ti->taskIndex1(), ti->taskIndex2(), ti->taskCount0(), ti->taskCount1(), ti->taskCount2()); }); } inline void TaskGroup::Sync() { } #endif // ISPC_USE_TBB_PARALLEL_FOR #ifdef ISPC_USE_TBB_TASK_GROUP static void InitTaskSystem() { // No initialization needed by default //tbb::task_scheduler_init(); } inline void TaskGroup::Launch(int baseIndex, int count) { for (int i = 0; i < count; i++) { tbbTaskGroup.run([=]() { TaskInfo *ti = GetTaskInfo(baseIndex + i); // TBB does not expose the task -> thread mapping so we pretend it's 1:1 int threadIndex = ti->taskIndex; int threadCount = ti->taskCount(); ti->func(ti->data, threadIndex, threadCount, ti->taskIndex, ti->taskCount(), ti->taskIndex0(), ti->taskIndex1(), ti->taskIndex2(), ti->taskCount0(), ti->taskCount1(), ti->taskCount2()); }); } } inline void TaskGroup::Sync() { tbbTaskGroup.wait(); } #endif // ISPC_USE_TBB_TASK_GROUP /////////////////////////////////////////////////////////////////////////// // ISPC_USE_HPX #ifdef ISPC_USE_HPX static void InitTaskSystem() { } inline void TaskGroup::Launch(int baseIndex, int count) { for (int i = 0; i < count; ++i) { TaskInfo *ti = GetTaskInfo(baseIndex + i); int threadIndex = i; int threadCount = count; futures.push_back(hpx::async(ti->func, ti->data, threadIndex, threadCount, ti->taskIndex, ti->taskCount(), ti->taskIndex0(), ti->taskIndex1(), ti->taskIndex2(), ti->taskCount0(), ti->taskCount1(), ti->taskCount2())); } } inline void TaskGroup::Sync() { hpx::wait_all(futures); futures.clear(); } #endif /////////////////////////////////////////////////////////////////////////// #ifndef ISPC_USE_PTHREADS_FULLY_SUBSCRIBED #define MAX_FREE_TASK_GROUPS 64 static TaskGroup *freeTaskGroups[MAX_FREE_TASK_GROUPS]; static inline TaskGroup * AllocTaskGroup() { for (int i = 0; i < MAX_FREE_TASK_GROUPS; ++i) { TaskGroup *tg = freeTaskGroups[i]; if (tg != NULL) { void *ptr = lAtomicCompareAndSwapPointer((void **)(&freeTaskGroups[i]), NULL, tg); if (ptr != NULL) { return (TaskGroup *)ptr; } } } return new TaskGroup; } static inline void FreeTaskGroup(TaskGroup *tg) { tg->Reset(); for (int i = 0; i < MAX_FREE_TASK_GROUPS; ++i) { if (freeTaskGroups[i] == NULL) { void *ptr = lAtomicCompareAndSwapPointer((void **)&freeTaskGroups[i], tg, NULL); if (ptr == NULL) return; } } delete tg; } /////////////////////////////////////////////////////////////////////////// void ISPCLaunch(void **taskGroupPtr, void *func, void *data, int count0, int count1, int count2) { const int count = count0*count1*count2; TaskGroup *taskGroup; if (*taskGroupPtr == NULL) { InitTaskSystem(); taskGroup = AllocTaskGroup(); *taskGroupPtr = taskGroup; } else taskGroup = (TaskGroup 
*)(*taskGroupPtr); int baseIndex = taskGroup->AllocTaskInfo(count); for (int i = 0; i < count; ++i) { TaskInfo *ti = taskGroup->GetTaskInfo(baseIndex+i); ti->func = (TaskFuncType)func; ti->data = data; ti->taskIndex = i; ti->taskCount3d[0] = count0; ti->taskCount3d[1] = count1; ti->taskCount3d[2] = count2; } taskGroup->Launch(baseIndex, count); } void ISPCSync(void *h) { TaskGroup *taskGroup = (TaskGroup *)h; if (taskGroup != NULL) { taskGroup->Sync(); FreeTaskGroup(taskGroup); } } void * ISPCAlloc(void **taskGroupPtr, int64_t size, int32_t alignment) { TaskGroup *taskGroup; if (*taskGroupPtr == NULL) { InitTaskSystem(); taskGroup = AllocTaskGroup(); *taskGroupPtr = taskGroup; } else taskGroup = (TaskGroup *)(*taskGroupPtr); return taskGroup->AllocMemory(size, alignment); } #else // ISPC_USE_PTHREADS_FULLY_SUBSCRIBED #define MAX_LIVE_TASKS 1024 pthread_mutex_t mutex = PTHREAD_MUTEX_INITIALIZER; // Small structure used to hold the data for each task struct Task { public: TaskFuncType func; void *data; volatile int32_t taskIndex; int taskCount; volatile int numDone; int liveIndex; // index in live task queue inline int noMoreWork() { return taskIndex >= taskCount; } /*! given thread is done working on this task --> decrease num locks */ // inline void lock() { lAtomicAdd(&locks,1); } // inline void unlock() { lAtomicAdd(&locks,-1); } inline int nextJob() { return lAtomicAdd(&taskIndex,1); } inline int numJobs() { return taskCount; } inline void schedule(int idx) { taskIndex = 0; numDone = 0; liveIndex = idx; } inline void run(int idx, int threadIdx); inline void markOneDone() { lAtomicAdd(&numDone,1); } inline void wait() { while (!noMoreWork()) { int next = nextJob(); if (next < numJobs()) run(next, 0); } while (numDone != taskCount) { #ifndef ISPC_IS_KNC usleep(1); #else _mm_delay_32(8); #endif } } }; /////////////////////////////////////////////////////////////////////////// class TaskSys { static int numThreadsRunning; struct LiveTask { volatile int locks; /*!< num locks on this task. gets initialized to NUM_THREADS+1, then counted down by every thread that sees this. this value is only valid when 'active' is set to true */ volatile int active; /*! workers will spin on this until it becomes active */ Task *task; inline void doneWithThis() { lAtomicAdd(&locks,-1); } LiveTask() : active(0), locks(-1) {} }; public: volatile int nextScheduleIndex; /*! 
next index in the task queue where we'll insert a live task */ // inline int inc_begin() { int old = begin; begin = (begin+1)%MAX_TASKS; return old; } // inline int inc_end() { int old = end; end = (end+1)%MAX_TASKS; return old; } LiveTask taskQueue[MAX_LIVE_TASKS]; std::stack taskMem; static TaskSys *global; TaskSys() : nextScheduleIndex(0) { TaskSys::global = this; Task *mem = new Task[MAX_LIVE_TASKS]; //< could actually be more than _live_ tasks for (int i=0;ischedule(liveIndex); taskQueue[liveIndex].locks = numThreadsRunning+1; // num _worker_ threads plus creator taskQueue[liveIndex].active = true; pthread_mutex_unlock(&mutex); } void sync(Task *task) { task->wait(); int liveIndex = task->liveIndex; while (taskQueue[liveIndex].locks > 1) { #ifndef ISPC_IS_KNC usleep(1); #else _mm_delay_32(8); #endif } _mm_free(task->data); pthread_mutex_lock(&mutex); taskMem.push(task); // recycle task index taskQueue[liveIndex].active = false; pthread_mutex_unlock(&mutex); } }; void TaskSys::threadFct() { int myIndex = 0; //lAtomicAdd(&threadIdx,1); while (1) { while (!taskQueue[myIndex].active) { #ifndef ISPC_IS_KNC usleep(4); #else _mm_delay_32(32); #endif continue; } Task *mine = taskQueue[myIndex].task; while (!mine->noMoreWork()) { int job = mine->nextJob(); if (job >= mine->numJobs()) break; mine->run(job,myIndex); } taskQueue[myIndex].doneWithThis(); myIndex = (myIndex+1)%MAX_LIVE_TASKS; } } inline void Task::run(int idx, int threadIdx) { (*this->func)(data,threadIdx,TaskSys::global->nThreads,idx,taskCount); markOneDone(); } void *_threadFct(void *data) { ((TaskSys*)data)->threadFct(); return NULL; } void TaskSys::createThreads() { init(); int reserved = 4; int minid = 2; nThreads = sysconf(_SC_NPROCESSORS_ONLN) - reserved; thread = (pthread_t *)malloc(nThreads * sizeof(pthread_t)); numThreadsRunning = 0; for (int i = 0; i < nThreads; ++i) { pthread_attr_t attr; pthread_attr_init(&attr); pthread_attr_setstacksize(&attr, 2*1024 * 1024); int threadID = minid+i; cpu_set_t cpuset; CPU_ZERO(&cpuset); CPU_SET(threadID,&cpuset); int ret = pthread_attr_setaffinity_np(&attr,sizeof(cpuset),&cpuset); int err = pthread_create(&thread[i], &attr, &_threadFct, this); ++numThreadsRunning; if (err != 0) { fprintf(stderr, "Error creating pthread %d: %s\n", i, strerror(err)); exit(1); } } } TaskSys * TaskSys::global = NULL; int TaskSys::numThreadsRunning = 0; /////////////////////////////////////////////////////////////////////////// void ISPCLaunch(void **taskGroupPtr, void *func, void *data, int count) { Task *ti = *(Task**)taskGroupPtr; ti->func = (TaskFuncType)func; ti->data = data; ti->taskIndex = 0; ti->taskCount = count; TaskSys::global->schedule(ti); } void ISPCSync(void *h) { Task *task = (Task *)h; assert(task); TaskSys::global->sync(task); } void *ISPCAlloc(void **taskGroupPtr, int64_t size, int32_t alignment) { TaskSys::init(); Task *task = TaskSys::global->allocOne(); *taskGroupPtr = task; task->data = _mm_malloc(size,alignment); return task->data;//*taskGroupPtr; } #endif // ISPC_USE_PTHREADS_FULLY_SUBSCRIBED ================================================ FILE: examples/stencil/volta/timing.h ================================================ /* Copyright (c) 2010-2011, Intel Corporation All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 
* Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * Neither the name of Intel Corporation nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include #ifdef __arm__ #include // There's no easy way to get a hardware clock counter on ARM, so instead // we'll pretend it's a 1GHz processor and then compute pretend cycles // based on elapsed time from gettimeofday(). __inline__ uint64_t rdtsc() { static bool first = true; static struct timeval tv_start; if (first) { gettimeofday(&tv_start, NULL); first = false; return 0; } struct timeval tv; gettimeofday(&tv, NULL); tv.tv_sec -= tv_start.tv_sec; tv.tv_usec -= tv_start.tv_usec; return (1000000ull * tv.tv_sec + tv.tv_usec) * 1000ull; } #include static inline double rtc(void) { struct timeval Tvalue; double etime; struct timezone dummy; gettimeofday(&Tvalue,&dummy); etime = (double) Tvalue.tv_sec + 1.e-6*((double) Tvalue.tv_usec); return etime; } #else // __arm__ #ifdef WIN32 #include #define rdtsc __rdtsc #else // WIN32 __inline__ uint64_t rdtsc() { uint32_t low, high; #ifdef __x86_64 __asm__ __volatile__ ("xorl %%eax,%%eax \n cpuid" ::: "%rax", "%rbx", "%rcx", "%rdx" ); #else __asm__ __volatile__ ("xorl %%eax,%%eax \n cpuid" ::: "%eax", "%ebx", "%ecx", "%edx" ); #endif __asm__ __volatile__ ("rdtsc" : "=a" (low), "=d" (high)); return (uint64_t)high << 32 | low; } #include static inline double rtc(void) { struct timeval Tvalue; double etime; struct timezone dummy; gettimeofday(&Tvalue,&dummy); etime = (double) Tvalue.tv_sec + 1.e-6*((double) Tvalue.tv_usec); return etime; } #endif // !WIN32 #endif // !__arm__ static uint64_t start, end; static double tstart, tend; static inline void reset_and_start_timer() { start = rdtsc(); #ifndef WIN32 // Unused in Windows build, rtc() causing link errors tstart = rtc(); #endif } /* Returns the number of millions of elapsed processor cycles since the last reset_and_start_timer() call. */ static inline double get_elapsed_mcycles() { end = rdtsc(); return (end-start) / (1024. * 1024.); } #ifndef WIN32 // Unused in Windows build, rtc() causing link errors static inline double get_elapsed_msec() { tend = rtc(); return (tend - tstart)*1e3; } #endif ================================================ FILE: examples/triangle_xform/Cargo.toml ================================================ [package] name = "triangle_xform" version = "0.1.0" authors = ["Gonzalo Brito Gadeschi "] edition = "2018" [dependencies] packed_simd = { package = "packed_simd", path = "../.." 
} [dev-dependencies] rand = "0.7.0" time = "0.1.40" ================================================ FILE: examples/triangle_xform/readme.md ================================================ # Transforming triangle vertices using a transformation matrix ## Description This example contains the SIMD implementation of a common computer graphics task: transforming vertices with a matrix. ## Implementation There are two implementations: - scalar version, uses an array-of-structures layout, where each triangle contains three vertices, and each vertex contains only a 3D position vector; the algorithm operates on **one triangle at a time**. - SIMD version, uses a structure-of-arrays layout, where the structure contains, for each of the X, Y, and Z components of a 3D vector, an array of their values; the algorithm operates on **up to N triangles at once**, where N is number of lanes in a SIMD register. To simplify the implementation, the transformation matrix is composed only of simple rotation, scaling and translation matrices. Both implementations are single-threaded. They can be easily parallelized using [rayon] and dividing the list of triangles into chunks. [rayon]: https://github.com/rayon-rs/rayon ## Benchmark results This crate is mainly intended for educational purposes, since performance improvements will likely come from using the transformed triangles in SIMD layout further down the pipeline. In order to compare the generated results, the tests will convert the SIMD output back into a scalar representation. That being said, the crate's tests also come with a micro-benchmark. It is recommended to increase the `TRIANGLE_COUNT` constant to the point where you get accurate benchmark results. Run the unit tests in release mode, and with `stdout` capture disabled: ```sh cargo test --release -- --no-capture ``` Benchmark results on an Intel i5 with AVX, for 2^24 triangles: | algorithm | time | |-----------|--------| | scalar | 255 ms | | simd | 237 ms | (**Note**: the benchmark does not take into account the time required for transforming the data into an SIMD layout) SIMD is a mere 7% faster than the scalar algorithm, since LLVM was already able to vectorize most of the multiplication code. Since we're not doing a lot of processing on the triangles after transforming them, this "benchmark" is very limited by memory bandwidth. ================================================ FILE: examples/triangle_xform/src/lib.rs ================================================ #![allow(clippy::must_use_candidate)] /// Simple matrix type. /// The memory layout is the same as the one for Direct3D/OpenGL: fourth vector /// represents the translation vector `[x, y, z]`. type Matrix = [[f32; 3]; 4]; /// Scalar implementation of the triangle transform. pub mod scalar; /// SIMD implementation of the triangle transform. pub mod simd; #[cfg(test)] mod tests { use super::*; use rand::prelude::*; const TRIANGLE_COUNT: usize = 1 << 5; #[test] fn compare_scalar_simd() { let dist = rand::distributions::Standard; let mut rng = thread_rng(); // Generate a random triangle let triangles = dist .sample_iter(&mut rng) .take(TRIANGLE_COUNT) .collect::>(); // Generate a random matrix let mat: Matrix = dist.sample(&mut rng); // Benchmark scalar performance let mut scalar_xformed = Vec::new(); let scalar_dur = time::Duration::span(|| { scalar_xformed = triangles .iter() .map(|tri| tri.transform(mat)) .collect::>(); }); // Convert the random triangles to a structure-of-arrays format. 
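// (TRIANGLE_COUNT is a multiple of VecF::lanes(), so every chunk below is a full SIMD batch, which `simd::Triangle::pack` asserts.)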
let triangles = triangles .chunks(simd::VecF::lanes()) .map(|tris| simd::Triangle::pack(tris)) .collect::>(); // Benchmark SIMD performance let mut simd_xformed = Vec::new(); let simd_dur = time::Duration::span(|| { simd_xformed = triangles .iter() .map(|tri| tri.transform(mat)) .collect::>(); }); println!("scalar: {} ms", scalar_dur.num_milliseconds()); println!("simd: {} ms", simd_dur.num_milliseconds()); // Convert SIMD results back to AOS layout for comparison test let simd_xformed = simd_xformed .into_iter() .flat_map(|tri| tri.unpack()) .collect::>(); const EPSILON: f32 = 1E-5; if scalar_xformed != simd_xformed { scalar_xformed.into_iter().zip(simd_xformed.into_iter()).for_each( |(a, b)| { if a != b { a.0.iter().zip(b.0.iter()).for_each( |(v1, v2)| { v1.iter().zip(v2.iter()).for_each( |(a, b)| { assert!( (a - b).abs() <= EPSILON, "Vertex components do not match" ); }, ); }, ); } }, ); } } } ================================================ FILE: examples/triangle_xform/src/scalar.rs ================================================ use super::Matrix; /// Vertex data: a single 3D vector of floats, representing position. pub type Vertex = [f32; 3]; /// Triangle type for array-of-structs layout. #[derive(Debug, Default, Copy, Clone, PartialEq)] pub struct Triangle(pub [Vertex; 3]); impl Triangle { /// Transforms this triangle by multiplying with a matrix. #[inline] pub fn transform(self, mat: Matrix) -> Self { let mut xformed: [Vertex; 3] = Default::default(); let vertices = self.0; let col_a = mat[0]; let col_b = mat[1]; let col_c = mat[2]; let col_d = mat[3]; for k in 0..3 { let v = vertices[k]; let x = col_a[0] * v[0] + col_b[0] * v[1] + col_c[0] * v[2] + col_d[0]; let y = col_a[1] * v[0] + col_b[1] * v[1] + col_c[1] * v[2] + col_d[1]; let z = col_a[2] * v[0] + col_b[2] * v[1] + col_c[2] * v[2] + col_d[2]; xformed[k] = [x, y, z]; } Self(xformed) } } #[cfg(test)] mod tests { use super::*; use rand::{distributions::Standard, prelude::*}; impl Distribution for Standard { fn sample(&self, rng: &mut R) -> Triangle { Triangle(self.sample(rng)) } } #[test] fn translate() { let tri = Triangle([[-0.5, -0.5, 0.0], [0.5, -0.5, 0.0], [0.0, 0.5, 0.0]]); let (x, y, z) = (-0.25, 0.5, 1.0); let matrix = [[1.0, 0.0, 0.0], [0.0, 1.0, 0.0], [0.0, 0.0, 1.0], [x, y, z]]; let tri = tri.transform(matrix); let expected = Triangle([[-0.75, 0.0, 1.0], [0.25, 0.0, 1.0], [-0.25, 1.0, 1.0]]); assert_eq!(tri, expected); } } ================================================ FILE: examples/triangle_xform/src/simd.rs ================================================ use super::Matrix; /// SIMD vector of floats pub type VecF = packed_simd::f32x8; /// SIMD batch of N triangles, where N is SIMD width. #[derive(Debug, Default, Copy, Clone)] pub struct Triangle { pub x: [VecF; 3], pub y: [VecF; 3], pub z: [VecF; 3], } impl Triangle { /// Combines N scalar triangles into a single SIMD triangle. pub fn pack(tris: &[crate::scalar::Triangle]) -> Self { assert_eq!(tris.len(), VecF::lanes()); let mut x = [VecF::splat(0.0); 3]; let mut y = [VecF::splat(0.0); 3]; let mut z = [VecF::splat(0.0); 3]; (0..3).for_each(|k| { let x = &mut x[k]; let y = &mut y[k]; let z = &mut z[k]; (0..VecF::lanes()).for_each(|i| { let t = tris[i]; let vertex = t.0[k]; let tx = vertex[0]; let ty = vertex[1]; let tz = vertex[2]; *x = x.replace(i, tx); *y = y.replace(i, ty); *z = z.replace(i, tz); }); }); Self { x, y, z } } /// Unpacks the N scalar triangles into an array-of-structures layout. 
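/// This is the inverse of `pack`: lane `i` of each component vector becomes the vertex data of the `i`-th scalar triangle.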
pub fn unpack(self) -> Vec { let mut tris = [crate::scalar::Triangle::default(); VecF::lanes()]; (0..3).for_each(|k| { (0..VecF::lanes()).for_each(|i| { let vtx = &mut tris[i].0; vtx[k][0] = self.x[k].extract(i); vtx[k][1] = self.y[k].extract(i); vtx[k][2] = self.z[k].extract(i); }); }); tris.to_vec() } /// Transforms this triangle by multiplying with a matrix. #[inline] pub fn transform(self, mat: Matrix) -> Self { let mut tri = Self::default(); let x = self.x; let y = self.y; let z = self.z; let col_a = mat[0]; let col_b = mat[1]; let col_c = mat[2]; let col_d = mat[3]; for k in 0..3 { let x = x[k]; let y = y[k]; let z = z[k]; tri.x[k] = col_a[0] * x + col_b[0] * y + col_c[0] * z + col_d[0]; tri.y[k] = col_a[1] * x + col_b[1] * y + col_c[1] * z + col_d[1]; tri.z[k] = col_a[2] * x + col_b[2] * y + col_c[2] * z + col_d[2]; } tri } } ================================================ FILE: micro_benchmarks/Cargo.toml ================================================ [package] name = "micro_benchmarks" version = "0.1.0" authors = ["gnzlbg "] autobenches = false edition = "2018" [dev-dependencies] packed_simd = { package = "packed_simd", path = ".." } paste = "0.1.3" criterion = "0.3" [profile.bench] opt-level = 3 debug = false lto = 'fat' debug-assertions = false codegen-units = 1 [[bench]] name = "mask_reductions" harness = false ================================================ FILE: micro_benchmarks/benches/mask_reductions.rs ================================================ //! Benchmarks for the mask reductions `all`, `any`, and `none`. #![deny(rust_2018_idioms)] #![feature(test)] use packed_simd::*; use test::black_box; use criterion::{Benchmark, Criterion, Throughput}; const NO_ITERATIONS: u32 = 1_000; macro_rules! bench { ($id:ident) => { paste::item! { fn [<$id _all>](c: &mut Criterion) { c.bench( stringify!($id), Benchmark::new("all", |b| b.iter(|| { let mut x: $id = Default::default(); for _ in 0..NO_ITERATIONS { if black_box(x).all() { black_box(&mut x); } } })).throughput(Throughput::Elements(NO_ITERATIONS)) ); } fn [<$id _any>](c: &mut Criterion) { c.bench( stringify!($id), Benchmark::new("any", |b| b.iter(|| { let mut x: $id = Default::default(); for _ in 0..NO_ITERATIONS { if black_box(x).any() { black_box(&mut x); } } })).throughput(Throughput::Elements(NO_ITERATIONS)) ); } fn [<$id _none>](c: &mut Criterion) { c.bench( stringify!($id), Benchmark::new("none", |b| b.iter(|| { let mut x: $id = Default::default(); for _ in 0..NO_ITERATIONS { if black_box(x).none() { black_box(&mut x); } } })).throughput(Throughput::Elements(NO_ITERATIONS)) ); } } }; ($($id:ident),*) => { $( bench!($id); )* paste::item! 
{ criterion_group!( benches, $([<$id _all>]),*, $([<$id _any>]),*, $([<$id _none>]),* ); } }; } bench!( m8x2, // 16-bit wide types m8x8, m16x4, m32x2, // 64-bit wide types m8x16, m16x8, m32x4, m64x2, m128x1, // 128-bit wide types m8x32, m16x16, m32x8, m64x4, m128x2, // 256-bit wide types m8x64, m16x32, m32x16, m64x8, m128x4 // 512-bit wide types ); criterion_main!(benches); ================================================ FILE: micro_benchmarks/rust-toolchain ================================================ nightly ================================================ FILE: perf-guide/.gitignore ================================================ /book ================================================ FILE: perf-guide/book.toml ================================================ [book] authors = ["Gonzalo Brito Gadeschi", "Gabriel Majeri"] multilingual = false src = "src" title = "Rust SIMD Performance Guide" description = "This book describes how to write performant SIMD code in Rust." [build] create-missing = false [output.html] additional-css = ["./src/ascii.css"] ================================================ FILE: perf-guide/src/SUMMARY.md ================================================ # Summary [Introduction](./introduction.md) - [Floating-point Math](./float-math/fp.md) - [Short-vector Math Library](./float-math/svml.md) - [Approximate functions](./float-math/approx.md) - [Fused multiply-accumulate](./float-math/fma.md) - [Target features](./target-feature/features.md) - [Using `RUSTFLAGS`](./target-feature/rustflags.md) - [Using the `target_feature` attribute](./target-feature/attribute.md) - [Interaction with inlining](./target-feature/inlining.md) - [Detecting features at runtime](./target-feature/runtime.md) - [Bounds checking](./bound_checks.md) - [Vertical and horizontal operations](./vert-hor-ops.md) - [Performance profiling](./prof/profiling.md) - [Profiling on Linux](./prof/linux.md) - [Using machine code analyzers](./prof/mca.md) ================================================ FILE: perf-guide/src/ascii.css ================================================ code { /* "Source Code Pro" breaks ASCII art */ font-family: Consolas, "Ubuntu Mono", Menlo, "DejaVu Sans Mono", monospace; } ================================================ FILE: perf-guide/src/bound_checks.md ================================================ # Bounds checking Reading and writing packed vectors to/from slices is checked by default. Independently of the configuration options used, the safe functions: * `Simd<[T; N]>::from_slice_aligned(& s[..])` * `Simd<[T; N]>::write_to_slice_aligned(&mut s[..])` always check that: * the slice is big enough to hold the vector * the slice is suitably aligned to perform an aligned load/store for a `Simd<[T; N]>` (this alignment is often much larger than that of `T`). There are `_unaligned` versions that use unaligned load and stores, as well as `unsafe` `_unchecked` that do not perform any checks iff `debug-assertions = false` / `debug = false`. That is, the `_unchecked` methods do still assert size and alignment in debug builds and could also do so in release builds depending on the configuration options. These assertions do often significantly impact performance and you should be aware of them. 
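For illustration, here is a minimal sketch of how these slice methods behave, assuming an `f32x4` vector and the `_unaligned` variants (arbitrary slices are rarely aligned enough for the `_aligned` ones):

```rust
use packed_simd::f32x4;

fn main() {
    let v = [0.0_f32, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0];

    // Checked load: panics if the slice has fewer than 4 elements.
    let a = f32x4::from_slice_unaligned(&v[..4]);

    // Unchecked load: the caller must guarantee the length; with
    // debug-assertions enabled this still asserts.
    let b = unsafe { f32x4::from_slice_unaligned_unchecked(&v[4..]) };

    // Checked store back into a slice.
    let mut out = [0.0_f32; 4];
    (a + b).write_to_slice_unaligned(&mut out);
    assert_eq!(out, [4.0, 6.0, 8.0, 10.0]);
}
```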
================================================ FILE: perf-guide/src/float-math/approx.md ================================================ # Approximate functions ================================================ FILE: perf-guide/src/float-math/fma.md ================================================ # Fused Multiply Add ================================================ FILE: perf-guide/src/float-math/fp.md ================================================ # Floating-point math This chapter contains information pertaining to working with floating-point numbers. ================================================ FILE: perf-guide/src/float-math/svml.md ================================================ # Short Vector Math Library ================================================ FILE: perf-guide/src/introduction.md ================================================ # Introduction ## What is SIMD ## History of SIMD in Rust ## Discover packed_simd Writing fast and portable SIMD algorithms using `packed_simd` is, unfortunately, not trivial. There are many pitfalls that one should be aware of, and some idioms that help avoid those pitfalls. This book attempts to document these best practices and provides practical examples on how to apply the tips to _your_ code. ================================================ FILE: perf-guide/src/prof/linux.md ================================================ # Performance profiling on Linux ## Using `perf` [perf](https://perf.wiki.kernel.org/) is the most powerful performance profiler for Linux, featuring support for various hardware Performance Monitoring Units, as well as integration with the kernel's performance events framework. We will only look at how the `perf` command can be used to profile SIMD code. Full system profiling is outside of the scope of this book. ### Recording The first step is to record a program's execution during an average workload. It helps if you can isolate the parts of your program which have performance issues, and set up a benchmark which can be easily (re)run. Build the benchmark binary in release mode, after having enabled debug info: ```sh $ cargo build --release Finished release [optimized + debuginfo] target(s) in 0.02s ``` Then use the `perf record` subcommand: ```sh $ perf record --call-graph=dwarf ./target/release/my-program [ perf record: Woken up 10 times to write data ] [ perf record: Captured and wrote 2,356 MB perf.data (292 samples) ] ``` Instead of using `--call-graph=dwarf`, which can become pretty slow, you can use `--call-graph=lbr` if you have a processor with support for Last Branch Record (i.e. Intel Haswell and newer). `perf` will, by default, record the count of CPU cycles it takes to execute various parts of your program. You can use the `-e` command line option to enable other performance events, such as `cache-misses`. Use `perf list` to get a list of all hardware counters supported by your CPU. ### Viewing the report The next step is getting a bird's eye view of the program's execution. `perf` provides a `ncurses`-based interface which will get you started. Use `perf report` to open a visualization of your program's performance: ```sh perf report --hierarchy -M intel ``` `--hierarchy` will display a tree-like structure of where your program spent most of its time. `-M intel` enables disassembly output with Intel syntax, which is subjectively more readable than the default AT&T syntax. Here is the output from profiling the `nbody` benchmark: ``` - 100,00% nbody - 94,18% nbody + 93,48% [.]
nbody_lib::simd::advance + 0,70% [.] nbody_lib::run + 5,06% libc-2.28.so ``` If you move with the arrow keys to any node in the tree, you can then press `a` to have `perf` _annotate_ that node. This means it will: - disassemble the function - associate every instruction with the percentage of time which was spent executing it - interleave the disassembly with the source code, assuming it found the debug symbols (you can use `s` to toggle this behaviour) `perf` will, by default, open the instruction which it identified as being the hottest spot in the function: ``` 0,76 │ movapd xmm2,xmm0 0,38 │ movhlps xmm2,xmm0 │ addpd xmm2,xmm0 │ unpcklpd xmm1,xmm2 12,50 │ sqrtpd xmm0,xmm1 1,52 │ mulpd xmm0,xmm1 ``` In this case, `sqrtpd` will be highlighted in red, since that's the instruction which the CPU spends most of its time executing. ## Using Valgrind Valgrind is a set of tools which initially helped C/C++ programmers find unsafe memory accesses in their code. Nowadays the project also has - a heap profiler called `massif` - a cache utilization profiler called `cachegrind` - a call-graph performance profiler called `callgrind` ================================================ FILE: perf-guide/src/prof/mca.md ================================================ # Machine code analysis tools ## The microarchitecture of modern CPUs While you might have heard of Instruction Set Architectures, such as `x86` or `arm` or `mips`, the term _microarchitecture_ (also written here as _µ-arch_) refers to the internal details of an actual family of CPUs, such as Intel's _Haswell_ or AMD's _Jaguar_. Replacing scalar code with SIMD code will improve performance on all CPUs supporting the required vector extensions. However, due to microarchitectural differences, the actual speed-up at runtime might vary. **Example**: a simple example arises when optimizing for AMD K8 CPUs. The assembly generated for an empty function should look like this: ```asm nop ret ``` The `nop` is used to align the `ret` instruction for better performance. However, the compiler will actually generate the following code: ```asm repz ret ``` The `repz` instruction will repeat the following instruction until a certain condition is met. Of course, in this situation, the function will simply immediately return, and the `ret` instruction is still aligned. However, AMD K8's branch predictor performs better with the latter code. For those looking to absolutely maximize performance for a certain target µ-arch, you will have to read some CPU manuals, or ask the compiler to do it for you with `-C target-cpu`. ### Summary of CPU internals Modern processors are able to execute instructions out-of-order for better performance, by utilizing tricks such as [branch prediction], [instruction pipelining], or [superscalar execution]. [branch prediction]: https://en.wikipedia.org/wiki/Branch_predictor [instruction pipelining]: https://en.wikipedia.org/wiki/Instruction_pipelining [superscalar execution]: https://en.wikipedia.org/wiki/Superscalar_processor SIMD instructions are also subject to these optimizations, meaning it can get pretty difficult to determine where the slowdown happens.
For example, if the profiler reports a store operation is slow, one of two things could be happening: - the store is limited by the CPU's memory bandwidth, which is actually an ideal scenario, all things considered; - memory bandwidth is nowhere near its peak, but the value to be stored is at the end of a long chain of operations, and this store is where the profiler encountered the pipeline stall; Since most profilers are simple tools which don't understand the subtleties of instruction scheduling, you will need more specialized tools to tell these situations apart. ## Analyzing the machine code Certain tools have knowledge of internal CPU microarchitecture, i.e. they know - how many physical [register files] a CPU actually has - what the latency / throughput of an instruction is - what [µ-ops] are generated for a set of instructions and many other architectural details. [register files]: https://en.wikipedia.org/wiki/Register_file [µ-ops]: https://en.wikipedia.org/wiki/Micro-operation These tools are therefore able to provide accurate information as to why some instructions are inefficient, and where the bottleneck is. The disadvantage is that the output of these tools requires advanced knowledge of the target architecture to understand, i.e. they **cannot** point out what the cause of the issue is explicitly. ## Intel's Architecture Code Analyzer (IACA) [IACA] is a free tool offered by Intel for analyzing the performance of various computational kernels. Being a proprietary, closed source tool, it _only_ supports Intel's µ-arches. [IACA]: https://software.intel.com/en-us/articles/intel-architecture-code-analyzer ## llvm-mca ================================================ FILE: perf-guide/src/prof/profiling.md ================================================ # Performance profiling While the rest of the book provides practical advice on how to improve the performance of SIMD code, this chapter is dedicated to [**performance profiling**][profiling]. Profiling consists of recording a program's execution in order to identify program hotspots. **Important**: most profilers require debug information in order to accurately link the program hotspots back to the corresponding source code lines. Rust will disable debug info generation by default for optimized builds, but you can change that [in your `Cargo.toml`][cargo-ref]. [profiling]: https://en.wikipedia.org/wiki/Profiling_(computer_programming) [cargo-ref]: https://doc.rust-lang.org/cargo/reference/manifest.html#the-profile-sections ================================================ FILE: perf-guide/src/target-feature/attribute.md ================================================ # The `target_feature` attribute ================================================ FILE: perf-guide/src/target-feature/features.md ================================================ # Enabling target features Not all processors of a certain architecture will have SIMD processing units, and using a SIMD instruction which is not supported will trigger undefined behavior. To allow building safe, portable programs, the Rust compiler will **not**, by default, generate any sort of vector instructions, unless it can statically determine they are supported. For example, on AMD64, SSE2 support is architecturally guaranteed. The `x86_64-apple-darwin` target enables up to SSSE3. To get a definitive list of which features are enabled by default on various platforms, refer to the target specifications [in the compiler's source code][targets].
[targets]: https://github.com/rust-lang/rust/tree/master/src/librustc_target/spec ================================================ FILE: perf-guide/src/target-feature/inlining.md ================================================ # Inlining ================================================ FILE: perf-guide/src/target-feature/practice.md ================================================ # Target features in practice Using `RUSTFLAGS` will allow the crate being compiled, as well as all its transitive dependencies, to use certain target features. A technique used to avoid undefined behavior at runtime is to compile and ship multiple binaries, each compiled with a certain set of features. This might not be feasible in some cases, and can quickly get out of hand as more and more vector extensions are added to an architecture. Rust can be more flexible: you can build a single binary/library which automatically picks the best supported vector instructions depending on the host machine. The trick consists of monomorphizing parts of the code during building, and then using run-time feature detection to select the right code path when running. **NOTE** (x86 specific): because the AVX (256-bit) registers extend the existing SSE (128-bit) registers, mixing SSE and AVX instructions in a program can cause performance issues. The solution is to compile all code, even the code written with 128-bit vectors, with the AVX target feature enabled. This will cause the compiler to prefix the generated instructions with the [VEX] prefix. [VEX]: https://en.wikipedia.org/wiki/VEX_prefix ================================================ FILE: perf-guide/src/target-feature/runtime.md ================================================ # Detecting host features at runtime ================================================ FILE: perf-guide/src/target-feature/rustflags.md ================================================ # Using RUSTFLAGS One of the easiest ways to benefit from SIMD is to allow the compiler to generate code using certain vector instruction extensions. The environment variable `RUSTFLAGS` can be used to pass options for code generation to the Rust compiler. These flags will affect **all** compiled crates. There are two flags which can be used to enable specific vector extensions: ## target-feature - Syntax: `-C target-feature=<features>` - Provides the compiler with a comma-separated set of instruction extensions to enable. **Example**: Use `-C target-feature=+sse3,+avx` to enable generating instructions for [Streaming SIMD Extensions 3](https://en.wikipedia.org/wiki/SSE3) and [Advanced Vector Extensions](https://en.wikipedia.org/wiki/Advanced_Vector_Extensions). - To list target triples for all targets supported by Rust, use: ```sh rustc --print target-list ``` - To list all supported target features for a certain target triple, use: ```sh rustc --target=${TRIPLE} --print target-features ``` - Note that all CPU features are independent, and will have to be enabled individually. **Example**: Setting `-C target-feature=+avx2` will _not_ enable `fma`, even though all CPUs which support AVX2 also support FMA. To enable both, one has to use `-C target-feature=+avx2,+fma`. - Some features also depend on other features, which need to be enabled for the target instructions to be generated. **Example**: Unless `v7` is specified as the target CPU (see below), to enable NEON on ARM it is necessary to use `-C target-feature=+v7,+neon`.
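Whether a feature actually ended up enabled for a given build can be checked from Rust code with the `cfg` mechanism. The following is a minimal sketch (the function name `simd_support` is purely illustrative; feature names are spelled the same way as in `-C target-feature`):

```rust
// Selected at compile time, e.g. when building with
// RUSTFLAGS='-C target-feature=+avx2,+fma'.
#[cfg(target_feature = "avx2")]
fn simd_support() -> &'static str {
    "AVX2 enabled at compile time"
}

#[cfg(not(target_feature = "avx2"))]
fn simd_support() -> &'static str {
    "AVX2 not enabled at compile time"
}

fn main() {
    println!("{}", simd_support());
    // `cfg!` exposes the same information as a boolean expression.
    println!("FMA enabled: {}", cfg!(target_feature = "fma"));
}
```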
## target-cpu - Syntax: `-C target-cpu=` - Sets the identifier of a CPU family / model for which to build and optimize the code. **Example**: `RUSTFLAGS='-C target-cpu=cortex-a75'` - To list all supported target CPUs for a certain target triple, use: ```sh rustc --target=${TRIPLE} --print target-cpus ``` **Example**: ```sh rustc --target=i686-pc-windows-msvc --print target-cpus ``` - The compiler will translate this into a list of target features. Therefore, individual feature checks (`#[cfg(target_feature = "...")]`) will still work properly. - It will cause the code generator to optimize the generated code for that specific CPU model. - Using `native` as the CPU model will cause Rust to generate and optimize code for the CPU running the compiler. It is useful when building programs which you plan to only use locally. This should never be used when the generated programs are meant to be run on other computers, such as when packaging for distribution or cross-compiling. ================================================ FILE: perf-guide/src/vert-hor-ops.md ================================================ # Vertical and horizontal operations In SIMD terminology, each vector has a certain "width" (number of lanes). A vector processor is able to perform two kinds of operations on a vector: - Vertical operations: operate on two vectors of the same width, result has same width **Example**: vertical addition of two `f32x4` vectors %0 == | 2 | -3.5 | 0 | 7 | + + + + %1 == | 4 | 1.5 | -1 | 0 | = = = = %0 + %1 == | 6 | -2 | -1 | 7 | - Horizontal operations: reduce the elements of two vectors in some way, the result's elements combine information from the two original ones **Example**: horizontal addition of two `u64x2` vectors %0 == | 1 | 3 | └─+───┘ └───────┐ │ %1 == | 4 | -1 | │ └─+──┘ │ └───┐ │ │ │ ┌─────│───┘ ▼ ▼ %0 + %1 == | 4 | 3 | ## Performance consideration of horizontal operations The result of vertical operations, like vector negation: `-a`, for a given lane, does not depend on the result of the operation for the other lanes. The result of horizontal operations, like the vector `sum` reduction: `a.sum()`, depends on the value of all vector lanes. In virtually all architectures vertical operations are fast, while horizontal operations are, by comparison, very slow. Consider the following two functions for computing the sum of all `f32` values in a slice: ```rust fn fast_sum(x: &[f32]) -> f32 { assert!(x.len() % 4 == 0); let mut sum = f32x4::splat(0.); // [0., 0., 0., 0.] for i in (0..x.len()).step_by(4) { sum += f32x4::from_slice_unaligned(&x[i..]); } sum.sum() } fn slow_sum(x: &[f32]) -> f32 { assert!(x.len() % 4 == 0); let mut sum: f32 = 0.; for i in (0..x.len()).step_by(4) { sum += f32x4::from_slice_unaligned(&x[i..]).sum(); } sum } ``` The inner loop over the slice is where the bulk of the work actually happens. There, the `fast_sum` function perform vertical operations into a vector, doing a single horizontal reduction at the end, while the `slow_sum` function performs horizontal vector operations inside of the loop. On all widely-used architectures, `fast_sum` is a large constant factor faster than `slow_sum`. You can run the [slice_sum]() example and see for yourself. On the particular machine tested there the algorithm using the horizontal vector addition is 2.7x slower than the one using vertical vector operations! 
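As a small concrete sketch of the distinction, here are the two kinds of operations expressed with `packed_simd` types, reusing the lane values from the vertical-addition diagram above:

```rust
use packed_simd::f32x4;

fn main() {
    let a = f32x4::new(2.0, -3.5, 0.0, 7.0);
    let b = f32x4::new(4.0, 1.5, -1.0, 0.0);

    // Vertical operation: lane-wise addition, each lane independent.
    assert_eq!(a + b, f32x4::new(6.0, -2.0, -1.0, 7.0));

    // Horizontal operation: reduces all lanes of one vector to a scalar.
    assert_eq!(a.sum(), 5.5);
}
```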
================================================ FILE: rust-toolchain ================================================ nightly ================================================ FILE: rustfmt.toml ================================================ max_width = 110 use_small_heuristics = "Max" wrap_comments = true edition = "2018" error_on_line_overflow = true ================================================ FILE: src/api/bit_manip.rs ================================================ //! Bit manipulations. macro_rules! impl_bit_manip { ([$elem_ty:ident; $elem_count:expr]: $id:ident | $test_tt:tt) => { impl $id { /// Returns the number of ones in the binary representation of /// the lanes of `self`. #[inline] pub fn count_ones(self) -> Self { super::codegen::bit_manip::BitManip::ctpop(self) } /// Returns the number of zeros in the binary representation of /// the lanes of `self`. #[inline] pub fn count_zeros(self) -> Self { super::codegen::bit_manip::BitManip::ctpop(!self) } /// Returns the number of leading zeros in the binary /// representation of the lanes of `self`. #[inline] pub fn leading_zeros(self) -> Self { super::codegen::bit_manip::BitManip::ctlz(self) } /// Returns the number of trailing zeros in the binary /// representation of the lanes of `self`. #[inline] pub fn trailing_zeros(self) -> Self { super::codegen::bit_manip::BitManip::cttz(self) } } test_if! { $test_tt: paste::item! { #[allow(overflowing_literals)] pub mod [<$id _bit_manip>] { #![allow(const_item_mutation)] use super::*; const LANE_WIDTH: usize = mem::size_of::<$elem_ty>() * 8; macro_rules! test_func { ($x:expr, $func:ident) => {{ let mut actual = $x; for i in 0..$id::lanes() { actual = actual.replace( i, $x.extract(i).$func() as $elem_ty ); } let expected = $x.$func(); assert_eq!(actual, expected); }}; } const BYTES: [u8; 64] = [ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, ]; fn load_bytes() -> $id { let elems: &mut [$elem_ty] = unsafe { slice::from_raw_parts_mut( BYTES.as_mut_ptr() as *mut $elem_ty, $id::lanes(), ) }; $id::from_slice_unaligned(elems) } #[cfg_attr(not(target_arch = "wasm32"), test)] #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test)] fn count_ones() { test_func!($id::splat(0), count_ones); test_func!($id::splat(!0), count_ones); test_func!(load_bytes(), count_ones); } #[cfg_attr(not(target_arch = "wasm32"), test)] #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test)] fn count_zeros() { test_func!($id::splat(0), count_zeros); test_func!($id::splat(!0), count_zeros); test_func!(load_bytes(), count_zeros); } #[cfg_attr(not(target_arch = "wasm32"), test)] #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test)] fn leading_zeros() { test_func!($id::splat(0), leading_zeros); test_func!($id::splat(1), leading_zeros); // some implementations use `pshufb` which has unique // behavior when the 8th bit is set. 
test_func!($id::splat(0b1000_0010), leading_zeros); test_func!($id::splat(!0), leading_zeros); test_func!( $id::splat(1 << (LANE_WIDTH - 1)), leading_zeros ); test_func!(load_bytes(), leading_zeros); } #[cfg_attr(not(target_arch = "wasm32"), test)] #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test)] fn trailing_zeros() { test_func!($id::splat(0), trailing_zeros); test_func!($id::splat(1), trailing_zeros); test_func!($id::splat(0b1000_0010), trailing_zeros); test_func!($id::splat(!0), trailing_zeros); test_func!( $id::splat(1 << (LANE_WIDTH - 1)), trailing_zeros ); test_func!(load_bytes(), trailing_zeros); } } } } }; } ================================================ FILE: src/api/bitmask.rs ================================================ //! Bitmask API macro_rules! impl_bitmask { ($id:ident | $ibitmask_ty:ident | ($set:expr, $clear:expr) | $test_tt:tt) => { impl $id { /// Creates a bitmask with the MSB of each vector lane. /// /// If the vector has less than 8 lanes, the bits that do not /// correspond to any vector lanes are cleared. #[inline] pub fn bitmask(self) -> $ibitmask_ty { unsafe { codegen::llvm::simd_bitmask(self.0) } } } test_if! { $test_tt: paste::item! { #[cfg(not( // FIXME: https://github.com/rust-lang-nursery/packed_simd/issues/210 target_endian = "big" ))] pub mod [<$id _bitmask>] { use super::*; #[cfg_attr(not(target_arch = "wasm32"), test)] #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test)] fn bitmask() { // clear all lanes let vec = $id::splat($clear as _); let bitmask: $ibitmask_ty = 0; assert_eq!(vec.bitmask(), bitmask); // set even lanes let mut vec = $id::splat($clear as _); for i in 0..$id::lanes() { if i % 2 == 0 { vec = vec.replace(i, $set as _); } } // create bitmask with even lanes set: let mut bitmask: $ibitmask_ty = 0; for i in 0..$id::lanes() { if i % 2 == 0 { bitmask |= 1 << i; } } assert_eq!(vec.bitmask(), bitmask); // set odd lanes let mut vec = $id::splat($clear as _); for i in 0..$id::lanes() { if i % 2 != 0 { vec = vec.replace(i, $set as _); } } // create bitmask with odd lanes set: let mut bitmask: $ibitmask_ty = 0; for i in 0..$id::lanes() { if i % 2 != 0 { bitmask |= 1 << i; } } assert_eq!(vec.bitmask(), bitmask); // set all lanes let vec = $id::splat($set as _); let mut bitmask: $ibitmask_ty = 0; for i in 0..$id::lanes() { bitmask |= 1 << i; } assert_eq!(vec.bitmask(), bitmask); } } } } }; } ================================================ FILE: src/api/cast/macros.rs ================================================ //! Macros implementing `FromCast` macro_rules! impl_from_cast_ { ($id:ident[$test_tt:tt]: $from_ty:ident) => { impl crate::api::cast::FromCast<$from_ty> for $id { #[inline] fn from_cast(x: $from_ty) -> Self { use crate::llvm::simd_cast; debug_assert_eq!($from_ty::lanes(), $id::lanes()); Simd(unsafe { simd_cast(x.0) }) } } test_if!{ $test_tt: paste::item! { pub mod [<$id _from_cast_ $from_ty>] { use super::*; #[cfg_attr(not(target_arch = "wasm32"), test)] #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test)] fn test() { assert_eq!($id::lanes(), $from_ty::lanes()); } } } } }; } macro_rules! impl_from_cast { ($id:ident[$test_tt:tt]: $($from_ty:ident),*) => { $( impl_from_cast_!($id[$test_tt]: $from_ty); )* } } macro_rules! 
impl_from_cast_mask_ { ($id:ident[$test_tt:tt]: $from_ty:ident) => { impl crate::api::cast::FromCast<$from_ty> for $id { #[inline] fn from_cast(x: $from_ty) -> Self { debug_assert_eq!($from_ty::lanes(), $id::lanes()); x.ne($from_ty::default()) .select($id::splat(true), $id::splat(false)) } } test_if!{ $test_tt: paste::item! { pub mod [<$id _from_cast_ $from_ty>] { use super::*; #[cfg_attr(not(target_arch = "wasm32"), test)] #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test)] fn test() { assert_eq!($id::lanes(), $from_ty::lanes()); let x = $from_ty::default(); let m: $id = x.cast(); assert!(m.none()); } } } } }; } macro_rules! impl_from_cast_mask { ($id:ident[$test_tt:tt]: $($from_ty:ident),*) => { $( impl_from_cast_mask_!($id[$test_tt]: $from_ty); )* } } #[allow(unused)] macro_rules! impl_into_cast { ($id:ident[$test_tt:tt]: $($from_ty:ident),*) => { $( impl_from_cast_!($from_ty[$test_tt]: $id); )* } } ================================================ FILE: src/api/cast/v128.rs ================================================ //! `FromCast` and `IntoCast` implementations for portable 128-bit wide vectors #[rustfmt::skip] use crate::*; impl_from_cast!(i8x16[test_v128]: u8x16, m8x16, i16x16, u16x16, m16x16, i32x16, u32x16, f32x16, m32x16); impl_from_cast!(u8x16[test_v128]: i8x16, m8x16, i16x16, u16x16, m16x16, i32x16, u32x16, f32x16, m32x16); impl_from_cast_mask!(m8x16[test_v128]: i8x16, u8x16, i16x16, u16x16, m16x16, i32x16, u32x16, f32x16, m32x16); impl_from_cast!( i16x8[test_v128]: i8x8, u8x8, m8x8, u16x8, m16x8, i32x8, u32x8, f32x8, m32x8, i64x8, u64x8, f64x8, m64x8, isizex8, usizex8, msizex8 ); impl_from_cast!( u16x8[test_v128]: i8x8, u8x8, m8x8, i16x8, m16x8, i32x8, u32x8, f32x8, m32x8, i64x8, u64x8, f64x8, m64x8, isizex8, usizex8, msizex8 ); impl_from_cast_mask!( m16x8[test_v128]: i8x8, u8x8, m8x8, i16x8, u16x8, i32x8, u32x8, f32x8, m32x8, i64x8, u64x8, f64x8, m64x8, isizex8, usizex8, msizex8 ); impl_from_cast!( i32x4[test_v128]: i8x4, u8x4, m8x4, i16x4, u16x4, m16x4, u32x4, f32x4, m32x4, i64x4, u64x4, f64x4, m64x4, i128x4, u128x4, m128x4, isizex4, usizex4, msizex4 ); impl_from_cast!( u32x4[test_v128]: i8x4, u8x4, m8x4, i16x4, u16x4, m16x4, i32x4, f32x4, m32x4, i64x4, u64x4, f64x4, m64x4, i128x4, u128x4, m128x4, isizex4, usizex4, msizex4 ); impl_from_cast!( f32x4[test_v128]: i8x4, u8x4, m8x4, i16x4, u16x4, m16x4, i32x4, u32x4, m32x4, i64x4, u64x4, f64x4, m64x4, i128x4, u128x4, m128x4, isizex4, usizex4, msizex4 ); impl_from_cast_mask!( m32x4[test_v128]: i8x4, u8x4, m8x4, i16x4, u16x4, m16x4, i32x4, u32x4, f32x4, i64x4, u64x4, f64x4, m64x4, i128x4, u128x4, m128x4, isizex4, usizex4, msizex4 ); impl_from_cast!( i64x2[test_v128]: i8x2, u8x2, m8x2, i16x2, u16x2, m16x2, i32x2, u32x2, f32x2, m32x2, u64x2, f64x2, m64x2, i128x2, u128x2, m128x2, isizex2, usizex2, msizex2 ); impl_from_cast!( u64x2[test_v128]: i8x2, u8x2, m8x2, i16x2, u16x2, m16x2, i32x2, u32x2, f32x2, m32x2, i64x2, f64x2, m64x2, i128x2, u128x2, m128x2, isizex2, usizex2, msizex2 ); impl_from_cast!( f64x2[test_v128]: i8x2, u8x2, m8x2, i16x2, u16x2, m16x2, i32x2, u32x2, f32x2, m32x2, i64x2, u64x2, m64x2, i128x2, u128x2, m128x2, isizex2, usizex2, msizex2 ); impl_from_cast_mask!( m64x2[test_v128]: i8x2, u8x2, m8x2, i16x2, u16x2, m16x2, i32x2, u32x2, f32x2, m32x2, i64x2, u64x2, f64x2, i128x2, u128x2, m128x2, isizex2, usizex2, msizex2 ); impl_from_cast!( isizex2[test_v128]: i8x2, u8x2, m8x2, i16x2, u16x2, m16x2, i32x2, u32x2, f32x2, m32x2, i64x2, u64x2, f64x2, m64x2, i128x2, u128x2, m128x2, usizex2, msizex2 ); impl_from_cast!( 
usizex2[test_v128]: i8x2, u8x2, m8x2, i16x2, u16x2, m16x2, i32x2, u32x2, f32x2, m32x2, i64x2, u64x2, f64x2, m64x2, i128x2, u128x2, m128x2, isizex2, msizex2 ); impl_from_cast_mask!( msizex2[test_v128]: i8x2, u8x2, m8x2, i16x2, u16x2, m16x2, i32x2, u32x2, f32x2, m32x2, i64x2, u64x2, f64x2, m64x2, i128x2, u128x2, m128x2, isizex2, usizex2 ); // FIXME[test_v128]: 64-bit single element vectors into_cast impls impl_from_cast!(i128x1[test_v128]: u128x1, m128x1); impl_from_cast!(u128x1[test_v128]: i128x1, m128x1); impl_from_cast!(m128x1[test_v128]: i128x1, u128x1); ================================================ FILE: src/api/cast/v16.rs ================================================ //! `FromCast` and `IntoCast` implementations for portable 16-bit wide vectors #[rustfmt::skip] use crate::*; impl_from_cast!( i8x2[test_v16]: u8x2, m8x2, i16x2, u16x2, m16x2, i32x2, u32x2, f32x2, m32x2, i64x2, u64x2, f64x2, m64x2, i128x2, u128x2, m128x2, isizex2, usizex2, msizex2 ); impl_from_cast!( u8x2[test_v16]: i8x2, m8x2, i16x2, u16x2, m16x2, i32x2, u32x2, f32x2, m32x2, i64x2, u64x2, f64x2, m64x2, i128x2, u128x2, m128x2, isizex2, usizex2, msizex2 ); impl_from_cast_mask!( m8x2[test_v16]: i8x2, u8x2, i16x2, u16x2, m16x2, i32x2, u32x2, f32x2, m32x2, i64x2, u64x2, f64x2, m64x2, i128x2, u128x2, m128x2, isizex2, usizex2, msizex2 ); ================================================ FILE: src/api/cast/v256.rs ================================================ //! `FromCast` and `IntoCast` implementations for portable 256-bit wide vectors #[rustfmt::skip] use crate::*; impl_from_cast!(i8x32[test_v256]: u8x32, m8x32, i16x32, u16x32, m16x32); impl_from_cast!(u8x32[test_v256]: i8x32, m8x32, i16x32, u16x32, m16x32); impl_from_cast_mask!(m8x32[test_v256]: i8x32, u8x32, i16x32, u16x32, m16x32); impl_from_cast!(i16x16[test_v256]: i8x16, u8x16, m8x16, u16x16, m16x16, i32x16, u32x16, f32x16, m32x16); impl_from_cast!(u16x16[test_v256]: i8x16, u8x16, m8x16, i16x16, m16x16, i32x16, u32x16, f32x16, m32x16); impl_from_cast_mask!(m16x16[test_v256]: i8x16, u8x16, m8x16, i16x16, u16x16, i32x16, u32x16, f32x16, m32x16); impl_from_cast!( i32x8[test_v256]: i8x8, u8x8, m8x8, i16x8, u16x8, m16x8, u32x8, f32x8, m32x8, i64x8, u64x8, f64x8, m64x8, isizex8, usizex8, msizex8 ); impl_from_cast!( u32x8[test_v256]: i8x8, u8x8, m8x8, i16x8, u16x8, m16x8, i32x8, f32x8, m32x8, i64x8, u64x8, f64x8, m64x8, isizex8, usizex8, msizex8 ); impl_from_cast!( f32x8[test_v256]: i8x8, u8x8, m8x8, i16x8, u16x8, m16x8, i32x8, u32x8, m32x8, i64x8, u64x8, f64x8, m64x8, isizex8, usizex8, msizex8 ); impl_from_cast_mask!( m32x8[test_v256]: i8x8, u8x8, m8x8, i16x8, u16x8, m16x8, i32x8, u32x8, f32x8, i64x8, u64x8, f64x8, m64x8, isizex8, usizex8, msizex8 ); impl_from_cast!( i64x4[test_v256]: i8x4, u8x4, m8x4, i16x4, u16x4, m16x4, i32x4, u32x4, f32x4, m32x4, u64x4, f64x4, m64x4, i128x4, u128x4, m128x4, isizex4, usizex4, msizex4 ); impl_from_cast!( u64x4[test_v256]: i8x4, u8x4, m8x4, i16x4, u16x4, m16x4, i32x4, u32x4, f32x4, m32x4, i64x4, f64x4, m64x4, i128x4, u128x4, m128x4, isizex4, usizex4, msizex4 ); impl_from_cast!( f64x4[test_v256]: i8x4, u8x4, m8x4, i16x4, u16x4, m16x4, i32x4, u32x4, f32x4, m32x4, i64x4, u64x4, m64x4, i128x4, u128x4, m128x4, isizex4, usizex4, msizex4 ); impl_from_cast_mask!( m64x4[test_v256]: i8x4, u8x4, m8x4, i16x4, u16x4, m16x4, i32x4, u32x4, f32x4, m32x4, i64x4, u64x4, f64x4, i128x4, u128x4, m128x4, isizex4, usizex4, msizex4 ); impl_from_cast!( i128x2[test_v256]: i8x2, u8x2, m8x2, i16x2, u16x2, m16x2, i32x2, u32x2, f32x2, m32x2, i64x2, u64x2, f64x2, 
m64x2, u128x2, m128x2, isizex2, usizex2, msizex2 ); impl_from_cast!( u128x2[test_v256]: i8x2, u8x2, m8x2, i16x2, u16x2, m16x2, i32x2, u32x2, f32x2, m32x2, i64x2, u64x2, f64x2, m64x2, i128x2, m128x2, isizex2, usizex2, msizex2 ); impl_from_cast_mask!( m128x2[test_v256]: i8x2, u8x2, m8x2, i16x2, u16x2, m16x2, i32x2, u32x2, f32x2, m32x2, i64x2, u64x2, m64x2, f64x2, i128x2, u128x2, isizex2, usizex2, msizex2 ); impl_from_cast!( isizex4[test_v256]: i8x4, u8x4, m8x4, i16x4, u16x4, m16x4, i32x4, u32x4, f32x4, m32x4, i64x4, u64x4, f64x4, m64x4, i128x4, u128x4, m128x4, usizex4, msizex4 ); impl_from_cast!( usizex4[test_v256]: i8x4, u8x4, m8x4, i16x4, u16x4, m16x4, i32x4, u32x4, f32x4, m32x4, i64x4, u64x4, f64x4, m64x4, i128x4, u128x4, m128x4, isizex4, msizex4 ); impl_from_cast_mask!( msizex4[test_v256]: i8x4, u8x4, m8x4, i16x4, u16x4, m16x4, i32x4, u32x4, f32x4, m32x4, i64x4, u64x4, f64x4, m64x4, i128x4, u128x4, m128x4, isizex4, usizex4 ); ================================================ FILE: src/api/cast/v32.rs ================================================ //! `FromCast` and `IntoCast` implementations for portable 32-bit wide vectors #[rustfmt::skip] use crate::*; impl_from_cast!( i8x4[test_v32]: u8x4, m8x4, i16x4, u16x4, m16x4, i32x4, u32x4, f32x4, m32x4, i64x4, u64x4, f64x4, m64x4, i128x4, u128x4, m128x4, isizex4, usizex4, msizex4 ); impl_from_cast!( u8x4[test_v32]: i8x4, m8x4, i16x4, u16x4, m16x4, i32x4, u32x4, f32x4, m32x4, i64x4, u64x4, f64x4, m64x4, i128x4, u128x4, m128x4, isizex4, usizex4, msizex4 ); impl_from_cast_mask!( m8x4[test_v32]: i8x4, u8x4, i16x4, u16x4, m16x4, i32x4, u32x4, f32x4, m32x4, i64x4, u64x4, f64x4, m64x4, i128x4, u128x4, m128x4, isizex4, usizex4, msizex4 ); impl_from_cast!( i16x2[test_v32]: i8x2, u8x2, m8x2, u16x2, m16x2, i32x2, u32x2, f32x2, m32x2, i64x2, u64x2, f64x2, m64x2, i128x2, u128x2, m128x2, isizex2, usizex2, msizex2 ); impl_from_cast!( u16x2[test_v32]: i8x2, u8x2, m8x2, i16x2, m16x2, i32x2, u32x2, f32x2, m32x2, i64x2, u64x2, f64x2, m64x2, i128x2, u128x2, m128x2, isizex2, usizex2, msizex2 ); impl_from_cast_mask!( m16x2[test_v32]: i8x2, u8x2, m8x2, i16x2, u16x2, i32x2, u32x2, f32x2, m32x2, i64x2, u64x2, f64x2, m64x2, i128x2, u128x2, m128x2, isizex2, usizex2, msizex2 ); ================================================ FILE: src/api/cast/v512.rs ================================================ //! 
`FromCast` and `IntoCast` implementations for portable 512-bit wide vectors #[rustfmt::skip] use crate::*; impl_from_cast!(i8x64[test_v512]: u8x64, m8x64); impl_from_cast!(u8x64[test_v512]: i8x64, m8x64); impl_from_cast_mask!(m8x64[test_v512]: i8x64, u8x64); impl_from_cast!(i16x32[test_v512]: i8x32, u8x32, m8x32, u16x32, m16x32); impl_from_cast!(u16x32[test_v512]: i8x32, u8x32, m8x32, i16x32, m16x32); impl_from_cast_mask!(m16x32[test_v512]: i8x32, u8x32, m8x32, i16x32, u16x32); impl_from_cast!(i32x16[test_v512]: i8x16, u8x16, m8x16, i16x16, u16x16, m16x16, u32x16, f32x16, m32x16); impl_from_cast!(u32x16[test_v512]: i8x16, u8x16, m8x16, i16x16, u16x16, m16x16, i32x16, f32x16, m32x16); impl_from_cast!(f32x16[test_v512]: i8x16, u8x16, m8x16, i16x16, u16x16, m16x16, i32x16, u32x16, m32x16); impl_from_cast_mask!(m32x16[test_v512]: i8x16, u8x16, m8x16, i16x16, u16x16, m16x16, i32x16, u32x16, f32x16); impl_from_cast!( i64x8[test_v512]: i8x8, u8x8, m8x8, i16x8, u16x8, m16x8, i32x8, u32x8, f32x8, m32x8, u64x8, f64x8, m64x8, isizex8, usizex8, msizex8 ); impl_from_cast!( u64x8[test_v512]: i8x8, u8x8, m8x8, i16x8, u16x8, m16x8, i32x8, u32x8, f32x8, m32x8, i64x8, f64x8, m64x8, isizex8, usizex8, msizex8 ); impl_from_cast!( f64x8[test_v512]: i8x8, u8x8, m8x8, i16x8, u16x8, m16x8, i32x8, u32x8, f32x8, m32x8, i64x8, u64x8, m64x8, isizex8, usizex8, msizex8 ); impl_from_cast_mask!( m64x8[test_v512]: i8x8, u8x8, m8x8, i16x8, u16x8, m16x8, i32x8, u32x8, f32x8, m32x8, i64x8, u64x8, f64x8, isizex8, usizex8, msizex8 ); impl_from_cast!( i128x4[test_v512]: i8x4, u8x4, m8x4, i16x4, u16x4, m16x4, i32x4, u32x4, f32x4, m32x4, i64x4, u64x4, f64x4, m64x4, u128x4, m128x4, isizex4, usizex4, msizex4 ); impl_from_cast!( u128x4[test_v512]: i8x4, u8x4, m8x4, i16x4, u16x4, m16x4, i32x4, u32x4, f32x4, m32x4, i64x4, u64x4, f64x4, m64x4, i128x4, m128x4, isizex4, usizex4, msizex4 ); impl_from_cast_mask!( m128x4[test_v512]: i8x4, u8x4, m8x4, i16x4, u16x4, m16x4, i32x4, u32x4, f32x4, m32x4, i64x4, u64x4, m64x4, f64x4, i128x4, u128x4, isizex4, usizex4, msizex4 ); impl_from_cast!( isizex8[test_v512]: i8x8, u8x8, m8x8, i16x8, u16x8, m16x8, i32x8, u32x8, f32x8, m32x8, i64x8, u64x8, f64x8, m64x8, usizex8, msizex8 ); impl_from_cast!( usizex8[test_v512]: i8x8, u8x8, m8x8, i16x8, u16x8, m16x8, i32x8, u32x8, f32x8, m32x8, i64x8, u64x8, f64x8, m64x8, isizex8, msizex8 ); impl_from_cast_mask!( msizex8[test_v512]: i8x8, u8x8, m8x8, i16x8, u16x8, m16x8, i32x8, u32x8, f32x8, m32x8, i64x8, u64x8, f64x8, m64x8, isizex8, usizex8 ); ================================================ FILE: src/api/cast/v64.rs ================================================ //! 
`FromCast` and `IntoCast` implementations for portable 64-bit wide vectors #[rustfmt::skip] use crate::*; impl_from_cast!( i8x8[test_v64]: u8x8, m8x8, i16x8, u16x8, m16x8, i32x8, u32x8, f32x8, m32x8, i64x8, u64x8, f64x8, m64x8, isizex8, usizex8, msizex8 ); impl_from_cast!( u8x8[test_v64]: i8x8, m8x8, i16x8, u16x8, m16x8, i32x8, u32x8, f32x8, m32x8, i64x8, u64x8, f64x8, m64x8, isizex8, usizex8, msizex8 ); impl_from_cast_mask!( m8x8[test_v64]: i8x8, u8x8, i16x8, u16x8, m16x8, i32x8, u32x8, f32x8, m32x8, i64x8, u64x8, f64x8, m64x8, isizex8, usizex8, msizex8 ); impl_from_cast!( i16x4[test_v64]: i8x4, u8x4, m8x4, u16x4, m16x4, i32x4, u32x4, f32x4, m32x4, i64x4, u64x4, f64x4, m64x4, i128x4, u128x4, m128x4, isizex4, usizex4, msizex4 ); impl_from_cast!( u16x4[test_v64]: i8x4, u8x4, m8x4, i16x4, m16x4, i32x4, u32x4, f32x4, m32x4, i64x4, u64x4, f64x4, m64x4, i128x4, u128x4, m128x4, isizex4, usizex4, msizex4 ); impl_from_cast_mask!( m16x4[test_v64]: i8x4, u8x4, m8x4, i16x4, u16x4, i32x4, u32x4, f32x4, m32x4, i64x4, u64x4, f64x4, m64x4, i128x4, u128x4, m128x4, isizex4, usizex4, msizex4 ); impl_from_cast!( i32x2[test_v64]: i8x2, u8x2, m8x2, i16x2, u16x2, m16x2, u32x2, f32x2, m32x2, i64x2, u64x2, f64x2, m64x2, i128x2, u128x2, m128x2, isizex2, usizex2, msizex2 ); impl_from_cast!( u32x2[test_v64]: i8x2, u8x2, m8x2, i16x2, u16x2, m16x2, i32x2, f32x2, m32x2, i64x2, u64x2, f64x2, m64x2, i128x2, u128x2, m128x2, isizex2, usizex2, msizex2 ); impl_from_cast!( f32x2[test_v64]: i8x2, u8x2, m8x2, i16x2, u16x2, m16x2, i32x2, u32x2, m32x2, i64x2, u64x2, f64x2, m64x2, i128x2, u128x2, m128x2, isizex2, usizex2, msizex2 ); impl_from_cast_mask!( m32x2[test_v64]: i8x2, u8x2, m8x2, i16x2, u16x2, m16x2, i32x2, u32x2, f32x2, i64x2, u64x2, f64x2, m64x2, i128x2, u128x2, m128x2, isizex2, usizex2, msizex2 ); ================================================ FILE: src/api/cast.rs ================================================ //! Implementation of `FromCast` and `IntoCast`. #![allow(clippy::module_name_repetitions)] /// Numeric cast from `T` to `Self`. /// /// > Note: This is a temporary workaround until the conversion traits /// specified > in [RFC2484] are implemented. /// /// Numeric cast between vectors with the same number of lanes, such that: /// /// * casting integer vectors whose lane types have the same size (e.g. `i32xN` /// -> `u32xN`) is a **no-op**, /// /// * casting from a larger integer to a smaller integer (e.g. `u32xN` -> /// `u8xN`) will **truncate**, /// /// * casting from a smaller integer to a larger integer (e.g. `u8xN` -> /// `u32xN`) will: /// * **zero-extend** if the source is unsigned, or /// * **sign-extend** if the source is signed, /// /// * casting from a float to an integer will **round the float towards zero**, /// /// * casting from an integer to float will produce the floating point /// representation of the integer, **rounding to nearest, ties to even**, /// /// * casting from an `f32` to an `f64` is perfect and lossless, /// /// * casting from an `f64` to an `f32` **rounds to nearest, ties to even**. /// /// [RFC2484]: https://github.com/rust-lang/rfcs/pull/2484 pub trait FromCast: crate::marker::Sized { /// Numeric cast from `T` to `Self`. fn from_cast(_: T) -> Self; } /// Numeric cast from `Self` to `T`. /// /// > Note: This is a temporary workaround until the conversion traits /// specified > in [RFC2484] are implemented. /// /// Numeric cast between vectors with the same number of lanes, such that: /// /// * casting integer vectors whose lane types have the same size (e.g. 
`i32xN` /// -> `u32xN`) is a **no-op**, /// /// * casting from a larger integer to a smaller integer (e.g. `u32xN` -> /// `u8xN`) will **truncate**, /// /// * casting from a smaller integer to a larger integer (e.g. `u8xN` -> /// `u32xN`) will: /// * **zero-extend** if the source is unsigned, or /// * **sign-extend** if the source is signed, /// /// * casting from a float to an integer will **round the float towards zero**, /// /// * casting from an integer to float will produce the floating point /// representation of the integer, **rounding to nearest, ties to even**, /// /// * casting from an `f32` to an `f64` is perfect and lossless, /// /// * casting from an `f64` to an `f32` **rounds to nearest, ties to even**. /// /// [RFC2484]: https://github.com/rust-lang/rfcs/pull/2484 pub trait Cast: crate::marker::Sized { /// Numeric cast from `self` to `T`. fn cast(self) -> T; } /// `FromCast` implies `Cast`. impl Cast for T where U: FromCast, { #[inline] fn cast(self) -> U { U::from_cast(self) } } /// `FromCast` and `Cast` are reflexive impl FromCast for T { #[inline] fn from_cast(t: Self) -> Self { t } } #[macro_use] mod macros; mod v16; pub use self::v16::*; mod v32; pub use self::v32::*; mod v64; pub use self::v64::*; mod v128; pub use self::v128::*; mod v256; pub use self::v256::*; mod v512; pub use self::v512::*; ================================================ FILE: src/api/cmp/eq.rs ================================================ //! Implements `Eq` for vector types. macro_rules! impl_cmp_eq { ( [$elem_ty:ident; $elem_count:expr]: $id:ident | $test_tt:tt | ($true:expr, $false:expr) ) => { impl crate::cmp::Eq for $id {} impl crate::cmp::Eq for LexicographicallyOrdered<$id> {} test_if!{ $test_tt: paste::item! { pub mod [<$id _cmp_eq>] { use super::*; #[cfg_attr(not(target_arch = "wasm32"), test)] #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test)] fn eq() { fn foo(_: E) {} let a = $id::splat($false); foo(a); } } } } }; } ================================================ FILE: src/api/cmp/ord.rs ================================================ //! Implements `Ord` for vector types. macro_rules! impl_cmp_ord { ( [$elem_ty:ident; $elem_count:expr]: $id:ident | $test_tt:tt | ($true:expr, $false:expr) ) => { impl $id { /// Returns a wrapper that implements `Ord`. #[inline] pub fn lex_ord(&self) -> LexicographicallyOrdered<$id> { LexicographicallyOrdered(*self) } } impl crate::cmp::Ord for LexicographicallyOrdered<$id> { #[inline] fn cmp(&self, other: &Self) -> crate::cmp::Ordering { match self.partial_cmp(other) { Some(x) => x, None => unsafe { crate::hint::unreachable_unchecked() }, } } } test_if!{ $test_tt: paste::item! { pub mod [<$id _cmp_ord>] { use super::*; #[cfg_attr(not(target_arch = "wasm32"), test)] #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test)] fn eq() { fn foo(_: E) {} let a = $id::splat($false); foo(a.partial_lex_ord()); foo(a.lex_ord()); } } } } }; } ================================================ FILE: src/api/cmp/partial_eq.rs ================================================ //! Implements `PartialEq` for vector types. macro_rules! 
impl_cmp_partial_eq { ( [$elem_ty:ident; $elem_count:expr]: $id:ident | $test_tt:tt | ($true:expr, $false:expr) ) => { // FIXME: https://github.com/rust-lang-nursery/rust-clippy/issues/2892 #[allow(clippy::partialeq_ne_impl)] impl crate::cmp::PartialEq<$id> for $id { #[inline] fn eq(&self, other: &Self) -> bool { $id::eq(*self, *other).all() } #[inline] fn ne(&self, other: &Self) -> bool { $id::ne(*self, *other).any() } } // FIXME: https://github.com/rust-lang-nursery/rust-clippy/issues/2892 #[allow(clippy::partialeq_ne_impl)] impl crate::cmp::PartialEq> for LexicographicallyOrdered<$id> { #[inline] fn eq(&self, other: &Self) -> bool { self.0 == other.0 } #[inline] fn ne(&self, other: &Self) -> bool { self.0 != other.0 } } test_if! { $test_tt: paste::item! { pub mod [<$id _cmp_PartialEq>] { use super::*; #[cfg_attr(not(target_arch = "wasm32"), test)] #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test)] fn partial_eq() { let a = $id::splat($false); let b = $id::splat($true); assert!(a != b); assert!(!(a == b)); assert!(a == a); assert!(!(a != a)); if $id::lanes() > 1 { let a = $id::splat($false).replace(0, $true); let b = $id::splat($true); assert!(a != b); assert!(!(a == b)); assert!(a == a); assert!(!(a != a)); } } } } } }; } ================================================ FILE: src/api/cmp/partial_ord.rs ================================================ //! Implements `PartialOrd` for vector types. //! //! This implements a lexicographical order. macro_rules! impl_cmp_partial_ord { ([$elem_ty:ident; $elem_count:expr]: $id:ident | $test_tt:tt) => { impl $id { /// Returns a wrapper that implements `PartialOrd`. #[inline] pub fn partial_lex_ord(&self) -> LexicographicallyOrdered<$id> { LexicographicallyOrdered(*self) } } impl crate::cmp::PartialOrd> for LexicographicallyOrdered<$id> { #[inline] fn partial_cmp(&self, other: &Self) -> Option { if PartialEq::eq(self, other) { Some(crate::cmp::Ordering::Equal) } else if PartialOrd::lt(self, other) { Some(crate::cmp::Ordering::Less) } else if PartialOrd::gt(self, other) { Some(crate::cmp::Ordering::Greater) } else { None } } #[inline] fn lt(&self, other: &Self) -> bool { let m_lt = self.0.lt(other.0); let m_eq = self.0.eq(other.0); for i in 0..$id::lanes() { if m_eq.extract(i) { continue; } return m_lt.extract(i); } false } #[inline] fn le(&self, other: &Self) -> bool { self.lt(other) | PartialEq::eq(self, other) } #[inline] fn ge(&self, other: &Self) -> bool { self.gt(other) | PartialEq::eq(self, other) } #[inline] fn gt(&self, other: &Self) -> bool { let m_gt = self.0.gt(other.0); let m_eq = self.0.eq(other.0); for i in 0..$id::lanes() { if m_eq.extract(i) { continue; } return m_gt.extract(i); } false } } }; } macro_rules! test_cmp_partial_ord_int { ([$elem_ty:ident; $elem_count:expr]: $id:ident | $test_tt:tt) => { test_if!{ $test_tt: paste::item! 
{ pub mod [<$id _cmp_PartialOrd>] { use super::*; #[cfg_attr(not(target_arch = "wasm32"), test)] #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test)] fn partial_lex_ord() { use crate::testing::utils::{test_cmp}; // constant values let a = $id::splat(0); let b = $id::splat(1); test_cmp(a.partial_lex_ord(), b.partial_lex_ord(), Some(crate::cmp::Ordering::Less)); test_cmp(b.partial_lex_ord(), a.partial_lex_ord(), Some(crate::cmp::Ordering::Greater)); test_cmp(a.partial_lex_ord(), a.partial_lex_ord(), Some(crate::cmp::Ordering::Equal)); test_cmp(b.partial_lex_ord(), b.partial_lex_ord(), Some(crate::cmp::Ordering::Equal)); // variable values: a = [0, 1, 2, 3]; b = [3, 2, 1, 0] let mut a = $id::splat(0); let mut b = $id::splat(0); for i in 0..$id::lanes() { a = a.replace(i, i as $elem_ty); b = b.replace(i, ($id::lanes() - i) as $elem_ty); } test_cmp(a.partial_lex_ord(), b.partial_lex_ord(), Some(crate::cmp::Ordering::Less)); test_cmp(b.partial_lex_ord(), a.partial_lex_ord(), Some(crate::cmp::Ordering::Greater)); test_cmp(a.partial_lex_ord(), a.partial_lex_ord(), Some(crate::cmp::Ordering::Equal)); test_cmp(b.partial_lex_ord(), b.partial_lex_ord(), Some(crate::cmp::Ordering::Equal)); // variable values: a = [0, 1, 2, 3]; b = [0, 1, 2, 4] let mut b = a; b = b.replace( $id::lanes() - 1, a.extract($id::lanes() - 1) + 1 as $elem_ty ); test_cmp(a.partial_lex_ord(), b.partial_lex_ord(), Some(crate::cmp::Ordering::Less)); test_cmp(b.partial_lex_ord(), a.partial_lex_ord(), Some(crate::cmp::Ordering::Greater)); test_cmp(a.partial_lex_ord(), a.partial_lex_ord(), Some(crate::cmp::Ordering::Equal)); test_cmp(b.partial_lex_ord(), b.partial_lex_ord(), Some(crate::cmp::Ordering::Equal)); if $id::lanes() > 2 { // variable values a = [0, 1, 0, 0]; b = [0, 1, 2, 3] let b = a; let mut a = $id::splat(0); a = a.replace(1, 1 as $elem_ty); test_cmp(a.partial_lex_ord(), b.partial_lex_ord(), Some(crate::cmp::Ordering::Less)); test_cmp(b.partial_lex_ord(), a.partial_lex_ord(), Some(crate::cmp::Ordering::Greater)); test_cmp(a.partial_lex_ord(), a.partial_lex_ord(), Some(crate::cmp::Ordering::Equal)); test_cmp(b.partial_lex_ord(), b.partial_lex_ord(), Some(crate::cmp::Ordering::Equal)); // variable values: a = [0, 1, 2, 3]; b = [0, 1, 3, 2] let mut b = a; b = b.replace( 2, a.extract($id::lanes() - 1) + 1 as $elem_ty ); test_cmp(a.partial_lex_ord(), b.partial_lex_ord(), Some(crate::cmp::Ordering::Less)); test_cmp(b.partial_lex_ord(), a.partial_lex_ord(), Some(crate::cmp::Ordering::Greater)); test_cmp(a.partial_lex_ord(), a.partial_lex_ord(), Some(crate::cmp::Ordering::Equal)); test_cmp(b.partial_lex_ord(), b.partial_lex_ord(), Some(crate::cmp::Ordering::Equal)); } } } } } }; } macro_rules! test_cmp_partial_ord_mask { ([$elem_ty:ident; $elem_count:expr]: $id:ident | $test_tt:tt) => { test_if!{ $test_tt: paste::item! 
{ pub mod [<$id _cmp_PartialOrd>] { use super::*; #[cfg_attr(not(target_arch = "wasm32"), test)] #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test)] fn partial_lex_ord() { use crate::testing::utils::{test_cmp}; use crate::cmp::Ordering; // constant values let a = $id::splat(false); let b = $id::splat(true); test_cmp(a.partial_lex_ord(), b.partial_lex_ord(), Some(Ordering::Less)); test_cmp(b.partial_lex_ord(), a.partial_lex_ord(), Some(Ordering::Greater)); test_cmp(a.partial_lex_ord(), a.partial_lex_ord(), Some(Ordering::Equal)); test_cmp(b.partial_lex_ord(), b.partial_lex_ord(), Some(Ordering::Equal)); // variable values: // a = [false, false, false, false]; // b = [false, false, false, true] let a = $id::splat(false); let mut b = $id::splat(false); b = b.replace($id::lanes() - 1, true); test_cmp(a.partial_lex_ord(), b.partial_lex_ord(), Some(Ordering::Less)); test_cmp(b.partial_lex_ord(), a.partial_lex_ord(), Some(Ordering::Greater)); test_cmp(a.partial_lex_ord(), a.partial_lex_ord(), Some(Ordering::Equal)); test_cmp(b.partial_lex_ord(), b.partial_lex_ord(), Some(Ordering::Equal)); // variable values: // a = [true, true, true, false]; // b = [true, true, true, true] let mut a = $id::splat(true); let b = $id::splat(true); a = a.replace($id::lanes() - 1, false); test_cmp(a.partial_lex_ord(), b.partial_lex_ord(), Some(Ordering::Less)); test_cmp(b.partial_lex_ord(), a.partial_lex_ord(), Some(Ordering::Greater)); test_cmp(a.partial_lex_ord(), a.partial_lex_ord(), Some(Ordering::Equal)); test_cmp(b.partial_lex_ord(), b.partial_lex_ord(), Some(Ordering::Equal)); if $id::lanes() > 2 { // variable values // a = [false, true, false, false]; // b = [false, true, true, true] let mut a = $id::splat(false); let mut b = $id::splat(true); a = a.replace(1, true); b = b.replace(0, false); test_cmp(a.partial_lex_ord(), b.partial_lex_ord(), Some(Ordering::Less)); test_cmp(b.partial_lex_ord(), a.partial_lex_ord(), Some(Ordering::Greater)); test_cmp(a.partial_lex_ord(), a.partial_lex_ord(), Some(Ordering::Equal)); test_cmp(b.partial_lex_ord(), b.partial_lex_ord(), Some(Ordering::Equal)); } } } } } }; } ================================================ FILE: src/api/cmp/vertical.rs ================================================ //! Vertical (lane-wise) vector comparisons returning vector masks. macro_rules! impl_cmp_vertical { ( [$elem_ty:ident; $elem_count:expr]: $id:ident, $mask_ty:ident, $is_mask:expr,($true:expr, $false:expr) | $test_tt:tt ) => { impl $id { /// Lane-wise equality comparison. #[inline] pub fn eq(self, other: Self) -> $mask_ty { use crate::llvm::simd_eq; Simd(unsafe { simd_eq(self.0, other.0) }) } /// Lane-wise inequality comparison. #[inline] pub fn ne(self, other: Self) -> $mask_ty { use crate::llvm::simd_ne; Simd(unsafe { simd_ne(self.0, other.0) }) } /// Lane-wise less-than comparison. #[inline] pub fn lt(self, other: Self) -> $mask_ty { use crate::llvm::{simd_gt, simd_lt}; if $is_mask { Simd(unsafe { simd_gt(self.0, other.0) }) } else { Simd(unsafe { simd_lt(self.0, other.0) }) } } /// Lane-wise less-than-or-equals comparison. #[inline] pub fn le(self, other: Self) -> $mask_ty { use crate::llvm::{simd_ge, simd_le}; if $is_mask { Simd(unsafe { simd_ge(self.0, other.0) }) } else { Simd(unsafe { simd_le(self.0, other.0) }) } } /// Lane-wise greater-than comparison. 
#[inline] pub fn gt(self, other: Self) -> $mask_ty { use crate::llvm::{simd_gt, simd_lt}; if $is_mask { Simd(unsafe { simd_lt(self.0, other.0) }) } else { Simd(unsafe { simd_gt(self.0, other.0) }) } } /// Lane-wise greater-than-or-equals comparison. #[inline] pub fn ge(self, other: Self) -> $mask_ty { use crate::llvm::{simd_ge, simd_le}; if $is_mask { Simd(unsafe { simd_le(self.0, other.0) }) } else { Simd(unsafe { simd_ge(self.0, other.0) }) } } } test_if!{ $test_tt: paste::item! { pub mod [<$id _cmp_vertical>] { use super::*; #[cfg_attr(not(target_arch = "wasm32"), test)] #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test)] fn cmp() { let a = $id::splat($false); let b = $id::splat($true); let r = a.lt(b); let e = $mask_ty::splat(true); assert!(r == e); let r = a.le(b); assert!(r == e); let e = $mask_ty::splat(false); let r = a.gt(b); assert!(r == e); let r = a.ge(b); assert!(r == e); let r = a.eq(b); assert!(r == e); let mut a = a; let mut b = b; let mut e = e; for i in 0..$id::lanes() { if i % 2 == 0 { a = a.replace(i, $false); b = b.replace(i, $true); e = e.replace(i, true); } else { a = a.replace(i, $true); b = b.replace(i, $false); e = e.replace(i, false); } } let r = a.lt(b); assert!(r == e); } } } } }; } ================================================ FILE: src/api/cmp.rs ================================================ //! Implement cmp traits for vector types #[macro_use] mod partial_eq; #[macro_use] mod eq; #[macro_use] mod partial_ord; #[macro_use] mod ord; #[macro_use] mod vertical; ================================================ FILE: src/api/default.rs ================================================ //! Implements `Default` for vector types. macro_rules! impl_default { ([$elem_ty:ident; $elem_count:expr]: $id:ident | $test_tt:tt) => { impl Default for $id { #[inline] fn default() -> Self { Self::splat($elem_ty::default()) } } test_if!{ $test_tt: paste::item! { // Comparisons use integer casts within mantissa^1 range. #[allow(clippy::float_cmp)] pub mod [<$id _default>] { use super::*; #[cfg_attr(not(target_arch = "wasm32"), test)] #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test)] fn default() { let a = $id::default(); for i in 0..$id::lanes() { assert_eq!(a.extract(i), $elem_ty::default()); } } } } } }; } ================================================ FILE: src/api/fmt/binary.rs ================================================ //! Implement Octal formatting macro_rules! impl_fmt_binary { ([$elem_ty:ident; $elem_count:expr]: $id:ident | $test_tt:tt) => { impl crate::fmt::Binary for $id { #[allow(clippy::missing_inline_in_public_items)] fn fmt(&self, f: &mut crate::fmt::Formatter<'_>) -> crate::fmt::Result { write!(f, "{}(", stringify!($id))?; for i in 0..$elem_count { if i > 0 { write!(f, ", ")?; } self.extract(i).fmt(f)?; } write!(f, ")") } } test_if! { $test_tt: paste::item! 
{ pub mod [<$id _fmt_binary>] { use super::*; #[cfg_attr(not(target_arch = "wasm32"), test)] #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test)] fn binary() { use arrayvec::{ArrayString,ArrayVec}; type TinyString = ArrayString<[u8; 512]>; use crate::fmt::Write; let v = $id::splat($elem_ty::default()); let mut s = TinyString::new(); write!(&mut s, "{:#b}", v).unwrap(); let mut beg = TinyString::new(); write!(&mut beg, "{}(", stringify!($id)).unwrap(); assert!(s.starts_with(beg.as_str())); assert!(s.ends_with(")")); let s: ArrayVec<[TinyString; 64]> = s.replace(beg.as_str(), "") .replace(")", "").split(",") .map(|v| TinyString::from(v.trim()).unwrap()) .collect(); assert_eq!(s.len(), $id::lanes()); for (index, ss) in s.into_iter().enumerate() { let mut e = TinyString::new(); write!(&mut e, "{:#b}", v.extract(index)).unwrap(); assert_eq!(ss, e); } } } } } }; } ================================================ FILE: src/api/fmt/debug.rs ================================================ //! Implement debug formatting macro_rules! impl_fmt_debug_tests { ([$elem_ty:ty; $elem_count:expr]: $id:ident | $test_tt:tt) => { test_if! { $test_tt: paste::item! { pub mod [<$id _fmt_debug>] { use super::*; #[cfg_attr(not(target_arch = "wasm32"), test)] #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test)] fn debug() { use arrayvec::{ArrayString,ArrayVec}; type TinyString = ArrayString<[u8; 512]>; use crate::fmt::Write; let v = $id::default(); let mut s = TinyString::new(); write!(&mut s, "{:?}", v).unwrap(); let mut beg = TinyString::new(); write!(&mut beg, "{}(", stringify!($id)).unwrap(); assert!(s.starts_with(beg.as_str())); assert!(s.ends_with(")")); let s: ArrayVec<[TinyString; 64]> = s.replace(beg.as_str(), "") .replace(")", "").split(",") .map(|v| TinyString::from(v.trim()).unwrap()) .collect(); assert_eq!(s.len(), $id::lanes()); for (index, ss) in s.into_iter().enumerate() { let mut e = TinyString::new(); write!(&mut e, "{:?}", v.extract(index)).unwrap(); assert_eq!(ss, e); } } } } } }; } macro_rules! impl_fmt_debug { ([$elem_ty:ty; $elem_count:expr]: $id:ident | $test_tt:tt) => { impl crate::fmt::Debug for $id { #[allow(clippy::missing_inline_in_public_items)] fn fmt(&self, f: &mut crate::fmt::Formatter<'_>) -> crate::fmt::Result { write!(f, "{}(", stringify!($id))?; for i in 0..$elem_count { if i > 0 { write!(f, ", ")?; } self.extract(i).fmt(f)?; } write!(f, ")") } } impl_fmt_debug_tests!([$elem_ty; $elem_count]: $id | $test_tt); }; } ================================================ FILE: src/api/fmt/lower_hex.rs ================================================ //! Implement `LowerHex` formatting macro_rules! impl_fmt_lower_hex { ([$elem_ty:ident; $elem_count:expr]: $id:ident | $test_tt:tt) => { impl crate::fmt::LowerHex for $id { #[allow(clippy::missing_inline_in_public_items)] fn fmt(&self, f: &mut crate::fmt::Formatter<'_>) -> crate::fmt::Result { write!(f, "{}(", stringify!($id))?; for i in 0..$elem_count { if i > 0 { write!(f, ", ")?; } self.extract(i).fmt(f)?; } write!(f, ")") } } test_if! { $test_tt: paste::item! 
{ pub mod [<$id _fmt_lower_hex>] { use super::*; #[cfg_attr(not(target_arch = "wasm32"), test)] #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test)] fn lower_hex() { use arrayvec::{ArrayString,ArrayVec}; type TinyString = ArrayString<[u8; 512]>; use crate::fmt::Write; let v = $id::splat($elem_ty::default()); let mut s = TinyString::new(); write!(&mut s, "{:#x}", v).unwrap(); let mut beg = TinyString::new(); write!(&mut beg, "{}(", stringify!($id)).unwrap(); assert!(s.starts_with(beg.as_str())); assert!(s.ends_with(")")); let s: ArrayVec<[TinyString; 64]> = s.replace(beg.as_str(), "").replace(")", "") .split(",") .map(|v| TinyString::from(v.trim()).unwrap()) .collect(); assert_eq!(s.len(), $id::lanes()); for (index, ss) in s.into_iter().enumerate() { let mut e = TinyString::new(); write!(&mut e, "{:#x}", v.extract(index)).unwrap(); assert_eq!(ss, e); } } } } } }; } ================================================ FILE: src/api/fmt/octal.rs ================================================ //! Implement Octal formatting macro_rules! impl_fmt_octal { ([$elem_ty:ident; $elem_count:expr]: $id:ident | $test_tt:tt) => { impl crate::fmt::Octal for $id { #[allow(clippy::missing_inline_in_public_items)] fn fmt(&self, f: &mut crate::fmt::Formatter<'_>) -> crate::fmt::Result { write!(f, "{}(", stringify!($id))?; for i in 0..$elem_count { if i > 0 { write!(f, ", ")?; } self.extract(i).fmt(f)?; } write!(f, ")") } } test_if! { $test_tt: paste::item! { pub mod [<$id _fmt_octal>] { use super::*; #[cfg_attr(not(target_arch = "wasm32"), test)] #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test)] fn octal_hex() { use arrayvec::{ArrayString,ArrayVec}; type TinyString = ArrayString<[u8; 512]>; use crate::fmt::Write; let v = $id::splat($elem_ty::default()); let mut s = TinyString::new(); write!(&mut s, "{:#o}", v).unwrap(); let mut beg = TinyString::new(); write!(&mut beg, "{}(", stringify!($id)).unwrap(); assert!(s.starts_with(beg.as_str())); assert!(s.ends_with(")")); let s: ArrayVec<[TinyString; 64]> = s.replace(beg.as_str(), "").replace(")", "") .split(",") .map(|v| TinyString::from(v.trim()).unwrap()) .collect(); assert_eq!(s.len(), $id::lanes()); for (index, ss) in s.into_iter().enumerate() { let mut e = TinyString::new(); write!(&mut e, "{:#o}", v.extract(index)).unwrap(); assert_eq!(ss, e); } } } } } }; } ================================================ FILE: src/api/fmt/upper_hex.rs ================================================ //! Implement `UpperHex` formatting macro_rules! impl_fmt_upper_hex { ([$elem_ty:ident; $elem_count:expr]: $id:ident | $test_tt:tt) => { impl crate::fmt::UpperHex for $id { #[allow(clippy::missing_inline_in_public_items)] fn fmt(&self, f: &mut crate::fmt::Formatter<'_>) -> crate::fmt::Result { write!(f, "{}(", stringify!($id))?; for i in 0..$elem_count { if i > 0 { write!(f, ", ")?; } self.extract(i).fmt(f)?; } write!(f, ")") } } test_if! { $test_tt: paste::item! 
{ pub mod [<$id _fmt_upper_hex>] { use super::*; #[cfg_attr(not(target_arch = "wasm32"), test)] #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test)] fn upper_hex() { use arrayvec::{ArrayString,ArrayVec}; type TinyString = ArrayString<[u8; 512]>; use crate::fmt::Write; let v = $id::splat($elem_ty::default()); let mut s = TinyString::new(); write!(&mut s, "{:#X}", v).unwrap(); let mut beg = TinyString::new(); write!(&mut beg, "{}(", stringify!($id)).unwrap(); assert!(s.starts_with(beg.as_str())); assert!(s.ends_with(")")); let s: ArrayVec<[TinyString; 64]> = s.replace(beg.as_str(), "").replace(")", "") .split(",") .map(|v| TinyString::from(v.trim()).unwrap()) .collect(); assert_eq!(s.len(), $id::lanes()); for (index, ss) in s.into_iter().enumerate() { let mut e = TinyString::new(); write!(&mut e, "{:#X}", v.extract(index)).unwrap(); assert_eq!(ss, e); } } } } } }; } ================================================ FILE: src/api/fmt.rs ================================================ //! Implements formatting APIs #[macro_use] mod debug; #[macro_use] mod lower_hex; #[macro_use] mod upper_hex; #[macro_use] mod octal; #[macro_use] mod binary; ================================================ FILE: src/api/from/from_array.rs ================================================ //! Implements `From<[T; N]>` and `Into<[T; N]>` for vector types. macro_rules! impl_from_array { ([$elem_ty:ident; $elem_count:expr]: $id:ident | $test_tt:tt | ($non_default_array:expr, $non_default_vec:expr)) => { impl From<[$elem_ty; $elem_count]> for $id { #[inline] fn from(array: [$elem_ty; $elem_count]) -> Self { union U { array: [$elem_ty; $elem_count], vec: $id, } unsafe { U { array }.vec } } } impl From<$id> for [$elem_ty; $elem_count] { #[inline] fn from(vec: $id) -> Self { union U { array: [$elem_ty; $elem_count], vec: $id, } unsafe { U { vec }.array } } } // FIXME: `Into::into` is not inline, but due to // the blanket impl in `std`, which is not // marked `default`, we cannot override it here with // specialization. /* impl Into<[$elem_ty; $elem_count]> for $id { #[inline] fn into(self) -> [$elem_ty; $elem_count] { union U { array: [$elem_ty; $elem_count], vec: $id, } unsafe { U { vec: self }.array } } } impl Into<$id> for [$elem_ty; $elem_count] { #[inline] fn into(self) -> $id { union U { array: [$elem_ty; $elem_count], vec: $id, } unsafe { U { array: self }.vec } } } */ test_if! { $test_tt: paste::item! { // Comparisons use integer casts within mantissa^1 range. #[allow(clippy::float_cmp)] mod [<$id _from>] { use super::*; #[test] #[cfg_attr(miri, ignore)] fn array() { let vec: $id = Default::default(); // FIXME: Workaround for arrays with more than 32 // elements. // // Safe because we never take a reference to any // uninitialized element. 
union W { array: [$elem_ty; $elem_count], other: () } let mut array = W { other: () }; for i in 0..$elem_count { let default: $elem_ty = Default::default(); // note: array.other is the active member and // initialized so we can take a reference to it: let p = unsafe { &mut array.other as *mut () as *mut $elem_ty }; // note: default is a valid bit-pattern for // $elem_ty: unsafe { crate::ptr::write(p.wrapping_add(i), default) }; } // note: the array variant of the union is properly // initialized: let mut array = unsafe { array.array }; array[0] = $non_default_array; let vec = vec.replace(0, $non_default_vec); let vec_from_array = $id::from(array); assert_eq!(vec_from_array, vec); let array_from_vec = <[$elem_ty; $elem_count]>::from(vec); // FIXME: Workaround for arrays with more than 32 // elements. for i in 0..$elem_count { assert_eq!(array_from_vec[i], array[i]); } let vec_from_into_array: $id = array.into(); assert_eq!(vec_from_into_array, vec); let array_from_into_vec: [$elem_ty; $elem_count] = vec.into(); // FIXME: Workaround for arrays with more than 32 // elements. for i in 0..$elem_count { assert_eq!(array_from_into_vec[i], array[i]); } } } } } }; } ================================================ FILE: src/api/from/from_vector.rs ================================================ //! Implements `From` and `Into` for vector types. macro_rules! impl_from_vector { ([$elem_ty:ident; $elem_count:expr]: $id:ident | $test_tt:tt | $source:ident) => { impl From<$source> for $id { #[inline] fn from(source: $source) -> Self { fn static_assert_same_number_of_lanes() where T: crate::sealed::Simd, U: crate::sealed::Simd, { } use crate::llvm::simd_cast; static_assert_same_number_of_lanes::<$id, $source>(); Simd(unsafe { simd_cast(source.0) }) } } // FIXME: `Into::into` is not inline, but due to the blanket impl in // `std`, which is not marked `default`, we cannot override it here // with specialization. /* impl Into<$id> for $source { #[inline] fn into(self) -> $id { unsafe { simd_cast(self) } } } */ test_if! { $test_tt: paste::item! { pub mod [<$id _from_ $source>] { use super::*; #[cfg_attr(not(target_arch = "wasm32"), test)] #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test)] fn from() { assert_eq!($id::lanes(), $source::lanes()); let source: $source = Default::default(); let vec: $id = Default::default(); let e = $id::from(source); assert_eq!(e, vec); let e: $id = source.into(); assert_eq!(e, vec); } } } } }; } macro_rules! impl_from_vectors { ([$elem_ty:ident; $elem_count:expr]: $id:ident | $test_tt:tt | $($source:ident),*) => { $( impl_from_vector!( [$elem_ty; $elem_count]: $id | $test_tt | $source ); )* } } ================================================ FILE: src/api/from.rs ================================================ //! Implementations of the `From` and `Into` traits #[macro_use] mod from_array; #[macro_use] mod from_vector; ================================================ FILE: src/api/hash.rs ================================================ //! Implements `Hash` for vector types. macro_rules! impl_hash { ([$elem_ty:ident; $elem_count:expr]: $id:ident | $test_tt:tt) => { impl crate::hash::Hash for $id { #[inline] fn hash(&self, state: &mut H) { unsafe { union A { data: [$elem_ty; $id::lanes()], vec: $id, } A { vec: *self }.data.hash(state) } } } test_if! { $test_tt: paste::item! 
{ pub mod [<$id _hash>] { use super::*; #[cfg_attr(not(target_arch = "wasm32"), test)] #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test)] fn hash() { use crate::hash::{Hash, Hasher}; #[allow(deprecated)] use crate::hash::{SipHasher13}; type A = [$elem_ty; $id::lanes()]; let a: A = [42 as $elem_ty; $id::lanes()]; assert_eq!( crate::mem::size_of::(), crate::mem::size_of::<$id>() ); #[allow(deprecated)] let mut a_hash = SipHasher13::new(); let mut v_hash = a_hash.clone(); a.hash(&mut a_hash); // Integer within mantissa^1 range. #[allow(clippy::float_cmp)] let v = $id::splat(42 as $elem_ty); v.hash(&mut v_hash); assert_eq!(a_hash.finish(), v_hash.finish()); } } } } }; } ================================================ FILE: src/api/into_bits/arch_specific.rs ================================================ //! `FromBits` and `IntoBits` between portable vector types and the //! architecture-specific vector types. #[rustfmt::skip] // FIXME: MIPS FromBits/IntoBits #[allow(unused)] use crate::*; /// This macro implements FromBits for the portable and the architecture /// specific vector types. /// /// The "leaf" case is at the bottom, and the most generic case is at the top. /// The generic case is split into smaller cases recursively. macro_rules! impl_arch { ([$arch_head_i:ident[$arch_head_tt:tt]: $($arch_head_ty:ident),*], $([$arch_tail_i:ident[$arch_tail_tt:tt]: $($arch_tail_ty:ident),*]),* | from: $($from_ty:ident),* | into: $($into_ty:ident),* | test: $test_tt:tt) => { impl_arch!( [$arch_head_i[$arch_head_tt]: $($arch_head_ty),*] | from: $($from_ty),* | into: $($into_ty),* | test: $test_tt ); impl_arch!( $([$arch_tail_i[$arch_tail_tt]: $($arch_tail_ty),*]),* | from: $($from_ty),* | into: $($into_ty),* | test: $test_tt ); }; ([$arch:ident[$arch_tt:tt]: $($arch_ty:ident),*] | from: $($from_ty:ident),* | into: $($into_ty:ident),* | test: $test_tt:tt) => { // note: if target is "arm", "+v7,+neon" must be enabled // and the std library must be recompiled with them #[cfg(any( not(target_arch = "arm"), all(target_feature = "v7", target_feature = "neon", any(feature = "core_arch", libcore_neon))) )] // note: if target is "powerpc", "altivec" must be enabled // and the std library must be recompiled with it #[cfg(any( not(target_arch = "powerpc"), all(target_feature = "altivec", feature = "core_arch"), ))] #[cfg(target_arch = $arch_tt)] use crate::arch::$arch::{ $($arch_ty),* }; #[cfg(any( not(target_arch = "arm"), all(target_feature = "v7", target_feature = "neon", any(feature = "core_arch", libcore_neon))) )] #[cfg(any( not(target_arch = "powerpc"), all(target_feature = "altivec", feature = "core_arch"), ))] #[cfg(target_arch = $arch_tt)] impl_arch!($($arch_ty),* | $($from_ty),* | $($into_ty),* | test: $test_tt); }; ($arch_head:ident, $($arch_tail:ident),* | $($from_ty:ident),* | $($into_ty:ident),* | test: $test_tt:tt) => { impl_arch!($arch_head | $($from_ty),* | $($into_ty),* | test: $test_tt); impl_arch!($($arch_tail),* | $($from_ty),* | $($into_ty),* | test: $test_tt); }; ($arch_head:ident | $($from_ty:ident),* | $($into_ty:ident),* | test: $test_tt:tt) => { impl_from_bits!($arch_head[$test_tt]: $($from_ty),*); impl_into_bits!($arch_head[$test_tt]: $($into_ty),*); }; } //////////////////////////////////////////////////////////////////////////////// // Implementations for the 64-bit wide vector types: // FIXME: 64-bit single element types // FIXME: arm/aarch float16x4_t missing impl_arch!( [ arm["arm"]: int8x8_t, uint8x8_t, poly8x8_t, int16x4_t, uint16x4_t, poly16x4_t, int32x2_t, uint32x2_t, 
float32x2_t, int64x1_t, uint64x1_t ], [ aarch64["aarch64"]: int8x8_t, uint8x8_t, poly8x8_t, int16x4_t, uint16x4_t, poly16x4_t, int32x2_t, uint32x2_t, float32x2_t, int64x1_t, uint64x1_t, float64x1_t ] | from: i8x8, u8x8, m8x8, i16x4, u16x4, m16x4, i32x2, u32x2, f32x2, m32x2 | into: i8x8, u8x8, i16x4, u16x4, i32x2, u32x2, f32x2 | test: test_v64 ); //////////////////////////////////////////////////////////////////////////////// // Implementations for the 128-bit wide vector types: // FIXME: arm/aarch float16x8_t missing // FIXME: ppc vector_pixel missing // FIXME: ppc64 vector_Float16 missing // FIXME: ppc64 vector_signed_long_long missing // FIXME: ppc64 vector_unsigned_long_long missing // FIXME: ppc64 vector_bool_long_long missing // FIXME: ppc64 vector_signed___int128 missing // FIXME: ppc64 vector_unsigned___int128 missing impl_arch!( [x86["x86"]: __m128, __m128i, __m128d], [x86_64["x86_64"]: __m128, __m128i, __m128d], [ arm["arm"]: int8x16_t, uint8x16_t, poly8x16_t, int16x8_t, uint16x8_t, poly16x8_t, int32x4_t, uint32x4_t, float32x4_t, int64x2_t, uint64x2_t ], [ aarch64["aarch64"]: int8x16_t, uint8x16_t, poly8x16_t, int16x8_t, uint16x8_t, poly16x8_t, int32x4_t, uint32x4_t, float32x4_t, int64x2_t, uint64x2_t, float64x2_t ], [ powerpc["powerpc"]: vector_signed_char, vector_unsigned_char, vector_signed_short, vector_unsigned_short, vector_signed_int, vector_unsigned_int, vector_float ], [ powerpc64["powerpc64"]: vector_signed_char, vector_unsigned_char, vector_signed_short, vector_unsigned_short, vector_signed_int, vector_unsigned_int, vector_float, vector_signed_long, vector_unsigned_long, vector_double ] | from: i8x16, u8x16, m8x16, i16x8, u16x8, m16x8, i32x4, u32x4, f32x4, m32x4, i64x2, u64x2, f64x2, m64x2, i128x1, u128x1, m128x1 | into: i8x16, u8x16, i16x8, u16x8, i32x4, u32x4, f32x4, i64x2, u64x2, f64x2, i128x1, u128x1 | test: test_v128 ); impl_arch!( [powerpc["powerpc"]: vector_bool_char], [powerpc64["powerpc64"]: vector_bool_char] | from: m8x16, m16x8, m32x4, m64x2, m128x1 | into: i8x16, u8x16, i16x8, u16x8, i32x4, u32x4, f32x4, i64x2, u64x2, f64x2, i128x1, u128x1, // Masks: m8x16 | test: test_v128 ); impl_arch!( [powerpc["powerpc"]: vector_bool_short], [powerpc64["powerpc64"]: vector_bool_short] | from: m16x8, m32x4, m64x2, m128x1 | into: i8x16, u8x16, i16x8, u16x8, i32x4, u32x4, f32x4, i64x2, u64x2, f64x2, i128x1, u128x1, // Masks: m8x16, m16x8 | test: test_v128 ); impl_arch!( [powerpc["powerpc"]: vector_bool_int], [powerpc64["powerpc64"]: vector_bool_int] | from: m32x4, m64x2, m128x1 | into: i8x16, u8x16, i16x8, u16x8, i32x4, u32x4, f32x4, i64x2, u64x2, f64x2, i128x1, u128x1, // Masks: m8x16, m16x8, m32x4 | test: test_v128 ); impl_arch!( [powerpc64["powerpc64"]: vector_bool_long] | from: m64x2, m128x1 | into: i8x16, u8x16, i16x8, u16x8, i32x4, u32x4, f32x4, i64x2, u64x2, f64x2, i128x1, u128x1, // Masks: m8x16, m16x8, m32x4, m64x2 | test: test_v128 ); //////////////////////////////////////////////////////////////////////////////// // Implementations for the 256-bit wide vector types impl_arch!( [x86["x86"]: __m256, __m256i, __m256d], [x86_64["x86_64"]: __m256, __m256i, __m256d] | from: i8x32, u8x32, m8x32, i16x16, u16x16, m16x16, i32x8, u32x8, f32x8, m32x8, i64x4, u64x4, f64x4, m64x4, i128x2, u128x2, m128x2 | into: i8x32, u8x32, i16x16, u16x16, i32x8, u32x8, f32x8, i64x4, u64x4, f64x4, i128x2, u128x2 | test: test_v256 ); //////////////////////////////////////////////////////////////////////////////// // FIXME: Implementations for the 512-bit wide vector types 
================================================ FILE: src/api/into_bits/macros.rs ================================================ //! Macros implementing `FromBits` macro_rules! impl_from_bits_ { ($id:ident[$test_tt:tt]: $from_ty:ident) => { impl crate::api::into_bits::FromBits<$from_ty> for $id { #[inline] fn from_bits(x: $from_ty) -> Self { unsafe { crate::mem::transmute(x) } } } test_if! { $test_tt: paste::item! { pub mod [<$id _from_bits_ $from_ty>] { use super::*; #[cfg_attr(not(target_arch = "wasm32"), test)] #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test)] fn test() { use crate::{ ptr::{read_unaligned}, mem::{size_of, zeroed} }; use crate::IntoBits; assert_eq!(size_of::<$id>(), size_of::<$from_ty>()); // This is safe because we never create a reference to // uninitialized memory: let a: $from_ty = unsafe { zeroed() }; let b_0: $id = crate::FromBits::from_bits(a); let b_1: $id = a.into_bits(); // Check that these are byte-wise equal, that is, // that the bit patterns are identical: for i in 0..size_of::<$id>() { // This is safe because we only read initialized // memory in bounds. Also, taking a reference to // `b_i` is ok because the fields are initialized. unsafe { let b_0_v: u8 = read_unaligned( (&b_0 as *const $id as *const u8) .wrapping_add(i) ); let b_1_v: u8 = read_unaligned( (&b_1 as *const $id as *const u8) .wrapping_add(i) ); assert_eq!(b_0_v, b_1_v); } } } } } } }; } macro_rules! impl_from_bits { ($id:ident[$test_tt:tt]: $($from_ty:ident),*) => { $( impl_from_bits_!($id[$test_tt]: $from_ty); )* } } #[allow(unused)] macro_rules! impl_into_bits { ($id:ident[$test_tt:tt]: $($from_ty:ident),*) => { $( impl_from_bits_!($from_ty[$test_tt]: $id); )* } } ================================================ FILE: src/api/into_bits/v128.rs ================================================ //! 
`FromBits` and `IntoBits` implementations for portable 128-bit wide vectors #[rustfmt::skip] #[allow(unused)] // wasm_bindgen_test use crate::*; impl_from_bits!( i8x16[test_v128]: u8x16, m8x16, i16x8, u16x8, m16x8, i32x4, u32x4, f32x4, m32x4, i64x2, u64x2, f64x2, m64x2, i128x1, u128x1, m128x1 ); impl_from_bits!( u8x16[test_v128]: i8x16, m8x16, i16x8, u16x8, m16x8, i32x4, u32x4, f32x4, m32x4, i64x2, u64x2, f64x2, m64x2, i128x1, u128x1, m128x1 ); impl_from_bits!(m8x16[test_v128]: m16x8, m32x4, m64x2, m128x1); impl_from_bits!( i16x8[test_v128]: i8x16, u8x16, m8x16, u16x8, m16x8, i32x4, u32x4, f32x4, m32x4, i64x2, u64x2, f64x2, m64x2, i128x1, u128x1, m128x1 ); impl_from_bits!( u16x8[test_v128]: i8x16, u8x16, m8x16, i16x8, m16x8, i32x4, u32x4, f32x4, m32x4, i64x2, u64x2, f64x2, m64x2, i128x1, u128x1, m128x1 ); impl_from_bits!(m16x8[test_v128]: m32x4, m64x2, m128x1); impl_from_bits!( i32x4[test_v128]: i8x16, u8x16, m8x16, i16x8, u16x8, m16x8, u32x4, f32x4, m32x4, i64x2, u64x2, f64x2, m64x2, i128x1, u128x1, m128x1 ); impl_from_bits!( u32x4[test_v128]: i8x16, u8x16, m8x16, i16x8, u16x8, m16x8, i32x4, f32x4, m32x4, i64x2, u64x2, f64x2, m64x2, i128x1, u128x1, m128x1 ); impl_from_bits!( f32x4[test_v128]: i8x16, u8x16, m8x16, i16x8, u16x8, m16x8, i32x4, u32x4, m32x4, i64x2, u64x2, f64x2, m64x2, i128x1, u128x1, m128x1 ); impl_from_bits!(m32x4[test_v128]: m64x2, m128x1); impl_from_bits!( i64x2[test_v128]: i8x16, u8x16, m8x16, i16x8, u16x8, m16x8, i32x4, u32x4, f32x4, m32x4, u64x2, f64x2, m64x2, i128x1, u128x1, m128x1 ); impl_from_bits!( u64x2[test_v128]: i8x16, u8x16, m8x16, i16x8, u16x8, m16x8, i32x4, u32x4, f32x4, m32x4, i64x2, f64x2, m64x2, i128x1, u128x1, m128x1 ); impl_from_bits!( f64x2[test_v128]: i8x16, u8x16, m8x16, i16x8, u16x8, m16x8, i32x4, u32x4, f32x4, m32x4, i64x2, u64x2, m64x2, i128x1, u128x1, m128x1 ); impl_from_bits!(m64x2[test_v128]: m128x1); impl_from_bits!( i128x1[test_v128]: i8x16, u8x16, m8x16, i16x8, u16x8, m16x8, i32x4, u32x4, f32x4, m32x4, i64x2, u64x2, f64x2, m64x2, u128x1, m128x1 ); impl_from_bits!( u128x1[test_v128]: i8x16, u8x16, m8x16, i16x8, u16x8, m16x8, i32x4, u32x4, f32x4, m32x4, i64x2, u64x2, f64x2, m64x2, i128x1, m128x1 ); // note: m128x1 cannot be constructed from all the other masks bit patterns in // here ================================================ FILE: src/api/into_bits/v16.rs ================================================ //! `FromBits` and `IntoBits` implementations for portable 16-bit wide vectors #[rustfmt::skip] #[allow(unused)] // wasm_bindgen_test use crate::*; impl_from_bits!(i8x2[test_v16]: u8x2, m8x2); impl_from_bits!(u8x2[test_v16]: i8x2, m8x2); // note: m8x2 cannot be constructed from all i8x2 or u8x2 bit patterns ================================================ FILE: src/api/into_bits/v256.rs ================================================ //! 
`FromBits` and `IntoBits` implementations for portable 256-bit wide vectors #[rustfmt::skip] #[allow(unused)] // wasm_bindgen_test use crate::*; impl_from_bits!( i8x32[test_v256]: u8x32, m8x32, i16x16, u16x16, m16x16, i32x8, u32x8, f32x8, m32x8, i64x4, u64x4, f64x4, m64x4, i128x2, u128x2, m128x2 ); impl_from_bits!( u8x32[test_v256]: i8x32, m8x32, i16x16, u16x16, m16x16, i32x8, u32x8, f32x8, m32x8, i64x4, u64x4, f64x4, m64x4, i128x2, u128x2, m128x2 ); impl_from_bits!(m8x32[test_v256]: m16x16, m32x8, m64x4, m128x2); impl_from_bits!( i16x16[test_v256]: i8x32, u8x32, m8x32, u16x16, m16x16, i32x8, u32x8, f32x8, m32x8, i64x4, u64x4, f64x4, m64x4, i128x2, u128x2, m128x2 ); impl_from_bits!( u16x16[test_v256]: i8x32, u8x32, m8x32, i16x16, m16x16, i32x8, u32x8, f32x8, m32x8, i64x4, u64x4, f64x4, m64x4, i128x2, u128x2, m128x2 ); impl_from_bits!(m16x16[test_v256]: m32x8, m64x4, m128x2); impl_from_bits!( i32x8[test_v256]: i8x32, u8x32, m8x32, i16x16, u16x16, m16x16, u32x8, f32x8, m32x8, i64x4, u64x4, f64x4, m64x4, i128x2, u128x2, m128x2 ); impl_from_bits!( u32x8[test_v256]: i8x32, u8x32, m8x32, i16x16, u16x16, m16x16, i32x8, f32x8, m32x8, i64x4, u64x4, f64x4, m64x4, i128x2, u128x2, m128x2 ); impl_from_bits!( f32x8[test_v256]: i8x32, u8x32, m8x32, i16x16, u16x16, m16x16, i32x8, u32x8, m32x8, i64x4, u64x4, f64x4, m64x4, i128x2, u128x2, m128x2 ); impl_from_bits!(m32x8[test_v256]: m64x4, m128x2); impl_from_bits!( i64x4[test_v256]: i8x32, u8x32, m8x32, i16x16, u16x16, m16x16, i32x8, u32x8, f32x8, m32x8, u64x4, f64x4, m64x4, i128x2, u128x2, m128x2 ); impl_from_bits!( u64x4[test_v256]: i8x32, u8x32, m8x32, i16x16, u16x16, m16x16, i32x8, u32x8, f32x8, m32x8, i64x4, f64x4, m64x4, i128x2, u128x2, m128x2 ); impl_from_bits!( f64x4[test_v256]: i8x32, u8x32, m8x32, i16x16, u16x16, m16x16, i32x8, u32x8, f32x8, m32x8, i64x4, u64x4, m64x4, i128x2, u128x2, m128x2 ); impl_from_bits!(m64x4[test_v256]: m128x2); impl_from_bits!( i128x2[test_v256]: i8x32, u8x32, m8x32, i16x16, u16x16, m16x16, i32x8, u32x8, f32x8, m32x8, i64x4, u64x4, f64x4, m64x4, u128x2, m128x2 ); impl_from_bits!( u128x2[test_v256]: i8x32, u8x32, m8x32, i16x16, u16x16, m16x16, i32x8, u32x8, f32x8, m32x8, i64x4, u64x4, f64x4, m64x4, i128x2, m128x2 ); // note: m128x2 cannot be constructed from all the other masks bit patterns in // here ================================================ FILE: src/api/into_bits/v32.rs ================================================ //! `FromBits` and `IntoBits` implementations for portable 32-bit wide vectors #[rustfmt::skip] #[allow(unused)] // wasm_bindgen_test use crate::*; impl_from_bits!(i8x4[test_v32]: u8x4, m8x4, i16x2, u16x2, m16x2); impl_from_bits!(u8x4[test_v32]: i8x4, m8x4, i16x2, u16x2, m16x2); impl_from_bits!(m8x4[test_v32]: m16x2); impl_from_bits!(i16x2[test_v32]: i8x4, u8x4, m8x4, u16x2, m16x2); impl_from_bits!(u16x2[test_v32]: i8x4, u8x4, m8x4, i16x2, m16x2); // note: m16x2 cannot be constructed from all m8x4 bit patterns ================================================ FILE: src/api/into_bits/v512.rs ================================================ //! 
`FromBits` and `IntoBits` implementations for portable 512-bit wide vectors #[rustfmt::skip] #[allow(unused)] // wasm_bindgen_test use crate::*; impl_from_bits!( i8x64[test_v512]: u8x64, m8x64, i16x32, u16x32, m16x32, i32x16, u32x16, f32x16, m32x16, i64x8, u64x8, f64x8, m64x8, i128x4, u128x4, m128x4 ); impl_from_bits!( u8x64[test_v512]: i8x64, m8x64, i16x32, u16x32, m16x32, i32x16, u32x16, f32x16, m32x16, i64x8, u64x8, f64x8, m64x8, i128x4, u128x4, m128x4 ); impl_from_bits!(m8x64[test_v512]: m16x32, m32x16, m64x8, m128x4); impl_from_bits!( i16x32[test_v512]: i8x64, u8x64, m8x64, u16x32, m16x32, i32x16, u32x16, f32x16, m32x16, i64x8, u64x8, f64x8, m64x8, i128x4, u128x4, m128x4 ); impl_from_bits!( u16x32[test_v512]: i8x64, u8x64, m8x64, i16x32, m16x32, i32x16, u32x16, f32x16, m32x16, i64x8, u64x8, f64x8, m64x8, i128x4, u128x4, m128x4 ); impl_from_bits!(m16x32[test_v512]: m32x16, m64x8, m128x4); impl_from_bits!( i32x16[test_v512]: i8x64, u8x64, m8x64, i16x32, u16x32, m16x32, u32x16, f32x16, m32x16, i64x8, u64x8, f64x8, m64x8, i128x4, u128x4, m128x4 ); impl_from_bits!( u32x16[test_v512]: i8x64, u8x64, m8x64, i16x32, u16x32, m16x32, i32x16, f32x16, m32x16, i64x8, u64x8, f64x8, m64x8, i128x4, u128x4, m128x4 ); impl_from_bits!( f32x16[test_v512]: i8x64, u8x64, m8x64, i16x32, u16x32, m16x32, i32x16, u32x16, m32x16, i64x8, u64x8, f64x8, m64x8, i128x4, u128x4, m128x4 ); impl_from_bits!(m32x16[test_v512]: m64x8, m128x4); impl_from_bits!( i64x8[test_v512]: i8x64, u8x64, m8x64, i16x32, u16x32, m16x32, i32x16, u32x16, f32x16, m32x16, u64x8, f64x8, m64x8, i128x4, u128x4, m128x4 ); impl_from_bits!( u64x8[test_v512]: i8x64, u8x64, m8x64, i16x32, u16x32, m16x32, i32x16, u32x16, f32x16, m32x16, i64x8, f64x8, m64x8, i128x4, u128x4, m128x4 ); impl_from_bits!( f64x8[test_v512]: i8x64, u8x64, m8x64, i16x32, u16x32, m16x32, i32x16, u32x16, f32x16, m32x16, i64x8, u64x8, m64x8, i128x4, u128x4, m128x4 ); impl_from_bits!(m64x8[test_v512]: m128x4); impl_from_bits!( i128x4[test_v512]: i8x64, u8x64, m8x64, i16x32, u16x32, m16x32, i32x16, u32x16, f32x16, m32x16, i64x8, u64x8, f64x8, m64x8, u128x4, m128x4 ); impl_from_bits!( u128x4[test_v512]: i8x64, u8x64, m8x64, i16x32, u16x32, m16x32, i32x16, u32x16, f32x16, m32x16, i64x8, u64x8, f64x8, m64x8, i128x4, m128x4 ); // note: m128x4 cannot be constructed from all the other masks bit patterns in // here ================================================ FILE: src/api/into_bits/v64.rs ================================================ //! 
`FromBits` and `IntoBits` implementations for portable 64-bit wide vectors #[rustfmt::skip] #[allow(unused)] // wasm_bindgen_test use crate::*; impl_from_bits!(i8x8[test_v64]: u8x8, m8x8, i16x4, u16x4, m16x4, i32x2, u32x2, f32x2, m32x2); impl_from_bits!(u8x8[test_v64]: i8x8, m8x8, i16x4, u16x4, m16x4, i32x2, u32x2, f32x2, m32x2); impl_from_bits!(m8x8[test_v64]: m16x4, m32x2); impl_from_bits!(i16x4[test_v64]: i8x8, u8x8, m8x8, u16x4, m16x4, i32x2, u32x2, f32x2, m32x2); impl_from_bits!(u16x4[test_v64]: i8x8, u8x8, m8x8, i16x4, m16x4, i32x2, u32x2, f32x2, m32x2); impl_from_bits!(m16x4[test_v64]: m32x2); impl_from_bits!(i32x2[test_v64]: i8x8, u8x8, m8x8, i16x4, u16x4, m16x4, u32x2, f32x2, m32x2); impl_from_bits!(u32x2[test_v64]: i8x8, u8x8, m8x8, i16x4, u16x4, m16x4, i32x2, f32x2, m32x2); impl_from_bits!(f32x2[test_v64]: i8x8, u8x8, m8x8, i16x4, u16x4, m16x4, i32x2, u32x2, m32x2); // note: m32x2 cannot be constructed from all m16x4 or m8x8 bit patterns ================================================ FILE: src/api/into_bits.rs ================================================ //! Implementation of `FromBits` and `IntoBits`. /// Safe lossless bitwise conversion from `T` to `Self`. #[cfg_attr(doc_cfg, doc(cfg(feature = "into_bits")))] pub trait FromBits: crate::marker::Sized { /// Safe lossless bitwise transmute from `T` to `Self`. fn from_bits(t: T) -> Self; } /// Safe lossless bitwise conversion from `Self` to `T`. #[cfg_attr(doc_cfg, doc(cfg(feature = "into_bits")))] pub trait IntoBits: crate::marker::Sized { /// Safe lossless bitwise transmute from `self` to `T`. fn into_bits(self) -> T; } /// `FromBits` implies `IntoBits`. impl IntoBits for T where U: FromBits, { #[inline] fn into_bits(self) -> U { debug_assert!(crate::mem::size_of::() == crate::mem::size_of::()); U::from_bits(self) } } /// `FromBits` and `IntoBits` are reflexive impl FromBits for T { #[inline] fn from_bits(t: Self) -> Self { t } } #[macro_use] mod macros; mod v16; pub use self::v16::*; mod v32; pub use self::v32::*; mod v64; pub use self::v64::*; mod v128; pub use self::v128::*; mod v256; pub use self::v256::*; mod v512; pub use self::v512::*; mod arch_specific; pub use self::arch_specific::*; ================================================ FILE: src/api/math/float/abs.rs ================================================ //! Implements vertical (lane-wise) floating-point `abs`. macro_rules! impl_math_float_abs { ([$elem_ty:ident; $elem_count:expr]: $id:ident | $test_tt:tt) => { impl $id { /// Absolute value. #[inline] pub fn abs(self) -> Self { use crate::codegen::math::float::abs::Abs; Abs::abs(self) } } test_if!{ $test_tt: paste::item! { pub mod [<$id _math_abs>] { use super::*; #[cfg_attr(not(target_arch = "wasm32"), test)] #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test)] fn abs() { let o = $id::splat(1 as $elem_ty); assert_eq!(o, o.abs()); let mo = $id::splat(-1 as $elem_ty); assert_eq!(o, mo.abs()); } } } } }; } ================================================ FILE: src/api/math/float/consts.rs ================================================ macro_rules! impl_float_consts { ([$elem_ty:ident; $elem_count:expr]: $id:ident) => { impl $id { /// Machine epsilon value. pub const EPSILON: $id = $id::splat(core::$elem_ty::EPSILON); /// Smallest finite value. pub const MIN: $id = $id::splat(core::$elem_ty::MIN); /// Smallest positive normal value. pub const MIN_POSITIVE: $id = $id::splat(core::$elem_ty::MIN_POSITIVE); /// Largest finite value. pub const MAX: $id = $id::splat(core::$elem_ty::MAX); /// Not a Number (NaN). 
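// A usage sketch of the splatted constants defined here together with the
// lane-wise `abs` implemented above and the crate's lane-wise `is_nan`,
// using the concrete `f32x4` instantiation:
//
//     use packed_simd::f32x4;
//     let v = f32x4::new(-1.0, 0.5, f32::NAN, -0.0);
//     assert_eq!(v.abs().extract(0), 1.0);                       // lane-wise absolute value
//     assert!(v.is_nan().extract(2));                            // lane-wise NaN test
//     assert_eq!(f32x4::PI.extract(3), core::f32::consts::PI);   // splatted constant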
pub const NAN: $id = $id::splat(core::$elem_ty::NAN); /// Infinity (∞). pub const INFINITY: $id = $id::splat(core::$elem_ty::INFINITY); /// Negative infinity (-∞). pub const NEG_INFINITY: $id = $id::splat(core::$elem_ty::NEG_INFINITY); /// Archimedes' constant (π) pub const PI: $id = $id::splat(core::$elem_ty::consts::PI); /// π/2 pub const FRAC_PI_2: $id = $id::splat(core::$elem_ty::consts::FRAC_PI_2); /// π/3 pub const FRAC_PI_3: $id = $id::splat(core::$elem_ty::consts::FRAC_PI_3); /// π/4 pub const FRAC_PI_4: $id = $id::splat(core::$elem_ty::consts::FRAC_PI_4); /// π/6 pub const FRAC_PI_6: $id = $id::splat(core::$elem_ty::consts::FRAC_PI_6); /// π/8 pub const FRAC_PI_8: $id = $id::splat(core::$elem_ty::consts::FRAC_PI_8); /// 1/π pub const FRAC_1_PI: $id = $id::splat(core::$elem_ty::consts::FRAC_1_PI); /// 2/π pub const FRAC_2_PI: $id = $id::splat(core::$elem_ty::consts::FRAC_2_PI); /// 2/sqrt(π) pub const FRAC_2_SQRT_PI: $id = $id::splat(core::$elem_ty::consts::FRAC_2_SQRT_PI); /// sqrt(2) pub const SQRT_2: $id = $id::splat(core::$elem_ty::consts::SQRT_2); /// 1/sqrt(2) pub const FRAC_1_SQRT_2: $id = $id::splat(core::$elem_ty::consts::FRAC_1_SQRT_2); /// Euler's number (e) pub const E: $id = $id::splat(core::$elem_ty::consts::E); /// log2(e) pub const LOG2_E: $id = $id::splat(core::$elem_ty::consts::LOG2_E); /// log10(e) pub const LOG10_E: $id = $id::splat(core::$elem_ty::consts::LOG10_E); /// ln(2) pub const LN_2: $id = $id::splat(core::$elem_ty::consts::LN_2); /// ln(10) pub const LN_10: $id = $id::splat(core::$elem_ty::consts::LN_10); } }; } ================================================ FILE: src/api/math/float/cos.rs ================================================ //! Implements vertical (lane-wise) floating-point `cos`. macro_rules! impl_math_float_cos { ([$elem_ty:ident; $elem_count:expr]: $id:ident | $test_tt:tt) => { impl $id { /// Cosine. #[inline] pub fn cos(self) -> Self { use crate::codegen::math::float::cos::Cos; Cos::cos(self) } /// Cosine of `self * PI`. #[inline] pub fn cos_pi(self) -> Self { use crate::codegen::math::float::cos_pi::CosPi; CosPi::cos_pi(self) } } test_if!{ $test_tt: paste::item! { pub mod [<$id _math_cos>] { use super::*; #[cfg_attr(not(target_arch = "wasm32"), test)] #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test)] fn cos() { use crate::$elem_ty::consts::PI; let z = $id::splat(0 as $elem_ty); let o = $id::splat(1 as $elem_ty); let p = $id::splat(PI as $elem_ty); let ph = $id::splat(PI as $elem_ty / 2.); let z_r = $id::splat((PI as $elem_ty / 2.).cos()); let o_r = $id::splat((PI as $elem_ty).cos()); assert_eq!(o, z.cos()); assert_eq!(z_r, ph.cos()); assert_eq!(o_r, p.cos()); } } } } }; } ================================================ FILE: src/api/math/float/exp.rs ================================================ //! Implements vertical (lane-wise) floating-point `exp`. macro_rules! impl_math_float_exp { ([$elem_ty:ident; $elem_count:expr]: $id:ident | $test_tt:tt) => { impl $id { /// Returns the exponential function of `self`: `e^(self)`. #[inline] pub fn exp(self) -> Self { use crate::codegen::math::float::exp::Exp; Exp::exp(self) } } test_if!{ $test_tt: paste::item! 
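// A usage sketch of the lane-wise `cos` and `exp` above, using the concrete
// `f32x4` instantiation; the tolerance is chosen loosely because `exp` is an
// approximation:
//
//     use packed_simd::f32x4;
//     assert_eq!(f32x4::splat(0.0).cos(), f32x4::splat(1.0));    // cos(0) == 1 exactly
//     let e = f32x4::splat(1.0).exp();                           // lane-wise e^x
//     let err = (e - f32x4::splat(core::f32::consts::E)).abs();
//     assert!(err.le(f32x4::splat(1e-3)).all());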
{ pub mod [<$id _math_exp>] { use super::*; #[cfg_attr(not(target_arch = "wasm32"), test)] #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test)] fn exp() { let z = $id::splat(0 as $elem_ty); let o = $id::splat(1 as $elem_ty); assert_eq!(o, z.exp()); let e = $id::splat(crate::f64::consts::E as $elem_ty); let tol = $id::splat(2.4e-4 as $elem_ty); assert!((e - o.exp()).abs().le(tol).all()); } } } } }; } ================================================ FILE: src/api/math/float/ln.rs ================================================ //! Implements vertical (lane-wise) floating-point `ln`. macro_rules! impl_math_float_ln { ([$elem_ty:ident; $elem_count:expr]: $id:ident | $test_tt:tt) => { impl $id { /// Returns the natural logarithm of `self`. #[inline] pub fn ln(self) -> Self { use crate::codegen::math::float::ln::Ln; Ln::ln(self) } } test_if!{ $test_tt: paste::item! { pub mod [<$id _math_ln>] { use super::*; #[cfg_attr(not(target_arch = "wasm32"), test)] #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test)] fn ln() { let z = $id::splat(0 as $elem_ty); let o = $id::splat(1 as $elem_ty); assert_eq!(z, o.ln()); let e = $id::splat(crate::f64::consts::E as $elem_ty); let tol = $id::splat(2.4e-4 as $elem_ty); assert!((o - e.ln()).abs().le(tol).all()); } } } } }; } ================================================ FILE: src/api/math/float/mul_add.rs ================================================ //! Implements vertical (lane-wise) floating-point `mul_add`. macro_rules! impl_math_float_mul_add { ([$elem_ty:ident; $elem_count:expr]: $id:ident | $test_tt:tt) => { impl $id { /// Fused multiply add: `self * y + z` #[inline] pub fn mul_add(self, y: Self, z: Self) -> Self { use crate::codegen::math::float::mul_add::MulAdd; MulAdd::mul_add(self, y, z) } } test_if!{ $test_tt: paste::item! { pub mod [<$id _math_mul_add>] { use super::*; #[cfg_attr(not(target_arch = "wasm32"), test)] #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test)] fn mul_add() { let z = $id::splat(0 as $elem_ty); let o = $id::splat(1 as $elem_ty); let t = $id::splat(2 as $elem_ty); let t3 = $id::splat(3 as $elem_ty); let f = $id::splat(4 as $elem_ty); assert_eq!(z, z.mul_add(z, z)); assert_eq!(o, o.mul_add(o, z)); assert_eq!(o, o.mul_add(z, o)); assert_eq!(o, z.mul_add(o, o)); assert_eq!(t, o.mul_add(o, o)); assert_eq!(t, o.mul_add(t, z)); assert_eq!(t, t.mul_add(o, z)); assert_eq!(f, t.mul_add(t, z)); assert_eq!(f, t.mul_add(o, t)); assert_eq!(t3, t.mul_add(o, o)); } } } } }; } ================================================ FILE: src/api/math/float/mul_adde.rs ================================================ //! Implements vertical (lane-wise) floating-point `mul_adde`. macro_rules! impl_math_float_mul_adde { ([$elem_ty:ident; $elem_count:expr]: $id:ident | $test_tt:tt) => { impl $id { /// Fused multiply add estimate: ~= `self * y + z` /// /// While fused multiply-add (`fma`) has infinite precision, /// `mul_adde` has _at worst_ the same precision of a multiply followed by an add. /// This might be more efficient on architectures that do not have an `fma` instruction. #[inline] pub fn mul_adde(self, y: Self, z: Self) -> Self { use crate::codegen::math::float::mul_adde::MulAddE; MulAddE::mul_adde(self, y, z) } } test_if!{ $test_tt: paste::item! 
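// A usage sketch of `mul_add` and `mul_adde`, using the concrete `f32x4`
// instantiation; for these small integer-valued inputs both are exact:
//
//     use packed_simd::f32x4;
//     let (a, b, c) = (f32x4::splat(2.0), f32x4::splat(3.0), f32x4::splat(1.0));
//     assert_eq!(a.mul_add(b, c), f32x4::splat(7.0));    // fused a * b + c
//     assert_eq!(a.mul_adde(b, c), f32x4::splat(7.0));   // estimate variant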
{ pub mod [<$id _math_mul_adde>] { use super::*; #[cfg_attr(not(target_arch = "wasm32"), test)] #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test)] fn mul_adde() { let z = $id::splat(0 as $elem_ty); let o = $id::splat(1 as $elem_ty); let t = $id::splat(2 as $elem_ty); let t3 = $id::splat(3 as $elem_ty); let f = $id::splat(4 as $elem_ty); assert_eq!(z, z.mul_adde(z, z)); assert_eq!(o, o.mul_adde(o, z)); assert_eq!(o, o.mul_adde(z, o)); assert_eq!(o, z.mul_adde(o, o)); assert_eq!(t, o.mul_adde(o, o)); assert_eq!(t, o.mul_adde(t, z)); assert_eq!(t, t.mul_adde(o, z)); assert_eq!(f, t.mul_adde(t, z)); assert_eq!(f, t.mul_adde(o, t)); assert_eq!(t3, t.mul_adde(o, o)); } } } } }; } ================================================ FILE: src/api/math/float/powf.rs ================================================ //! Implements vertical (lane-wise) floating-point `powf`. macro_rules! impl_math_float_powf { ([$elem_ty:ident; $elem_count:expr]: $id:ident | $test_tt:tt) => { impl $id { /// Raises `self` number to the floating point power of `x`. #[inline] pub fn powf(self, x: Self) -> Self { use crate::codegen::math::float::powf::Powf; Powf::powf(self, x) } } test_if!{ $test_tt: paste::item! { pub mod [<$id _math_powf>] { use super::*; #[cfg_attr(not(target_arch = "wasm32"), test)] #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test)] fn powf() { let z = $id::splat(0 as $elem_ty); let o = $id::splat(1 as $elem_ty); let t = $id::splat(2 as $elem_ty); assert_eq!(o, o.powf(z)); assert_eq!(o, t.powf(z)); assert_eq!(o, o.powf(o)); assert_eq!(t, t.powf(o)); let f = $id::splat(4 as $elem_ty); assert_eq!(f, t.powf(t)); } } } } }; } ================================================ FILE: src/api/math/float/recpre.rs ================================================ //! Implements vertical (lane-wise) floating-point `recpre`. macro_rules! impl_math_float_recpre { ([$elem_ty:ident; $elem_count:expr]: $id:ident | $test_tt:tt) => { impl $id { /// Reciprocal estimate: `~= 1. / self`. /// /// FIXME: The precision of the estimate is currently unspecified. #[inline] pub fn recpre(self) -> Self { $id::splat(1.) / self } } test_if!{ $test_tt: paste::item! { pub mod [<$id _math_recpre>] { use super::*; #[cfg_attr(not(target_arch = "wasm32"), test)] #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test)] fn recpre() { let tol = $id::splat(2.4e-4 as $elem_ty); let o = $id::splat(1 as $elem_ty); let error = (o - o.recpre()).abs(); assert!(error.le(tol).all()); let t = $id::splat(2 as $elem_ty); let e = 0.5; let error = (e - t.recpre()).abs(); assert!(error.le(tol).all()); } } } } }; } ================================================ FILE: src/api/math/float/rsqrte.rs ================================================ //! Implements vertical (lane-wise) floating-point `rsqrte`. macro_rules! impl_math_float_rsqrte { ([$elem_ty:ident; $elem_count:expr]: $id:ident | $test_tt:tt) => { impl $id { /// Reciprocal square-root estimate: `~= 1. / self.sqrt()`. /// /// FIXME: The precision of the estimate is currently unspecified. #[inline] pub fn rsqrte(self) -> Self { unsafe { use crate::llvm::simd_fsqrt; $id::splat(1.) / Simd(simd_fsqrt(self.0)) } } } test_if!{ $test_tt: paste::item! 
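// A usage sketch of the `recpre` and `rsqrte` estimates, using the concrete
// `f32x4` instantiation; since the estimate precision is unspecified, the
// assertions use a loose tolerance:
//
//     use packed_simd::f32x4;
//     let x = f32x4::splat(4.0);
//     let tol = f32x4::splat(1e-3);
//     assert!((x.recpre() - f32x4::splat(0.25)).abs().le(tol).all());  // ~ 1 / x
//     assert!((x.rsqrte() - f32x4::splat(0.5)).abs().le(tol).all());   // ~ 1 / sqrt(x)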
{ pub mod [<$id _math_rsqrte>] { use super::*; #[cfg_attr(not(target_arch = "wasm32"), test)] #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test)] fn rsqrte() { use crate::$elem_ty::consts::SQRT_2; let tol = $id::splat(2.4e-4 as $elem_ty); let o = $id::splat(1 as $elem_ty); let error = (o - o.rsqrte()).abs(); assert!(error.le(tol).all()); let t = $id::splat(2 as $elem_ty); let e = 1. / SQRT_2; let error = (e - t.rsqrte()).abs(); assert!(error.le(tol).all()); } } } } }; } ================================================ FILE: src/api/math/float/sin.rs ================================================ //! Implements vertical (lane-wise) floating-point `sin`. macro_rules! impl_math_float_sin { ([$elem_ty:ident; $elem_count:expr]: $id:ident | $test_tt:tt) => { impl $id { /// Sine. #[inline] pub fn sin(self) -> Self { use crate::codegen::math::float::sin::Sin; Sin::sin(self) } /// Sine of `self * PI`. #[inline] pub fn sin_pi(self) -> Self { use crate::codegen::math::float::sin_pi::SinPi; SinPi::sin_pi(self) } /// Sine and cosine of `self * PI`. #[inline] pub fn sin_cos_pi(self) -> (Self, Self) { use crate::codegen::math::float::sin_cos_pi::SinCosPi; SinCosPi::sin_cos_pi(self) } } test_if!{ $test_tt: paste::item! { pub mod [<$id _math_sin>] { use super::*; #[cfg_attr(not(target_arch = "wasm32"), test)] #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test)] fn sin() { use crate::$elem_ty::consts::PI; let z = $id::splat(0 as $elem_ty); let p = $id::splat(PI as $elem_ty); let ph = $id::splat(PI as $elem_ty / 2.); let o_r = $id::splat((PI as $elem_ty / 2.).sin()); let z_r = $id::splat((PI as $elem_ty).sin()); assert_eq!(z, z.sin()); assert_eq!(o_r, ph.sin()); assert_eq!(z_r, p.sin()); } } } } }; } ================================================ FILE: src/api/math/float/sqrt.rs ================================================ //! Implements vertical (lane-wise) floating-point `sqrt`. macro_rules! impl_math_float_sqrt { ([$elem_ty:ident; $elem_count:expr]: $id:ident | $test_tt:tt) => { impl $id { #[inline] pub fn sqrt(self) -> Self { use crate::codegen::math::float::sqrt::Sqrt; Sqrt::sqrt(self) } } test_if!{ $test_tt: paste::item! { pub mod [<$id _math_sqrt>] { use super::*; #[cfg_attr(not(target_arch = "wasm32"), test)] #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test)] fn sqrt() { use crate::$elem_ty::consts::SQRT_2; let z = $id::splat(0 as $elem_ty); let o = $id::splat(1 as $elem_ty); assert_eq!(z, z.sqrt()); assert_eq!(o, o.sqrt()); let t = $id::splat(2 as $elem_ty); let e = $id::splat(SQRT_2); assert_eq!(e, t.sqrt()); } } } } }; } ================================================ FILE: src/api/math/float/sqrte.rs ================================================ //! Implements vertical (lane-wise) floating-point `sqrte`. macro_rules! impl_math_float_sqrte { ([$elem_ty:ident; $elem_count:expr]: $id:ident | $test_tt:tt) => { impl $id { /// Square-root estimate. /// /// FIXME: The precision of the estimate is currently unspecified. #[inline] pub fn sqrte(self) -> Self { use crate::codegen::math::float::sqrte::Sqrte; Sqrte::sqrte(self) } } test_if!{ $test_tt: paste::item! 
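// A usage sketch contrasting the exact `sqrt` with the `sqrte` estimate,
// using the concrete `f32x4` instantiation and a loose tolerance since the
// estimate precision is unspecified:
//
//     use packed_simd::f32x4;
//     let x = f32x4::splat(9.0);
//     assert_eq!(x.sqrt(), f32x4::splat(3.0));            // exact square root
//     let err = (x.sqrte() - f32x4::splat(3.0)).abs();
//     assert!(err.le(f32x4::splat(1e-2)).all());          // square-root estimate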
{ pub mod [<$id _math_sqrte>] { use super::*; #[cfg_attr(not(target_arch = "wasm32"), test)] #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test)] fn sqrte() { use crate::$elem_ty::consts::SQRT_2; let tol = $id::splat(2.4e-4 as $elem_ty); let z = $id::splat(0 as $elem_ty); let error = (z - z.sqrte()).abs(); assert!(error.le(tol).all()); let o = $id::splat(1 as $elem_ty); let error = (o - o.sqrte()).abs(); assert!(error.le(tol).all()); let t = $id::splat(2 as $elem_ty); let e = $id::splat(SQRT_2 as $elem_ty); let error = (e - t.sqrte()).abs(); assert!(error.le(tol).all()); } } } } }; } ================================================ FILE: src/api/math/float/tanh.rs ================================================ //! Implements vertical (lane-wise) floating-point `tanh`. macro_rules! impl_math_float_tanh { ([$elem_ty:ident; $elem_count:expr]: $id:ident | $test_tt:tt) => { impl $id { /// Tanh. #[inline] pub fn tanh(self) -> Self { use crate::codegen::math::float::tanh::Tanh; Tanh::tanh(self) } } test_if!{ $test_tt: paste::item! { pub mod [<$id _math_tanh>] { use super::*; #[cfg_attr(not(target_arch = "wasm32"), test)] #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test)] fn tanh() { let z = $id::splat(0 as $elem_ty); assert_eq!(z, z.tanh()); } } } } }; } ================================================ FILE: src/api/math/float.rs ================================================ //! Implements vertical floating-point math operations. #[macro_use] mod abs; #[macro_use] mod consts; #[macro_use] mod cos; #[macro_use] mod exp; #[macro_use] mod powf; #[macro_use] mod ln; #[macro_use] mod mul_add; #[macro_use] mod mul_adde; #[macro_use] mod recpre; #[macro_use] mod rsqrte; #[macro_use] mod sin; #[macro_use] mod sqrt; #[macro_use] mod sqrte; #[macro_use] mod tanh; macro_rules! impl_float_category { ([$elem_ty:ident; $elem_count:expr]: $id:ident, $mask_ty:ident) => { impl $id { #[inline] pub fn is_nan(self) -> $mask_ty { self.ne(self) } #[inline] pub fn is_infinite(self) -> $mask_ty { self.eq(Self::INFINITY) | self.eq(Self::NEG_INFINITY) } #[inline] pub fn is_finite(self) -> $mask_ty { !(self.is_nan() | self.is_infinite()) } } }; } ================================================ FILE: src/api/math.rs ================================================ //! Implements vertical math operations #[macro_use] mod float; ================================================ FILE: src/api/minimal/iuf.rs ================================================ //! Minimal API of signed integer, unsigned integer, and floating-point //! vectors. macro_rules! impl_minimal_iuf { ([$elem_ty:ident; $elem_count:expr]: $id:ident | $ielem_ty:ident | $test_tt:tt | $($elem_name:ident),+ | $(#[$doc:meta])*) => { $(#[$doc])* pub type $id = Simd<[$elem_ty; $elem_count]>; impl sealed::Simd for $id { type Element = $elem_ty; const LANES: usize = $elem_count; type LanesType = [u32; $elem_count]; } impl $id { /// Creates a new instance with each vector elements initialized /// with the provided values. #[inline] #[allow(clippy::too_many_arguments)] pub const fn new($($elem_name: $elem_ty),*) -> Self { Simd(codegen::$id($($elem_name as $ielem_ty),*)) } /// Returns the number of vector lanes. #[inline] pub const fn lanes() -> usize { $elem_count } /// Constructs a new instance with each element initialized to /// `value`. #[inline] pub const fn splat(value: $elem_ty) -> Self { Simd(codegen::$id($({ #[allow(non_camel_case_types, dead_code)] struct $elem_name; value as $ielem_ty }),*)) } /// Extracts the value at `index`. 
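// A usage sketch of the minimal API (`new`, `lanes`, `extract`, `replace`),
// using the concrete `i32x4` instantiation:
//
//     use packed_simd::i32x4;
//     let v = i32x4::new(0, 1, 2, 3);
//     assert_eq!(i32x4::lanes(), 4);
//     assert_eq!(v.extract(2), 2);        // panics if the index is out of bounds
//     let w = v.replace(0, 42);           // returns a new vector
//     assert_eq!(w.extract(0), 42);
//     assert_eq!(v.extract(0), 0);        // the original vector is unchanged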
/// /// # Panics /// /// If `index >= Self::lanes()`. #[inline] pub fn extract(self, index: usize) -> $elem_ty { assert!(index < $elem_count); unsafe { self.extract_unchecked(index) } } /// Extracts the value at `index`. /// /// # Safety /// /// If `index >= Self::lanes()` the behavior is undefined. #[inline] pub unsafe fn extract_unchecked(self, index: usize) -> $elem_ty { use crate::llvm::simd_extract; let e: $ielem_ty = simd_extract(self.0, index as u32); e as $elem_ty } /// Returns a new vector where the value at `index` is replaced by `new_value`. /// /// # Panics /// /// If `index >= Self::lanes()`. #[inline] #[must_use = "replace does not modify the original value - \ it returns a new vector with the value at `index` \ replaced by `new_value`d" ] pub fn replace(self, index: usize, new_value: $elem_ty) -> Self { assert!(index < $elem_count); unsafe { self.replace_unchecked(index, new_value) } } /// Returns a new vector where the value at `index` is replaced by `new_value`. /// /// # Safety /// /// If `index >= Self::lanes()` the behavior is undefined. #[inline] #[must_use = "replace_unchecked does not modify the original value - \ it returns a new vector with the value at `index` \ replaced by `new_value`d" ] pub unsafe fn replace_unchecked( self, index: usize, new_value: $elem_ty, ) -> Self { use crate::llvm::simd_insert; Simd(simd_insert(self.0, index as u32, new_value as $ielem_ty)) } } test_if!{ $test_tt: paste::item! { // Comparisons use integer casts within mantissa^1 range. #[allow(clippy::float_cmp)] pub mod [<$id _minimal>] { use super::*; #[cfg_attr(not(target_arch = "wasm32"), test)] #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test)] fn minimal() { // lanes: assert_eq!($elem_count, $id::lanes()); // splat and extract / extract_unchecked: const VAL: $elem_ty = 7 as $elem_ty; const VEC: $id = $id::splat(VAL); for i in 0..$id::lanes() { assert_eq!(VAL, VEC.extract(i)); assert_eq!( VAL, unsafe { VEC.extract_unchecked(i) } ); } // replace / replace_unchecked let new_vec = VEC.replace(0, 42 as $elem_ty); for i in 0..$id::lanes() { if i == 0 { assert_eq!(42 as $elem_ty, new_vec.extract(i)); } else { assert_eq!(VAL, new_vec.extract(i)); } } let new_vec = unsafe { VEC.replace_unchecked(0, 42 as $elem_ty) }; for i in 0..$id::lanes() { if i == 0 { assert_eq!(42 as $elem_ty, new_vec.extract(i)); } else { assert_eq!(VAL, new_vec.extract(i)); } } } // FIXME: wasm-bindgen-test does not support #[should_panic] // #[cfg_attr(not(target_arch = "wasm32"), test)] #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test)] #[cfg(not(target_arch = "wasm32"))] #[test] #[should_panic] fn extract_panic_oob() { const VAL: $elem_ty = 7 as $elem_ty; const VEC: $id = $id::splat(VAL); let _ = VEC.extract($id::lanes()); } // FIXME: wasm-bindgen-test does not support #[should_panic] // #[cfg_attr(not(target_arch = "wasm32"), test)] #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test)] #[cfg(not(target_arch = "wasm32"))] #[test] #[should_panic] fn replace_panic_oob() { const VAL: $elem_ty = 7 as $elem_ty; const VEC: $id = $id::splat(VAL); let _ = VEC.replace($id::lanes(), 42 as $elem_ty); } } } } } } ================================================ FILE: src/api/minimal/mask.rs ================================================ //! Minimal API of mask vectors. macro_rules! 
impl_minimal_mask { ([$elem_ty:ident; $elem_count:expr]: $id:ident | $ielem_ty:ident | $test_tt:tt | $($elem_name:ident),+ | $(#[$doc:meta])*) => { $(#[$doc])* pub type $id = Simd<[$elem_ty; $elem_count]>; impl sealed::Simd for $id { type Element = $elem_ty; const LANES: usize = $elem_count; type LanesType = [u32; $elem_count]; } impl $id { /// Creates a new instance with each vector elements initialized /// with the provided values. #[inline] #[allow(clippy::too_many_arguments)] pub const fn new($($elem_name: bool),*) -> Self { Simd(codegen::$id($(Self::bool_to_internal($elem_name)),*)) } /// Converts a boolean type into the type of the vector lanes. #[inline] #[allow(clippy::indexing_slicing)] const fn bool_to_internal(x: bool) -> $ielem_ty { [0 as $ielem_ty, !(0 as $ielem_ty)][x as usize] } /// Returns the number of vector lanes. #[inline] pub const fn lanes() -> usize { $elem_count } /// Constructs a new instance with each element initialized to /// `value`. #[inline] pub const fn splat(value: bool) -> Self { Simd(codegen::$id($({ #[allow(non_camel_case_types, dead_code)] struct $elem_name; Self::bool_to_internal(value) }),*)) } /// Extracts the value at `index`. /// /// # Panics /// /// If `index >= Self::lanes()`. #[inline] pub fn extract(self, index: usize) -> bool { assert!(index < $elem_count); unsafe { self.extract_unchecked(index) } } /// Extracts the value at `index`. /// /// # Safety /// /// If `index >= Self::lanes()` the behavior is undefined. #[inline] pub unsafe fn extract_unchecked(self, index: usize) -> bool { use crate::llvm::simd_extract; let x: $ielem_ty = simd_extract(self.0, index as u32); x != 0 } /// Returns a new vector where the value at `index` is replaced by /// `new_value`. /// /// # Panics /// /// If `index >= Self::lanes()`. #[inline] #[must_use = "replace does not modify the original value - \ it returns a new vector with the value at `index` \ replaced by `new_value`d" ] pub fn replace(self, index: usize, new_value: bool) -> Self { assert!(index < $elem_count); unsafe { self.replace_unchecked(index, new_value) } } /// Returns a new vector where the value at `index` is replaced by /// `new_value`. /// /// # Safety /// /// If `index >= Self::lanes()` the behavior is undefined. #[inline] #[must_use = "replace_unchecked does not modify the original value - \ it returns a new vector with the value at `index` \ replaced by `new_value`d" ] pub unsafe fn replace_unchecked( self, index: usize, new_value: bool, ) -> Self { use crate::llvm::simd_insert; Simd(simd_insert(self.0, index as u32, Self::bool_to_internal(new_value))) } } test_if!{ $test_tt: paste::item! 
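// A usage sketch of the minimal mask API, using the concrete `m32x4`
// instantiation:
//
//     use packed_simd::m32x4;
//     let m = m32x4::new(true, false, true, false);
//     assert!(!m.extract(1));
//     let m = m.replace(1, true);         // returns a new mask
//     assert!(m.extract(1));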
{ pub mod [<$id _minimal>] { use super::*; #[cfg_attr(not(target_arch = "wasm32"), test)] #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test)] fn minimal() { // TODO: test new // lanes: assert_eq!($elem_count, $id::lanes()); // splat and extract / extract_unchecked: let vec = $id::splat(true); for i in 0..$id::lanes() { assert_eq!(true, vec.extract(i)); assert_eq!(true, unsafe { vec.extract_unchecked(i) } ); } // replace / replace_unchecked let new_vec = vec.replace(0, false); for i in 0..$id::lanes() { if i == 0 { assert_eq!(false, new_vec.extract(i)); } else { assert_eq!(true, new_vec.extract(i)); } } let new_vec = unsafe { vec.replace_unchecked(0, false) }; for i in 0..$id::lanes() { if i == 0 { assert_eq!(false, new_vec.extract(i)); } else { assert_eq!(true, new_vec.extract(i)); } } } // FIXME: wasm-bindgen-test does not support #[should_panic] // #[cfg_attr(not(target_arch = "wasm32"), test)] // #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test)] #[cfg(not(target_arch = "wasm32"))] #[test] #[should_panic] fn extract_panic_oob() { let vec = $id::splat(false); let _ = vec.extract($id::lanes()); } // FIXME: wasm-bindgen-test does not support #[should_panic] // #[cfg_attr(not(target_arch = "wasm32"), test)] // #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test)] #[cfg(not(target_arch = "wasm32"))] #[test] #[should_panic] fn replace_panic_oob() { let vec = $id::splat(false); let _ = vec.replace($id::lanes(), true); } } } } } } ================================================ FILE: src/api/minimal/ptr.rs ================================================ //! Minimal API of pointer vectors. macro_rules! impl_minimal_p { ([$elem_ty:ty; $elem_count:expr]: $id:ident, $mask_ty:ident, $usize_ty:ident, $isize_ty:ident | $ref:ident | $test_tt:tt | $($elem_name:ident),+ | ($true:expr, $false:expr) | $(#[$doc:meta])*) => { $(#[$doc])* pub type $id = Simd<[$elem_ty; $elem_count]>; impl sealed::Simd for $id { type Element = $elem_ty; const LANES: usize = $elem_count; type LanesType = [u32; $elem_count]; } impl $id { /// Creates a new instance with each vector elements initialized /// with the provided values. #[inline] #[allow(clippy::too_many_arguments)] pub const fn new($($elem_name: $elem_ty),*) -> Self { Simd(codegen::$id($($elem_name),*)) } /// Returns the number of vector lanes. #[inline] pub const fn lanes() -> usize { $elem_count } /// Constructs a new instance with each element initialized to /// `value`. #[inline] pub const fn splat(value: $elem_ty) -> Self { Simd(codegen::$id($({ #[allow(non_camel_case_types, dead_code)] struct $elem_name; value }),*)) } /// Constructs a new instance with each element initialized to /// `null`. #[inline] pub const fn null() -> Self { Self::splat(crate::ptr::null_mut() as $elem_ty) } /// Returns a mask that selects those lanes that contain `null` /// pointers. #[inline] pub fn is_null(self) -> $mask_ty { self.eq(Self::null()) } /// Extracts the value at `index`. /// /// # Panics /// /// If `index >= Self::lanes()`. #[inline] pub fn extract(self, index: usize) -> $elem_ty { assert!(index < $elem_count); unsafe { self.extract_unchecked(index) } } /// Extracts the value at `index`. /// /// # Safety /// /// If `index >= Self::lanes()` the behavior is undefined. #[inline] pub unsafe fn extract_unchecked(self, index: usize) -> $elem_ty { use crate::llvm::simd_extract; simd_extract(self.0, index as u32) } /// Returns a new vector where the value at `index` is replaced by /// `new_value`. /// /// # Panics /// /// If `index >= Self::lanes()`. 
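// A usage sketch of the pointer-vector API (`splat`, `is_null`, `replace`),
// assuming the crate's `cptrx4<T>` alias for vectors of four `*const T`:
//
//     use packed_simd::cptrx4;
//     let x = 7_i32;
//     let v = cptrx4::<i32>::splat(&x as *const i32);
//     assert!(!v.is_null().any());
//     let v = v.replace(0, core::ptr::null());
//     assert!(v.is_null().any() && !v.is_null().all());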
#[inline] #[must_use = "replace does not modify the original value - \ it returns a new vector with the value at `index` \ replaced by `new_value`d" ] #[allow(clippy::not_unsafe_ptr_arg_deref)] pub fn replace(self, index: usize, new_value: $elem_ty) -> Self { assert!(index < $elem_count); unsafe { self.replace_unchecked(index, new_value) } } /// Returns a new vector where the value at `index` is replaced by `new_value`. /// /// # Safety /// /// If `index >= Self::lanes()` the behavior is undefined. #[inline] #[must_use = "replace_unchecked does not modify the original value - \ it returns a new vector with the value at `index` \ replaced by `new_value`d" ] pub unsafe fn replace_unchecked( self, index: usize, new_value: $elem_ty, ) -> Self { use crate::llvm::simd_insert; Simd(simd_insert(self.0, index as u32, new_value)) } } test_if!{ $test_tt: paste::item! { pub mod [<$id _minimal>] { use super::*; #[cfg_attr(not(target_arch = "wasm32"), test)] #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test)] fn minimal() { // lanes: assert_eq!($elem_count, $id::::lanes()); // splat and extract / extract_unchecked: let VAL7: <$id as sealed::Simd>::Element = $ref!(7); let VAL42: <$id as sealed::Simd>::Element = $ref!(42); let VEC: $id = $id::splat(VAL7); for i in 0..$id::::lanes() { assert_eq!(VAL7, VEC.extract(i)); assert_eq!( VAL7, unsafe { VEC.extract_unchecked(i) } ); } // replace / replace_unchecked let new_vec = VEC.replace(0, VAL42); for i in 0..$id::::lanes() { if i == 0 { assert_eq!(VAL42, new_vec.extract(i)); } else { assert_eq!(VAL7, new_vec.extract(i)); } } let new_vec = unsafe { VEC.replace_unchecked(0, VAL42) }; for i in 0..$id::::lanes() { if i == 0 { assert_eq!(VAL42, new_vec.extract(i)); } else { assert_eq!(VAL7, new_vec.extract(i)); } } let mut n = $id::::null(); assert_eq!( n, $id::::splat(unsafe { crate::mem::zeroed() }) ); assert!(n.is_null().all()); n = n.replace( 0, unsafe { crate::mem::transmute(1_isize) } ); assert!(!n.is_null().all()); if $id::::lanes() > 1 { assert!(n.is_null().any()); } else { assert!(!n.is_null().any()); } } // FIXME: wasm-bindgen-test does not support #[should_panic] // #[cfg_attr(not(target_arch = "wasm32"), test)] // #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test)] #[cfg(not(target_arch = "wasm32"))] #[test] #[should_panic] fn extract_panic_oob() { let VAL: <$id as sealed::Simd>::Element = $ref!(7); let VEC: $id = $id::splat(VAL); let _ = VEC.extract($id::::lanes()); } // FIXME: wasm-bindgen-test does not support #[should_panic] // #[cfg_attr(not(target_arch = "wasm32"), test)] // #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test)] #[cfg(not(target_arch = "wasm32"))] #[test] #[should_panic] fn replace_panic_oob() { let VAL: <$id as sealed::Simd>::Element = $ref!(7); let VAL42: <$id as sealed::Simd>::Element = $ref!(42); let VEC: $id = $id::splat(VAL); let _ = VEC.replace($id::::lanes(), VAL42); } } } } impl crate::fmt::Debug for $id { #[allow(clippy::missing_inline_in_public_items)] fn fmt(&self, f: &mut crate::fmt::Formatter<'_>) -> crate::fmt::Result { write!( f, "{}<{}>(", stringify!($id), crate::intrinsics::type_name::() )?; for i in 0..$elem_count { if i > 0 { write!(f, ", ")?; } self.extract(i).fmt(f)?; } write!(f, ")") } } test_if!{ $test_tt: paste::item! 
{ pub mod [<$id _fmt_debug>] { use super::*; #[cfg_attr(not(target_arch = "wasm32"), test)] #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test)] fn debug() { use arrayvec::{ArrayString,ArrayVec}; type TinyString = ArrayString<[u8; 512]>; use crate::fmt::Write; let v = $id::::default(); let mut s = TinyString::new(); write!(&mut s, "{:?}", v).unwrap(); let mut beg = TinyString::new(); write!(&mut beg, "{}(", stringify!($id)).unwrap(); assert!( s.starts_with(beg.as_str()), "s = {} (should start with = {})", s, beg ); assert!(s.ends_with(")")); let s: ArrayVec<[TinyString; 64]> = s.replace(beg.as_str(), "") .replace(")", "").split(",") .map(|v| TinyString::from(v.trim()).unwrap()) .collect(); assert_eq!(s.len(), $id::::lanes()); for (index, ss) in s.into_iter().enumerate() { let mut e = TinyString::new(); write!(&mut e, "{:?}", v.extract(index)).unwrap(); assert_eq!(ss, e); } } } } } impl Default for $id { #[inline] fn default() -> Self { // FIXME: ptrs do not implement default Self::null() } } test_if!{ $test_tt: paste::item! { pub mod [<$id _default>] { use super::*; #[cfg_attr(not(target_arch = "wasm32"), test)] #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test)] fn default() { let a = $id::::default(); for i in 0..$id::::lanes() { assert_eq!( a.extract(i), unsafe { crate::mem::zeroed() } ); } } } } } impl $id { /// Lane-wise equality comparison. #[inline] pub fn eq(self, other: Self) -> $mask_ty { unsafe { use crate::llvm::simd_eq; let a: $usize_ty = crate::mem::transmute(self); let b: $usize_ty = crate::mem::transmute(other); Simd(simd_eq(a.0, b.0)) } } /// Lane-wise inequality comparison. #[inline] pub fn ne(self, other: Self) -> $mask_ty { unsafe { use crate::llvm::simd_ne; let a: $usize_ty = crate::mem::transmute(self); let b: $usize_ty = crate::mem::transmute(other); Simd(simd_ne(a.0, b.0)) } } /// Lane-wise less-than comparison. #[inline] pub fn lt(self, other: Self) -> $mask_ty { unsafe { use crate::llvm::simd_lt; let a: $usize_ty = crate::mem::transmute(self); let b: $usize_ty = crate::mem::transmute(other); Simd(simd_lt(a.0, b.0)) } } /// Lane-wise less-than-or-equals comparison. #[inline] pub fn le(self, other: Self) -> $mask_ty { unsafe { use crate::llvm::simd_le; let a: $usize_ty = crate::mem::transmute(self); let b: $usize_ty = crate::mem::transmute(other); Simd(simd_le(a.0, b.0)) } } /// Lane-wise greater-than comparison. #[inline] pub fn gt(self, other: Self) -> $mask_ty { unsafe { use crate::llvm::simd_gt; let a: $usize_ty = crate::mem::transmute(self); let b: $usize_ty = crate::mem::transmute(other); Simd(simd_gt(a.0, b.0)) } } /// Lane-wise greater-than-or-equals comparison. #[inline] pub fn ge(self, other: Self) -> $mask_ty { unsafe { use crate::llvm::simd_ge; let a: $usize_ty = crate::mem::transmute(self); let b: $usize_ty = crate::mem::transmute(other); Simd(simd_ge(a.0, b.0)) } } } test_if!{ $test_tt: paste::item! 
{ pub mod [<$id _cmp_vertical>] { use super::*; #[cfg_attr(not(target_arch = "wasm32"), test)] #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test)] fn cmp() { let a = $id::::null(); let b = $id::::splat(unsafe { crate::mem::transmute(1_isize) }); let r = a.lt(b); let e = $mask_ty::splat(true); assert!(r == e); let r = a.le(b); assert!(r == e); let e = $mask_ty::splat(false); let r = a.gt(b); assert!(r == e); let r = a.ge(b); assert!(r == e); let r = a.eq(b); assert!(r == e); let mut a = a; let mut b = b; let mut e = e; for i in 0..$id::::lanes() { if i % 2 == 0 { a = a.replace( i, unsafe { crate::mem::transmute(0_isize) } ); b = b.replace( i, unsafe { crate::mem::transmute(1_isize) } ); e = e.replace(i, true); } else { a = a.replace( i, unsafe { crate::mem::transmute(1_isize) } ); b = b.replace( i, unsafe { crate::mem::transmute(0_isize) } ); e = e.replace(i, false); } } let r = a.lt(b); assert!(r == e); } } } } #[allow(clippy::partialeq_ne_impl)] impl crate::cmp::PartialEq<$id> for $id { #[inline] fn eq(&self, other: &Self) -> bool { $id::::eq(*self, *other).all() } #[inline] fn ne(&self, other: &Self) -> bool { $id::::ne(*self, *other).any() } } // FIXME: https://github.com/rust-lang-nursery/rust-clippy/issues/2892 #[allow(clippy::partialeq_ne_impl)] impl crate::cmp::PartialEq>> for LexicographicallyOrdered<$id> { #[inline] fn eq(&self, other: &Self) -> bool { self.0 == other.0 } #[inline] fn ne(&self, other: &Self) -> bool { self.0 != other.0 } } test_if!{ $test_tt: paste::item! { pub mod [<$id _cmp_PartialEq>] { use super::*; #[cfg_attr(not(target_arch = "wasm32"), test)] #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test)] fn partial_eq() { let a = $id::::null(); let b = $id::::splat(unsafe { crate::mem::transmute(1_isize) }); assert!(a != b); assert!(!(a == b)); assert!(a == a); assert!(!(a != a)); if $id::::lanes() > 1 { let a = $id::::null().replace(0, unsafe { crate::mem::transmute(1_isize) }); let b = $id::::splat(unsafe { crate::mem::transmute(1_isize) }); assert!(a != b); assert!(!(a == b)); assert!(a == a); assert!(!(a != a)); } } } } } impl crate::cmp::Eq for $id {} impl crate::cmp::Eq for LexicographicallyOrdered<$id> {} test_if!{ $test_tt: paste::item! { pub mod [<$id _cmp_eq>] { use super::*; #[cfg_attr(not(target_arch = "wasm32"), test)] #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test)] fn eq() { fn foo(_: E) {} let a = $id::::null(); foo(a); } } } } impl From<[$elem_ty; $elem_count]> for $id { #[inline] fn from(array: [$elem_ty; $elem_count]) -> Self { unsafe { // FIXME: unnecessary zeroing; better than UB. let mut u: Self = crate::mem::zeroed(); crate::ptr::copy_nonoverlapping( &array as *const [$elem_ty; $elem_count] as *const u8, &mut u as *mut Self as *mut u8, crate::mem::size_of::() ); u } } } impl Into<[$elem_ty; $elem_count]> for $id { #[inline] fn into(self) -> [$elem_ty; $elem_count] { unsafe { // FIXME: unnecessary zeroing; better than UB. let mut u: [$elem_ty; $elem_count] = crate::mem::zeroed(); crate::ptr::copy_nonoverlapping( &self as *const $id as *const u8, &mut u as *mut [$elem_ty; $elem_count] as *mut u8, crate::mem::size_of::() ); u } } } test_if!{ $test_tt: paste::item! 
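// A usage sketch of the array conversions implemented above, assuming the
// crate's `mptrx2<T>` alias for vectors of two `*mut T`:
//
//     use packed_simd::mptrx2;
//     let (mut a, mut b) = (1_i32, 2_i32);
//     let arr = [&mut a as *mut i32, &mut b as *mut i32];
//     let v = mptrx2::<i32>::from(arr);          // array -> vector
//     let back: [*mut i32; 2] = v.into();        // vector -> array
//     assert_eq!(back, arr);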
{ pub mod [<$id _from>] { use super::*; #[cfg_attr(not(target_arch = "wasm32"), test)] #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test)] fn array() { let values = [1_i32; $elem_count]; let mut vec: $id = Default::default(); let mut array = [ $id::::null().extract(0); $elem_count ]; for i in 0..$elem_count { let ptr = &values[i] as *const i32 as *mut i32; vec = vec.replace(i, ptr); array[i] = ptr; } // FIXME: there is no impl of From<$id> for [$elem_ty; N] // let a0 = From::from(vec); // assert_eq!(a0, array); #[allow(unused_assignments)] let mut a1 = array; a1 = vec.into(); assert_eq!(a1, array); let v0: $id = From::from(array); assert_eq!(v0, vec); let v1: $id = array.into(); assert_eq!(v1, vec); } } } } impl $id { /// Instantiates a new vector with the values of the `slice`. /// /// # Panics /// /// If `slice.len() < Self::lanes()` or `&slice[0]` is not aligned /// to an `align_of::()` boundary. #[inline] pub fn from_slice_aligned(slice: &[$elem_ty]) -> Self { unsafe { assert!(slice.len() >= $elem_count); let target_ptr = slice.as_ptr(); assert!( target_ptr.align_offset(crate::mem::align_of::()) == 0 ); Self::from_slice_aligned_unchecked(slice) } } /// Instantiates a new vector with the values of the `slice`. /// /// # Panics /// /// If `slice.len() < Self::lanes()`. #[inline] pub fn from_slice_unaligned(slice: &[$elem_ty]) -> Self { unsafe { assert!(slice.len() >= $elem_count); Self::from_slice_unaligned_unchecked(slice) } } /// Instantiates a new vector with the values of the `slice`. /// /// # Safety /// /// If `slice.len() < Self::lanes()` or `&slice[0]` is not aligned /// to an `align_of::()` boundary, the behavior is undefined. #[inline] pub unsafe fn from_slice_aligned_unchecked(slice: &[$elem_ty]) -> Self { #[allow(clippy::cast_ptr_alignment)] *(slice.as_ptr().cast()) } /// Instantiates a new vector with the values of the `slice`. /// /// # Safety /// /// If `slice.len() < Self::lanes()` the behavior is undefined. #[inline] pub unsafe fn from_slice_unaligned_unchecked( slice: &[$elem_ty], ) -> Self { use crate::mem::size_of; let target_ptr = slice.as_ptr().cast(); let mut x = Self::splat(crate::ptr::null_mut() as $elem_ty); let self_ptr = &mut x as *mut Self as *mut u8; crate::ptr::copy_nonoverlapping( target_ptr, self_ptr, size_of::(), ); x } } test_if!{ $test_tt: paste::item! { pub mod [<$id _slice_from_slice>] { use super::*; use crate::iter::Iterator; #[cfg_attr(not(target_arch = "wasm32"), test)] #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test)] fn from_slice_unaligned() { let (null, non_null) = ptr_vals!($id); let mut unaligned = [ non_null; $id::::lanes() + 1 ]; unaligned[0] = null; let vec = $id::::from_slice_unaligned( &unaligned[1..] ); for (index, &b) in unaligned.iter().enumerate() { if index == 0 { assert_eq!(b, null); } else { assert_eq!(b, non_null); assert_eq!(b, vec.extract(index - 1)); } } } // FIXME: wasm-bindgen-test does not support #[should_panic] // #[cfg_attr(not(target_arch = "wasm32"), test)] // #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test)] #[cfg(not(target_arch = "wasm32"))] #[test] #[should_panic] fn from_slice_unaligned_fail() { let (_null, non_null) = ptr_vals!($id); let unaligned = [non_null; $id::::lanes() + 1]; // the slice is not large enough => panic let _vec = $id::::from_slice_unaligned( &unaligned[2..] 
); } union A { data: [<$id as sealed::Simd>::Element; 2 * $id::::lanes()], _vec: $id, } #[cfg_attr(not(target_arch = "wasm32"), test)] #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test)] fn from_slice_aligned() { let (null, non_null) = ptr_vals!($id); let mut aligned = A { data: [null; 2 * $id::::lanes()], }; for i in $id::::lanes()..(2 * $id::::lanes()) { unsafe { aligned.data[i] = non_null; } } let vec = unsafe { $id::::from_slice_aligned( &aligned.data[$id::::lanes()..] ) }; for (index, &b) in unsafe { aligned.data.iter().enumerate() } { if index < $id::::lanes() { assert_eq!(b, null); } else { assert_eq!(b, non_null); assert_eq!( b, vec.extract(index - $id::::lanes()) ); } } } // FIXME: wasm-bindgen-test does not support #[should_panic] // #[cfg_attr(not(target_arch = "wasm32"), test)] // #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test)] #[cfg(not(target_arch = "wasm32"))] #[test] #[should_panic] fn from_slice_aligned_fail_lanes() { let (_null, non_null) = ptr_vals!($id); let aligned = A { data: [non_null; 2 * $id::::lanes()], }; // the slice is not large enough => panic let _vec = unsafe { $id::::from_slice_aligned( &aligned.data[2 * $id::::lanes()..] ) }; } // FIXME: wasm-bindgen-test does not support #[should_panic] // #[cfg_attr(not(target_arch = "wasm32"), test)] // #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test)] #[cfg(not(target_arch = "wasm32"))] #[test] #[should_panic] fn from_slice_aligned_fail_align() { unsafe { let (null, _non_null) = ptr_vals!($id); let aligned = A { data: [null; 2 * $id::::lanes()], }; // get a pointer to the front of data let ptr = aligned.data.as_ptr(); // offset pointer by one element let ptr = ptr.wrapping_add(1); if ptr.align_offset( crate::mem::align_of::<$id>() ) == 0 { // the pointer is properly aligned, so // from_slice_aligned won't fail here (e.g. this // can happen for i128x1). So we panic to make // the "should_fail" test pass: panic!("ok"); } // create a slice - this is safe, because the // elements of the slice exist, are properly // initialized, and properly aligned: let s = slice::from_raw_parts( ptr, $id::::lanes() ); // this should always panic because the slice // alignment does not match the alignment // requirements for the vector type: let _vec = $id::::from_slice_aligned(s); } } } } } impl $id { /// Writes the values of the vector to the `slice`. /// /// # Panics /// /// If `slice.len() < Self::lanes()` or `&slice[0]` is not /// aligned to an `align_of::()` boundary. #[inline] pub fn write_to_slice_aligned(self, slice: &mut [$elem_ty]) { unsafe { assert!(slice.len() >= $elem_count); let target_ptr = slice.as_mut_ptr(); assert!( target_ptr.align_offset(crate::mem::align_of::()) == 0 ); self.write_to_slice_aligned_unchecked(slice); } } /// Writes the values of the vector to the `slice`. /// /// # Panics /// /// If `slice.len() < Self::lanes()`. #[inline] pub fn write_to_slice_unaligned(self, slice: &mut [$elem_ty]) { unsafe { assert!(slice.len() >= $elem_count); self.write_to_slice_unaligned_unchecked(slice); } } /// Writes the values of the vector to the `slice`. /// /// # Safety /// /// If `slice.len() < Self::lanes()` or `&slice[0]` is not /// aligned to an `align_of::()` boundary, the behavior is /// undefined. #[inline] pub unsafe fn write_to_slice_aligned_unchecked( self, slice: &mut [$elem_ty], ) { #[allow(clippy::cast_ptr_alignment)] *(slice.as_mut_ptr().cast()) = self; } /// Writes the values of the vector to the `slice`. /// /// # Safety /// /// If `slice.len() < Self::lanes()` the behavior is undefined. 
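// The same `from_slice_*` / `write_to_slice_*` API is implemented for the
// portable vectors as well; a sketch using the concrete `f32x4` type:
//
//     use packed_simd::f32x4;
//     let src = [1.0_f32, 2.0, 3.0, 4.0];
//     let v = f32x4::from_slice_unaligned(&src);   // panics if src.len() < 4
//     let mut dst = [0.0_f32; 4];
//     v.write_to_slice_unaligned(&mut dst);
//     assert_eq!(dst, src);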
#[inline] pub unsafe fn write_to_slice_unaligned_unchecked( self, slice: &mut [$elem_ty], ) { let target_ptr = slice.as_mut_ptr().cast(); let self_ptr = &self as *const Self as *const u8; crate::ptr::copy_nonoverlapping( self_ptr, target_ptr, crate::mem::size_of::(), ); } } test_if!{ $test_tt: paste::item! { pub mod [<$id _slice_write_to_slice>] { use super::*; use crate::iter::Iterator; #[cfg_attr(not(target_arch = "wasm32"), test)] #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test)] fn write_to_slice_unaligned() { let (null, non_null) = ptr_vals!($id); let mut unaligned = [null; $id::::lanes() + 1]; let vec = $id::::splat(non_null); vec.write_to_slice_unaligned(&mut unaligned[1..]); for (index, &b) in unaligned.iter().enumerate() { if index == 0 { assert_eq!(b, null); } else { assert_eq!(b, non_null); assert_eq!(b, vec.extract(index - 1)); } } } // FIXME: wasm-bindgen-test does not support #[should_panic] // #[cfg_attr(not(target_arch = "wasm32"), test)] // #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test)] #[cfg(not(target_arch = "wasm32"))] #[test] #[should_panic] fn write_to_slice_unaligned_fail() { let (null, non_null) = ptr_vals!($id); let mut unaligned = [null; $id::::lanes() + 1]; let vec = $id::::splat(non_null); // the slice is not large enough => panic vec.write_to_slice_unaligned(&mut unaligned[2..]); } union A { data: [<$id as sealed::Simd>::Element; 2 * $id::::lanes()], _vec: $id, } #[cfg_attr(not(target_arch = "wasm32"), test)] #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test)] fn write_to_slice_aligned() { let (null, non_null) = ptr_vals!($id); let mut aligned = A { data: [null; 2 * $id::::lanes()], }; let vec = $id::::splat(non_null); unsafe { vec.write_to_slice_aligned( &mut aligned.data[$id::::lanes()..] ) }; for (index, &b) in unsafe { aligned.data.iter().enumerate() } { if index < $id::::lanes() { assert_eq!(b, null); } else { assert_eq!(b, non_null); assert_eq!( b, vec.extract(index - $id::::lanes()) ); } } } // FIXME: wasm-bindgen-test does not support #[should_panic] // #[cfg_attr(not(target_arch = "wasm32"), test)] // #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test)] #[cfg(not(target_arch = "wasm32"))] #[test] #[should_panic] fn write_to_slice_aligned_fail_lanes() { let (null, non_null) = ptr_vals!($id); let mut aligned = A { data: [null; 2 * $id::::lanes()], }; let vec = $id::::splat(non_null); // the slice is not large enough => panic unsafe { vec.write_to_slice_aligned( &mut aligned.data[2 * $id::::lanes()..] ) }; } // FIXME: wasm-bindgen-test does not support #[should_panic] // #[cfg_attr(not(target_arch = "wasm32"), test)] // #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test)] #[cfg(not(target_arch = "wasm32"))] #[test] #[should_panic] fn write_to_slice_aligned_fail_align() { let (null, non_null) = ptr_vals!($id); unsafe { let mut aligned = A { data: [null; 2 * $id::::lanes()], }; // get a pointer to the front of data let ptr = aligned.data.as_mut_ptr(); // offset pointer by one element let ptr = ptr.wrapping_add(1); if ptr.align_offset( crate::mem::align_of::<$id>() ) == 0 { // the pointer is properly aligned, so // write_to_slice_aligned won't fail here (e.g. // this can happen for i128x1). 
So we panic to // make the "should_fail" test pass: panic!("ok"); } // create a slice - this is safe, because the // elements of the slice exist, are properly // initialized, and properly aligned: let s = slice::from_raw_parts_mut( ptr, $id::::lanes() ); // this should always panic because the slice // alignment does not match the alignment // requirements for the vector type: let vec = $id::::splat(non_null); vec.write_to_slice_aligned(s); } } } } } impl crate::hash::Hash for $id { #[inline] fn hash(&self, state: &mut H) { let s: $usize_ty = unsafe { crate::mem::transmute(*self) }; s.hash(state) } } test_if! { $test_tt: paste::item! { pub mod [<$id _hash>] { use super::*; #[cfg_attr(not(target_arch = "wasm32"), test)] #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test)] fn hash() { use crate::hash::{Hash, Hasher}; #[allow(deprecated)] use crate::hash::{SipHasher13}; let values = [1_i32; $elem_count]; let mut vec: $id = Default::default(); let mut array = [ $id::::null().extract(0); $elem_count ]; for i in 0..$elem_count { let ptr = &values[i] as *const i32 as *mut i32; vec = vec.replace(i, ptr); array[i] = ptr; } #[allow(deprecated)] let mut a_hash = SipHasher13::new(); let mut v_hash = a_hash.clone(); array.hash(&mut a_hash); vec.hash(&mut v_hash); assert_eq!(a_hash.finish(), v_hash.finish()); } } } } impl $id { /// Calculates the offset from a pointer. /// /// `count` is in units of `T`; e.g. a count of `3` represents a /// pointer offset of `3 * size_of::()` bytes. /// /// # Safety /// /// If any of the following conditions are violated, the result is /// Undefined Behavior: /// /// * Both the starting and resulting pointer must be either in /// bounds or one byte past the end of an allocated object. /// /// * The computed offset, in bytes, cannot overflow an `isize`. /// /// * The offset being in bounds cannot rely on "wrapping around" /// the address space. That is, the infinite-precision sum, in bytes /// must fit in a `usize`. /// /// The compiler and standard library generally tries to ensure /// allocations never reach a size where an offset is a concern. For /// instance, `Vec` and `Box` ensure they never allocate more than /// `isize::MAX` bytes, so `vec.as_ptr().offset(vec.len() as isize)` /// is always safe. /// /// Most platforms fundamentally can't even construct such an /// allocation. For instance, no known 64-bit platform can ever /// serve a request for 263 bytes due to page-table limitations or /// splitting the address space. However, some 32-bit and 16-bit /// platforms may successfully serve a request for more than /// `isize::MAX` bytes with things like Physical Address Extension. /// As such, memory acquired directly from allocators or memory /// mapped files may be too large to handle with this function. /// /// Consider using `wrapping_offset` instead if these constraints /// are difficult to satisfy. The only advantage of this method is /// that it enables more aggressive compiler optimizations. #[inline] pub unsafe fn offset(self, count: $isize_ty) -> Self { // FIXME: should use LLVM's `add nsw nuw` self.wrapping_offset(count) } /// Calculates the offset from a pointer using wrapping arithmetic. /// /// `count` is in units of `T`; e.g. a count of `3` represents a /// pointer offset of `3 * size_of::()` bytes. /// /// # Safety /// /// The resulting pointer does not need to be in bounds, but it is /// potentially hazardous to dereference (which requires unsafe). 
///
/// Always use `.offset(count)` instead when possible, because
/// offset allows the compiler to optimize better.
#[inline]
pub fn wrapping_offset(self, count: $isize_ty) -> Self {
    unsafe {
        let x: $isize_ty = crate::mem::transmute(self);
        // note: {+,*} currently performs a `wrapping_{add, mul}`
        crate::mem::transmute(
            x + (count * crate::mem::size_of::<T>() as isize)
        )
    }
}

/// Calculates the distance between two pointers.
///
/// The returned value is in units of `T`: the distance in bytes is
/// divided by `mem::size_of::<T>()`.
///
/// This function is the inverse of `offset`.
///
/// # Safety
///
/// If any of the following conditions are violated, the result is
/// Undefined Behavior:
///
/// * Both the starting and other pointer must be either in bounds
/// or one byte past the end of the same allocated object.
///
/// * The distance between the pointers, in bytes, cannot overflow
/// an `isize`.
///
/// * The distance between the pointers, in bytes, must be an exact
/// multiple of the size of `T`.
///
/// * The distance being in bounds cannot rely on "wrapping around"
/// the address space.
///
/// The compiler and standard library generally try to ensure
/// allocations never reach a size where an offset is a concern. For
/// instance, `Vec` and `Box` ensure they never allocate more than
/// `isize::MAX` bytes, so `ptr_into_vec.offset_from(vec.as_ptr())`
/// is always safe.
///
/// Most platforms fundamentally can't even construct such an
/// allocation. For instance, no known 64-bit platform can ever
/// serve a request for 2^63 bytes due to page-table limitations or
/// splitting the address space. However, some 32-bit and 16-bit
/// platforms may successfully serve a request for more than
/// `isize::MAX` bytes with things like Physical Address Extension.
/// As such, memory acquired directly from allocators or memory
/// mapped files may be too large to handle with this function.
///
/// Consider using `wrapping_offset_from` instead if these constraints
/// are difficult to satisfy. The only advantage of this method is
/// that it enables more aggressive compiler optimizations.
#[inline]
pub unsafe fn offset_from(self, origin: Self) -> $isize_ty {
    // FIXME: should use LLVM's `sub nsw nuw`.
    self.wrapping_offset_from(origin)
}

/// Calculates the distance between two pointers.
///
/// The returned value is in units of `T`: the distance in bytes is
/// divided by `mem::size_of::<T>()`.
///
/// If the address difference between the two pointers is not a
/// multiple of `mem::size_of::<T>()` then the result of the
/// division is rounded towards zero.
///
/// Though this method is safe for any two pointers, note that its
/// result will be mostly useless if the two pointers aren't into
/// the same allocated object, for example if they point to two
/// different local variables.
#[inline]
pub fn wrapping_offset_from(self, origin: Self) -> $isize_ty {
    let x: $isize_ty = unsafe { crate::mem::transmute(self) };
    let y: $isize_ty = unsafe { crate::mem::transmute(origin) };
    // note: {-,/} currently perform wrapping_{sub, div}
    (y - x) / (crate::mem::size_of::<T>() as isize)
}

/// Calculates the offset from a pointer (convenience for
/// `.offset(count as isize)`).
///
/// `count` is in units of `T`; e.g. a count of 3 represents a
/// pointer offset of `3 * size_of::<T>()` bytes.
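///
/// # Example
///
/// A small, hedged sketch; `cptrx4<i32>` and `usizex4` are assumed to be
/// concrete vector types provided elsewhere in this crate:
///
/// ```ignore
/// let a = [0_i32, 1, 2, 3];
/// // All four lanes start at &a[0]; `add` then advances lane i by idx[i]
/// // elements.
/// let ptrs = cptrx4::<i32>::splat(a.as_ptr());
/// let idx = usizex4::new(0, 1, 2, 3);
/// let ptrs = unsafe { ptrs.add(idx) };
/// assert_eq!(ptrs.extract(3), unsafe { a.as_ptr().add(3) });
/// ```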
///
/// # Safety
///
/// If any of the following conditions are violated, the result is
/// Undefined Behavior:
///
/// * Both the starting and resulting pointer must be either in
/// bounds or one byte past the end of an allocated object.
///
/// * The computed offset, in bytes, cannot overflow an `isize`.
///
/// * The offset being in bounds cannot rely on "wrapping around"
/// the address space. That is, the infinite-precision sum must fit
/// in a `usize`.
///
/// The compiler and standard library generally try to ensure
/// allocations never reach a size where an offset is a concern. For
/// instance, `Vec` and `Box` ensure they never allocate more than
/// `isize::MAX` bytes, so `vec.as_ptr().add(vec.len())` is always
/// safe.
///
/// Most platforms fundamentally can't even construct such an
/// allocation. For instance, no known 64-bit platform can ever
/// serve a request for 2^63 bytes due to page-table limitations or
/// splitting the address space. However, some 32-bit and 16-bit
/// platforms may successfully serve a request for more than
/// `isize::MAX` bytes with things like Physical Address Extension.
/// As such, memory acquired directly from allocators or memory
/// mapped files may be too large to handle with this function.
///
/// Consider using `wrapping_offset` instead if these constraints
/// are difficult to satisfy. The only advantage of this method is
/// that it enables more aggressive compiler optimizations.
#[inline]
#[allow(clippy::should_implement_trait)]
pub unsafe fn add(self, count: $usize_ty) -> Self {
    self.offset(count.cast())
}

/// Calculates the offset from a pointer (convenience for
/// `.offset((count as isize).wrapping_neg())`).
///
/// `count` is in units of `T`; e.g. a `count` of 3 represents a
/// pointer offset of `3 * size_of::<T>()` bytes.
///
/// # Safety
///
/// If any of the following conditions are violated, the result is
/// Undefined Behavior:
///
/// * Both the starting and resulting pointer must be either in
/// bounds or one byte past the end of an allocated object.
///
/// * The computed offset cannot exceed `isize::MAX` **bytes**.
///
/// * The offset being in bounds cannot rely on "wrapping around"
/// the address space. That is, the infinite-precision sum must fit
/// in a `usize`.
///
/// The compiler and standard library generally try to ensure
/// allocations never reach a size where an offset is a concern. For
/// instance, `Vec` and `Box` ensure they never allocate more than
/// `isize::MAX` bytes, so
/// `vec.as_ptr().add(vec.len()).sub(vec.len())` is always safe.
///
/// Most platforms fundamentally can't even construct such an
/// allocation. For instance, no known 64-bit platform can ever
/// serve a request for 2^63 bytes due to page-table
/// limitations or splitting the address space. However, some 32-bit
/// and 16-bit platforms may successfully serve a request for more
/// than `isize::MAX` bytes with things like Physical Address
/// Extension. As such, memory acquired directly from allocators or
/// memory mapped files *may* be too large to handle with this
/// function.
///
/// Consider using `wrapping_offset` instead if these constraints
/// are difficult to satisfy. The only advantage of this method is
/// that it enables more aggressive compiler optimizations.
#[inline]
#[allow(clippy::should_implement_trait)]
pub unsafe fn sub(self, count: $usize_ty) -> Self {
    let x: $isize_ty = count.cast();
    // note: - is currently wrapping_neg
    self.offset(-x)
}

/// Calculates the offset from a pointer using wrapping arithmetic.
/// (convenience for `.wrapping_offset(count as isize)`) /// /// `count` is in units of T; e.g. a `count` of 3 represents a /// pointer offset of `3 * size_of::()` bytes. /// /// # Safety /// /// The resulting pointer does not need to be in bounds, but it is /// potentially hazardous to dereference (which requires `unsafe`). /// /// Always use `.add(count)` instead when possible, because `add` /// allows the compiler to optimize better. #[inline] pub fn wrapping_add(self, count: $usize_ty) -> Self { self.wrapping_offset(count.cast()) } /// Calculates the offset from a pointer using wrapping arithmetic. /// (convenience for `.wrapping_offset((count as /// isize).wrapping_sub())`) /// /// `count` is in units of T; e.g. a `count` of 3 represents a /// pointer offset of `3 * size_of::()` bytes. /// /// # Safety /// /// The resulting pointer does not need to be in bounds, but it is /// potentially hazardous to dereference (which requires `unsafe`). /// /// Always use `.sub(count)` instead when possible, because `sub` /// allows the compiler to optimize better. #[inline] pub fn wrapping_sub(self, count: $usize_ty) -> Self { let x: $isize_ty = count.cast(); self.wrapping_offset(-1 * x) } } impl $id { /// Shuffle vector elements according to `indices`. #[inline] pub fn shuffle1_dyn(self, indices: I) -> Self where Self: codegen::shuffle1_dyn::Shuffle1Dyn, { codegen::shuffle1_dyn::Shuffle1Dyn::shuffle1_dyn(self, indices) } } test_if! { $test_tt: paste::item! { pub mod [<$id _shuffle1_dyn>] { use super::*; #[cfg_attr(not(target_arch = "wasm32"), test)] #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test)] fn shuffle1_dyn() { let (null, non_null) = ptr_vals!($id); // alternating = [non_null, null, non_null, null, ...] let mut alternating = $id::::splat(null); for i in 0..$id::::lanes() { if i % 2 == 0 { alternating = alternating.replace(i, non_null); } } type Indices = <$id as codegen::shuffle1_dyn::Shuffle1Dyn>::Indices; // even = [0, 0, 2, 2, 4, 4, ..] let even = { let mut v = Indices::splat(0); for i in 0..$id::::lanes() { if i % 2 == 0 { v = v.replace(i, (i as u8).into()); } else { v = v.replace(i, (i as u8 - 1).into()); } } v }; // odd = [1, 1, 3, 3, 5, 5, ...] let odd = { let mut v = Indices::splat(0); for i in 0..$id::::lanes() { if i % 2 != 0 { v = v.replace(i, (i as u8).into()); } else { v = v.replace(i, (i as u8 + 1).into()); } } v }; assert_eq!( alternating.shuffle1_dyn(even), $id::::splat(non_null) ); if $id::::lanes() > 1 { assert_eq!( alternating.shuffle1_dyn(odd), $id::::splat(null) ); } } } } } }; } ================================================ FILE: src/api/minimal.rs ================================================ #[macro_use] mod iuf; #[macro_use] mod mask; #[macro_use] mod ptr; ================================================ FILE: src/api/ops/scalar_arithmetic.rs ================================================ //! Vertical (lane-wise) vector-scalar / scalar-vector arithmetic operations. macro_rules! 
impl_ops_scalar_arithmetic { ([$elem_ty:ident; $elem_count:expr]: $id:ident | $test_tt:tt) => { impl crate::ops::Add<$elem_ty> for $id { type Output = Self; #[inline] fn add(self, other: $elem_ty) -> Self { self + $id::splat(other) } } impl crate::ops::Add<$id> for $elem_ty { type Output = $id; #[inline] fn add(self, other: $id) -> $id { $id::splat(self) + other } } impl crate::ops::Sub<$elem_ty> for $id { type Output = Self; #[inline] fn sub(self, other: $elem_ty) -> Self { self - $id::splat(other) } } impl crate::ops::Sub<$id> for $elem_ty { type Output = $id; #[inline] fn sub(self, other: $id) -> $id { $id::splat(self) - other } } impl crate::ops::Mul<$elem_ty> for $id { type Output = Self; #[inline] fn mul(self, other: $elem_ty) -> Self { self * $id::splat(other) } } impl crate::ops::Mul<$id> for $elem_ty { type Output = $id; #[inline] fn mul(self, other: $id) -> $id { $id::splat(self) * other } } impl crate::ops::Div<$elem_ty> for $id { type Output = Self; #[inline] fn div(self, other: $elem_ty) -> Self { self / $id::splat(other) } } impl crate::ops::Div<$id> for $elem_ty { type Output = $id; #[inline] fn div(self, other: $id) -> $id { $id::splat(self) / other } } impl crate::ops::Rem<$elem_ty> for $id { type Output = Self; #[inline] fn rem(self, other: $elem_ty) -> Self { self % $id::splat(other) } } impl crate::ops::Rem<$id> for $elem_ty { type Output = $id; #[inline] fn rem(self, other: $id) -> $id { $id::splat(self) % other } } impl crate::ops::AddAssign<$elem_ty> for $id { #[inline] fn add_assign(&mut self, other: $elem_ty) { *self = *self + other; } } impl crate::ops::SubAssign<$elem_ty> for $id { #[inline] fn sub_assign(&mut self, other: $elem_ty) { *self = *self - other; } } impl crate::ops::MulAssign<$elem_ty> for $id { #[inline] fn mul_assign(&mut self, other: $elem_ty) { *self = *self * other; } } impl crate::ops::DivAssign<$elem_ty> for $id { #[inline] fn div_assign(&mut self, other: $elem_ty) { *self = *self / other; } } impl crate::ops::RemAssign<$elem_ty> for $id { #[inline] fn rem_assign(&mut self, other: $elem_ty) { *self = *self % other; } } test_if!{ $test_tt: paste::item! 
{ pub mod [<$id _ops_scalar_arith>] { use super::*; #[cfg_attr(not(target_arch = "wasm32"), test)] #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test)] fn ops_scalar_arithmetic() { let zi = 0 as $elem_ty; let oi = 1 as $elem_ty; let ti = 2 as $elem_ty; let fi = 4 as $elem_ty; let z = $id::splat(zi); let o = $id::splat(oi); let t = $id::splat(ti); let f = $id::splat(fi); // add assert_eq!(zi + z, z); assert_eq!(z + zi, z); assert_eq!(oi + z, o); assert_eq!(o + zi, o); assert_eq!(ti + z, t); assert_eq!(t + zi, t); assert_eq!(ti + t, f); assert_eq!(t + ti, f); // sub assert_eq!(zi - z, z); assert_eq!(z - zi, z); assert_eq!(oi - z, o); assert_eq!(o - zi, o); assert_eq!(ti - z, t); assert_eq!(t - zi, t); assert_eq!(fi - t, t); assert_eq!(f - ti, t); assert_eq!(f - o - o, t); assert_eq!(f - oi - oi, t); // mul assert_eq!(zi * z, z); assert_eq!(z * zi, z); assert_eq!(zi * o, z); assert_eq!(z * oi, z); assert_eq!(zi * t, z); assert_eq!(z * ti, z); assert_eq!(oi * t, t); assert_eq!(o * ti, t); assert_eq!(ti * t, f); assert_eq!(t * ti, f); // div assert_eq!(zi / o, z); assert_eq!(z / oi, z); assert_eq!(ti / o, t); assert_eq!(t / oi, t); assert_eq!(fi / o, f); assert_eq!(f / oi, f); assert_eq!(ti / t, o); assert_eq!(t / ti, o); assert_eq!(fi / t, t); assert_eq!(f / ti, t); // rem assert_eq!(oi % o, z); assert_eq!(o % oi, z); assert_eq!(fi % t, z); assert_eq!(f % ti, z); { let mut v = z; assert_eq!(v, z); v += oi; // add_assign assert_eq!(v, o); v -= oi; // sub_assign assert_eq!(v, z); v = t; v *= oi; // mul_assign assert_eq!(v, t); v *= ti; assert_eq!(v, f); v /= oi; // div_assign assert_eq!(v, f); v /= ti; assert_eq!(v, t); v %= ti; // rem_assign assert_eq!(v, z); } } } } } }; } ================================================ FILE: src/api/ops/scalar_bitwise.rs ================================================ //! Vertical (lane-wise) vector-scalar / scalar-vector bitwise operations. macro_rules! impl_ops_scalar_bitwise { ( [$elem_ty:ident; $elem_count:expr]: $id:ident | $test_tt:tt | ($true:expr, $false:expr) ) => { impl crate::ops::BitXor<$elem_ty> for $id { type Output = Self; #[inline] fn bitxor(self, other: $elem_ty) -> Self { self ^ $id::splat(other) } } impl crate::ops::BitXor<$id> for $elem_ty { type Output = $id; #[inline] fn bitxor(self, other: $id) -> $id { $id::splat(self) ^ other } } impl crate::ops::BitAnd<$elem_ty> for $id { type Output = Self; #[inline] fn bitand(self, other: $elem_ty) -> Self { self & $id::splat(other) } } impl crate::ops::BitAnd<$id> for $elem_ty { type Output = $id; #[inline] fn bitand(self, other: $id) -> $id { $id::splat(self) & other } } impl crate::ops::BitOr<$elem_ty> for $id { type Output = Self; #[inline] fn bitor(self, other: $elem_ty) -> Self { self | $id::splat(other) } } impl crate::ops::BitOr<$id> for $elem_ty { type Output = $id; #[inline] fn bitor(self, other: $id) -> $id { $id::splat(self) | other } } impl crate::ops::BitAndAssign<$elem_ty> for $id { #[inline] fn bitand_assign(&mut self, other: $elem_ty) { *self = *self & other; } } impl crate::ops::BitOrAssign<$elem_ty> for $id { #[inline] fn bitor_assign(&mut self, other: $elem_ty) { *self = *self | other; } } impl crate::ops::BitXorAssign<$elem_ty> for $id { #[inline] fn bitxor_assign(&mut self, other: $elem_ty) { *self = *self ^ other; } } test_if!{ $test_tt: paste::item! 
{ pub mod [<$id _ops_scalar_bitwise>] { use super::*; #[cfg_attr(not(target_arch = "wasm32"), test)] #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test)] fn ops_scalar_bitwise() { let zi = 0 as $elem_ty; let oi = 1 as $elem_ty; let ti = 2 as $elem_ty; let z = $id::splat(zi); let o = $id::splat(oi); let t = $id::splat(ti); // BitAnd: assert_eq!(oi & o, o); assert_eq!(o & oi, o); assert_eq!(oi & z, z); assert_eq!(o & zi, z); assert_eq!(zi & o, z); assert_eq!(z & oi, z); assert_eq!(zi & z, z); assert_eq!(z & zi, z); assert_eq!(ti & t, t); assert_eq!(t & ti, t); assert_eq!(ti & o, z); assert_eq!(t & oi, z); assert_eq!(oi & t, z); assert_eq!(o & ti, z); // BitOr: assert_eq!(oi | o, o); assert_eq!(o | oi, o); assert_eq!(oi | z, o); assert_eq!(o | zi, o); assert_eq!(zi | o, o); assert_eq!(z | oi, o); assert_eq!(zi | z, z); assert_eq!(z | zi, z); assert_eq!(ti | t, t); assert_eq!(t | ti, t); assert_eq!(zi | t, t); assert_eq!(z | ti, t); assert_eq!(ti | z, t); assert_eq!(t | zi, t); // BitXOR: assert_eq!(oi ^ o, z); assert_eq!(o ^ oi, z); assert_eq!(zi ^ z, z); assert_eq!(z ^ zi, z); assert_eq!(zi ^ o, o); assert_eq!(z ^ oi, o); assert_eq!(oi ^ z, o); assert_eq!(o ^ zi, o); assert_eq!(ti ^ t, z); assert_eq!(t ^ ti, z); assert_eq!(ti ^ z, t); assert_eq!(t ^ zi, t); assert_eq!(zi ^ t, t); assert_eq!(z ^ ti, t); { // AndAssign: let mut v = o; v &= ti; assert_eq!(v, z); } { // OrAssign: let mut v = z; v |= oi; assert_eq!(v, o); } { // XORAssign: let mut v = z; v ^= oi; assert_eq!(v, o); } } } } } }; } ================================================ FILE: src/api/ops/scalar_mask_bitwise.rs ================================================ //! Vertical (lane-wise) vector-vector bitwise operations. macro_rules! impl_ops_scalar_mask_bitwise { ( [$elem_ty:ident; $elem_count:expr]: $id:ident | $test_tt:tt | ($true:expr, $false:expr) ) => { impl crate::ops::BitXor for $id { type Output = Self; #[inline] fn bitxor(self, other: bool) -> Self { self ^ $id::splat(other) } } impl crate::ops::BitXor<$id> for bool { type Output = $id; #[inline] fn bitxor(self, other: $id) -> $id { $id::splat(self) ^ other } } impl crate::ops::BitAnd for $id { type Output = Self; #[inline] fn bitand(self, other: bool) -> Self { self & $id::splat(other) } } impl crate::ops::BitAnd<$id> for bool { type Output = $id; #[inline] fn bitand(self, other: $id) -> $id { $id::splat(self) & other } } impl crate::ops::BitOr for $id { type Output = Self; #[inline] fn bitor(self, other: bool) -> Self { self | $id::splat(other) } } impl crate::ops::BitOr<$id> for bool { type Output = $id; #[inline] fn bitor(self, other: $id) -> $id { $id::splat(self) | other } } impl crate::ops::BitAndAssign for $id { #[inline] fn bitand_assign(&mut self, other: bool) { *self = *self & other; } } impl crate::ops::BitOrAssign for $id { #[inline] fn bitor_assign(&mut self, other: bool) { *self = *self | other; } } impl crate::ops::BitXorAssign for $id { #[inline] fn bitxor_assign(&mut self, other: bool) { *self = *self ^ other; } } test_if!{ $test_tt: paste::item! 
{ pub mod [<$id _ops_scalar_mask_bitwise>] { use super::*; #[cfg_attr(not(target_arch = "wasm32"), test)] #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test)] fn ops_scalar_mask_bitwise() { let ti = true; let fi = false; let t = $id::splat(ti); let f = $id::splat(fi); assert!(t != f); assert!(!(t == f)); // BitAnd: assert_eq!(ti & f, f); assert_eq!(t & fi, f); assert_eq!(fi & t, f); assert_eq!(f & ti, f); assert_eq!(ti & t, t); assert_eq!(t & ti, t); assert_eq!(fi & f, f); assert_eq!(f & fi, f); // BitOr: assert_eq!(ti | f, t); assert_eq!(t | fi, t); assert_eq!(fi | t, t); assert_eq!(f | ti, t); assert_eq!(ti | t, t); assert_eq!(t | ti, t); assert_eq!(fi | f, f); assert_eq!(f | fi, f); // BitXOR: assert_eq!(ti ^ f, t); assert_eq!(t ^ fi, t); assert_eq!(fi ^ t, t); assert_eq!(f ^ ti, t); assert_eq!(ti ^ t, f); assert_eq!(t ^ ti, f); assert_eq!(fi ^ f, f); assert_eq!(f ^ fi, f); { // AndAssign: let mut v = f; v &= ti; assert_eq!(v, f); } { // OrAssign: let mut v = f; v |= ti; assert_eq!(v, t); } { // XORAssign: let mut v = f; v ^= ti; assert_eq!(v, t); } } } } } }; } ================================================ FILE: src/api/ops/scalar_shifts.rs ================================================ //! Vertical (lane-wise) vector-scalar shifts operations. macro_rules! impl_ops_scalar_shifts { ([$elem_ty:ident; $elem_count:expr]: $id:ident | $test_tt:tt) => { impl crate::ops::Shl for $id { type Output = Self; #[inline] fn shl(self, other: u32) -> Self { self << $id::splat(other as $elem_ty) } } impl crate::ops::Shr for $id { type Output = Self; #[inline] fn shr(self, other: u32) -> Self { self >> $id::splat(other as $elem_ty) } } impl crate::ops::ShlAssign for $id { #[inline] fn shl_assign(&mut self, other: u32) { *self = *self << other; } } impl crate::ops::ShrAssign for $id { #[inline] fn shr_assign(&mut self, other: u32) { *self = *self >> other; } } test_if!{ $test_tt: paste::item! { pub mod [<$id _ops_scalar_shifts>] { use super::*; #[cfg_attr(not(target_arch = "wasm32"), test)] #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test)] #[cfg_attr(any(target_arch = "s390x", target_arch = "sparc64"), allow(unreachable_code, unused_variables) )] #[cfg(not(target_arch = "aarch64"))] //~^ FIXME: https://github.com/rust-lang/packed_simd/issues/317 fn ops_scalar_shifts() { let z = $id::splat(0 as $elem_ty); let o = $id::splat(1 as $elem_ty); let t = $id::splat(2 as $elem_ty); let f = $id::splat(4 as $elem_ty); { let zi = 0 as u32; let oi = 1 as u32; let ti = 2 as u32; let maxi = (mem::size_of::<$elem_ty>() * 8 - 1) as u32; // shr assert_eq!(z >> zi, z); assert_eq!(z >> oi, z); assert_eq!(z >> ti, z); assert_eq!(z >> ti, z); #[cfg(any(target_arch = "s390x", target_arch = "sparc64"))] { // FIXME: https://github.com/rust-lang-nursery/packed_simd/issues/13 return; } assert_eq!(o >> zi, o); assert_eq!(t >> zi, t); assert_eq!(f >> zi, f); assert_eq!(f >> maxi, z); assert_eq!(o >> oi, z); assert_eq!(t >> oi, o); assert_eq!(t >> ti, z); assert_eq!(f >> oi, t); assert_eq!(f >> ti, o); assert_eq!(f >> maxi, z); // shl assert_eq!(z << zi, z); assert_eq!(o << zi, o); assert_eq!(t << zi, t); assert_eq!(f << zi, f); assert_eq!(f << maxi, z); assert_eq!(o << oi, t); assert_eq!(o << ti, f); assert_eq!(t << oi, f); { // shr_assign let mut v = o; v >>= oi; assert_eq!(v, z); } { // shl_assign let mut v = o; v <<= oi; assert_eq!(v, t); } } } } } } }; } ================================================ FILE: src/api/ops/vector_arithmetic.rs ================================================ //! 
Vertical (lane-wise) vector-vector arithmetic operations. macro_rules! impl_ops_vector_arithmetic { ([$elem_ty:ident; $elem_count:expr]: $id:ident | $test_tt:tt) => { impl crate::ops::Add for $id { type Output = Self; #[inline] fn add(self, other: Self) -> Self { use crate::llvm::simd_add; unsafe { Simd(simd_add(self.0, other.0)) } } } impl crate::ops::Sub for $id { type Output = Self; #[inline] fn sub(self, other: Self) -> Self { use crate::llvm::simd_sub; unsafe { Simd(simd_sub(self.0, other.0)) } } } impl crate::ops::Mul for $id { type Output = Self; #[inline] fn mul(self, other: Self) -> Self { use crate::llvm::simd_mul; unsafe { Simd(simd_mul(self.0, other.0)) } } } impl crate::ops::Div for $id { type Output = Self; #[inline] fn div(self, other: Self) -> Self { use crate::llvm::simd_div; unsafe { Simd(simd_div(self.0, other.0)) } } } impl crate::ops::Rem for $id { type Output = Self; #[inline] fn rem(self, other: Self) -> Self { use crate::llvm::simd_rem; unsafe { Simd(simd_rem(self.0, other.0)) } } } impl crate::ops::AddAssign for $id { #[inline] fn add_assign(&mut self, other: Self) { *self = *self + other; } } impl crate::ops::SubAssign for $id { #[inline] fn sub_assign(&mut self, other: Self) { *self = *self - other; } } impl crate::ops::MulAssign for $id { #[inline] fn mul_assign(&mut self, other: Self) { *self = *self * other; } } impl crate::ops::DivAssign for $id { #[inline] fn div_assign(&mut self, other: Self) { *self = *self / other; } } impl crate::ops::RemAssign for $id { #[inline] fn rem_assign(&mut self, other: Self) { *self = *self % other; } } test_if!{ $test_tt: paste::item! { pub mod [<$id _ops_vector_arith>] { use super::*; #[cfg_attr(not(target_arch = "wasm32"), test)] #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test)] fn ops_vector_arithmetic() { let z = $id::splat(0 as $elem_ty); let o = $id::splat(1 as $elem_ty); let t = $id::splat(2 as $elem_ty); let f = $id::splat(4 as $elem_ty); // add assert_eq!(z + z, z); assert_eq!(o + z, o); assert_eq!(t + z, t); assert_eq!(t + t, f); // sub assert_eq!(z - z, z); assert_eq!(o - z, o); assert_eq!(t - z, t); assert_eq!(f - t, t); assert_eq!(f - o - o, t); // mul assert_eq!(z * z, z); assert_eq!(z * o, z); assert_eq!(z * t, z); assert_eq!(o * t, t); assert_eq!(t * t, f); // div assert_eq!(z / o, z); assert_eq!(t / o, t); assert_eq!(f / o, f); assert_eq!(t / t, o); assert_eq!(f / t, t); // rem assert_eq!(o % o, z); assert_eq!(f % t, z); { let mut v = z; assert_eq!(v, z); v += o; // add_assign assert_eq!(v, o); v -= o; // sub_assign assert_eq!(v, z); v = t; v *= o; // mul_assign assert_eq!(v, t); v *= t; assert_eq!(v, f); v /= o; // div_assign assert_eq!(v, f); v /= t; assert_eq!(v, t); v %= t; // rem_assign assert_eq!(v, z); } } } } } }; } ================================================ FILE: src/api/ops/vector_bitwise.rs ================================================ //! Vertical (lane-wise) vector-vector bitwise operations. macro_rules! 
impl_ops_vector_bitwise { ( [$elem_ty:ident; $elem_count:expr]: $id:ident | $test_tt:tt | ($true:expr, $false:expr) ) => { impl crate::ops::Not for $id { type Output = Self; #[inline] fn not(self) -> Self { Self::splat($true) ^ self } } impl crate::ops::BitXor for $id { type Output = Self; #[inline] fn bitxor(self, other: Self) -> Self { use crate::llvm::simd_xor; unsafe { Simd(simd_xor(self.0, other.0)) } } } impl crate::ops::BitAnd for $id { type Output = Self; #[inline] fn bitand(self, other: Self) -> Self { use crate::llvm::simd_and; unsafe { Simd(simd_and(self.0, other.0)) } } } impl crate::ops::BitOr for $id { type Output = Self; #[inline] fn bitor(self, other: Self) -> Self { use crate::llvm::simd_or; unsafe { Simd(simd_or(self.0, other.0)) } } } impl crate::ops::BitAndAssign for $id { #[inline] fn bitand_assign(&mut self, other: Self) { *self = *self & other; } } impl crate::ops::BitOrAssign for $id { #[inline] fn bitor_assign(&mut self, other: Self) { *self = *self | other; } } impl crate::ops::BitXorAssign for $id { #[inline] fn bitxor_assign(&mut self, other: Self) { *self = *self ^ other; } } test_if!{ $test_tt: paste::item! { pub mod [<$id _ops_vector_bitwise>] { use super::*; #[cfg_attr(not(target_arch = "wasm32"), test)] #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test)] fn ops_vector_bitwise() { let z = $id::splat(0 as $elem_ty); let o = $id::splat(1 as $elem_ty); let t = $id::splat(2 as $elem_ty); let m = $id::splat(!z.extract(0)); // Not: assert_eq!(!z, m); assert_eq!(!m, z); // BitAnd: assert_eq!(o & o, o); assert_eq!(o & z, z); assert_eq!(z & o, z); assert_eq!(z & z, z); assert_eq!(t & t, t); assert_eq!(t & o, z); assert_eq!(o & t, z); // BitOr: assert_eq!(o | o, o); assert_eq!(o | z, o); assert_eq!(z | o, o); assert_eq!(z | z, z); assert_eq!(t | t, t); assert_eq!(z | t, t); assert_eq!(t | z, t); // BitXOR: assert_eq!(o ^ o, z); assert_eq!(z ^ z, z); assert_eq!(z ^ o, o); assert_eq!(o ^ z, o); assert_eq!(t ^ t, z); assert_eq!(t ^ z, t); assert_eq!(z ^ t, t); { // AndAssign: let mut v = o; v &= t; assert_eq!(v, z); } { // OrAssign: let mut v = z; v |= o; assert_eq!(v, o); } { // XORAssign: let mut v = z; v ^= o; assert_eq!(v, o); } } } } } }; } ================================================ FILE: src/api/ops/vector_float_min_max.rs ================================================ //! Vertical (lane-wise) vector `min` and `max` for floating-point vectors. macro_rules! impl_ops_vector_float_min_max { ([$elem_ty:ident; $elem_count:expr]: $id:ident | $test_tt:tt) => { impl $id { /// Minimum of two vectors. /// /// Returns a new vector containing the minimum value of each of /// the input vector lanes. #[inline] pub fn min(self, x: Self) -> Self { use crate::llvm::simd_fmin; unsafe { Simd(simd_fmin(self.0, x.0)) } } /// Maximum of two vectors. /// /// Returns a new vector containing the maximum value of each of /// the input vector lanes. #[inline] pub fn max(self, x: Self) -> Self { use crate::llvm::simd_fmax; unsafe { Simd(simd_fmax(self.0, x.0)) } } } test_if!{ $test_tt: paste::item! { #[cfg(not(any( // FIXME: https://github.com/rust-lang-nursery/packed_simd/issues/223 all(target_arch = "mips", target_endian = "big"), target_arch = "mips64", )))] pub mod [<$id _ops_vector_min_max>] { use super::*; #[cfg_attr(not(target_arch = "wasm32"), test)] #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test)] fn min_max() { let n = crate::$elem_ty::NAN; let o = $id::splat(1. as $elem_ty); let t = $id::splat(2. as $elem_ty); let mut m = o; // [1., 2., 1., 2., ...] 
let mut on = o; for i in 0..$id::lanes() { if i % 2 == 0 { m = m.replace(i, 2. as $elem_ty); on = on.replace(i, n); } } assert_eq!(o.min(t), o); assert_eq!(t.min(o), o); assert_eq!(m.min(o), o); assert_eq!(o.min(m), o); assert_eq!(m.min(t), m); assert_eq!(t.min(m), m); assert_eq!(o.max(t), t); assert_eq!(t.max(o), t); assert_eq!(m.max(o), m); assert_eq!(o.max(m), m); assert_eq!(m.max(t), t); assert_eq!(t.max(m), t); assert_eq!(on.min(o), o); assert_eq!(o.min(on), o); assert_eq!(on.max(o), o); assert_eq!(o.max(on), o); } } } } }; } ================================================ FILE: src/api/ops/vector_int_min_max.rs ================================================ //! Vertical (lane-wise) vector `min` and `max` for integer vectors. macro_rules! impl_ops_vector_int_min_max { ([$elem_ty:ident; $elem_count:expr]: $id:ident | $test_tt:tt) => { impl $id { /// Minimum of two vectors. /// /// Returns a new vector containing the minimum value of each of /// the input vector lanes. #[inline] pub fn min(self, x: Self) -> Self { self.lt(x).select(self, x) } /// Maximum of two vectors. /// /// Returns a new vector containing the maximum value of each of /// the input vector lanes. #[inline] pub fn max(self, x: Self) -> Self { self.gt(x).select(self, x) } } test_if!{$test_tt: paste::item! { pub mod [<$id _ops_vector_min_max>] { use super::*; #[cfg_attr(not(target_arch = "wasm32"), test)] #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test)] fn min_max() { let o = $id::splat(1 as $elem_ty); let t = $id::splat(2 as $elem_ty); let mut m = o; for i in 0..$id::lanes() { if i % 2 == 0 { m = m.replace(i, 2 as $elem_ty); } } assert_eq!(o.min(t), o); assert_eq!(t.min(o), o); assert_eq!(m.min(o), o); assert_eq!(o.min(m), o); assert_eq!(m.min(t), m); assert_eq!(t.min(m), m); assert_eq!(o.max(t), t); assert_eq!(t.max(o), t); assert_eq!(m.max(o), m); assert_eq!(o.max(m), m); assert_eq!(m.max(t), t); assert_eq!(t.max(m), t); } } } } }; } ================================================ FILE: src/api/ops/vector_mask_bitwise.rs ================================================ //! Vertical (lane-wise) vector-vector bitwise operations. macro_rules! impl_ops_vector_mask_bitwise { ( [$elem_ty:ident; $elem_count:expr]: $id:ident | $test_tt:tt | ($true:expr, $false:expr) ) => { impl crate::ops::Not for $id { type Output = Self; #[inline] fn not(self) -> Self { Self::splat($true) ^ self } } impl crate::ops::BitXor for $id { type Output = Self; #[inline] fn bitxor(self, other: Self) -> Self { use crate::llvm::simd_xor; unsafe { Simd(simd_xor(self.0, other.0)) } } } impl crate::ops::BitAnd for $id { type Output = Self; #[inline] fn bitand(self, other: Self) -> Self { use crate::llvm::simd_and; unsafe { Simd(simd_and(self.0, other.0)) } } } impl crate::ops::BitOr for $id { type Output = Self; #[inline] fn bitor(self, other: Self) -> Self { use crate::llvm::simd_or; unsafe { Simd(simd_or(self.0, other.0)) } } } impl crate::ops::BitAndAssign for $id { #[inline] fn bitand_assign(&mut self, other: Self) { *self = *self & other; } } impl crate::ops::BitOrAssign for $id { #[inline] fn bitor_assign(&mut self, other: Self) { *self = *self | other; } } impl crate::ops::BitXorAssign for $id { #[inline] fn bitxor_assign(&mut self, other: Self) { *self = *self ^ other; } } test_if!{ $test_tt: paste::item! 
{ pub mod [<$id _ops_vector_mask_bitwise>] { use super::*; #[cfg_attr(not(target_arch = "wasm32"), test)] #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test)] fn ops_vector_mask_bitwise() { let t = $id::splat(true); let f = $id::splat(false); assert!(t != f); assert!(!(t == f)); // Not: assert_eq!(!t, f); assert_eq!(t, !f); // BitAnd: assert_eq!(t & f, f); assert_eq!(f & t, f); assert_eq!(t & t, t); assert_eq!(f & f, f); // BitOr: assert_eq!(t | f, t); assert_eq!(f | t, t); assert_eq!(t | t, t); assert_eq!(f | f, f); // BitXOR: assert_eq!(t ^ f, t); assert_eq!(f ^ t, t); assert_eq!(t ^ t, f); assert_eq!(f ^ f, f); { // AndAssign: let mut v = f; v &= t; assert_eq!(v, f); } { // OrAssign: let mut v = f; v |= t; assert_eq!(v, t); } { // XORAssign: let mut v = f; v ^= t; assert_eq!(v, t); } } } } } }; } ================================================ FILE: src/api/ops/vector_neg.rs ================================================ //! Vertical (lane-wise) vector `Neg`. macro_rules! impl_ops_vector_neg { ([$elem_ty:ident; $elem_count:expr]: $id:ident | $test_tt:tt) => { impl crate::ops::Neg for $id { type Output = Self; #[inline] fn neg(self) -> Self { Self::splat(-1 as $elem_ty) * self } } test_if!{ $test_tt: paste::item! { pub mod [<$id _ops_vector_neg>] { use super::*; #[cfg_attr(not(target_arch = "wasm32"), test)] #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test)] fn neg() { let z = $id::splat(0 as $elem_ty); let o = $id::splat(1 as $elem_ty); let t = $id::splat(2 as $elem_ty); let f = $id::splat(4 as $elem_ty); let nz = $id::splat(-(0 as $elem_ty)); let no = $id::splat(-(1 as $elem_ty)); let nt = $id::splat(-(2 as $elem_ty)); let nf = $id::splat(-(4 as $elem_ty)); assert_eq!(-z, nz); assert_eq!(-o, no); assert_eq!(-t, nt); assert_eq!(-f, nf); assert_eq!(z, -nz); assert_eq!(o, -no); assert_eq!(t, -nt); assert_eq!(f, -nf); } } } } }; } ================================================ FILE: src/api/ops/vector_rotates.rs ================================================ //! Vertical (lane-wise) vector rotates operations. #![allow(unused)] macro_rules! impl_ops_vector_rotates { ([$elem_ty:ident; $elem_count:expr]: $id:ident | $test_tt:tt) => { impl $id { /// Shifts the bits of each lane to the left by the specified /// amount in the corresponding lane of `n`, wrapping the /// truncated bits to the end of the resulting integer. /// /// Note: this is neither the same operation as `<<` nor equivalent /// to `slice::rotate_left`. #[inline] pub fn rotate_left(self, n: $id) -> $id { const LANE_WIDTH: $elem_ty = crate::mem::size_of::<$elem_ty>() as $elem_ty * 8; // Protect against undefined behavior for over-long bit shifts let n = n % LANE_WIDTH; (self << n) | (self >> ((LANE_WIDTH - n) % LANE_WIDTH)) } /// Shifts the bits of each lane to the right by the specified /// amount in the corresponding lane of `n`, wrapping the /// truncated bits to the beginning of the resulting integer. /// /// Note: this is neither the same operation as `>>` nor equivalent /// to `slice::rotate_right`. #[inline] pub fn rotate_right(self, n: $id) -> $id { const LANE_WIDTH: $elem_ty = crate::mem::size_of::<$elem_ty>() as $elem_ty * 8; // Protect against undefined behavior for over-long bit shifts let n = n % LANE_WIDTH; (self >> n) | (self << ((LANE_WIDTH - n) % LANE_WIDTH)) } } test_if!{ $test_tt: paste::item! 
{ // FIXME: // https://github.com/rust-lang-nursery/packed_simd/issues/75 #[cfg(not(any( target_arch = "s390x", target_arch = "sparc64", )))] pub mod [<$id _ops_vector_rotate>] { use super::*; #[cfg_attr(not(target_arch = "wasm32"), test)] #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test)] #[cfg(not(target_arch = "aarch64"))] //~^ FIXME: https://github.com/rust-lang/packed_simd/issues/317 fn rotate_ops() { let z = $id::splat(0 as $elem_ty); let o = $id::splat(1 as $elem_ty); let t = $id::splat(2 as $elem_ty); let f = $id::splat(4 as $elem_ty); let max = $id::splat( (mem::size_of::<$elem_ty>() * 8 - 1) as $elem_ty); // rotate_right assert_eq!(z.rotate_right(z), z); assert_eq!(z.rotate_right(o), z); assert_eq!(z.rotate_right(t), z); assert_eq!(o.rotate_right(z), o); assert_eq!(t.rotate_right(z), t); assert_eq!(f.rotate_right(z), f); assert_eq!(f.rotate_right(max), f << 1); assert_eq!(o.rotate_right(o), o << max); assert_eq!(t.rotate_right(o), o); assert_eq!(t.rotate_right(t), o << max); assert_eq!(f.rotate_right(o), t); assert_eq!(f.rotate_right(t), o); // rotate_left assert_eq!(z.rotate_left(z), z); assert_eq!(o.rotate_left(z), o); assert_eq!(t.rotate_left(z), t); assert_eq!(f.rotate_left(z), f); assert_eq!(f.rotate_left(max), t); assert_eq!(o.rotate_left(o), t); assert_eq!(o.rotate_left(t), f); assert_eq!(t.rotate_left(o), f); } } } } }; } ================================================ FILE: src/api/ops/vector_shifts.rs ================================================ //! Vertical (lane-wise) vector-vector shifts operations. macro_rules! impl_ops_vector_shifts { ([$elem_ty:ident; $elem_count:expr]: $id:ident | $test_tt:tt) => { impl crate::ops::Shl<$id> for $id { type Output = Self; #[inline] fn shl(self, other: Self) -> Self { use crate::llvm::simd_shl; unsafe { Simd(simd_shl(self.0, other.0)) } } } impl crate::ops::Shr<$id> for $id { type Output = Self; #[inline] fn shr(self, other: Self) -> Self { use crate::llvm::simd_shr; unsafe { Simd(simd_shr(self.0, other.0)) } } } impl crate::ops::ShlAssign<$id> for $id { #[inline] fn shl_assign(&mut self, other: Self) { *self = *self << other; } } impl crate::ops::ShrAssign<$id> for $id { #[inline] fn shr_assign(&mut self, other: Self) { *self = *self >> other; } } test_if!{ $test_tt: paste::item! 
{ pub mod [<$id _ops_vector_shifts>] { use super::*; #[cfg_attr(not(target_arch = "wasm32"), test)] #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test)] #[cfg_attr(any(target_arch = "s390x", target_arch = "sparc64"), allow(unreachable_code, unused_variables) )] #[cfg(not(target_arch = "aarch64"))] //~^ FIXME: https://github.com/rust-lang/packed_simd/issues/317 fn ops_vector_shifts() { let z = $id::splat(0 as $elem_ty); let o = $id::splat(1 as $elem_ty); let t = $id::splat(2 as $elem_ty); let f = $id::splat(4 as $elem_ty); let max =$id::splat( (mem::size_of::<$elem_ty>() * 8 - 1) as $elem_ty ); // shr assert_eq!(z >> z, z); assert_eq!(z >> o, z); assert_eq!(z >> t, z); assert_eq!(z >> t, z); #[cfg(any(target_arch = "s390x", target_arch = "sparc64"))] { // FIXME: rust produces bad codegen for shifts: // https://github.com/rust-lang-nursery/packed_simd/issues/13 return; } assert_eq!(o >> z, o); assert_eq!(t >> z, t); assert_eq!(f >> z, f); assert_eq!(f >> max, z); assert_eq!(o >> o, z); assert_eq!(t >> o, o); assert_eq!(t >> t, z); assert_eq!(f >> o, t); assert_eq!(f >> t, o); assert_eq!(f >> max, z); // shl assert_eq!(z << z, z); assert_eq!(o << z, o); assert_eq!(t << z, t); assert_eq!(f << z, f); assert_eq!(f << max, z); assert_eq!(o << o, t); assert_eq!(o << t, f); assert_eq!(t << o, f); { // shr_assign let mut v = o; v >>= o; assert_eq!(v, z); } { // shl_assign let mut v = o; v <<= o; assert_eq!(v, t); } } } } } }; } ================================================ FILE: src/api/ops.rs ================================================ //! Implementation of the `ops` traits #[macro_use] mod vector_mask_bitwise; #[macro_use] mod scalar_mask_bitwise; #[macro_use] mod vector_arithmetic; #[macro_use] mod scalar_arithmetic; #[macro_use] mod vector_bitwise; #[macro_use] mod scalar_bitwise; #[macro_use] mod vector_shifts; #[macro_use] mod scalar_shifts; #[macro_use] mod vector_rotates; #[macro_use] mod vector_neg; #[macro_use] mod vector_int_min_max; #[macro_use] mod vector_float_min_max; ================================================ FILE: src/api/ptr/gather_scatter.rs ================================================ //! Implements masked gather and scatters for vectors of pointers macro_rules! impl_ptr_read { ([$elem_ty:ty; $elem_count:expr]: $id:ident, $mask_ty:ident | $test_tt:tt) => { impl $id where [T; $elem_count]: sealed::SimdArray, { /// Reads selected vector elements from memory. /// /// Instantiates a new vector by reading the values from `self` for /// those lanes whose `mask` is `true`, and using the elements of /// `value` otherwise. /// /// No memory is accessed for those lanes of `self` whose `mask` is /// `false`. /// /// # Safety /// /// This method is unsafe because it dereferences raw pointers. The /// pointers must be aligned to `mem::align_of::()`. #[inline] pub unsafe fn read( self, mask: Simd<[M; $elem_count]>, value: Simd<[T; $elem_count]>, ) -> Simd<[T; $elem_count]> where M: sealed::Mask, [M; $elem_count]: sealed::SimdArray, { use crate::llvm::simd_gather; Simd(simd_gather(value.0, self.0, mask.0)) } } test_if! { $test_tt: paste::item! 
{ mod [<$id _read>] { use super::*; #[test] fn read() { let mut v = [0_i32; $elem_count]; for i in 0..$elem_count { v[i] = i as i32; } let mut ptr = $id::::null(); for i in 0..$elem_count { ptr = ptr.replace(i, &v[i] as *const i32 as *mut i32 ); } // all mask elements are true: let mask = $mask_ty::splat(true); let def = Simd::<[i32; $elem_count]>::splat(42_i32); let r: Simd<[i32; $elem_count]> = unsafe { ptr.read(mask, def) }; assert_eq!( r, Simd::<[i32; $elem_count]>::from_slice_unaligned( &v ) ); let mut mask = mask; for i in 0..$elem_count { if i % 2 != 0 { mask = mask.replace(i, false); } } // even mask elements are true, odd ones are false: let r: Simd<[i32; $elem_count]> = unsafe { ptr.read(mask, def) }; let mut e = v; for i in 0..$elem_count { if i % 2 != 0 { e[i] = 42; } } assert_eq!( r, Simd::<[i32; $elem_count]>::from_slice_unaligned( &e ) ); // all mask elements are false: let mask = $mask_ty::splat(false); let def = Simd::<[i32; $elem_count]>::splat(42_i32); let r: Simd<[i32; $elem_count]> = unsafe { ptr.read(mask, def) } ; assert_eq!(r, def); } } } } }; } macro_rules! impl_ptr_write { ([$elem_ty:ty; $elem_count:expr]: $id:ident, $mask_ty:ident | $test_tt:tt) => { impl $id where [T; $elem_count]: sealed::SimdArray, { /// Writes selected vector elements to memory. /// /// Writes the lanes of `values` for which the mask is `true` to /// their corresponding memory addresses in `self`. /// /// No memory is accessed for those lanes of `self` whose `mask` is /// `false`. /// /// Overlapping memory addresses of `self` are written to in order /// from the lest-significant to the most-significant element. /// /// # Safety /// /// This method is unsafe because it dereferences raw pointers. The /// pointers must be aligned to `mem::align_of::()`. #[inline] pub unsafe fn write(self, mask: Simd<[M; $elem_count]>, value: Simd<[T; $elem_count]>) where M: sealed::Mask, [M; $elem_count]: sealed::SimdArray, { use crate::llvm::simd_scatter; simd_scatter(value.0, self.0, mask.0) } } test_if! { $test_tt: paste::item! { mod [<$id _write>] { use super::*; #[test] fn write() { // forty_two = [42, 42, 42, ...] let forty_two = Simd::<[i32; $elem_count]>::splat(42_i32); // This test will write to this array let mut arr = [0_i32; $elem_count]; for i in 0..$elem_count { arr[i] = i as i32; } // arr = [0, 1, 2, ...] let mut ptr = $id::::null(); for i in 0..$elem_count { ptr = ptr.replace(i, unsafe { arr.as_ptr().add(i) as *mut i32 }); } // ptr = [&arr[0], &arr[1], ...] // write `forty_two` to all elements of `v` { let backup = arr; unsafe { ptr.write($mask_ty::splat(true), forty_two) }; assert_eq!(arr, [42_i32; $elem_count]); arr = backup; // arr = [0, 1, 2, ...] } // write 42 to even elements of arr: { // set odd elements of the mask to false let mut mask = $mask_ty::splat(true); for i in 0..$elem_count { if i % 2 != 0 { mask = mask.replace(i, false); } } // mask = [true, false, true, false, ...] // expected result r = [42, 1, 42, 3, 42, 5, ...] let mut r = arr; for i in 0..$elem_count { if i % 2 == 0 { r[i] = 42; } } let backup = arr; unsafe { ptr.write(mask, forty_two) }; assert_eq!(arr, r); arr = backup; // arr = [0, 1, 2, 3, ...] } // write 42 to no elements of arr { let backup = arr; unsafe { ptr.write($mask_ty::splat(false), forty_two) }; assert_eq!(arr, backup); } } } } } }; } ================================================ FILE: src/api/ptr.rs ================================================ //! 
Vector of pointers #[macro_use] mod gather_scatter; ================================================ FILE: src/api/reductions/bitwise.rs ================================================ //! Implements portable horizontal bitwise vector reductions. #![allow(unused)] macro_rules! impl_reduction_bitwise { ( [$elem_ty:ident; $elem_count:expr]: $id:ident | $ielem_ty:ident | $test_tt:tt | ($convert:expr) | ($true:expr, $false:expr) ) => { impl $id { /// Lane-wise bitwise `and` of the vector elements. /// /// Note: if the vector has one lane, the first element of the /// vector is returned. #[inline] pub fn and(self) -> $elem_ty { #[cfg(not(target_arch = "aarch64"))] { use crate::llvm::simd_reduce_and; let r: $ielem_ty = unsafe { simd_reduce_and(self.0) }; $convert(r) } #[cfg(target_arch = "aarch64")] { // FIXME: broken on aarch64 // https://github.com/rust-lang-nursery/packed_simd/issues/15 let mut x = self.extract(0) as $elem_ty; for i in 1..$id::lanes() { x &= self.extract(i) as $elem_ty; } x } } /// Lane-wise bitwise `or` of the vector elements. /// /// Note: if the vector has one lane, the first element of the /// vector is returned. #[inline] pub fn or(self) -> $elem_ty { #[cfg(not(target_arch = "aarch64"))] { use crate::llvm::simd_reduce_or; let r: $ielem_ty = unsafe { simd_reduce_or(self.0) }; $convert(r) } #[cfg(target_arch = "aarch64")] { // FIXME: broken on aarch64 // https://github.com/rust-lang-nursery/packed_simd/issues/15 let mut x = self.extract(0) as $elem_ty; for i in 1..$id::lanes() { x |= self.extract(i) as $elem_ty; } x } } /// Lane-wise bitwise `xor` of the vector elements. /// /// Note: if the vector has one lane, the first element of the /// vector is returned. #[inline] pub fn xor(self) -> $elem_ty { #[cfg(not(target_arch = "aarch64"))] { use crate::llvm::simd_reduce_xor; let r: $ielem_ty = unsafe { simd_reduce_xor(self.0) }; $convert(r) } #[cfg(target_arch = "aarch64")] { // FIXME: broken on aarch64 // https://github.com/rust-lang-nursery/packed_simd/issues/15 let mut x = self.extract(0) as $elem_ty; for i in 1..$id::lanes() { x ^= self.extract(i) as $elem_ty; } x } } } test_if!{ $test_tt: paste::item! 
{ pub mod [<$id _reduction_bitwise>] { use super::*; #[cfg_attr(not(target_arch = "wasm32"), test)] #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test)] fn and() { let v = $id::splat($false); assert_eq!(v.and(), $false); let v = $id::splat($true); assert_eq!(v.and(), $true); let v = $id::splat($false); let v = v.replace(0, $true); if $id::lanes() > 1 { assert_eq!(v.and(), $false); } else { assert_eq!(v.and(), $true); } let v = $id::splat($true); let v = v.replace(0, $false); assert_eq!(v.and(), $false); } #[cfg_attr(not(target_arch = "wasm32"), test)] #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test)] fn or() { let v = $id::splat($false); assert_eq!(v.or(), $false); let v = $id::splat($true); assert_eq!(v.or(), $true); let v = $id::splat($false); let v = v.replace(0, $true); assert_eq!(v.or(), $true); let v = $id::splat($true); let v = v.replace(0, $false); if $id::lanes() > 1 { assert_eq!(v.or(), $true); } else { assert_eq!(v.or(), $false); } } #[cfg_attr(not(target_arch = "wasm32"), test)] #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test)] fn xor() { let v = $id::splat($false); assert_eq!(v.xor(), $false); let v = $id::splat($true); if $id::lanes() > 1 { assert_eq!(v.xor(), $false); } else { assert_eq!(v.xor(), $true); } let v = $id::splat($false); let v = v.replace(0, $true); assert_eq!(v.xor(), $true); let v = $id::splat($true); let v = v.replace(0, $false); if $id::lanes() > 1 { assert_eq!(v.xor(), $true); } else { assert_eq!(v.xor(), $false); } } } } } }; } ================================================ FILE: src/api/reductions/float_arithmetic.rs ================================================ //! Implements portable horizontal float vector arithmetic reductions. macro_rules! impl_reduction_float_arithmetic { ([$elem_ty:ident; $elem_count:expr]: $id:ident | $test_tt:tt) => { impl $id { /// Horizontal sum of the vector elements. /// /// The intrinsic performs a tree-reduction of the vector elements. /// That is, for an 8 element vector: /// /// > ((x0 + x1) + (x2 + x3)) + ((x4 + x5) + (x6 + x7)) /// /// If one of the vector element is `NaN` the reduction returns /// `NaN`. The resulting `NaN` is not required to be equal to any /// of the `NaN`s in the vector. #[inline] pub fn sum(self) -> $elem_ty { #[cfg(not(target_arch = "aarch64"))] { use crate::llvm::simd_reduce_add_ordered; unsafe { simd_reduce_add_ordered(self.0, 0 as $elem_ty) } } #[cfg(target_arch = "aarch64")] { // FIXME: broken on AArch64 // https://github.com/rust-lang-nursery/packed_simd/issues/15 let mut x = self.extract(0) as $elem_ty; for i in 1..$id::lanes() { x += self.extract(i) as $elem_ty; } x } } /// Horizontal product of the vector elements. /// /// The intrinsic performs a tree-reduction of the vector elements. /// That is, for an 8 element vector: /// /// > ((x0 * x1) * (x2 * x3)) * ((x4 * x5) * (x6 * x7)) /// /// If one of the vector element is `NaN` the reduction returns /// `NaN`. The resulting `NaN` is not required to be equal to any /// of the `NaN`s in the vector. 
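///
/// # Example
///
/// A short sketch, assuming `f32x4`, one of the crate's concrete
/// floating-point vector types:
///
/// ```ignore
/// let v = f32x4::new(1., 2., 3., 4.);
/// // Tree reduction: (1. * 2.) * (3. * 4.)
/// assert_eq!(v.product(), 24.);
/// ```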
#[inline] pub fn product(self) -> $elem_ty { #[cfg(not(target_arch = "aarch64"))] { use crate::llvm::simd_reduce_mul_ordered; unsafe { simd_reduce_mul_ordered(self.0, 1 as $elem_ty) } } #[cfg(target_arch = "aarch64")] { // FIXME: broken on AArch64 // https://github.com/rust-lang-nursery/packed_simd/issues/15 let mut x = self.extract(0) as $elem_ty; for i in 1..$id::lanes() { x *= self.extract(i) as $elem_ty; } x } } } impl crate::iter::Sum for $id { #[inline] fn sum>(iter: I) -> $id { iter.fold($id::splat(0.), crate::ops::Add::add) } } impl crate::iter::Product for $id { #[inline] fn product>(iter: I) -> $id { iter.fold($id::splat(1.), crate::ops::Mul::mul) } } impl<'a> crate::iter::Sum<&'a $id> for $id { #[inline] fn sum>(iter: I) -> $id { iter.fold($id::splat(0.), |a, b| crate::ops::Add::add(a, *b)) } } impl<'a> crate::iter::Product<&'a $id> for $id { #[inline] fn product>(iter: I) -> $id { iter.fold($id::splat(1.), |a, b| crate::ops::Mul::mul(a, *b)) } } test_if! { $test_tt: paste::item! { // Comparisons use integer casts within mantissa^1 range. #[allow(clippy::float_cmp)] pub mod [<$id _reduction_float_arith>] { use super::*; fn alternating(x: usize) -> $id { let mut v = $id::splat(1 as $elem_ty); for i in 0..$id::lanes() { if i % x == 0 { v = v.replace(i, 2 as $elem_ty); } } v } #[cfg_attr(not(target_arch = "wasm32"), test)] #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test)] fn sum() { let v = $id::splat(0 as $elem_ty); assert_eq!(v.sum(), 0 as $elem_ty); let v = $id::splat(1 as $elem_ty); assert_eq!(v.sum(), $id::lanes() as $elem_ty); let v = alternating(2); assert_eq!( v.sum(), ($id::lanes() / 2 + $id::lanes()) as $elem_ty ); } #[cfg_attr(not(target_arch = "wasm32"), test)] #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test)] fn product() { let v = $id::splat(0 as $elem_ty); assert_eq!(v.product(), 0 as $elem_ty); let v = $id::splat(1 as $elem_ty); assert_eq!(v.product(), 1 as $elem_ty); let f = match $id::lanes() { 64 => 16, 32 => 8, 16 => 4, _ => 2, }; let v = alternating(f); assert_eq!( v.product(), (2_usize.pow(($id::lanes() / f) as u32) as $elem_ty) ); } #[cfg_attr(not(target_arch = "wasm32"), test)] #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test)] #[allow(unreachable_code)] fn sum_nan() { // FIXME: https://bugs.llvm.org/show_bug.cgi?id=36732 // https://github.com/rust-lang-nursery/packed_simd/issues/6 return; let n0 = crate::$elem_ty::NAN; let v0 = $id::splat(-3.0); for i in 0..$id::lanes() { let mut v = v0.replace(i, n0); // If the vector contains a NaN the result is NaN: assert!( v.sum().is_nan(), "nan at {} => {} | {:?}", i, v.sum(), v ); for j in 0..i { v = v.replace(j, n0); assert!(v.sum().is_nan()); } } let v = $id::splat(n0); assert!(v.sum().is_nan(), "all nans | {:?}", v); } #[cfg_attr(not(target_arch = "wasm32"), test)] #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test)] #[allow(unreachable_code)] fn product_nan() { // FIXME: https://bugs.llvm.org/show_bug.cgi?id=36732 // https://github.com/rust-lang-nursery/packed_simd/issues/6 return; let n0 = crate::$elem_ty::NAN; let v0 = $id::splat(-3.0); for i in 0..$id::lanes() { let mut v = v0.replace(i, n0); // If the vector contains a NaN the result is NaN: assert!( v.product().is_nan(), "nan at {} => {} | {:?}", i, v.product(), v ); for j in 0..i { v = v.replace(j, n0); assert!(v.product().is_nan()); } } let v = $id::splat(n0); assert!(v.product().is_nan(), "all nans | {:?}", v); } #[cfg_attr(not(target_arch = "wasm32"), test)] #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test)] #[allow(unused, 
dead_code)] fn sum_roundoff() { // Performs a tree-reduction fn tree_reduce_sum(a: &[$elem_ty]) -> $elem_ty { assert!(!a.is_empty()); if a.len() == 1 { a[0] } else if a.len() == 2 { a[0] + a[1] } else { let mid = a.len() / 2; let (left, right) = a.split_at(mid); tree_reduce_sum(left) + tree_reduce_sum(right) } } let mut start = crate::$elem_ty::EPSILON; let mut scalar_reduction = 0. as $elem_ty; let mut v = $id::splat(0. as $elem_ty); for i in 0..$id::lanes() { let c = if i % 2 == 0 { 1e3 } else { -1. }; start *= ::core::$elem_ty::consts::PI * c; scalar_reduction += start; v = v.replace(i, start); } let simd_reduction = v.sum(); let mut a = [0. as $elem_ty; $id::lanes()]; v.write_to_slice_unaligned(&mut a); let tree_reduction = tree_reduce_sum(&a); // tolerate 1 ULP difference: let red_bits = simd_reduction.to_bits(); let tree_bits = tree_reduction.to_bits(); assert!( if red_bits > tree_bits { red_bits - tree_bits } else { tree_bits - red_bits } < 2, "vector: {:?} | simd_reduction: {:?} | \ tree_reduction: {} | scalar_reduction: {}", v, simd_reduction, tree_reduction, scalar_reduction ); } #[cfg_attr(not(target_arch = "wasm32"), test)] #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test)] #[allow(unused, dead_code)] fn product_roundoff() { use ::core::convert::TryInto; // Performs a tree-reduction fn tree_reduce_product(a: &[$elem_ty]) -> $elem_ty { assert!(!a.is_empty()); if a.len() == 1 { a[0] } else if a.len() == 2 { a[0] * a[1] } else { let mid = a.len() / 2; let (left, right) = a.split_at(mid); tree_reduce_product(left) * tree_reduce_product(right) } } let mut start = crate::$elem_ty::EPSILON; let mut scalar_reduction = 1. as $elem_ty; let mut v = $id::splat(0. as $elem_ty); for i in 0..$id::lanes() { let c = if i % 2 == 0 { 1e3 } else { -1. }; start *= ::core::$elem_ty::consts::PI * c; scalar_reduction *= start; v = v.replace(i, start); } let simd_reduction = v.product(); let mut a = [0. as $elem_ty; $id::lanes()]; v.write_to_slice_unaligned(&mut a); let tree_reduction = tree_reduce_product(&a); // FIXME: Too imprecise, even only for product(f32x8). // Figure out how to narrow this down. let ulp_limit = $id::lanes() / 2; let red_bits = simd_reduction.to_bits(); let tree_bits = tree_reduction.to_bits(); assert!( if red_bits > tree_bits { red_bits - tree_bits } else { tree_bits - red_bits } < ulp_limit.try_into().unwrap(), "vector: {:?} | simd_reduction: {:?} | \ tree_reduction: {} | scalar_reduction: {}", v, simd_reduction, tree_reduction, scalar_reduction ); } } } } }; } ================================================ FILE: src/api/reductions/integer_arithmetic.rs ================================================ //! Implements portable horizontal integer vector arithmetic reductions. macro_rules! impl_reduction_integer_arithmetic { ([$elem_ty:ident; $elem_count:expr]: $id:ident | $ielem_ty:ident | $test_tt:tt) => { impl $id { /// Horizontal wrapping sum of the vector elements. /// /// The intrinsic performs a tree-reduction of the vector elements. /// That is, for an 8 element vector: /// /// > ((x0 + x1) + (x2 + x3)) + ((x4 + x5) + (x6 + x7)) /// /// If an operation overflows it returns the mathematical result /// modulo `2^n` where `n` is the number of times it overflows. 
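///
/// # Example
///
/// A short sketch, assuming the crate's `u8x4` vector type:
///
/// ```ignore
/// let v = u8x4::new(1, 2, 3, 4);
/// assert_eq!(v.wrapping_sum(), 10);
/// // On overflow the sum wraps: (255 * 4) mod 2^8 == 252
/// let m = u8x4::splat(u8::max_value());
/// assert_eq!(m.wrapping_sum(), 252);
/// ```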
#[inline] pub fn wrapping_sum(self) -> $elem_ty { #[cfg(not(target_arch = "aarch64"))] { use crate::llvm::simd_reduce_add_ordered; let v: $ielem_ty = unsafe { simd_reduce_add_ordered(self.0, 0 as $ielem_ty) }; v as $elem_ty } #[cfg(target_arch = "aarch64")] { // FIXME: broken on AArch64 // https://github.com/rust-lang-nursery/packed_simd/issues/15 let mut x = self.extract(0) as $elem_ty; for i in 1..$id::lanes() { x = x.wrapping_add(self.extract(i) as $elem_ty); } x } } /// Horizontal wrapping product of the vector elements. /// /// The intrinsic performs a tree-reduction of the vector elements. /// That is, for an 8 element vector: /// /// > ((x0 * x1) * (x2 * x3)) * ((x4 * x5) * (x6 * x7)) /// /// If an operation overflows it returns the mathematical result /// modulo `2^n` where `n` is the number of times it overflows. #[inline] pub fn wrapping_product(self) -> $elem_ty { #[cfg(not(target_arch = "aarch64"))] { use crate::llvm::simd_reduce_mul_ordered; let v: $ielem_ty = unsafe { simd_reduce_mul_ordered(self.0, 1 as $ielem_ty) }; v as $elem_ty } #[cfg(target_arch = "aarch64")] { // FIXME: broken on AArch64 // https://github.com/rust-lang-nursery/packed_simd/issues/15 let mut x = self.extract(0) as $elem_ty; for i in 1..$id::lanes() { x = x.wrapping_mul(self.extract(i) as $elem_ty); } x } } } impl crate::iter::Sum for $id { #[inline] fn sum>(iter: I) -> $id { iter.fold($id::splat(0), crate::ops::Add::add) } } impl crate::iter::Product for $id { #[inline] fn product>(iter: I) -> $id { iter.fold($id::splat(1), crate::ops::Mul::mul) } } impl<'a> crate::iter::Sum<&'a $id> for $id { #[inline] fn sum>(iter: I) -> $id { iter.fold($id::splat(0), |a, b| crate::ops::Add::add(a, *b)) } } impl<'a> crate::iter::Product<&'a $id> for $id { #[inline] fn product>(iter: I) -> $id { iter.fold($id::splat(1), |a, b| crate::ops::Mul::mul(a, *b)) } } test_if! { $test_tt: paste::item! 
{ pub mod [<$id _reduction_int_arith>] { use super::*; fn alternating(x: usize) -> $id { let mut v = $id::splat(1 as $elem_ty); for i in 0..$id::lanes() { if i % x == 0 { v = v.replace(i, 2 as $elem_ty); } } v } #[cfg_attr(not(target_arch = "wasm32"), test)] #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test)] fn wrapping_sum() { let v = $id::splat(0 as $elem_ty); assert_eq!(v.wrapping_sum(), 0 as $elem_ty); let v = $id::splat(1 as $elem_ty); assert_eq!(v.wrapping_sum(), $id::lanes() as $elem_ty); let v = alternating(2); if $id::lanes() > 1 { assert_eq!( v.wrapping_sum(), ($id::lanes() / 2 + $id::lanes()) as $elem_ty ); } else { assert_eq!( v.wrapping_sum(), 2 as $elem_ty ); } } #[cfg_attr(not(target_arch = "wasm32"), test)] #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test)] fn wrapping_sum_overflow() { let start = $elem_ty::max_value() - ($id::lanes() as $elem_ty / 2); let v = $id::splat(start as $elem_ty); let vwrapping_sum = v.wrapping_sum(); let mut wrapping_sum = start; for _ in 1..$id::lanes() { wrapping_sum = wrapping_sum.wrapping_add(start); } assert_eq!(wrapping_sum, vwrapping_sum, "v = {:?}", v); } #[cfg_attr(not(target_arch = "wasm32"), test)] #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test)] fn wrapping_product() { let v = $id::splat(0 as $elem_ty); assert_eq!(v.wrapping_product(), 0 as $elem_ty); let v = $id::splat(1 as $elem_ty); assert_eq!(v.wrapping_product(), 1 as $elem_ty); let f = match $id::lanes() { 64 => 16, 32 => 8, 16 => 4, _ => 2, }; let v = alternating(f); if $id::lanes() > 1 { assert_eq!( v.wrapping_product(), (2_usize.pow(($id::lanes() / f) as u32) as $elem_ty) ); } else { assert_eq!( v.wrapping_product(), 2 as $elem_ty ); } } #[cfg_attr(not(target_arch = "wasm32"), test)] #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test)] fn wrapping_product_overflow() { let start = $elem_ty::max_value() - ($id::lanes() as $elem_ty / 2); let v = $id::splat(start as $elem_ty); let vmul = v.wrapping_product(); let mut mul = start; for _ in 1..$id::lanes() { mul = mul.wrapping_mul(start); } assert_eq!(mul, vmul, "v = {:?}", v); } } } } }; } ================================================ FILE: src/api/reductions/mask.rs ================================================ //! Implements portable horizontal mask reductions. macro_rules! impl_reduction_mask { ([$elem_ty:ident; $elem_count:expr]: $id:ident | $test_tt:tt) => { impl $id { /// Are `all` vector lanes `true`? #[inline] pub fn all(self) -> bool { unsafe { crate::codegen::reductions::mask::All::all(self) } } /// Is `any` vector lane `true`? #[inline] pub fn any(self) -> bool { unsafe { crate::codegen::reductions::mask::Any::any(self) } } /// Are `all` vector lanes `false`? #[inline] pub fn none(self) -> bool { !self.any() } } test_if! { $test_tt: paste::item! 
{ pub mod [<$id _reduction>] { use super::*; #[cfg_attr(not(target_arch = "wasm32"), test)] #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test)] fn all() { let a = $id::splat(true); assert!(a.all()); let a = $id::splat(false); assert!(!a.all()); if $id::lanes() > 1 { for i in 0..$id::lanes() { let mut a = $id::splat(true); a = a.replace(i, false); assert!(!a.all()); let mut a = $id::splat(false); a = a.replace(i, true); assert!(!a.all()); } } } #[cfg_attr(not(target_arch = "wasm32"), test)] #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test)] fn any() { let a = $id::splat(true); assert!(a.any()); let a = $id::splat(false); assert!(!a.any()); if $id::lanes() > 1 { for i in 0..$id::lanes() { let mut a = $id::splat(true); a = a.replace(i, false); assert!(a.any()); let mut a = $id::splat(false); a = a.replace(i, true); assert!(a.any()); } } } #[cfg_attr(not(target_arch = "wasm32"), test)] #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test)] fn none() { let a = $id::splat(true); assert!(!a.none()); let a = $id::splat(false); assert!(a.none()); if $id::lanes() > 1 { for i in 0..$id::lanes() { let mut a = $id::splat(true); a = a.replace(i, false); assert!(!a.none()); let mut a = $id::splat(false); a = a.replace(i, true); assert!(!a.none()); } } } } } } }; } ================================================ FILE: src/api/reductions/min_max.rs ================================================ //! Implements portable horizontal vector min/max reductions. macro_rules! impl_reduction_min_max { ([$elem_ty:ident; $elem_count:expr]: $id:ident | $ielem_ty:ident | $test_tt:tt) => { impl $id { /// Largest vector element value. #[inline] pub fn max_element(self) -> $elem_ty { #[cfg(not(any( target_arch = "aarch64", target_arch = "arm", target_arch = "powerpc64", target_arch = "wasm32", )))] { use crate::llvm::simd_reduce_max; let v: $ielem_ty = unsafe { simd_reduce_max(self.0) }; v as $elem_ty } #[cfg(any( target_arch = "aarch64", target_arch = "arm", target_arch = "powerpc64", target_arch = "wasm32", ))] { // FIXME: broken on AArch64 // https://github.com/rust-lang-nursery/packed_simd/issues/15 // FIXME: broken on WASM32 // https://github.com/rust-lang-nursery/packed_simd/issues/91 let mut x = self.extract(0); for i in 1..$id::lanes() { x = x.max(self.extract(i)); } x } } /// Smallest vector element value. #[inline] pub fn min_element(self) -> $elem_ty { #[cfg(not(any( target_arch = "aarch64", target_arch = "arm", all(target_arch = "x86", not(target_feature = "sse2")), target_arch = "powerpc64", target_arch = "wasm32", ),))] { use crate::llvm::simd_reduce_min; let v: $ielem_ty = unsafe { simd_reduce_min(self.0) }; v as $elem_ty } #[cfg(any( target_arch = "aarch64", target_arch = "arm", all(target_arch = "x86", not(target_feature = "sse2")), target_arch = "powerpc64", target_arch = "wasm32", ))] { // FIXME: broken on AArch64 // https://github.com/rust-lang-nursery/packed_simd/issues/15 // FIXME: broken on i586-unknown-linux-gnu // https://github.com/rust-lang-nursery/packed_simd/issues/22 // FIXME: broken on WASM32 // https://github.com/rust-lang-nursery/packed_simd/issues/91 let mut x = self.extract(0); for i in 1..$id::lanes() { x = x.min(self.extract(i)); } x } } } test_if! {$test_tt: paste::item! { // Comparisons use integer casts within mantissa^1 range. 
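// A quick sketch of what the reductions above compute, assuming an
// `i32x4` instantiation of this macro (illustrative only):
//
//     let v = i32x4::new(-7, 2, 42, 5);
//     assert_eq!(v.max_element(), 42);
//     assert_eq!(v.min_element(), -7);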
#[allow(clippy::float_cmp)] pub mod [<$id _reduction_min_max>] { use super::*; #[cfg_attr(not(target_arch = "wasm32"), test)] #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test)] pub fn max_element() { let v = $id::splat(0 as $elem_ty); assert_eq!(v.max_element(), 0 as $elem_ty); if $id::lanes() > 1 { let v = v.replace(1, 1 as $elem_ty); assert_eq!(v.max_element(), 1 as $elem_ty); } let v = v.replace(0, 2 as $elem_ty); assert_eq!(v.max_element(), 2 as $elem_ty); } #[cfg_attr(not(target_arch = "wasm32"), test)] #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test)] pub fn min_element() { let v = $id::splat(0 as $elem_ty); assert_eq!(v.min_element(), 0 as $elem_ty); if $id::lanes() > 1 { let v = v.replace(1, 1 as $elem_ty); assert_eq!(v.min_element(), 0 as $elem_ty); } let v = $id::splat(1 as $elem_ty); let v = v.replace(0, 2 as $elem_ty); if $id::lanes() > 1 { assert_eq!(v.min_element(), 1 as $elem_ty); } else { assert_eq!(v.min_element(), 2 as $elem_ty); } if $id::lanes() > 1 { let v = $id::splat(2 as $elem_ty); let v = v.replace(1, 1 as $elem_ty); assert_eq!(v.min_element(), 1 as $elem_ty); } } } } } }; } macro_rules! test_reduction_float_min_max { ([$elem_ty:ident; $elem_count:expr]: $id:ident | $test_tt:tt) => { test_if! { $test_tt: paste::item! { // Comparisons use integer casts within mantissa^1 range. #[allow(clippy::float_cmp)] pub mod [<$id _reduction_min_max_nan>] { use super::*; #[cfg_attr(not(target_arch = "wasm32"), test)] #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test)] fn min_element_test() { let n = crate::$elem_ty::NAN; assert_eq!(n.min(-3.), -3.); assert_eq!((-3. as $elem_ty).min(n), -3.); let v0 = $id::splat(-3.); let target_with_broken_last_lane_nan = !cfg!(any( target_arch = "arm", target_arch = "aarch64", all(target_arch = "x86", not(target_feature = "sse2") ), target_arch = "powerpc64", target_arch = "wasm32", )); // The vector is initialized to `-3.`s: [-3, -3, -3, -3] for i in 0..$id::lanes() { // We replace the i-th element of the vector with // `NaN`: [-3, -3, -3, NaN] let mut v = v0.replace(i, n); // If the NaN is in the last place, the LLVM // implementation of these methods is broken on some // targets: if i == $id::lanes() - 1 && target_with_broken_last_lane_nan { assert_eq!(v.min_element(), -3., "[A]: nan at {} => {} | {:?}", i, v.min_element(), v); // If we replace all the elements in the vector // up-to the `i-th` lane with `NaN`s, the result // is still always `-3.` unless all elements of // the vector are `NaN`s: for j in 0..i { v = v.replace(j, n); if j == i-1 { assert!(v.min_element().is_nan(), "[B]: nan at {} => {} | {:?}", i, v.min_element(), v); } else { assert_eq!(v.min_element(), -3., "[B]: nan at {} => {} | {:?}", i, v.min_element(), v); } } // We are done here, since we were in the last // lane which is the last iteration of the loop. break } // We are not in the last lane, and there is only // one `NaN` in the vector. // If the vector has one lane, the result is `NaN`: if $id::lanes() == 1 { assert!(v.min_element().is_nan(), "[C]: all nans | v={:?} | min={} | \ is_nan: {}", v, v.min_element(), v.min_element().is_nan() ); // And we are done, since the vector only has // one lane anyways. break; } // The vector has more than one lane, since there is // only one `NaN` in the vector, the result is // always `-3`. 
assert_eq!(v.min_element(), -3., "[D]: nan at {} => {} | {:?}", i, v.min_element(), v); // If we replace all the elements in the vector // up-to the `i-th` lane with `NaN`s, the result is // still always `-3.` unless all elements of the // vector are `NaN`s: for j in 0..i { v = v.replace(j, n); if i == $id::lanes() - 1 && j == i - 1 { // All elements of the vector are `NaN`s, // therefore the result is NaN as well. // // Note: the #lanes of the vector is > 1, so // "i - 1" does not overflow. assert!(v.min_element().is_nan(), "[E]: all nans | v={:?} | min={} | \ is_nan: {}", v, v.min_element(), v.min_element().is_nan()); } else { // There are non-`NaN` elements in the // vector, therefore the result is `-3.`: assert_eq!(v.min_element(), -3., "[F]: nan at {} => {} | {:?}", i, v.min_element(), v); } } } // If the vector contains all NaNs the result is NaN: assert!($id::splat(n).min_element().is_nan(), "all nans | v={:?} | min={} | is_nan: {}", $id::splat(n), $id::splat(n).min_element(), $id::splat(n).min_element().is_nan()); } #[cfg_attr(not(target_arch = "wasm32"), test)] #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test)] fn max_element_test() { let n = crate::$elem_ty::NAN; assert_eq!(n.max(-3.), -3.); assert_eq!((-3. as $elem_ty).max(n), -3.); let v0 = $id::splat(-3.); let target_with_broken_last_lane_nan = !cfg!(any( target_arch = "arm", target_arch = "aarch64", target_arch = "powerpc64", target_arch = "wasm32", )); // The vector is initialized to `-3.`s: [-3, -3, -3, -3] for i in 0..$id::lanes() { // We replace the i-th element of the vector with // `NaN`: [-3, -3, -3, NaN] let mut v = v0.replace(i, n); // If the NaN is in the last place, the LLVM // implementation of these methods is broken on some // targets: if i == $id::lanes() - 1 && target_with_broken_last_lane_nan { assert_eq!(v.max_element(), -3., "[A]: nan at {} => {} | {:?}", i, v.max_element(), v); // If we replace all the elements in the vector // up-to the `i-th` lane with `NaN`s, the result // is still always `-3.` unless all elements of // the vector are `NaN`s: for j in 0..i { v = v.replace(j, n); if j == i-1 { assert!(v.min_element().is_nan(), "[B]: nan at {} => {} | {:?}", i, v.min_element(), v); } else { assert_eq!(v.max_element(), -3., "[B]: nan at {} => {} | {:?}", i, v.max_element(), v); } } // We are done here, since we were in the last // lane which is the last iteration of the loop. break } // We are not in the last lane, and there is only // one `NaN` in the vector. // If the vector has one lane, the result is `NaN`: if $id::lanes() == 1 { assert!(v.max_element().is_nan(), "[C]: all nans | v={:?} | min={} | \ is_nan: {}", v, v.max_element(), v.max_element().is_nan()); // And we are done, since the vector only has // one lane anyways. break; } // The vector has more than one lane, since there is // only one `NaN` in the vector, the result is // always `-3`. assert_eq!(v.max_element(), -3., "[D]: nan at {} => {} | {:?}", i, v.max_element(), v); // If we replace all the elements in the vector // up-to the `i-th` lane with `NaN`s, the result is // still always `-3.` unless all elements of the // vector are `NaN`s: for j in 0..i { v = v.replace(j, n); if i == $id::lanes() - 1 && j == i - 1 { // All elements of the vector are `NaN`s, // therefore the result is NaN as well. // // Note: the #lanes of the vector is > 1, so // "i - 1" does not overflow. 
assert!(v.max_element().is_nan(), "[E]: all nans | v={:?} | max={} | \ is_nan: {}", v, v.max_element(), v.max_element().is_nan()); } else { // There are non-`NaN` elements in the // vector, therefore the result is `-3.`: assert_eq!(v.max_element(), -3., "[F]: nan at {} => {} | {:?}", i, v.max_element(), v); } } } // If the vector contains all NaNs the result is NaN: assert!($id::splat(n).max_element().is_nan(), "all nans | v={:?} | max={} | is_nan: {}", $id::splat(n), $id::splat(n).max_element(), $id::splat(n).max_element().is_nan()); } } } } }; } ================================================ FILE: src/api/reductions.rs ================================================ //! Reductions #[macro_use] mod float_arithmetic; #[macro_use] mod integer_arithmetic; #[macro_use] mod bitwise; #[macro_use] mod mask; #[macro_use] mod min_max; ================================================ FILE: src/api/select.rs ================================================ //! Implements mask's `select`. /// Implements mask select method macro_rules! impl_select { ([$elem_ty:ident; $elem_count:expr]: $id:ident | $test_tt:tt) => { impl $id { /// Selects elements of `a` and `b` using mask. /// /// The lanes of the result for which the mask is `true` contain /// the values of `a`. The remaining lanes contain the values of /// `b`. #[inline] pub fn select(self, a: Simd, b: Simd) -> Simd where T: sealed::SimdArray::NT>, { use crate::llvm::simd_select; Simd(unsafe { simd_select(self.0, a.0, b.0) }) } } test_select!(bool, $id, $id, (false, true) | $test_tt); }; } macro_rules! test_select { ( $elem_ty:ident, $mask_ty:ident, $vec_ty:ident,($small:expr, $large:expr) | $test_tt:tt ) => { test_if! { $test_tt: paste::item! { pub mod [<$vec_ty _select>] { use super::*; #[cfg_attr(not(target_arch = "wasm32"), test)] #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test)] fn select() { let o = $small as $elem_ty; let t = $large as $elem_ty; let a = $vec_ty::splat(o); let b = $vec_ty::splat(t); let m = a.lt(b); assert_eq!(m.select(a, b), a); let m = b.lt(a); assert_eq!(m.select(b, a), a); let mut c = a; let mut d = b; let mut m_e = $mask_ty::splat(false); for i in 0..$vec_ty::lanes() { if i % 2 == 0 { let c_tmp = c.extract(i); c = c.replace(i, d.extract(i)); d = d.replace(i, c_tmp); } else { m_e = m_e.replace(i, true); } } let m = c.lt(d); assert_eq!(m_e, m); assert_eq!(m.select(c, d), a); } } } } }; } ================================================ FILE: src/api/shuffle.rs ================================================ //! Implements portable vector shuffles with immediate indices. // FIXME: comprehensive tests // https://github.com/rust-lang-nursery/packed_simd/issues/20 /// Shuffles vector elements. /// /// This macro returns a new vector that contains a shuffle of the elements in /// one (`shuffle!(vec, [indices...])`) or two (`shuffle!(vec0, vec1, /// [indices...])`) input vectors. /// /// The type of `vec0` and `vec1` must be equal, and the element type of the /// resulting vector is the element type of the input vector. /// /// The number of `indices` must be a power-of-two in range `[0, 64)`, since /// currently, the largest vector supported by the library has 64 lanes. The /// length of the resulting vector equals the number of indices provided. /// /// The indices must be in range `[0, M * N)` where `M` is the number of input /// vectors (`1` or `2`) and `N` is the number of lanes of the input vectors. 
/// The indices `i` in range `[0, N)` refer to the `i`-th element of `vec0`, /// while the indices in range `[N, 2*N)` refer to the `i - N`-th element of /// `vec1`. /// /// # Examples /// /// Shuffling elements of two vectors: /// /// ``` /// # use packed_simd::*; /// # fn main() { /// // Shuffle allows reordering the elements: /// let x = i32x4::new(1, 2, 3, 4); /// let y = i32x4::new(5, 6, 7, 8); /// let r = shuffle!(x, y, [4, 0, 5, 1]); /// assert_eq!(r, i32x4::new(5, 1, 6, 2)); /// /// // The resulting vector can als be smaller than the input: /// let r = shuffle!(x, y, [1, 6]); /// assert_eq!(r, i32x2::new(2, 7)); /// /// // Or larger: /// let r = shuffle!(x, y, [1, 3, 4, 2, 1, 7, 2, 2]); /// assert_eq!(r, i32x8::new(2, 4, 5, 3, 2, 8, 3, 3)); /// // At most 2 * the number of lanes in the input vector. /// # } /// ``` /// /// Shuffling elements of one vector: /// /// ``` /// # use packed_simd::*; /// # fn main() { /// // Shuffle allows reordering the elements of a vector: /// let x = i32x4::new(1, 2, 3, 4); /// let r = shuffle!(x, [2, 1, 3, 0]); /// assert_eq!(r, i32x4::new(3, 2, 4, 1)); /// /// // The resulting vector can be smaller than the input: /// let r = shuffle!(x, [1, 3]); /// assert_eq!(r, i32x2::new(2, 4)); /// /// // Equal: /// let r = shuffle!(x, [1, 3, 2, 0]); /// assert_eq!(r, i32x4::new(2, 4, 3, 1)); /// /// // Or larger: /// let r = shuffle!(x, [1, 3, 2, 2, 1, 3, 2, 2]); /// assert_eq!(r, i32x8::new(2, 4, 3, 3, 2, 4, 3, 3)); /// // At most 2 * the number of lanes in the input vector. /// # } /// ``` #[macro_export] macro_rules! shuffle { ($vec0:expr, $vec1:expr, [$l0:expr, $l1:expr]) => {{ #[allow(unused_unsafe)] unsafe { $crate::Simd($crate::__shuffle_vector2::<{[$l0, $l1]}, _, _>( $vec0.0, $vec1.0, )) } }}; ($vec0:expr, $vec1:expr, [$l0:expr, $l1:expr, $l2:expr, $l3:expr]) => {{ #[allow(unused_unsafe)] unsafe { $crate::Simd($crate::__shuffle_vector4::<{[$l0, $l1, $l2, $l3]}, _, _>( $vec0.0, $vec1.0, )) } }}; ($vec0:expr, $vec1:expr, [$l0:expr, $l1:expr, $l2:expr, $l3:expr, $l4:expr, $l5:expr, $l6:expr, $l7:expr]) => {{ #[allow(unused_unsafe)] unsafe { $crate::Simd($crate::__shuffle_vector8::<{[$l0, $l1, $l2, $l3, $l4, $l5, $l6, $l7]}, _, _>( $vec0.0, $vec1.0, )) } }}; ($vec0:expr, $vec1:expr, [$l0:expr, $l1:expr, $l2:expr, $l3:expr, $l4:expr, $l5:expr, $l6:expr, $l7:expr, $l8:expr, $l9:expr, $l10:expr, $l11:expr, $l12:expr, $l13:expr, $l14:expr, $l15:expr]) => {{ #[allow(unused_unsafe)] unsafe { $crate::Simd($crate::__shuffle_vector16::<{ [ $l0, $l1, $l2, $l3, $l4, $l5, $l6, $l7, $l8, $l9, $l10, $l11, $l12, $l13, $l14, $l15, ] }, _, _>( $vec0.0, $vec1.0, )) } }}; ($vec0:expr, $vec1:expr, [$l0:expr, $l1:expr, $l2:expr, $l3:expr, $l4:expr, $l5:expr, $l6:expr, $l7:expr, $l8:expr, $l9:expr, $l10:expr, $l11:expr, $l12:expr, $l13:expr, $l14:expr, $l15:expr, $l16:expr, $l17:expr, $l18:expr, $l19:expr, $l20:expr, $l21:expr, $l22:expr, $l23:expr, $l24:expr, $l25:expr, $l26:expr, $l27:expr, $l28:expr, $l29:expr, $l30:expr, $l31:expr]) => {{ #[allow(unused_unsafe)] unsafe { $crate::Simd($crate::__shuffle_vector32::<{ [ $l0, $l1, $l2, $l3, $l4, $l5, $l6, $l7, $l8, $l9, $l10, $l11, $l12, $l13, $l14, $l15, $l16, $l17, $l18, $l19, $l20, $l21, $l22, $l23, $l24, $l25, $l26, $l27, $l28, $l29, $l30, $l31, ] }, _, _>( $vec0.0, $vec1.0, )) } }}; ($vec0:expr, $vec1:expr, [$l0:expr, $l1:expr, $l2:expr, $l3:expr, $l4:expr, $l5:expr, $l6:expr, $l7:expr, $l8:expr, $l9:expr, $l10:expr, $l11:expr, $l12:expr, $l13:expr, $l14:expr, $l15:expr, $l16:expr, $l17:expr, $l18:expr, $l19:expr, 
$l20:expr, $l21:expr, $l22:expr, $l23:expr, $l24:expr, $l25:expr, $l26:expr, $l27:expr, $l28:expr, $l29:expr, $l30:expr, $l31:expr, $l32:expr, $l33:expr, $l34:expr, $l35:expr, $l36:expr, $l37:expr, $l38:expr, $l39:expr, $l40:expr, $l41:expr, $l42:expr, $l43:expr, $l44:expr, $l45:expr, $l46:expr, $l47:expr, $l48:expr, $l49:expr, $l50:expr, $l51:expr, $l52:expr, $l53:expr, $l54:expr, $l55:expr, $l56:expr, $l57:expr, $l58:expr, $l59:expr, $l60:expr, $l61:expr, $l62:expr, $l63:expr]) => {{ #[allow(unused_unsafe)] unsafe { $crate::Simd($crate::__shuffle_vector64::<{[ $l0, $l1, $l2, $l3, $l4, $l5, $l6, $l7, $l8, $l9, $l10, $l11, $l12, $l13, $l14, $l15, $l16, $l17, $l18, $l19, $l20, $l21, $l22, $l23, $l24, $l25, $l26, $l27, $l28, $l29, $l30, $l31, $l32, $l33, $l34, $l35, $l36, $l37, $l38, $l39, $l40, $l41, $l42, $l43, $l44, $l45, $l46, $l47, $l48, $l49, $l50, $l51, $l52, $l53, $l54, $l55, $l56, $l57, $l58, $l59, $l60, $l61, $l62, $l63, ]}, _, _>( $vec0.0, $vec1.0, )) } }}; ($vec:expr, [$($l:expr),*]) => { match $vec { v => shuffle!(v, v, [$($l),*]) } }; } ================================================ FILE: src/api/shuffle1_dyn.rs ================================================ //! Shuffle vector elements according to a dynamic vector of indices. macro_rules! impl_shuffle1_dyn { ([$elem_ty:ident; $elem_count:expr]: $id:ident | $test_tt:tt) => { impl $id { /// Shuffle vector elements according to `indices`. #[inline] pub fn shuffle1_dyn(self, indices: I) -> Self where Self: codegen::shuffle1_dyn::Shuffle1Dyn, { codegen::shuffle1_dyn::Shuffle1Dyn::shuffle1_dyn(self, indices) } } }; } macro_rules! test_shuffle1_dyn { ([$elem_ty:ident; $elem_count:expr]: $id:ident | $test_tt:tt) => { test_if! { $test_tt: paste::item! { pub mod [<$id _shuffle1_dyn>] { use super::*; #[cfg_attr(not(target_arch = "wasm32"), test)] #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test)] fn shuffle1_dyn() { let increasing = { let mut v = $id::splat(0 as $elem_ty); for i in 0..$id::lanes() { v = v.replace(i, i as $elem_ty); } v }; let decreasing = { let mut v = $id::splat(0 as $elem_ty); for i in 0..$id::lanes() { v = v.replace( i, ($id::lanes() - 1 - i) as $elem_ty ); } v }; type Indices = < $id as codegen::shuffle1_dyn::Shuffle1Dyn >::Indices; let increasing_ids: Indices = increasing.cast(); let decreasing_ids: Indices = decreasing.cast(); assert_eq!( increasing.shuffle1_dyn(increasing_ids), increasing, "(i,i)=>i" ); assert_eq!( decreasing.shuffle1_dyn(increasing_ids), decreasing, "(d,i)=>d" ); assert_eq!( increasing.shuffle1_dyn(decreasing_ids), decreasing, "(i,d)=>d" ); assert_eq!( decreasing.shuffle1_dyn(decreasing_ids), increasing, "(d,d)=>i" ); for i in 0..$id::lanes() { let v_ids: Indices = $id::splat(i as $elem_ty).cast(); assert_eq!(increasing.shuffle1_dyn(v_ids), $id::splat(increasing.extract(i)) ); assert_eq!(decreasing.shuffle1_dyn(v_ids), $id::splat(decreasing.extract(i)) ); assert_eq!( $id::splat(i as $elem_ty) .shuffle1_dyn(increasing_ids), $id::splat(i as $elem_ty) ); assert_eq!( $id::splat(i as $elem_ty) .shuffle1_dyn(decreasing_ids), $id::splat(i as $elem_ty) ); } } } } } }; } macro_rules! test_shuffle1_dyn_mask { ([$elem_ty:ident; $elem_count:expr]: $id:ident | $test_tt:tt) => { test_if! { $test_tt: paste::item! { pub mod [<$id _shuffle1_dyn>] { use super::*; #[cfg_attr(not(target_arch = "wasm32"), test)] #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test)] fn shuffle1_dyn() { // alternating = [true, false, true, false, ...] 
let mut alternating = $id::splat(false); for i in 0..$id::lanes() { if i % 2 == 0 { alternating = alternating.replace(i, true); } } type Indices = < $id as codegen::shuffle1_dyn::Shuffle1Dyn >::Indices; // even = [0, 0, 2, 2, 4, 4, ..] let even = { let mut v = Indices::splat(0); for i in 0..$id::lanes() { if i % 2 == 0 { v = v.replace(i, (i as u8).into()); } else { v = v.replace(i, (i as u8 - 1).into()); } } v }; // odd = [1, 1, 3, 3, 5, 5, ...] let odd = { let mut v = Indices::splat(0); for i in 0..$id::lanes() { if i % 2 != 0 { v = v.replace(i, (i as u8).into()); } else { v = v.replace(i, (i as u8 + 1).into()); } } v }; assert_eq!( alternating.shuffle1_dyn(even), $id::splat(true) ); if $id::lanes() > 1 { assert_eq!( alternating.shuffle1_dyn(odd), $id::splat(false) ); } } } } } }; } ================================================ FILE: src/api/slice/from_slice.rs ================================================ //! Implements methods to read a vector type from a slice. macro_rules! impl_slice_from_slice { ([$elem_ty:ident; $elem_count:expr]: $id:ident | $test_tt:tt) => { impl $id { /// Instantiates a new vector with the values of the `slice`. /// /// # Panics /// /// If `slice.len() < Self::lanes()` or `&slice[0]` is not aligned /// to an `align_of::()` boundary. #[inline] pub fn from_slice_aligned(slice: &[$elem_ty]) -> Self { unsafe { assert!(slice.len() >= $elem_count); let target_ptr = slice.as_ptr(); assert_eq!(target_ptr.align_offset(crate::mem::align_of::()), 0); Self::from_slice_aligned_unchecked(slice) } } /// Instantiates a new vector with the values of the `slice`. /// /// # Panics /// /// If `slice.len() < Self::lanes()`. #[inline] pub fn from_slice_unaligned(slice: &[$elem_ty]) -> Self { unsafe { assert!(slice.len() >= $elem_count); Self::from_slice_unaligned_unchecked(slice) } } /// Instantiates a new vector with the values of the `slice`. /// /// # Safety /// /// If `slice.len() < Self::lanes()` or `&slice[0]` is not aligned /// to an `align_of::()` boundary, the behavior is undefined. #[inline] pub unsafe fn from_slice_aligned_unchecked(slice: &[$elem_ty]) -> Self { debug_assert!(slice.len() >= $elem_count); let target_ptr = slice.as_ptr(); debug_assert_eq!(target_ptr.align_offset(crate::mem::align_of::()), 0); #[allow(clippy::cast_ptr_alignment)] *(target_ptr as *const Self) } /// Instantiates a new vector with the values of the `slice`. /// /// # Safety /// /// If `slice.len() < Self::lanes()` the behavior is undefined. #[inline] pub unsafe fn from_slice_unaligned_unchecked(slice: &[$elem_ty]) -> Self { use crate::mem::size_of; debug_assert!(slice.len() >= $elem_count); let target_ptr = slice.as_ptr().cast(); let mut x = Self::splat(0 as $elem_ty); let self_ptr = &mut x as *mut Self as *mut u8; crate::ptr::copy_nonoverlapping(target_ptr, self_ptr, size_of::()); x } } test_if! { $test_tt: paste::item! { // Comparisons use integer casts within mantissa^1 range. 
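// A minimal usage sketch for the constructors above, assuming an `f32x4`
// instantiation of this macro (illustrative only):
//
//     let data = [1.0_f32, 2.0, 3.0, 4.0, 5.0];
//     // from_slice_unaligned only requires data[1..].len() >= f32x4::lanes():
//     let v = f32x4::from_slice_unaligned(&data[1..]);
//     assert_eq!(v, f32x4::new(2.0, 3.0, 4.0, 5.0));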
#[allow(clippy::float_cmp)] pub mod [<$id _slice_from_slice>] { use super::*; use crate::iter::Iterator; #[cfg_attr(not(target_arch = "wasm32"), test)] #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test)] fn from_slice_unaligned() { let mut unaligned = [42 as $elem_ty; $id::lanes() + 1]; unaligned[0] = 0 as $elem_ty; let vec = $id::from_slice_unaligned(&unaligned[1..]); for (index, &b) in unaligned.iter().enumerate() { if index == 0 { assert_eq!(b, 0 as $elem_ty); } else { assert_eq!(b, 42 as $elem_ty); assert_eq!(b, vec.extract(index - 1)); } } } // FIXME: wasm-bindgen-test does not support #[should_panic] // #[cfg_attr(not(target_arch = "wasm32"), test)] // #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test)] #[cfg(not(target_arch = "wasm32"))] #[test] #[should_panic] fn from_slice_unaligned_fail() { let mut unaligned = [42 as $elem_ty; $id::lanes() + 1]; unaligned[0] = 0 as $elem_ty; // the slice is not large enough => panic let _vec = $id::from_slice_unaligned(&unaligned[2..]); } union A { data: [$elem_ty; 2 * $id::lanes()], _vec: $id, } #[cfg_attr(not(target_arch = "wasm32"), test)] #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test)] fn from_slice_aligned() { let mut aligned = A { data: [0 as $elem_ty; 2 * $id::lanes()], }; for i in $id::lanes()..(2 * $id::lanes()) { unsafe { aligned.data[i] = 42 as $elem_ty; } } let vec = unsafe { $id::from_slice_aligned( &aligned.data[$id::lanes()..] ) }; for (index, &b) in unsafe { aligned.data.iter().enumerate() } { if index < $id::lanes() { assert_eq!(b, 0 as $elem_ty); } else { assert_eq!(b, 42 as $elem_ty); assert_eq!( b, vec.extract(index - $id::lanes()) ); } } } // FIXME: wasm-bindgen-test does not support #[should_panic] // #[cfg_attr(not(target_arch = "wasm32"), test)] // #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test)] #[cfg(not(target_arch = "wasm32"))] #[test] #[should_panic] fn from_slice_aligned_fail_lanes() { let aligned = A { data: [0 as $elem_ty; 2 * $id::lanes()], }; let _vec = unsafe { $id::from_slice_aligned( &aligned.data[2 * $id::lanes()..] ) }; } // FIXME: wasm-bindgen-test does not support #[should_panic] // #[cfg_attr(not(target_arch = "wasm32"), test)] // #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test)] #[cfg(not(target_arch = "wasm32"))] #[test] #[should_panic] fn from_slice_aligned_fail_align() { unsafe { let aligned = A { data: [0 as $elem_ty; 2 * $id::lanes()], }; // get a pointer to the front of data let ptr: *const $elem_ty = aligned.data.as_ptr() as *const $elem_ty; // offset pointer by one element let ptr = ptr.wrapping_add(1); if ptr.align_offset( crate::mem::align_of::<$id>() ) == 0 { // the pointer is properly aligned, so // from_slice_aligned won't fail here (e.g. this // can happen for i128x1). So we panic to make // the "should_fail" test pass: panic!("ok"); } // create a slice - this is safe, because the // elements of the slice exist, are properly // initialized, and properly aligned: let s: &[$elem_ty] = slice::from_raw_parts( ptr, $id::lanes() ); // this should always panic because the slice // alignment does not match the alignment // requirements for the vector type: let _vec = $id::from_slice_aligned(s); } } } } } }; } ================================================ FILE: src/api/slice/write_to_slice.rs ================================================ //! Implements methods to write a vector type to a slice. macro_rules! impl_slice_write_to_slice { ([$elem_ty:ident; $elem_count:expr]: $id:ident | $test_tt:tt) => { impl $id { /// Writes the values of the vector to the `slice`. 
/// /// # Panics /// /// If `slice.len() < Self::lanes()` or `&slice[0]` is not /// aligned to an `align_of::()` boundary. #[inline] pub fn write_to_slice_aligned(self, slice: &mut [$elem_ty]) { unsafe { assert!(slice.len() >= $elem_count); let target_ptr = slice.as_mut_ptr(); assert_eq!(target_ptr.align_offset(crate::mem::align_of::()), 0); self.write_to_slice_aligned_unchecked(slice); } } /// Writes the values of the vector to the `slice`. /// /// # Panics /// /// If `slice.len() < Self::lanes()`. #[inline] pub fn write_to_slice_unaligned(self, slice: &mut [$elem_ty]) { unsafe { assert!(slice.len() >= $elem_count); self.write_to_slice_unaligned_unchecked(slice); } } /// Writes the values of the vector to the `slice`. /// /// # Safety /// /// If `slice.len() < Self::lanes()` or `&slice[0]` is not /// aligned to an `align_of::()` boundary, the behavior is /// undefined. #[inline] pub unsafe fn write_to_slice_aligned_unchecked(self, slice: &mut [$elem_ty]) { debug_assert!(slice.len() >= $elem_count); let target_ptr = slice.as_mut_ptr(); debug_assert_eq!(target_ptr.align_offset(crate::mem::align_of::()), 0); #[allow(clippy::cast_ptr_alignment)] #[allow(clippy::cast_ptr_alignment)] #[allow(clippy::cast_ptr_alignment)] #[allow(clippy::cast_ptr_alignment)] *(target_ptr as *mut Self) = self; } /// Writes the values of the vector to the `slice`. /// /// # Safety /// /// If `slice.len() < Self::lanes()` the behavior is undefined. #[inline] pub unsafe fn write_to_slice_unaligned_unchecked(self, slice: &mut [$elem_ty]) { debug_assert!(slice.len() >= $elem_count); let target_ptr = slice.as_mut_ptr().cast(); let self_ptr = &self as *const Self as *const u8; crate::ptr::copy_nonoverlapping(self_ptr, target_ptr, crate::mem::size_of::()); } } test_if! { $test_tt: paste::item! { // Comparisons use integer casts within mantissa^1 range. #[allow(clippy::float_cmp)] pub mod [<$id _slice_write_to_slice>] { use super::*; use crate::iter::Iterator; #[cfg_attr(not(target_arch = "wasm32"), test)] #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test)] fn write_to_slice_unaligned() { let mut unaligned = [0 as $elem_ty; $id::lanes() + 1]; let vec = $id::splat(42 as $elem_ty); vec.write_to_slice_unaligned(&mut unaligned[1..]); for (index, &b) in unaligned.iter().enumerate() { if index == 0 { assert_eq!(b, 0 as $elem_ty); } else { assert_eq!(b, 42 as $elem_ty); assert_eq!(b, vec.extract(index - 1)); } } } // FIXME: wasm-bindgen-test does not support #[should_panic] // #[cfg_attr(not(target_arch = "wasm32"), test)] // #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test)] #[cfg(not(target_arch = "wasm32"))] #[test] #[should_panic] fn write_to_slice_unaligned_fail() { let mut unaligned = [0 as $elem_ty; $id::lanes() + 1]; let vec = $id::splat(42 as $elem_ty); vec.write_to_slice_unaligned(&mut unaligned[2..]); } union A { data: [$elem_ty; 2 * $id::lanes()], _vec: $id, } #[cfg_attr(not(target_arch = "wasm32"), test)] #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test)] fn write_to_slice_aligned() { let mut aligned = A { data: [0 as $elem_ty; 2 * $id::lanes()], }; let vec = $id::splat(42 as $elem_ty); unsafe { vec.write_to_slice_aligned( &mut aligned.data[$id::lanes()..] 
); for (idx, &b) in aligned.data.iter().enumerate() { if idx < $id::lanes() { assert_eq!(b, 0 as $elem_ty); } else { assert_eq!(b, 42 as $elem_ty); assert_eq!( b, vec.extract(idx - $id::lanes()) ); } } } } // FIXME: wasm-bindgen-test does not support #[should_panic] // #[cfg_attr(not(target_arch = "wasm32"), test)] // #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test)] #[cfg(not(target_arch = "wasm32"))] #[test] #[should_panic] fn write_to_slice_aligned_fail_lanes() { let mut aligned = A { data: [0 as $elem_ty; 2 * $id::lanes()], }; let vec = $id::splat(42 as $elem_ty); unsafe { vec.write_to_slice_aligned( &mut aligned.data[2 * $id::lanes()..] ) }; } // FIXME: wasm-bindgen-test does not support #[should_panic] // #[cfg_attr(not(target_arch = "wasm32"), test)] // #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test)] #[cfg(not(target_arch = "wasm32"))] #[test] #[should_panic] fn write_to_slice_aligned_fail_align() { unsafe { let mut aligned = A { data: [0 as $elem_ty; 2 * $id::lanes()], }; // get a pointer to the front of data let ptr: *mut $elem_ty = aligned.data.as_mut_ptr() as *mut $elem_ty; // offset pointer by one element let ptr = ptr.wrapping_add(1); if ptr.align_offset(crate::mem::align_of::<$id>()) == 0 { // the pointer is properly aligned, so // write_to_slice_aligned won't fail here (e.g. // this can happen for i128x1). So we panic to // make the "should_fail" test pass: panic!("ok"); } // create a slice - this is safe, because the // elements of the slice exist, are properly // initialized, and properly aligned: let s: &mut [$elem_ty] = slice::from_raw_parts_mut(ptr, $id::lanes()); // this should always panic because the slice // alignment does not match the alignment // requirements for the vector type: let vec = $id::splat(42 as $elem_ty); vec.write_to_slice_aligned(s); } } } } } }; } ================================================ FILE: src/api/slice.rs ================================================ //! Slice from/to methods #[macro_use] mod from_slice; #[macro_use] mod write_to_slice; ================================================ FILE: src/api/swap_bytes.rs ================================================ //! Horizontal swap bytes macro_rules! impl_swap_bytes { ([$elem_ty:ident; $elem_count:expr]: $id:ident | $test_tt:tt) => { impl $id { /// Reverses the byte order of the vector. #[inline] pub fn swap_bytes(self) -> Self { super::codegen::swap_bytes::SwapBytes::swap_bytes(self) } /// Converts self to little endian from the target's endianness. /// /// On little endian this is a no-op. On big endian the bytes are /// swapped. #[inline] pub fn to_le(self) -> Self { #[cfg(target_endian = "little")] { self } #[cfg(not(target_endian = "little"))] { self.swap_bytes() } } /// Converts self to big endian from the target's endianness. /// /// On big endian this is a no-op. On little endian the bytes are /// swapped. #[inline] pub fn to_be(self) -> Self { #[cfg(target_endian = "big")] { self } #[cfg(not(target_endian = "big"))] { self.swap_bytes() } } /// Converts a vector from little endian to the target's endianness. /// /// On little endian this is a no-op. On big endian the bytes are /// swapped. #[inline] pub fn from_le(x: Self) -> Self { #[cfg(target_endian = "little")] { x } #[cfg(not(target_endian = "little"))] { x.swap_bytes() } } /// Converts a vector from big endian to the target's endianness. /// /// On big endian this is a no-op. On little endian the bytes are /// swapped. 
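///
/// # Examples
///
/// A sketch of the relationship to `swap_bytes` (illustrative; the concrete
/// lane values depend on the target's endianness):
///
/// ```
/// # use packed_simd::*;
/// let x = u16x2::new(0x0102, 0x0304);
/// if cfg!(target_endian = "big") {
///     assert_eq!(u16x2::from_be(x), x); // no-op on big endian
/// } else {
///     assert_eq!(u16x2::from_be(x), x.swap_bytes()); // bytes are swapped
/// }
/// ```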
#[inline] pub fn from_be(x: Self) -> Self { #[cfg(target_endian = "big")] { x } #[cfg(not(target_endian = "big"))] { x.swap_bytes() } } } test_if! { $test_tt: paste::item! { pub mod [<$id _swap_bytes>] { use super::*; const BYTES: [u8; 64] = [ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, ]; macro_rules! swap { ($func: ident) => {{ // catch possible future >512 vectors assert!(mem::size_of::<$id>() <= 64); let mut actual = BYTES; let elems: &mut [$elem_ty] = unsafe { slice::from_raw_parts_mut( actual.as_mut_ptr() as *mut $elem_ty, $id::lanes(), ) }; let vec = $id::from_slice_unaligned(elems); $id::$func(vec).write_to_slice_unaligned(elems); actual }}; } macro_rules! test_swap { ($func: ident) => {{ let actual = swap!($func); let expected = BYTES.iter().rev() .skip(64 - crate::mem::size_of::<$id>()); assert!(actual.iter().zip(expected) .all(|(x, y)| x == y)); }}; } macro_rules! test_no_swap { ($func: ident) => {{ let actual = swap!($func); let expected = BYTES.iter() .take(mem::size_of::<$id>()); assert!(actual.iter().zip(expected) .all(|(x, y)| x == y)); }}; } #[cfg_attr(not(target_arch = "wasm32"), test)] #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test)] fn swap_bytes() { test_swap!(swap_bytes); } #[cfg_attr(not(target_arch = "wasm32"), test)] #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test)] fn to_le() { #[cfg(target_endian = "little")] { test_no_swap!(to_le); } #[cfg(not(target_endian = "little"))] { test_swap!(to_le); } } #[cfg_attr(not(target_arch = "wasm32"), test)] #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test)] fn to_be() { #[cfg(target_endian = "big")] { test_no_swap!(to_be); } #[cfg(not(target_endian = "big"))] { test_swap!(to_be); } } #[cfg_attr(not(target_arch = "wasm32"), test)] #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test)] fn from_le() { #[cfg(target_endian = "little")] { test_no_swap!(from_le); } #[cfg(not(target_endian = "little"))] { test_swap!(from_le); } } #[cfg_attr(not(target_arch = "wasm32"), test)] #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test)] fn from_be() { #[cfg(target_endian = "big")] { test_no_swap!(from_be); } #[cfg(not(target_endian = "big"))] { test_swap!(from_be); } } } } } }; } ================================================ FILE: src/api.rs ================================================ //! Implements the Simd<[T; N]> APIs #[macro_use] mod bitmask; pub(crate) mod cast; #[macro_use] mod cmp; #[macro_use] mod default; #[macro_use] mod fmt; #[macro_use] mod from; #[macro_use] mod hash; #[macro_use] mod math; #[macro_use] mod minimal; #[macro_use] mod ops; #[macro_use] mod ptr; #[macro_use] mod reductions; #[macro_use] mod select; #[macro_use] mod shuffle; #[macro_use] mod shuffle1_dyn; #[macro_use] mod slice; #[macro_use] mod swap_bytes; #[macro_use] mod bit_manip; #[cfg(feature = "into_bits")] pub(crate) mod into_bits; macro_rules! 
impl_i { ([$elem_ty:ident; $elem_n:expr]: $tuple_id:ident, $mask_ty:ident | $ielem_ty:ident, $ibitmask_ty:ident | $test_tt:tt | $($elem_ids:ident),* | From: $($from_vec_ty:ident),* | $(#[$doc:meta])*) => { impl_minimal_iuf!([$elem_ty; $elem_n]: $tuple_id | $ielem_ty | $test_tt | $($elem_ids),* | $(#[$doc])*); impl_ops_vector_arithmetic!([$elem_ty; $elem_n]: $tuple_id | $test_tt); impl_ops_scalar_arithmetic!([$elem_ty; $elem_n]: $tuple_id | $test_tt); impl_ops_vector_bitwise!( [$elem_ty; $elem_n]: $tuple_id | $test_tt | (!(0 as $elem_ty), 0) ); impl_ops_scalar_bitwise!( [$elem_ty; $elem_n]: $tuple_id | $test_tt | (!(0 as $elem_ty), 0) ); impl_ops_vector_shifts!([$elem_ty; $elem_n]: $tuple_id | $test_tt); impl_ops_scalar_shifts!([$elem_ty; $elem_n]: $tuple_id | $test_tt); impl_ops_vector_rotates!([$elem_ty; $elem_n]: $tuple_id | $test_tt); impl_ops_vector_neg!([$elem_ty; $elem_n]: $tuple_id | $test_tt); impl_ops_vector_int_min_max!( [$elem_ty; $elem_n]: $tuple_id | $test_tt ); impl_reduction_integer_arithmetic!( [$elem_ty; $elem_n]: $tuple_id | $ielem_ty | $test_tt ); impl_reduction_min_max!( [$elem_ty; $elem_n]: $tuple_id | $ielem_ty | $test_tt ); impl_reduction_bitwise!( [$elem_ty; $elem_n]: $tuple_id | $ielem_ty | $test_tt | (|x|{ x as $elem_ty }) | (!(0 as $elem_ty), 0) ); impl_fmt_debug!([$elem_ty; $elem_n]: $tuple_id | $test_tt); impl_fmt_lower_hex!([$elem_ty; $elem_n]: $tuple_id | $test_tt); impl_fmt_upper_hex!([$elem_ty; $elem_n]: $tuple_id | $test_tt); impl_fmt_octal!([$elem_ty; $elem_n]: $tuple_id | $test_tt); impl_fmt_binary!([$elem_ty; $elem_n]: $tuple_id | $test_tt); impl_from_array!([$elem_ty; $elem_n]: $tuple_id | $test_tt | (1, 1)); impl_from_vectors!( [$elem_ty; $elem_n]: $tuple_id | $test_tt | $($from_vec_ty),* ); impl_default!([$elem_ty; $elem_n]: $tuple_id | $test_tt); impl_hash!([$elem_ty; $elem_n]: $tuple_id | $test_tt); impl_slice_from_slice!([$elem_ty; $elem_n]: $tuple_id | $test_tt); impl_slice_write_to_slice!([$elem_ty; $elem_n]: $tuple_id | $test_tt); impl_swap_bytes!([$elem_ty; $elem_n]: $tuple_id | $test_tt); impl_bit_manip!([$elem_ty; $elem_n]: $tuple_id | $test_tt); impl_shuffle1_dyn!([$elem_ty; $elem_n]: $tuple_id | $test_tt); impl_cmp_partial_eq!( [$elem_ty; $elem_n]: $tuple_id | $test_tt | (0, 1) ); impl_cmp_eq!([$elem_ty; $elem_n]: $tuple_id | $test_tt | (0, 1)); impl_cmp_vertical!( [$elem_ty; $elem_n]: $tuple_id, $mask_ty, false, (1, 0) | $test_tt ); impl_cmp_partial_ord!([$elem_ty; $elem_n]: $tuple_id | $test_tt); impl_cmp_ord!([$elem_ty; $elem_n]: $tuple_id | $test_tt | (0, 1)); impl_bitmask!($tuple_id | $ibitmask_ty | (-1, 0) | $test_tt); test_select!($elem_ty, $mask_ty, $tuple_id, (1, 2) | $test_tt); test_cmp_partial_ord_int!([$elem_ty; $elem_n]: $tuple_id | $test_tt); test_shuffle1_dyn!([$elem_ty; $elem_n]: $tuple_id | $test_tt); } } macro_rules! 
impl_u { ([$elem_ty:ident; $elem_n:expr]: $tuple_id:ident, $mask_ty:ident | $ielem_ty:ident, $ibitmask_ty:ident | $test_tt:tt | $($elem_ids:ident),* | From: $($from_vec_ty:ident),* | $(#[$doc:meta])*) => { impl_minimal_iuf!([$elem_ty; $elem_n]: $tuple_id | $ielem_ty | $test_tt | $($elem_ids),* | $(#[$doc])*); impl_ops_vector_arithmetic!([$elem_ty; $elem_n]: $tuple_id | $test_tt); impl_ops_scalar_arithmetic!([$elem_ty; $elem_n]: $tuple_id | $test_tt); impl_ops_vector_bitwise!( [$elem_ty; $elem_n]: $tuple_id | $test_tt | (!(0 as $elem_ty), 0) ); impl_ops_scalar_bitwise!( [$elem_ty; $elem_n]: $tuple_id | $test_tt | (!(0 as $elem_ty), 0) ); impl_ops_vector_shifts!([$elem_ty; $elem_n]: $tuple_id | $test_tt); impl_ops_scalar_shifts!([$elem_ty; $elem_n]: $tuple_id | $test_tt); impl_ops_vector_rotates!([$elem_ty; $elem_n]: $tuple_id | $test_tt); impl_ops_vector_int_min_max!( [$elem_ty; $elem_n]: $tuple_id | $test_tt ); impl_reduction_integer_arithmetic!( [$elem_ty; $elem_n]: $tuple_id | $ielem_ty | $test_tt ); impl_reduction_min_max!( [$elem_ty; $elem_n]: $tuple_id | $ielem_ty | $test_tt ); impl_reduction_bitwise!( [$elem_ty; $elem_n]: $tuple_id | $ielem_ty | $test_tt | (|x|{ x as $elem_ty }) | (!(0 as $elem_ty), 0) ); impl_fmt_debug!([$elem_ty; $elem_n]: $tuple_id | $test_tt); impl_fmt_lower_hex!([$elem_ty; $elem_n]: $tuple_id | $test_tt); impl_fmt_upper_hex!([$elem_ty; $elem_n]: $tuple_id | $test_tt); impl_fmt_octal!([$elem_ty; $elem_n]: $tuple_id | $test_tt); impl_fmt_binary!([$elem_ty; $elem_n]: $tuple_id | $test_tt); impl_from_array!([$elem_ty; $elem_n]: $tuple_id | $test_tt | (1, 1)); impl_from_vectors!( [$elem_ty; $elem_n]: $tuple_id | $test_tt | $($from_vec_ty),* ); impl_default!([$elem_ty; $elem_n]: $tuple_id | $test_tt); impl_hash!([$elem_ty; $elem_n]: $tuple_id | $test_tt); impl_slice_from_slice!([$elem_ty; $elem_n]: $tuple_id | $test_tt); impl_slice_write_to_slice!([$elem_ty; $elem_n]: $tuple_id | $test_tt); impl_swap_bytes!([$elem_ty; $elem_n]: $tuple_id | $test_tt); impl_bit_manip!([$elem_ty; $elem_n]: $tuple_id | $test_tt); impl_shuffle1_dyn!([$elem_ty; $elem_n]: $tuple_id | $test_tt); impl_cmp_partial_eq!( [$elem_ty; $elem_n]: $tuple_id | $test_tt | (1, 0) ); impl_cmp_eq!([$elem_ty; $elem_n]: $tuple_id | $test_tt | (0, 1)); impl_cmp_vertical!( [$elem_ty; $elem_n]: $tuple_id, $mask_ty, false, (1, 0) | $test_tt ); impl_cmp_partial_ord!([$elem_ty; $elem_n]: $tuple_id | $test_tt); impl_cmp_ord!([$elem_ty; $elem_n]: $tuple_id | $test_tt | (0, 1)); impl_bitmask!($tuple_id | $ibitmask_ty | ($ielem_ty::max_value(), 0) | $test_tt); test_select!($elem_ty, $mask_ty, $tuple_id, (1, 2) | $test_tt); test_cmp_partial_ord_int!([$elem_ty; $elem_n]: $tuple_id | $test_tt); test_shuffle1_dyn!([$elem_ty; $elem_n]: $tuple_id | $test_tt); } } macro_rules! 
impl_f { ([$elem_ty:ident; $elem_n:expr]: $tuple_id:ident, $mask_ty:ident | $ielem_ty:ident | $test_tt:tt | $($elem_ids:ident),* | From: $($from_vec_ty:ident),* | $(#[$doc:meta])*) => { impl_minimal_iuf!([$elem_ty; $elem_n]: $tuple_id | $ielem_ty | $test_tt | $($elem_ids),* | $(#[$doc])*); impl_ops_vector_arithmetic!([$elem_ty; $elem_n]: $tuple_id | $test_tt); impl_ops_scalar_arithmetic!([$elem_ty; $elem_n]: $tuple_id | $test_tt); impl_ops_vector_neg!([$elem_ty; $elem_n]: $tuple_id | $test_tt); impl_ops_vector_float_min_max!( [$elem_ty; $elem_n]: $tuple_id | $test_tt ); impl_reduction_float_arithmetic!( [$elem_ty; $elem_n]: $tuple_id | $test_tt); impl_reduction_min_max!( [$elem_ty; $elem_n]: $tuple_id | $ielem_ty | $test_tt ); impl_fmt_debug!([$elem_ty; $elem_n]: $tuple_id | $test_tt); impl_from_array!([$elem_ty; $elem_n]: $tuple_id | $test_tt | (1., 1.)); impl_from_vectors!( [$elem_ty; $elem_n]: $tuple_id | $test_tt | $($from_vec_ty),* ); impl_default!([$elem_ty; $elem_n]: $tuple_id | $test_tt); impl_cmp_partial_eq!( [$elem_ty; $elem_n]: $tuple_id | $test_tt | (1., 0.) ); impl_slice_from_slice!([$elem_ty; $elem_n]: $tuple_id | $test_tt); impl_slice_write_to_slice!([$elem_ty; $elem_n]: $tuple_id | $test_tt); impl_shuffle1_dyn!([$elem_ty; $elem_n]: $tuple_id | $test_tt); impl_float_consts!([$elem_ty; $elem_n]: $tuple_id); impl_float_category!([$elem_ty; $elem_n]: $tuple_id, $mask_ty); // floating-point math impl_math_float_abs!([$elem_ty; $elem_n]: $tuple_id | $test_tt); impl_math_float_cos!([$elem_ty; $elem_n]: $tuple_id | $test_tt); impl_math_float_exp!([$elem_ty; $elem_n]: $tuple_id | $test_tt); impl_math_float_ln!([$elem_ty; $elem_n]: $tuple_id | $test_tt); impl_math_float_mul_add!([$elem_ty; $elem_n]: $tuple_id | $test_tt); impl_math_float_mul_adde!([$elem_ty; $elem_n]: $tuple_id | $test_tt); impl_math_float_powf!([$elem_ty; $elem_n]: $tuple_id | $test_tt); impl_math_float_recpre!([$elem_ty; $elem_n]: $tuple_id | $test_tt); impl_math_float_rsqrte!([$elem_ty; $elem_n]: $tuple_id | $test_tt); impl_math_float_sin!([$elem_ty; $elem_n]: $tuple_id | $test_tt); impl_math_float_sqrt!([$elem_ty; $elem_n]: $tuple_id | $test_tt); impl_math_float_sqrte!([$elem_ty; $elem_n]: $tuple_id | $test_tt); impl_math_float_tanh!([$elem_ty; $elem_n]: $tuple_id | $test_tt); impl_cmp_vertical!( [$elem_ty; $elem_n]: $tuple_id, $mask_ty, false, (1., 0.) | $test_tt ); test_select!($elem_ty, $mask_ty, $tuple_id, (1., 2.) | $test_tt); test_reduction_float_min_max!( [$elem_ty; $elem_n]: $tuple_id | $test_tt ); test_shuffle1_dyn!([$elem_ty; $elem_n]: $tuple_id | $test_tt); } } macro_rules! 
impl_m { ([$elem_ty:ident; $elem_n:expr]: $tuple_id:ident | $ielem_ty:ident, $ibitmask_ty:ident | $test_tt:tt | $($elem_ids:ident),* | From: $($from_vec_ty:ident),* | $(#[$doc:meta])*) => { impl_minimal_mask!( [$elem_ty; $elem_n]: $tuple_id | $ielem_ty | $test_tt | $($elem_ids),* | $(#[$doc])* ); impl_ops_vector_mask_bitwise!( [$elem_ty; $elem_n]: $tuple_id | $test_tt | (true, false) ); impl_ops_scalar_mask_bitwise!( [$elem_ty; $elem_n]: $tuple_id | $test_tt | (true, false) ); impl_reduction_bitwise!( [bool; $elem_n]: $tuple_id | $ielem_ty | $test_tt | (|x|{ x != 0 }) | (true, false) ); impl_reduction_mask!([$elem_ty; $elem_n]: $tuple_id | $test_tt); impl_fmt_debug!([bool; $elem_n]: $tuple_id | $test_tt); impl_from_array!( [$elem_ty; $elem_n]: $tuple_id | $test_tt | (crate::$elem_ty::new(true), true) ); impl_from_vectors!( [$elem_ty; $elem_n]: $tuple_id | $test_tt | $($from_vec_ty),* ); impl_default!([bool; $elem_n]: $tuple_id | $test_tt); impl_cmp_partial_eq!( [$elem_ty; $elem_n]: $tuple_id | $test_tt | (true, false) ); impl_cmp_eq!( [$elem_ty; $elem_n]: $tuple_id | $test_tt | (true, false) ); impl_cmp_vertical!( [$elem_ty; $elem_n]: $tuple_id, $tuple_id, true, (true, false) | $test_tt ); impl_select!([$elem_ty; $elem_n]: $tuple_id | $test_tt); impl_cmp_partial_ord!([$elem_ty; $elem_n]: $tuple_id | $test_tt); impl_cmp_ord!( [$elem_ty; $elem_n]: $tuple_id | $test_tt | (false, true) ); impl_shuffle1_dyn!([$elem_ty; $elem_n]: $tuple_id | $test_tt); impl_bitmask!($tuple_id | $ibitmask_ty | (true, false) | $test_tt); test_cmp_partial_ord_mask!([$elem_ty; $elem_n]: $tuple_id | $test_tt); test_shuffle1_dyn_mask!([$elem_ty; $elem_n]: $tuple_id | $test_tt); } } macro_rules! impl_const_p { ([$elem_ty:ty; $elem_n:expr]: $tuple_id:ident, $mask_ty:ident, $usize_ty:ident, $isize_ty:ident | $test_tt:tt | $($elem_ids:ident),* | From: $($from_vec_ty:ident),* | $(#[$doc:meta])*) => { impl_minimal_p!( [$elem_ty; $elem_n]: $tuple_id, $mask_ty, $usize_ty, $isize_ty | ref_ | $test_tt | $($elem_ids),* | (1 as $elem_ty, 0 as $elem_ty) | $(#[$doc])* ); impl_ptr_read!([$elem_ty; $elem_n]: $tuple_id, $mask_ty | $test_tt); } } macro_rules! impl_mut_p { ([$elem_ty:ty; $elem_n:expr]: $tuple_id:ident, $mask_ty:ident, $usize_ty:ident, $isize_ty:ident | $test_tt:tt | $($elem_ids:ident),* | From: $($from_vec_ty:ident),* | $(#[$doc:meta])*) => { impl_minimal_p!( [$elem_ty; $elem_n]: $tuple_id, $mask_ty, $usize_ty, $isize_ty | ref_mut_ | $test_tt | $($elem_ids),* | (1 as $elem_ty, 0 as $elem_ty) | $(#[$doc])* ); impl_ptr_read!([$elem_ty; $elem_n]: $tuple_id, $mask_ty | $test_tt); impl_ptr_write!([$elem_ty; $elem_n]: $tuple_id, $mask_ty | $test_tt); } } ================================================ FILE: src/codegen/bit_manip.rs ================================================ //! LLVM bit manipulation intrinsics. 
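//!
//! The `BitManip` trait below routes each vector type to the `llvm.ctpop`,
//! `llvm.ctlz` and `llvm.cttz` intrinsic matching its lane width and lane
//! count, with scalar fallbacks where the intrinsics are known to be broken
//! (s390x, and `cttz` for `u8x8`/`i8x8` on aarch64). The crate's
//! `count_ones`/`leading_zeros`/`trailing_zeros`-style wrappers build on it.
//! Lane-wise the semantics match the scalar methods; an illustrative sketch
//! assuming `u8x2`:
//!
//! ```ignore
//! let x = u8x2::new(0b0000_0100, 0);
//! assert_eq!(x.count_ones(),     u8x2::new(1, 0));
//! assert_eq!(x.leading_zeros(),  u8x2::new(5, 8));
//! assert_eq!(x.trailing_zeros(), u8x2::new(2, 8));
//! ```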
#[rustfmt::skip] pub(crate) use crate::*; #[allow(improper_ctypes, dead_code)] extern "C" { #[link_name = "llvm.ctlz.v2i8"] fn ctlz_u8x2(x: u8x2, is_zero_undef: bool) -> u8x2; #[link_name = "llvm.ctlz.v4i8"] fn ctlz_u8x4(x: u8x4, is_zero_undef: bool) -> u8x4; #[link_name = "llvm.ctlz.v8i8"] fn ctlz_u8x8(x: u8x8, is_zero_undef: bool) -> u8x8; #[link_name = "llvm.ctlz.v16i8"] fn ctlz_u8x16(x: u8x16, is_zero_undef: bool) -> u8x16; #[link_name = "llvm.ctlz.v32i8"] fn ctlz_u8x32(x: u8x32, is_zero_undef: bool) -> u8x32; #[link_name = "llvm.ctlz.v64i8"] fn ctlz_u8x64(x: u8x64, is_zero_undef: bool) -> u8x64; #[link_name = "llvm.ctlz.v2i16"] fn ctlz_u16x2(x: u16x2, is_zero_undef: bool) -> u16x2; #[link_name = "llvm.ctlz.v4i16"] fn ctlz_u16x4(x: u16x4, is_zero_undef: bool) -> u16x4; #[link_name = "llvm.ctlz.v8i16"] fn ctlz_u16x8(x: u16x8, is_zero_undef: bool) -> u16x8; #[link_name = "llvm.ctlz.v16i16"] fn ctlz_u16x16(x: u16x16, is_zero_undef: bool) -> u16x16; #[link_name = "llvm.ctlz.v32i16"] fn ctlz_u16x32(x: u16x32, is_zero_undef: bool) -> u16x32; #[link_name = "llvm.ctlz.v2i32"] fn ctlz_u32x2(x: u32x2, is_zero_undef: bool) -> u32x2; #[link_name = "llvm.ctlz.v4i32"] fn ctlz_u32x4(x: u32x4, is_zero_undef: bool) -> u32x4; #[link_name = "llvm.ctlz.v8i32"] fn ctlz_u32x8(x: u32x8, is_zero_undef: bool) -> u32x8; #[link_name = "llvm.ctlz.v16i32"] fn ctlz_u32x16(x: u32x16, is_zero_undef: bool) -> u32x16; #[link_name = "llvm.ctlz.v2i64"] fn ctlz_u64x2(x: u64x2, is_zero_undef: bool) -> u64x2; #[link_name = "llvm.ctlz.v4i64"] fn ctlz_u64x4(x: u64x4, is_zero_undef: bool) -> u64x4; #[link_name = "llvm.ctlz.v8i64"] fn ctlz_u64x8(x: u64x8, is_zero_undef: bool) -> u64x8; #[link_name = "llvm.ctlz.v1i128"] fn ctlz_u128x1(x: u128x1, is_zero_undef: bool) -> u128x1; #[link_name = "llvm.ctlz.v2i128"] fn ctlz_u128x2(x: u128x2, is_zero_undef: bool) -> u128x2; #[link_name = "llvm.ctlz.v4i128"] fn ctlz_u128x4(x: u128x4, is_zero_undef: bool) -> u128x4; #[link_name = "llvm.cttz.v2i8"] fn cttz_u8x2(x: u8x2, is_zero_undef: bool) -> u8x2; #[link_name = "llvm.cttz.v4i8"] fn cttz_u8x4(x: u8x4, is_zero_undef: bool) -> u8x4; #[link_name = "llvm.cttz.v8i8"] fn cttz_u8x8(x: u8x8, is_zero_undef: bool) -> u8x8; #[link_name = "llvm.cttz.v16i8"] fn cttz_u8x16(x: u8x16, is_zero_undef: bool) -> u8x16; #[link_name = "llvm.cttz.v32i8"] fn cttz_u8x32(x: u8x32, is_zero_undef: bool) -> u8x32; #[link_name = "llvm.cttz.v64i8"] fn cttz_u8x64(x: u8x64, is_zero_undef: bool) -> u8x64; #[link_name = "llvm.cttz.v2i16"] fn cttz_u16x2(x: u16x2, is_zero_undef: bool) -> u16x2; #[link_name = "llvm.cttz.v4i16"] fn cttz_u16x4(x: u16x4, is_zero_undef: bool) -> u16x4; #[link_name = "llvm.cttz.v8i16"] fn cttz_u16x8(x: u16x8, is_zero_undef: bool) -> u16x8; #[link_name = "llvm.cttz.v16i16"] fn cttz_u16x16(x: u16x16, is_zero_undef: bool) -> u16x16; #[link_name = "llvm.cttz.v32i16"] fn cttz_u16x32(x: u16x32, is_zero_undef: bool) -> u16x32; #[link_name = "llvm.cttz.v2i32"] fn cttz_u32x2(x: u32x2, is_zero_undef: bool) -> u32x2; #[link_name = "llvm.cttz.v4i32"] fn cttz_u32x4(x: u32x4, is_zero_undef: bool) -> u32x4; #[link_name = "llvm.cttz.v8i32"] fn cttz_u32x8(x: u32x8, is_zero_undef: bool) -> u32x8; #[link_name = "llvm.cttz.v16i32"] fn cttz_u32x16(x: u32x16, is_zero_undef: bool) -> u32x16; #[link_name = "llvm.cttz.v2i64"] fn cttz_u64x2(x: u64x2, is_zero_undef: bool) -> u64x2; #[link_name = "llvm.cttz.v4i64"] fn cttz_u64x4(x: u64x4, is_zero_undef: bool) -> u64x4; #[link_name = "llvm.cttz.v8i64"] fn cttz_u64x8(x: u64x8, is_zero_undef: bool) -> u64x8; #[link_name = 
"llvm.cttz.v1i128"] fn cttz_u128x1(x: u128x1, is_zero_undef: bool) -> u128x1; #[link_name = "llvm.cttz.v2i128"] fn cttz_u128x2(x: u128x2, is_zero_undef: bool) -> u128x2; #[link_name = "llvm.cttz.v4i128"] fn cttz_u128x4(x: u128x4, is_zero_undef: bool) -> u128x4; #[link_name = "llvm.ctpop.v2i8"] fn ctpop_u8x2(x: u8x2) -> u8x2; #[link_name = "llvm.ctpop.v4i8"] fn ctpop_u8x4(x: u8x4) -> u8x4; #[link_name = "llvm.ctpop.v8i8"] fn ctpop_u8x8(x: u8x8) -> u8x8; #[link_name = "llvm.ctpop.v16i8"] fn ctpop_u8x16(x: u8x16) -> u8x16; #[link_name = "llvm.ctpop.v32i8"] fn ctpop_u8x32(x: u8x32) -> u8x32; #[link_name = "llvm.ctpop.v64i8"] fn ctpop_u8x64(x: u8x64) -> u8x64; #[link_name = "llvm.ctpop.v2i16"] fn ctpop_u16x2(x: u16x2) -> u16x2; #[link_name = "llvm.ctpop.v4i16"] fn ctpop_u16x4(x: u16x4) -> u16x4; #[link_name = "llvm.ctpop.v8i16"] fn ctpop_u16x8(x: u16x8) -> u16x8; #[link_name = "llvm.ctpop.v16i16"] fn ctpop_u16x16(x: u16x16) -> u16x16; #[link_name = "llvm.ctpop.v32i16"] fn ctpop_u16x32(x: u16x32) -> u16x32; #[link_name = "llvm.ctpop.v2i32"] fn ctpop_u32x2(x: u32x2) -> u32x2; #[link_name = "llvm.ctpop.v4i32"] fn ctpop_u32x4(x: u32x4) -> u32x4; #[link_name = "llvm.ctpop.v8i32"] fn ctpop_u32x8(x: u32x8) -> u32x8; #[link_name = "llvm.ctpop.v16i32"] fn ctpop_u32x16(x: u32x16) -> u32x16; #[link_name = "llvm.ctpop.v2i64"] fn ctpop_u64x2(x: u64x2) -> u64x2; #[link_name = "llvm.ctpop.v4i64"] fn ctpop_u64x4(x: u64x4) -> u64x4; #[link_name = "llvm.ctpop.v8i64"] fn ctpop_u64x8(x: u64x8) -> u64x8; #[link_name = "llvm.ctpop.v1i128"] fn ctpop_u128x1(x: u128x1) -> u128x1; #[link_name = "llvm.ctpop.v2i128"] fn ctpop_u128x2(x: u128x2) -> u128x2; #[link_name = "llvm.ctpop.v4i128"] fn ctpop_u128x4(x: u128x4) -> u128x4; } pub(crate) trait BitManip { fn ctpop(self) -> Self; fn ctlz(self) -> Self; fn cttz(self) -> Self; } macro_rules! impl_bit_manip { (inner: $ty:ident, $scalar:ty, $uty:ident, $ctpop:ident, $ctlz:ident, $cttz:ident) => { // FIXME: several LLVM intrinsics break on s390x https://github.com/rust-lang-nursery/packed_simd/issues/192 #[cfg(target_arch = "s390x")] impl_bit_manip! { scalar: $ty, $scalar } #[cfg(not(target_arch = "s390x"))] impl BitManip for $ty { #[inline] fn ctpop(self) -> Self { let y: $uty = self.cast(); unsafe { $ctpop(y).cast() } } #[inline] fn ctlz(self) -> Self { let y: $uty = self.cast(); // the ctxx intrinsics need compile-time constant // `is_zero_undef` unsafe { $ctlz(y, false).cast() } } #[inline] fn cttz(self) -> Self { let y: $uty = self.cast(); unsafe { $cttz(y, false).cast() } } } }; (sized_inner: $ty:ident, $scalar:ty, $uty:ident) => { #[cfg(target_arch = "s390x")] impl_bit_manip! 
{ scalar: $ty, $scalar } #[cfg(not(target_arch = "s390x"))] impl BitManip for $ty { #[inline] fn ctpop(self) -> Self { let y: $uty = self.cast(); $uty::ctpop(y).cast() } #[inline] fn ctlz(self) -> Self { let y: $uty = self.cast(); $uty::ctlz(y).cast() } #[inline] fn cttz(self) -> Self { let y: $uty = self.cast(); $uty::cttz(y).cast() } } }; (scalar: $ty:ident, $scalar:ty) => { impl BitManip for $ty { #[inline] fn ctpop(self) -> Self { let mut ones = self; for i in 0..Self::lanes() { ones = ones.replace(i, self.extract(i).count_ones() as $scalar); } ones } #[inline] fn ctlz(self) -> Self { let mut lz = self; for i in 0..Self::lanes() { lz = lz.replace(i, self.extract(i).leading_zeros() as $scalar); } lz } #[inline] fn cttz(self) -> Self { let mut tz = self; for i in 0..Self::lanes() { tz = tz.replace(i, self.extract(i).trailing_zeros() as $scalar); } tz } } }; ($uty:ident, $uscalar:ty, $ity:ident, $iscalar:ty, $ctpop:ident, $ctlz:ident, $cttz:ident) => { impl_bit_manip! { inner: $uty, $uscalar, $uty, $ctpop, $ctlz, $cttz } impl_bit_manip! { inner: $ity, $iscalar, $uty, $ctpop, $ctlz, $cttz } }; (sized: $usize:ident, $uscalar:ty, $isize:ident, $iscalar:ty, $ty:ident) => { impl_bit_manip! { sized_inner: $usize, $uscalar, $ty } impl_bit_manip! { sized_inner: $isize, $iscalar, $ty } }; } impl_bit_manip! { u8x2 , u8, i8x2, i8, ctpop_u8x2, ctlz_u8x2, cttz_u8x2 } impl_bit_manip! { u8x4 , u8, i8x4, i8, ctpop_u8x4, ctlz_u8x4, cttz_u8x4 } #[cfg(not(target_arch = "aarch64"))] // see below impl_bit_manip! { u8x8 , u8, i8x8, i8, ctpop_u8x8, ctlz_u8x8, cttz_u8x8 } impl_bit_manip! { u8x16 , u8, i8x16, i8, ctpop_u8x16, ctlz_u8x16, cttz_u8x16 } impl_bit_manip! { u8x32 , u8, i8x32, i8, ctpop_u8x32, ctlz_u8x32, cttz_u8x32 } impl_bit_manip! { u8x64 , u8, i8x64, i8, ctpop_u8x64, ctlz_u8x64, cttz_u8x64 } impl_bit_manip! { u16x2 , u16, i16x2, i16, ctpop_u16x2, ctlz_u16x2, cttz_u16x2 } impl_bit_manip! { u16x4 , u16, i16x4, i16, ctpop_u16x4, ctlz_u16x4, cttz_u16x4 } impl_bit_manip! { u16x8 , u16, i16x8, i16, ctpop_u16x8, ctlz_u16x8, cttz_u16x8 } impl_bit_manip! { u16x16 , u16, i16x16, i16, ctpop_u16x16, ctlz_u16x16, cttz_u16x16 } impl_bit_manip! { u16x32 , u16, i16x32, i16, ctpop_u16x32, ctlz_u16x32, cttz_u16x32 } impl_bit_manip! { u32x2 , u32, i32x2, i32, ctpop_u32x2, ctlz_u32x2, cttz_u32x2 } impl_bit_manip! { u32x4 , u32, i32x4, i32, ctpop_u32x4, ctlz_u32x4, cttz_u32x4 } impl_bit_manip! { u32x8 , u32, i32x8, i32, ctpop_u32x8, ctlz_u32x8, cttz_u32x8 } impl_bit_manip! { u32x16 , u32, i32x16, i32, ctpop_u32x16, ctlz_u32x16, cttz_u32x16 } impl_bit_manip! { u64x2 , u64, i64x2, i64, ctpop_u64x2, ctlz_u64x2, cttz_u64x2 } impl_bit_manip! { u64x4 , u64, i64x4, i64, ctpop_u64x4, ctlz_u64x4, cttz_u64x4 } impl_bit_manip! { u64x8 , u64, i64x8, i64, ctpop_u64x8, ctlz_u64x8, cttz_u64x8 } impl_bit_manip! { u128x1 , u128, i128x1, i128, ctpop_u128x1, ctlz_u128x1, cttz_u128x1 } impl_bit_manip! { u128x2 , u128, i128x2, i128, ctpop_u128x2, ctlz_u128x2, cttz_u128x2 } impl_bit_manip! 
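// Editor's sketch (illustrative addition, not part of the original file):
// for a hypothetical `impl_bit_manip! { scalar: u32x4, u32 }` invocation,
// the `scalar:` arm above expands to a plain per-lane loop over the scalar
// integer methods:
//
//     impl BitManip for u32x4 {
//         #[inline]
//         fn ctpop(self) -> Self {
//             let mut ones = self;
//             for i in 0..Self::lanes() {
//                 ones = ones.replace(i, self.extract(i).count_ones() as u32);
//             }
//             ones
//         }
//         // ctlz/cttz are analogous, via leading_zeros()/trailing_zeros().
//     }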
{ u128x4 , u128, i128x4, i128, ctpop_u128x4, ctlz_u128x4, cttz_u128x4 } #[cfg(target_arch = "aarch64")] impl BitManip for u8x8 { #[inline] fn ctpop(self) -> Self { let y: u8x8 = self.cast(); unsafe { ctpop_u8x8(y).cast() } } #[inline] fn ctlz(self) -> Self { let y: u8x8 = self.cast(); unsafe { ctlz_u8x8(y, false).cast() } } #[inline] fn cttz(self) -> Self { // FIXME: LLVM cttz.v8i8 broken on aarch64 https://github.com/rust-lang-nursery/packed_simd/issues/191 // OPTIMIZE: adapt the algorithm used for v8i16/etc to Rust's aarch64 // intrinsics let mut tz = self; for i in 0..Self::lanes() { tz = tz.replace(i, self.extract(i).trailing_zeros() as u8); } tz } } #[cfg(target_arch = "aarch64")] impl BitManip for i8x8 { #[inline] fn ctpop(self) -> Self { let y: u8x8 = self.cast(); unsafe { ctpop_u8x8(y).cast() } } #[inline] fn ctlz(self) -> Self { let y: u8x8 = self.cast(); unsafe { ctlz_u8x8(y, false).cast() } } #[inline] fn cttz(self) -> Self { // FIXME: LLVM cttz.v8i8 broken on aarch64 https://github.com/rust-lang-nursery/packed_simd/issues/191 // OPTIMIZE: adapt the algorithm used for v8i16/etc to Rust's aarch64 // intrinsics let mut tz = self; for i in 0..Self::lanes() { tz = tz.replace(i, self.extract(i).trailing_zeros() as i8); } tz } } cfg_if! { if #[cfg(target_pointer_width = "8")] { impl_bit_manip! { sized: usizex2, usize, isizex2, isize, u8x2 } impl_bit_manip! { sized: usizex4, usize, isizex4, isize, u8x4 } impl_bit_manip! { sized: usizex8, usize, isizex8, isize, u8x8 } } else if #[cfg(target_pointer_width = "16")] { impl_bit_manip! { sized: usizex2, usize, isizex2, isize, u16x2 } impl_bit_manip! { sized: usizex4, usize, isizex4, isize, u16x4 } impl_bit_manip! { sized: usizex8, usize, isizex8, isize, u16x8 } } else if #[cfg(target_pointer_width = "32")] { impl_bit_manip! { sized: usizex2, usize, isizex2, isize, u32x2 } impl_bit_manip! { sized: usizex4, usize, isizex4, isize, u32x4 } impl_bit_manip! { sized: usizex8, usize, isizex8, isize, u32x8 } } else if #[cfg(target_pointer_width = "64")] { impl_bit_manip! { sized: usizex2, usize, isizex2, isize, u64x2 } impl_bit_manip! { sized: usizex4, usize, isizex4, isize, u64x4 } impl_bit_manip! { sized: usizex8, usize, isizex8, isize, u64x8 } } else { compile_error!("unsupported target_pointer_width"); } } ================================================ FILE: src/codegen/llvm.rs ================================================ //! LLVM's platform intrinsics #![allow(dead_code)] use crate::sealed::Shuffle; #[allow(unused_imports)] // FIXME: spurious warning? 
use crate::sealed::Simd;

extern "platform-intrinsic" {
    fn simd_shuffle<T, I, U>(x: T, y: T, idx: I) -> U;
}

#[allow(clippy::missing_safety_doc)]
#[inline]
pub unsafe fn __shuffle_vector2<const IDX: [u32; 2], T, U>(x: T, y: T) -> U
where T: Simd, <T as Simd>::Element: Shuffle<[u32; 2], Output = U> {
    simd_shuffle(x, y, IDX)
}

#[allow(clippy::missing_safety_doc)]
#[inline]
pub unsafe fn __shuffle_vector4<const IDX: [u32; 4], T, U>(x: T, y: T) -> U
where T: Simd, <T as Simd>::Element: Shuffle<[u32; 4], Output = U> {
    simd_shuffle(x, y, IDX)
}

#[allow(clippy::missing_safety_doc)]
#[inline]
pub unsafe fn __shuffle_vector8<const IDX: [u32; 8], T, U>(x: T, y: T) -> U
where T: Simd, <T as Simd>::Element: Shuffle<[u32; 8], Output = U> {
    simd_shuffle(x, y, IDX)
}

#[allow(clippy::missing_safety_doc)]
#[inline]
pub unsafe fn __shuffle_vector16<const IDX: [u32; 16], T, U>(x: T, y: T) -> U
where T: Simd, <T as Simd>::Element: Shuffle<[u32; 16], Output = U> {
    simd_shuffle(x, y, IDX)
}

#[allow(clippy::missing_safety_doc)]
#[inline]
pub unsafe fn __shuffle_vector32<const IDX: [u32; 32], T, U>(x: T, y: T) -> U
where T: Simd, <T as Simd>::Element: Shuffle<[u32; 32], Output = U> {
    simd_shuffle(x, y, IDX)
}

#[allow(clippy::missing_safety_doc)]
#[inline]
pub unsafe fn __shuffle_vector64<const IDX: [u32; 64], T, U>(x: T, y: T) -> U
where T: Simd, <T as Simd>::Element: Shuffle<[u32; 64], Output = U> {
    simd_shuffle(x, y, IDX)
}

extern "platform-intrinsic" {
    pub(crate) fn simd_eq<T, U>(x: T, y: T) -> U;
    pub(crate) fn simd_ne<T, U>(x: T, y: T) -> U;
    pub(crate) fn simd_lt<T, U>(x: T, y: T) -> U;
    pub(crate) fn simd_le<T, U>(x: T, y: T) -> U;
    pub(crate) fn simd_gt<T, U>(x: T, y: T) -> U;
    pub(crate) fn simd_ge<T, U>(x: T, y: T) -> U;

    pub(crate) fn simd_insert<T, U>(x: T, idx: u32, val: U) -> T;
    pub(crate) fn simd_extract<T, U>(x: T, idx: u32) -> U;

    pub(crate) fn simd_cast<T, U>(x: T) -> U;

    pub(crate) fn simd_add<T>(x: T, y: T) -> T;
    pub(crate) fn simd_sub<T>(x: T, y: T) -> T;
    pub(crate) fn simd_mul<T>(x: T, y: T) -> T;
    pub(crate) fn simd_div<T>(x: T, y: T) -> T;
    pub(crate) fn simd_rem<T>(x: T, y: T) -> T;
    pub(crate) fn simd_shl<T>(x: T, y: T) -> T;
    pub(crate) fn simd_shr<T>(x: T, y: T) -> T;
    pub(crate) fn simd_and<T>(x: T, y: T) -> T;
    pub(crate) fn simd_or<T>(x: T, y: T) -> T;
    pub(crate) fn simd_xor<T>(x: T, y: T) -> T;

    pub(crate) fn simd_reduce_add_unordered<T, U>(x: T) -> U;
    pub(crate) fn simd_reduce_mul_unordered<T, U>(x: T) -> U;
    pub(crate) fn simd_reduce_add_ordered<T, U>(x: T, acc: U) -> U;
    pub(crate) fn simd_reduce_mul_ordered<T, U>(x: T, acc: U) -> U;
    pub(crate) fn simd_reduce_min<T, U>(x: T) -> U;
    pub(crate) fn simd_reduce_max<T, U>(x: T) -> U;
    pub(crate) fn simd_reduce_min_nanless<T, U>(x: T) -> U;
    pub(crate) fn simd_reduce_max_nanless<T, U>(x: T) -> U;
    pub(crate) fn simd_reduce_and<T, U>(x: T) -> U;
    pub(crate) fn simd_reduce_or<T, U>(x: T) -> U;
    pub(crate) fn simd_reduce_xor<T, U>(x: T) -> U;
    pub(crate) fn simd_reduce_all<T>(x: T) -> bool;
    pub(crate) fn simd_reduce_any<T>(x: T) -> bool;

    pub(crate) fn simd_select<M, T>(m: M, a: T, b: T) -> T;

    pub(crate) fn simd_fmin<T>(a: T, b: T) -> T;
    pub(crate) fn simd_fmax<T>(a: T, b: T) -> T;

    pub(crate) fn simd_fsqrt<T>(a: T) -> T;
    pub(crate) fn simd_fma<T>(a: T, b: T, c: T) -> T;

    pub(crate) fn simd_gather<T, P, M>(value: T, pointers: P, mask: M) -> T;
    pub(crate) fn simd_scatter<T, P, M>(value: T, pointers: P, mask: M);

    pub(crate) fn simd_bitmask<T, U>(value: T) -> U;
}

================================================ FILE: src/codegen/math/float/abs.rs ================================================ //!
Vertical floating-point `fabs` #![allow(unused)] // FIXME 64-bit 1 elem vectors fabs use crate::*; pub(crate) trait Abs { fn abs(self) -> Self; } #[allow(improper_ctypes)] extern "C" { #[link_name = "llvm.fabs.v2f32"] fn fabs_v2f32(x: f32x2) -> f32x2; #[link_name = "llvm.fabs.v4f32"] fn fabs_v4f32(x: f32x4) -> f32x4; #[link_name = "llvm.fabs.v8f32"] fn fabs_v8f32(x: f32x8) -> f32x8; #[link_name = "llvm.fabs.v16f32"] fn fabs_v16f32(x: f32x16) -> f32x16; /* FIXME 64-bit fabsgle elem vectors #[link_name = "llvm.fabs.v1f64"] fn fabs_v1f64(x: f64x1) -> f64x1; */ #[link_name = "llvm.fabs.v2f64"] fn fabs_v2f64(x: f64x2) -> f64x2; #[link_name = "llvm.fabs.v4f64"] fn fabs_v4f64(x: f64x4) -> f64x4; #[link_name = "llvm.fabs.v8f64"] fn fabs_v8f64(x: f64x8) -> f64x8; #[link_name = "llvm.fabs.f32"] fn fabs_f32(x: f32) -> f32; #[link_name = "llvm.fabs.f64"] fn fabs_f64(x: f64) -> f64; } gen_unary_impl_table!(Abs, abs); cfg_if! { if #[cfg(target_arch = "s390x")] { // FIXME: https://github.com/rust-lang-nursery/packed_simd/issues/14 impl_unary!(f32x2[f32; 2]: fabs_f32); impl_unary!(f32x4[f32; 4]: fabs_f32); impl_unary!(f32x8[f32; 8]: fabs_f32); impl_unary!(f32x16[f32; 16]: fabs_f32); impl_unary!(f64x2[f64; 2]: fabs_f64); impl_unary!(f64x4[f64; 4]: fabs_f64); impl_unary!(f64x8[f64; 8]: fabs_f64); } else if #[cfg(all(target_arch = "x86_64", feature = "sleef-sys"))] { use sleef_sys::*; cfg_if! { if #[cfg(target_feature = "avx2")] { impl_unary!(f32x2[t => f32x4]: Sleef_fabsf4_avx2128); impl_unary!(f32x16[h => f32x8]: Sleef_fabsf8_avx2); impl_unary!(f64x8[h => f64x4]: Sleef_fabsd4_avx2); impl_unary!(f32x4: Sleef_fabsf4_avx2128); impl_unary!(f32x8: Sleef_fabsf8_avx2); impl_unary!(f64x2: Sleef_fabsd2_avx2128); impl_unary!(f64x4: Sleef_fabsd4_avx2); } else if #[cfg(target_feature = "avx")] { impl_unary!(f32x2[t => f32x4]: Sleef_fabsf4_sse4); impl_unary!(f32x16[h => f32x8]: Sleef_fabsf8_avx); impl_unary!(f64x8[h => f64x4]: Sleef_fabsd4_avx); impl_unary!(f32x4: Sleef_fabsf4_sse4); impl_unary!(f32x8: Sleef_fabsf8_avx); impl_unary!(f64x2: Sleef_fabsd2_sse4); impl_unary!(f64x4: Sleef_fabsd4_avx); } else if #[cfg(target_feature = "sse4.2")] { impl_unary!(f32x2[t => f32x4]: Sleef_fabsf4_sse4); impl_unary!(f32x16[q => f32x4]: Sleef_fabsf4_sse4); impl_unary!(f64x8[q => f64x2]: Sleef_fabsd2_sse4); impl_unary!(f32x4: Sleef_fabsf4_sse4); impl_unary!(f32x8[h => f32x4]: Sleef_fabsf4_sse4); impl_unary!(f64x2: Sleef_fabsd2_sse4); impl_unary!(f64x4[h => f64x2]: Sleef_fabsd2_sse4); } else { impl_unary!(f32x2[f32; 2]: fabs_f32); impl_unary!(f32x16: fabs_v16f32); impl_unary!(f64x8: fabs_v8f64); impl_unary!(f32x4: fabs_v4f32); impl_unary!(f32x8: fabs_v8f32); impl_unary!(f64x2: fabs_v2f64); impl_unary!(f64x4: fabs_v4f64); } } } else { impl_unary!(f32x2[f32; 2]: fabs_f32); impl_unary!(f32x4: fabs_v4f32); impl_unary!(f32x8: fabs_v8f32); impl_unary!(f32x16: fabs_v16f32); impl_unary!(f64x2: fabs_v2f64); impl_unary!(f64x4: fabs_v4f64); impl_unary!(f64x8: fabs_v8f64); } } ================================================ FILE: src/codegen/math/float/cos.rs ================================================ //! 
Vertical floating-point `cos` #![allow(unused)] // FIXME 64-bit 1 elem vector cos use crate::*; pub(crate) trait Cos { fn cos(self) -> Self; } #[allow(improper_ctypes)] extern "C" { #[link_name = "llvm.cos.v2f32"] fn cos_v2f32(x: f32x2) -> f32x2; #[link_name = "llvm.cos.v4f32"] fn cos_v4f32(x: f32x4) -> f32x4; #[link_name = "llvm.cos.v8f32"] fn cos_v8f32(x: f32x8) -> f32x8; #[link_name = "llvm.cos.v16f32"] fn cos_v16f32(x: f32x16) -> f32x16; /* FIXME 64-bit cosgle elem vectors #[link_name = "llvm.cos.v1f64"] fn cos_v1f64(x: f64x1) -> f64x1; */ #[link_name = "llvm.cos.v2f64"] fn cos_v2f64(x: f64x2) -> f64x2; #[link_name = "llvm.cos.v4f64"] fn cos_v4f64(x: f64x4) -> f64x4; #[link_name = "llvm.cos.v8f64"] fn cos_v8f64(x: f64x8) -> f64x8; #[link_name = "llvm.cos.f32"] fn cos_f32(x: f32) -> f32; #[link_name = "llvm.cos.f64"] fn cos_f64(x: f64) -> f64; } gen_unary_impl_table!(Cos, cos); cfg_if! { if #[cfg(target_arch = "s390x")] { // FIXME: https://github.com/rust-lang-nursery/packed_simd/issues/14 impl_unary!(f32x2[f32; 2]: cos_f32); impl_unary!(f32x4[f32; 4]: cos_f32); impl_unary!(f32x8[f32; 8]: cos_f32); impl_unary!(f32x16[f32; 16]: cos_f32); impl_unary!(f64x2[f64; 2]: cos_f64); impl_unary!(f64x4[f64; 4]: cos_f64); impl_unary!(f64x8[f64; 8]: cos_f64); } else if #[cfg(all(target_arch = "x86_64", feature = "sleef-sys"))] { use sleef_sys::*; cfg_if! { if #[cfg(target_feature = "avx2")] { impl_unary!(f32x2[t => f32x4]: Sleef_cosf4_u10avx2128); impl_unary!(f32x16[h => f32x8]: Sleef_cosf8_u10avx2); impl_unary!(f64x8[h => f64x4]: Sleef_cosd4_u10avx2); impl_unary!(f32x4: Sleef_cosf4_u10avx2128); impl_unary!(f32x8: Sleef_cosf8_u10avx2); impl_unary!(f64x2: Sleef_cosd2_u10avx2128); impl_unary!(f64x4: Sleef_cosd4_u10avx2); } else if #[cfg(target_feature = "avx")] { impl_unary!(f32x2[t => f32x4]: Sleef_cosf4_u10sse4); impl_unary!(f32x16[h => f32x8]: Sleef_cosf8_u10avx); impl_unary!(f64x8[h => f64x4]: Sleef_cosd4_u10avx); impl_unary!(f32x4: Sleef_cosf4_u10sse4); impl_unary!(f32x8: Sleef_cosf8_u10avx); impl_unary!(f64x2: Sleef_cosd2_u10sse4); impl_unary!(f64x4: Sleef_cosd4_u10avx); } else if #[cfg(target_feature = "sse4.2")] { impl_unary!(f32x2[t => f32x4]: Sleef_cosf4_u10sse4); impl_unary!(f32x16[q => f32x4]: Sleef_cosf4_u10sse4); impl_unary!(f64x8[q => f64x2]: Sleef_cosd2_u10sse4); impl_unary!(f32x4: Sleef_cosf4_u10sse4); impl_unary!(f32x8[h => f32x4]: Sleef_cosf4_u10sse4); impl_unary!(f64x2: Sleef_cosd2_u10sse4); impl_unary!(f64x4[h => f64x2]: Sleef_cosd2_u10sse4); } else { impl_unary!(f32x2[f32; 2]: cos_f32); impl_unary!(f32x16: cos_v16f32); impl_unary!(f64x8: cos_v8f64); impl_unary!(f32x4: cos_v4f32); impl_unary!(f32x8: cos_v8f32); impl_unary!(f64x2: cos_v2f64); impl_unary!(f64x4: cos_v4f64); } } } else { impl_unary!(f32x2[f32; 2]: cos_f32); impl_unary!(f32x4: cos_v4f32); impl_unary!(f32x8: cos_v8f32); impl_unary!(f32x16: cos_v16f32); impl_unary!(f64x2: cos_v2f64); impl_unary!(f64x4: cos_v4f64); impl_unary!(f64x8: cos_v8f64); } } ================================================ FILE: src/codegen/math/float/cos_pi.rs ================================================ //! Vertical floating-point `cos` #![allow(unused)] // FIXME 64-bit 1 elem vectors cos_pi use crate::*; pub(crate) trait CosPi { fn cos_pi(self) -> Self; } gen_unary_impl_table!(CosPi, cos_pi); macro_rules! impl_def { ($vid:ident, $PI:path) => { impl CosPi for $vid { #[inline] fn cos_pi(self) -> Self { (self * Self::splat($PI)).cos() } } }; } macro_rules! 
impl_def32 { ($vid:ident) => { impl_def!($vid, crate::f32::consts::PI); }; } macro_rules! impl_def64 { ($vid:ident) => { impl_def!($vid, crate::f64::consts::PI); }; } cfg_if! { if #[cfg(all(target_arch = "x86_64", feature = "sleef-sys"))] { use sleef_sys::*; cfg_if! { if #[cfg(target_feature = "avx2")] { impl_unary!(f32x2[t => f32x4]: Sleef_cospif4_u05avx2128); impl_unary!(f32x16[h => f32x8]: Sleef_cospif8_u05avx2); impl_unary!(f64x8[h => f64x4]: Sleef_cospid4_u05avx2); impl_unary!(f32x4: Sleef_cospif4_u05avx2128); impl_unary!(f32x8: Sleef_cospif8_u05avx2); impl_unary!(f64x2: Sleef_cospid2_u05avx2128); impl_unary!(f64x4: Sleef_cospid4_u05avx2); } else if #[cfg(target_feature = "avx")] { impl_unary!(f32x2[t => f32x4]: Sleef_cospif4_u05sse4); impl_unary!(f32x16[h => f32x8]: Sleef_cospif8_u05avx); impl_unary!(f64x8[h => f64x4]: Sleef_cospid4_u05avx); impl_unary!(f32x4: Sleef_cospif4_u05sse4); impl_unary!(f32x8: Sleef_cospif8_u05avx); impl_unary!(f64x2: Sleef_cospid2_u05sse4); impl_unary!(f64x4: Sleef_cospid4_u05avx); } else if #[cfg(target_feature = "sse4.2")] { impl_unary!(f32x2[t => f32x4]: Sleef_cospif4_u05sse4); impl_unary!(f32x16[q => f32x4]: Sleef_cospif4_u05sse4); impl_unary!(f64x8[q => f64x2]: Sleef_cospid2_u05sse4); impl_unary!(f32x4: Sleef_cospif4_u05sse4); impl_unary!(f32x8[h => f32x4]: Sleef_cospif4_u05sse4); impl_unary!(f64x2: Sleef_cospid2_u05sse4); impl_unary!(f64x4[h => f64x2]: Sleef_cospid2_u05sse4); } else { impl_def32!(f32x2); impl_def32!(f32x4); impl_def32!(f32x8); impl_def32!(f32x16); impl_def64!(f64x2); impl_def64!(f64x4); impl_def64!(f64x8); } } } else { impl_def32!(f32x2); impl_def32!(f32x4); impl_def32!(f32x8); impl_def32!(f32x16); impl_def64!(f64x2); impl_def64!(f64x4); impl_def64!(f64x8); } } ================================================ FILE: src/codegen/math/float/exp.rs ================================================ //! Vertical floating-point `exp` #![allow(unused)] // FIXME 64-bit expgle elem vectors misexpg use crate::*; pub(crate) trait Exp { fn exp(self) -> Self; } #[allow(improper_ctypes)] extern "C" { #[link_name = "llvm.exp.v2f32"] fn exp_v2f32(x: f32x2) -> f32x2; #[link_name = "llvm.exp.v4f32"] fn exp_v4f32(x: f32x4) -> f32x4; #[link_name = "llvm.exp.v8f32"] fn exp_v8f32(x: f32x8) -> f32x8; #[link_name = "llvm.exp.v16f32"] fn exp_v16f32(x: f32x16) -> f32x16; /* FIXME 64-bit expgle elem vectors #[link_name = "llvm.exp.v1f64"] fn exp_v1f64(x: f64x1) -> f64x1; */ #[link_name = "llvm.exp.v2f64"] fn exp_v2f64(x: f64x2) -> f64x2; #[link_name = "llvm.exp.v4f64"] fn exp_v4f64(x: f64x4) -> f64x4; #[link_name = "llvm.exp.v8f64"] fn exp_v8f64(x: f64x8) -> f64x8; #[link_name = "llvm.exp.f32"] fn exp_f32(x: f32) -> f32; #[link_name = "llvm.exp.f64"] fn exp_f64(x: f64) -> f64; } gen_unary_impl_table!(Exp, exp); cfg_if! { if #[cfg(target_arch = "s390x")] { // FIXME: https://github.com/rust-lang-nursery/packed_simd/issues/14 impl_unary!(f32x2[f32; 2]: exp_f32); impl_unary!(f32x4[f32; 4]: exp_f32); impl_unary!(f32x8[f32; 8]: exp_f32); impl_unary!(f32x16[f32; 16]: exp_f32); impl_unary!(f64x2[f64; 2]: exp_f64); impl_unary!(f64x4[f64; 4]: exp_f64); impl_unary!(f64x8[f64; 8]: exp_f64); } else if #[cfg(all(target_arch = "x86_64", feature = "sleef-sys"))] { use sleef_sys::*; cfg_if! 
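// Editor's note (illustrative addition, not part of the original file): as in
// the other vertical math files, `exp` is dispatched in two layers: the outer
// `cfg_if!` chooses a per-lane scalar fallback on s390x (issue #14), SLEEF
// bindings on x86_64 when the `sleef-sys` feature is enabled, or the generic
// `llvm.exp.*` intrinsics; the inner `cfg_if!` below then picks the SLEEF
// symbol set for the available target feature (avx2/avx/sse4.2/sse2).
// A rough usage sketch, assuming the public `exp` method forwards here:
//
//     use packed_simd::f32x4;
//     let y = f32x4::new(0.0, 1.0, 2.0, 3.0).exp(); // lane-wise e^x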
{ if #[cfg(target_feature = "avx2")] { impl_unary!(f32x2[t => f32x4]: Sleef_expf4_u10avx2128); impl_unary!(f32x16[h => f32x8]: Sleef_expf8_u10avx2); impl_unary!(f64x8[h => f64x4]: Sleef_expd4_u10avx2); impl_unary!(f32x4: Sleef_expf4_u10avx2128); impl_unary!(f32x8: Sleef_expf8_u10avx2); impl_unary!(f64x2: Sleef_expd2_u10avx2128); impl_unary!(f64x4: Sleef_expd4_u10avx2); } else if #[cfg(target_feature = "avx")] { impl_unary!(f32x2[t => f32x4]: Sleef_expf4_u10sse4); impl_unary!(f32x16[h => f32x8]: Sleef_expf8_u10avx); impl_unary!(f64x8[h => f64x4]: Sleef_expd4_u10avx); impl_unary!(f32x4: Sleef_expf4_u10sse4); impl_unary!(f32x8: Sleef_expf8_u10avx); impl_unary!(f64x2: Sleef_expd2_u10sse4); impl_unary!(f64x4: Sleef_expd4_u10avx); } else if #[cfg(target_feature = "sse4.2")] { impl_unary!(f32x2[t => f32x4]: Sleef_expf4_u10sse4); impl_unary!(f32x16[q => f32x4]: Sleef_expf4_u10sse4); impl_unary!(f64x8[q => f64x2]: Sleef_expd2_u10sse4); impl_unary!(f32x4: Sleef_expf4_u10sse4); impl_unary!(f32x8[h => f32x4]: Sleef_expf4_u10sse4); impl_unary!(f64x2: Sleef_expd2_u10sse4); impl_unary!(f64x4[h => f64x2]: Sleef_expd2_u10sse4); } else if #[cfg(target_feature = "sse2")] { impl_unary!(f32x2[t => f32x4]: Sleef_expf4_u10sse2); impl_unary!(f32x16[q => f32x4]: Sleef_expf4_u10sse2); impl_unary!(f64x8[q => f64x2]: Sleef_expd2_u10sse2); impl_unary!(f32x4: Sleef_expf4_u10sse2); impl_unary!(f32x8[h => f32x4]: Sleef_expf4_u10sse2); impl_unary!(f64x2: Sleef_expd2_u10sse2); impl_unary!(f64x4[h => f64x2]: Sleef_expd2_u10sse2); } else { impl_unary!(f32x2[f32; 2]: exp_f32); impl_unary!(f32x16: exp_v16f32); impl_unary!(f64x8: exp_v8f64); impl_unary!(f32x4: exp_v4f32); impl_unary!(f32x8: exp_v8f32); impl_unary!(f64x2: exp_v2f64); impl_unary!(f64x4: exp_v4f64); } } } else { impl_unary!(f32x2[f32; 2]: exp_f32); impl_unary!(f32x4: exp_v4f32); impl_unary!(f32x8: exp_v8f32); impl_unary!(f32x16: exp_v16f32); impl_unary!(f64x2: exp_v2f64); impl_unary!(f64x4: exp_v4f64); impl_unary!(f64x8: exp_v8f64); } } ================================================ FILE: src/codegen/math/float/ln.rs ================================================ //! Vertical floating-point `ln` #![allow(unused)] // FIXME 64-bit lngle elem vectors mislng use crate::*; pub(crate) trait Ln { fn ln(self) -> Self; } #[allow(improper_ctypes)] extern "C" { #[link_name = "llvm.log.v2f32"] fn ln_v2f32(x: f32x2) -> f32x2; #[link_name = "llvm.log.v4f32"] fn ln_v4f32(x: f32x4) -> f32x4; #[link_name = "llvm.log.v8f32"] fn ln_v8f32(x: f32x8) -> f32x8; #[link_name = "llvm.log.v16f32"] fn ln_v16f32(x: f32x16) -> f32x16; /* FIXME 64-bit lngle elem vectors #[link_name = "llvm.log.v1f64"] fn ln_v1f64(x: f64x1) -> f64x1; */ #[link_name = "llvm.log.v2f64"] fn ln_v2f64(x: f64x2) -> f64x2; #[link_name = "llvm.log.v4f64"] fn ln_v4f64(x: f64x4) -> f64x4; #[link_name = "llvm.log.v8f64"] fn ln_v8f64(x: f64x8) -> f64x8; #[link_name = "llvm.log.f32"] fn ln_f32(x: f32) -> f32; #[link_name = "llvm.log.f64"] fn ln_f64(x: f64) -> f64; } gen_unary_impl_table!(Ln, ln); cfg_if! { if #[cfg(target_arch = "s390x")] { // FIXME: https://github.com/rust-lang-nursery/packed_simd/issues/14 impl_unary!(f32x2[f32; 2]: ln_f32); impl_unary!(f32x4[f32; 4]: ln_f32); impl_unary!(f32x8[f32; 8]: ln_f32); impl_unary!(f32x16[f32; 16]: ln_f32); impl_unary!(f64x2[f64; 2]: ln_f64); impl_unary!(f64x4[f64; 4]: ln_f64); impl_unary!(f64x8[f64; 8]: ln_f64); } else if #[cfg(all(target_arch = "x86_64", feature = "sleef-sys"))] { use sleef_sys::*; cfg_if! 
{ if #[cfg(target_feature = "avx2")] { impl_unary!(f32x2[t => f32x4]: Sleef_logf4_u10avx2128); impl_unary!(f32x16[h => f32x8]: Sleef_logf8_u10avx2); impl_unary!(f64x8[h => f64x4]: Sleef_logd4_u10avx2); impl_unary!(f32x4: Sleef_logf4_u10avx2128); impl_unary!(f32x8: Sleef_logf8_u10avx2); impl_unary!(f64x2: Sleef_logd2_u10avx2128); impl_unary!(f64x4: Sleef_logd4_u10avx2); } else if #[cfg(target_feature = "avx")] { impl_unary!(f32x2[t => f32x4]: Sleef_logf4_u10sse4); impl_unary!(f32x16[h => f32x8]: Sleef_logf8_u10avx); impl_unary!(f64x8[h => f64x4]: Sleef_logd4_u10avx); impl_unary!(f32x4: Sleef_logf4_u10sse4); impl_unary!(f32x8: Sleef_logf8_u10avx); impl_unary!(f64x2: Sleef_logd2_u10sse4); impl_unary!(f64x4: Sleef_logd4_u10avx); } else if #[cfg(target_feature = "sse4.2")] { impl_unary!(f32x2[t => f32x4]: Sleef_logf4_u10sse4); impl_unary!(f32x16[q => f32x4]: Sleef_logf4_u10sse4); impl_unary!(f64x8[q => f64x2]: Sleef_logd2_u10sse4); impl_unary!(f32x4: Sleef_logf4_u10sse4); impl_unary!(f32x8[h => f32x4]: Sleef_logf4_u10sse4); impl_unary!(f64x2: Sleef_logd2_u10sse4); impl_unary!(f64x4[h => f64x2]: Sleef_logd2_u10sse4); } else if #[cfg(target_feature = "sse2")] { impl_unary!(f32x2[t => f32x4]: Sleef_logf4_u10sse2); impl_unary!(f32x16[q => f32x4]: Sleef_logf4_u10sse2); impl_unary!(f64x8[q => f64x2]: Sleef_logd2_u10sse2); impl_unary!(f32x4: Sleef_logf4_u10sse2); impl_unary!(f32x8[h => f32x4]: Sleef_logf4_u10sse2); impl_unary!(f64x2: Sleef_logd2_u10sse2); impl_unary!(f64x4[h => f64x2]: Sleef_logd2_u10sse2); } else { impl_unary!(f32x2[f32; 2]: ln_f32); impl_unary!(f32x16: ln_v16f32); impl_unary!(f64x8: ln_v8f64); impl_unary!(f32x4: ln_v4f32); impl_unary!(f32x8: ln_v8f32); impl_unary!(f64x2: ln_v2f64); impl_unary!(f64x4: ln_v4f64); } } } else { impl_unary!(f32x2[f32; 2]: ln_f32); impl_unary!(f32x4: ln_v4f32); impl_unary!(f32x8: ln_v8f32); impl_unary!(f32x16: ln_v16f32); impl_unary!(f64x2: ln_v2f64); impl_unary!(f64x4: ln_v4f64); impl_unary!(f64x8: ln_v8f64); } } ================================================ FILE: src/codegen/math/float/macros.rs ================================================ //! Utility macros #![allow(unused)] macro_rules! impl_unary_ { // implementation mapping 1:1 (vec | $trait_id:ident, $trait_method:ident, $vec_id:ident, $fun:ident) => { impl $trait_id for $vec_id { #[inline] fn $trait_method(self) -> Self { unsafe { use crate::mem::transmute; transmute($fun(transmute(self))) } } } }; // implementation mapping 1:1 for when `$fun` is a generic function // like some of the fp math rustc intrinsics (e.g. `fn fun(x: T) -> T`). 
(gen | $trait_id:ident, $trait_method:ident, $vec_id:ident, $fun:ident) => { impl $trait_id for $vec_id { #[inline] fn $trait_method(self) -> Self { unsafe { use crate::mem::transmute; transmute($fun(self.0)) } } } }; (scalar | $trait_id:ident, $trait_method:ident, $vec_id:ident, [$sid:ident; $scount:expr], $fun:ident) => { impl $trait_id for $vec_id { #[inline] fn $trait_method(self) -> Self { unsafe { union U { vec: $vec_id, scalars: [$sid; $scount], } let mut scalars = U { vec: self }.scalars; for i in &mut scalars { *i = $fun(*i); } U { scalars }.vec } } } }; // implementation calling fun twice on each of the vector halves: (halves | $trait_id:ident, $trait_method:ident, $vec_id:ident, $vech_id:ident, $fun:ident) => { impl $trait_id for $vec_id { #[inline] fn $trait_method(self) -> Self { unsafe { use crate::mem::transmute; union U { vec: $vec_id, halves: [$vech_id; 2], } let mut halves = U { vec: self }.halves; *halves.get_unchecked_mut(0) = transmute($fun(transmute(*halves.get_unchecked(0)))); *halves.get_unchecked_mut(1) = transmute($fun(transmute(*halves.get_unchecked(1)))); U { halves }.vec } } } }; // implementation calling fun four times on each of the vector quarters: (quarter | $trait_id:ident, $trait_method:ident, $vec_id:ident, $vecq_id:ident, $fun:ident) => { impl $trait_id for $vec_id { #[inline] fn $trait_method(self) -> Self { unsafe { use crate::mem::transmute; union U { vec: $vec_id, quarters: [$vecq_id; 4], } let mut quarters = U { vec: self }.quarters; *quarters.get_unchecked_mut(0) = transmute($fun(transmute(*quarters.get_unchecked(0)))); *quarters.get_unchecked_mut(1) = transmute($fun(transmute(*quarters.get_unchecked(1)))); *quarters.get_unchecked_mut(2) = transmute($fun(transmute(*quarters.get_unchecked(2)))); *quarters.get_unchecked_mut(3) = transmute($fun(transmute(*quarters.get_unchecked(3)))); U { quarters }.vec } } } }; // implementation calling fun once on a vector twice as large: (twice | $trait_id:ident, $trait_method:ident, $vec_id:ident, $vect_id:ident, $fun:ident) => { impl $trait_id for $vec_id { #[inline] fn $trait_method(self) -> Self { unsafe { use crate::mem::{transmute, uninitialized}; union U { vec: [$vec_id; 2], twice: $vect_id, } let twice = U { vec: [self, uninitialized()] }.twice; let twice = transmute($fun(transmute(twice))); *(U { twice }.vec.get_unchecked(0)) } } } }; } macro_rules! gen_unary_impl_table { ($trait_id:ident, $trait_method:ident) => { macro_rules! impl_unary { ($vid:ident: $fun:ident) => { impl_unary_!(vec | $trait_id, $trait_method, $vid, $fun); }; ($vid:ident[g]: $fun:ident) => { impl_unary_!(gen | $trait_id, $trait_method, $vid, $fun); }; ($vid:ident[$sid:ident; $sc:expr]: $fun:ident) => { impl_unary_!(scalar | $trait_id, $trait_method, $vid, [$sid; $sc], $fun); }; ($vid:ident[s]: $fun:ident) => { impl_unary_!(scalar | $trait_id, $trait_method, $vid, $fun); }; ($vid:ident[h => $vid_h:ident]: $fun:ident) => { impl_unary_!(halves | $trait_id, $trait_method, $vid, $vid_h, $fun); }; ($vid:ident[q => $vid_q:ident]: $fun:ident) => { impl_unary_!(quarter | $trait_id, $trait_method, $vid, $vid_q, $fun); }; ($vid:ident[t => $vid_t:ident]: $fun:ident) => { impl_unary_!(twice | $trait_id, $trait_method, $vid, $vid_t, $fun); }; } }; } macro_rules! 
impl_tertiary_ {
    // implementation mapping 1:1
    (vec | $trait_id:ident, $trait_method:ident, $vec_id:ident, $fun:ident) => {
        impl $trait_id for $vec_id {
            #[inline]
            fn $trait_method(self, y: Self, z: Self) -> Self {
                unsafe {
                    use crate::mem::transmute;
                    transmute($fun(transmute(self), transmute(y), transmute(z)))
                }
            }
        }
    };
    (scalar | $trait_id:ident, $trait_method:ident, $vec_id:ident, [$sid:ident; $scount:expr], $fun:ident) => {
        impl $trait_id for $vec_id {
            #[inline]
            fn $trait_method(self, y: Self, z: Self) -> Self {
                unsafe {
                    union U {
                        vec: $vec_id,
                        scalars: [$sid; $scount],
                    }
                    let mut x = U { vec: self }.scalars;
                    let y = U { vec: y }.scalars;
                    let z = U { vec: z }.scalars;
                    for (x, (y, z)) in x.iter_mut().zip(y.iter().zip(z.iter())) {
                        *x = $fun(*x, *y, *z);
                    }
                    U { scalars: x }.vec
                }
            }
        }
    };
    // implementation calling fun twice on each of the vector halves:
    (halves | $trait_id:ident, $trait_method:ident, $vec_id:ident, $vech_id:ident, $fun:ident) => {
        impl $trait_id for $vec_id {
            #[inline]
            fn $trait_method(self, y: Self, z: Self) -> Self {
                unsafe {
                    use crate::mem::transmute;
                    union U {
                        vec: $vec_id,
                        halves: [$vech_id; 2],
                    }
                    let mut x_halves = U { vec: self }.halves;
                    let y_halves = U { vec: y }.halves;
                    let z_halves = U { vec: z }.halves;
                    *x_halves.get_unchecked_mut(0) = transmute($fun(
                        transmute(*x_halves.get_unchecked(0)),
                        transmute(*y_halves.get_unchecked(0)),
                        transmute(*z_halves.get_unchecked(0)),
                    ));
                    *x_halves.get_unchecked_mut(1) = transmute($fun(
                        transmute(*x_halves.get_unchecked(1)),
                        transmute(*y_halves.get_unchecked(1)),
                        transmute(*z_halves.get_unchecked(1)),
                    ));
                    U { halves: x_halves }.vec
                }
            }
        }
    };
    // implementation calling fun four times on each of the vector quarters:
    (quarter | $trait_id:ident, $trait_method:ident, $vec_id:ident, $vecq_id:ident, $fun:ident) => {
        impl $trait_id for $vec_id {
            #[inline]
            fn $trait_method(self, y: Self, z: Self) -> Self {
                unsafe {
                    use crate::mem::transmute;
                    union U {
                        vec: $vec_id,
                        quarters: [$vecq_id; 4],
                    }
                    let mut x_quarters = U { vec: self }.quarters;
                    let y_quarters = U { vec: y }.quarters;
                    let z_quarters = U { vec: z }.quarters;
                    *x_quarters.get_unchecked_mut(0) = transmute($fun(
                        transmute(*x_quarters.get_unchecked(0)),
                        transmute(*y_quarters.get_unchecked(0)),
                        transmute(*z_quarters.get_unchecked(0)),
                    ));
                    *x_quarters.get_unchecked_mut(1) = transmute($fun(
                        transmute(*x_quarters.get_unchecked(1)),
                        transmute(*y_quarters.get_unchecked(1)),
                        transmute(*z_quarters.get_unchecked(1)),
                    ));
                    *x_quarters.get_unchecked_mut(2) = transmute($fun(
                        transmute(*x_quarters.get_unchecked(2)),
                        transmute(*y_quarters.get_unchecked(2)),
                        transmute(*z_quarters.get_unchecked(2)),
                    ));
                    *x_quarters.get_unchecked_mut(3) = transmute($fun(
                        transmute(*x_quarters.get_unchecked(3)),
                        transmute(*y_quarters.get_unchecked(3)),
                        transmute(*z_quarters.get_unchecked(3)),
                    ));
                    U { quarters: x_quarters }.vec
                }
            }
        }
    };
    // implementation calling fun once on a vector twice as large:
    (twice | $trait_id:ident, $trait_method:ident, $vec_id:ident, $vect_id:ident, $fun:ident) => {
        impl $trait_id for $vec_id {
            #[inline]
            fn $trait_method(self, y: Self, z: Self) -> Self {
                unsafe {
                    use crate::mem::{transmute, uninitialized};
                    union U {
                        vec: [$vec_id; 2],
                        twice: $vect_id,
                    }
                    let x_twice = U { vec: [self, uninitialized()] }.twice;
                    let y_twice = U { vec: [y, uninitialized()] }.twice;
                    let z_twice = U { vec: [z, uninitialized()] }.twice;
                    let twice: $vect_id = transmute($fun(
                        transmute(x_twice),
                        transmute(y_twice),
                        transmute(z_twice),
                    ));
                    *(U { twice }.vec.get_unchecked(0))
                }
            }
        }
    };
}

macro_rules!
gen_tertiary_impl_table { ($trait_id:ident, $trait_method:ident) => { macro_rules! impl_tertiary { ($vid:ident: $fun:ident) => { impl_tertiary_!(vec | $trait_id, $trait_method, $vid, $fun); }; ($vid:ident[$sid:ident; $sc:expr]: $fun:ident) => { impl_tertiary_!(scalar | $trait_id, $trait_method, $vid, [$sid; $sc], $fun); }; ($vid:ident[s]: $fun:ident) => { impl_tertiary_!(scalar | $trait_id, $trait_method, $vid, $fun); }; ($vid:ident[h => $vid_h:ident]: $fun:ident) => { impl_tertiary_!(halves | $trait_id, $trait_method, $vid, $vid_h, $fun); }; ($vid:ident[q => $vid_q:ident]: $fun:ident) => { impl_tertiary_!(quarter | $trait_id, $trait_method, $vid, $vid_q, $fun); }; ($vid:ident[t => $vid_t:ident]: $fun:ident) => { impl_tertiary_!(twice | $trait_id, $trait_method, $vid, $vid_t, $fun); }; } }; } macro_rules! impl_binary_ { // implementation mapping 1:1 (vec | $trait_id:ident, $trait_method:ident, $vec_id:ident, $fun:ident) => { impl $trait_id for $vec_id { #[inline] fn $trait_method(self, y: Self) -> Self { unsafe { use crate::mem::transmute; transmute($fun(transmute(self), transmute(y))) } } } }; (scalar | $trait_id:ident, $trait_method:ident, $vec_id:ident, [$sid:ident; $scount:expr], $fun:ident) => { impl $trait_id for $vec_id { #[inline] fn $trait_method(self, y: Self) -> Self { unsafe { union U { vec: $vec_id, scalars: [$sid; $scount], } let mut x = U { vec: self }.scalars; let y = U { vec: y }.scalars; for (x, y) in x.iter_mut().zip(&y) { *x = $fun(*x, *y); } U { scalars: x }.vec } } } }; // implementation calling fun twice on each of the vector halves: (halves | $trait_id:ident, $trait_method:ident, $vec_id:ident, $vech_id:ident, $fun:ident) => { impl $trait_id for $vec_id { #[inline] fn $trait_method(self, y: Self) -> Self { unsafe { use crate::mem::transmute; union U { vec: $vec_id, halves: [$vech_id; 2], } let mut x_halves = U { vec: self }.halves; let y_halves = U { vec: y }.halves; *x_halves.get_unchecked_mut(0) = transmute($fun( transmute(*x_halves.get_unchecked(0)), transmute(*y_halves.get_unchecked(0)), )); *x_halves.get_unchecked_mut(1) = transmute($fun( transmute(*x_halves.get_unchecked(1)), transmute(*y_halves.get_unchecked(1)), )); U { halves: x_halves }.vec } } } }; // implementation calling fun four times on each of the vector quarters: (quarter | $trait_id:ident, $trait_method:ident, $vec_id:ident, $vecq_id:ident, $fun:ident) => { impl $trait_id for $vec_id { #[inline] fn $trait_method(self, y: Self) -> Self { unsafe { use crate::mem::transmute; union U { vec: $vec_id, quarters: [$vecq_id; 4], } let mut x_quarters = U { vec: self }.quarters; let y_quarters = U { vec: y }.quarters; *x_quarters.get_unchecked_mut(0) = transmute($fun( transmute(*x_quarters.get_unchecked(0)), transmute(*y_quarters.get_unchecked(0)), )); *x_quarters.get_unchecked_mut(1) = transmute($fun( transmute(*x_quarters.get_unchecked(1)), transmute(*y_quarters.get_unchecked(1)), )); *x_quarters.get_unchecked_mut(2) = transmute($fun( transmute(*x_quarters.get_unchecked(2)), transmute(*y_quarters.get_unchecked(2)), )); *x_quarters.get_unchecked_mut(3) = transmute($fun( transmute(*x_quarters.get_unchecked(3)), transmute(*y_quarters.get_unchecked(3)), )); U { quarters: x_quarters }.vec } } } }; // implementation calling fun once on a vector twice as large: (twice | $trait_id:ident, $trait_method:ident, $vec_id:ident, $vect_id:ident, $fun:ident) => { impl $trait_id for $vec_id { #[inline] fn $trait_method(self, y: Self) -> Self { unsafe { use crate::mem::{transmute, uninitialized}; union U { vec: [$vec_id; 
2], twice: $vect_id, } let x_twice = U { vec: [self, uninitialized()] }.twice; let y_twice = U { vec: [y, uninitialized()] }.twice; let twice: $vect_id = transmute($fun(transmute(x_twice), transmute(y_twice))); *(U { twice }.vec.get_unchecked(0)) } } } }; } macro_rules! gen_binary_impl_table { ($trait_id:ident, $trait_method:ident) => { macro_rules! impl_binary { ($vid:ident: $fun:ident) => { impl_binary_!(vec | $trait_id, $trait_method, $vid, $fun); }; ($vid:ident[$sid:ident; $sc:expr]: $fun:ident) => { impl_binary_!(scalar | $trait_id, $trait_method, $vid, [$sid; $sc], $fun); }; ($vid:ident[s]: $fun:ident) => { impl_binary_!(scalar | $trait_id, $trait_method, $vid, $fun); }; ($vid:ident[h => $vid_h:ident]: $fun:ident) => { impl_binary_!(halves | $trait_id, $trait_method, $vid, $vid_h, $fun); }; ($vid:ident[q => $vid_q:ident]: $fun:ident) => { impl_binary_!(quarter | $trait_id, $trait_method, $vid, $vid_q, $fun); }; ($vid:ident[t => $vid_t:ident]: $fun:ident) => { impl_binary_!(twice | $trait_id, $trait_method, $vid, $vid_t, $fun); }; } }; } ================================================ FILE: src/codegen/math/float/mul_add.rs ================================================ //! Vertical floating-point `mul_add` #![allow(unused)] use crate::*; // FIXME: 64-bit 1 element mul_add pub(crate) trait MulAdd { fn mul_add(self, y: Self, z: Self) -> Self; } #[cfg(not(target_arch = "s390x"))] #[allow(improper_ctypes)] extern "C" { #[link_name = "llvm.fma.v2f32"] fn fma_v2f32(x: f32x2, y: f32x2, z: f32x2) -> f32x2; #[link_name = "llvm.fma.v4f32"] fn fma_v4f32(x: f32x4, y: f32x4, z: f32x4) -> f32x4; #[link_name = "llvm.fma.v8f32"] fn fma_v8f32(x: f32x8, y: f32x8, z: f32x8) -> f32x8; #[link_name = "llvm.fma.v16f32"] fn fma_v16f32(x: f32x16, y: f32x16, z: f32x16) -> f32x16; /* FIXME 64-bit single elem vectors #[link_name = "llvm.fma.v1f64"] fn fma_v1f64(x: f64x1, y: f64x1, z: f64x1) -> f64x1; */ #[link_name = "llvm.fma.v2f64"] fn fma_v2f64(x: f64x2, y: f64x2, z: f64x2) -> f64x2; #[link_name = "llvm.fma.v4f64"] fn fma_v4f64(x: f64x4, y: f64x4, z: f64x4) -> f64x4; #[link_name = "llvm.fma.v8f64"] fn fma_v8f64(x: f64x8, y: f64x8, z: f64x8) -> f64x8; } gen_tertiary_impl_table!(MulAdd, mul_add); cfg_if! { if #[cfg(target_arch = "s390x")] { // FIXME: https://github.com/rust-lang-nursery/packed_simd/issues/14 macro_rules! impl_broken { ($id:ident) => { impl MulAdd for $id { #[inline] fn mul_add(self, y: Self, z: Self) -> Self { self * y + z } } }; } impl_broken!(f32x2); impl_broken!(f32x4); impl_broken!(f32x8); impl_broken!(f32x16); impl_broken!(f64x2); impl_broken!(f64x4); impl_broken!(f64x8); } else if #[cfg(all(target_arch = "x86_64", feature = "sleef-sys"))] { use sleef_sys::*; cfg_if! 
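// Editor's note (illustrative addition, not part of the original file): the
// bracket suffix in the `impl_unary!`/`impl_binary!`/`impl_tertiary!` tables
// selects one of the strategies defined in macros.rs above:
//
//     impl_tertiary!(f32x4: Sleef_fmaf4_avx2128);             // 1:1 call
//     impl_tertiary!(f32x16[h => f32x8]: Sleef_fmaf8_avx2);   // two halves
//     impl_tertiary!(f32x16[q => f32x4]: Sleef_fmaf4_sse4);   // four quarters
//     impl_tertiary!(f32x2[t => f32x4]: Sleef_fmaf4_avx2128); // pad to a vector
//                                                             // twice as wide
//
// The `t =>` case pads the upper lanes with uninitialized data, calls the
// wider function once, and keeps only the lower half of the result.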
{ if #[cfg(target_feature = "avx2")] { impl_tertiary!(f32x2[t => f32x4]: Sleef_fmaf4_avx2128); impl_tertiary!(f32x16[h => f32x8]: Sleef_fmaf8_avx2); impl_tertiary!(f64x8[h => f64x4]: Sleef_fmad4_avx2); impl_tertiary!(f32x4: Sleef_fmaf4_avx2128); impl_tertiary!(f32x8: Sleef_fmaf8_avx2); impl_tertiary!(f64x2: Sleef_fmad2_avx2128); impl_tertiary!(f64x4: Sleef_fmad4_avx2); } else if #[cfg(target_feature = "avx")] { impl_tertiary!(f32x2[t => f32x4]: Sleef_fmaf4_sse4); impl_tertiary!(f32x16[h => f32x8]: Sleef_fmaf8_avx); impl_tertiary!(f64x8[h => f64x4]: Sleef_fmad4_avx); impl_tertiary!(f32x4: Sleef_fmaf4_sse4); impl_tertiary!(f32x8: Sleef_fmaf8_avx); impl_tertiary!(f64x2: Sleef_fmad2_sse4); impl_tertiary!(f64x4: Sleef_fmad4_avx); } else if #[cfg(target_feature = "sse4.2")] { impl_tertiary!(f32x2[t => f32x4]: Sleef_fmaf4_sse4); impl_tertiary!(f32x16[q => f32x4]: Sleef_fmaf4_sse4); impl_tertiary!(f64x8[q => f64x2]: Sleef_fmad2_sse4); impl_tertiary!(f32x4: Sleef_fmaf4_sse4); impl_tertiary!(f32x8[h => f32x4]: Sleef_fmaf4_sse4); impl_tertiary!(f64x2: Sleef_fmad2_sse4); impl_tertiary!(f64x4[h => f64x2]: Sleef_fmad2_sse4); } else { impl_tertiary!(f32x2: fma_v2f32); impl_tertiary!(f32x16: fma_v16f32); impl_tertiary!(f64x8: fma_v8f64); impl_tertiary!(f32x4: fma_v4f32); impl_tertiary!(f32x8: fma_v8f32); impl_tertiary!(f64x2: fma_v2f64); impl_tertiary!(f64x4: fma_v4f64); } } } else { impl_tertiary!(f32x2: fma_v2f32); impl_tertiary!(f32x4: fma_v4f32); impl_tertiary!(f32x8: fma_v8f32); impl_tertiary!(f32x16: fma_v16f32); // impl_tertiary!(f64x1: fma_v1f64); // FIXME 64-bit fmagle elem vectors impl_tertiary!(f64x2: fma_v2f64); impl_tertiary!(f64x4: fma_v4f64); impl_tertiary!(f64x8: fma_v8f64); } } ================================================ FILE: src/codegen/math/float/mul_adde.rs ================================================ //! Approximation for floating-point `mul_add` use crate::*; // FIXME: 64-bit 1 element mul_adde pub(crate) trait MulAddE { fn mul_adde(self, y: Self, z: Self) -> Self; } #[cfg(not(target_arch = "s390x"))] #[allow(improper_ctypes)] extern "C" { #[link_name = "llvm.fmuladd.v2f32"] fn fmuladd_v2f32(x: f32x2, y: f32x2, z: f32x2) -> f32x2; #[link_name = "llvm.fmuladd.v4f32"] fn fmuladd_v4f32(x: f32x4, y: f32x4, z: f32x4) -> f32x4; #[link_name = "llvm.fmuladd.v8f32"] fn fmuladd_v8f32(x: f32x8, y: f32x8, z: f32x8) -> f32x8; #[link_name = "llvm.fmuladd.v16f32"] fn fmuladd_v16f32(x: f32x16, y: f32x16, z: f32x16) -> f32x16; /* FIXME 64-bit single elem vectors #[link_name = "llvm.fmuladd.v1f64"] fn fmuladd_v1f64(x: f64x1, y: f64x1, z: f64x1) -> f64x1; */ #[link_name = "llvm.fmuladd.v2f64"] fn fmuladd_v2f64(x: f64x2, y: f64x2, z: f64x2) -> f64x2; #[link_name = "llvm.fmuladd.v4f64"] fn fmuladd_v4f64(x: f64x4, y: f64x4, z: f64x4) -> f64x4; #[link_name = "llvm.fmuladd.v8f64"] fn fmuladd_v8f64(x: f64x8, y: f64x8, z: f64x8) -> f64x8; } macro_rules! 
impl_mul_adde { ($id:ident : $fn:ident) => { impl MulAddE for $id { #[inline] fn mul_adde(self, y: Self, z: Self) -> Self { #[cfg(not(target_arch = "s390x"))] { use crate::mem::transmute; unsafe { transmute($fn(transmute(self), transmute(y), transmute(z))) } } #[cfg(target_arch = "s390x")] { // FIXME: https://github.com/rust-lang-nursery/packed_simd/issues/14 self * y + z } } } }; } impl_mul_adde!(f32x2: fmuladd_v2f32); impl_mul_adde!(f32x4: fmuladd_v4f32); impl_mul_adde!(f32x8: fmuladd_v8f32); impl_mul_adde!(f32x16: fmuladd_v16f32); // impl_mul_adde!(f64x1: fma_v1f64); // FIXME 64-bit fmagle elem vectors impl_mul_adde!(f64x2: fmuladd_v2f64); impl_mul_adde!(f64x4: fmuladd_v4f64); impl_mul_adde!(f64x8: fmuladd_v8f64); ================================================ FILE: src/codegen/math/float/powf.rs ================================================ //! Vertical floating-point `powf` #![allow(unused)] // FIXME 64-bit powfgle elem vectors mispowfg use crate::*; pub(crate) trait Powf { fn powf(self, x: Self) -> Self; } #[allow(improper_ctypes)] extern "C" { #[link_name = "llvm.pow.v2f32"] fn powf_v2f32(x: f32x2, y: f32x2) -> f32x2; #[link_name = "llvm.pow.v4f32"] fn powf_v4f32(x: f32x4, y: f32x4) -> f32x4; #[link_name = "llvm.pow.v8f32"] fn powf_v8f32(x: f32x8, y: f32x8) -> f32x8; #[link_name = "llvm.pow.v16f32"] fn powf_v16f32(x: f32x16, y: f32x16) -> f32x16; /* FIXME 64-bit powfgle elem vectors #[link_name = "llvm.pow.v1f64"] fn powf_v1f64(x: f64x1, y: f64x1) -> f64x1; */ #[link_name = "llvm.pow.v2f64"] fn powf_v2f64(x: f64x2, y: f64x2) -> f64x2; #[link_name = "llvm.pow.v4f64"] fn powf_v4f64(x: f64x4, y: f64x4) -> f64x4; #[link_name = "llvm.pow.v8f64"] fn powf_v8f64(x: f64x8, y: f64x8) -> f64x8; #[link_name = "llvm.pow.f32"] fn powf_f32(x: f32, y: f32) -> f32; #[link_name = "llvm.pow.f64"] fn powf_f64(x: f64, y: f64) -> f64; } gen_binary_impl_table!(Powf, powf); cfg_if! { if #[cfg(target_arch = "s390x")] { // FIXME: https://github.com/rust-lang-nursery/packed_simd/issues/14 impl_binary!(f32x2[f32; 2]: powf_f32); impl_binary!(f32x4[f32; 4]: powf_f32); impl_binary!(f32x8[f32; 8]: powf_f32); impl_binary!(f32x16[f32; 16]: powf_f32); impl_binary!(f64x2[f64; 2]: powf_f64); impl_binary!(f64x4[f64; 4]: powf_f64); impl_binary!(f64x8[f64; 8]: powf_f64); } else if #[cfg(all(target_arch = "x86_64", feature = "sleef-sys"))] { use sleef_sys::*; cfg_if! 
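// Editor's note (illustrative addition, not part of the original file): the
// two mul-add files above differ only in the LLVM intrinsic they bind.
// `mul_add.rs` uses `llvm.fma.*`, which always produces a fused multiply-add
// with a single rounding, while `mul_adde.rs` uses `llvm.fmuladd.*`, which
// lets the backend choose between fusing and a plain multiply-then-add:
//
//     // semantics, per lane:
//     //   mul_add(a, b, c)  == fma(a, b, c)                   (always fused)
//     //   mul_adde(a, b, c) == fma(a, b, c)  or  a * b + c    (target's choice)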
{ if #[cfg(target_feature = "avx2")] { impl_binary!(f32x2[t => f32x4]: Sleef_powf4_u10avx2128); impl_binary!(f32x16[h => f32x8]: Sleef_powf8_u10avx2); impl_binary!(f64x8[h => f64x4]: Sleef_powd4_u10avx2); impl_binary!(f32x4: Sleef_powf4_u10avx2128); impl_binary!(f32x8: Sleef_powf8_u10avx2); impl_binary!(f64x2: Sleef_powd2_u10avx2128); impl_binary!(f64x4: Sleef_powd4_u10avx2); } else if #[cfg(target_feature = "avx")] { impl_binary!(f32x2[t => f32x4]: Sleef_powf4_u10sse4); impl_binary!(f32x16[h => f32x8]: Sleef_powf8_u10avx); impl_binary!(f64x8[h => f64x4]: Sleef_powd4_u10avx); impl_binary!(f32x4: Sleef_powf4_u10sse4); impl_binary!(f32x8: Sleef_powf8_u10avx); impl_binary!(f64x2: Sleef_powd2_u10sse4); impl_binary!(f64x4: Sleef_powd4_u10avx); } else if #[cfg(target_feature = "sse4.2")] { impl_binary!(f32x2[t => f32x4]: Sleef_powf4_u10sse4); impl_binary!(f32x16[q => f32x4]: Sleef_powf4_u10sse4); impl_binary!(f64x8[q => f64x2]: Sleef_powd2_u10sse4); impl_binary!(f32x4: Sleef_powf4_u10sse4); impl_binary!(f32x8[h => f32x4]: Sleef_powf4_u10sse4); impl_binary!(f64x2: Sleef_powd2_u10sse4); impl_binary!(f64x4[h => f64x2]: Sleef_powd2_u10sse4); } else if #[cfg(target_feature = "sse2")] { impl_binary!(f32x2[t => f32x4]: Sleef_powf4_u10sse2); impl_binary!(f32x16[q => f32x4]: Sleef_powf4_u10sse2); impl_binary!(f64x8[q => f64x2]: Sleef_powd2_u10sse2); impl_binary!(f32x4: Sleef_powf4_u10sse2); impl_binary!(f32x8[h => f32x4]: Sleef_powf4_u10sse2); impl_binary!(f64x2: Sleef_powd2_u10sse2); impl_binary!(f64x4[h => f64x2]: Sleef_powd2_u10sse2); } else { impl_binary!(f32x2[f32; 2]: powf_f32); impl_binary!(f32x4: powf_v4f32); impl_binary!(f32x8: powf_v8f32); impl_binary!(f32x16: powf_v16f32); impl_binary!(f64x2: powf_v2f64); impl_binary!(f64x4: powf_v4f64); impl_binary!(f64x8: powf_v8f64); } } } else { impl_binary!(f32x2[f32; 2]: powf_f32); impl_binary!(f32x4: powf_v4f32); impl_binary!(f32x8: powf_v8f32); impl_binary!(f32x16: powf_v16f32); impl_binary!(f64x2: powf_v2f64); impl_binary!(f64x4: powf_v4f64); impl_binary!(f64x8: powf_v8f64); } } ================================================ FILE: src/codegen/math/float/sin.rs ================================================ //! Vertical floating-point `sin` #![allow(unused)] // FIXME 64-bit 1 elem vectors sin use crate::*; pub(crate) trait Sin { fn sin(self) -> Self; } #[allow(improper_ctypes)] extern "C" { #[link_name = "llvm.sin.v2f32"] fn sin_v2f32(x: f32x2) -> f32x2; #[link_name = "llvm.sin.v4f32"] fn sin_v4f32(x: f32x4) -> f32x4; #[link_name = "llvm.sin.v8f32"] fn sin_v8f32(x: f32x8) -> f32x8; #[link_name = "llvm.sin.v16f32"] fn sin_v16f32(x: f32x16) -> f32x16; /* FIXME 64-bit single elem vectors #[link_name = "llvm.sin.v1f64"] fn sin_v1f64(x: f64x1) -> f64x1; */ #[link_name = "llvm.sin.v2f64"] fn sin_v2f64(x: f64x2) -> f64x2; #[link_name = "llvm.sin.v4f64"] fn sin_v4f64(x: f64x4) -> f64x4; #[link_name = "llvm.sin.v8f64"] fn sin_v8f64(x: f64x8) -> f64x8; #[link_name = "llvm.sin.f32"] fn sin_f32(x: f32) -> f32; #[link_name = "llvm.sin.f64"] fn sin_f64(x: f64) -> f64; } gen_unary_impl_table!(Sin, sin); cfg_if! 
{ if #[cfg(target_arch = "s390x")] { // FIXME: https://github.com/rust-lang-nursery/packed_simd/issues/14 impl_unary!(f32x2[f32; 2]: sin_f32); impl_unary!(f32x4[f32; 4]: sin_f32); impl_unary!(f32x8[f32; 8]: sin_f32); impl_unary!(f32x16[f32; 16]: sin_f32); impl_unary!(f64x2[f64; 2]: sin_f64); impl_unary!(f64x4[f64; 4]: sin_f64); impl_unary!(f64x8[f64; 8]: sin_f64); } else if #[cfg(all(target_arch = "x86_64", feature = "sleef-sys"))] { use sleef_sys::*; cfg_if! { if #[cfg(target_feature = "avx2")] { impl_unary!(f32x2[t => f32x4]: Sleef_sinf4_u10avx2128); impl_unary!(f32x16[h => f32x8]: Sleef_sinf8_u10avx2); impl_unary!(f64x8[h => f64x4]: Sleef_sind4_u10avx2); impl_unary!(f32x4: Sleef_sinf4_u10avx2128); impl_unary!(f32x8: Sleef_sinf8_u10avx2); impl_unary!(f64x2: Sleef_sind2_u10avx2128); impl_unary!(f64x4: Sleef_sind4_u10avx2); } else if #[cfg(target_feature = "avx")] { impl_unary!(f32x2[t => f32x4]: Sleef_sinf4_u10sse4); impl_unary!(f32x16[h => f32x8]: Sleef_sinf8_u10avx); impl_unary!(f64x8[h => f64x4]: Sleef_sind4_u10avx); impl_unary!(f32x4: Sleef_sinf4_u10sse4); impl_unary!(f32x8: Sleef_sinf8_u10avx); impl_unary!(f64x2: Sleef_sind2_u10sse4); impl_unary!(f64x4: Sleef_sind4_u10avx); } else if #[cfg(target_feature = "sse4.2")] { impl_unary!(f32x2[t => f32x4]: Sleef_sinf4_u10sse4); impl_unary!(f32x16[q => f32x4]: Sleef_sinf4_u10sse4); impl_unary!(f64x8[q => f64x2]: Sleef_sind2_u10sse4); impl_unary!(f32x4: Sleef_sinf4_u10sse4); impl_unary!(f32x8[h => f32x4]: Sleef_sinf4_u10sse4); impl_unary!(f64x2: Sleef_sind2_u10sse4); impl_unary!(f64x4[h => f64x2]: Sleef_sind2_u10sse4); } else { impl_unary!(f32x2[f32; 2]: sin_f32); impl_unary!(f32x16: sin_v16f32); impl_unary!(f64x8: sin_v8f64); impl_unary!(f32x4: sin_v4f32); impl_unary!(f32x8: sin_v8f32); impl_unary!(f64x2: sin_v2f64); impl_unary!(f64x4: sin_v4f64); } } } else { impl_unary!(f32x2[f32; 2]: sin_f32); impl_unary!(f32x4: sin_v4f32); impl_unary!(f32x8: sin_v8f32); impl_unary!(f32x16: sin_v16f32); impl_unary!(f64x2: sin_v2f64); impl_unary!(f64x4: sin_v4f64); impl_unary!(f64x8: sin_v8f64); } } ================================================ FILE: src/codegen/math/float/sin_cos_pi.rs ================================================ //! Vertical floating-point `sin_cos` #![allow(unused)] // FIXME 64-bit 1 elem vectors sin_cos use crate::*; pub(crate) trait SinCosPi: Sized { type Output; fn sin_cos_pi(self) -> Self::Output; } macro_rules! impl_def { ($vid:ident, $PI:path) => { impl SinCosPi for $vid { type Output = (Self, Self); #[inline] fn sin_cos_pi(self) -> Self::Output { let v = self * Self::splat($PI); (v.sin(), v.cos()) } } }; } macro_rules! impl_def32 { ($vid:ident) => { impl_def!($vid, crate::f32::consts::PI); }; } macro_rules! impl_def64 { ($vid:ident) => { impl_def!($vid, crate::f64::consts::PI); }; } macro_rules! 
impl_unary_t { ($vid:ident: $fun:ident) => { impl SinCosPi for $vid { type Output = (Self, Self); fn sin_cos_pi(self) -> Self::Output { unsafe { use crate::mem::transmute; transmute($fun(transmute(self))) } } } }; ($vid:ident[t => $vid_t:ident]: $fun:ident) => { impl SinCosPi for $vid { type Output = (Self, Self); fn sin_cos_pi(self) -> Self::Output { unsafe { use crate::mem::{transmute, uninitialized}; union U { vec: [$vid; 2], twice: $vid_t, } let twice = U { vec: [self, uninitialized()] }.twice; let twice = transmute($fun(transmute(twice))); union R { twice: ($vid_t, $vid_t), vecs: ([$vid; 2], [$vid; 2]), } let r = R { twice }.vecs; (*r.0.get_unchecked(0), *r.0.get_unchecked(1)) } } } }; ($vid:ident[h => $vid_h:ident]: $fun:ident) => { impl SinCosPi for $vid { type Output = (Self, Self); fn sin_cos_pi(self) -> Self::Output { unsafe { use crate::mem::transmute; union U { vec: $vid, halves: [$vid_h; 2], } let halves = U { vec: self }.halves; let res_0: ($vid_h, $vid_h) = transmute($fun(transmute(*halves.get_unchecked(0)))); let res_1: ($vid_h, $vid_h) = transmute($fun(transmute(*halves.get_unchecked(1)))); union R { result: ($vid, $vid), halves: ([$vid_h; 2], [$vid_h; 2]), } R { halves: ([res_0.0, res_1.0], [res_0.1, res_1.1]) }.result } } } }; ($vid:ident[q => $vid_q:ident]: $fun:ident) => { impl SinCosPi for $vid { type Output = (Self, Self); fn sin_cos_pi(self) -> Self::Output { unsafe { use crate::mem::transmute; union U { vec: $vid, quarters: [$vid_q; 4], } let quarters = U { vec: self }.quarters; let res_0: ($vid_q, $vid_q) = transmute($fun(transmute(*quarters.get_unchecked(0)))); let res_1: ($vid_q, $vid_q) = transmute($fun(transmute(*quarters.get_unchecked(1)))); let res_2: ($vid_q, $vid_q) = transmute($fun(transmute(*quarters.get_unchecked(2)))); let res_3: ($vid_q, $vid_q) = transmute($fun(transmute(*quarters.get_unchecked(3)))); union R { result: ($vid, $vid), quarters: ([$vid_q; 4], [$vid_q; 4]), } R { quarters: ( [res_0.0, res_1.0, res_2.0, res_3.0], [res_0.1, res_1.1, res_2.1, res_3.1], ), } .result } } } }; } cfg_if! { if #[cfg(all(target_arch = "x86_64", feature = "sleef-sys"))] { use sleef_sys::*; cfg_if! 
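// Editor's note (illustrative addition, not part of the original file): the
// `impl_def!` fallback above computes both results from one scaled input,
// i.e. `sin_cos_pi(x) == (sin(PI * x), cos(PI * x))`. A rough usage sketch,
// assuming the public `sin_cos_pi` method forwards to this trait:
//
//     use packed_simd::f32x4;
//     let (s, c) = f32x4::splat(0.5).sin_cos_pi();
//     // every lane: s ~= 1.0 (sin(pi/2)), c ~= 0.0 (cos(pi/2))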
{ if #[cfg(target_feature = "avx2")] { impl_unary_t!(f32x2[t => f32x4]: Sleef_sincospif4_u05avx2128); impl_unary_t!(f32x16[h => f32x8]: Sleef_sincospif8_u05avx2); impl_unary_t!(f64x8[h => f64x4]: Sleef_sincospid4_u05avx2); impl_unary_t!(f32x4: Sleef_sincospif4_u05avx2128); impl_unary_t!(f32x8: Sleef_sincospif8_u05avx2); impl_unary_t!(f64x2: Sleef_sincospid2_u05avx2128); impl_unary_t!(f64x4: Sleef_sincospid4_u05avx2); } else if #[cfg(target_feature = "avx")] { impl_unary_t!(f32x2[t => f32x4]: Sleef_sincospif4_u05sse4); impl_unary_t!(f32x16[h => f32x8]: Sleef_sincospif8_u05avx); impl_unary_t!(f64x8[h => f64x4]: Sleef_sincospid4_u05avx); impl_unary_t!(f32x4: Sleef_sincospif4_u05sse4); impl_unary_t!(f32x8: Sleef_sincospif8_u05avx); impl_unary_t!(f64x2: Sleef_sincospid2_u05sse4); impl_unary_t!(f64x4: Sleef_sincospid4_u05avx); } else if #[cfg(target_feature = "sse4.2")] { impl_unary_t!(f32x2[t => f32x4]: Sleef_sincospif4_u05sse4); impl_unary_t!(f32x16[q => f32x4]: Sleef_sincospif4_u05sse4); impl_unary_t!(f64x8[q => f64x2]: Sleef_sincospid2_u05sse4); impl_unary_t!(f32x4: Sleef_sincospif4_u05sse4); impl_unary_t!(f32x8[h => f32x4]: Sleef_sincospif4_u05sse4); impl_unary_t!(f64x2: Sleef_sincospid2_u05sse4); impl_unary_t!(f64x4[h => f64x2]: Sleef_sincospid2_u05sse4); } else { impl_def32!(f32x2); impl_def32!(f32x4); impl_def32!(f32x8); impl_def32!(f32x16); impl_def64!(f64x2); impl_def64!(f64x4); impl_def64!(f64x8); } } } else { impl_def32!(f32x2); impl_def32!(f32x4); impl_def32!(f32x8); impl_def32!(f32x16); impl_def64!(f64x2); impl_def64!(f64x4); impl_def64!(f64x8); } } ================================================ FILE: src/codegen/math/float/sin_pi.rs ================================================ //! Vertical floating-point `sin_pi` #![allow(unused)] // FIXME 64-bit 1 elem vectors sin_pi use crate::*; pub(crate) trait SinPi { fn sin_pi(self) -> Self; } gen_unary_impl_table!(SinPi, sin_pi); macro_rules! impl_def { ($vid:ident, $PI:path) => { impl SinPi for $vid { #[inline] fn sin_pi(self) -> Self { (self * Self::splat($PI)).sin() } } }; } macro_rules! impl_def32 { ($vid:ident) => { impl_def!($vid, crate::f32::consts::PI); }; } macro_rules! impl_def64 { ($vid:ident) => { impl_def!($vid, crate::f64::consts::PI); }; } cfg_if! { if #[cfg(all(target_arch = "x86_64", feature = "sleef-sys"))] { use sleef_sys::*; cfg_if! 
{ if #[cfg(target_feature = "avx2")] { impl_unary!(f32x2[t => f32x4]: Sleef_sinpif4_u05avx2128); impl_unary!(f32x16[h => f32x8]: Sleef_sinpif8_u05avx2); impl_unary!(f64x8[h => f64x4]: Sleef_sinpid4_u05avx2); impl_unary!(f32x4: Sleef_sinpif4_u05avx2128); impl_unary!(f32x8: Sleef_sinpif8_u05avx2); impl_unary!(f64x2: Sleef_sinpid2_u05avx2128); impl_unary!(f64x4: Sleef_sinpid4_u05avx2); } else if #[cfg(target_feature = "avx")] { impl_unary!(f32x2[t => f32x4]: Sleef_sinpif4_u05sse4); impl_unary!(f32x16[h => f32x8]: Sleef_sinpif8_u05avx); impl_unary!(f64x8[h => f64x4]: Sleef_sinpid4_u05avx); impl_unary!(f32x4: Sleef_sinpif4_u05sse4); impl_unary!(f32x8: Sleef_sinpif8_u05avx); impl_unary!(f64x2: Sleef_sinpid2_u05sse4); impl_unary!(f64x4: Sleef_sinpid4_u05avx); } else if #[cfg(target_feature = "sse4.2")] { impl_unary!(f32x2[t => f32x4]: Sleef_sinpif4_u05sse4); impl_unary!(f32x16[q => f32x4]: Sleef_sinpif4_u05sse4); impl_unary!(f64x8[q => f64x2]: Sleef_sinpid2_u05sse4); impl_unary!(f32x4: Sleef_sinpif4_u05sse4); impl_unary!(f32x8[h => f32x4]: Sleef_sinpif4_u05sse4); impl_unary!(f64x2: Sleef_sinpid2_u05sse4); impl_unary!(f64x4[h => f64x2]: Sleef_sinpid2_u05sse4); } else { impl_def32!(f32x2); impl_def32!(f32x4); impl_def32!(f32x8); impl_def32!(f32x16); impl_def64!(f64x2); impl_def64!(f64x4); impl_def64!(f64x8); } } } else { impl_def32!(f32x2); impl_def32!(f32x4); impl_def32!(f32x8); impl_def32!(f32x16); impl_def64!(f64x2); impl_def64!(f64x4); impl_def64!(f64x8); } } ================================================ FILE: src/codegen/math/float/sqrt.rs ================================================ //! Vertical floating-point `sqrt` #![allow(unused)] // FIXME 64-bit 1 elem vectors sqrt use crate::*; pub(crate) trait Sqrt { fn sqrt(self) -> Self; } #[allow(improper_ctypes)] extern "C" { #[link_name = "llvm.sqrt.v2f32"] fn sqrt_v2f32(x: f32x2) -> f32x2; #[link_name = "llvm.sqrt.v4f32"] fn sqrt_v4f32(x: f32x4) -> f32x4; #[link_name = "llvm.sqrt.v8f32"] fn sqrt_v8f32(x: f32x8) -> f32x8; #[link_name = "llvm.sqrt.v16f32"] fn sqrt_v16f32(x: f32x16) -> f32x16; /* FIXME 64-bit sqrtgle elem vectors #[link_name = "llvm.sqrt.v1f64"] fn sqrt_v1f64(x: f64x1) -> f64x1; */ #[link_name = "llvm.sqrt.v2f64"] fn sqrt_v2f64(x: f64x2) -> f64x2; #[link_name = "llvm.sqrt.v4f64"] fn sqrt_v4f64(x: f64x4) -> f64x4; #[link_name = "llvm.sqrt.v8f64"] fn sqrt_v8f64(x: f64x8) -> f64x8; #[link_name = "llvm.sqrt.f32"] fn sqrt_f32(x: f32) -> f32; #[link_name = "llvm.sqrt.f64"] fn sqrt_f64(x: f64) -> f64; } gen_unary_impl_table!(Sqrt, sqrt); cfg_if! { if #[cfg(target_arch = "s390x")] { // FIXME: https://github.com/rust-lang-nursery/packed_simd/issues/14 impl_unary!(f32x2[f32; 2]: sqrt_f32); impl_unary!(f32x4[f32; 4]: sqrt_f32); impl_unary!(f32x8[f32; 8]: sqrt_f32); impl_unary!(f32x16[f32; 16]: sqrt_f32); impl_unary!(f64x2[f64; 2]: sqrt_f64); impl_unary!(f64x4[f64; 4]: sqrt_f64); impl_unary!(f64x8[f64; 8]: sqrt_f64); } else if #[cfg(all(target_arch = "x86_64", feature = "sleef-sys"))] { use sleef_sys::*; cfg_if! 
{ if #[cfg(target_feature = "avx2")] { impl_unary!(f32x2[t => f32x4]: Sleef_sqrtf4_avx2128); impl_unary!(f32x16[h => f32x8]: Sleef_sqrtf8_avx2); impl_unary!(f64x8[h => f64x4]: Sleef_sqrtd4_avx2); impl_unary!(f32x4: Sleef_sqrtf4_avx2128); impl_unary!(f32x8: Sleef_sqrtf8_avx2); impl_unary!(f64x2: Sleef_sqrtd2_avx2128); impl_unary!(f64x4: Sleef_sqrtd4_avx2); } else if #[cfg(target_feature = "avx")] { impl_unary!(f32x2[t => f32x4]: Sleef_sqrtf4_sse4); impl_unary!(f32x16[h => f32x8]: Sleef_sqrtf8_avx); impl_unary!(f64x8[h => f64x4]: Sleef_sqrtd4_avx); impl_unary!(f32x4: Sleef_sqrtf4_sse4); impl_unary!(f32x8: Sleef_sqrtf8_avx); impl_unary!(f64x2: Sleef_sqrtd2_sse4); impl_unary!(f64x4: Sleef_sqrtd4_avx); } else if #[cfg(target_feature = "sse4.2")] { impl_unary!(f32x2[t => f32x4]: Sleef_sqrtf4_sse4); impl_unary!(f32x16[q => f32x4]: Sleef_sqrtf4_sse4); impl_unary!(f64x8[q => f64x2]: Sleef_sqrtd2_sse4); impl_unary!(f32x4: Sleef_sqrtf4_sse4); impl_unary!(f32x8[h => f32x4]: Sleef_sqrtf4_sse4); impl_unary!(f64x2: Sleef_sqrtd2_sse4); impl_unary!(f64x4[h => f64x2]: Sleef_sqrtd2_sse4); } else { impl_unary!(f32x2[f32; 2]: sqrt_f32); impl_unary!(f32x16: sqrt_v16f32); impl_unary!(f64x8: sqrt_v8f64); impl_unary!(f32x4: sqrt_v4f32); impl_unary!(f32x8: sqrt_v8f32); impl_unary!(f64x2: sqrt_v2f64); impl_unary!(f64x4: sqrt_v4f64); } } } else { impl_unary!(f32x2[f32; 2]: sqrt_f32); impl_unary!(f32x4: sqrt_v4f32); impl_unary!(f32x8: sqrt_v8f32); impl_unary!(f32x16: sqrt_v16f32); impl_unary!(f64x2: sqrt_v2f64); impl_unary!(f64x4: sqrt_v4f64); impl_unary!(f64x8: sqrt_v8f64); } } ================================================ FILE: src/codegen/math/float/sqrte.rs ================================================ //! Vertical floating-point `sqrt` #![allow(unused)] // FIXME 64-bit 1 elem vectors sqrte use crate::llvm::simd_fsqrt; use crate::*; pub(crate) trait Sqrte { fn sqrte(self) -> Self; } gen_unary_impl_table!(Sqrte, sqrte); cfg_if! { if #[cfg(all(target_arch = "x86_64", feature = "sleef-sys"))] { use sleef_sys::*; cfg_if! 
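// Editor's note (illustrative addition, not part of the original file):
// `sqrte` is the relaxed-precision square-root estimate. The SLEEF symbols
// below carry the `_u35` suffix (3.5 ULP error bound) rather than the
// `_u05`/`_u10` (0.5/1.0 ULP) variants used elsewhere, and the non-SLEEF
// fallback simply reuses the `simd_fsqrt` platform intrinsic. A rough usage
// sketch, assuming the public `sqrte` method forwards to this trait:
//
//     use packed_simd::f64x4;
//     let r = f64x4::new(1.0, 4.0, 9.0, 16.0).sqrte(); // ~ [1.0, 2.0, 3.0, 4.0]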
{ if #[cfg(target_feature = "avx2")] { impl_unary!(f32x2[t => f32x4]: Sleef_sqrtf4_u35avx2128); impl_unary!(f32x16[h => f32x8]: Sleef_sqrtf8_u35avx2); impl_unary!(f64x8[h => f64x4]: Sleef_sqrtd4_u35avx2); impl_unary!(f32x4: Sleef_sqrtf4_u35avx2128); impl_unary!(f32x8: Sleef_sqrtf8_u35avx2); impl_unary!(f64x2: Sleef_sqrtd2_u35avx2128); impl_unary!(f64x4: Sleef_sqrtd4_u35avx2); } else if #[cfg(target_feature = "avx")] { impl_unary!(f32x2[t => f32x4]: Sleef_sqrtf4_u35sse4); impl_unary!(f32x16[h => f32x8]: Sleef_sqrtf8_u35avx); impl_unary!(f64x8[h => f64x4]: Sleef_sqrtd4_u35avx); impl_unary!(f32x4: Sleef_sqrtf4_u35sse4); impl_unary!(f32x8: Sleef_sqrtf8_u35avx); impl_unary!(f64x2: Sleef_sqrtd2_u35sse4); impl_unary!(f64x4: Sleef_sqrtd4_u35avx); } else if #[cfg(target_feature = "sse4.2")] { impl_unary!(f32x2[t => f32x4]: Sleef_sqrtf4_u35sse4); impl_unary!(f32x16[q => f32x4]: Sleef_sqrtf4_u35sse4); impl_unary!(f64x8[q => f64x2]: Sleef_sqrtd2_u35sse4); impl_unary!(f32x4: Sleef_sqrtf4_u35sse4); impl_unary!(f32x8[h => f32x4]: Sleef_sqrtf4_u35sse4); impl_unary!(f64x2: Sleef_sqrtd2_u35sse4); impl_unary!(f64x4[h => f64x2]: Sleef_sqrtd2_u35sse4); } else { impl_unary!(f32x2[g]: simd_fsqrt); impl_unary!(f32x16[g]: simd_fsqrt); impl_unary!(f64x8[g]: simd_fsqrt); impl_unary!(f32x4[g]: simd_fsqrt); impl_unary!(f32x8[g]: simd_fsqrt); impl_unary!(f64x2[g]: simd_fsqrt); impl_unary!(f64x4[g]: simd_fsqrt); } } } else { impl_unary!(f32x2[g]: simd_fsqrt); impl_unary!(f32x4[g]: simd_fsqrt); impl_unary!(f32x8[g]: simd_fsqrt); impl_unary!(f32x16[g]: simd_fsqrt); impl_unary!(f64x2[g]: simd_fsqrt); impl_unary!(f64x4[g]: simd_fsqrt); impl_unary!(f64x8[g]: simd_fsqrt); } } ================================================ FILE: src/codegen/math/float/tanh.rs ================================================ //! Vertical floating-point `tanh` #![allow(unused)] // FIXME 64-bit 1 elem vectors tanh #[cfg(not(feature = "std"))] use num_traits::Float; use crate::*; pub(crate) trait Tanh { fn tanh(self) -> Self; } macro_rules! define_tanh { ($name:ident, $basetype:ty, $simdtype:ty, $lanes:expr, $trait:path) => { fn $name(x: $simdtype) -> $simdtype { use core::intrinsics::transmute; let mut buf: [$basetype; $lanes] = unsafe { transmute(x) }; for elem in &mut buf { *elem = <$basetype as $trait>::tanh(*elem); } unsafe { transmute(buf) } } }; (f32 => $name:ident, $type:ty, $lanes:expr) => { define_tanh!($name, f32, $type, $lanes, Float); }; (f64 => $name:ident, $type:ty, $lanes:expr) => { define_tanh!($name, f64, $type, $lanes, Float); }; } // llvm does not seem to expose the hyperbolic versions of trigonometric // functions; we thus call the classical rust versions on all of them (which // stem from cmath). define_tanh!(f32 => tanh_v2f32, f32x2, 2); define_tanh!(f32 => tanh_v4f32, f32x4, 4); define_tanh!(f32 => tanh_v8f32, f32x8, 8); define_tanh!(f32 => tanh_v16f32, f32x16, 16); define_tanh!(f64 => tanh_v2f64, f64x2, 2); define_tanh!(f64 => tanh_v4f64, f64x4, 4); define_tanh!(f64 => tanh_v8f64, f64x8, 8); fn tanh_f32(x: f32) -> f32 { Float::tanh(x) } fn tanh_f64(x: f64) -> f64 { Float::tanh(x) } gen_unary_impl_table!(Tanh, tanh); cfg_if! 
{ if #[cfg(target_arch = "s390x")] { // FIXME: https://github.com/rust-lang-nursery/packed_simd/issues/14 impl_unary!(f32x2[f32; 2]: tanh_f32); impl_unary!(f32x4[f32; 4]: tanh_f32); impl_unary!(f32x8[f32; 8]: tanh_f32); impl_unary!(f32x16[f32; 16]: tanh_f32); impl_unary!(f64x2[f64; 2]: tanh_f64); impl_unary!(f64x4[f64; 4]: tanh_f64); impl_unary!(f64x8[f64; 8]: tanh_f64); } else if #[cfg(all(target_arch = "x86_64", feature = "sleef-sys"))] { use sleef_sys::*; cfg_if! { if #[cfg(target_feature = "avx2")] { impl_unary!(f32x2[t => f32x4]: Sleef_tanhf4_u10avx2128); impl_unary!(f32x16[h => f32x8]: Sleef_tanhf8_u10avx2); impl_unary!(f64x8[h => f64x4]: Sleef_tanhd4_u10avx2); impl_unary!(f32x4: Sleef_tanhf4_u10avx2128); impl_unary!(f32x8: Sleef_tanhf8_u10avx2); impl_unary!(f64x2: Sleef_tanhd2_u10avx2128); impl_unary!(f64x4: Sleef_tanhd4_u10avx2); } else if #[cfg(target_feature = "avx")] { impl_unary!(f32x2[t => f32x4]: Sleef_tanhf4_u10sse4); impl_unary!(f32x16[h => f32x8]: Sleef_tanhf8_u10avx); impl_unary!(f64x8[h => f64x4]: Sleef_tanhd4_u10avx); impl_unary!(f32x4: Sleef_tanhf4_u10sse4); impl_unary!(f32x8: Sleef_tanhf8_u10avx); impl_unary!(f64x2: Sleef_tanhd2_u10sse4); impl_unary!(f64x4: Sleef_tanhd4_u10avx); } else if #[cfg(target_feature = "sse4.2")] { impl_unary!(f32x2[t => f32x4]: Sleef_tanhf4_u10sse4); impl_unary!(f32x16[q => f32x4]: Sleef_tanhf4_u10sse4); impl_unary!(f64x8[q => f64x2]: Sleef_tanhd2_u10sse4); impl_unary!(f32x4: Sleef_tanhf4_u10sse4); impl_unary!(f32x8[h => f32x4]: Sleef_tanhf4_u10sse4); impl_unary!(f64x2: Sleef_tanhd2_u10sse4); impl_unary!(f64x4[h => f64x2]: Sleef_tanhd2_u10sse4); } else { impl_unary!(f32x2[f32; 2]: tanh_f32); impl_unary!(f32x16: tanh_v16f32); impl_unary!(f64x8: tanh_v8f64); impl_unary!(f32x4: tanh_v4f32); impl_unary!(f32x8: tanh_v8f32); impl_unary!(f64x2: tanh_v2f64); impl_unary!(f64x4: tanh_v4f64); } } } else { impl_unary!(f32x2[f32; 2]: tanh_f32); impl_unary!(f32x4: tanh_v4f32); impl_unary!(f32x8: tanh_v8f32); impl_unary!(f32x16: tanh_v16f32); impl_unary!(f64x2: tanh_v2f64); impl_unary!(f64x4: tanh_v4f64); impl_unary!(f64x8: tanh_v8f64); } } ================================================ FILE: src/codegen/math/float.rs ================================================ //! Vertical floating-point math operations. #![allow(clippy::useless_transmute)] #[macro_use] pub(crate) mod macros; pub(crate) mod abs; pub(crate) mod cos; pub(crate) mod cos_pi; pub(crate) mod exp; pub(crate) mod ln; pub(crate) mod mul_add; pub(crate) mod mul_adde; pub(crate) mod powf; pub(crate) mod sin; pub(crate) mod sin_cos_pi; pub(crate) mod sin_pi; pub(crate) mod sqrt; pub(crate) mod sqrte; pub(crate) mod tanh; ================================================ FILE: src/codegen/math.rs ================================================ //! Vertical math operations pub(crate) mod float; ================================================ FILE: src/codegen/pointer_sized_int.rs ================================================ //! Provides `isize` and `usize` use cfg_if::cfg_if; cfg_if! 
{ if #[cfg(target_pointer_width = "8")] { pub(crate) type isize_ = i8; pub(crate) type usize_ = u8; } else if #[cfg(target_pointer_width = "16")] { pub(crate) type isize_ = i16; pub(crate) type usize_ = u16; } else if #[cfg(target_pointer_width = "32")] { pub(crate) type isize_ = i32; pub(crate) type usize_ = u32; } else if #[cfg(target_pointer_width = "64")] { pub(crate) type isize_ = i64; pub(crate) type usize_ = u64; } else if #[cfg(target_pointer_width = "128")] { pub(crate) type isize_ = i128; pub(crate) type usize_ = u128; } else { compile_error!("unsupported target_pointer_width"); } } ================================================ FILE: src/codegen/reductions/mask/aarch64.rs ================================================ //! Mask reductions implementation for `aarch64` targets /// 128-bit wide vectors macro_rules! aarch64_128_neon_impl { ($id:ident, $vmin:ident, $vmax:ident) => { impl All for $id { #[inline] #[target_feature(enable = "neon")] unsafe fn all(self) -> bool { use crate::arch::aarch64::$vmin; $vmin(crate::mem::transmute(self)) != 0 } } impl Any for $id { #[inline] #[target_feature(enable = "neon")] unsafe fn any(self) -> bool { use crate::arch::aarch64::$vmax; $vmax(crate::mem::transmute(self)) != 0 } } }; } /// 64-bit wide vectors macro_rules! aarch64_64_neon_impl { ($id:ident, $vec128:ident) => { impl All for $id { #[inline] #[target_feature(enable = "neon")] unsafe fn all(self) -> bool { // Duplicates the 64-bit vector into a 128-bit one and // calls all on that. union U { halves: ($id, $id), vec: $vec128, } U { halves: (self, self) }.vec.all() } } impl Any for $id { #[inline] #[target_feature(enable = "neon")] unsafe fn any(self) -> bool { union U { halves: ($id, $id), vec: $vec128, } U { halves: (self, self) }.vec.any() } } }; } /// Mask reduction implementation for `aarch64` targets macro_rules! impl_mask_reductions { // 64-bit wide masks (m8x8) => { aarch64_64_neon_impl!(m8x8, m8x16); }; (m16x4) => { aarch64_64_neon_impl!(m16x4, m16x8); }; (m32x2) => { aarch64_64_neon_impl!(m32x2, m32x4); }; // 128-bit wide masks (m8x16) => { aarch64_128_neon_impl!(m8x16, vminvq_u8, vmaxvq_u8); }; (m16x8) => { aarch64_128_neon_impl!(m16x8, vminvq_u16, vmaxvq_u16); }; (m32x4) => { aarch64_128_neon_impl!(m32x4, vminvq_u32, vmaxvq_u32); }; // Fallback to LLVM's default code-generation: ($id:ident) => { fallback_impl!($id); }; } ================================================ FILE: src/codegen/reductions/mask/arm.rs ================================================ //! Mask reductions implementation for `arm` targets /// Implementation for ARM + v7 + NEON for 64-bit or 128-bit wide vectors with /// more than two elements. macro_rules!
arm_128_v7_neon_impl { ($id:ident, $half:ident, $vpmin:ident, $vpmax:ident) => { impl All for $id { #[inline] #[target_feature(enable = "v7,neon")] unsafe fn all(self) -> bool { use crate::arch::arm::$vpmin; use crate::mem::transmute; union U { halves: ($half, $half), vec: $id, } let halves = U { vec: self }.halves; let h: $half = transmute($vpmin(transmute(halves.0), transmute(halves.1))); h.all() } } impl Any for $id { #[inline] #[target_feature(enable = "v7,neon")] unsafe fn any(self) -> bool { use crate::arch::arm::$vpmax; use crate::mem::transmute; union U { halves: ($half, $half), vec: $id, } let halves = U { vec: self }.halves; let h: $half = transmute($vpmax(transmute(halves.0), transmute(halves.1))); h.any() } } }; } /// Mask reduction implementation for `arm` targets macro_rules! impl_mask_reductions { // 128-bit wide masks (m8x16) => { arm_128_v7_neon_impl!(m8x16, m8x8, vpmin_u8, vpmax_u8); }; (m16x8) => { arm_128_v7_neon_impl!(m16x8, m16x4, vpmin_u16, vpmax_u16); }; (m32x4) => { arm_128_v7_neon_impl!(m32x4, m32x2, vpmin_u32, vpmax_u32); }; // Fallback to LLVM's default code-generation: ($id:ident) => { fallback_impl!($id); }; } ================================================ FILE: src/codegen/reductions/mask/fallback.rs ================================================ //! Default mask reduction implementations. /// Default mask reduction implementation macro_rules! impl_mask_reductions { ($id:ident) => { fallback_impl!($id); }; } ================================================ FILE: src/codegen/reductions/mask/fallback_impl.rs ================================================ //! Default implementation of a mask reduction for any target. macro_rules! fallback_to_other_impl { ($id:ident, $other:ident) => { impl All for $id { #[inline] unsafe fn all(self) -> bool { let m: $other = crate::mem::transmute(self); m.all() } } impl Any for $id { #[inline] unsafe fn any(self) -> bool { let m: $other = crate::mem::transmute(self); m.any() } } }; } /// Fallback implementation. macro_rules! 
fallback_impl { // 16-bit wide masks: (m8x2) => { impl All for m8x2 { #[inline] unsafe fn all(self) -> bool { let i: u16 = crate::mem::transmute(self); i == u16::max_value() } } impl Any for m8x2 { #[inline] unsafe fn any(self) -> bool { let i: u16 = crate::mem::transmute(self); i != 0 } } }; // 32-bit wide masks (m8x4) => { impl All for m8x4 { #[inline] unsafe fn all(self) -> bool { let i: u32 = crate::mem::transmute(self); i == u32::max_value() } } impl Any for m8x4 { #[inline] unsafe fn any(self) -> bool { let i: u32 = crate::mem::transmute(self); i != 0 } } }; (m16x2) => { fallback_to_other_impl!(m16x2, m8x4); }; // 64-bit wide masks: (m8x8) => { impl All for m8x8 { #[inline] unsafe fn all(self) -> bool { let i: u64 = crate::mem::transmute(self); i == u64::max_value() } } impl Any for m8x8 { #[inline] unsafe fn any(self) -> bool { let i: u64 = crate::mem::transmute(self); i != 0 } } }; (m16x4) => { fallback_to_other_impl!(m16x4, m8x8); }; (m32x2) => { fallback_to_other_impl!(m32x2, m16x4); }; // FIXME: 64x1 maxk // 128-bit wide masks: (m8x16) => { impl All for m8x16 { #[inline] unsafe fn all(self) -> bool { let i: u128 = crate::mem::transmute(self); i == u128::max_value() } } impl Any for m8x16 { #[inline] unsafe fn any(self) -> bool { let i: u128 = crate::mem::transmute(self); i != 0 } } }; (m16x8) => { fallback_to_other_impl!(m16x8, m8x16); }; (m32x4) => { fallback_to_other_impl!(m32x4, m16x8); }; (m64x2) => { fallback_to_other_impl!(m64x2, m32x4); }; (m128x1) => { fallback_to_other_impl!(m128x1, m64x2); }; // 256-bit wide masks (m8x32) => { impl All for m8x32 { #[inline] unsafe fn all(self) -> bool { let i: [u128; 2] = crate::mem::transmute(self); let o: [u128; 2] = [u128::max_value(); 2]; i == o } } impl Any for m8x32 { #[inline] unsafe fn any(self) -> bool { let i: [u128; 2] = crate::mem::transmute(self); let o: [u128; 2] = [0; 2]; i != o } } }; (m16x16) => { fallback_to_other_impl!(m16x16, m8x32); }; (m32x8) => { fallback_to_other_impl!(m32x8, m16x16); }; (m64x4) => { fallback_to_other_impl!(m64x4, m32x8); }; (m128x2) => { fallback_to_other_impl!(m128x2, m64x4); }; // 512-bit wide masks (m8x64) => { impl All for m8x64 { #[inline] unsafe fn all(self) -> bool { let i: [u128; 4] = crate::mem::transmute(self); let o: [u128; 4] = [u128::max_value(); 4]; i == o } } impl Any for m8x64 { #[inline] unsafe fn any(self) -> bool { let i: [u128; 4] = crate::mem::transmute(self); let o: [u128; 4] = [0; 4]; i != o } } }; (m16x32) => { fallback_to_other_impl!(m16x32, m8x64); }; (m32x16) => { fallback_to_other_impl!(m32x16, m16x32); }; (m64x8) => { fallback_to_other_impl!(m64x8, m32x16); }; (m128x4) => { fallback_to_other_impl!(m128x4, m64x8); }; // Masks with pointer-sized elements64 (msizex2) => { cfg_if! { if #[cfg(target_pointer_width = "64")] { fallback_to_other_impl!(msizex2, m64x2); } else if #[cfg(target_pointer_width = "32")] { fallback_to_other_impl!(msizex2, m32x2); } else { compile_error!("unsupported target_pointer_width"); } } }; (msizex4) => { cfg_if! { if #[cfg(target_pointer_width = "64")] { fallback_to_other_impl!(msizex4, m64x4); } else if #[cfg(target_pointer_width = "32")] { fallback_to_other_impl!(msizex4, m32x4); } else { compile_error!("unsupported target_pointer_width"); } } }; (msizex8) => { cfg_if! { if #[cfg(target_pointer_width = "64")] { fallback_to_other_impl!(msizex8, m64x8); } else if #[cfg(target_pointer_width = "32")] { fallback_to_other_impl!(msizex8, m32x8); } else { compile_error!("unsupported target_pointer_width"); } } }; } macro_rules! 
recurse_half { ($vid:ident, $vid_h:ident) => { impl All for $vid { #[inline] unsafe fn all(self) -> bool { union U { halves: ($vid_h, $vid_h), vec: $vid, } let halves = U { vec: self }.halves; halves.0.all() && halves.1.all() } } impl Any for $vid { #[inline] unsafe fn any(self) -> bool { union U { halves: ($vid_h, $vid_h), vec: $vid, } let halves = U { vec: self }.halves; halves.0.any() || halves.1.any() } } }; } ================================================ FILE: src/codegen/reductions/mask/x86/avx.rs ================================================ //! Mask reductions implementation for `x86` and `x86_64` targets with `AVX` /// `x86`/`x86_64` 256-bit `AVX` implementation /// FIXME: it might be faster here to do two `_mm_movmask_epi8` #[cfg(target_feature = "avx")] macro_rules! x86_m8x32_avx_impl { ($id:ident) => { impl All for $id { #[inline] #[target_feature(enable = "avx")] unsafe fn all(self) -> bool { #[cfg(target_arch = "x86")] use crate::arch::x86::_mm256_testc_si256; #[cfg(target_arch = "x86_64")] use crate::arch::x86_64::_mm256_testc_si256; _mm256_testc_si256(crate::mem::transmute(self), crate::mem::transmute($id::splat(true))) != 0 } } impl Any for $id { #[inline] #[target_feature(enable = "avx")] unsafe fn any(self) -> bool { #[cfg(target_arch = "x86")] use crate::arch::x86::_mm256_testz_si256; #[cfg(target_arch = "x86_64")] use crate::arch::x86_64::_mm256_testz_si256; _mm256_testz_si256(crate::mem::transmute(self), crate::mem::transmute(self)) == 0 } } }; } /// `x86`/`x86_64` 256-bit m32x8 `AVX` implementation macro_rules! x86_m32x8_avx_impl { ($id:ident) => { impl All for $id { #[inline] #[target_feature(enable = "sse")] unsafe fn all(self) -> bool { #[cfg(target_arch = "x86")] use crate::arch::x86::_mm256_movemask_ps; #[cfg(target_arch = "x86_64")] use crate::arch::x86_64::_mm256_movemask_ps; // _mm256_movemask_ps(a) creates a 8bit mask containing the // most significant bit of each lane of `a`. If all bits are // set, then all 8 lanes of the mask are true. _mm256_movemask_ps(crate::mem::transmute(self)) == 0b_1111_1111_i32 } } impl Any for $id { #[inline] #[target_feature(enable = "sse")] unsafe fn any(self) -> bool { #[cfg(target_arch = "x86")] use crate::arch::x86::_mm256_movemask_ps; #[cfg(target_arch = "x86_64")] use crate::arch::x86_64::_mm256_movemask_ps; _mm256_movemask_ps(crate::mem::transmute(self)) != 0 } } }; } /// `x86`/`x86_64` 256-bit m64x4 `AVX` implementation macro_rules! x86_m64x4_avx_impl { ($id:ident) => { impl All for $id { #[inline] #[target_feature(enable = "sse")] unsafe fn all(self) -> bool { #[cfg(target_arch = "x86")] use crate::arch::x86::_mm256_movemask_pd; #[cfg(target_arch = "x86_64")] use crate::arch::x86_64::_mm256_movemask_pd; // _mm256_movemask_pd(a) creates a 4bit mask containing the // most significant bit of each lane of `a`. If all bits are // set, then all 4 lanes of the mask are true. _mm256_movemask_pd(crate::mem::transmute(self)) == 0b_1111_i32 } } impl Any for $id { #[inline] #[target_feature(enable = "sse")] unsafe fn any(self) -> bool { #[cfg(target_arch = "x86")] use crate::arch::x86::_mm256_movemask_pd; #[cfg(target_arch = "x86_64")] use crate::arch::x86_64::_mm256_movemask_pd; _mm256_movemask_pd(crate::mem::transmute(self)) != 0 } } }; } ================================================ FILE: src/codegen/reductions/mask/x86/avx2.rs ================================================ //! Mask reductions implementation for `x86` and `x86_64` targets with `AVX2`. 
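//!
//! A minimal scalar sketch of the `movemask`-style reduction used below
//! (illustrative only; `all_from_movemask`/`any_from_movemask` are
//! hypothetical helpers, not part of this crate's API):
//! `_mm256_movemask_epi8` packs the most-significant bit of each of the
//! 32 bytes into an `i32`, so the reductions amount to:
//!
//! ```rust,ignore
//! fn all_from_movemask(bits: i32) -> bool { bits == -1 } // all 32 MSBs set
//! fn any_from_movemask(bits: i32) -> bool { bits != 0 }  // at least one MSB set
//! ```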
#![allow(unused)] /// x86/x86_64 256-bit m8x32 AVX2 implementation macro_rules! x86_m8x32_avx2_impl { ($id:ident) => { impl All for $id { #[inline] #[target_feature(enable = "sse2")] unsafe fn all(self) -> bool { #[cfg(target_arch = "x86")] use crate::arch::x86::_mm256_movemask_epi8; #[cfg(target_arch = "x86_64")] use crate::arch::x86_64::_mm256_movemask_epi8; // _mm256_movemask_epi8(a) creates a 32bit mask containing the // most significant bit of each byte of `a`. If all // bits are set, then all 32 lanes of the mask are // true. _mm256_movemask_epi8(crate::mem::transmute(self)) == -1_i32 } } impl Any for $id { #[inline] #[target_feature(enable = "sse2")] unsafe fn any(self) -> bool { #[cfg(target_arch = "x86")] use crate::arch::x86::_mm256_movemask_epi8; #[cfg(target_arch = "x86_64")] use crate::arch::x86_64::_mm256_movemask_epi8; _mm256_movemask_epi8(crate::mem::transmute(self)) != 0 } } }; } ================================================ FILE: src/codegen/reductions/mask/x86/sse.rs ================================================ //! Mask reductions implementation for `x86` and `x86_64` targets with `SSE`. #![allow(unused)] /// `x86`/`x86_64` 128-bit `m32x4` `SSE` implementation macro_rules! x86_m32x4_sse_impl { ($id:ident) => { impl All for $id { #[inline] #[target_feature(enable = "sse")] unsafe fn all(self) -> bool { #[cfg(target_arch = "x86")] use crate::arch::x86::_mm_movemask_ps; #[cfg(target_arch = "x86_64")] use crate::arch::x86_64::_mm_movemask_ps; // _mm_movemask_ps(a) creates a 4bit mask containing the // most significant bit of each lane of `a`. If all // bits are set, then all 4 lanes of the mask are // true. _mm_movemask_ps(crate::mem::transmute(self)) == 0b_1111_i32 } } impl Any for $id { #[inline] #[target_feature(enable = "sse")] unsafe fn any(self) -> bool { #[cfg(target_arch = "x86")] use crate::arch::x86::_mm_movemask_ps; #[cfg(target_arch = "x86_64")] use crate::arch::x86_64::_mm_movemask_ps; _mm_movemask_ps(crate::mem::transmute(self)) != 0 } } }; } ================================================ FILE: src/codegen/reductions/mask/x86/sse2.rs ================================================ //! Mask reductions implementation for `x86` and `x86_64` targets with `SSE2`. #![allow(unused)] /// `x86`/`x86_64` 128-bit m64x2 `SSE2` implementation macro_rules! x86_m64x2_sse2_impl { ($id:ident) => { impl All for $id { #[inline] #[target_feature(enable = "sse")] unsafe fn all(self) -> bool { #[cfg(target_arch = "x86")] use crate::arch::x86::_mm_movemask_pd; #[cfg(target_arch = "x86_64")] use crate::arch::x86_64::_mm_movemask_pd; // _mm_movemask_pd(a) creates a 2bit mask containing the // most significant bit of each lane of `a`. If all // bits are set, then all 2 lanes of the mask are // true. _mm_movemask_pd(crate::mem::transmute(self)) == 0b_11_i32 } } impl Any for $id { #[inline] #[target_feature(enable = "sse")] unsafe fn any(self) -> bool { #[cfg(target_arch = "x86")] use crate::arch::x86::_mm_movemask_pd; #[cfg(target_arch = "x86_64")] use crate::arch::x86_64::_mm_movemask_pd; _mm_movemask_pd(crate::mem::transmute(self)) != 0 } } }; } /// `x86`/`x86_64` 128-bit m8x16 `SSE2` implementation macro_rules! 
x86_m8x16_sse2_impl { ($id:ident) => { impl All for $id { #[inline] #[target_feature(enable = "sse2")] unsafe fn all(self) -> bool { #[cfg(target_arch = "x86")] use crate::arch::x86::_mm_movemask_epi8; #[cfg(target_arch = "x86_64")] use crate::arch::x86_64::_mm_movemask_epi8; // _mm_movemask_epi8(a) creates a 16bit mask containing the // most significant bit of each byte of `a`. If all // bits are set, then all 16 lanes of the mask are // true. _mm_movemask_epi8(crate::mem::transmute(self)) == i32::from(u16::max_value()) } } impl Any for $id { #[inline] #[target_feature(enable = "sse2")] unsafe fn any(self) -> bool { #[cfg(target_arch = "x86")] use crate::arch::x86::_mm_movemask_epi8; #[cfg(target_arch = "x86_64")] use crate::arch::x86_64::_mm_movemask_epi8; _mm_movemask_epi8(crate::mem::transmute(self)) != 0 } } }; } ================================================ FILE: src/codegen/reductions/mask/x86.rs ================================================ //! Mask reductions implementation for `x86` and `x86_64` targets #[cfg(target_feature = "sse")] #[macro_use] mod sse; #[cfg(target_feature = "sse2")] #[macro_use] mod sse2; #[cfg(target_feature = "avx")] #[macro_use] mod avx; #[cfg(target_feature = "avx2")] #[macro_use] mod avx2; /// x86 64-bit m8x8 implementation macro_rules! x86_m8x8_impl { ($id:ident) => { fallback_impl!($id); }; } /// x86 128-bit m8x16 implementation macro_rules! x86_m8x16_impl { ($id:ident) => { cfg_if! { if #[cfg(target_feature = "sse2")] { x86_m8x16_sse2_impl!($id); } else { fallback_impl!($id); } } }; } /// x86 128-bit m32x4 implementation macro_rules! x86_m32x4_impl { ($id:ident) => { cfg_if! { if #[cfg(target_feature = "sse")] { x86_m32x4_sse_impl!($id); } else { fallback_impl!($id); } } }; } /// x86 128-bit m64x2 implementation macro_rules! x86_m64x2_impl { ($id:ident) => { cfg_if! { if #[cfg(target_feature = "sse2")] { x86_m64x2_sse2_impl!($id); } else if #[cfg(target_feature = "sse")] { x86_m32x4_sse_impl!($id); } else { fallback_impl!($id); } } }; } /// x86 256-bit m8x32 implementation macro_rules! x86_m8x32_impl { ($id:ident, $half_id:ident) => { cfg_if! { if #[cfg(target_feature = "avx2")] { x86_m8x32_avx2_impl!($id); } else if #[cfg(target_feature = "avx")] { x86_m8x32_avx_impl!($id); } else if #[cfg(target_feature = "sse2")] { recurse_half!($id, $half_id); } else { fallback_impl!($id); } } }; } /// x86 256-bit m32x8 implementation macro_rules! x86_m32x8_impl { ($id:ident, $half_id:ident) => { cfg_if! { if #[cfg(target_feature = "avx")] { x86_m32x8_avx_impl!($id); } else if #[cfg(target_feature = "sse")] { recurse_half!($id, $half_id); } else { fallback_impl!($id); } } }; } /// x86 256-bit m64x4 implementation macro_rules! x86_m64x4_impl { ($id:ident, $half_id:ident) => { cfg_if! { if #[cfg(target_feature = "avx")] { x86_m64x4_avx_impl!($id); } else if #[cfg(target_feature = "sse")] { recurse_half!($id, $half_id); } else { fallback_impl!($id); } } }; } /// Fallback implementation. macro_rules! x86_intr_impl { ($id:ident) => { impl All for $id { #[inline] unsafe fn all(self) -> bool { use crate::llvm::simd_reduce_all; simd_reduce_all(self.0) } } impl Any for $id { #[inline] unsafe fn any(self) -> bool { use crate::llvm::simd_reduce_any; simd_reduce_any(self.0) } } }; } /// Mask reduction implementation for `x86` and `x86_64` targets macro_rules! 
impl_mask_reductions { // 64-bit wide masks (m8x8) => { x86_m8x8_impl!(m8x8); }; (m16x4) => { x86_m8x8_impl!(m16x4); }; (m32x2) => { x86_m8x8_impl!(m32x2); }; // 128-bit wide masks (m8x16) => { x86_m8x16_impl!(m8x16); }; (m16x8) => { x86_m8x16_impl!(m16x8); }; (m32x4) => { x86_m32x4_impl!(m32x4); }; (m64x2) => { x86_m64x2_impl!(m64x2); }; (m128x1) => { x86_intr_impl!(m128x1); }; // 256-bit wide masks: (m8x32) => { x86_m8x32_impl!(m8x32, m8x16); }; (m16x16) => { x86_m8x32_impl!(m16x16, m16x8); }; (m32x8) => { x86_m32x8_impl!(m32x8, m32x4); }; (m64x4) => { x86_m64x4_impl!(m64x4, m64x2); }; (m128x2) => { x86_intr_impl!(m128x2); }; (msizex2) => { cfg_if! { if #[cfg(target_pointer_width = "64")] { fallback_to_other_impl!(msizex2, m64x2); } else if #[cfg(target_pointer_width = "32")] { fallback_to_other_impl!(msizex2, m32x2); } else { compile_error!("unsupported target_pointer_width"); } } }; (msizex4) => { cfg_if! { if #[cfg(target_pointer_width = "64")] { fallback_to_other_impl!(msizex4, m64x4); } else if #[cfg(target_pointer_width = "32")] { fallback_to_other_impl!(msizex4, m32x4); } else { compile_error!("unsupported target_pointer_width"); } } }; (msizex8) => { cfg_if! { if #[cfg(target_pointer_width = "64")] { fallback_to_other_impl!(msizex8, m64x8); } else if #[cfg(target_pointer_width = "32")] { fallback_to_other_impl!(msizex8, m32x8); } else { compile_error!("unsupported target_pointer_width"); } } }; // Fallback to LLVM's default code-generation: ($id:ident) => { fallback_impl!($id); }; } ================================================ FILE: src/codegen/reductions/mask.rs ================================================ //! Code generation workaround for `all()` mask horizontal reduction. //! //! Works around [LLVM bug 36702]. //! //! [LLVM bug 36702]: https://bugs.llvm.org/show_bug.cgi?id=36702 #![allow(unused_macros)] use crate::*; pub(crate) trait All: crate::marker::Sized { unsafe fn all(self) -> bool; } pub(crate) trait Any: crate::marker::Sized { unsafe fn any(self) -> bool; } #[macro_use] mod fallback_impl; cfg_if! { if #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] { #[macro_use] mod x86; } else if #[cfg(all(target_arch = "arm", target_feature = "v7", target_feature = "neon", any(feature = "core_arch", libcore_neon)))] { #[macro_use] mod arm; } else if #[cfg(all(target_arch = "aarch64", target_feature = "neon"))] { #[macro_use] mod aarch64; } else { #[macro_use] mod fallback; } } impl_mask_reductions!(m8x2); impl_mask_reductions!(m8x4); impl_mask_reductions!(m8x8); impl_mask_reductions!(m8x16); impl_mask_reductions!(m8x32); impl_mask_reductions!(m8x64); impl_mask_reductions!(m16x2); impl_mask_reductions!(m16x4); impl_mask_reductions!(m16x8); impl_mask_reductions!(m16x16); impl_mask_reductions!(m16x32); impl_mask_reductions!(m32x2); impl_mask_reductions!(m32x4); impl_mask_reductions!(m32x8); impl_mask_reductions!(m32x16); // FIXME: 64-bit single element vector // impl_mask_reductions!(m64x1); impl_mask_reductions!(m64x2); impl_mask_reductions!(m64x4); impl_mask_reductions!(m64x8); impl_mask_reductions!(m128x1); impl_mask_reductions!(m128x2); impl_mask_reductions!(m128x4); impl_mask_reductions!(msizex2); impl_mask_reductions!(msizex4); impl_mask_reductions!(msizex8); ================================================ FILE: src/codegen/reductions.rs ================================================ pub(crate) mod mask; ================================================ FILE: src/codegen/shuffle.rs ================================================ //! 
Implementations of the `ShuffleResult` trait for the different numbers of //! lanes and vector element types. use crate::masks::*; use crate::sealed::{Seal, Shuffle}; macro_rules! impl_shuffle { ($array:ty, $base:ty, $out:ty) => { impl Seal<$array> for $base {} impl Shuffle<$array> for $base { type Output = $out; } }; } impl_shuffle! { [u32; 2], i8, crate::codegen::i8x2 } impl_shuffle! { [u32; 4], i8, crate::codegen::i8x4 } impl_shuffle! { [u32; 8], i8, crate::codegen::i8x8 } impl_shuffle! { [u32; 16], i8, crate::codegen::i8x16 } impl_shuffle! { [u32; 32], i8, crate::codegen::i8x32 } impl_shuffle! { [u32; 64], i8, crate::codegen::i8x64 } impl_shuffle! { [u32; 2], u8, crate::codegen::u8x2 } impl_shuffle! { [u32; 4], u8, crate::codegen::u8x4 } impl_shuffle! { [u32; 8], u8, crate::codegen::u8x8 } impl_shuffle! { [u32; 16], u8, crate::codegen::u8x16 } impl_shuffle! { [u32; 32], u8, crate::codegen::u8x32 } impl_shuffle! { [u32; 64], u8, crate::codegen::u8x64 } impl_shuffle! { [u32; 2], m8, crate::codegen::m8x2 } impl_shuffle! { [u32; 4], m8, crate::codegen::m8x4 } impl_shuffle! { [u32; 8], m8, crate::codegen::m8x8 } impl_shuffle! { [u32; 16], m8, crate::codegen::m8x16 } impl_shuffle! { [u32; 32], m8, crate::codegen::m8x32 } impl_shuffle! { [u32; 64], m8, crate::codegen::m8x64 } impl_shuffle! { [u32; 2], i16, crate::codegen::i16x2 } impl_shuffle! { [u32; 4], i16, crate::codegen::i16x4 } impl_shuffle! { [u32; 8], i16, crate::codegen::i16x8 } impl_shuffle! { [u32; 16], i16, crate::codegen::i16x16 } impl_shuffle! { [u32; 32], i16, crate::codegen::i16x32 } impl_shuffle! { [u32; 2], u16, crate::codegen::u16x2 } impl_shuffle! { [u32; 4], u16, crate::codegen::u16x4 } impl_shuffle! { [u32; 8], u16, crate::codegen::u16x8 } impl_shuffle! { [u32; 16], u16, crate::codegen::u16x16 } impl_shuffle! { [u32; 32], u16, crate::codegen::u16x32 } impl_shuffle! { [u32; 2], m16, crate::codegen::m16x2 } impl_shuffle! { [u32; 4], m16, crate::codegen::m16x4 } impl_shuffle! { [u32; 8], m16, crate::codegen::m16x8 } impl_shuffle! { [u32; 16], m16, crate::codegen::m16x16 } impl_shuffle! { [u32; 2], i32, crate::codegen::i32x2 } impl_shuffle! { [u32; 4], i32, crate::codegen::i32x4 } impl_shuffle! { [u32; 8], i32, crate::codegen::i32x8 } impl_shuffle! { [u32; 16], i32, crate::codegen::i32x16 } impl_shuffle! { [u32; 2], u32, crate::codegen::u32x2 } impl_shuffle! { [u32; 4], u32, crate::codegen::u32x4 } impl_shuffle! { [u32; 8], u32, crate::codegen::u32x8 } impl_shuffle! { [u32; 16], u32, crate::codegen::u32x16 } impl_shuffle! { [u32; 2], f32, crate::codegen::f32x2 } impl_shuffle! { [u32; 4], f32, crate::codegen::f32x4 } impl_shuffle! { [u32; 8], f32, crate::codegen::f32x8 } impl_shuffle! { [u32; 16], f32, crate::codegen::f32x16 } impl_shuffle! { [u32; 2], m32, crate::codegen::m32x2 } impl_shuffle! { [u32; 4], m32, crate::codegen::m32x4 } impl_shuffle! { [u32; 8], m32, crate::codegen::m32x8 } impl_shuffle! { [u32; 16], m32, crate::codegen::m32x16 } /* FIXME: 64-bit single element vector impl_shuffle! { [u32; 1], i64, crate::codegen::i64x1 } */ impl_shuffle! { [u32; 2], i64, crate::codegen::i64x2 } impl_shuffle! { [u32; 4], i64, crate::codegen::i64x4 } impl_shuffle! { [u32; 8], i64, crate::codegen::i64x8 } /* FIXME: 64-bit single element vector impl_shuffle! { [u32; 1], i64, crate::codegen::i64x1 } */ impl_shuffle! { [u32; 2], u64, crate::codegen::u64x2 } impl_shuffle! { [u32; 4], u64, crate::codegen::u64x4 } impl_shuffle! { [u32; 8], u64, crate::codegen::u64x8 } /* FIXME: 64-bit single element vector impl_shuffle! 
{ [u32; 1], i64, crate::codegen::i64x1 } */ impl_shuffle! { [u32; 2], f64, crate::codegen::f64x2 } impl_shuffle! { [u32; 4], f64, crate::codegen::f64x4 } impl_shuffle! { [u32; 8], f64, crate::codegen::f64x8 } /* FIXME: 64-bit single element vector impl_shuffle! { [u32; 1], i64, crate::codegen::i64x1 } */ impl_shuffle! { [u32; 2], m64, crate::codegen::m64x2 } impl_shuffle! { [u32; 4], m64, crate::codegen::m64x4 } impl_shuffle! { [u32; 8], m64, crate::codegen::m64x8 } impl_shuffle! { [u32; 2], isize, crate::codegen::isizex2 } impl_shuffle! { [u32; 4], isize, crate::codegen::isizex4 } impl_shuffle! { [u32; 8], isize, crate::codegen::isizex8 } impl_shuffle! { [u32; 2], usize, crate::codegen::usizex2 } impl_shuffle! { [u32; 4], usize, crate::codegen::usizex4 } impl_shuffle! { [u32; 8], usize, crate::codegen::usizex8 } impl_shuffle! { [u32; 2], msize, crate::codegen::msizex2 } impl_shuffle! { [u32; 4], msize, crate::codegen::msizex4 } impl_shuffle! { [u32; 8], msize, crate::codegen::msizex8 } impl<T> Seal<[u32; 2]> for *const T {} impl<T> Shuffle<[u32; 2]> for *const T { type Output = crate::codegen::cptrx2; } impl<T> Seal<[u32; 4]> for *const T {} impl<T> Shuffle<[u32; 4]> for *const T { type Output = crate::codegen::cptrx4; } impl<T> Seal<[u32; 8]> for *const T {} impl<T> Shuffle<[u32; 8]> for *const T { type Output = crate::codegen::cptrx8; } impl<T> Seal<[u32; 2]> for *mut T {} impl<T> Shuffle<[u32; 2]> for *mut T { type Output = crate::codegen::mptrx2; } impl<T> Seal<[u32; 4]> for *mut T {} impl<T> Shuffle<[u32; 4]> for *mut T { type Output = crate::codegen::mptrx4; } impl<T> Seal<[u32; 8]> for *mut T {} impl<T> Shuffle<[u32; 8]> for *mut T { type Output = crate::codegen::mptrx8; } impl_shuffle! { [u32; 1], i128, crate::codegen::i128x1 } impl_shuffle! { [u32; 2], i128, crate::codegen::i128x2 } impl_shuffle! { [u32; 4], i128, crate::codegen::i128x4 } impl_shuffle! { [u32; 1], u128, crate::codegen::u128x1 } impl_shuffle! { [u32; 2], u128, crate::codegen::u128x2 } impl_shuffle! { [u32; 4], u128, crate::codegen::u128x4 } impl_shuffle! { [u32; 1], m128, crate::codegen::m128x1 } impl_shuffle! { [u32; 2], m128, crate::codegen::m128x2 } impl_shuffle! { [u32; 4], m128, crate::codegen::m128x4 } ================================================ FILE: src/codegen/shuffle1_dyn.rs ================================================ //! Shuffle vector lanes with run-time indices. use crate::*; pub trait Shuffle1Dyn { type Indices; fn shuffle1_dyn(self, _: Self::Indices) -> Self; } // Fallback implementation macro_rules! impl_fallback { ($id:ident) => { impl Shuffle1Dyn for $id { type Indices = Self; #[inline] fn shuffle1_dyn(self, indices: Self::Indices) -> Self { let mut result = Self::splat(0); for i in 0..$id::lanes() { result = result.replace(i, self.extract(indices.extract(i) as usize)); } result } } }; } macro_rules! impl_shuffle1_dyn { (u8x8) => { cfg_if! { if #[cfg(all( any( all(target_arch = "aarch64", target_feature = "neon"), all(target_arch = "doesnotexist", target_feature = "v7", target_feature = "neon") ), any(feature = "core_arch", libcore_neon) ) )] { impl Shuffle1Dyn for u8x8 { type Indices = Self; #[inline] fn shuffle1_dyn(self, indices: Self::Indices) -> Self { #[cfg(target_arch = "aarch64")] use crate::arch::aarch64::vtbl1_u8; #[cfg(target_arch = "doesnotexist")] use crate::arch::arm::vtbl1_u8; // This is safe because the binary is compiled with // neon enabled at compile-time and can therefore only // run on CPUs that have it enabled.
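// `vtbl1_u8` is the NEON byte table look-up: conceptually,
// `result[i] = if indices[i] < 8 { self[indices[i] as usize] } else { 0 }`,
// which agrees with the scalar `impl_fallback!` behaviour for in-range indices.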
unsafe { Simd(mem::transmute( vtbl1_u8(mem::transmute(self.0), crate::mem::transmute(indices.0)) )) } } } } else { impl_fallback!(u8x8); } } }; (u8x16) => { cfg_if! { if #[cfg(all(any(target_arch = "x86", target_arch = "x86_64"), target_feature = "ssse3"))] { impl Shuffle1Dyn for u8x16 { type Indices = Self; #[inline] fn shuffle1_dyn(self, indices: Self::Indices) -> Self { #[cfg(target_arch = "x86")] use crate::arch::x86::_mm_shuffle_epi8; #[cfg(target_arch = "x86_64")] use crate::arch::x86_64::_mm_shuffle_epi8; // This is safe because the binary is compiled with // ssse3 enabled at compile-time and can therefore only // run on CPUs that have it enabled. unsafe { Simd(mem::transmute( _mm_shuffle_epi8(mem::transmute(self.0), crate::mem::transmute(indices)) )) } } } } else if #[cfg(all(target_arch = "aarch64", target_feature = "neon", any(feature = "core_arch", libcore_neon)))] { impl Shuffle1Dyn for u8x16 { type Indices = Self; #[inline] fn shuffle1_dyn(self, indices: Self::Indices) -> Self { use crate::arch::aarch64::vqtbl1q_u8; // This is safe because the binary is compiled with // neon enabled at compile-time and can therefore only // run on CPUs that have it enabled. unsafe { Simd(mem::transmute( vqtbl1q_u8(mem::transmute(self.0), crate::mem::transmute(indices.0)) )) } } } } else if #[cfg(all(target_arch = "doesnotexist", target_feature = "v7", target_feature = "neon", any(feature = "core_arch", libcore_neon)))] { impl Shuffle1Dyn for u8x16 { type Indices = Self; #[inline] fn shuffle1_dyn(self, indices: Self::Indices) -> Self { use crate::arch::arm::vtbl2_u8; // This is safe because the binary is compiled with // neon enabled at compile-time and can therefore only // run on CPUs that have it enabled. unsafe { union U { j: u8x16, s: (u8x8, u8x8), } let (i0, i1) = U { j: y }.s; let r0 = vtbl2_u8( mem::transmute(x), crate::mem::transmute(i0) ); let r1 = vtbl2_u8( mem::transmute(x), crate::mem::transmute(i1) ); let r = U { s: (r0, r1) }.j; Simd(mem::transmute(r)) } } } } else { impl_fallback!(u8x16); } } }; (u16x8) => { impl Shuffle1Dyn for u16x8 { type Indices = Self; #[inline] fn shuffle1_dyn(self, indices: Self::Indices) -> Self { let indices: u8x8 = (indices * 2).cast(); let indices: u8x16 = shuffle!(indices, [0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7]); let v = u8x16::new(0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1); let indices = indices + v; unsafe { let s: u8x16 = crate::mem::transmute(self); crate::mem::transmute(s.shuffle1_dyn(indices)) } } } }; (u32x4) => { cfg_if! { if #[cfg(all(any(target_arch = "x86", target_arch = "x86_64"), target_feature = "avx"))] { impl Shuffle1Dyn for u32x4 { type Indices = Self; #[inline] fn shuffle1_dyn(self, indices: Self::Indices) -> Self { #[cfg(target_arch = "x86")] use crate::arch::x86::{_mm_permutevar_ps}; #[cfg(target_arch = "x86_64")] use crate::arch::x86_64::{_mm_permutevar_ps}; unsafe { crate::mem::transmute( _mm_permutevar_ps( crate::mem::transmute(self.0), crate::mem::transmute(indices.0) ) ) } } } } else { impl Shuffle1Dyn for u32x4 { type Indices = Self; #[inline] fn shuffle1_dyn(self, indices: Self::Indices) -> Self { let indices: u8x4 = (indices * 4).cast(); let indices: u8x16 = shuffle!( indices, [0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3] ); let v = u8x16::new( 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3 ); let indices = indices + v; unsafe { let s: u8x16 =crate::mem::transmute(self); crate::mem::transmute(s.shuffle1_dyn(indices)) } } } } } }; (u64x2) => { cfg_if! 
{ if #[cfg(all(any(target_arch = "x86", target_arch = "x86_64"), target_feature = "avx"))] { impl Shuffle1Dyn for u64x2 { type Indices = Self; #[inline] fn shuffle1_dyn(self, indices: Self::Indices) -> Self { #[cfg(target_arch = "x86")] use crate::arch::x86::{_mm_permutevar_pd}; #[cfg(target_arch = "x86_64")] use crate::arch::x86_64::{_mm_permutevar_pd}; // _mm_permutevar_pd uses the _second_ bit of each // element to perform the selection, that is: 0b00 => 0, // 0b10 => 1: let indices = indices << 1; unsafe { crate::mem::transmute( _mm_permutevar_pd( crate::mem::transmute(self), crate::mem::transmute(indices) ) ) } } } } else { impl Shuffle1Dyn for u64x2 { type Indices = Self; #[inline] fn shuffle1_dyn(self, indices: Self::Indices) -> Self { let indices: u8x2 = (indices * 8).cast(); let indices: u8x16 = shuffle!( indices, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1] ); let v = u8x16::new( 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7 ); let indices = indices + v; unsafe { let s: u8x16 =crate::mem::transmute(self); crate::mem::transmute(s.shuffle1_dyn(indices)) } } } } } }; (u128x1) => { impl Shuffle1Dyn for u128x1 { type Indices = Self; #[inline] fn shuffle1_dyn(self, _indices: Self::Indices) -> Self { self } } }; ($id:ident) => { impl_fallback!($id); }; } impl_shuffle1_dyn!(u8x2); impl_shuffle1_dyn!(u8x4); impl_shuffle1_dyn!(u8x8); impl_shuffle1_dyn!(u8x16); impl_shuffle1_dyn!(u8x32); impl_shuffle1_dyn!(u8x64); impl_shuffle1_dyn!(u16x2); impl_shuffle1_dyn!(u16x4); impl_shuffle1_dyn!(u16x8); impl_shuffle1_dyn!(u16x16); impl_shuffle1_dyn!(u16x32); impl_shuffle1_dyn!(u32x2); impl_shuffle1_dyn!(u32x4); impl_shuffle1_dyn!(u32x8); impl_shuffle1_dyn!(u32x16); impl_shuffle1_dyn!(u64x2); impl_shuffle1_dyn!(u64x4); impl_shuffle1_dyn!(u64x8); impl_shuffle1_dyn!(usizex2); impl_shuffle1_dyn!(usizex4); impl_shuffle1_dyn!(usizex8); impl_shuffle1_dyn!(u128x1); impl_shuffle1_dyn!(u128x2); impl_shuffle1_dyn!(u128x4); // Implementation for non-unsigned vector types macro_rules! 
impl_shuffle1_dyn_non_u { ($id:ident, $uid:ident) => { impl Shuffle1Dyn for $id { type Indices = $uid; #[inline] fn shuffle1_dyn(self, indices: Self::Indices) -> Self { unsafe { let u: $uid = crate::mem::transmute(self); crate::mem::transmute(u.shuffle1_dyn(indices)) } } } }; } impl_shuffle1_dyn_non_u!(i8x2, u8x2); impl_shuffle1_dyn_non_u!(i8x4, u8x4); impl_shuffle1_dyn_non_u!(i8x8, u8x8); impl_shuffle1_dyn_non_u!(i8x16, u8x16); impl_shuffle1_dyn_non_u!(i8x32, u8x32); impl_shuffle1_dyn_non_u!(i8x64, u8x64); impl_shuffle1_dyn_non_u!(i16x2, u16x2); impl_shuffle1_dyn_non_u!(i16x4, u16x4); impl_shuffle1_dyn_non_u!(i16x8, u16x8); impl_shuffle1_dyn_non_u!(i16x16, u16x16); impl_shuffle1_dyn_non_u!(i16x32, u16x32); impl_shuffle1_dyn_non_u!(i32x2, u32x2); impl_shuffle1_dyn_non_u!(i32x4, u32x4); impl_shuffle1_dyn_non_u!(i32x8, u32x8); impl_shuffle1_dyn_non_u!(i32x16, u32x16); impl_shuffle1_dyn_non_u!(i64x2, u64x2); impl_shuffle1_dyn_non_u!(i64x4, u64x4); impl_shuffle1_dyn_non_u!(i64x8, u64x8); impl_shuffle1_dyn_non_u!(isizex2, usizex2); impl_shuffle1_dyn_non_u!(isizex4, usizex4); impl_shuffle1_dyn_non_u!(isizex8, usizex8); impl_shuffle1_dyn_non_u!(i128x1, u128x1); impl_shuffle1_dyn_non_u!(i128x2, u128x2); impl_shuffle1_dyn_non_u!(i128x4, u128x4); impl_shuffle1_dyn_non_u!(m8x2, u8x2); impl_shuffle1_dyn_non_u!(m8x4, u8x4); impl_shuffle1_dyn_non_u!(m8x8, u8x8); impl_shuffle1_dyn_non_u!(m8x16, u8x16); impl_shuffle1_dyn_non_u!(m8x32, u8x32); impl_shuffle1_dyn_non_u!(m8x64, u8x64); impl_shuffle1_dyn_non_u!(m16x2, u16x2); impl_shuffle1_dyn_non_u!(m16x4, u16x4); impl_shuffle1_dyn_non_u!(m16x8, u16x8); impl_shuffle1_dyn_non_u!(m16x16, u16x16); impl_shuffle1_dyn_non_u!(m16x32, u16x32); impl_shuffle1_dyn_non_u!(m32x2, u32x2); impl_shuffle1_dyn_non_u!(m32x4, u32x4); impl_shuffle1_dyn_non_u!(m32x8, u32x8); impl_shuffle1_dyn_non_u!(m32x16, u32x16); impl_shuffle1_dyn_non_u!(m64x2, u64x2); impl_shuffle1_dyn_non_u!(m64x4, u64x4); impl_shuffle1_dyn_non_u!(m64x8, u64x8); impl_shuffle1_dyn_non_u!(msizex2, usizex2); impl_shuffle1_dyn_non_u!(msizex4, usizex4); impl_shuffle1_dyn_non_u!(msizex8, usizex8); impl_shuffle1_dyn_non_u!(m128x1, u128x1); impl_shuffle1_dyn_non_u!(m128x2, u128x2); impl_shuffle1_dyn_non_u!(m128x4, u128x4); impl_shuffle1_dyn_non_u!(f32x2, u32x2); impl_shuffle1_dyn_non_u!(f32x4, u32x4); impl_shuffle1_dyn_non_u!(f32x8, u32x8); impl_shuffle1_dyn_non_u!(f32x16, u32x16); impl_shuffle1_dyn_non_u!(f64x2, u64x2); impl_shuffle1_dyn_non_u!(f64x4, u64x4); impl_shuffle1_dyn_non_u!(f64x8, u64x8); // Implementation for non-unsigned vector types macro_rules! impl_shuffle1_dyn_ptr { ($id:ident, $uid:ident) => { impl Shuffle1Dyn for $id { type Indices = $uid; #[inline] fn shuffle1_dyn(self, indices: Self::Indices) -> Self { unsafe { let u: $uid = crate::mem::transmute(self); crate::mem::transmute(u.shuffle1_dyn(indices)) } } } }; } impl_shuffle1_dyn_ptr!(cptrx2, usizex2); impl_shuffle1_dyn_ptr!(cptrx4, usizex4); impl_shuffle1_dyn_ptr!(cptrx8, usizex8); impl_shuffle1_dyn_ptr!(mptrx2, usizex2); impl_shuffle1_dyn_ptr!(mptrx4, usizex4); impl_shuffle1_dyn_ptr!(mptrx8, usizex8); ================================================ FILE: src/codegen/swap_bytes.rs ================================================ //! Horizontal swap bytes reductions. // FIXME: investigate using `llvm.bswap` // https://github.com/rust-lang-nursery/packed_simd/issues/19 use crate::*; pub(crate) trait SwapBytes { fn swap_bytes(self) -> Self; } macro_rules! 
impl_swap_bytes { (v16: $($id:ident,)+) => { $( impl SwapBytes for $id { #[inline] fn swap_bytes(self) -> Self { shuffle!(self, [1, 0]) } } )+ }; (v32: $($id:ident,)+) => { $( impl SwapBytes for $id { #[inline] #[allow(clippy::useless_transmute)] fn swap_bytes(self) -> Self { unsafe { let bytes: u8x4 = crate::mem::transmute(self); let result: u8x4 = shuffle!(bytes, [3, 2, 1, 0]); crate::mem::transmute(result) } } } )+ }; (v64: $($id:ident,)+) => { $( impl SwapBytes for $id { #[inline] #[allow(clippy::useless_transmute)] fn swap_bytes(self) -> Self { unsafe { let bytes: u8x8 = crate::mem::transmute(self); let result: u8x8 = shuffle!( bytes, [7, 6, 5, 4, 3, 2, 1, 0] ); crate::mem::transmute(result) } } } )+ }; (v128: $($id:ident,)+) => { $( impl SwapBytes for $id { #[inline] #[allow(clippy::useless_transmute)] fn swap_bytes(self) -> Self { unsafe { let bytes: u8x16 = crate::mem::transmute(self); let result: u8x16 = shuffle!(bytes, [ 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 ]); crate::mem::transmute(result) } } } )+ }; (v256: $($id:ident,)+) => { $( impl SwapBytes for $id { #[inline] #[allow(clippy::useless_transmute)] fn swap_bytes(self) -> Self { unsafe { let bytes: u8x32 = crate::mem::transmute(self); let result: u8x32 = shuffle!(bytes, [ 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 ]); crate::mem::transmute(result) } } } )+ }; (v512: $($id:ident,)+) => { $( impl SwapBytes for $id { #[inline] #[allow(clippy::useless_transmute)] fn swap_bytes(self) -> Self { unsafe { let bytes: u8x64 = crate::mem::transmute(self); let result: u8x64 = shuffle!(bytes, [ 63, 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49, 48, 47, 46, 45, 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33, 32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 ]); crate::mem::transmute(result) } } } )+ }; } impl_swap_bytes!(v16: u8x2, i8x2,); impl_swap_bytes!(v32: u8x4, i8x4, u16x2, i16x2,); // FIXME: 64-bit single element vector impl_swap_bytes!(v64: u8x8, i8x8, u16x4, i16x4, u32x2, i32x2 /* u64x1, i64x1, */,); impl_swap_bytes!(v128: u8x16, i8x16, u16x8, i16x8, u32x4, i32x4, u64x2, i64x2, u128x1, i128x1,); impl_swap_bytes!(v256: u8x32, i8x32, u16x16, i16x16, u32x8, i32x8, u64x4, i64x4, u128x2, i128x2,); impl_swap_bytes!(v512: u8x64, i8x64, u16x32, i16x32, u32x16, i32x16, u64x8, i64x8, u128x4, i128x4,); cfg_if! { if #[cfg(target_pointer_width = "8")] { impl_swap_bytes!(v16: isizex2, usizex2,); impl_swap_bytes!(v32: isizex4, usizex4,); impl_swap_bytes!(v64: isizex8, usizex8,); } else if #[cfg(target_pointer_width = "16")] { impl_swap_bytes!(v32: isizex2, usizex2,); impl_swap_bytes!(v64: isizex4, usizex4,); impl_swap_bytes!(v128: isizex8, usizex8,); } else if #[cfg(target_pointer_width = "32")] { impl_swap_bytes!(v64: isizex2, usizex2,); impl_swap_bytes!(v128: isizex4, usizex4,); impl_swap_bytes!(v256: isizex8, usizex8,); } else if #[cfg(target_pointer_width = "64")] { impl_swap_bytes!(v128: isizex2, usizex2,); impl_swap_bytes!(v256: isizex4, usizex4,); impl_swap_bytes!(v512: isizex8, usizex8,); } else { compile_error!("unsupported target_pointer_width"); } } ================================================ FILE: src/codegen/v128.rs ================================================ //! 
Internal 128-bit wide vector types use crate::masks::*; #[rustfmt::skip] impl_simd_array!( [i8; 16]: i8x16 | i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8 ); #[rustfmt::skip] impl_simd_array!( [u8; 16]: u8x16 | u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8 ); #[rustfmt::skip] impl_simd_array!( [m8; 16]: m8x16 | i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8 ); impl_simd_array!([i16; 8]: i16x8 | i16, i16, i16, i16, i16, i16, i16, i16); impl_simd_array!([u16; 8]: u16x8 | u16, u16, u16, u16, u16, u16, u16, u16); impl_simd_array!([m16; 8]: m16x8 | i16, i16, i16, i16, i16, i16, i16, i16); impl_simd_array!([i32; 4]: i32x4 | i32, i32, i32, i32); impl_simd_array!([u32; 4]: u32x4 | u32, u32, u32, u32); impl_simd_array!([f32; 4]: f32x4 | f32, f32, f32, f32); impl_simd_array!([m32; 4]: m32x4 | i32, i32, i32, i32); impl_simd_array!([i64; 2]: i64x2 | i64, i64); impl_simd_array!([u64; 2]: u64x2 | u64, u64); impl_simd_array!([f64; 2]: f64x2 | f64, f64); impl_simd_array!([m64; 2]: m64x2 | i64, i64); impl_simd_array!([i128; 1]: i128x1 | i128); impl_simd_array!([u128; 1]: u128x1 | u128); impl_simd_array!([m128; 1]: m128x1 | i128); ================================================ FILE: src/codegen/v16.rs ================================================ //! Internal 16-bit wide vector types use crate::masks::*; impl_simd_array!([i8; 2]: i8x2 | i8, i8); impl_simd_array!([u8; 2]: u8x2 | u8, u8); impl_simd_array!([m8; 2]: m8x2 | i8, i8); ================================================ FILE: src/codegen/v256.rs ================================================ //! Internal 256-bit wide vector types use crate::masks::*; #[rustfmt::skip] impl_simd_array!( [i8; 32]: i8x32 | i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8 ); #[rustfmt::skip] impl_simd_array!( [u8; 32]: u8x32 | u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8 ); #[rustfmt::skip] impl_simd_array!( [m8; 32]: m8x32 | i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8 ); #[rustfmt::skip] impl_simd_array!( [i16; 16]: i16x16 | i16, i16, i16, i16, i16, i16, i16, i16, i16, i16, i16, i16, i16, i16, i16, i16 ); #[rustfmt::skip] impl_simd_array!( [u16; 16]: u16x16 | u16, u16, u16, u16, u16, u16, u16, u16, u16, u16, u16, u16, u16, u16, u16, u16 ); #[rustfmt::skip] impl_simd_array!( [m16; 16]: m16x16 | i16, i16, i16, i16, i16, i16, i16, i16, i16, i16, i16, i16, i16, i16, i16, i16 ); impl_simd_array!([i32; 8]: i32x8 | i32, i32, i32, i32, i32, i32, i32, i32); impl_simd_array!([u32; 8]: u32x8 | u32, u32, u32, u32, u32, u32, u32, u32); impl_simd_array!([f32; 8]: f32x8 | f32, f32, f32, f32, f32, f32, f32, f32); impl_simd_array!([m32; 8]: m32x8 | i32, i32, i32, i32, i32, i32, i32, i32); impl_simd_array!([i64; 4]: i64x4 | i64, i64, i64, i64); impl_simd_array!([u64; 4]: u64x4 | u64, u64, u64, u64); impl_simd_array!([f64; 4]: f64x4 | f64, f64, f64, f64); impl_simd_array!([m64; 4]: m64x4 | i64, i64, i64, i64); impl_simd_array!([i128; 2]: i128x2 | i128, i128); impl_simd_array!([u128; 2]: u128x2 | u128, u128); impl_simd_array!([m128; 2]: m128x2 | i128, i128); ================================================ FILE: src/codegen/v32.rs ================================================ //! 
Internal 32-bit wide vector types use crate::masks::*; impl_simd_array!([i8; 4]: i8x4 | i8, i8, i8, i8); impl_simd_array!([u8; 4]: u8x4 | u8, u8, u8, u8); impl_simd_array!([m8; 4]: m8x4 | i8, i8, i8, i8); impl_simd_array!([i16; 2]: i16x2 | i16, i16); impl_simd_array!([u16; 2]: u16x2 | u16, u16); impl_simd_array!([m16; 2]: m16x2 | i16, i16); ================================================ FILE: src/codegen/v512.rs ================================================ //! Internal 512-bit wide vector types use crate::masks::*; #[rustfmt::skip] impl_simd_array!( [i8; 64]: i8x64 | i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8 ); #[rustfmt::skip] impl_simd_array!( [u8; 64]: u8x64 | u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8, u8 ); #[rustfmt::skip] impl_simd_array!( [m8; 64]: m8x64 | i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8 ); #[rustfmt::skip] impl_simd_array!( [i16; 32]: i16x32 | i16, i16, i16, i16, i16, i16, i16, i16, i16, i16, i16, i16, i16, i16, i16, i16, i16, i16, i16, i16, i16, i16, i16, i16, i16, i16, i16, i16, i16, i16, i16, i16 ); #[rustfmt::skip] impl_simd_array!( [u16; 32]: u16x32 | u16, u16, u16, u16, u16, u16, u16, u16, u16, u16, u16, u16, u16, u16, u16, u16, u16, u16, u16, u16, u16, u16, u16, u16, u16, u16, u16, u16, u16, u16, u16, u16 ); #[rustfmt::skip] impl_simd_array!( [m16; 32]: m16x32 | i16, i16, i16, i16, i16, i16, i16, i16, i16, i16, i16, i16, i16, i16, i16, i16, i16, i16, i16, i16, i16, i16, i16, i16, i16, i16, i16, i16, i16, i16, i16, i16 ); #[rustfmt::skip] impl_simd_array!( [i32; 16]: i32x16 | i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 ); #[rustfmt::skip] impl_simd_array!( [u32; 16]: u32x16 | u32, u32, u32, u32, u32, u32, u32, u32, u32, u32, u32, u32, u32, u32, u32, u32 ); #[rustfmt::skip] impl_simd_array!( [f32; 16]: f32x16 | f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32 ); #[rustfmt::skip] impl_simd_array!( [m32; 16]: m32x16 | i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 ); impl_simd_array!([i64; 8]: i64x8 | i64, i64, i64, i64, i64, i64, i64, i64); impl_simd_array!([u64; 8]: u64x8 | u64, u64, u64, u64, u64, u64, u64, u64); impl_simd_array!([f64; 8]: f64x8 | f64, f64, f64, f64, f64, f64, f64, f64); impl_simd_array!([m64; 8]: m64x8 | i64, i64, i64, i64, i64, i64, i64, i64); impl_simd_array!([i128; 4]: i128x4 | i128, i128, i128, i128); impl_simd_array!([u128; 4]: u128x4 | u128, u128, u128, u128); impl_simd_array!([m128; 4]: m128x4 | i128, i128, i128, i128); ================================================ FILE: src/codegen/v64.rs ================================================ //! 
Internal 64-bit wide vector types use crate::masks::*; impl_simd_array!([i8; 8]: i8x8 | i8, i8, i8, i8, i8, i8, i8, i8); impl_simd_array!([u8; 8]: u8x8 | u8, u8, u8, u8, u8, u8, u8, u8); impl_simd_array!([m8; 8]: m8x8 | i8, i8, i8, i8, i8, i8, i8, i8); impl_simd_array!([i16; 4]: i16x4 | i16, i16, i16, i16); impl_simd_array!([u16; 4]: u16x4 | u16, u16, u16, u16); impl_simd_array!([m16; 4]: m16x4 | i16, i16, i16, i16); impl_simd_array!([i32; 2]: i32x2 | i32, i32); impl_simd_array!([u32; 2]: u32x2 | u32, u32); impl_simd_array!([f32; 2]: f32x2 | f32, f32); impl_simd_array!([m32; 2]: m32x2 | i32, i32); impl_simd_array!([i64; 1]: i64x1 | i64); impl_simd_array!([u64; 1]: u64x1 | u64); impl_simd_array!([f64; 1]: f64x1 | f64); impl_simd_array!([m64; 1]: m64x1 | i64); ================================================ FILE: src/codegen/vPtr.rs ================================================ //! Pointer vector types macro_rules! impl_simd_ptr { ([$ptr_ty:ty; $elem_count:expr]: $tuple_id:ident | $ty:ident | $($tys:ty),*) => { #[derive(Copy, Clone)] #[repr(simd)] pub struct $tuple_id<$ty>($(pub(crate) $tys),*); //^^^^^^^ leaked through SimdArray impl<$ty> crate::sealed::Seal for [$ptr_ty; $elem_count] {} impl<$ty> crate::sealed::SimdArray for [$ptr_ty; $elem_count] { type Tuple = $tuple_id<$ptr_ty>; type T = $ptr_ty; const N: usize = $elem_count; type NT = [u32; $elem_count]; } impl<$ty> crate::sealed::Seal for $tuple_id<$ptr_ty> {} impl<$ty> crate::sealed::Simd for $tuple_id<$ptr_ty> { type Element = $ptr_ty; const LANES: usize = $elem_count; type LanesType = [u32; $elem_count]; } } } impl_simd_ptr!([*const T; 2]: cptrx2 | T | T, T); impl_simd_ptr!([*const T; 4]: cptrx4 | T | T, T, T, T); impl_simd_ptr!([*const T; 8]: cptrx8 | T | T, T, T, T, T, T, T, T); impl_simd_ptr!([*mut T; 2]: mptrx2 | T | T, T); impl_simd_ptr!([*mut T; 4]: mptrx4 | T | T, T, T, T); impl_simd_ptr!([*mut T; 8]: mptrx8 | T | T, T, T, T, T, T, T, T); ================================================ FILE: src/codegen/vSize.rs ================================================ //! Vector types with pointer-sized elements use crate::codegen::pointer_sized_int::{isize_, usize_}; use crate::masks::*; impl_simd_array!([isize; 2]: isizex2 | isize_, isize_); impl_simd_array!([usize; 2]: usizex2 | usize_, usize_); impl_simd_array!([msize; 2]: msizex2 | isize_, isize_); impl_simd_array!([isize; 4]: isizex4 | isize_, isize_, isize_, isize_); impl_simd_array!([usize; 4]: usizex4 | usize_, usize_, usize_, usize_); impl_simd_array!([msize; 4]: msizex4 | isize_, isize_, isize_, isize_); impl_simd_array!([isize; 8]: isizex8 | isize_, isize_, isize_, isize_, isize_, isize_, isize_, isize_); impl_simd_array!([usize; 8]: usizex8 | usize_, usize_, usize_, usize_, usize_, usize_, usize_, usize_); impl_simd_array!([msize; 8]: msizex8 | isize_, isize_, isize_, isize_, isize_, isize_, isize_, isize_); ================================================ FILE: src/codegen.rs ================================================ //! Code-generation utilities pub(crate) mod bit_manip; pub(crate) mod llvm; pub(crate) mod math; pub(crate) mod reductions; pub(crate) mod shuffle; pub(crate) mod shuffle1_dyn; pub(crate) mod swap_bytes; macro_rules! 
impl_simd_array { ([$elem_ty:ident; $elem_count:expr]: $tuple_id:ident | $($elem_tys:ident),*) => { #[derive(Copy, Clone)] #[repr(simd)] pub struct $tuple_id($(pub(crate) $elem_tys),*); //^^^^^^^ leaked through SimdArray impl crate::sealed::Seal for [$elem_ty; $elem_count] {} impl crate::sealed::SimdArray for [$elem_ty; $elem_count] { type Tuple = $tuple_id; type T = $elem_ty; const N: usize = $elem_count; type NT = [u32; $elem_count]; } impl crate::sealed::Seal for $tuple_id {} impl crate::sealed::Simd for $tuple_id { type Element = $elem_ty; const LANES: usize = $elem_count; type LanesType = [u32; $elem_count]; } } } pub(crate) mod pointer_sized_int; pub(crate) mod v16; pub(crate) use self::v16::*; pub(crate) mod v32; pub(crate) use self::v32::*; pub(crate) mod v64; pub(crate) use self::v64::*; pub(crate) mod v128; pub(crate) use self::v128::*; pub(crate) mod v256; pub(crate) use self::v256::*; pub(crate) mod v512; pub(crate) use self::v512::*; pub(crate) mod vSize; pub(crate) use self::vSize::*; pub(crate) mod vPtr; pub(crate) use self::vPtr::*; ================================================ FILE: src/lib.rs ================================================ //! # Portable packed SIMD vectors //! //! This crate is proposed for stabilization as `std::packed_simd` in [RFC2366: //! `std::simd`](https://github.com/rust-lang/rfcs/pull/2366) . //! //! The examples available in the //! [`examples/`](https://github.com/rust-lang-nursery/packed_simd/tree/master/examples) //! sub-directory of the crate showcase how to use the library in practice. //! //! ## Table of contents //! //! - [Introduction](#introduction) //! - [Vector types](#vector-types) //! - [Conditional operations](#conditional-operations) //! - [Conversions](#conversions) //! - [Hardware Features](#hardware-features) //! - [Performance guide](https://rust-lang-nursery.github.io/packed_simd/perf-guide/) //! //! ## Introduction //! //! This crate exports [`Simd<[T; N]>`][`Simd`]: a packed vector of `N` //! elements of type `T` as well as many type aliases for this type: for //! example, [`f32x4`], which is just an alias for `Simd<[f32; 4]>`. //! //! The operations on packed vectors are, by default, "vertical", that is, they //! are applied to each vector lane in isolation of the others: //! //! ``` //! # use packed_simd::*; //! let a = i32x4::new(1, 2, 3, 4); //! let b = i32x4::new(5, 6, 7, 8); //! assert_eq!(a + b, i32x4::new(6, 8, 10, 12)); //! ``` //! //! Many "horizontal" operations are also provided: //! //! ``` //! # use packed_simd::*; //! # let a = i32x4::new(1, 2, 3, 4); //! assert_eq!(a.wrapping_sum(), 10); //! ``` //! //! In virtually all architectures vertical operations are fast, while //! horizontal operations are, by comparison, much slower. That is, the //! most portably-efficient way of performing a reduction over a slice //! is to collect the results into a vector using vertical operations, //! and performing a single horizontal operation at the end: //! //! ``` //! # use packed_simd::*; //! fn reduce(x: &[i32]) -> i32 { //! assert_eq!(x.len() % 4, 0); //! let mut sum = i32x4::splat(0); // [0, 0, 0, 0] //! for i in (0..x.len()).step_by(4) { //! sum += i32x4::from_slice_unaligned(&x[i..]); //! } //! sum.wrapping_sum() //! } //! //! let x = [0, 1, 2, 3, 4, 5, 6, 7]; //! assert_eq!(reduce(&x), 28); //! ``` //! //! ## Vector types //! //! The vector type aliases are named according to the following scheme: //! //! > `{element_type}x{number_of_lanes} == Simd<[element_type; //! number_of_lanes]>` //! //! 
where the following element types are supported: //! //! * `i{element_width}`: signed integer //! * `u{element_width}`: unsigned integer //! * `f{element_width}`: float //! * `m{element_width}`: mask (see below) //! * `*{const,mut} T`: `const` and `mut` pointers //! //! ## Basic operations //! //! ``` //! # use packed_simd::*; //! // Sets all elements to `0`: //! let a = i32x4::splat(0); //! //! // Reads a vector from a slice: //! let mut arr = [0, 0, 0, 1, 2, 3, 4, 5]; //! let b = i32x4::from_slice_unaligned(&arr); //! //! // Reads the 4-th element of a vector: //! assert_eq!(b.extract(3), 1); //! //! // Returns a new vector where the 4-th element is replaced with `1`: //! let a = a.replace(3, 1); //! assert_eq!(a, b); //! //! // Writes a vector to a slice: //! let a = a.replace(2, 1); //! a.write_to_slice_unaligned(&mut arr[4..]); //! assert_eq!(arr, [0, 0, 0, 1, 0, 0, 1, 1]); //! ``` //! //! ## Conditional operations //! //! One often needs to perform an operation on some lanes of the vector. Vector //! masks, like `m32x4`, allow selecting on which vector lanes an operation is //! to be performed: //! //! ``` //! # use packed_simd::*; //! let a = i32x4::new(1, 1, 2, 2); //! //! // Add `1` to the first two lanes of the vector. //! let m = m16x4::new(true, true, false, false); //! let a = m.select(a + 1, a); //! assert_eq!(a, i32x4::splat(2)); //! ``` //! //! The elements of a vector mask are either `true` or `false`. Here `true` //! means that a lane is "selected", while `false` means that a lane is not //! selected. //! //! All vector masks implement a `mask.select(a: T, b: T) -> T` method that //! works on all vectors that have the same number of lanes as the mask. The //! resulting vector contains the elements of `a` for those lanes for which the //! mask is `true`, and the elements of `b` otherwise. //! //! The example constructs a mask with the first two lanes set to `true` and //! the last two lanes set to `false`. This selects the first two lanes of `a + //! 1` and the last two lanes of `a`, producing a vector where the first two //! lanes have been incremented by `1`. //! //! > note: mask `select` can be used on vector types that have the same number //! > of lanes as the mask. The example shows this by using [`m16x4`] instead //! > of [`m32x4`]. It is _typically_ more performant to use a mask element //! > width equal to the element width of the vectors being operated upon. //! > This is, however, not true for 512-bit wide vectors when targeting //! > AVX-512, where the most efficient masks use only 1-bit per element. //! //! All vertical comparison operations return masks: //! //! ``` //! # use packed_simd::*; //! let a = i32x4::new(1, 1, 3, 3); //! let b = i32x4::new(2, 2, 0, 0); //! //! // ge: >= (Greater or Equal; see also lt, le, gt, eq, ne). //! let m = a.ge(i32x4::splat(2)); //! //! if m.any() { //! // all / any / none allow coherent control flow //! let d = m.select(a, b); //! assert_eq!(d, i32x4::new(2, 2, 3, 3)); //! } //! ``` //! //! ## Conversions //! //! * **lossless widening conversions**: [`From`]/[`Into`] are implemented for //! vectors with the same number of lanes when the conversion is value //! preserving (same as in `std`). //! //! * **safe bitwise conversions**: The cargo feature `into_bits` provides the //! `IntoBits/FromBits` traits (`x.into_bits()`). These perform safe bitwise //! `transmute`s when all bit patterns of the source type are valid bit //! patterns of the target type and are also implemented for the
architecture-specific vector types of `std::arch`. For example, `let x: //! u8x8 = m8x8::splat(true).into_bits();` is provided because all `m8x8` bit //! patterns are valid `u8x8` bit patterns. However, the opposite is not //! true, not all `u8x8` bit patterns are valid `m8x8` bit-patterns, so this //! operation cannot be performed safely using `x.into_bits()`; one needs to //! use `unsafe { crate::mem::transmute(x) }` for that, making sure that the //! value in the `u8x8` is a valid bit-pattern of `m8x8`. //! //! * **numeric casts** (`as`): are performed using [`FromCast`]/[`Cast`] //! (`x.cast()`), just like `as`: //! //! * casting integer vectors whose lane types have the same size (e.g. //! `i32xN` -> `u32xN`) is a **no-op**, //! //! * casting from a larger integer to a smaller integer (e.g. `u32xN` -> //! `u8xN`) will **truncate**, //! //! * casting from a smaller integer to a larger integer (e.g. `u8xN` -> //! `u32xN`) will: //! * **zero-extend** if the source is unsigned, or //! * **sign-extend** if the source is signed, //! //! * casting from a float to an integer will **round the float towards //! zero**, //! //! * casting from an integer to float will produce the floating point //! representation of the integer, **rounding to nearest, ties to even**, //! //! * casting from an `f32` to an `f64` is perfect and lossless, //! //! * casting from an `f64` to an `f32` **rounds to nearest, ties to even**. //! //! Numeric casts are not very "precise": sometimes lossy, sometimes value //! preserving, etc. //! //! ## Hardware Features //! //! This crate can use different hardware features based on your configured //! `RUSTFLAGS`. For example, with no configured `RUSTFLAGS`, `u64x8` on //! x86_64 will use SSE2 operations like `PCMPEQD`. If you configure //! `RUSTFLAGS='-C target-feature=+avx2,+avx'` on supported x86_64 hardware //! the same `u64x8` may use wider AVX2 operations like `VPCMPEQQ`. It is //! important for performance and for hardware support requirements that //! you choose an appropriate set of `target-feature` and `target-cpu` //! options during builds. For more information, see the [Performance //! guide](https://rust-lang-nursery.github.io/packed_simd/perf-guide/) #![feature( adt_const_params, repr_simd, rustc_attrs, platform_intrinsics, stdsimd, arm_target_feature, link_llvm_intrinsics, core_intrinsics, stmt_expr_attributes, custom_inner_attributes, )] #![allow(non_camel_case_types, non_snake_case, // FIXME: these types are unsound in C FFI already // See https://github.com/rust-lang/rust/issues/53346 improper_ctypes_definitions, incomplete_features, clippy::cast_possible_truncation, clippy::cast_lossless, clippy::cast_possible_wrap, clippy::cast_precision_loss, // TODO: manually add the `#[must_use]` attribute where appropriate clippy::must_use_candidate, // This lint is currently broken for generic code // See https://github.com/rust-lang/rust-clippy/issues/3410 clippy::use_self, clippy::wrong_self_convention, clippy::from_over_into, )] #![cfg_attr(test, feature(hashmap_internals))] #![cfg_attr(doc_cfg, feature(doc_cfg))] #![deny(rust_2018_idioms, clippy::missing_inline_in_public_items)] #![no_std] use cfg_if::cfg_if; cfg_if! 
{ if #[cfg(feature = "core_arch")] { #[allow(unused_imports)] use core_arch as arch; } else { #[allow(unused_imports)] use core::arch; } } #[cfg(all(target_arch = "wasm32", test))] use wasm_bindgen_test::*; #[allow(unused_imports)] use core::{ /* arch (handled above), */ cmp, f32, f64, fmt, hash, hint, i128, i16, i32, i64, i8, intrinsics, isize, iter, marker, mem, ops, ptr, slice, u128, u16, u32, u64, u8, usize, }; #[macro_use] mod testing; #[macro_use] mod api; mod codegen; mod sealed; pub use crate::sealed::{Mask, Shuffle, Simd as SimdVector, SimdArray}; /// Packed SIMD vector type. /// /// # Examples /// /// ``` /// # use packed_simd::Simd; /// let v = Simd::<[i32; 4]>::new(0, 1, 2, 3); /// assert_eq!(v.extract(2), 2); /// ``` #[repr(transparent)] #[derive(Copy, Clone)] pub struct Simd<A: sealed::SimdArray>( // FIXME: this type should be private, // but it currently must be public for the // `shuffle!` macro to work: it needs to // access the internal `repr(simd)` type // to call the shuffle intrinsics. #[doc(hidden)] pub A::Tuple, ); impl<A: sealed::SimdArray> sealed::Seal for Simd<A> {} /// Wrapper over `T` implementing a lexicographical order via the `PartialOrd` /// and/or `Ord` traits. #[repr(transparent)] #[derive(Copy, Clone, Debug)] #[allow(clippy::missing_inline_in_public_items)] pub struct LexicographicallyOrdered<T>(T); mod masks; pub use self::masks::*; mod v16; pub use self::v16::*; mod v32; pub use self::v32::*; mod v64; pub use self::v64::*; mod v128; pub use self::v128::*; mod v256; pub use self::v256::*; mod v512; pub use self::v512::*; mod vSize; pub use self::vSize::*; mod vPtr; pub use self::vPtr::*; pub use self::api::cast::*; #[cfg(feature = "into_bits")] pub use self::api::into_bits::*; // Re-export the shuffle intrinsics required by the `shuffle!` macro. #[doc(hidden)] pub use self::codegen::llvm::{ __shuffle_vector16, __shuffle_vector2, __shuffle_vector32, __shuffle_vector4, __shuffle_vector64, __shuffle_vector8, }; pub(crate) mod llvm { pub(crate) use crate::codegen::llvm::*; } ================================================ FILE: src/masks.rs ================================================ //! Mask types macro_rules!
impl_mask_ty { ($id:ident : $elem_ty:ident | #[$doc:meta]) => { #[$doc] #[derive(Copy, Clone)] pub struct $id($elem_ty); impl crate::sealed::Seal for $id {} impl crate::sealed::Mask for $id { #[inline] fn test(&self) -> bool { $id::test(self) } } impl $id { /// Instantiate a mask with `value` #[inline] pub fn new(x: bool) -> Self { if x { $id(!0) } else { $id(0) } } /// Test if the mask is set #[inline] pub fn test(&self) -> bool { self.0 != 0 } } impl Default for $id { #[inline] fn default() -> Self { $id(0) } } #[allow(clippy::partialeq_ne_impl)] impl PartialEq<$id> for $id { #[inline] fn eq(&self, other: &Self) -> bool { self.0 == other.0 } #[inline] fn ne(&self, other: &Self) -> bool { self.0 != other.0 } } impl Eq for $id {} impl PartialOrd<$id> for $id { #[inline] fn partial_cmp(&self, other: &Self) -> Option<crate::cmp::Ordering> { use crate::cmp::Ordering; if self == other { Some(Ordering::Equal) } else if self.0 > other.0 { // Note: // * false = 0_i // * true == !0_i == -1_i Some(Ordering::Less) } else { Some(Ordering::Greater) } } #[inline] fn lt(&self, other: &Self) -> bool { self.0 > other.0 } #[inline] fn gt(&self, other: &Self) -> bool { self.0 < other.0 } #[inline] fn le(&self, other: &Self) -> bool { self.0 >= other.0 } #[inline] fn ge(&self, other: &Self) -> bool { self.0 <= other.0 } } impl Ord for $id { #[inline] fn cmp(&self, other: &Self) -> crate::cmp::Ordering { match self.partial_cmp(other) { Some(x) => x, None => unsafe { crate::hint::unreachable_unchecked() }, } } } impl crate::hash::Hash for $id { #[inline] fn hash<H: crate::hash::Hasher>(&self, state: &mut H) { (self.0 != 0).hash(state); } } impl crate::fmt::Debug for $id { #[inline] fn fmt(&self, fmtter: &mut crate::fmt::Formatter<'_>) -> Result<(), crate::fmt::Error> { write!(fmtter, "{}({})", stringify!($id), self.0 != 0) } } }; } impl_mask_ty!(m8: i8 | /// 8-bit wide mask. ); impl_mask_ty!(m16: i16 | /// 16-bit wide mask. ); impl_mask_ty!(m32: i32 | /// 32-bit wide mask. ); impl_mask_ty!(m64: i64 | /// 64-bit wide mask. ); impl_mask_ty!(m128: i128 | /// 128-bit wide mask. ); impl_mask_ty!(msize: isize | /// isize-wide mask. ); ================================================ FILE: src/sealed.rs ================================================ //! Sealed traits /// A sealed trait; this is logically private to the crate /// and will prevent implementations from outside the crate. pub trait Seal {} /// Trait implemented by arrays that can be SIMD types. pub trait SimdArray: Seal { /// The type of the #[repr(simd)] type. type Tuple: Copy + Clone; /// The element type of the vector. type T; /// The number of elements in the array. const N: usize; /// The type: `[u32; Self::N]`. type NT; } /// This trait is used to constrain the arguments /// and result type of the portable shuffles. #[doc(hidden)] pub trait Shuffle<Lanes>: Seal { // Lanes is a `[u32; N]` where `N` is the number of vector lanes /// The result type of the shuffle. type Output; } /// This trait is implemented by all SIMD vector types. pub trait Simd: Seal { /// Element type of the SIMD vector. type Element; /// The number of elements in the SIMD vector. const LANES: usize; /// The type: `[u32; Self::N]`. type LanesType; } /// This trait is implemented by all mask types. pub trait Mask: Seal { fn test(&self) -> bool; } ================================================ FILE: src/testing/macros.rs ================================================ //! Testing macros macro_rules!
test_if { ($cfg_tt:tt: $it:item) => { #[cfg(any( // Test everything if: // // * tests are enabled, // * no features about exclusively testing // specific vector classes are enabled all(test, not(any( test_v16, test_v32, test_v64, test_v128, test_v256, test_v512, test_none, // disables all tests ))), // Test if: // // * tests are enabled // * a particular cfg token tree returns true all(test, $cfg_tt), ))] $it }; } #[cfg(test)] #[allow(unused)] macro_rules! ref_ { ($anything:tt) => { &$anything }; } #[cfg(test)] #[allow(unused)] macro_rules! ref_mut_ { ($anything:tt) => { &mut $anything }; } ================================================ FILE: src/testing/utils.rs ================================================ //! Testing utilities #![allow(dead_code)] // FIXME: Or don't. But it's true this is a problematic comparison. #![allow(clippy::neg_cmp_op_on_partial_ord)] use crate::{cmp::PartialOrd, fmt::Debug, LexicographicallyOrdered}; /// Tests PartialOrd for `a` and `b` where `a < b` is true. pub fn test_lt(a: LexicographicallyOrdered, b: LexicographicallyOrdered) where LexicographicallyOrdered: Debug + PartialOrd, { assert!(a < b, "{:?}, {:?}", a, b); assert!(b > a, "{:?}, {:?}", a, b); assert!(!(a == b), "{:?}, {:?}", a, b); assert_ne!(a, b, "{:?}, {:?}", a, b); assert!(a <= b, "{:?}, {:?}", a, b); assert!(b >= a, "{:?}, {:?}", a, b); // The elegance of the mathematical expression of irreflexivity is more // than clippy can handle. #[allow(clippy::eq_op)] { // Irreflexivity assert!(!(a < a), "{:?}, {:?}", a, b); assert!(!(b < b), "{:?}, {:?}", a, b); assert!(!(a > a), "{:?}, {:?}", a, b); assert!(!(b > b), "{:?}, {:?}", a, b); assert!(a <= a, "{:?}, {:?}", a, b); assert!(b <= b, "{:?}, {:?}", a, b); } } /// Tests PartialOrd for `a` and `b` where `a <= b` is true. 
pub fn test_le(a: LexicographicallyOrdered, b: LexicographicallyOrdered) where LexicographicallyOrdered: Debug + PartialOrd, { assert!(a <= b, "{:?}, {:?}", a, b); assert!(b >= a, "{:?}, {:?}", a, b); assert!(a <= b, "{:?}, {:?}", a, b); assert!(b >= a, "{:?}, {:?}", a, b); if a == b { assert!(!(a < b), "{:?}, {:?}", a, b); assert!(!(b > a), "{:?}, {:?}", a, b); assert!(!(a != b), "{:?}, {:?}", a, b); } else { assert_ne!(a, b, "{:?}, {:?}", a, b); test_lt(a, b); } } /// Test PartialOrd::partial_cmp for `a` and `b` returning `Ordering` pub fn test_cmp( a: LexicographicallyOrdered, b: LexicographicallyOrdered, o: Option, ) where LexicographicallyOrdered: PartialOrd + Debug, T: Debug + crate::sealed::Simd + Copy + Clone, ::Element: Default + Copy + Clone + PartialOrd, { assert!(T::LANES <= 64, "array length in these two arrays needs updating"); let mut arr_a: [T::Element; 64] = [Default::default(); 64]; let mut arr_b: [T::Element; 64] = [Default::default(); 64]; unsafe { crate::ptr::write_unaligned(arr_a.as_mut_ptr() as *mut LexicographicallyOrdered, a) } unsafe { crate::ptr::write_unaligned(arr_b.as_mut_ptr() as *mut LexicographicallyOrdered, b) } let expected = arr_a[0..T::LANES].partial_cmp(&arr_b[0..T::LANES]); let result = a.partial_cmp(&b); assert_eq!(expected, result, "{:?}, {:?}", a, b); assert_eq!(o, result, "{:?}, {:?}", a, b); match o { Some(crate::cmp::Ordering::Less) => { test_lt(a, b); test_le(a, b); } Some(crate::cmp::Ordering::Greater) => { test_lt(b, a); test_le(b, a); } Some(crate::cmp::Ordering::Equal) => { assert!(a == b, "{:?}, {:?}", a, b); assert!(!(a != b), "{:?}, {:?}", a, b); assert!(!(a < b), "{:?}, {:?}", a, b); assert!(!(b < a), "{:?}, {:?}", a, b); assert!(!(a > b), "{:?}, {:?}", a, b); assert!(!(b > a), "{:?}, {:?}", a, b); test_le(a, b); test_le(b, a); } None => { assert!(!(a == b), "{:?}, {:?}", a, b); assert!(!(a != b), "{:?}, {:?}", a, b); assert!(!(a < b), "{:?}, {:?}", a, b); assert!(!(a > b), "{:?}, {:?}", a, b); assert!(!(b < a), "{:?}, {:?}", a, b); assert!(!(b > a), "{:?}, {:?}", a, b); assert!(!(a <= b), "{:?}, {:?}", a, b); assert!(!(b <= a), "{:?}, {:?}", a, b); assert!(!(a >= b), "{:?}, {:?}", a, b); assert!(!(b >= a), "{:?}, {:?}", a, b); } } } // Returns a tuple containing two distinct pointer values of the same type as // the element type of the Simd vector `$id`. #[allow(unused)] macro_rules! ptr_vals { ($id:ty) => { // expands to an expression #[allow(unused_unsafe)] unsafe { // all bits cleared let clear: <$id as sealed::Simd>::Element = crate::mem::zeroed(); // all bits set let set: <$id as sealed::Simd>::Element = crate::mem::transmute(-1_isize); (clear, set) } }; } ================================================ FILE: src/testing.rs ================================================ //! Testing macros and other utilities. #[macro_use] mod macros; #[cfg(test)] #[macro_use] pub(crate) mod utils; ================================================ FILE: src/v128.rs ================================================ //! 128-bit wide vector types #[rustfmt::skip] use crate::*; impl_i!([i8; 16]: i8x16, m8x16 | i8, u16 | test_v128 | x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15 | From: | /// A 128-bit vector with 16 `i8` lanes. ); impl_u!([u8; 16]: u8x16, m8x16 | u8, u16 | test_v128 | x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15 | From: | /// A 128-bit vector with 16 `u8` lanes. 
); impl_m!([m8; 16]: m8x16 | i8, u16 | test_v128 | x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15 | From: m16x16 | /// A 128-bit vector mask with 16 `m8` lanes. ); impl_i!([i16; 8]: i16x8, m16x8 | i16, u8 | test_v128 | x0, x1, x2, x3, x4, x5, x6, x7 | From: i8x8, u8x8 | /// A 128-bit vector with 8 `i16` lanes. ); impl_u!([u16; 8]: u16x8, m16x8 | u16, u8 | test_v128 | x0, x1, x2, x3, x4, x5, x6, x7 | From: u8x8 | /// A 128-bit vector with 8 `u16` lanes. ); impl_m!([m16; 8]: m16x8 | i16, u8 | test_v128 | x0, x1, x2, x3, x4, x5, x6, x7 | From: m8x8, m32x8 | /// A 128-bit vector mask with 8 `m16` lanes. ); impl_i!([i32; 4]: i32x4, m32x4 | i32, u8 | test_v128 | x0, x1, x2, x3 | From: i8x4, u8x4, i16x4, u16x4 | /// A 128-bit vector with 4 `i32` lanes. ); impl_u!([u32; 4]: u32x4, m32x4 | u32, u8 | test_v128 | x0, x1, x2, x3 | From: u8x4, u16x4 | /// A 128-bit vector with 4 `u32` lanes. ); impl_f!([f32; 4]: f32x4, m32x4 | f32 | test_v128 | x0, x1, x2, x3 | From: i8x4, u8x4, i16x4, u16x4 | /// A 128-bit vector with 4 `f32` lanes. ); impl_m!([m32; 4]: m32x4 | i32, u8 | test_v128 | x0, x1, x2, x3 | From: m8x4, m16x4, m64x4 | /// A 128-bit vector mask with 4 `m32` lanes. ); impl_i!([i64; 2]: i64x2, m64x2 | i64, u8 | test_v128 | x0, x1 | From: i8x2, u8x2, i16x2, u16x2, i32x2, u32x2 | /// A 128-bit vector with 2 `i64` lanes. ); impl_u!([u64; 2]: u64x2, m64x2 | u64, u8 | test_v128 | x0, x1 | From: u8x2, u16x2, u32x2 | /// A 128-bit vector with 2 `u64` lanes. ); impl_f!([f64; 2]: f64x2, m64x2 | f64 | test_v128 | x0, x1 | From: i8x2, u8x2, i16x2, u16x2, i32x2, u32x2, f32x2 | /// A 128-bit vector with 2 `f64` lanes. ); impl_m!([m64; 2]: m64x2 | i64, u8 | test_v128 | x0, x1 | From: m8x2, m16x2, m32x2, m128x2 | /// A 128-bit vector mask with 2 `m64` lanes. ); impl_i!([i128; 1]: i128x1, m128x1 | i128, u8 | test_v128 | x0 | From: /*i8x1, u8x1, i16x1, u16x1, i32x1, u32x1, i64x1, u64x1 */ | // FIXME: unary small vector types /// A 128-bit vector with 1 `i128` lane. ); impl_u!([u128; 1]: u128x1, m128x1 | u128, u8 | test_v128 | x0 | From: /*u8x1, u16x1, u32x1, u64x1 */ | // FIXME: unary small vector types /// A 128-bit vector with 1 `u128` lane. ); impl_m!([m128; 1]: m128x1 | i128, u8 | test_v128 | x0 | From: /*m8x1, m16x1, m32x1, m64x1 */ | // FIXME: unary small vector types /// A 128-bit vector mask with 1 `m128` lane. ); ================================================ FILE: src/v16.rs ================================================ //! 16-bit wide vector types use crate::*; impl_i!([i8; 2]: i8x2, m8x2 | i8, u8 | test_v16 | x0, x1 | From: | /// A 16-bit vector with 2 `i8` lanes. ); impl_u!([u8; 2]: u8x2, m8x2 | u8, u8 | test_v16 | x0, x1 | From: | /// A 16-bit vector with 2 `u8` lanes. ); impl_m!([m8; 2]: m8x2 | i8, u8 | test_v16 | x0, x1 | From: m16x2, m32x2, m64x2, m128x2 | /// A 16-bit vector mask with 2 `m8` lanes. ); ================================================ FILE: src/v256.rs ================================================ //! 256-bit wide vector types #[rustfmt::skip] use crate::*; impl_i!([i8; 32]: i8x32, m8x32 | i8, u32 | test_v256 | x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15, x16, x17, x18, x19, x20, x21, x22, x23, x24, x25, x26, x27, x28, x29, x30, x31 | From: | /// A 256-bit vector with 32 `i8` lanes. 
); impl_u!([u8; 32]: u8x32, m8x32 | u8, u32 | test_v256 | x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15, x16, x17, x18, x19, x20, x21, x22, x23, x24, x25, x26, x27, x28, x29, x30, x31 | From: | /// A 256-bit vector with 32 `u8` lanes. ); impl_m!([m8; 32]: m8x32 | i8, u32 | test_v256 | x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15, x16, x17, x18, x19, x20, x21, x22, x23, x24, x25, x26, x27, x28, x29, x30, x31 | From: | /// A 256-bit vector mask with 32 `m8` lanes. ); impl_i!([i16; 16]: i16x16, m16x16 | i16, u16 | test_v256 | x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15 | From: i8x16, u8x16 | /// A 256-bit vector with 16 `i16` lanes. ); impl_u!([u16; 16]: u16x16, m16x16 | u16, u16 | test_v256 | x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15 | From: u8x16 | /// A 256-bit vector with 16 `u16` lanes. ); impl_m!([m16; 16]: m16x16 | i16, u16 | test_v256 | x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15 | From: m8x16 | /// A 256-bit vector mask with 16 `m16` lanes. ); impl_i!([i32; 8]: i32x8, m32x8 | i32, u8 | test_v256 | x0, x1, x2, x3, x4, x5, x6, x7 | From: i8x8, u8x8, i16x8, u16x8 | /// A 256-bit vector with 8 `i32` lanes. ); impl_u!([u32; 8]: u32x8, m32x8 | u32, u8 | test_v256 | x0, x1, x2, x3, x4, x5, x6, x7 | From: u8x8, u16x8 | /// A 256-bit vector with 8 `u32` lanes. ); impl_f!([f32; 8]: f32x8, m32x8 | f32 | test_v256 | x0, x1, x2, x3, x4, x5, x6, x7 | From: i8x8, u8x8, i16x8, u16x8 | /// A 256-bit vector with 8 `f32` lanes. ); impl_m!([m32; 8]: m32x8 | i32, u8 | test_v256 | x0, x1, x2, x3, x4, x5, x6, x7 | From: m8x8, m16x8 | /// A 256-bit vector mask with 8 `m32` lanes. ); impl_i!([i64; 4]: i64x4, m64x4 | i64, u8 | test_v256 | x0, x1, x2, x3 | From: i8x4, u8x4, i16x4, u16x4, i32x4, u32x4 | /// A 256-bit vector with 4 `i64` lanes. ); impl_u!([u64; 4]: u64x4, m64x4 | u64, u8 | test_v256 | x0, x1, x2, x3 | From: u8x4, u16x4, u32x4 | /// A 256-bit vector with 4 `u64` lanes. ); impl_f!([f64; 4]: f64x4, m64x4 | f64 | test_v256 | x0, x1, x2, x3 | From: i8x4, u8x4, i16x4, u16x4, i32x4, u32x4, f32x4 | /// A 256-bit vector with 4 `f64` lanes. ); impl_m!([m64; 4]: m64x4 | i64, u8 | test_v256 | x0, x1, x2, x3 | From: m8x4, m16x4, m32x4 | /// A 256-bit vector mask with 4 `m64` lanes. ); impl_i!([i128; 2]: i128x2, m128x2 | i128, u8 | test_v256 | x0, x1 | From: i8x2, u8x2, i16x2, u16x2, i32x2, u32x2, i64x2, u64x2 | /// A 256-bit vector with 2 `i128` lanes. ); impl_u!([u128; 2]: u128x2, m128x2 | u128, u8 | test_v256 | x0, x1 | From: u8x2, u16x2, u32x2, u64x2 | /// A 256-bit vector with 2 `u128` lanes. ); impl_m!([m128; 2]: m128x2 | i128, u8 | test_v256 | x0, x1 | From: m8x2, m16x2, m32x2, m64x2 | /// A 256-bit vector mask with 2 `m128` lanes. ); ================================================ FILE: src/v32.rs ================================================ //! 32-bit wide vector types use crate::*; impl_i!([i8; 4]: i8x4, m8x4 | i8, u8 | test_v32 | x0, x1, x2, x3 | From: | /// A 32-bit vector with 4 `i8` lanes. ); impl_u!([u8; 4]: u8x4, m8x4 | u8, u8 | test_v32 | x0, x1, x2, x3 | From: | /// A 32-bit vector with 4 `u8` lanes. ); impl_m!([m8; 4]: m8x4 | i8, u8 | test_v32 | x0, x1, x2, x3 | From: m16x4, m32x4, m64x4 | /// A 32-bit vector mask with 4 `m8` lanes. ); impl_i!([i16; 2]: i16x2, m16x2 | i16, u8 | test_v32 | x0, x1 | From: i8x2, u8x2 | /// A 32-bit vector with 2 `i16` lanes. 
); impl_u!([u16; 2]: u16x2, m16x2 | u16, u8 | test_v32 | x0, x1 | From: u8x2 | /// A 32-bit vector with 2 `u16` lanes. ); impl_m!([m16; 2]: m16x2 | i16, u8 | test_v32 | x0, x1 | From: m8x2, m32x2, m64x2, m128x2 | /// A 32-bit vector mask with 2 `m16` lanes. ); ================================================ FILE: src/v512.rs ================================================ //! 512-bit wide vector types #[rustfmt::skip] use crate::*; impl_i!([i8; 64]: i8x64, m8x64 | i8, u64 | test_v512 | x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15, x16, x17, x18, x19, x20, x21, x22, x23, x24, x25, x26, x27, x28, x29, x30, x31, x32, x33, x34, x35, x36, x37, x38, x39, x40, x41, x42, x43, x44, x45, x46, x47, x48, x49, x50, x51, x52, x53, x54, x55, x56, x57, x58, x59, x60, x61, x62, x63 | From: | /// A 512-bit vector with 64 `i8` lanes. ); impl_u!([u8; 64]: u8x64, m8x64 | u8, u64 | test_v512 | x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15, x16, x17, x18, x19, x20, x21, x22, x23, x24, x25, x26, x27, x28, x29, x30, x31, x32, x33, x34, x35, x36, x37, x38, x39, x40, x41, x42, x43, x44, x45, x46, x47, x48, x49, x50, x51, x52, x53, x54, x55, x56, x57, x58, x59, x60, x61, x62, x63 | From: | /// A 512-bit vector with 64 `u8` lanes. ); impl_m!([m8; 64]: m8x64 | i8, u64 | test_v512 | x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15, x16, x17, x18, x19, x20, x21, x22, x23, x24, x25, x26, x27, x28, x29, x30, x31, x32, x33, x34, x35, x36, x37, x38, x39, x40, x41, x42, x43, x44, x45, x46, x47, x48, x49, x50, x51, x52, x53, x54, x55, x56, x57, x58, x59, x60, x61, x62, x63 | From: | /// A 512-bit vector mask with 64 `m8` lanes. ); impl_i!([i16; 32]: i16x32, m16x32 | i16, u32 | test_v512 | x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15, x16, x17, x18, x19, x20, x21, x22, x23, x24, x25, x26, x27, x28, x29, x30, x31 | From: i8x32, u8x32 | /// A 512-bit vector with 32 `i16` lanes. ); impl_u!([u16; 32]: u16x32, m16x32 | u16, u32 | test_v512 | x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15, x16, x17, x18, x19, x20, x21, x22, x23, x24, x25, x26, x27, x28, x29, x30, x31 | From: u8x32 | /// A 512-bit vector with 32 `u16` lanes. ); impl_m!([m16; 32]: m16x32 | i16, u32 | test_v512 | x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15, x16, x17, x18, x19, x20, x21, x22, x23, x24, x25, x26, x27, x28, x29, x30, x31 | From: m8x32 | /// A 512-bit vector mask with 32 `m16` lanes. ); impl_i!([i32; 16]: i32x16, m32x16 | i32, u16 | test_v512 | x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15 | From: i8x16, u8x16, i16x16, u16x16 | /// A 512-bit vector with 16 `i32` lanes. ); impl_u!([u32; 16]: u32x16, m32x16 | u32, u16 | test_v512 | x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15 | From: u8x16, u16x16 | /// A 512-bit vector with 16 `u32` lanes. ); impl_f!([f32; 16]: f32x16, m32x16 | f32 | test_v512 | x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15 | From: i8x16, u8x16, i16x16, u16x16 | /// A 512-bit vector with 16 `f32` lanes. ); impl_m!([m32; 16]: m32x16 | i32, u16 | test_v512 | x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15 | From: m8x16, m16x16 | /// A 512-bit vector mask with 16 `m32` lanes. ); impl_i!([i64; 8]: i64x8, m64x8 | i64, u8 | test_v512 | x0, x1, x2, x3, x4, x5, x6, x7 | From: i8x8, u8x8, i16x8, u16x8, i32x8, u32x8 | /// A 512-bit vector with 8 `i64` lanes. 
); impl_u!([u64; 8]: u64x8, m64x8 | u64, u8 | test_v512 | x0, x1, x2, x3, x4, x5, x6, x7 | From: u8x8, u16x8, u32x8 | /// A 512-bit vector with 8 `u64` lanes. ); impl_f!([f64; 8]: f64x8, m64x8 | f64 | test_v512 | x0, x1, x2, x3, x4, x5, x6, x7 | From: i8x8, u8x8, i16x8, u16x8, i32x8, u32x8, f32x8 | /// A 512-bit vector with 8 `f64` lanes. ); impl_m!([m64; 8]: m64x8 | i64, u8 | test_v512 | x0, x1, x2, x3, x4, x5, x6, x7 | From: m8x8, m16x8, m32x8 | /// A 512-bit vector mask with 8 `m64` lanes. ); impl_i!([i128; 4]: i128x4, m128x4 | i128, u8 | test_v512 | x0, x1, x2, x3 | From: i8x4, u8x4, i16x4, u16x4, i32x4, u32x4, i64x4, u64x4 | /// A 512-bit vector with 4 `i128` lanes. ); impl_u!([u128; 4]: u128x4, m128x4 | u128, u8 | test_v512 | x0, x1, x2, x3 | From: u8x4, u16x4, u32x4, u64x4 | /// A 512-bit vector with 4 `u128` lanes. ); impl_m!([m128; 4]: m128x4 | i128, u8 | test_v512 | x0, x1, x2, x3 | From: m8x4, m16x4, m32x4, m64x4 | /// A 512-bit vector mask with 4 `m128` lanes. ); ================================================ FILE: src/v64.rs ================================================ //! 64-bit wide vector types #[rustfmt::skip] use super::*; impl_i!([i8; 8]: i8x8, m8x8 | i8, u8 | test_v64 | x0, x1, x2, x3, x4, x5, x6, x7 | From: | /// A 64-bit vector with 8 `i8` lanes. ); impl_u!([u8; 8]: u8x8, m8x8 | u8, u8 | test_v64 | x0, x1, x2, x3, x4, x5, x6, x7 | From: | /// A 64-bit vector with 8 `u8` lanes. ); impl_m!([m8; 8]: m8x8 | i8, u8 | test_v64 | x0, x1, x2, x3, x4, x5, x6, x7 | From: m16x8, m32x8 | /// A 64-bit vector mask with 8 `m8` lanes. ); impl_i!([i16; 4]: i16x4, m16x4 | i16, u8 | test_v64 | x0, x1, x2, x3 | From: i8x4, u8x4 | /// A 64-bit vector with 4 `i16` lanes. ); impl_u!([u16; 4]: u16x4, m16x4 | u16, u8 | test_v64 | x0, x1, x2, x3 | From: u8x4 | /// A 64-bit vector with 4 `u16` lanes. ); impl_m!([m16; 4]: m16x4 | i16, u8 | test_v64 | x0, x1, x2, x3 | From: m8x4, m32x4, m64x4 | /// A 64-bit vector mask with 4 `m16` lanes. ); impl_i!([i32; 2]: i32x2, m32x2 | i32, u8 | test_v64 | x0, x1 | From: i8x2, u8x2, i16x2, u16x2 | /// A 64-bit vector with 2 `i32` lanes. ); impl_u!([u32; 2]: u32x2, m32x2 | u32, u8 | test_v64 | x0, x1 | From: u8x2, u16x2 | /// A 64-bit vector with 2 `u32` lanes. ); impl_m!([m32; 2]: m32x2 | i32, u8 | test_v64 | x0, x1 | From: m8x2, m16x2, m64x2, m128x2 | /// A 64-bit vector mask with 2 `m32` lanes. ); impl_f!([f32; 2]: f32x2, m32x2 | f32 | test_v64 | x0, x1 | From: i8x2, u8x2, i16x2, u16x2 | /// A 64-bit vector with 2 `f32` lanes. ); /* impl_i!([i64; 1]: i64x1, m64x1 | i64, u8 | test_v64 | x0 | From: /*i8x1, u8x1, i16x1, u16x1, i32x1, u32x1*/ | // FIXME: primitive to vector conversion /// A 64-bit vector with 1 `i64` lanes. ); impl_u!([u64; 1]: u64x1, m64x1 | u64, u8 | test_v64 | x0 | From: /*u8x1, u16x1, u32x1*/ | // FIXME: primitive to vector conversion /// A 64-bit vector with 1 `u64` lanes. ); impl_m!([m64; 1]: m64x1 | i64, u8 | test_v64 | x0 | From: /*m8x1, m16x1, m32x1, */ m128x1 | // FIXME: unary small vector types /// A 64-bit vector mask with 1 `m64` lanes. ); impl_f!([f64; 1]: f64x1, m64x1 | f64 | test_v64 | x0 | From: /*i8x1, u8x1, i16x1, u16x1, i32x1, u32x1, f32x1*/ | // FIXME: unary small vector types /// A 64-bit vector with 1 `f64` lanes. ); */ ================================================ FILE: src/vPtr.rs ================================================ //! 
Vectors of pointers #[rustfmt::skip] use crate::*; impl_const_p!( [*const T; 2]: cptrx2, msizex2, usizex2, isizex2 | test_v128 | x0, x1 | From: | /// A vector with 2 `*const T` lanes ); impl_mut_p!( [*mut T; 2]: mptrx2, msizex2, usizex2, isizex2 | test_v128 | x0, x1 | From: | /// A vector with 2 `*mut T` lanes ); impl_const_p!( [*const T; 4]: cptrx4, msizex4, usizex4, isizex4 | test_v256 | x0, x1, x2, x3 | From: | /// A vector with 4 `*const T` lanes ); impl_mut_p!( [*mut T; 4]: mptrx4, msizex4, usizex4, isizex4 | test_v256 | x0, x1, x2, x3 | From: | /// A vector with 4 `*mut T` lanes ); impl_const_p!( [*const T; 8]: cptrx8, msizex8, usizex8, isizex8 | test_v512 | x0, x1, x2, x3, x4, x5, x6, x7 | From: | /// A vector with 8 `*const T` lanes ); impl_mut_p!( [*mut T; 8]: mptrx8, msizex8, usizex8, isizex8 | test_v512 | x0, x1, x2, x3, x4, x5, x6, x7 | From: | /// A vector with 8 `*mut T` lanes ); ================================================ FILE: src/vSize.rs ================================================ //! Vectors with pointer-sized elements use crate::codegen::pointer_sized_int::{isize_, usize_}; use crate::*; impl_i!([isize; 2]: isizex2, msizex2 | isize_, u8 | test_v128 | x0, x1| From: | /// A vector with 2 `isize` lanes. ); impl_u!([usize; 2]: usizex2, msizex2 | usize_, u8 | test_v128 | x0, x1| From: | /// A vector with 2 `usize` lanes. ); impl_m!([msize; 2]: msizex2 | isize_, u8 | test_v128 | x0, x1 | From: | /// A vector mask with 2 `msize` lanes. ); impl_i!([isize; 4]: isizex4, msizex4 | isize_, u8 | test_v256 | x0, x1, x2, x3 | From: | /// A vector with 4 `isize` lanes. ); impl_u!([usize; 4]: usizex4, msizex4 | usize_, u8 | test_v256 | x0, x1, x2, x3| From: | /// A vector with 4 `usize` lanes. ); impl_m!([msize; 4]: msizex4 | isize_, u8 | test_v256 | x0, x1, x2, x3 | From: | /// A vector mask with 4 `msize` lanes. ); impl_i!([isize; 8]: isizex8, msizex8 | isize_, u8 | test_v512 | x0, x1, x2, x3, x4, x5, x6, x7 | From: | /// A vector with 8 `isize` lanes. ); impl_u!([usize; 8]: usizex8, msizex8 | usize_, u8 | test_v512 | x0, x1, x2, x3, x4, x5, x6, x7 | From: | /// A vector with 8 `usize` lanes. ); impl_m!([msize; 8]: msizex8 | isize_, u8 | test_v512 | x0, x1, x2, x3, x4, x5, x6, x7 | From: | /// A vector mask with 8 `msize` lanes. 
); ================================================ FILE: tests/endianness.rs ================================================ #[cfg(target_arch = "wasm32")] use wasm_bindgen_test::*; use packed_simd::*; use std::{mem, slice}; #[cfg_attr(not(target_arch = "wasm32"), test)] #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test)] fn endian_indexing() { let v = i32x4::new(0, 1, 2, 3); assert_eq!(v.extract(0), 0); assert_eq!(v.extract(1), 1); assert_eq!(v.extract(2), 2); assert_eq!(v.extract(3), 3); } #[cfg_attr(not(target_arch = "wasm32"), test)] #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test)] fn endian_bitcasts() { #[rustfmt::skip] let x = i8x16::new( 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, ); let t: i16x8 = unsafe { mem::transmute(x) }; let e: i16x8 = if cfg!(target_endian = "little") { i16x8::new(256, 770, 1284, 1798, 2312, 2826, 3340, 3854) } else { i16x8::new(1, 515, 1029, 1543, 2057, 2571, 3085, 3599) }; assert_eq!(t, e); } #[cfg_attr(not(target_arch = "wasm32"), test)] #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test)] fn endian_casts() { #[rustfmt::skip] let x = i8x16::new( 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, ); let t: i16x16 = x.into(); // simd_cast #[rustfmt::skip] let e = i16x16::new( 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, ); assert_eq!(t, e); } #[cfg_attr(not(target_arch = "wasm32"), test)] #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test)] fn endian_load_and_stores() { #[rustfmt::skip] let x = i8x16::new( 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, ); let mut y: [i16; 8] = [0; 8]; x.write_to_slice_unaligned(unsafe { slice::from_raw_parts_mut(&mut y as *mut _ as *mut i8, 16) }); let e: [i16; 8] = if cfg!(target_endian = "little") { [256, 770, 1284, 1798, 2312, 2826, 3340, 3854] } else { [1, 515, 1029, 1543, 2057, 2571, 3085, 3599] }; assert_eq!(y, e); let z = i8x16::from_slice_unaligned(unsafe { slice::from_raw_parts(&y as *const _ as *const i8, 16) }); assert_eq!(z, x); } #[cfg_attr(not(target_arch = "wasm32"), test)] #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test)] fn endian_array_union() { union A { data: [f32; 4], vec: f32x4, } let x: [f32; 4] = unsafe { A { vec: f32x4::new(0., 1., 2., 3.) }.data }; // As all of these are integer values within the mantissa^1 range, it // would be very unusual for them to actually fail to compare. #[allow(clippy::float_cmp)] { assert_eq!(x[0], 0_f32); assert_eq!(x[1], 1_f32); assert_eq!(x[2], 2_f32); assert_eq!(x[3], 3_f32); } let y: f32x4 = unsafe { A { data: [3., 2., 1., 0.] 
}.vec }; assert_eq!(y, f32x4::new(3., 2., 1., 0.)); union B { data: [i8; 16], vec: i8x16, } #[rustfmt::skip] let x = i8x16::new( 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, ); let x: [i8; 16] = unsafe { B { vec: x }.data }; for (i, v) in x.iter().enumerate() { assert_eq!(i as i8, *v); } #[rustfmt::skip] let y = [ 15, 14, 13, 12, 11, 19, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 ]; #[rustfmt::skip] let e = i8x16::new( 15, 14, 13, 12, 11, 19, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 ); let z = unsafe { B { data: y }.vec }; assert_eq!(z, e); union C { data: [i16; 8], vec: i8x16, } #[rustfmt::skip] let x = i8x16::new( 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, ); let x: [i16; 8] = unsafe { C { vec: x }.data }; let e: [i16; 8] = if cfg!(target_endian = "little") { [256, 770, 1284, 1798, 2312, 2826, 3340, 3854] } else { [1, 515, 1029, 1543, 2057, 2571, 3085, 3599] }; assert_eq!(x, e); } #[cfg_attr(not(target_arch = "wasm32"), test)] #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test)] fn endian_tuple_access() { type F32x4T = (f32, f32, f32, f32); union A { data: F32x4T, vec: f32x4, } let x: F32x4T = unsafe { A { vec: f32x4::new(0., 1., 2., 3.) }.data }; // As all of these are integer values within the mantissa^1 range, it // would be very unusual for them to actually fail to compare. #[allow(clippy::float_cmp)] { assert_eq!(x.0, 0_f32); assert_eq!(x.1, 1_f32); assert_eq!(x.2, 2_f32); assert_eq!(x.3, 3_f32); } let y: f32x4 = unsafe { A { data: (3., 2., 1., 0.) }.vec }; assert_eq!(y, f32x4::new(3., 2., 1., 0.)); #[rustfmt::skip] type I8x16T = (i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8); union B { data: I8x16T, vec: i8x16, } #[rustfmt::skip] let x = i8x16::new( 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, ); let x: I8x16T = unsafe { B { vec: x }.data }; assert_eq!(x.0, 0); assert_eq!(x.1, 1); assert_eq!(x.2, 2); assert_eq!(x.3, 3); assert_eq!(x.4, 4); assert_eq!(x.5, 5); assert_eq!(x.6, 6); assert_eq!(x.7, 7); assert_eq!(x.8, 8); assert_eq!(x.9, 9); assert_eq!(x.10, 10); assert_eq!(x.11, 11); assert_eq!(x.12, 12); assert_eq!(x.13, 13); assert_eq!(x.14, 14); assert_eq!(x.15, 15); #[rustfmt::skip] let y = ( 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 ); let z: i8x16 = unsafe { B { data: y }.vec }; #[rustfmt::skip] let e = i8x16::new( 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 ); assert_eq!(e, z); #[rustfmt::skip] type I16x8T = (i16, i16, i16, i16, i16, i16, i16, i16); union C { data: I16x8T, vec: i8x16, } #[rustfmt::skip] let x = i8x16::new( 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, ); let x: I16x8T = unsafe { C { vec: x }.data }; let e: [i16; 8] = if cfg!(target_endian = "little") { [256, 770, 1284, 1798, 2312, 2826, 3340, 3854] } else { [1, 515, 1029, 1543, 2057, 2571, 3085, 3599] }; assert_eq!(x.0, e[0]); assert_eq!(x.1, e[1]); assert_eq!(x.2, e[2]); assert_eq!(x.3, e[3]); assert_eq!(x.4, e[4]); assert_eq!(x.5, e[5]); assert_eq!(x.6, e[6]); assert_eq!(x.7, e[7]); #[rustfmt::skip] #[repr(C)] #[derive(Copy ,Clone)] pub struct Tup(pub i8, pub i8, pub i16, pub i8, pub i8, pub i16, pub i8, pub i8, pub i16, pub i8, pub i8, pub i16); union D { data: Tup, vec: i8x16, } #[rustfmt::skip] let x = i8x16::new( 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, ); let x: Tup = unsafe { D { vec: x }.data }; let e: [i16; 12] = if cfg!(target_endian = "little") { [0, 1, 770, 4, 5, 1798, 8, 9, 2826, 12, 13, 3854] } else { [0, 1, 515, 4, 5, 1543, 8, 9, 2571, 12, 13, 3599] }; assert_eq!(x.0 as i16, e[0]); assert_eq!(x.1 as i16, e[1]); assert_eq!(x.2 as 
i16, e[2]); assert_eq!(x.3 as i16, e[3]); assert_eq!(x.4 as i16, e[4]); assert_eq!(x.5 as i16, e[5]); assert_eq!(x.6 as i16, e[6]); assert_eq!(x.7 as i16, e[7]); assert_eq!(x.8 as i16, e[8]); assert_eq!(x.9 as i16, e[9]); assert_eq!(x.10 as i16, e[10]); assert_eq!(x.11 as i16, e[11]); } ================================================ FILE: verify/verify/Cargo.toml ================================================ [package] name = "verify" version = "0.1.0" authors = ["gnzlbg "] edition = "2018" [dev-dependencies] stdarch-test = { git = "https://github.com/rust-lang/stdarch.git" } packed_simd = { package = "packed_simd", path = "../.." } cfg-if = "^0.1" paste = "^0.1.3" ================================================ FILE: verify/verify/readme.md ================================================ # Machine code verification ## Quick start To run the verification tests run: ``` cargo test --release ``` on this crate, optionally passing the required target features via `RUSTFLAGS`. For example, `RUSTFLAGS="-C target-feature=+avx2"`. This crate only contains tests, and the tests only run in `--release` mode. Therefore, building this crate with anything other than `cargo test --release` does not make much sense. ## How it works This crate verifies the machine code generated for some of the portable packed vector APIs by disassembling the API at run-time and comparing the generated machine code against the desired one for a particular target and set of target features. This is done by using the [`stdarch-test`](https://github.com/rust-lang/stdarch/tree/master/crates/stdarch-test) crate, which exposes the `assert_instr` procedural macro. It is used like this: ```rust // The verification functions must be #[inline]: #[inline] // Enable the target features required for the desired code generation // on the different targets: #[cfg_attr( any(target_arch = "x86", target_arch = "x86_64"), target_feature(enable = "avx512f,avx512vl") )] // Check that the disassembly contains a particular instruction: #[cfg_attr( any(target_arch = "x86", target_arch = "x86_64"), assert_instr(vpro) )] unsafe fn rotate_right_variable(x: u64x8, v: u64x8) -> u64x8 { x.rotate_right(v) } ``` The `assert_instr` procedural macro creates a test that contains an `#[inline(never)]` function that calls the API. It then gets a function pointer to this function, and calls `stdarch_test::assert` with it, the function name, and the expected assembly instruction. `stdarch_test` uses `objdump` or similar to disassemble itself; it then looks for the function address and name in the disassembly, and verifies that the machine code for the function contains the instruction.
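A new verification of a vertical operation follows the same pattern. The sketch below is illustrative only: the function name, the chosen vector type, and the expected `vaddps` instruction are assumptions rather than part of the existing test-suite, and, like the modules under `src/api/`, it assumes `use packed_simd::*;` and `use stdarch_test::assert_instr;` are in scope:

```rust
// Illustrative sketch: check that a vertical `f32x8` addition
// compiles down to AVX's 256-bit `vaddps` when AVX is enabled.
#[inline]
#[cfg_attr(
    any(target_arch = "x86", target_arch = "x86_64"),
    target_feature(enable = "avx")
)]
#[cfg_attr(
    any(target_arch = "x86", target_arch = "x86_64"),
    assert_instr(vaddps)
)]
unsafe fn add_f32x8(a: f32x8, b: f32x8) -> f32x8 {
    a + b
}
```

Such a test is then run like any other, e.g. `RUSTFLAGS="-C target-feature=+avx" cargo test --release`.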
================================================ FILE: verify/verify/rust-toolchain ================================================ nightly ================================================ FILE: verify/verify/src/api/math/float/mod.rs ================================================ mod mul_add; ================================================ FILE: verify/verify/src/api/math/float/mul_add.rs ================================================ #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] mod x86 { mod f32x4 { #![allow(unused)] use packed_simd::*; use stdarch_test::assert_instr; #[inline] #[target_feature(enable = "sse,fma")] #[assert_instr(vfmadd)] unsafe fn fused_multiply_add(a: f32x4, b: f32x4, c: f32x4) -> f32x4 { a.mul_add(b, c) } #[inline] #[target_feature(enable = "sse,fma")] #[assert_instr(vfmsub)] unsafe fn fused_multiply_sub(a: f32x4, b: f32x4, c: f32x4) -> f32x4 { a.mul_add(b, -c) } #[inline] #[target_feature(enable = "sse,fma")] #[assert_instr(vfnmadd)] unsafe fn fused_negate_multiply_add( a: f32x4, b: f32x4, c: f32x4, ) -> f32x4 { a.mul_add(-b, c) } #[inline] #[target_feature(enable = "sse,fma")] #[assert_instr(vfnmsub)] unsafe fn fused_negate_multiply_sub( a: f32x4, b: f32x4, c: f32x4, ) -> f32x4 { a.mul_add(-b, -c) } #[inline] #[target_feature(enable = "sse,fma")] #[assert_instr(vfmaddsub)] unsafe fn fused_multiply_add_sub( a: f32x4, b: f32x4, c: f32x4, ) -> f32x4 { let add = a.mul_add(b, c); let sub = a.mul_add(b, -c); m32x4::new(false, true, false, true).select(add, sub) } #[inline] #[target_feature(enable = "sse,fma")] #[assert_instr(vfmsubadd)] unsafe fn fused_multiply_sub_add( a: f32x4, b: f32x4, c: f32x4, ) -> f32x4 { let add = a.mul_add(b, c); let sub = a.mul_add(b, -c); m32x4::new(true, false, true, false).select(add, sub) } } } ================================================ FILE: verify/verify/src/api/math.rs ================================================ mod float; ================================================ FILE: verify/verify/src/api/ops/vector_rotates/x86.rs ================================================ mod u64x8 { #![allow(unused)] use packed_simd::*; use stdarch_test::assert_instr; #[inline] #[target_feature(enable = "avx512f")] #[assert_instr(vpro)] unsafe fn rotate_right_variable(x: u64x8, v: u64x8) -> u64x8 { x.rotate_right(v) } #[inline] #[target_feature(enable = "avx512f")] #[assert_instr(vpro)] unsafe fn rotate_left_variable(x: u64x8, v: u64x8) -> u64x8 { x.rotate_left(v) } #[inline] #[target_feature(enable = "avx512f")] #[assert_instr(vpro)] unsafe fn rotate_right(x: u64x8) -> u64x8 { x.rotate_right(u64x8::splat(12)) } #[inline] #[target_feature(enable = "avx512f")] #[assert_instr(vpro)] unsafe fn rotate_left(x: u64x8) -> u64x8 { x.rotate_left(u64x8::splat(12)) } #[inline] #[target_feature(enable = "avx512f,avx512vl")] #[assert_instr(vpro)] unsafe fn rotate_left_x2(x: u64x2) -> u64x2 { x.rotate_left(u64x2::splat(12)) } } ================================================ FILE: verify/verify/src/api/ops/vector_rotates.rs ================================================ use cfg_if::cfg_if; cfg_if! { if #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] { mod x86; } } ================================================ FILE: verify/verify/src/api/ops.rs ================================================ mod vector_rotates; ================================================ FILE: verify/verify/src/api/reductions/mask/avx.rs ================================================ //! 
Verification of the mask reduction API for `x86`/`x86_64`+`SSE2` use packed_simd::*; use stdarch_test::assert_instr; macro_rules! verify { ($id:ident => $instr:tt) => { verify_mask!($id["avx"] => $instr); } } // 128-bit wide: verify!(m8x16 => vpmovmskb); verify!(m16x8 => vpmovmskb); verify!(m32x4 => vmovmskps); verify!(m64x2 => vmovmskpd); // FIXME: verify!(m128x1 => vmovmskpd); // 256-bit wide: verify!(m8x32 => vptest); verify!(m16x16 => vptest); verify!(m32x8 => vmovmskps); verify!(m64x4 => vmovmskpd); // FIXME: verify!(m128x2 => vmovmskpd); // FIXME: 512-bit wide masks ================================================ FILE: verify/verify/src/api/reductions/mask/avx2.rs ================================================ //! Verification of the mask reduction API for `x86`/`x86_64`+`SSE2` use packed_simd::*; use stdarch_test::assert_instr; macro_rules! verify { ($id:ident => $instr:tt) => { verify_mask!($id["avx2"] => $instr); } } // 128-bit wide: verify!(m8x16 => vpmovmskb); verify!(m16x8 => vpmovmskb); verify!(m32x4 => vmovmskps); verify!(m64x2 => vmovmskpd); // FIXME: verify!(m128x1 => vmovmskpd); // 256-bit wide: verify!(m8x32 => vpmovmskb); verify!(m16x16 => vpmovmskb); verify!(m32x8 => vmovmskps); verify!(m64x4 => vmovmskpd); // FIXME: verify!(m128x2 => vmovmskpd); // FIXME: 512-bit wide masks ================================================ FILE: verify/verify/src/api/reductions/mask/sse.rs ================================================ //! Verification of the mask reduction API for `x86`/`x86_64`+`SSE` #![allow(unused)] use packed_simd::*; use stdarch_test::assert_instr; macro_rules! verify { ($id:ident => $instr:tt) => { verify_mask!($id["sse"] => $instr); } } // 128-bit wide: verify!(m32x4 => movmskps); verify!(m64x2 => movmskps); // FIXME: verify!(m128x1 => movmskps); // 256-bit wide: verify!(m32x8 => movmskps); verify!(m64x4 => movmskps); // FIXME: verify!(m128x2 => movmskps); // FIXME: 512-bit wide masks ================================================ FILE: verify/verify/src/api/reductions/mask/sse2.rs ================================================ //! Verification of the mask reduction API for `x86`/`x86_64`+`SSE2` use packed_simd::*; use stdarch_test::assert_instr; macro_rules! verify { ($id:ident => $instr:tt) => { verify_mask!($id["sse2"] => $instr); } } // 128-bit wide: verify!(m8x16 => pmovmskb); verify!(m16x8 => pmovmskb); verify!(m32x4 => movmskps); verify!(m64x2 => movmskpd); // FIXME: verify!(m128x1 => movmskpd); // 256-bit wide: verify!(m8x32 => pmovmskb); verify!(m16x16 => pmovmskb); verify!(m32x8 => movmskps); verify!(m64x4 => movmskpd); // FIXME: verify!(m128x2 => movmskpd); // FIXME: 512-bit wide masks ================================================ FILE: verify/verify/src/api/reductions/mask.rs ================================================ //! Verify the mask reduction API. use cfg_if::cfg_if; #[allow(unused)] macro_rules! verify_mask { ($mask_id:ident[$target_feature:tt] => $all_instr:tt, $any_instr:tt, $none_instr:tt) => { paste::item! 
{ #[inline] #[target_feature(enable = $target_feature)] #[assert_instr($all_instr)] pub unsafe fn [<$mask_id _all>](x: $mask_id) -> bool { x.all() } #[inline] #[target_feature(enable = $target_feature)] #[assert_instr($any_instr)] pub unsafe fn [<$mask_id _any>](x: $mask_id) -> bool { x.any() } #[inline] #[target_feature(enable = $target_feature)] #[assert_instr($none_instr)] pub unsafe fn [<$mask_id _none>](x: $mask_id) -> bool { x.none() } } }; ($mask_id:ident[$target_feature:tt] => $instr:tt) => { verify_mask!($mask_id[$target_feature] => $instr, $instr, $instr); }; } cfg_if! { if #[cfg(all(any(target_arch = "x86", target_arch = "x86_64")), target_feature = "sse")] { // FIXME: avx512 #[cfg(all(not(target_feature = "avx512f"), target_feature = "avx2"))] mod avx2; #[cfg(all(not(target_feature = "avx2"), target_feature = "avx"))] mod avx; #[cfg(all(not(target_feature = "avx"), target_feature = "sse2"))] mod sse2; #[cfg(all(not(target_feature = "sse2"), target_feature = "sse"))] mod sse; } } ================================================ FILE: verify/verify/src/api/reductions.rs ================================================ mod mask; ================================================ FILE: verify/verify/src/api.rs ================================================ use cfg_if::cfg_if; cfg_if! { if #[cfg(debug_assertions)] { compile_error!("the verify tests only run in --release mode"); } } mod math; mod ops; mod reductions; ================================================ FILE: verify/verify/src/lib.rs ================================================ // FIXME: these types are unsound in C FFI already // See https://github.com/rust-lang/rust/issues/53346 #![allow(improper_ctypes_definitions)] #![deny(rust_2018_idioms)] #![cfg_attr(test, feature(avx512_target_feature, abi_vectorcall))] #[cfg(test)] mod api;