Repository: agenium-scale/nsimd Branch: master Commit: 702f4d179ff0 Files: 148 Total size: 2.7 MB Directory structure: gitextract_56lzr4bw/ ├── .clang-format ├── .gitignore ├── CMakeLists.txt ├── CONTRIBUTING.md ├── LICENSE ├── README.md ├── benches/ │ └── benches.hpp ├── build.nsconfig ├── doc/ │ ├── Makefile.nix │ ├── Makefile.win │ ├── markdown/ │ │ ├── compilers_and_versions.md │ │ ├── concepts.md │ │ ├── defines.md │ │ ├── faq.md │ │ ├── fp16.md │ │ ├── how_tests_are_done.md │ │ ├── memory.md │ │ ├── modules/ │ │ │ ├── .gitignore │ │ │ └── fixed_point/ │ │ │ └── overview.md │ │ ├── pack.md │ │ └── tutorial.md │ ├── md2html.cpp │ └── what_is_wrapped.cpp ├── egg/ │ ├── __init__.py │ ├── common.py │ ├── cuda.py │ ├── experiments/ │ │ ├── gen_sleef_operators.py │ │ ├── round-ppc.c │ │ └── upcvt-sve.c │ ├── gen_adv_c_api.py │ ├── gen_adv_cxx_api.py │ ├── gen_archis.py │ ├── gen_base_apis.py │ ├── gen_benches.py │ ├── gen_doc.py │ ├── gen_friendly_but_not_optimized.py │ ├── gen_modules.py │ ├── gen_scalar_utilities.py │ ├── gen_src.py │ ├── gen_tests.py │ ├── get_sleef_code.py │ ├── hatch.py │ ├── modules/ │ │ ├── fixed_point/ │ │ │ ├── gen_doc.py │ │ │ ├── gen_tests.py │ │ │ └── hatch.py │ │ ├── memory_management/ │ │ │ └── hatch.py │ │ ├── random/ │ │ │ └── hatch.py │ │ ├── spmd/ │ │ │ └── hatch.py │ │ └── tet1d/ │ │ └── hatch.py │ ├── oneapi.py │ ├── operators.py │ ├── platform_arm.py │ ├── platform_cpu.py │ ├── platform_ppc.py │ ├── platform_x86.py │ ├── rocm.py │ ├── scalar.py │ └── x86_load_store_deg234.py ├── examples/ │ ├── module_fixed_point.cpp │ └── tutorial.cpp ├── include/ │ └── nsimd/ │ ├── c_adv_api.h │ ├── cxx_adv_api.hpp │ ├── cxx_adv_api_aliases.hpp │ ├── modules/ │ │ ├── fixed_point.hpp │ │ ├── memory_management.hpp │ │ ├── spmd.hpp │ │ └── tet1d.hpp │ ├── nsimd-all.h │ ├── nsimd-all.hpp │ └── nsimd.h ├── scripts/ │ ├── FindNSIMD.cmake │ ├── aarch64-linux-gnu-clang++.sh │ ├── aarch64-linux-gnu-clang.sh │ ├── build-tests.bat │ ├── build-tests.sh │ 
├── build.bat │ ├── build.sh │ ├── ci-clang.txt │ ├── ci-scale.txt │ ├── ci-test.txt │ ├── ci.sh │ ├── compile-gmp-mpfr-for-wasm.sh │ ├── gen_github_doc.sh │ ├── hipcc.sh │ ├── init-benches-deps.sh │ ├── local-ci-rerun.ini │ ├── local-ci.ini │ ├── local-ci.sh │ ├── one-liner.c │ ├── powerpc64le-linux-gnu-clang++.sh │ ├── powerpc64le-linux-gnu-clang.sh │ ├── setup.bat │ └── setup.sh ├── src/ │ ├── dd.h │ ├── df.h │ ├── estrin.h │ ├── fp16.cpp │ ├── gpu.cpp │ ├── helperadvsimd.h │ ├── helperavx.h │ ├── helperavx2.h │ ├── helperavx512f.h │ ├── helperneon32.h │ ├── helperpower_128.h │ ├── helpersse2.h │ ├── helpersve.h │ ├── memory.cpp │ ├── misc.h │ ├── rempitab.c │ ├── rename.h │ ├── renameadvsimd.h │ ├── renameavx.h │ ├── renameavx2.h │ ├── renameavx512f.h │ ├── renameneon32.h │ ├── renamesse2.h │ ├── renamesse4.h │ ├── renamesve.h │ ├── renamevsx.h │ ├── sleefdp.c │ ├── sleefsimddp.c │ ├── sleefsimddp_emulation.c │ ├── sleefsimdsp.c │ ├── sleefsimdsp_emulation.c │ ├── sleefsp.c │ └── ufp.cpp └── tests/ ├── CMakeLists.txt.sh ├── FindNSIMD.cmake.sh ├── allocator.cpp ├── assign_arith.cpp ├── booleans.cpp ├── c11_vec.c ├── cxx_adv_api_aliases.cpp ├── fp16.prec11.c ├── get_pack.cpp ├── memory.cpp ├── memory.prec11.c ├── modules/ │ └── common.hpp ├── nsimd-all.cpp ├── nsimd.cpp ├── nsimd.prec11.c ├── operator_vector_scalar.cpp ├── shifts.cpp ├── templated_loads_stores.cpp ├── tests_helpers.hpp ├── to_pack.cpp ├── to_pack_interleave.cpp └── ufp.cpp ================================================ FILE CONTENTS ================================================ ================================================ FILE: .clang-format ================================================ Standard: Cpp03 ColumnLimit: 79 ================================================ FILE: .gitignore ================================================ # Common build dirs build*/ # Dependencies nstools/ # Binaries *.o *.so *.pyc *.exe *.dll *.dylib # Generated files ## API src/api_*.cpp src/api_* ## 
Plateform specific code include/nsimd/arm include/nsimd/cpu include/nsimd/cxx_adv_api_functions.hpp include/nsimd/friendly_but_not_optimized.hpp include/nsimd/functions.h include/nsimd/ppc include/nsimd/x86 ## Tests tests/c_base tests/cxx_base tests/cxx_adv tests/modules/tet1d/ tests/modules/fixed_point/ tests/modules/rand/*.cpp tests/modules/spmd/ tests/modules/random/ ## Benches benches/cxx_adv ## Modules include/nsimd/modules/tet1d/ include/nsimd/modules/spmd/ include/nsimd/modules/fixed_point/ include/nsimd/scalar_utilities.h ## Doc doc/html/* !doc/html/assets/ doc/markdown/overview.md doc/markdown/api.md doc/markdown/api_*.md doc/markdown/module_fixed_point_api*.md doc/markdown/module_fixed_point_overview.md doc/markdown/module_spmd_api*.md doc/markdown/module_spmd_overview.md doc/markdown/module_memory_management_overview.md doc/md2html doc/tmp.html ## Ulps ulps/ ## CI _ci/ ================================================ FILE: CMakeLists.txt ================================================ # MIT License # # Copyright (c) 2021 Agenium Scale # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal # in the Software without restriction, including without limitation the rights # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell # copies of the Software, and to permit persons to whom the Software is # furnished to do so, subject to the following conditions: # # The above copyright notice and this permission notice shall be included in all # copies or substantial portions of the Software. # # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. cmake_minimum_required(VERSION 3.0.2) project(NSIMD VERSION 3.0 LANGUAGES C CXX) # ----------------------------------------------------------------------------- # First check that NSIMD code has been generated if (NOT EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/include/nsimd/functions.h") if (WIN32) execute_process(COMMAND python ${CMAKE_CURRENT_SOURCE_DIR}\\egg\\hatch.py -lf) else() execute_process(COMMAND python3 ${CMAKE_CURRENT_SOURCE_DIR}/egg/hatch.py -lf) endif() endif() # ----------------------------------------------------------------------------- # Compilations options option(NSIMD_ARM32_IS_ARMEL "Set whether ARM32 is in fact armel or armhf" ON) function(nsimd_get_compiler_argument simd_ext argument) if (MSVC) if (CMAKE_CL_64) set(mapping_sse2 "/DSSE2") set(mapping_sse42 "/DSSE42") else() set(mapping_sse2 "/DSSE2;/arch:SSE2") set(mapping_sse42 "/DSSE42;/arch:SSE2") endif() set(mapping_avx "/DAVX;/arch:AVX") set(mapping_avx2 "/DAVX2;/arch:AVX2") set(mapping_avx512_knl "/DAVX512_KNL;/arch:AVX512") set(mapping_avx512_skylake "/DAVX512_SKYLAKE;/arch:AVX512") set(mapping_neon128 "/DNEON128;/arch:VFPv4") set(mapping_aarch64 "/DAARCH64") set(mapping_sve "/DSVE") set(mapping_sve128 "/DSVE128") set(mapping_sve256 "/DSVE256") set(mapping_sve512 "/DSVE512") set(mapping_sve1024 "/DSVE1024") set(mapping_sve2048 "/DSVE2048") set(mapping_vmx "/DVMX") set(mapping_vsx "/DVSX") set(mapping_cuda "/DCUDA") set(mapping_rocm "/DROCM") set(mapping_oneapi "/DONEAPI") else() set(mapping_sse2 "-DSSE2;-msse2" ) set(mapping_sse42 "-DSSE42;-msse4.2" ) set(mapping_avx "-DAVX;-mavx;-mno-avx256-split-unaligned-load" ";-mno-avx256-split-unaligned-store" ) set(mapping_avx2 "-DAVX2;-mavx2;-mfma;-mno-avx256-split-unaligned-load" 
";-mno-avx256-split-unaligned-store" ) set(mapping_avx512_knl "-DAVX512_KNL;-mavx512f;-mavx512pf;-mavx512er" ";-mavx512cd") set(mapping_avx512_skylake "-DAVX512_SKYLAKE;-mavx512f;-mavx512dq" ";-mavx512cd;-mavx512bw;-mavx512vl") if (NSIMD_ARM32_IS_ARMEL) set(mapping_neon128 "-DNEON128;-mfloat-abi=softfp;-mfpu=neon") else() set(mapping_neon128 "-DNEON128;-mfpu=neon") endif() set(mapping_aarch64 "-DAARCH64") set(mapping_sve "-DSVE;-march=armv8.2-a+sve") set(mapping_sve128 "-DSVE128;-march=armv8.2-a+sve;-msve-vector-bits=128") set(mapping_sve256 "-DSVE256;-march=armv8.2-a+sve;-msve-vector-bits=256") set(mapping_sve512 "-DSVE512;-march=armv8.2-a+sve;-msve-vector-bits=512") set(mapping_sve1024 "-DSVE1024;-march=armv8.2-a+sve" ";-msve-vector-bits=1024") set(mapping_sve2048 "-DSVE2048;-march=armv8.2-a+sve" ";-msve-vector-bits=2048") set(mapping_vmx "-DVMX;-mcpu=powerpc64le;-maltivec") set(mapping_vsx "-DVSX;-mcpu=powerpc64le;-mvsx") set(mapping_cuda "-DCUDA") set(mapping_rocm "-DROCM") set(mapping_oneapi "-DONEAPI") endif() if (DEFINED mapping_${simd_ext}) set(${argument} "${mapping_${simd_ext}}" PARENT_SCOPE) else() if (MSVC) set(${argument} "/DCPU" PARENT_SCOPE) else() set(${argument} "-DCPU" PARENT_SCOPE) endif() endif() endfunction() if (NOT DEFINED simd) set(simd "cpu") endif() nsimd_get_compiler_argument(${simd} NSIMD_COMPILATION_OPTIONS) # ----------------------------------------------------------------------------- # Object file selection set(NSIMD_OBJS "fp16;gpu;memory;api_cpu;rempitab;sleefsp;sleefdp") if ("${simd}" STREQUAL "sse2") set(NSIMD_OBJS "${NSIMD_OBJS};api_sse2;sleef_sse2_f32;sleef_sse2_f64") elseif ("${simd}" STREQUAL "sse42") set(NSIMD_OBJS "${NSIMD_OBJS};api_sse2;api_sse42;" "sleef_sse2_f32;sleef_sse2_f64;" "sleef_sse42_f32;sleef_sse42_f64") elseif ("${simd}" STREQUAL "avx") set(NSIMD_OBJS "${NSIMD_OBJS};api_sse2;api_sse42;api_avx;" "sleef_sse2_f32;sleef_sse2_f64;" "sleef_sse42_f32;sleef_sse42_f64;" "sleef_avx_f32;sleef_avx_f64") elseif ("${simd}" 
STREQUAL "avx2") set(NSIMD_OBJS "${NSIMD_OBJS};api_sse2;api_sse42;api_avx;api_avx2;" "sleef_sse2_f32;sleef_sse2_f64;" "sleef_sse42_f32;sleef_sse42_f64;" "sleef_avx_f32;sleef_avx_f64;" "sleef_avx2_f32;sleef_avx2_f64") elseif ("${simd}" STREQUAL "avx512_knl") set(NSIMD_OBJS "${NSIMD_OBJS};api_sse2;api_sse42;api_avx;api_avx2" "sleef_sse2_f32;sleef_sse2_f64;" "sleef_sse42_f32;sleef_sse42_f64;" "sleef_avx_f32;sleef_avx_f64;" "sleef_avx2_f32;sleef_avx2_f64;" "api_avx512_knl;sleef_avx512_knl_f32;sleef_avx512_knl_f64") elseif ("${simd}" STREQUAL "avx512_skylake") set(NSIMD_OBJS "${NSIMD_OBJS};api_sse2;api_sse42;api_avx;api_avx2;" "api_avx512_skylake;sleef_avx512_skylake_f32;" "sleef_sse2_f32;sleef_sse2_f64;" "sleef_sse42_f32;sleef_sse42_f64;" "sleef_avx_f32;sleef_avx_f64;" "sleef_avx2_f32;sleef_avx2_f64;" "sleef_avx512_skylake_f64") elseif ("${simd}" STREQUAL "neon128") set(NSIMD_OBJS "${NSIMD_OBJS};api_neon128;" "sleef_neon128_f32;sleef_neon128_f64") elseif ("${simd}" STREQUAL "aarch64") set(NSIMD_OBJS "${NSIMD_OBJS};api_aarch64;" "sleef_aarch64_f32;sleef_aarch64_f64") elseif ("${simd}" STREQUAL "sve") set(NSIMD_OBJS "${NSIMD_OBJS};api_aarch64;api_sve;" "sleef_aarch64_f32;sleef_aarch64_f64;" "sleef_sve_f32;sleef_sve_f64") elseif ("${simd}" STREQUAL "sve128") set(NSIMD_OBJS "${NSIMD_OBJS};api_aarch64;api_sve128;" "sleef_aarch64_f32;sleef_aarch64_f64;" "sleef_sve128_f32;sleef_sve128_f64") elseif ("${simd}" STREQUAL "sve256") set(NSIMD_OBJS "${NSIMD_OBJS};api_aarch64;api_sve256;" "sleef_aarch64_f32;sleef_aarch64_f64;" "sleef_sve256_f32;sleef_sve256_f64") elseif ("${simd}" STREQUAL "sve512") set(NSIMD_OBJS "${NSIMD_OBJS};api_aarch64;api_sve512;" "sleef_aarch64_f32;sleef_aarch64_f64;" "sleef_sve512_f32;sleef_sve512_f64") elseif ("${simd}" STREQUAL "sve1024") set(NSIMD_OBJS "${NSIMD_OBJS};api_aarch64;api_sve1024;" "sleef_aarch64_f32;sleef_aarch64_f64;" "sleef_sve1024_f32;sleef_sve1024_f64") elseif ("${simd}" STREQUAL "sve2048") set(NSIMD_OBJS 
"${NSIMD_OBJS};api_aarch64;api_sve2048;" "sleef_aarch64_f32;sleef_aarch64_f64;" "sleef_sve2048_f32;sleef_sve2048_f64") elseif ("${simd}" STREQUAL "vmx") set(NSIMD_OBJS "${NSIMD_OBJS};api_vmx;sleef_vmx_f32;sleef_vmx_f64") elseif ("${simd}" STREQUAL "vsx") set(NSIMD_OBJS "${NSIMD_OBJS};api_vmx;api_vsx;sleef_vmx_f32;sleef_vmx_f64;" "sleef_vsx_f32;sleef_vsx_f64") endif() # ----------------------------------------------------------------------------- # Rules for building the library set(NSIMD_LIB_DEPS "") foreach(o ${NSIMD_OBJS}) if (EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/src/${o}.cpp") add_library(${o} OBJECT "${CMAKE_CURRENT_SOURCE_DIR}/src/${o}.cpp") elseif(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/src/${o}.c") add_library(${o} OBJECT "${CMAKE_CURRENT_SOURCE_DIR}/src/${o}.c") elseif(("${o}" STREQUAL "sleef_neon128_f64") OR ("${o}" STREQUAL "sleef_vmx_f64")) add_library(${o} OBJECT "${CMAKE_CURRENT_SOURCE_DIR}/src/sleefsimddp_emulation.c") elseif("${o}" STREQUAL "sleef_vmx_f32") add_library(${o} OBJECT "${CMAKE_CURRENT_SOURCE_DIR}/src/sleefsimdsp_emulation.c") elseif(o MATCHES "sleef_.*_f32") add_library(${o} OBJECT "${CMAKE_CURRENT_SOURCE_DIR}/src/sleefsimdsp.c") elseif(o MATCHES "sleef_.*_f64") add_library(${o} OBJECT "${CMAKE_CURRENT_SOURCE_DIR}/src/sleefsimddp.c") endif() if (MSVC) set(sleef_cflags "/DNDEBUG;/DDORENAME=1") else() set(sleef_cflags "-DNDEBUG;-DDORENAME=1") endif() set_property(TARGET ${o} PROPERTY POSITION_INDEPENDENT_CODE ON) target_include_directories(${o} PUBLIC "${CMAKE_CURRENT_SOURCE_DIR}/include") if (MSVC) target_compile_definitions(${o} PUBLIC "/D_CRT_SECURE_NO_WARNINGS") endif() set(buf "") if ("${o}" STREQUAL "api_sse2") nsimd_get_compiler_argument("sse2" buf) elseif ("${o}" STREQUAL "api_sse42") nsimd_get_compiler_argument("sse42" buf) elseif ("${o}" STREQUAL "api_avx") nsimd_get_compiler_argument("avx" buf) elseif ("${o}" STREQUAL "api_avx2") nsimd_get_compiler_argument("avx2" buf) elseif ("${o}" STREQUAL "api_avx512_knl") 
nsimd_get_compiler_argument("avx512_knl" buf) elseif ("${o}" STREQUAL "api_avx512_skylake") nsimd_get_compiler_argument("avx512_skylake" buf) elseif ("${o}" STREQUAL "api_neon128") nsimd_get_compiler_argument("neon128" buf) elseif ("${o}" STREQUAL "api_aarch64") nsimd_get_compiler_argument("aarch64" buf) elseif ("${o}" STREQUAL "api_sve") nsimd_get_compiler_argument("sve" buf) elseif ("${o}" STREQUAL "api_sve128") nsimd_get_compiler_argument("sve128" buf) elseif ("${o}" STREQUAL "api_sve256") nsimd_get_compiler_argument("sve256" buf) elseif ("${o}" STREQUAL "api_sve512") nsimd_get_compiler_argument("sve512" buf) elseif ("${o}" STREQUAL "api_sve1024") nsimd_get_compiler_argument("sve1024" buf) elseif ("${o}" STREQUAL "api_sve2048") nsimd_get_compiler_argument("sve2048" buf) elseif ("${o}" STREQUAL "api_vmx") nsimd_get_compiler_argument("vmx" buf) elseif ("${o}" STREQUAL "api_vsx") nsimd_get_compiler_argument("vsx" buf) elseif ("${o}" STREQUAL "api_cuda") nsimd_get_compiler_argument("cuda" buf) elseif ("${o}" STREQUAL "api_rocm") nsimd_get_compiler_argument("rocm" buf) elseif ("${o}" STREQUAL "api_cpu") nsimd_get_compiler_argument("cpu" buf) elseif ("${o}" STREQUAL "rempitab") list(APPEND buf "${sleef_cflags}") elseif ("${o}" STREQUAL "sleefsp") list(APPEND buf "${sleef_cflags}") elseif ("${o}" STREQUAL "sleefdp") list(APPEND buf "${sleef_cflags}") elseif ("${o}" MATCHES "sleef_sse2_") nsimd_get_compiler_argument("sse2" buf) list(APPEND buf "-DNSIMD_SSE2;-DENABLE_SSE2=1;${sleef_cflags}") elseif ("${o}" MATCHES "sleef_sse42_") nsimd_get_compiler_argument("sse42" buf) list(APPEND buf "-DNSIMD_SSE42;-DENABLE_SSE4=1;${sleef_cflags}") elseif ("${o}" MATCHES "sleef_avx_") nsimd_get_compiler_argument("avx" buf) list(APPEND buf "-DNSIMD_AVX;-DENABLE_AVX=1;${sleef_cflags}") elseif ("${o}" MATCHES "sleef_avx2_") nsimd_get_compiler_argument("avx2" buf) list(APPEND buf "-DNSIMD_AVX2;-DENABLE_AVX2=1;${sleef_cflags}") elseif ("${o}" MATCHES "sleef_avx512_knl_") 
nsimd_get_compiler_argument("avx512_knl" buf) list(APPEND buf "-DNSIMD_AVX512_KNL;-DENABLE_AVX512F=1;${sleef_cflags}") elseif ("${o}" MATCHES "sleef_avx512_skylake_") nsimd_get_compiler_argument("avx512_skylake" buf) list(APPEND buf "-DNSIMD_AVX512_SKYLAKE;-DENABLE_AVX512F=1;${sleef_cflags}") elseif ("${o}" MATCHES "sleef_neon128_") nsimd_get_compiler_argument("neon128" buf) list(APPEND buf "-DNSIMD_NEON128;-DENABLE_NEON32=1;${sleef_cflags}") elseif ("${o}" MATCHES "sleef_aarch64_") nsimd_get_compiler_argument("aarch64" buf) list(APPEND buf "-DNSIMD_AARCH64;-DENABLE_ADVSIMD=1;${sleef_cflags}") elseif ("${o}" MATCHES "sleef_sve_") nsimd_get_compiler_argument("sve" buf) list(APPEND buf "-DNSIMD_SVE;-DENABLE_SVE=1;${sleef_cflags}") elseif ("${o}" MATCHES "sleef_sve128_") nsimd_get_compiler_argument("sve128" buf) list(APPEND buf "-DNSIMD_SVE128;-DENABLE_SVE=1;${sleef_cflags}") elseif ("${o}" MATCHES "sleef_sve256_") nsimd_get_compiler_argument("sve256" buf) list(APPEND buf "-DNSIMD_SVE256;-DENABLE_SVE=1;${sleef_cflags}") elseif ("${o}" MATCHES "sleef_sve512_") nsimd_get_compiler_argument("sve512" buf) list(APPEND buf "-DNSIMD_SVE512;-DENABLE_SVE=1;${sleef_cflags}") elseif ("${o}" MATCHES "sleef_sve1024_") nsimd_get_compiler_argument("sve1024" buf) list(APPEND buf "-DNSIMD_SVE1024;-DENABLE_SVE=1;${sleef_cflags}") elseif ("${o}" MATCHES "sleef_sve2048_") nsimd_get_compiler_argument("sve2048" buf) list(APPEND buf "-DNSIMD_SVE2048;-DENABLE_SVE=1;${sleef_cflags}") elseif ("${o}" MATCHES "sleef_vmx_") nsimd_get_compiler_argument("vmx" buf) list(APPEND buf "-DNSIMD_VMX;-DENABLE_VSX=1;${sleef_cflags}") elseif ("${o}" MATCHES "sleef_vsx_") nsimd_get_compiler_argument("vsx" buf) list(APPEND buf "-DNSIMD_VSX;-DENABLE_VSX=1;${sleef_cflags}") else() set(buf "") endif() if (NOT "${buf}" STREQUAL "") target_compile_options(${o} PUBLIC "${buf}") endif() list(APPEND NSIMD_LIB_DEPS "$") endforeach() set(NSIMD_LIB_TARGET "nsimd_${simd}") add_library(${NSIMD_LIB_TARGET} SHARED 
${NSIMD_LIB_DEPS}) # ----------------------------------------------------------------------------- # Installation stuff if (WIN32) install(TARGETS ${NSIMD_LIB_TARGET} RUNTIME DESTINATION lib ARCHIVE DESTINATION lib) else() install(TARGETS ${NSIMD_LIB_TARGET} LIBRARY DESTINATION lib) endif() install(DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/include/nsimd DESTINATION include) ================================================ FILE: CONTRIBUTING.md ================================================ ## How to Contribute to `nsimd`? You are welcome to contribute to `nsimd`. This document gives some details on how to add/wrap new intrinsics. When you have finished fixing some bugs or adding some new features, please make a pull request. One of our repository maintainer will then merge or comment the pull request. ## Prerequisites - Respect the philosophy of the library (see [index](index.md).) - Basic knowledge of Python 3. - Good knowledge of C. - Good knowledge of C++. - Good knowledge of SIMD programming. ## How Do I Add Support for a New Intrinsic? ### Introduction `nsimd` currently supports the following architectures: - `CPU`: + `CPU` called `CPU` in source code. This "extension" is not really one as it is only present so that code written with `nsimd` can compile and run on targets not supported by `nsimd` or with no SIMD. - Intel: + `SSE2` called `SSE2` in source code. + `SSE4.2` called `SSE42` in source code. + `AVX` called `AVX` in source code. + `AVX2` called `AVX2` in source code. + `AVX-512` as found on KNLs called `AVX512_KNL` in source code. + `AVX-512` as found on Xeon Skylake CPUs called `AVX512_SKYLAKE` in source code. - Arm + `NEON` 128 bits as found on ARMv7 CPUs called `NEON128` in source code. + `NEON` 128 bits as found on Aarch64 CPUs called `AARCH64` in source code. + `SVE` called `SVE` in source code. + `SVE` 128 bits known at compiled time called `SVE128` in source code. + `SVE` 256 bits known at compiled time called `SVE256` in source code. 
+ `SVE` 512 bits known at compiled time called `SVE512` in source code. + `SVE` 1024 bits known at compiled time called `SVE1024` in source code. + `SVE` 2048 bits known at compiled time called `SVE2048` in source code. - IBM POWERPC + `VMX` 128 bits as found on POWER6 CPUs called `VMX` in source code. + `VSX` 128 bits as found on POWER7/8 CPUs called `VSX` in source code. - NVIDIA + `CUDA` called `CUDA` in source code - AMD + `ROCm` called `ROCM` in source code - Intel oneAPI + `oneAPI` called `ONEAPI` in source code `nsimd` currently supports the following types: - `i8`: signed integers over 8 bits (usually `signed char`), - `u8`: unsigned integers over 8 bits (usually `unsigned char`), - `i16`: signed integers over 16 bits (usually `short`), - `u16`: unsigned integers over 16 bits (usually `unsigned short`), - `i32`: signed integers over 32 bits (usually `int`), - `u32`: unsigned integers over 32 bits (usually `unsigned int`), - `i64`: signed integers over 64 bits (usually `long`), - `u64`: unsigned integers over 64 bits (usually `unsigned long`), - `f16`: floating point numbers over 16 bits in IEEE format called `float16` in the rest of this document (), - `f32`: floating point numbers over 32 bits (usually `float`) - `f64`: floating point numbers over 64 bits (usually `double`), As C and C++ do not support `float16`, `nsimd` provides its own types to handle them. Therefore special care has to be taken when implementing intrinsics/operators on architecures that do not natively supports them. We will make the following misuse of language in the rest of this document. The type taken by intrinsics is of course a SIMD vector and more precisely a SIMD vector of chars or a SIMD vector of `short`s or a SIMD vector of `int`s… Therefore when we will talk about an intrinsic, we will say that it takes type `T` as arguments when it takes in fact a SIMD vector of `T`. 
### Our imaginary intrinsic We will add support to the library for the following imaginary intrinsic: given a SIMD vector, suppose that this intrisic called `foo` takes each element `x` of the vector and compute `1 / (1 - x) + 1 / (1 - x)^2`. Moreover suppose that hardware vendors all propose this intrisic only for floatting point numbers as follows: - CPU (no intrinsics is given of course in standard C and C++) - Intel (no intrinsics is given for `float16`s) + `SSE2`: no intrinsics is provided. + `SSE42`: `_mm_foo_ps` for `float`s and `_mm_foo_pd` for `double`s. + `AVX`: no intrinsics is provided. + `AVX2`: `_mm256_foo_ps` for `float`s and `_mm256_foo_pd` for `double`s. + `AVX512_KNL`: no intrinsics is provided. + `AVX512_SKYLAKE`: `_mm512_foo_ps` for `float`s and `_mm512_foo_pd` for `double`s. - ARM + `NEON128`: `vfooq_f16` for `float16`s, `vfooq_f32` for `float`s and no intrinsics for `double`s. + `AARCH64`: same as `NEON128` but `vfooq_f64` for doubles. + `SVE`: `svfoo_f16`, `svfoo_f32` and `svfoo_f64` for respectively `float16`s, `float`s and `double`s. + `SVE128`: `svfoo_f16`, `svfoo_f32` and `svfoo_f64` for respectively `float16`s, `float`s and `double`s. + `SVE256`: `svfoo_f16`, `svfoo_f32` and `svfoo_f64` for respectively `float16`s, `float`s and `double`s. + `SVE512`: `svfoo_f16`, `svfoo_f32` and `svfoo_f64` for respectively `float16`s, `float`s and `double`s. + `SVE1024`: `svfoo_f16`, `svfoo_f32` and `svfoo_f64` for respectively `float16`s, `float`s and `double`s. + `SVE2048`: `svfoo_f16`, `svfoo_f32` and `svfoo_f64` for respectively `float16`s, `float`s and `double`s. - IBM POWERPC + `VMX`: `vec_foo` for `float`s and no intrinsics for `double`s. + `VSX`: `vec_foo` for `float`s and `double`s. - NVIDIA + `CUDA`: no intrinsics is provided. - AMD + `ROCM`: no intrinsics is provided. - Intel oneAPI + `ONEAPI`: no intrinsics is provided. First thing to do is to declare this new intrinsic to the generation system. 
A lot of work is done by the generation system such as generating all functions signatures for C and C++ APIs, tests, benchmarks and documentation. Of course the default documentation does not say much but you can add a better description. ### Registering the intrinsic (or operator) A function or an intrinsic is called an operator in the generation system. Go at the bottom of `egg/operators.py` and add the following just after the `Rsqrt11` class. ```python class Foo(Operator): full_name = 'foo' signature = 'v foo v' types = common.ftypes domain = Domain('R\{1}') categories = [DocBasicArithmetic] ``` This little class will be processed by the generation system so that operator `foo` will be available for the end-user of the library in both C and C++ APIs. Each member of this class controls how the generation is be done: - `full_name` is a string containing the human readable name of the operator. If not given, the class name will be taken for it. - `signature` is a string describing what kind of arguments and how many takes the operator. This member is mandatory and must respect the following syntax: `return_type name_of_operator arg1_type arg2_type ...` where `return_type` and the `arg*_type` can be taken from the following list: + `v ` SIMD vector parameter + `vx2 ` Structure of 2 SIMD vector parameters + `vx3 ` Structure of 3 SIMD vector parameters + `vx4 ` Structure of 4 SIMD vector parameters + `l ` SIMD vector of logicals parameter + `s ` Scalar parameter + `* ` Pointer to scalar parameter + `c* ` Pointer to const scalar parameter + `_ ` void (only for return type) + `p ` Parameter (integer) In our case `v foo v` means that `foo` takes one SIMD vector as argument and returns a SIMD vector as output. Several signatures will be generated for this intrinsic according to the types it can supports. In our case the intrinsic only support floatting point types. - `types` is a Python list indicating which types are supported by the intrinsic. 
If not given, the intrinsic is supposed to support all types. Some Python lists are predefined to help the programmer: + `ftypes = ['f64', 'f32', 'f16'] ` All floating point types + `ftypes_no_f16 = ['f64', 'f32'] ` + `itypes = ['i64', 'i32', 'i16', 'i8'] ` All signed integer types + `utypes = ['u64', 'u32', 'u16', 'u8'] ` All unsigned integer types + `iutypes = itypes + utypes` + `types = ftypes + iutypes` - `domain` is a string indicating the mathematical domain of definition of the operator. This helps for benchmarks and tests for generating random numbers as inputs in the correct interval. In our case `R\{1}` means all real numbers (of course all floating point numbers) except `1` for which the operator cannot be computed. For examples see how other operators are defined in `egg/operators.py`. - `categories` is a list of Python classes that indicates the generation system to which categories `foo` belongs. The list of available categories is as follows: + `DocShuffle ` for Shuffle functions + `DocTrigo ` for Trigonometric functions + `DocHyper ` for Hyperbolic functions + `DocExpLog ` for Exponential and logarithmic functions + `DocBasicArithmetic ` for Basic arithmetic operators + `DocBitsOperators ` for Bits manipulation operators + `DocLogicalOperators ` for Logicals operators + `DocMisc ` for Miscellaneous + `DocLoadStore ` for Loads & stores + `DocComparison ` for Comparison operators + `DocRounding ` for Rounding functions + `DocConversion ` for Conversion operators If no category corresponds to the operator you want to add to `nsimd` then feel free to create a new category (see the bottom of this document) Many other members are supported by the generation system. We describe them quickly here and will give more details in a later version of this document. Default values are given in square brackets: - `cxx_operator [= None]` in case the operator has a corresponding C++ operator. 
- `autogen_cxx_adv [= True]` in case the C++ advanced API signatures for this operator must not be auto-generated. - `output_to [= common.OUTPUT_TO_SAME_TYPE]` in case the operator output type differs from its input type. Possible values are: + `OUTPUT_TO_SAME_TYPE`: output is of same type as input. + `OUTPUT_TO_SAME_SIZE_TYPES`: output can be any type of same bit size. + `OUTPUT_TO_UP_TYPES`: output can be any type of bit size twice the bit bit size of the input. In this case the input type will never be a 64-bits type. + `OUTPUT_TO_DOWN_TYPES`: output can be any type of bit size half the bit bit size of the input. In this case the input type will never be a 8-bits type. - `src [= False]` in case the code must be compiled in the library. - `load_store [= False]` in case the operator loads/store data from/to memory. - `do_bench [= True]` in case benchmarks for the operator must not be auto-generated. - `desc [= '']` description (in Markdown format) that will appear in the documentation for the operator. - `bench_auto_against_cpu [= True]` for auto-generation of benchmark against `nsimd` CPU implementation. - `bench_auto_against_mipp [= False]` for auto-generation of benchmark against the MIPP library. - `bench_auto_against_sleef [= False]` for auto-generation of benchmark against the Sleef library. - `bench_auto_against_std [= False]` for auto-generation of benchmark against the standard library. - `tests_mpfr [= False]` in case the operator has an MPFR counterpart for comparison, then test the correctness of the operator against it. - `tests_ulps [= False]` in case the auto-generated tests has to compare ULPs (). - `has_scalar_impl [= True]` in case the operator has a CPU scalar and GPU implementation. ### Implementing the operator Now that the operator is registered, all signatures will be generated but the implemenatations will be missing. 
Type ```sh python3 egg/hatch.py -lf ``` and the following files (among many other) should appear: - `include/nsimd/cpu/cpu/foo.h` - `include/nsimd/x86/sse2/foo.h` - `include/nsimd/x86/sse42/foo.h` - `include/nsimd/x86/avx/foo.h` - `include/nsimd/x86/avx2/foo.h` - `include/nsimd/x86/avx512_knl/foo.h` - `include/nsimd/x86/avx512_skylake/foo.h` - `include/nsimd/arm/neon128/foo.h` - `include/nsimd/arm/aarch64/foo.h` - `include/nsimd/arm/sve/foo.h` - `include/nsimd/arm/sve128/foo.h` - `include/nsimd/arm/sve256/foo.h` - `include/nsimd/arm/sve512/foo.h` - `include/nsimd/arm/sve1024/foo.h` - `include/nsimd/arm/sve2048/foo.h` - `include/nsimd/ppc/vmx/foo.h` - `include/nsimd/ppc/vsx/foo.h` They each correspond to the implementations of the operator for each supported architectures. When openening one of these files the implementations in plain C and then in C++ (falling back to the C function) should be there but all the C implementations are reduced to `abort();`. This is the default when none is provided. Note that the "cpu" architecture is just a fallback involving no SIMD at all. This is used on architectures not supported by `nsimd` or when the architectures does not offer any SIMD. Providing implementations for `foo` is done by completing the following Python files: - `egg/platform_cpu.py` - `egg/platform_x86.py` - `egg/platform_arm.py` - `egg/platform_ppc.py` - `egg/scalar.py` - `egg/cuda.py` - `egg/hip.py` - `egg/oneapi.py` The idea is to produce plain C (not C++) code using Python string format. Each of the Python files provides some helper functions to ease as much as possible the programmer's job. But every file provides the same "global" variables available in every functions and is designed in the same way: 1. At the bottom of the file is the `get_impl` function taking the following arguments: + `func ` the name of the operator the system is currently auto-generating. + `simd_ext ` the SIMD extension for which the system wants the implemetation. 
+ `from_typ ` the input type of the argument that will be passed to the operator. + `to_typ ` the output type produced by the operator. 2. Inside this function lies a Python dictionary that provides functions implementing each operator. The string containing the C code for the implementations can be put here directly but usually the string is returned by a Python function that is written above in the same file. 3. At the top of the file lies helper functions that helps generating code. This is specific to each architecture. Do not hesitate to look at it. Let's begin by the `cpu` implementations. It turns out that there is no SIMD extension in this case, and by convention, `simd_ext == 'cpu'` and this argument can therefore be ignored. So we first add an entry to the `impls` Python dictionary of the `get_impl` function: ```python impls = { ... 'reverse': reverse1(from_typ), 'addv': addv(from_typ), 'foo': foo1(from_typ) # Added at the bottom of the dictionary } if simd_ext != 'cpu': raise ValueError('Unknown SIMD extension "{}"'.format(simd_ext)) ... ``` Then, above in the file we write the Python function `foo1` that will provide the C implementation of operator `foo`: ```python def foo1(typ): return func_body( '''ret.v{{i}} = ({typ})1 / (({typ})1 - {in0}.v{{i}}) + ({typ})1 / ((({typ})1 - {in0}.v{{i}}) * (({typ})1 - {in0}.v{{i}}));'''. \ format(**fmtspec), typ) ``` First note that the arguments names passed to the operator in its C implementation are not known in the Python side. Several other parameters are not known or are cumbersome to find out. Therefore each function has access to the `fmtspec` Python dictionary that hold some of these values: - `in0`: name of the first parameter for the C implementation. - `in1`: name of the second parameter for the C implementation. - `in2`: name of the third parameter for the C implementation. - `simd_ext`: name of the SIMD extension (for the cpu architecture, this is equal to `"cpu"`). - `from_typ`: type of the input. 
- `to_typ`: type of the output. - `typ`: equals `from_typ`, shorter to write as usually `from_typ == to_typ`. - `utyp`: bitfield type of the same size of `typ`. - `typnbits`: number of bits in `typ`. The CPU extension can emulate 64-bits or 128-bits wide SIMD vectors. Each type is a struct containing as much members as necessary so that `sizeof(T) * (number of members) == 64 or 128`. In order to avoid the developper to write two cases (64-bits wide and 128-bits wide) the `func_body` function is provided as a helper. Note that the index `{{i}}` is in double curly brackets to go through two Python string formats: 1. The first pass is done within the `foo1` Python function and replaces `{typ}` and `{in0}`. In this pass `{{i}}` is formatted into `{i}`. 2. The second pass is done by the `func_body` function which unrolls the string to the necessary number and replace `{i}` by the corresponding number. The produced C code will look like one would written the same statement for each members of the input struct. Then note that as plain C (and C++) does not support native 16-bits wide floating point types `nsimd` emulates it with a C struct containing 4 floats (32-bits swide floatting point numbers). In some cases extra care has to be taken to handle this type. For each SIMD extension one can find a `types.h` file (for `cpu` the files can be found in `include/nsimd/cpu/cpu/types.h`) that declares all SIMD types. If you have any doubt on a given type do not hesitate to take a look at this file. Note also that this file is auto-generated and is therefore readable only after a successfull first `python3 egg/hatch -Af`. Now that the `cpu` implementation is written, you should be able to write the implementation of `foo` for other architectures. Each architecture has its particularities. We will cover them now by providing directly the Python implementations and explaining in less details. 
Finally note that `clang-format` is called by the generation system to autoformat produced C/C++ code. Therefore prefer indenting C code strings within the Python according to Python indentations, do not write C code beginning at column 0 in Python files. ### For Intel ```python def foo1(simd_ext, typ): if typ == 'f16': return '''nsimd_{simd_ext}_vf16 ret; ret.v1 = {pre}foo_ps({in0}.v1); ret.v2 = {pre}foo_ps({in0}.v2); return ret;'''.format(**fmtspec) if simd_ext == 'sse2': return emulate_op1('foo', 'sse2', typ) if simd_ext in ['avx', 'avx512_knl']: return split_opn('foo', simd_ext, typ, 1) return 'return {pre}foo{suf}({in0});'.format(**fmtspec) ``` Here are some notes concerning the Intel implementation: 1. `float16`s are emulated with two SIMD vectors of `float`s. 2. When the intrinsic is provided by Intel one can access it easily by constructing it with `{pre}` and `{suf}`. Indeed all Intel intrinsics names follow a pattern with a prefix indicating the SIMD extension and a suffix indicating the type of data. As for `{in0}`, `{pre}` and `{suf}` are provided and contain the correct values with respect to `simd_ext` and `typ`, you do not need to compute them yourself. 3. When the intrinsic is not provided by Intel then one has to use tricks. + For `SSE2` one can use complete emulation, that is, putting the content of the SIMD vector into a C-array, working on it with a simple for loop and loading back the result into the resulting SIMD vector. As said before a lot of helper functions are provided and the `emulate_op1` Python function avoid writing by hand this for-loop emulation. + For `AVX` and `AVX512_KNL`, one can fallback to the "lower" SIMD extension (`SSE42` for `AVX` and `AVX2` for `AVX512_KNL`) by splitting the input vector into two smaller vectors belonging to the "lower" SIMD extension. In this case again the tedious and cumbersome work is done by the `split_opn` Python function. 4. 
Do not forget to add the `foo` entry to the `impls` dictionary in the `get_impl` Python function. ### For ARM ```python def foo1(simd_ext, typ): ret = f16f64(simd_ext, typ, 'foo', 'foo', 1) if ret != '': return ret if simd_ext in neon: return 'return vfooq_{suf}({in0});'.format(**fmtspec) else: return 'return svfoo_{suf}_z({svtrue}, {in0});'.format(**fmtspec) ``` Here are some notes concerning the ARM implementation: 1. `float16`s can be natively supported but this is not mandatory. 2. On 32-bits ARM chips, intrinsics on `double` almost never exist. 3. The Python helper function `f16f64` hides a lot of details concerning the above two points. If the function returns a non-empty string then it means that the returned string contains C code to handle the case given by the pair `(simd_ext, typ)`. We advise you to look at the generated C code. You will see the `nsimd_FP16` macro used. When defined it indicates that `nsimd` is compiled with native `float16` support. This also affects SIMD types (see `nsimd/include/arm/*/types.h`.) 4. Do not forget to add the `foo` entry to the `impls` dictionary in the `get_impl` Python function. ### For IBM POWERPC ```python def foo1(simd_ext, typ): if has_to_be_emulated(simd_ext, typ): return emulation_code('foo', simd_ext, typ, ['v', 'v']) else: return 'return vec_foo({in0});'.format(**fmtspec) ``` Here are some notes concerning the PPC implementation: 1. For VMX, intrinsics on `double` almost never exist. 2. The Python helper function `has_to_be_emulated` returns `True` when the implementation of `foo` concerns float16 or `double`s for `VMX`. When this function returns True you can then use `emulation_code`. 3. The `emulation_code` function returns a generic implementation of an operator. However this implementation is not suitable for every operator and the programmer has to take care of that. 4. Do not forget to add the `foo` entry to the `impls` dictionary in the `get_impl` Python function. 
### The scalar CPU version ```python def foo1(func, typ): normal = \ 'return ({typ})(1 / (1 - {in0}) + 1 / ((1 - {in0}) * (1 - {in0})));' if typ == 'f16': return \ '''#ifdef NSIMD_NATIVE_FP16 {normal} #else return nsimd_f32_to_f16({normal_fp16}); #endif'''. \ format(normal=normal.format(**fmtspec), normal_fp16=normal.format(in0='nsimd_f16_to_f32({in0})')) else: return normal.format(**fmtspec) ``` The only caveat for the CPU scalar implementation is to handle float16 correctly. The easiest way to do it is to have the same implementation as float32 but replacing `{in0}`'s by `nsimd_f16_to_f32({in0})`'s and converting back the float32 result to a float16. ### The GPU versions The GPU generator Python files `cuda.py`, `rocm.py` and `oneapi.py` are a bit different from the other files but it is easy to find where to add the relevant pieces of code. Note that as ROCm syntax is fully compatible with CUDA's, one only needs to modify the `cuda.py` file, while `oneapi.py` is easy to understand. The code to add for float32's is as follows to be added inside the `get_impl` Python function. ```python return '1 / (1 - {in0}) + 1 / ((1 - {in0}) * (1 - {in0}))'.format(**fmtspec) ``` The code for CUDA and ROCm to add for float16's is as follows. It has to be added inside the `get_impl_f16` Python function. ```python arch53_code = '''__half one = __float2half(1.0f); return __hadd( __hdiv(one, __hsub(one, {in0})), __hmul( __hdiv(one, __hsub(one, {in0})), __hdiv(one, __hsub(one, {in0})) ) );'''.format(**fmtspec) ``` As Intel oneAPI natively supports float16's the code is the same as the one for floats: ```python return '1 / (1 - {in0}) + 1 / ((1 - {in0}) * (1 - {in0}))'.format(**fmtspec) ``` ### Implementing the test for the operator Now that we have written the implementations for the `foo` operator we must write the corresponding tests. For tests all generations are done by `egg/gen_tests.py`. Writing tests is simpler. 
The intrinsic that we just implemented can be tested by an already-written test pattern code, namely by the `gen_test` Python function. Here is how the `egg/gen_tests.py` is organized: 1. The entry point is the `doit` function located at the bottom of the file. 2. In the `doit` function a dispatching is done according to the operator that is to be tested. All operators cannot be tested by the same C/C++ code. The reading of all different kinds of tests is rather easy and we are not going through all the code in this document. 3. All Python functions generating test code begin with the following: ```python filename = get_filename(opts, op, typ, lang) if filename == None: return ``` This must be the case for newly created functions. The `get_filename` function ensures that the file must be created with respect to the command line options given to the `egg/hatch.py` script. Then note that to output to a file the Python function `open_utf8` must be used to handle Windows and to automatically put the MIT license at the beginning of generated files. 4. Tests must be written for the C base API, the C++ base API and the C++ advanced API. If you need to create a new kind of tests then the best way is to copy-paste the Python function that produces the test that most resembles the test you want. Then modify the new function to suit your needs. Here is a quick overview of Python functions present in the `egg/gen_tests.py` file: - `gen_nbtrue`, `gen_adv`, `gen_all_any` generate tests for reduction operators. - `gen_reinterpret_convert` generates tests for non closed operators. - `gen_load_store` generates tests for load/store operators. - `gen_reverse` generates tests for one type of shuffle but can be extended for other kinds of shuffles. - `gen_test` generates tests for "standard" operators, typically those that do some computations. This is the kind of tests that can handle our `foo` operator and therefore nothing has to be done on our part. 
## Not all tests are to be done As explained above, doing all tests is not recommended. Take for example the `cvt` operator. Testing `cvt` from say `f32` to `i32` is complicated as the result depends on how NaN, infinities are handled and on the current rounding mode. In turn these parameters depend on the vendor, the chip, the bugs in the chip, the chosen rounding mode by users or other software... The function `should_i_do_the_test` gives a hint on whether to implement the test or not. Its code is really simple and you may need to modify it. The listing below is a possible implementation that takes care of the case described in the previous paragraph. ```python def should_i_do_the_test(operator, tt='', t=''): if operator.name == 'cvt' and t in common.ftypes and tt in common.iutypes: # When converting from float to int to float then we may not # get the initial result because of roundings. As tests are usually # done by going back and forth then both directions get tested in the # end return False if operator.name == 'reinterpret' and t in common.iutypes and \ tt in common.ftypes: # When reinterpreting from int to float we may get NaN or infinities # and no one knows what this will give when going back to ints # especially when float16 are emulated. Again as tests are done by # going back and forth both directions get tested in the end. return False if operator.name in ['notb', 'andb', 'andnotb', 'xorb', 'orb'] and \ t == 'f16': # Bit operations on float16 are hard to check because they are # emulated in most cases. Therefore going back and forth with # reinterprets for doing bitwise operations makes the bit in the last # place wrong. This is normal but makes testing really hard. So for # now we do not test them on float16. 
return False if operator.name in ['len', 'set1', 'set1l', 'mask_for_loop_tail', 'loadu', 'loada', 'storeu', 'storea', 'loadla', 'loadlu', 'storela', 'storelu', 'if_else1']: # These functions are used in almost every test so we consider # that they are extensively tested. return False if operator.name in ['store2a', 'store2u', 'store3a', 'store3u', 'store4a', 'store4u', 'scatter', 'scatter_linear', 'downcvt', 'to_logical']: # These functions are tested along with their load counterparts. # downcvt is tested along with upcvt and to_logical is tested with # to_mask return False return True ``` ### Conclusion At first sight the implementation of `foo` seems complicated because intrinsics for all types and all architectures are not provided by vendors. But `nsimd` provides a lot of helper functions and tries to put away details so that wrapping intrinsics is quickly done and easy, the goal is that the programmer can concentrate on the implementation itself. But be aware that more complicated tricks can be implemented. Browse through a `platform_*.py` file to see what kind of tricks are used and how they are implemented. ## How do I add a new category? Adding a category is much simpler than adding an operator. It suffices to add a class with only one member named `title` as follows: ```python class DocMyCategoryName(DocCategory): title = 'My category name functions' ``` The class must inherit from the `DocCategory` class and its name must begin with `Doc`. The system will then take it into account, generate the entry in the documentation and so on. ## How do I add a new module? A module is a set of functionalities that makes sense to be provided alongside NSIMD but that cannot be part of NSIMD's core. Therefore it is not mandatory to provide all C and C++ API versions or to support all operators. For what follows let's call the module we want to implement `mymod`. 
Include files (written by hand or generated by Python) must be placed into the `nsimd/include/nsimd/modules/mymod` directory and a master header file must be placed at `nsimd/include/nsimd/modules/mymod.h`. You are free to organize the `nsimd/include/nsimd/modules/mymod` folder as you see fit. Your module has to be found by the NSIMD generation system. For this you must create the `nsimd/egg/modules/mymod` directory and `nsimd/egg/modules/mymod/hatch.py` file. The latter must expose the following functions: - `def name()` Return a human readable module name beginning with an uppercase letter. - `def desc()` Return a small description of 4-5 lines of text for the module. This text will appear in the `modules.md` file that lists all the available modules. - `def doc_menu()` Return a Python dictionary containing the menu for when the generation system produces the HTML pages of documentation for the module. The entry markdown file must be `nsimd/doc/markdown/module_mymod_overview.md` for module documentation. Then if your module has no other documentation pages this function can simply return `dict()`. Otherwise it has to return `{'menu_label': 'filename_suffix', ...}` where `menu_label` is a menu entry to be displayed and pointing to `nsimd/doc/markdown/module_mymod_filename_suffix.md`. Several functions in `egg/common.py` (`import common`) have to be used to ease crafting documentation page filenames: + `def get_markdown_dir(opts)` Return the folder into which markdown for documentation has to be put. + `def get_markdown_file(opts, name, module='')` Return the filename to be passed to the `common.open_utf8` function. The `name` argument acts as a suffix as explained above while the `module` argument is the name of the module. - `def doit(opts)` Is the real entry point of the module. This function has the responsibility to generate all the code for your module. It can of course import all Python files from NSIMD and take advantage of the `operators.py` file. 
To respect the switches passed by the user at command line it is recommended to write this function as follows. ```python def doit(opts): common.myprint(opts, 'Generating module mymod') if opts.library: gen_module_headers(opts) if opts.tests: gen_tests(opts) if opts.doc: gen_doc(opts) ``` Tests for the module have to be put into the `nsimd/tests/mymod` directory. ## How do I add a new platform? The list of supported platforms is determined by looking in the `egg` directory and listing all `platform_*.py` files. Each file must contain all SIMD extensions for a given platform. For example the default (no SIMD) is given by `platform_cpu.py`. All the Intel SIMD extensions are given by `platform_x86.py`. Each Python file that implements a platform must be named `platform_[name for platform].py` and must export at least the following functions: - `def get_simd_exts()` Return the list of SIMD extensions implemented by this file as a Python list. - `def get_prev_simd_ext(simd_ext)` Usually SIMD extensions are added over time by vendors and a chip implementing a SIMD extension supports the previous SIMD extensions. This function must return the previous SIMD extension supported by the vendor if it exists otherwise it must return the empty string. Note that `cpu` is the only SIMD extension that has no previous SIMD extension. Every other SIMD extension has at least `cpu` as previous SIMD extension. - `def get_native_typ(simd_ext, typ)` Return the native SIMD type corresponding to the SIMD extension `simd_ext` whose elements are of type `typ`. If `typ` or `simd_ext` is not known then a ValueError exception must be raised. - `def get_type(simd_ext, typ)` Returns the "intrinsic" SIMD type corresponding to the given arithmetic type. If `typ` or `simd_ext` is not known then a ValueError exception must be raised. - `def get_additional_include(func, simd_ext, typ)` Returns additional includes if need be for the implementation of `func` for the given `simd_ext` and `typ`. 
- `def get_logical_type(simd_ext, typ)` Returns the "intrinsic" logical SIMD type corresponding to the given arithmetic type. If `typ` or `simd_ext` is not known then a ValueError exception must be raised. - `def get_nb_registers(simd_ext)` Returns the number of registers for this SIMD extension. - `def get_impl(func, simd_ext, from_typ, to_typ)` Returns the implementation (C code) for `func` on type `typ` for `simd_ext`. If `typ` or `simd_ext` is not known then a ValueError exception must be raised. Any `func` given satisfies `S func(T a0, T a1, ... T an)`. - `def has_compatible_SoA_types(simd_ext)` Returns True iff the given `simd_ext` has structure of arrays types compatible with NSIMD i.e. whose members are v1, v2, ... Returns False otherwise. If `simd_ext` is not known then a ValueError exception must be raised. - `def get_SoA_type(simd_ext, typ, deg)` Returns the structure of arrays types for the given `typ`, `simd_ext` and `deg`. If `simd_ext` is not known or does not name a type whose corresponding SoA types are compatible with NSIMD then a ValueError exception must be raised. - `def emulate_fp16(simd_ext)` Returns True iff the given SIMD extension has to emulate FP16's with two FP32's. Then you are free to implement the SIMd extensions for the platform. See above on how to add the implementations of operators. 
================================================ FILE: LICENSE ================================================ Copyright (c) 2019 Agenium Scale Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. ================================================ FILE: README.md ================================================ Documentation can be found [here](https://agenium-scale.github.io/nsimd/). We put a lot of effort into [testing](https://agenium-scale.github.io/nsimd/how_tests_are_done.html). # What is NSIMD? At its core, NSIMD is a vectorization library that abstracts [SIMD programming](). It was designed to exploit the maximum power of processors at a low development cost. NSIMD comes with modules. As of now two of them adds support for GPUs to NSIMD. The direction that NSIMD is taking is to provide several programming paradigms to address different problems and to allow a wider support of architectures. 
With two of its modules NSIMD provides three programming paradigms: - Imperative programming provided by NSIMD core that supports a lots of CPU/SIMD extensions. - Expressions templates provided by the TET1D module that supports all architectures from NSIMD core and adds support for NVIDIA and AMD GPUs. - Single Program Multiple Data provided by the SPMD module that supports all architectures from NSIMD core and adds support for NVIDIA and AMD GPUs. ## Supported architectures | Architecture | NSIMD core | TET1D module | SPMD module | |:--------------------------------------|:----------:|:------------:|:-----------:| | CPU (scalar functions) | Y | Y | Y | | CPU (128-bits SIMD emulation) | Y | Y | Y | | Intel SSE 2 | Y | Y | Y | | Intel SSE 4.2 | Y | Y | Y | | Intel AVX | Y | Y | Y | | Intel AVX2 | Y | Y | Y | | Intel AVX-512 for KNLs | Y | Y | Y | | Intel AVX-512 for Skylake processors | Y | Y | Y | | Arm NEON 128 bits (ARMv7 and earlier) | Y | Y | Y | | Arm NEON 128 bits (ARMv8 and later) | Y | Y | Y | | Arm SVE (original sizeless SVE) | Y | Y | Y | | Arm fixed sized SVE | Y | Y | Y | | IBM POWERPC VMX | Y | Y | Y | | IBM POWERPC VSX | Y | Y | Y | | NVIDIA CUDA | N | Y | Y | | AMD ROCm | N | Y | Y | | Intel oneAPI | N | Y | Y | ## Contributions | Contributor | Contribution(s) | |:---------------------|:--------------------------------------------------| | Guillaume Quintin | Maintainer + main contributor | | Alan Kelly | Arm NEON + mathematical functions | | Kenny Péou | Fixed point module | | Xavier Berault | PowerPC VMX and VSX | | Vianney Stricher | NSIMD core + oneAPI in SPMD and TET1D modules | | Quentin Khan | Soa/AoS loads and stores | | Paul Gannay | PowerPC VMX, VSX + testing system | | Charly Chevalier | Benchmarking system + Python internals | | Erik Schnetter | Fixes + code generation | | Lénaïc Bagnères | Fixes + TET1D module | | Jean-Didier Pailleux | Shuffles operators | ## How it works? 
To achieve maximum performance, NSIMD mainly relies on the inline optimization pass of the compiler. Therefore using any mainstream compiler such as GCC, Clang, MSVC, XL C/C++, ICC and others with NSIMD will give you a zero-cost SIMD abstraction library. To allow inlining, a lot of code is placed in header files. *Small* functions such as addition, multiplication, square root, etc, are all present in header files whereas big functions such as I/O are put in source files that are compiled as a `.so`/`.dll` library. NSIMD provides C89, C11, C++98, C++11, C++14 and C++20 APIs. All APIs allow writing generic code. For the C API this is achieved through a thin layer of macros and with the `_Generic` keyword for the C advanced API; for the C++ APIs it is achieved using templates and function overloading. The C++ APIs are split into two. The first part is a C-like API with only function calls and direct type definitions for SIMD types while the second one provides operator overloading, higher level type definitions that allows unrolling. C++11, C++14 APIs add for instance templated type definitions and templated constants while the C++20 API uses concepts for better error reporting. Binary compatibility is guaranteed by the fact that only a C ABI is exposed. The C++ API only wraps the C calls. ## Supported compilers NSIMD is tested with GCC, Clang, MSVC, NVCC, HIPCC and ARMClang. As a C89 and a C++98 API are provided, other compilers should work fine. Old compiler versions should work as long as they support the targeted SIMD extension. For instance, NSIMD can compile SSE 4.2 code with MSVC 2010. # Build the library ## CMake As CMake is widely used as a build system, we have added support for building the library only and the corresponding find module. ```sh mkdir build cd build cmake .. 
-Dsimd=SIMD_EXT make make install ``` where `SIMD_EXT` is one of the following: CPU, SSE2, SSE42, AVX, AVX2, AVX512\_KNL, AVX512\_SKYLAKE, NEON128, AARCH64, SVE, SVE128, SVE256, SVE512, SVE1024, SVE2048, VMX, VSX, CUDA, ROCM. Note that when compiling for NEON128 on Linux one has to choose the ABI, either armel or armhf. Default is armel. As CMake is unable to autodetect this parameter one has to tell CMake manually. ```sh cmake .. -Dsimd=neon128 # for armel cmake .. -Dsimd=neon128 -DNSIMD_ARM32_IS_ARMEL=OFF # for armhf ``` We provide in the `scripts` directory a CMake find module to find NSIMD on your system. One can let the module find NSIMD on its own, if several versions for different SIMD extensions of NSIMD are installed then the module will find and return one. There is no guaranty on which versions will be chosen by the module. ```cmake find_package(NSIMD) ``` If one wants a specific version of the library for a given SIMD extension then use the `COMPONENTS` part of `find_package`. Only one component is supported at a time. ```cmake find_package(NSIMD COMPONENTS avx2) # find only NSIMD for Intel AVX2 find_package(NSIMD COMPONENTS sve) # find only NSIMD for Arm SVE find_package(NSIMD COMPONENTS sse2 sse42) # unsupported ``` ## Nsconfig The support for CMake has been limited to building the library only. If you wish to run tests or contribute you need to use nsconfig as CMake has several flaws: - too slow especially on Windows, - inability to use several compilers at once, - inability to have a portable build system, - very poor support for portable compilation flags, - ... ## Dependencies (nsconfig only) Generating C/C++ files is done by the Python3 code contained in the `egg`. Python should be installed by default on any Linux distro. On Windows it comes with the latest versions of Visual Studio on Windows (), you can also download and install it directly from . The Python code can call `clang-format` to properly format all generated C/C++ source. 
On Linux you can install it via your package manager. On Windows you can use the official binary at . Compiling the library requires a C++98 compiler. Any version of GCC, Clang or MSVC will do. Note that the produced library and header files for the end-user are C89, C++98, C++11 compatible. Note that C/C++ files are generated by a bunch of Python scripts and they must be executed first before running building the library. ## Build for Linux ```bash bash scripts/build.sh for simd_ext1/.../simd_extN with comp1/.../compN ``` For each combination a directory `build-simd_ext-comp` will be created and will contain the library. Supported SIMD extension are: - sse2 - sse42 - avx - avx2 - avx512\_knl - avx512\_skylake - neon128 - aarch64 - sve - sve128 - sve256 - sve512 - sve1024 - sve2048 - vmx - vsx - cuda - rocm Supported compiler are: - gcc - clang - icc - armclang - xlc - dpcpp - fcc - cl - nvcc - hipcc Note that certain combination of SIMD extension/compilers are not supported such as aarch64 with icc, or avx512\_skylake with nvcc. ## Build on Windows Make sure you are typing in a Visual Studio prompt. The command is almost the same as for Linux with the same constraints on the pairs SIMD extension/compilers. ```batch scripts\build.bat for simd_ext1/.../simd_extN with comp1/.../compN ``` ## More details on building the library The library uses a tool called nsconfig () which is basically a Makefile translator. If you have just built NSIMD following what's described above you should have a `nstools` directory which contains `bin/nsconfig`. If not you can generate it using on Linux ```bash bash scripts/setup.sh ``` and on Windows ```batch scripts\setup.bat ``` Then you can use `nsconfig` directly it has a syntax similar to CMake at command line. Here is a quick tutorial with Linux command line. We first go to the NSIMD directory and generate both NSIMD and nsconfig. 
```bash $ cd nsimd $ python3 egg/hatch.py -ltf $ bash scripts/setup.sh $ mkdir build $ cd build ``` Help can be displayed using `--help`. ```bash $ ../nstools/bin/nsconfig --help usage: nsconfig [OPTIONS]... DIRECTORY Configure project for compilation. -v verbose mode, useful for debugging -nodev Build system will never call nsconfig -DVAR=VALUE Set value of variable VAR to VALUE -list-vars List project specific variable -GBUILD_SYSTEM Produce files for build system BUILD_SYSTEM Supported BUILD_SYSTEM: make POSIX Makefile gnumake GNU Makefile nmake Microsot Visual Studio NMake Makefile ninja Ninja build file (this is the default) list-vars List project specific variables -oOUTPUT Output to OUTPUT instead of default -suite=SUITE Use compilers from SUITE as default ones Supported SUITE: gcc The GNU compiler collection msvc Microsoft C and C++ compiler llvm The LLVM compiler infrastructure armclang Arm suite of compilers based on LLVM xlc IBM suite of compilers fcc_trad_mode Fujitsu compiler in traditional mode fcc_clang_mode Fujitsu compiler in clang mode emscripten Emscripten suite for compiling into JS icc Intel C amd C++ compiler rocm Radeon Open Compute compilers oneapi Intel oneAPI compilers cuda, cuda+gcc, cuda+clang, cuda+msvc Nvidia CUDA C++ compiler -comp=COMMAND,COMPILER[,PATH[,VERSION[,ARCHI]]] Use COMPILER when COMMAND is invoked for compilation If VERSION and/or ARCHI are not given, nsconfig will try to determine those. This is useful for cross compiling and/or setting the CUDA host compiler. COMMAND must be in { cc, c++, gcc, g++, cl, icc, nvcc, hipcc, hcc, clang, clang++, armclang, armclang++, cuda-host-c++, emcc, em++ } ; VERSION is compiler dependant. Note that VERSION can be set to only major number(s) in which case nsconfig fill missing numbers with zeros. 
Supported ARCHI: x86 Intel 32-bits ISA x86_64 Intel/AMD 64-bits ISA armel ARMv5 and ARMv6 32-bits ISA armhf ARMv7 32-bits ISA aarch64 ARM 64-bits ISA ppc64el PowerPC 64-bits little entian wasm32 WebAssembly with 32-bits memory indexing wasm64 WebAssembly with 64-bits memory indexing Supported COMPILER: gcc, g++ GNU Compiler Collection clang, clang++ LLVM Compiler Infrastructure emcc, em++ Emscripten compilers msvc, cl Microsoft Visual C++ armclang, armclang++ ARM Compiler xlc, xlc++ IBM Compiler icc Intel C/C++ Compiler dpcpp Intel DPC++ Compiler nvcc Nvidia CUDA compiler hipcc ROCm HIP compiler fcc_trad_mode, FCC_trad_mode Fujitsu C and C++ traditionnal compiler fcc_clang_mode, FCC_clang_mode Fujitsu C and C++ traditionnal compiler -prefix=PREFIX Set path for installation to PREFIX -h, --help Print the current help NOTE: Nvidia CUDA compiler (nvcc) needs a host compiler. Usually on Linux systems it is GCC while on Windows systems it is MSVC. If nvcc is chosen as the default C++ compiler via the -suite switch, then its host compiler can be invoked in compilation commands with 'cuda-host-c++'. The latter defaults to GCC on Linux systems and MSVC on Windows systems. The user can of course choose a specific version and path of this host compiler via the '-comp=cuda-host-c++,... parameters. If nvcc is not chosen as the default C++ compiler but is used for compilation then its default C++ host compiler is 'c++'. The latter can also be customized via the '-comp=c++,...' command line switch. ``` Each project can defined its own set of variable controlling the generation of the ninja file of Makefile. ```bash $ ../nstools/bin/nsconfig .. 
-list-vars Project variables list: name | description -----------------|----------------------------------- simd | SIMD extension to use cuda_arch_flags | CUDA target arch flag(s) for tests static_libstdcpp | Compile the libstdc++ statically cpp20_tests | Enable C++20 tests ``` Finally one can choose what to do and compile NSIMD and its tests. ```bash $ ../nstools/bin/nsconfig .. -Dsimd=avx2 $ ninja $ ninja tests ``` Nsconfig comes with nstest a small tool to execute tests. ```bash $ ../nstools/bin/nstest -j20 ``` ## Cross compilation It is useful to cross-compile for example when you are on a Intel workstation and want to compile for a Raspberry Pi. Nsconfig generates some code, compiles and runs it to obtain information on the C or C++ compilers. When cross compiling, unless you configured your Linux box with binfmt\_misc to transparently execute aarch64 binaries on a x86\_64 host you need to give nsconfig all the information about the compilers so that it does not need to run aarch64 code on a x86\_64 host. ```bash $ ../nstools/bin/nsconfig .. -Dsimd=aarch64 \ -comp=cc,gcc,aarch64-linux-gnu-gcc,10.0,aarch64 \ -comp=c++,gcc,aarch64-linux-gnu-g++,10.0,aarch64 ``` ## Defines that control NSIMD compilation and usage Several defines control NSIMD. - `FMA` or `NSIMD_FMA` indicate to NSIMD that fma intrinsics can be used when compiling code. This is useful on Intel SSE2, SSE42, AVX and AVX2. - `FP16` or `NSIMD_FP16` indicate to NSIMD that the targeted architecture natively (and possibly partially) supports IEEE float16's. This is useful when compiling for Intel SSE2, SSE42, AVX and AVX2, Arm NEON128 and AARCH64. # Philosophy of NSIMD Originally the library aimed at providing a portable zero-cost abstraction over SIMD vendor intrinsics disregarding the underlying SIMD vector length. NSIMD will of course continue to wrap SIMD intrinsics from various vendors but more efforts will be put into writing NSIMD modules and improving the existing ones especially the SPMD module. 
## The SPMD paradigm It is our belief that SPMD is a good paradigm for writing vectorized code. It helps both the developer and the compiler writer. It forces the developers to better arrange their data in memory, in a way more suited for vectorization. On the compiler side it is simpler to write a "SPMD compiler" than a standard C/C++/Fortran compiler that tries to autovectorize some weird loop with data scattered all around the place. Our priorities for our SPMD module are the following: - Add oneAPI/SYCL support. - Provide a richer API. - Provide cross-lane data transfer. - Provide a way to abstract shared memory. Our approach can be roughly compared to ISPC (<https://ispc.github.io/>) but from a library point of view. ## Wrapping intrinsics in NSIMD core NSIMD was designed following as closely as possible the following guidelines: - Correctness takes precedence over speed except for corner cases which may include the following: + Buggy intrinsics on rare input values (denormal numbers, infinities, NaNs) in which case a slower but correct alternative may be proposed to bypass the buggy intrinsics. + A buggy intrinsic, but only for a specific version of a family of chips. It would be unreasonable to penalize the majority of users vs. a few (or even no) users. - Emulate with tricks and intrinsic integer arithmetic when not available. - Use common names as found in common computation libraries. - Do not hide SIMD registers: one variable (of a type such as `nsimd::pack`) matches one register. When possible force the user to think differently between SIMD code and scalar code. - Make the life of the compiler as easy as possible: keep the code simple to allow the compiler to perform as many optimizations as possible. - Favor the advanced C++ API. You may wrap intrinsics that require compile time knowledge of the underlying vector length but this should be done with caution. Wrapping intrinsics that do not exist for all types is difficult and may require casting or emulation.
For instance, 8 bit integer vector multiplication using SSE2 does not exist. We can either process each pair of integers individually or we can cast the 8 bit vectors to 16 bit vectors, do the multiplication and cast them back to 8 bit vectors. In the second case, chaining operations will generate many unwanted casts. To avoid hiding important details from the user, overloads of operators involving scalars and SIMD vectors are not provided by default. Those can be included explicitly to emphasize the fact that using expressions like `scalar + vector` might incur an optimization penalty. The use of `nsimd::pack` may not be portable to ARM SVE and therefore must be included manually. ARM SVE registers can only be stored in sizeless structs (`__sizeless_struct`). This feature (as of 2019/04/05) is only supported by the ARM compiler. We do not know whether other compilers will use the same keyword or paradigm to support SVE intrinsics. # Contributing to NSIMD The wrapping of intrinsics, the writing of test and bench files are tedious and repetitive tasks. Most of those are generated using Python scripts that can be found in `egg`. - Intrinsics that do not require knowing the vector length can be wrapped and will be accepted with no problem. - Intrinsics that do require the vector length at compile time can be wrapped but it is up to the maintainer to accept it. - Use `clang-format` when writing C or C++ code. - The `.cpp` files are written in C++98. - The header files must be compatible with C89 (when possible otherwise C99), C++98, C++11, C++14 up to and including C++20. Please see [CONTRIBUTING.md](CONTRIBUTING.md) for more details. # LICENSES NSIMD contains files from the excellent [Sleef library](https://sleef.org/) whose license is stated below. The corresponding files are all located in the `src` folder and have retained their original license notices.
## NSIMD license Copyright (c) 2021 Agenium Scale Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
## Sleef license ([Boost Software License v1.0](https://www.boost.org/LICENSE_1_0.txt)) Boost Software License - Version 1.0 - August 17th, 2003 Permission is hereby granted, free of charge, to any person or organization obtaining a copy of the software and accompanying documentation covered by this license (the "Software") to use, reproduce, display, distribute, execute, and transmit the Software, and to prepare derivative works of the Software, and to permit third-parties to whom the Software is furnished to do so, all subject to the following: The copyright notices in the Software and this entire statement, including the above license grant, this restriction and the following disclaimer, must be included in all copies of the Software, in whole or in part, and all derivative works of the Software, unless such copies or derivative works are solely in the form of machine-executable object code generated by a source language processor. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. ================================================ FILE: benches/benches.hpp ================================================ #ifndef BENCHES_HPP #define BENCHES_HPP #include #include #include namespace nsimd { namespace benches { template double rand_sign() { if (std::is_unsigned::value) { return 1.; } else { return (::rand() % 2) ? 1. 
: -1.; } } template T rand_bits(T min, T max = std::numeric_limits::max()) { T r; do { int nbits = sizeof(T) * CHAR_BIT; u64 x = 0; for (int i = 0; i < nbits; ++i) { x |= u64(::rand() % 2) << i; } r = *((T*)&x); } while (r < min || r > max); return r; } template T rand_from(T min, T max = std::numeric_limits::max()) { // From: http://c-faq.com/lib/randrange.html return T(double(min) + (double(::rand()) / (double(RAND_MAX) / (double(max) - double(min) + 1)))); } template T rand_fp(T min, T max) { T r; if (std::isinf(min) && std::isinf(max)) { // For now, we're not using this method for random number //r = rand_bits(min, max); r = rand_from(-1000000, 1000000); } else { r = rand_from(min, max); } return r; } template T rand(T min, T max = std::numeric_limits::max()) { return rand_from(min, max); } template <> float rand(float min, float max) { return rand_fp(min, max); } template <> double rand(double min, double max) { return rand_fp(min, max); } } } #endif ================================================ FILE: build.nsconfig ================================================ # MIT License # # Copyright (c) 2021 Agenium Scale # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal # in the Software without restriction, including without limitation the rights # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell # copies of the Software, and to permit persons to whom the Software is # furnished to do so, subject to the following conditions: # # The above copyright notice and this permission notice shall be included in all # copies or substantial portions of the Software. # # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. package_name nsimd-3.0 ## ---------------------------------------------------------------------------- ## Get OS/Compiler specific file extensions set o = @obj_ext set exe = @exe_ext set s = @asm_ext set so = @shared_lib_ext set lib = @shared_link_ext set root = @source_dir set make = @make_command set build = @build_dir set root = @source_dir set ccomp = @ccomp_name set cppcomp = @cppcomp_name ## ---------------------------------------------------------------------------- ## Some defaults ifnot_set "SIMD extension to use" simd = cpu ifnot_set "CUDA target arch flag(s) for tests" cuda_arch_flags = "" ifnot_set "Compile the libstdc++ statically" static_libstdcpp = true ifnot_set "Enable C++20 tests" cpp20_tests = "" ## ---------------------------------------------------------------------------- ## Targets for compilation set o_for_ = fp16$o memory$o ufp$o api_cpu$o rempitab$o \ sleefsp$o sleefdp$o gpu$o set o_for_cpu = $o_for_ set o_for_cuda = $o_for_ set o_for_rocm = $o_for_ set o_for_oneapi = $o_for_ set o_for_sse2 = $o_for_cpu api_sse2$o sleef_sse2_f32$o \ sleef_sse2_f64$o set o_for_sse42 = $o_for_sse2 api_sse42$o sleef_sse42_f32$o \ sleef_sse42_f64$o set o_for_avx = $o_for_sse42 api_avx$o sleef_avx_f32$o \ sleef_avx_f64$o set o_for_avx2 = $o_for_avx api_avx2$o sleef_avx2_f32$o \ sleef_avx2_f64$o set o_for_avx512_knl = $o_for_avx2 api_avx512_knl$o \ sleef_avx512_knl_f32$o sleef_avx512_knl_f64$o set o_for_avx512_skylake = $o_for_avx2 api_avx512_skylake$o \ sleef_avx512_skylake_f32$o \ sleef_avx512_skylake_f64$o set o_for_neon128 = $o_for_cpu api_neon128$o sleef_neon128_f32$o \ sleef_neon128_f64$o set o_for_aarch64 = $o_for_cpu api_aarch64$o sleef_aarch64_f32$o \ sleef_aarch64_f64$o set o_for_sve = 
$o_for_aarch64 api_sve$o sleef_sve_f32$o \ sleef_sve_f64$o set o_for_sve128 = $o_for_aarch64 api_sve128$o sleef_sve128_f32$o \ sleef_sve128_f64$o set o_for_sve256 = $o_for_aarch64 api_sve256$o sleef_sve256_f32$o \ sleef_sve256_f64$o set o_for_sve512 = $o_for_aarch64 api_sve512$o sleef_sve512_f32$o \ sleef_sve512_f64$o set o_for_sve1024 = $o_for_aarch64 api_sve1024$o sleef_sve1024_f32$o \ sleef_sve1024_f64$o set o_for_sve2048 = $o_for_aarch64 api_sve2048$o sleef_sve2048_f32$o \ sleef_sve2048_f64$o set o_for_vmx = $o_for_cpu api_vmx$o sleef_vmx_f32$o sleef_vmx_f64$o set o_for_vsx = $o_for_vmx api_vsx$o sleef_vsx_f32$o sleef_vsx_f64$o ## ---------------------------------------------------------------------------- ## SIMD compiler flags lambda cflags_for_generic_* = -DCPU set cflags_for_generic_cuda = -DCUDA set cflags_for_generic_rocm = -DROCM set cflags_for_generic_oneapi = -DONEAPI set cflags_for_ = ${cflags_for_generic_$simd$} set cflags_for_cpu = $cflags_for_ set cflags_for_cuda = -DCUDA set cflags_for_rocm = -DROCM set cflags_for_oneapi = -DONEAPI set cflags_for_sse2 = -DSSE2 -msse2 set cflags_for_sse42 = -DSSE42 -msse42 set cflags_for_avx = -DAVX -mavx set cflags_for_avx2 = -DAVX2 -mavx2 -DFMA -mfma -DFP16 -mfp16 set cflags_for_avx512_knl = -DAVX512_KNL -mavx512_knl -mfma -DFP16 -mfp16 set cflags_for_avx512_skylake = -DAVX512_SKYLAKE -mavx512_skylake -mfma \ -DFP16 -mfp16 set cflags_for_neon128 = -DNEON128 -mneon128 set cflags_for_aarch64 = -DAARCH64 -maarch64 set cflags_for_sve = -DSVE -msve set cflags_for_sve128 = -DSVE128 -msve128 set cflags_for_sve256 = -DSVE256 -msve256 set cflags_for_sve512 = -DSVE512 -msve512 set cflags_for_sve1024 = -DSVE1024 -msve1024 set cflags_for_sve2048 = -DSVE2048 -msve2048 set cflags_for_vmx = -DVMX -mvmx set cflags_for_vsx = -DVSX -mvsx ## ---------------------------------------------------------------------------- ## std default flag lambda std_flag_for_* = -std=c++98 set std_flag_for_rocm = -std=c++11 set std_flag_for_oneapi = 
-std=c++17 ## ---------------------------------------------------------------------------- ## libstdc++ linking mode set libstdcpp_static_link_true = -static-libstdc++ set libstdcpp_static_link_false = ## ---------------------------------------------------------------------------- ## Some defaults set flags = -Wall -fPIC -O2 -I$root$/include -DNDEBUG set cflags = ${std_flag_for_$simd$} $flags \ ${libstdcpp_static_link_$static_libstdcpp$} set sleef_cflags = -fPIC -O2 -I$root$/src -DNDEBUG -DDORENAME=1 ## ---------------------------------------------------------------------------- ## Default building rules phony all deps libnsimd_$simd$$so$ build_file libnsimd_$simd$$so deps ${o_for_$simd$} c++ -fPIC -shared @in -o @out set ldflags = -fPIC -L. -lnsimd_$simd ## ---------------------------------------------------------------------------- ## Generic (emulation) rules for building build_file gpu$o autodeps $root$/src/gpu.cpp c++ $cflags$ $cflags_for_cpu @in -c -o @out build_file ufp$o autodeps $root$/src/ufp.cpp c++ $cflags$ $cflags_for_cpu @in -c -o @out build_file fp16$o autodeps $root$/src/fp16.cpp c++ $cflags$ $cflags_for_cpu @in -c -o @out build_file memory$o autodeps $root$/src/memory.cpp c++ $cflags$ $cflags_for_cpu @in -c -o @out build_file rempitab$o autodeps $root$/src/rempitab.c cc $sleef_cflags$ -c @in -o @out build_file sleefsp$o autodeps $root$/src/sleefsp.c cc $sleef_cflags$ -c @in -o @out build_file sleefdp$o autodeps $root$/src/sleefdp.c cc $sleef_cflags$ -c @in -o @out build_file api_cpu$o autodeps $root$/src/api_cpu.cpp c++ $cflags$ $cflags_for_cpu -c @in -o @out ## ---------------------------------------------------------------------------- ## Intel rules for building build_file api_sse2$o autodeps $root$/src/api_sse2.cpp c++ $cflags$ -c $cflags_for_sse2 @in -o @out build_file sleef_sse2_f32$o autodeps $root$/src/sleefsimdsp.c cc $sleef_cflags$ -c -msse2 -DNSIMD_SSE2 -DENABLE_SSE2=1 @in -o @out build_file sleef_sse2_f64$o autodeps 
$root$/src/sleefsimddp.c cc $sleef_cflags$ -c -msse2 -DNSIMD_SSE2 -DENABLE_SSE2=1 @in -o @out build_file api_sse42$o autodeps $root$/src/api_sse42.cpp c++ $cflags$ -c $cflags_for_sse42 @in -o @out build_file sleef_sse42_f32$o autodeps $root$/src/sleefsimdsp.c cc $sleef_cflags$ -c -msse42 -DNSIMD_SSE42 -DENABLE_SSE4=1 @in -o @out build_file sleef_sse42_f64$o autodeps $root$/src/sleefsimddp.c cc $sleef_cflags$ -c -msse42 -DNSIMD_SSE42 -DENABLE_SSE4=1 @in -o @out build_file api_avx$o autodeps $root$/src/api_avx.cpp c++ $cflags$ -c $cflags_for_avx @in -o @out build_file sleef_avx_f32$o autodeps $root$/src/sleefsimdsp.c cc $sleef_cflags$ -c -mavx -DNSIMD_AVX -DENABLE_AVX=1 @in -o @out build_file sleef_avx_f64$o autodeps $root$/src/sleefsimddp.c cc $sleef_cflags$ -c -mavx -DNSIMD_AVX -DENABLE_AVX=1 @in -o @out build_file api_avx2$o autodeps $root$/src/api_avx2.cpp c++ $cflags$ -c $cflags_for_avx2 @in -o @out build_file sleef_avx2_f32$o autodeps $root$/src/sleefsimdsp.c cc $sleef_cflags$ -c -mavx2 -mfma -DNSIMD_AVX2 -DENABLE_AVX2=1 \ @in -o @out build_file sleef_avx2_f64$o autodeps $root$/src/sleefsimddp.c cc $sleef_cflags$ -c -mavx2 -mfma -DNSIMD_AVX2 -DENABLE_AVX2=1 \ @in -o @out build_file api_avx512_knl$o autodeps $root$/src/api_avx512_knl.cpp c++ $cflags$ -c $cflags_for_avx512_knl @in -o @out build_file sleef_avx512_knl_f32$o autodeps $root$/src/sleefsimdsp.c cc $sleef_cflags$ -c -mavx512_knl -DNSIMD_AVX512_KNL \ -DENABLE_AVX512F=1 @in -o @out build_file sleef_avx512_knl_f64$o autodeps $root$/src/sleefsimddp.c cc $sleef_cflags$ -c -mavx512_knl -DNSIMD_AVX512_KNL \ -DENABLE_AVX512F=1 @in -o @out build_file api_avx512_skylake$o autodeps $root$/src/api_avx512_skylake.cpp c++ $cflags$ -c $cflags_for_avx512_skylake @in -o @out build_file sleef_avx512_skylake_f32$o autodeps $root$/src/sleefsimdsp.c cc $sleef_cflags$ -c -mavx512_knl -DNSIMD_AVX512_SKYLAKE \ -DENABLE_AVX512F=1 @in -o @out build_file sleef_avx512_skylake_f64$o autodeps $root$/src/sleefsimddp.c cc 
$sleef_cflags$ -c -mavx512_knl -DNSIMD_AVX512_SKYLAKE \ -DENABLE_AVX512F=1 @in -o @out ## ---------------------------------------------------------------------------- ## ARM 32 bits rules for building build_file api_neon128$o autodeps $root$/src/api_neon128.cpp c++ $cflags$ -c $cflags_for_neon128 @in -o @out build_file sleef_neon128_f32$o autodeps $root$/src/sleefsimdsp.c cc $sleef_cflags$ -c -mneon128 -DNSIMD_NEON128 \ -DENABLE_NEON32=1 @in -o @out build_file sleef_neon128_f64$o autodeps $root$/src/sleefsimddp_emulation.c cc $sleef_cflags$ -c -mneon128 -DNSIMD_NEON128 -DENABLE_NEON32=1 \ -I$root$/include @in -o @out ## ---------------------------------------------------------------------------- ## ARM 64 bits rules for building build_file api_aarch64$o autodeps $root$/src/api_aarch64.cpp c++ $cflags$ -c $cflags_for_aarch64 @in -o @out build_file sleef_aarch64_f32$o autodeps $root$/src/sleefsimdsp.c cc $sleef_cflags$ -c -maarch64 -DNSIMD_AARCH64 \ -DENABLE_ADVSIMD=1 @in -o @out build_file sleef_aarch64_f64$o autodeps $root$/src/sleefsimddp.c cc $sleef_cflags$ -c -maarch64 -DNSIMD_AARCH64 \ -DENABLE_ADVSIMD=1 @in -o @out build_file api_sve$o autodeps $root$/src/api_sve.cpp c++ $cflags$ -c $cflags_for_sve @in -o @out build_file sleef_sve_f32$o autodeps $root$/src/sleefsimdsp.c cc $sleef_cflags$ -c -msve -DNSIMD_SVE -DENABLE_SVE=1 @in -o @out build_file sleef_sve_f64$o autodeps $root$/src/sleefsimddp.c cc $sleef_cflags$ -c -msve -DNSIMD_SVE -DENABLE_SVE=1 @in -o @out build_file api_sve128$o autodeps $root$/src/api_sve128.cpp c++ $cflags$ -c $cflags_for_sve128 @in -o @out build_file sleef_sve128_f32$o autodeps $root$/src/sleefsimdsp.c cc $sleef_cflags$ -c -msve128 -DNSIMD_SVE128 -DENABLE_SVE=1 @in -o @out build_file sleef_sve128_f64$o autodeps $root$/src/sleefsimddp.c cc $sleef_cflags$ -c -msve128 -DNSIMD_SVE128 -DENABLE_SVE=1 @in -o @out build_file api_sve256$o autodeps $root$/src/api_sve256.cpp c++ $cflags$ -c $cflags_for_sve256 @in -o @out build_file 
sleef_sve256_f32$o autodeps $root$/src/sleefsimdsp.c cc $sleef_cflags$ -c -msve256 -DNSIMD_SVE256 -DENABLE_SVE=1 @in -o @out build_file sleef_sve256_f64$o autodeps $root$/src/sleefsimddp.c cc $sleef_cflags$ -c -msve256 -DNSIMD_SVE256 -DENABLE_SVE=1 @in -o @out build_file api_sve512$o autodeps $root$/src/api_sve512.cpp c++ $cflags$ -c $cflags_for_sve512 @in -o @out build_file sleef_sve512_f32$o autodeps $root$/src/sleefsimdsp.c cc $sleef_cflags$ -c -msve512 -DNSIMD_SVE512 -DENABLE_SVE=1 @in -o @out build_file sleef_sve512_f64$o autodeps $root$/src/sleefsimddp.c cc $sleef_cflags$ -c -msve512 -DNSIMD_SVE512 -DENABLE_SVE=1 @in -o @out build_file api_sve1024$o autodeps $root$/src/api_sve1024.cpp c++ $cflags$ -c $cflags_for_sve1024 @in -o @out build_file sleef_sve1024_f32$o autodeps $root$/src/sleefsimdsp.c cc $sleef_cflags$ -c -msve1024 -DNSIMD_SVE1024 -DENABLE_SVE=1 \ @in -o @out build_file sleef_sve1024_f64$o autodeps $root$/src/sleefsimddp.c cc $sleef_cflags$ -c -msve1024 -DNSIMD_SVE1024 -DENABLE_SVE=1 \ @in -o @out build_file api_sve2048$o autodeps $root$/src/api_sve2048.cpp c++ $cflags$ -c $cflags_for_sve2048 @in -o @out build_file sleef_sve2048_f32$o autodeps $root$/src/sleefsimdsp.c cc $sleef_cflags$ -c -msve2048 -DNSIMD_SVE2048 -DENABLE_SVE=1 \ @in -o @out build_file sleef_sve2048_f64$o autodeps $root$/src/sleefsimddp.c cc $sleef_cflags$ -c -msve2048 -DNSIMD_SVE2048 -DENABLE_SVE=1 \ @in -o @out ## ---------------------------------------------------------------------------- ## POWERPC rules for building build_file api_vmx$o autodeps $root$/src/api_vmx.cpp c++ $cflags$ -c $cflags_for_vmx @in -o @out build_file sleef_vmx_f32$o autodeps $root$/src/sleefsimdsp_emulation.c cc $sleef_cflags$ -c -mvmx -DNSIMD_VMX -DENABLE_VSX=1 \ -I$root$/include @in -o @out build_file sleef_vmx_f64$o autodeps $root$/src/sleefsimddp_emulation.c cc $sleef_cflags$ -c -mvmx -DNSIMD_VMX -DENABLE_VSX=1 \ -I$root$/include @in -o @out build_file api_vsx$o autodeps $root$/src/api_vsx.cpp c++ 
$cflags$ -c $cflags_for_vsx @in -o @out build_file sleef_vsx_f32$o autodeps $root$/src/sleefsimdsp.c cc $sleef_cflags$ -c -mvsx -DNSIMD_VSX -DENABLE_VSX=1 @in -o @out build_file sleef_vsx_f64$o autodeps $root$/src/sleefsimddp.c cc $sleef_cflags$ -c -mvsx -DNSIMD_VSX -DENABLE_VSX=1 @in -o @out ## ---------------------------------------------------------------------------- ## Installation and packaging install_file libnsimd_${simd}$so lib [W] install_file libnsimd_${simd}$lib lib install_dir $root$/include/nsimd include install_dir $root$/doc/html doc ## ---------------------------------------------------------------------------- ## Tests # Lambda arguments: suite, compiler, std, simd_ext # By default all tests will be considered lambda tests_*_*_* = ok # Now disable some possibilities on certain compilers set tests_clang_c89_vmx = "" set tests_clang_c89_vsx = "" set tests_clang_c89_sve = "" lambda tests_*_c89_cuda = "" lambda tests_*_c99_cuda = "" lambda tests_*_c11_cuda = "" lambda tests_*_cpp17_cuda = "" lambda tests_*_c89_rocm = "" lambda tests_*_c99_rocm = "" lambda tests_*_c11_rocm = "" lambda tests_*_cpp98_rocm = "" lambda tests_*_cpp17_rocm = "" lambda tests_*_c89_oneapi = "" lambda tests_*_c99_oneapi = "" lambda tests_*_c11_oneapi = "" lambda tests_dpcpp_cpp98_* = "" lambda tests_dpcpp_cpp11_* = "" set c89_enabled = ${tests_$ccomp$_c89_$simd$} set c89.files = "" set c99_enabled = ${tests_$ccomp$_c99_$simd$} set c99.files = "" set c11_enabled = ${tests_$ccomp$_c11_$simd$} set c11.files = "" set cpp98_enabled = ${tests_$cppcomp$_cpp98_$simd$} set cpp98.files = "" set cpp11_enabled = ${tests_$cppcomp$_cpp11_$simd$} set cpp11.files = "" set cpp17_enabled = ${tests_$cppcomp$_cpp17_$simd$} set cpp17.files = "" set cpp20.files = "" set tests_flags = $cuda_arch_flags $flags ${cflags_for_$simd$} -lm $ldflags echo Test compilation flags: $tests_flags$ [$c89_enabled$] build_files c89 foreach glob:$root$/tests/*.prec11.c \ as tests.%r.c89$exe \ autodeps @item 
libnsimd_$simd$$so$ [$c89_enabled$] cc -std=c89 @item $tests_flags -o @out [$c89_enabled$] phony tests.c89 deps $c89.files [$c99_enabled$] build_files c99 foreach glob:$root$/tests/*.prec11.c \ as tests.%r.c99$exe \ autodeps @item libnsimd_$simd$$so$ [$c99_enabled$] cc -std=c99 @item $tests_flags -o @out [$c99_enabled$] phony tests.c99 deps $c99.files [$c11_enabled$] build_files c11 foreach glob:$root$/tests/*.c \ as tests.%r.c11$exe \ autodeps @item libnsimd_$simd$$so$ [$c11_enabled$] cc -std=c11 @item $tests_flags -o @out [$c11_enabled$] phony tests.c11 deps $c11.files [$cpp98_enabled$] build_files cpp98 foreach glob:$root$/tests/*.cpp \ as tests.%r.cpp98$exe \ autodeps @item libnsimd_$simd$$so$ [$cpp98_enabled$] c++ -std=c++98 @item $tests_flags -o @out [$cpp98_enabled$] phony tests.cpp98 deps $cpp98.files [$cpp11_enabled$] build_files cpp11 foreach glob:$root$/tests/*.cpp \ as tests.%r.cpp11$exe \ autodeps @item libnsimd_$simd$$so$ [$cpp11_enabled$] c++ -std=c++11 @item $tests_flags -o @out [$cpp11_enabled$] phony tests.cpp11 deps $cpp11.files [$cpp17_enabled$] build_files cpp17 foreach glob:$root$/tests/*.cpp \ as tests.%r.cpp17$exe \ autodeps @item libnsimd_$simd$$so$ [$cpp17_enabled$] c++ -std=c++17 @item $tests_flags -o @out [$cpp17_enabled$] phony tests.cpp17 deps $cpp17.files [$cpp20_tests$] build_files cpp20 foreach glob:$root$/tests/*.cpp \ as tests.%r.cpp20$exe \ autodeps @item libnsimd_$simd$$so$ [$cpp20_tests$] c++ -std=c++20 @item $tests_flags -o @out [$cpp20_tests$] phony tests.cpp20 deps $cpp20.files # Phony target for tests phony tests deps $c89.files $c99.files $c11.files $cpp98.files $cpp11.files \ $cpp17.files $cpp20.files ## ---------------------------------------------------------------------------- ## Examples build_files examples_cpp98 foreach glob:$root$/examples/*.cpp \ as examples.%r.cpp98$exe \ autodeps @item libnsimd_$simd$$so$ c++ -std=c++98 @item $tests_flags -o @out phony examples.cpp98 deps $examples_cpp98.files 
================================================ FILE: doc/Makefile.nix ================================================ # Copyright (c) 2020 Agenium Scale # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal # in the Software without restriction, including without limitation the rights # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell # copies of the Software, and to permit persons to whom the Software is # furnished to do so, subject to the following conditions: # # The above copyright notice and this permission notice shall be included in all # copies or substantial portions of the Software. # # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. NS2_ROOT = ../nstools/ns2 CXX = c++ CXX_FLAGS = -O2 -Wall -Wextra -pedantic -std=c++11 all: md2html what_is_wrapped libns2.a: $(NS2_ROOT)/../.git/logs/HEAD Makefile.nix rm -rf libns2 mkdir -p libns2 cp $(NS2_ROOT)/lib/*.cpp libns2 (cd libns2 && $(CXX) $(CXX_FLAGS) -I../$(NS2_ROOT)/include -c *.cpp) ar rcs $@ libns2/*.o rm -rf libns2 md2html: libns2.a md2html.cpp Makefile.nix $(CXX) $(CXX_FLAGS) md2html.cpp -I$(NS2_ROOT)/include -o $@ -L. -lns2 what_is_wrapped: libns2.a what_is_wrapped.cpp Makefile.nix $(CXX) $(CXX_FLAGS) what_is_wrapped.cpp -I$(NS2_ROOT)/include -o $@ \ -L. 
-lns2 ================================================ FILE: doc/Makefile.win ================================================ # Copyright (c) 2020 Agenium Scale # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal # in the Software without restriction, including without limitation the rights # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell # copies of the Software, and to permit persons to whom the Software is # furnished to do so, subject to the following conditions: # # The above copyright notice and this permission notice shall be included in all # copies or substantial portions of the Software. # # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. 
NS2_ROOT = ..\nstools\ns2 CXX = cl CXX_FLAGS = /nologo /Ox /W3 /EHsc /DNS_NO_DLLSPEC /D_CRT_SECURE_NO_WARNINGS all: md2html.exe what_is_wrapped.exe libns2.lib: $(NS2_ROOT)\..\.git\logs\HEAD Makefile.win if exist libns2 rd /Q /S libns2 md libns2 copy /Y $(NS2_ROOT)\lib\*.cpp libns2 (cd libns2 && $(CXX) $(CXX_FLAGS) -I..\$(NS2_ROOT)\include /c *.cpp) lib /nologo /out:libns2.lib libns2\*.obj rd /Q /S libns2 md2html.exe: libns2.lib md2html.cpp Makefile.win $(CXX) $(CXX_FLAGS) /I$(NS2_ROOT)\include md2html.cpp libns2.lib \ Shlwapi.lib Dbghelp.lib /Fe$@ what_is_wrapped.exe: libns2.lib what_is_wrapped.cpp Makefile.win $(CXX) $(CXX_FLAGS) /I$(NS2_ROOT)\include what_is_wrapped.cpp \ libns2.lib Shlwapi.lib Dbghelp.lib /Fe$@ ================================================ FILE: doc/markdown/compilers_and_versions.md ================================================ `nsimd` is tested with GCC, Clang and MSVC. As a C89 and a C++98 API are provided, other compilers should work fine. Old compiler versions should work as long as they support the targeted SIMD extension. For instance, `nsimd` can compile on MSVC 2010 `SSE4.2` code. 
`nsimd` requires a C or a C++ compiler and is actually daily tested on the following compilers for the following hardware: **Compiler** | **Version** | **Architecture** | **Extensions** ----------------------- | ----------- | ---------------- | -------------- GCC | 8.3.0 | Intel | `SSE2`, `SSE4.2`, `AVX`, `AVX2`, `AVX-512` (`KNL` and `SKYLAKE`) Clang | 7.0.1 | Intel | `SSE2`, `SSE4.2`, `AVX`, `AVX2`, `AVX-512` (`KNL` and `SKYLAKE`) GCC | 8.3.0 | ARM | `Aarch64`, `NEON` (`ARMv7`), `SVE` Clang | 7.0.1 | ARM | `Aarch64`, `NEON` (`ARMv7`), `SVE` Microsoft Visual Studio | 2017 | Intel | `SSE4.2` Intel C++ Compiler | 19.0.4.243 | Intel | `SSE2`, `SSE4.2`, `AVX`, `AVX2`, `AVX-512` (`SKYLAKE`) ================================================ FILE: doc/markdown/concepts.md ================================================ # C++20 concepts As of C++20, concepts are available. We quote to introduce concepts. *Class templates, function templates, and non-template functions (typically members of class templates) may be associated with a constraint, which specifies the requirements on template arguments, which can be used to select the most appropriate function overloads and template specializations.* *Named sets of such requirements are called concepts. Each concept is a predicate, evaluated at compile time, and becomes a part of the interface of a template where it is used as a constraint* ## Concepts provided by NSIMD All concepts provided by NSIMD comes in two forms: - The native C++20 form in the `nsimd` namespace - As a macro for keeping the compatibility with older versions of C++ The following tables list all concepts and is exhaustive. Native concepts are accessible through the `nsimd` namespace. They take only one argument. Their macro counterparts take no argument as they are meant to be used as constraint placeholder types. When compiling for older C++ versions NSIMD concepts macros are simply read as `typename` by the compiler. 
Table for base C and C++ APIs: | Native concept | Macro | Description | |:----------------------------|:-----------------------------------|:-----------------------------------------------| | `simd_ext_c` | `NSIMD_CONCEPT_SIMD_EXT` | Valid SIMD extension | | `simd_value_type_c` | `NSIMD_CONCEPT_VALUE_TYPE` | Valid NSIMD underlying value type | | `simd_value_type_or_bool_c` | `NSIMD_CONCEPT_VALUE_TYPE_OR_BOOL` | Valid NSIMD underlying value type or `bool` | | `alignment_c` | `NSIMD_CONCEPT_ALIGNMENT` | Valid NSIMD alignment `aligned` or `unaligned` | Table for advanced C++ API: | Native concept | Macro | Description | |:---------------|:-------------------------|:----------------------| | `is_pack_c` | `NSIMD_CONCEPT_PACK` | Valid NSIMD pack | | `is_packl_c` | `NSIMD_CONCEPT_PACKL` | Valid NSIMD packl | | `is_packx1_c` | `NSIMD_CONCEPT_PACKX1` | Valid NSIMD packx1 | | `is_packx2_c` | `NSIMD_CONCEPT_PACKX2` | Valid NSIMD packx2 | | `is_packx3_c` | `NSIMD_CONCEPT_PACKX3` | Valid NSIMD packx3 | | `is_packx4_c` | `NSIMD_CONCEPT_PACKX4` | Valid NSIMD packx4 | | `any_pack_c` | `NSIMD_CONCEPT_ANY_PACK` | Any of the above pack | ## Expressing C++20 constraints Expressing constraints can of course be done with the `requires` keyword. But for compatibility with older C++ versions NSIMD provides `NSIMD_REQUIRES` which take as onyl argument the constraints. ```c++ template NSIMD_REQUIRES(sizeof(T) == sizeof(S)) void foo(T, S); ``` It is advised to use doubled parenthesis as coma in the constraints expression can be interpreted as argument separators for the macro itself. 
```c++ template <typename T, typename S> NSIMD_REQUIRES((std::is_same<T, S>::value)) void foo(T, S); ``` Note that when expressing constraints using `nsimd::sizeof_v`'s prefer the NSIMD definition of sizeof for the following reason: when dealing with float16's one cannot know the underlying representation of such a type as it is non-portable and non-standard, but NSIMD provides helper functions to transparently deal with float16's as if they were 16-bits wide. Therefore expressing sizeof equality should be done with `nsimd::sizeof_v`. ```c++ template <typename T, typename S> NSIMD_REQUIRES((nsimd::sizeof_v<T> == nsimd::sizeof_v<S>)) void foo(T, S); ``` ================================================ FILE: doc/markdown/defines.md ================================================ # Defines provided by NSIMD NSIMD uses macros (not function macros) that we call defines to make choices in its code at compile time. Most of them can be of use to the end-user so we list them here. ## Compiler detection The compiler detection is automatically done by NSIMD as it is relatively easy. | Define | Compiler | |---------------------|---------------------------------------------------| | `NSIMD_IS_MSVC` | Microsoft Visual C++ | | `NSIMD_IS_HIPCC` | ROCm HIP compiler (warning, see below) | | `NSIMD_IS_NVCC` | NVIDIA CUDA Compiler | | `NSIMD_IS_ICC` | Intel C++ Compiler | | `NSIMD_IS_CLANG` | Clang/LLVM | | `NSIMD_IS_GCC` | GNU Compiler Collection | | `NSIMD_IS_FCC` | Fujitsu compiler | **Warning**: some HIP versions do not declare themselves at all so it is impossible to find out that HIP is the compiler. As HIP is based on clang, without help NSIMD will detect Clang. It is up to the end-user to compile with `-D__HIPCC__` for NSIMD to detect HIP. Note that we do support the Armclang C and C++ compilers but for NSIMD there is no need to have code different from Clang's specific code so we do not provide a macro to detect this compiler in particular. Note also that two of the above macros can be defined at the same time. 
This happens typically when compiling for a device. For example when compiling for NVIDIA CUDA with nvcc both `NSIMD_IS_NVCC` and `NSIMD_IS_GCC` (when the host compiler is GCC). ## Compilation environment and contants | Define | Description | Possible values | |-------------------|-----------------------|---------------------------------| | `NSIMD_C` | C version | 1989, 1999, 2011 | | `NSIMD_CXX` | C++ version | 1998, 2011, 2014, 2017, 2020 | | `NSIMD_WORD_SIZE` | Machine word size | 32, 64 | | `NSIMD_U8_MIN` | Minimum value for u8 | 0 | | `NSIMD_U8_MAX` | Maximum value for u8 | 255 | | `NSIMD_I8_MIN` | Minimum value for i8 | -128 | | `NSIMD_I8_MAX` | Maximum value for i8 | 127 | | `NSIMD_U16_MIN` | Minimum value for u16 | 0 | | `NSIMD_U16_MAX` | Maximum value for u16 | 65535 | | `NSIMD_I16_MIN` | Minimum value for i16 | -32768 | | `NSIMD_I16_MAX` | Maximum value for i16 | 32767 | | `NSIMD_U32_MIN` | Minimum value for u32 | 0 | | `NSIMD_U32_MAX` | Maximum value for u32 | 4294967295 | | `NSIMD_I32_MIN` | Minimum value for i32 | -2147483648 | | `NSIMD_I32_MAX` | Maximum value for i32 | 2147483647 | | `NSIMD_U64_MIN` | Minimum value for u64 | 0 | | `NSIMD_U64_MAX` | Maximum value for u64 | 18446744073709551615 | | `NSIMD_I64_MIN` | Minimum value for i64 | -9223372036854775808 | | `NSIMD_I64_MAX` | Maximum value for i64 | 9223372036854775807 | | `NSIMD_DLLSPEC` | (Windows) DLL storage-class information | `__declspec(dllexport)` or `__declspec(dllimport)` | | `NSIMD_DLLSPEC` | (Unix) storage-class information | `extern` or nothing | | `NSIMD_C_LINKAGE_FOR_F16` | Indicate whether functions involving f16 have C linkage | defined or not | ## Targeted architecture detection Contrary to the compiler detection, the targeted architecture is not done autoamtically by NSIMD as is really hard and some compilers do not provide the necessary informations. 
So in order to have a consistent way of targeting an architecture this is up to the end-user to specify it using one of the following defines. | Define | Targeted architecture | |------------------------|---------------------------------------------------| | `NSIMD_CPU` | Generic, no SIMD, emulation | | `NSIMD_SSE2` | Intel SSE2 | | `NSIMD_SSE42` | Intel SSE4.2 | | `NSIMD_AVX` | Intel AVX | | `NSIMD_AVX2` | Intel AVX2 | | `NSIMD_AVX512_KNL` | Intel AVX-512 as found on KNLs | | `NSIMD_AVX512_SKYLAKE` | Intel AVX-512 as found on Xeon Skylake | | `NSIMD_NEON128` | Arm NEON 128 bits as found on 32-bits Arm chips | | `NSIMD_AARCH64` | Arm NEON 128 bits as found on 64-bits Arm chips | | `NSIMD_SVE` | Arm SVE (length agnostic) | | `NSIMD_SVE128` | Arm SVE (size known at compilation to 128 bits) | | `NSIMD_SVE256` | Arm SVE (size known at compilation to 256 bits) | | `NSIMD_SVE512` | Arm SVE (size known at compilation to 512 bits) | | `NSIMD_SVE1024` | Arm SVE (size known at compilation to 1024 bits) | | `NSIMD_SVE2048` | Arm SVE (size known at compilation to 2048 bits) | | `NSIMD_CUDA` | Nvidia CUDA | | `NSIMD_ROCM` | AMD ROCm architectures | | `NSIMD_VMX` | IBM POWERPC VMX (Altivec) | | `NSIMD_VSX` | IBM POWERPC VSX (Altivec) | | `NSIMD_FP16` | Architecture supports natively IEEE float16 | | `NSIMD_FMA` | Architecture supports natively FMAs | ## Targeted architecture constants | Define | Description | |-----------------------|----------------------------------------------------| | `NSIMD_NB_REGISTERS` | Number of SIMD registers | | `NSIMD_MAX_LEN_BIT` | Maximum number of bits in a SIMD register | | `NSIMD_MAX_LEN_i8` | Maximum number of i8's in a SIMD register | | `NSIMD_MAX_LEN_u8` | Maximum number of u8's in a SIMD register | | `NSIMD_MAX_LEN_i16` | Maximum number of i16's in a SIMD register | | `NSIMD_MAX_LEN_u16` | Maximum number of u16's in a SIMD register | | `NSIMD_MAX_LEN_i32` | Maximum number of i32's in a SIMD register | | `NSIMD_MAX_LEN_u32` | Maximum number 
of u32's in a SIMD register | | `NSIMD_MAX_LEN_i64` | Maximum number of i64's in a SIMD register | | `NSIMD_MAX_LEN_u64` | Maximum number of u64's in a SIMD register | NSIMD provides a mean to write generic code by using the `NSIMD_MAX_LEN` macros whose argument is one of { i8, u8, i16, u16, i32, u32, i64, u64 }. ```c++ #define T ??? // to be defined as a base type int main(void) { T buf[NSIMD_MAX_LEN(T)]; // an array of T's for loading/storing ... return 0; } ``` ## Other useful macros NSIMD provides macros to concatenate blobs so that generic programming in pure C is possible. - `#define NSIMD_PP_CAT_2(a, b)` concatenates `a` and `b`. - `#define NSIMD_PP_CAT_3(a, b, c)` concatenates `a`, `b` and `c`. - `#define NSIMD_PP_CAT_4(a, b, c, d)` concatenates `a`, `b`, `c` and `d`. - `#define NSIMD_PP_CAT_5(a, b, c, d, e)` concatenates `a`, `b`, `c`, `d` and `e`. - `#define NSIMD_PP_CAT_6(a, b, c, d, e, f)` concatenates `a`, `b`, `c`, `d`, `e` and `f`. ================================================ FILE: doc/markdown/faq.md ================================================ # Frequently Asked Questions ## Is it good practice to use a `nsimd::pack` as a `std::vector`? No, these are two very different objects. A `nsimd::pack` represent a SIMD register whereas a `std::vector` represents a chunk of memory. You should separate concerns and use `std::vector` to store data in your structs or classes, `nsimd::pack` should only be used in computation kernels and nowhere else especially not in structs or classes. ## Why is the speed-up of my code not as expected? There are several reasons which can reduce the speed-up: - Have you enabled compiler optimizations? You must enable all compiler optimizations (like `-O3`). - Have you compiled in 64 bit mode? There is significant performance increase on architectures supporting 64 bit binaries. - Is your code trivially vectorizable? Modern compilers can vectorize trivial code segments automatically. 
If you benchmark a trivial scalar code versus a vectorized code, the compiler may vectorize the scalar code, thereby giving similar performance to the vectorized version. - Some architectures do not provide certain functionalities. For example AVX2 chips do not provide a way to convert long to double. So using `nsimd::cvt` will produce an emulation for-loop in the resulting binary. To know which intrinsics are used by NSIMD you can consult . ## Why did my code segfault or crash? The most common cause of segfaults in SIMD codes is accessing non-aligned memory. For best performance, all memory should be aligned. NSIMD includes an aligned memory allocation function and an aligned memory allocator to help you with this. Please refer to for details on how to ensure that your memory is correctly aligned. Another common cause is to read or write data beyond the allocated memory. Do not forget that loading data into a SIMD vector will result in loading 16 bytes (or 4 floats) from memory. If this read occurs at the last 2 elements of allocated memory then a segfault will be generated. ## My code compiled for AVX is not twice as fast as for SSE, why? Not all SSE instructions have an equivalent AVX instruction. As a consequence NSIMD uses two SSE operations to emulate the equivalent AVX operation. Also, the cycles required for certain instructions are not equal on both architectures, for example, `sqrt` on `SSE` requires 13-14 cycles whereas `sqrt` on `AVX` requires 21-28 cycles. Please refer [here](https://www.agner.org/optimize/instruction_tables.pdf) for more information. Very few integer operations are supported on AVX, AVX2 is required for most integer operations. If a NSIMD function is called on an integer AVX register, this register will be split into two SSE registers and the equivalent instruction called on both registers. In this case, no speed-up will be observed compared with SSE code. This is true also on POWER 7, where double is not supported. 
## I disassembled my code, and the generated code is less than optimal, why? - Have you compiled in release mode, with full optimizations options? - Have you used a 64 bit compiler? - There are many SIMD related bugs across all compilers, and some compilers generate less than optimal code in some cases. Is it possible to update your compiler to a more modern compiler? - We provide workarounds for several compiler bugs, however, we may have missed some. You may also have found a bug in `nsimd`. Please report this through issues on our github with a minimal code example. We responds quickly to bug reports and do our best to patch them as quickly as possible. ## How can I use a certain intrinsic? If you require a certain intrinsic, you may search inside of NSIMD for it and then call the relevant function or look at . In rare cases, the intrinsic may not be included in NSIMD as we map the intrinsic wherever it makes sense semantically. If a certain intrinsic does not fit inside of this model, if may be excluded. In this case, you may call it yourself, however, note this will not be portable. To use a particular intrinsic say `_mm_avg_epu8`, you can write the following. ```c++ nsimd::pack a, b, result; result = nsimd::pack(_mm_avg_epu8(a.native_register(), b.native_register())); ``` ## How do I convert integers/floats to/from logicals? Use [`nsimd::to_mask`](api_to-mask.md) and [`nsimd::to_logical`](api_to-logical.md). ## How about shuffles? General shuffles are not provided by NSIMD. You can see [issue 8 on github](https://github.com/agenium-scale/nsimd/issues/8). For now we provide only some length agnostic shuffles such as zip and unzip, see [the shuffle API](api.md) at the Shuffle section. ## Are there C++ STL like algorithms? No. You are welcome to [contribute](contribute.md) to NSIMD and add them as a NSIMD module. You should use [expressions templates](module_tet1d_overview.md) instead. 
Strictly conforment STL algorithms do not provide means to control for example the unroll factor or the number of threads per block when compiling for GPUs. ## Are there masked operators in NSIMD? Yes, we provide masked loads and stores, see [the api](api.md) at the "Loads & stores" section. We also provide the [`nsimd::mask_for_loop_tail`](api_mask-for-loop-tail.md) which computes the mask for ending loops. But note that using these is not recommanded as on most architectures there are no intrinsic. This will result in slow code. It is recommanded to finish loops using a scalar implementation. ## Are there gathers and scatter in NSIMD? Yes, we provide gathers and scatters, see [the api](api.md) at the "Loads & stores" section. Note also that as most architectures do not provide such intrinsics and so this could result in slow code. ## Why does not NSIMD recognize the target architecture automatically? Autodetecting the SIMD extension is compiler/compiler version/cpu/system dependant which means a lot of code for a (most likely buggy) feature which can be an inconvenience sometimes. Plus some compilers do not permit this feature. For example cf. and . Thus a "manual" system is always necessary. ## Why some operators have their names ending with an "1"? This is because of C++ and our will not to use C++-useless-complicated stuff. Taking the example with `if_else`, suppose that we have called it "if\_else" without the "1". When working with packs, one wants to be able to use `if_else` in this manner: ```c++ int main() { using namespace nsimd; typedef pack pi; typedef pack pf; int n; int *a, *b; // suppose both points to n ints float *fa, *fb; // suppose both points to n floats for (int i = 0; i < n; i += len()) { packl cond = (loada(&a[i]) < loada(&b[i])); storea(&fb[i], if_else(cond, load(&fb[i]), set1(0.0f))); } return 0; } ``` But this causes a compiler error, the overload of `if_else` is ambiguous. 
Sure one can use many C++-ish techniques to tackle this problem but we chose not to as the goal is to make the life of the compiler as easy as possible. So as we want to favor the C++ advanced API as it is the most human readable, users of the C and C++ base APIs will have to use `if_else1`. ================================================ FILE: doc/markdown/fp16.md ================================================ # IEEE float16 related functions NSIMD natively supports IEEE float16's. This means that NSIMD provides types and functions to deal with them. When the targeted architecture supports them then NSIMD will use approriate intrinsics otherwise emulation with float32's will be used. - When emulating, as float16's are not natively supported by neither C or C++ emulation is done with float32's. - Intel architectures do not support IEEE float16 arithmetic, they only provide, as an extension, supports for convertion to/from float32. When compiling NSIMD for Intel architectures use `-DFP16` to activate the conversion intrinsics if available on your machine. Note that AVX-512 has thoses natively. - Arm architectures can provide native float16 arithmetic. For 32-bits and 64-bits chips (ARMv7 and Aarch64) chips float16 support is optional. When compiling with `-DFP16`, NSIMD will use float16-related intrinsics. Note that for SVE chips float16's are mandatory hence NSIMD will use appropriate intrinsics with or without `-DFP16`. - CUDA provides supports for converting float16's to/from float32's. These are always used by NSIMD. But it is only since devices of compute capabilities 5.3 and above that float16's arithmetic is provided. NSIMD will always use CUDA float16's functions so there is no need to compile with `-DFP16`. - ROCm HIP supports float16's except for the first versions. For now NSIMD assumes that it is always the case and use HIP float16 API. There is no need for `-DFP16`. 
## Float16's related functions and types NSIMD provide the `f16` type which represents a IEEE float16. Note that depending on the targeted architecture and the presence of `-DFP16` the float16 type can typedefs many different types. Therefore the two following functions are provided and can be used to convert a float16 from/to a float32. These functions preserve NaN's and infinities. When converting from a float32 to a float16 saturation to infinities is performed when the float32 cannot be represented as a float16. | Function signature | Availability | |---------------------------------------------------|--------------| | `f16 nsimd_f32_to_f16(f32 a);` | C and C++ | | `f32 nsimd_f16_to_f32(f16 a);` | C and C++ | | `f16 nsimd::f32_to_f16(f32 a);` | C++ only | | `f32 nsimd::f16_to_f32(f16 a);` | C++ only | For loading/storing float16's NSIMD provides other conversion function to/from 16-bits unsigned integers. The integers will hold the IEEE binary representation of the float16's. | Function signature | Availability | |---------------------------------------------------|--------------| | `u16 nsimd_f32_to_u16(f32 a);` | C and C++ | | `f32 nsimd_u16_to_f32(u16 a);` | C and C++ | | `u16 nsimd::f32_to_u16(f32 a);` | C++ only | | `f32 nsimd::u16_to_f32(u16 a);` | C++ only | The `nsimd_*` functions listed above do not use the same linkage type depending on the targeted architecture. When compiling for GPUs the corresponding symbols names are mangled. They use C++ ABI because the float16 type is defined as a C++ class and not as a C struct. We therefore inherit from the implementation of CUDA and HIP/ROCm. Linkage types are listed below. 
| Function signature | CUDA/ROCm | Other architectures | |-----------------------------------|-------------|---------------------| | `f16 nsimd_f32_to_f16(f32 a);` | C++ linkage | C linkage | | `f32 nsimd_f16_to_f32(f16 a);` | C++ linkage | C linkage | | `f16 nsimd::f32_to_f16(f32 a);` | C++ linkage | C++ linkage | | `f32 nsimd::f16_to_f32(f16 a);` | C++ linkage | C++ linkage | | `u16 nsimd_f32_to_u16(f32 a);` | C++ linkage | C linkage | | `f32 nsimd_u16_to_f32(u16 a);` | C++ linkage | C linkage | | `u16 nsimd::f32_to_u16(f32 a);` | C++ linkage | C++ linkage | | `f32 nsimd::u16_to_f32(u16 a);` | C++ linkage | C++ linkage | It is possible to know at compile time in which situation we are. The `NSIMD_C_LINKAGE_FOR_F16` macro if defined means that C linkage is used for `nsimd_*` functions. ================================================ FILE: doc/markdown/how_tests_are_done.md ================================================ # How tests are done? First and foremost note that this is a work in progress and that we are doing our best to have serious testing of the library. We can also state our conclusion on testing: we are not and never will be satisfied with our tests, there are not enough of them, we want more. The current system has on average 15000 tests by SIMD extensions. Thanks to our "Python" approach we can automatically generate tests for all operators and for all types. This has greatly helped us in finding bugs. But, as you know, bugs are always there. ## Why write this? Testing the library has been taken seriously since its very beginning. Tests have gone through several stages: - The first one was during the development of the first version of the library. Tests of operators were done with random numbers as input. Those random numbers were all powers of 2 to ease the comparisons of basic arithmetic types. 
NaNs and infinities were not generated as inputs and operators behaviors with those inputs were not tested - For the second stage random numbers generators have been improved to emit NaNs and infinities. It allowed us to detect many errors in operators, mostly in math functions like cos, sin, exp... But we also discovered bugs in hardware when NaNs and infinities are given to intrinsics. - The third stage, which is the current test system, takes into account the experience we gained with the previous two. As we have abandoned the buggy and slow implementations of math functions coming from Boost.SIMD and now rely on the excellent Sleef () we trust that the math functions are correctly tested. In more detail we do not generate NaNs and infinities anymore because we trust functions coming from Sleef and we do not want to write code in our tests to bypass hardware bugs. We only care that our wrappings are correct and that `nsimd::add` correctly calls add, the fact that the add does not work correctly is a hardware bug then and not the problem of the library. Tests on floating points are done using ULPs. ULP means units in the last place and is commonly used for the comparison of floating point numbers. It is in general a bad idea to compare floats with the `==` operator as it essentially compares bits. Instead we want to check if the results of two computations are "not too far away from each other". When checking an operator, let's say, on CPUs and GPUs, we have to take into account that - the rounding mode may be different and - the precision of the calculation may be different. 
For this entire chapter fix the following: - an integer $b > 1$ (will be our radix), - an integer $p > 1$ (will be the number of digits in the mantissa) - an integer $M > 1$ (will be the minimum exponent allowed for floatting point numbers) A floatting point number is an element of $\mathbb{R}$ of the form $m b^e$ with $e \geq -M$ and $m \in \mathbb{Z}$. More precisely we define the set of floatting point numbers $F$ to be the union of the following two sets: - $\{ mb^e \in F \text{ with } e > -M \}$ the *normal* numbers. - $\{ mb^{-M} \in F \text{ with } m \in \mathbb{Z} \text{ and } 0 < |m| < b^p \}$ the *denormal* or *subnormal* numbers. The set $F$ can be viewed as a subset of $\mathbb{R}$ with the mapping $\phi : (m, e) \mapsto mb^e$ and we will make this abuse of notation in what follows. Usually the sign of the floatting point number is separated from $m$ but we include it "inside" $m$ as it does not change the proofs below and simplifies the notations. Let $a_i \in F$ for $i = 1,2$ such that $a_i = m_i b^{e_i}$. **Proposition:** $\phi$ is injective. **Proof:** Suppose that $a_1 = a_2$ or $m_1b^{e_1} = m_2b^{e_2}$. If $a_1$ and $a_2$ are subnormal numbers then $e_1 = e_2 = -M$ and $m_1 = m_2$. If $a_1$ and $a_2$ are normal numbers suppose that $e_2 > e_1$, then $|\frac{m_2b^{e_2}}{m_1b^{e_1}}| > b^{e_2 + p - 1 - e_1 - p} = b^{e_2 - e_1 - 1} \geq b^{1 - 1} = 1$ therefore $m_2b^{e_2} \neq m_1b^{e_1}$ which is absurd hence $e_1 = e_2$ and as a consequence $m_1 = m_2$. **Definition:** We define the *distance in ULPs between $a_1$ and $a_2$* denoted by $U(a_1, a_2)$ to be: - $|m_1b^{e_1 - e_2} - m_2|$ if $e_1 \geq e_2$, - $|m_1 - m_2b^{e_2 - e_1}|$ otherwise. **Example:** Take $a_1 = 123456 \times 10^5$ and $a_2 = 123789 \times 10^5$ Then as the exponents of $a_1$ and $a_2$ are the same we have $U(123456 \times 10^5, 123789 \times 10^5) = |123789 - 123456| = 333$. The following proposition confort the name "units in the last place". 
**Proposition:** Let $f = \lfloor \log_b U(a_1, a_2) \rfloor + 1$ and suppose that $a_1, a_2$ are of same sign and have the same exponents, then either the first $p - f$ digits of $m_1$ and $m_2$ are identical or their difference is $\pm 1$. **Proof:** For $i = 1,2$ there exists $q_i \in \mathbb{Z}$ and $0 \leq r_i < b^f$ such that $m_i = q_i b^f + r_i$. Then $|q_1 - q_2| \leq \frac{|m_1 - m_2| + |r_1 - r_2|}{b^f} < \frac{b^{\log_b(U(a_1, a_2)} + b^f}{b^f} = 2$ So that either $q_1 = q_2$ or $q_1 - q_2 = \pm 1$. It is interesting to know what are the cases when $q_1 - q_2 \pm 1$. Suppose that $0 \leq m_1 < m_2$ and that $q_1 = q_2 + 1$ then $m_1 = q_1 b^f + r_1 \geq q_2 b^f + b^f > q_2 b^f + r_2 = m_2$ which contradicts the hypothesis hence $q_1 \leq q_2$. Finally $r_1 + U(a_1, a_2) = r_1 + (m_2 - m_1) = q_2 b^f + r_2 - q_1 b^f = r_2 + b_f$ so that: - $r_1 + U(a_1, a_2) \geq b^f$ and - $r_1 = r_2 + (b_f - U(a_1, a_2)) = r_2 + (b^f - b^{\log_b(U(a_1, a_2))}) > r_2$. **Example:** Taking back $a_1 = 123456 \times 10^5$ and $a_2 = 123789 \times 10^5$. As $q_1 = q_2$ we have the first 3 digits of $a_1$ and $a_2$ that are identical and they differ by their last $\log_{10} \lfloor U(a_1, a_2) \rfloor + 1 = \lfloor \log_{10}(333) \rfloor + 1 = 3$ **Example:** Now take $a_1 = 899900 \times 10^5$ and $a_2 = 900100 \times 10^5$. We have $f = 3$ but $q_2 = q_1 + 1$ and $r_2 = 900 > 100 = r_1$ and $r_2 + U(a_1, a_2) = 1100 \geq 1000 = 10^3$. The propositions above show that our definition of the ULP distance is well choosen as we have the following results: - (second proposition) is measures de number of different digits at the end of the mantissa. - (first proposition) if we write the numbers differently but still in base $b$ we only change the number of different digits in the last places by some zeros. The latter number being the exponent of $b$ that represents the difference in scaling of both representations of floatting point numbers. 
We show now how to compute it using the IEEE 754 floatting point numbers representation. A floatting point number $(m, e) \in F$ is stored in memory (and registers) as the integer $\pm ((e + M)b^p + |m|)$. **Proposition:** If $e_2 \geq e_1 + 2$ then $U(a_1, a_2) \geq b^p$. **Proof:** We have $U(a_1, a_2) = |m_2 b^{e_2 - e_1} - m_1| \geq ||m_2| b^{e_2 - e_1} - |m_1||$. But $m_2$ is a normal number otherwise we would have $e_2 = -M = e_1$ so that $|m_2| \geq b^{p - 1}$ and we have $|m_2| b^{e_2 - e_1} \geq b^{p - 1 + e_2 - e_1} \geq b^{p + 1} > |m_1|$, therefore $||m_2| b^{e_2 - e_1} - |m_1|| \geq |m_2|b^2 - |m_1| > b^{p - 1 + 2} - b^p = b^p$. The proposition above basically states that if two floatting point numbers are two orders of magnitude away then that have no digits in common, and that there are godd chances that comparing them is not interesting at all. The usual definition of the distance in ULPs is roughly given as the number of floatting point numbers between the two considered floatting point numbers. More precisely we will denote it by $V$ and it is defined as follows: - $V(a_1, a_2) = |(e_1 + M)b^p + |m_1| - (e_2 + M)b^p - |m_2||$ if $a_1$ and $a_2$ have the same signs - $V(a_1, a_2) = (e_1 + M)b^p + |m_1| + (e_2 + M)b^p + |m_2|$ otherwise. **Proposition:** If $e_1 = e_2$ and $a_1$, $a_2$ have the same sign then $U(a_1, a_2) = V(a_1, a_2)$. **Proof:** We have $V(a_1, a_2) = |(e_1 + M)b^p + m_1 - (e_2 + M)b^p - m_2|$, but as $e_1 = e_2$, we end up with $V(a_1, a_2) = |m_1 - m_2| = U(a_1, a_2)$. **Proposition:** $V(a_1, a_2) = 1$ is equivalent to $U(a_1, a_2) = 1$. **Proof:** The proposition is true if $e_1 = e_2$. Suppose that $e_2 > e_1$. Note that $a_2$ is a normal number so that $m_2 \geq b^{p - 1}$. We first suppose that $V(a_1, a_2) = 1$. Then by the definition of $V$, $a_1$ and $a_2$ have same sign otherwise $V(a_1, a_2) \geq 2$ and we suppose that $a_i \geq 0$. 
Moreover we have $e_2 = e_1 + 1$ otherwise we would have that $a_1 = m_1b^{e_1} < m_1b^{e_1 + 1} < m_2b^{e_1 + 2} \leq a_2$. Now we have $(b^p - 1)b^{e_1} < b^{p - 1}b^{e_1 + 1}$ and let $(b^p - 1)b^{e_1} \leq mb^e \leq b^{p - 1}b^{e_1 + 1}$. First note that if $a = mb^e$ is a normal number then $m \geq b^{p - 1}$ and if $a$ is a subnormal number then $e = -M$ in which case we also have $e_1 = -M$ and $m \geq b^p - 1 \geq b^{p - 1}$. In any case $m \geq b^{p - 1}$. We have $(b^p - 1)/m b^{e_1} < b^e < b^{p - 1}/m b^{e_1 + 1}$. But $1 \leq (b^p - 1) / m$ and $b^{p - 1} / m \leq 1$ so that $b^{e_1} \leq b^e \leq b^{e_1 + 1}$ and $e = e_1$ or $e = e_1 + 1$. In the first case $(b^p - 1)b^{e_1} \leq mb^{e_1}$ so that $b^p - 1 \leq m$ but $m < b^p$ and $m = b^p - 1$. In the second case $mb^{e_1 + 1} \leq b^{p - 1}b^{e_1 + 1}$ so that $m \leq b^{p - 1}$ but $b^{p - 1} \leq m$ and $m = b^{p - 1}$. We have proven that two consecutive elements of $F$ with $e_2 = e_1 + 1$ are neessary of the form $a_1 = (b^p - 1)b^{e_1}$ and $a_2 = b^{p - 1}b^{e_1 + 1}$. Now we can compute $U(a_1, a_2) = |bb^{p - 1} - (b^p - 1)| = 1$. Conversely, suppose that $U(a_1, a_2) = 1$, then $|b^{e_2 - e_1}m_2 - m_1| = 1$. Suppose that $b^{e_2 - e_1}m_2 - m_1 = -1$, then $-1 \geq bb^{p - 1} - b^p = 0$ which is absurd. We then have $b^{e_2 - e_1}m_2 - m_1 = 1$. Suppose that $e_2 \geq e_1 + 2$ then we would have that $b^{e_2 - e_1}m_2 - m_1 \geq b^2b^{p - 1} - b^p \geq b^p$ which is absurd so that $e_2 = e_1 + 1$ and $bm_2 - m_1 = 1$. Suppose that $m_2 \geq b^{p - 1} + 1$ then $bm_2 - m_1 \geq b^p + b - (b^p - 1) \geq 2$ which is absurd so that $m_2 = b^{p - 1}$ and as a consequence $m_1 = b^p - 1$. If $a_1, a_2 < 0$, then $V(a_1, a_2) = 1$ is equivalent by definition to $V(-a_1, -a_2) = 1$ which is equivalent to $U(-a_1, -a_2) = 1$ which is by definition equivalent to $U(a_1, a_2) = 1$. **Proposition:** Suppose that $e_1 \leq e_2 \leq e_1 + 1$ then $V \leq U \leq bV$. 
**Proof:** The proposition is true if $e_1 = e_2$. Suppose now that $e_2 = e_1 + 1$. Then we have $b^p + m_2 - m_1 \geq b^p + b^{p - 1} - b^p \geq 0$ so that $V(a_1, a_2) = b^p + m_2 - m_1 = b^p + m_2(1 - b) + bm_2 - m_1$. But $b^p + m_2(1 - b) \leq b^p + b^p(1 - b) \leq 0$ and $bm_2 - m_1 \geq bb^{p - 1} - b^p = 0$ so that $V(a_1, a_2) \leq bm_2 - m_1 = U(a_1, a_2)$. On the other hand we have $bm_2 - m_1 \leq b(b^p + m_2 - m_1 + m_1 - m_1/b - b^p)$ but $m_1 - m_1/b - b^p \leq b^p - b^{p - 1}/b - b^p \leq 0$ so that $U(a_1, a_2) \leq b(b^p + m_2 - m_1) = bV(a_1, a_2)$. **Remark:** The previous propositions shows that the difference between $V$ and $U$ is only visible when the arguments have differents exponents and are non consecutive. Our version of the distance in ULPs puts more weights when crossing powers of $b$. Also if $e_2 \geq e_1 + 2$ then we have seen that $a_1$ and $a_2$ have nothing in common which is indicated by the fact that $U, V \geq b^p$. **Definition:** We now define the relative distance $D(a_1, a_2)$ between $a_1$ and $a_2$ to be $|a_1 - a_2| / \min(|a_1|, |a_2|)$. **Proposition:** As $U$ is defined in a "mathematical" way compared to $V$ then the relation between $U$ and $D$ is straightforward and we have $D(a_1, a_2) = U(a_1, a_2) / |m_1|$. Moreover we have $b^{-q}U \leq D \leq b^{1 - q}U$ where $q$ is the greatest integer such that $b^{q - 1} \leq |m_1| < b^q$. In particular if $a_1$ is a normal number then $p = q$. **Proof:** Suppose that $|a_1| < |a_2|$, then we have three cases: - If $a_2$ is denormal, then so is $a_1$ and $e_1 = -M = e_2$. - If $a_2$ is normal, then: + If $a_1$ is denormal then $e_1 < e_2$. + If $a_1$ and $a_2$ are normal numbers then $|m_1/m_2| b^{e_1 - e_2} < 1$ but $|m_1/m_2| \geq b^{p - 1} / b^p = b^{-1}$ and we have $b^{e_1 - e_2 - 1} < 1$ so that $e_1 < e_2 + 1$ or $e_1 \leq e_2$. 
In any case we have $e_1 \leq e_2$, as a consequence we have $D(a_1, a_2) = |m_1b^{e_1} - m_2b^{e_2}| / \min(|m_1|b^{e_1}, |m_2|b^{e_2}) = |m_1 - m_2b^{e_2 - e_1}| / \min(|m_1|, |m_2|b^{e_2 - e_1})$. Therefore $D(a_1, a_2) = U(a_1, a_2) / \min(|m_1|, |m_2|b^{e_2 - e_1})$. Now if $e_1 = e_2$ then $\min(|m_1|, |m_2|) = |m_1|$ but if $e_2 > e_1$ then $a_2$ is a normal number and $|m_1| < b^p = b \times b^{p - 1} \leq b^{e_2 - e_1} |m_2|$ and again $\min(|m_1|, |m_2|b^{e_2 - e_1}) = |m_1|$. Applying $b^{q - 1} \leq |m_1| < b^q$ we get that $b^{-q}U \leq D \leq b^{1 - q}U$. If moreover $a_1$ is a normal number then by definition $p = q$. **Remark:** Using the inequality of the previous proposition and taking the base-$b$ logarithm we get $-q + \log U \leq \log D \leq 1 - q + \log U$ and then $-q + \lfloor \log U \rfloor \leq \lfloor \log D \rfloor \leq 1 - q + \lfloor \log U \rfloor$ hence two possibilities: - $-q + \lfloor \log U \rfloor = \lfloor \log D \rfloor$ in which case $\lfloor \log U \rfloor + (-\lfloor \log D \rfloor) = q$. - $1 - q + \lfloor \log U \rfloor = \lfloor \log D \rfloor$ in which case $1 + \lfloor \log U \rfloor + (-\lfloor \log D \rfloor) = q$. According to a above proposition we know that $f = 1 + \lfloor \log U \rfloor$ can be interpreted as the number of differents digits in the last places of the mantissa. Write $\mathcal{D} = - \lfloor \log D \rfloor$ then $q \leq f + \mathcal{D} \leq q + 1$. The latter inequality shows that $\mathcal{D}$ can be interpreted as the number of digits which are the same in the mantissa near the "first" place. Note that for denormal numbers the "first" places are near the bit of most significance. We can conclude this remark with the interpretation that two floatting point numbers have at least $\mathcal{D} - 1$ digits in common in the first place of the mantissa and $f$ digits which are different in the last place of the mantissa. **Algorithm:** We give below the C code for $U$ with a caveat. 
As seen in a previous proposition when $e_2 \geq e_1 + 2$ the arguments have no digit in common and can be considered too far away in which case we return `INT_MAX` (or `LONG_MAX`). As a side effect is that the code will be free of multiprecision integers (which would be necessary as soon as $|e_2 - e_1| \geq 12$) hence lesser dependencies, readability, maintainability and performances. When $|e_2 - e_1| \leq 1$ we use the formula of the definition. ```c /* We suppose that floats are IEEE754 and not NaN nor infinity */ struct fl_t{ int mantissa; int exponent; }; fl_t decompose(float a_) { fl_t ret; unsigned int a; memcpy(&a, &a_, sizeof(float)); /* avoid aliasing */ ret.exponent = (int)((a >> 23) & 0xff) - 127; if (ret.exponent == -127) { /* denormal number */ ret.mantissa = (int)(a & 0x007fffff); } else { ret.mantissa = (int)((1 << 23) | (a & 0x007fffff)); } if (a >> 31) { ret.mantissa = -ret.mantissa; } return ret; } int distance_ulps(float a_, float b_) { fl_t a, b; a = decompose(a_); b = decompose(b_); if (a.exponent - b.exponent < -1 || a.exponent - b.exponent > 1) { return INT_MAX; } int d; if (a.exponent == b.exponent) { d = a.mantissa = b.mantissa; } else if (a.exponent > b.exponent) { d = 2 * a.mantissa - b.mantissa; } else { d = 2 * b.mantissa - a.mantissa; } return d > 0 ? d : -d; } ``` The algorithm for computing $\mathcal{D} - 1$ follows: ```c int d(float a_, float b_) { float absa = fabsf(a_); float absb = fabsf(b_); /* ensure that |a_| <= |b_| */ if (absb < absa) { float tmp = absa; absa = absb; absb = tmp; } fl_t a = decompose(absa); int q = 0; for (q = 0; q <= 23 && (2 << q) <= a.mantissa; q++); int ulps = distance_ulps(a_, b_); int lu; for (lu = 0; lu <= 30 && (2 << (lu + 1)) <= a.mantissa; lu++); return q - (lu + 1) - 1; } ``` ## What we really do in the tests As said above buggy intrinsics can be easily found. But the bugs appears for corner cases typically involving NaNs and/or infinities. 
But according to the philosophy of NSIMD, it is not the job of its standard operators to propose a non buggy alternative to a buggy intrinsics. But we still have the problem of testing. A consequence of the philosophy of NSIMD is that we only have to test that intrinsics are correctly wrapped. We can reasonably assume that testing for floatting point numbers on only normal numbers is more than sufficient. Moreover, an implementation (buggy or not), may have different parameters set that controls how floatting point arithmetic is done on various components of the chip. An non exhaustive list includes: - Rounding modes (which is not controlled by NSIMD as it is a library) - FTZ/DAZ (flush to zero) denormal values never appear. - FTZ/DAZ on some components (SIMD parts) and not others (scalar parts) - Non IEEE behavior (eg. some NVIDIA GPU and ARMv7 chips) - A mix of the above - A buggy mix of the above As a consequence we do not compare floats using the operator `=` nor do we use a weird-buggy formula involving the machine epsilon. Instead we use the algorithm above to make sure that the first bits are correct. More precisely we use the following algorithm and its variants for float16 and doubles where `ufp` stands for `units in the first place`. ```c /* a_ and b_ must be IEEE754 and normal numbers */ int ufps(float a_, float b_) { unsigned int a, b; memcpy(&a, &a_, 4); memcpy(&b, &b_, 4); int ea = (int)((a >> 23) & 0xff); int eb = (int)((b >> 23) & 0xff); if (ea - eb > 1 || ea - eb < -1) { return 0; } int ma = (int)(a & 0x007fffff); int mb = (int)(b & 0x007fffff); int d = 0; if (ea == eb) { d = ma - mb; } else if (ea > eb) { d = 2 * ma - mb; } else { d = 2 * mb - ma); } d = (d >= 0 ? 
d : -d); int i = 0; for (; i < 30 && d >= (1 << i); i++); return 23 - i; } ``` ================================================ FILE: doc/markdown/memory.md ================================================ # Memory functions Although the purpose of NSIMD is not to provide a full memory container library, it provides some helper functions to facilitate the end-user. The functions below only deals with CPUs. If your needs concerns GPUs or memory transfers between CPUs and GPUs see the [memory management module](module_memory_management_overview.md). ## Memory functions available in C and C++ - `void *nsimd_aligned_alloc(nsimd_nat n);` Returns a pointer to `n` bytes of aligned memory. It returns NULL is an error occurs. - `void nsimd_aligned_free(void *ptr);` Frees the memory pointed to by `ptr`. ## Memory functions available in C++ - `void *nsimd::aligned_alloc(nsimd_nat n);` Returns a pointer to `n` bytes of aligned memory. It returns NULL is an error occurs. - `void nsimd::aligned_free(void *ptr);` Frees the memory pointed to by `ptr`. - `template T *nsimd::aligned_alloc_for(nsimd_nat n);` Returns a pointer to `n` `T`'s of aligned memory. It returns NULL is an error occurs. - `template void nsimd::aligned_free_for(void *ptr);` Free memory pointed to by `ptr`. ## C++ allocators for `std::vector`'s NSIMD provides C++ allocators so that memory used by C++ container such as `std::vector`'s will be suitably aligned in memory. - `template class nsimd::allocator;` The class for allocating aligned memory inside C++ containers. Exemple: ```c++ #include int main() { int n = // number of float's to allocate std::vector > myvector(size_t(n)); // In what follows ptr is a pointer suitably aligned for the current SIMD // targeted architecture. float *ptr; // C++98 ptr = &myvector[0]; // C++11 and above ptr = myvector.data(); } ``` As there is no portable way of having aligned scoped memory, one can use the NSIMD allocators to emulate such memory. 
```c++ #include template void test() { std::vector > mem(size_t(N)); T *ptr; // C++98 ptr = &mem[0]; // scoped aligned memory // C++11 and above ptr = mem.data(); // scoped aligned memory } int main() { test(); test(); } ``` ## C++ scoped memory allocation NSIMD provides a struct helper for the user to allocate a chunk of memory and don't care about its release. It uses C++ RAII. ```c++ namespace nsimd { template class scoped_aligned_mem_for { template scoped_aligned_mem(I n); // Construct a struct an array of n T's. T *get(); // Return the pointer to access memory. }; } int main() { // Allocates 1024 floats in memory. It will be freed when the function (or // the program) terminates. nsimd::scoped_aligned_mem_for buffer(1024); return 0; } ``` ================================================ FILE: doc/markdown/modules/.gitignore ================================================ */api*.md ================================================ FILE: doc/markdown/modules/fixed_point/overview.md ================================================ # NSIMD fixed point module ## Description This module implements a fixed-point numbers support for the `nsimd` library. Fixed-point numbers are integer types used to represent decimal numbers. A number `lf` of bits are used to encode its integer part, and `rt` bits are used to encode its fractional part. The fixed_point module uses the templated type `nsimd::fixed_point::fp_t` to represent a fixed_point number. All the basic floating-point arithmetic operaors have been defined, therefore fp_t elements can be manipulated as normal numbers. The fixed_point module will use a `int8_t`, `int16_t`, or `int32_t` integer type for storage, depending on the value of `lf + 2 * rt`. All the functions of the module are under the namespace `nsimd::fixed_point`, and match the same interface than `nsimd`. The `fp_t` struct type is defined in `fixed.hpp`, and the associated simd `fpsimd_t` struct type is defined in `simd.hpp`. 
The modules redefines the `nsimd` pack type for fixed-point numbers, templated with `lf` and `rt` : ```C++ namespace nsimd { namespace fixed_point { template struct pack; } // namespace fixed_point } // namespace nsimd ``` Then, the pack can be manipulated as an `nsimd` pack like other scalar types. ## Compatibility The fixed point module is a C++ only API, compatible with the C++98 standard. It has the same compilers and hardware support than the main `nsimd` API (see the [API index](../../index.md)). ## Example Here is a minimal example(main.cpp) : ```C++ #include #include #include #include float rand_float() { return 4.0f * ((float) rand() / (float) RAND_MAX) - 2.0f; } int main() { // We use fixed point numbers with 8 bits of integer part and 8 bits of // decimal part. It will use a 32 bits integer for internal storage. typedef nsimd::fixed_point::fp_t<8, 8> fp_t; typedef nsimd::fixed_point::pack fp_pack_t; const size_t v_size = nsimd::fixed_point::len(fp_t()); fp_t *input0 = (fp_t*)malloc(v_size * sizeof(fp_t)); fp_t *input1 = (fp_t *)malloc(v_size * sizeof(fp_t)); fp_t *res = (fp_t *)malloc(v_size * sizeof(fp_t)); // Input and output initializations for(size_t i = 0; i < nsimd::fixed_point::len(fp_t()); i++) { input0[i] = fp_t(rand_float()); input1[i] = fp_t(rand_float()); } fp_pack_t v0 = nsimd::fixed_point::loadu(input0); fp_pack_t v1 = nsimd::fixed_point::loadu(input1); fp_pack_t vres = nsimd::fixed_point::add(v0, v1); nsimd::fixed_point::storeu(res, vres); for(size_t i = 0; i < nsimd::fixed_point::len(fp_t()); i++) { std::cout << float(input0[i]) << " | " << float(input1[i]) << " | " << float(res[i]) << "\n"; } std::cout << std::endl; return EXIT_SUCCESS; } ``` To test with avx2 run : ```bash export NSIMD_ROOT= g++ -o main -I$NSIMD_ROOT/include -mavx2 -DNSIMD_AVX2 main.cpp ./main ``` The console output will look like this : ```console $>./main 1.35938 | -0.421875 | 0.9375 1.13281 | 1.19531 | 2.32812 1.64844 | -1.21094 | 0.4375 -0.660156 | 1.07422 | 
0.414062 -0.890625 | 0.214844 | -0.675781 -0.0898438 | 0.515625 | 0.425781 -0.539062 | 0.0546875 | -0.484375 1.80859 | 1.66406 | 3.47266 ``` ================================================ FILE: doc/markdown/pack.md ================================================ # NSIMD pack and related functions The advanced C++ API provides types that represents SIMD registers. These types are struct that allows NSIMD to define infix operators. In this page NSIMD concepts are reported in the documentation but you can think of them as usual `typename`s. ## The Pack type ```c++ template struct pack { // Typedef to retrieve the native SIMD type typedef typename simd_traits::simd_vector simd_vector; // Typedef to retrieve T typedef T value_type; // Typedef to retrieve SimdExt typedef SimdExt simd_ext; // Static member to retrive N static const int unroll = N; // Ctor that splats `s`, the resulting vector will be [s, s, s, ...] template pack(S const &s); // Ctor that takes a SIMD vector of native type // ONLY AVAILABLE when N == 1 pack(simd_vector v); // Retrieve the underlying native SIMD vector // ONLY AVAILABLE when N == 1 simd_vector native_register() const; }; ``` Example: ```c++ #include #include int main() { nsimd::pack v(2.0f); std::cout << v << '\n'; vf32 nv = v.native_register(); nv = nsimd::add(nv, nv, f32()); std::cout << nsimd::pack(nv) << '\n'; return 0; } ``` ### Infix operators available for packs - `pack operator+(pack const &, pack const &);` - `pack operator*(pack const &, pack const &);` - `pack operator-(pack const &, pack const &);` - `pack operator/(pack const &, pack const &);` - `pack operator-(pack const &);` - `pack operator|(pack const &, pack const &);` - `pack operator^(pack const &, pack const &);` - `pack operator&(pack const &, pack const &);` - `pack operator~(pack const &);` - `pack operator<<(pack const &, int);` (only available for integers) - `pack operator>>(pack const &, int);` (only available for integers) ### Assignment operators available 
for packs - `pack operator+=(pack const &);` - `pack operator-=(pack const &);` - `pack operator*=(pack const &);` - `pack operator/=(pack const &);` - `pack &operator|=(pack const &other);` - `pack &operator&=(pack const &other);` - `pack &operator^=(pack const &other);` - `pack &operator<<=(int);` - `pack &operator>>=(int);` ### Function aliases The C++ standard provides functions with different names that does exactly the same thing. This is due to the retro compatibility with C. Take the `fmin` C function as an example. In C this function give the minimum between doubles only. The C++ standard provides overloads to this function so that it can work on floats and long doubles. The aliases provided by NSIMD have the same purpose but they are not provided as operator on their own because their real purpose is to write generic code that can work on scalar and SIMD vector types. As such they are only relevant for the advanced C++ API. - `pack fmin(pack const &, pack const &);` - `pack fmax(pack const &, pack const &);` - `pack fabs(pack const &);` They are contained in the `nsimd/cxx_adv_api_aliases.hpp` header and not provided by default to respect the philosophy of NSIMD which is force the use to think different between SIMD code and scalar code. They are provided automatically when including `nsimd/nsimd-all.hpp`. ## The Packl type ```c++ template struct packl { // Typedef to retrieve the native SIMD type typedef typename simd_traits::simd_vectorl simd_vectorl; // Typedef to retrieve T typedef T value_type; // Typedef to retrieve SimdExt typedef SimdExt simd_ext; // Static member to retrive N static const int unroll = N; // Ctor that splats `s`, the resulting vector will be [s, s, s, ...] 
template packl(S const &s); // Ctor that takes a SIMD vector of native type // ONLY AVAILABLE when N == 1 packl(simd_vectorl v); // Retrieve the underlying native SIMD vector // ONLY AVAILABLE when N == 1 simd_vector native_register() const; }; ``` Example: ```c++ #include #include int main() { nsimd::pack v(2.0f); nsimd::packl mask; mask = nsimd::eq(v, v); std::cout << v << '\n'; mask = nsimd::neq(v, v); std::cout << v << '\n'; return 0; } ``` ### Infix operators involving packls - `packl operator&&(packl const &, packl const &);` - `packl operator||(packl const &, packl const &);` - `packl operator!(packl const &, packl const &);` - `packl operator==(pack const &, pack const &);` - `packl operator!=(pack const &, pack const &);` - `packl operator<(pack const &, pack const &);` - `packl operator<=(pack const &, pack const &);` - `packl operator>(pack const &, pack const &);` - `packl operator>=(pack const &, pack const &);` ## Packs for SoA/AoS Types containing several SIMD vectors are also provided to help the user manipulate arrays of structures. When working, let's say, on complex numbers, loading them from memory with layout `RIRIRIRIRIRI...` can be done with the `load2*` operators that will returns 2 SIMD vectors `RRRR` and `IIII` where `R` stands for real part and `I` for imaginary part. Similarily loading an RGB image from memory stored following the layout `RGBRGBRGBRGB...` can be done with `load3*` to get 3 SIMD vectors `RRRR`, `GGGG` and `BBBB`. 
### Packx1 ```c++ template NSIMD_STRUCT packx1 { // Usual typedefs and static members typedef typename simd_traits::simd_vector simd_vector; typedef T value_type; typedef SimdExt simd_ext; static const int unroll = N; static const int soa_num_packs = 1; // Member v0 for reading and writing pack v0; }; ``` ### Packx2 ```c++ template NSIMD_STRUCT packx2 { // Usual typedefs and static members typedef typename simd_traits::simd_vector simd_vector; typedef T value_type; typedef SimdExt simd_ext; static const int unroll = N; static const int soa_num_packs = 2; // Members for reading and writing pack v0; pack v1; }; ``` ### Packx3 ```c++ template NSIMD_STRUCT packx3 { // Usual typedefs and static members typedef typename simd_traits::simd_vector simd_vector; typedef T value_type; typedef SimdExt simd_ext; static const int unroll = N; static const int soa_num_packs = 3; // Members for reading and writing pack v0; pack v1; pack v2; }; ``` ### Packx4 ```c++ template NSIMD_STRUCT packx4 { // Usual typedefs and static members typedef typename simd_traits::simd_vector simd_vector; typedef T value_type; typedef SimdExt simd_ext; static const int unroll = N; static const int soa_num_packs = 4; // Members for reading and writing pack v0; pack v1; pack v2; pack v3; }; ``` ### Functions involving packx2, packx3 and packx4 The following functions converts packxs into unrolled packs. The difference between the `to_pack` and `to_pack_interleave` families of functions is in the way they flatten (or deinterleave) the structure of SIMD vectors. 
```c++ template pack to_pack(const packx2 &); template pack to_pack(const packx3 &); template pack to_pack(const packx4 &); template pack to_pack_interleave(const packx2 &); template pack to_pack_interleave(const packx3 &); template pack to_pack_interleave(const packx4 &); ``` The `to_pack` family of functions performs the following operations: ``` packx2 = | v0 = [u0 u1 u2] | ---> [u0 u1 u2 w0 w1 w2] = pack | v1 = [w0 w1 w2] | ``` while the `to_pack_interleave` family of functions does the following: ``` packx2 = | v0 = [u0 u1 u2] | ---> [u0 w0 v1 w1 v2 w2] = pack | v1 = [w0 w1 w2] | ``` ================================================ FILE: doc/markdown/tutorial.md ================================================ # NSIMD tutorial In this tutorial we will write and compile a simple SIMD kernel to become familiar with the basics of NSIMD. We will also see different aspects of SIMD programming: - aligned vs. unaligned data access - basic SIMD arithmetic - SIMD loops - SIMD branching - architecture selection at runtime ## SIMD basics SIMD programming means using the CPU SIMD registers to performs operations on several data at once. A SIMD vector should be viewed as a set of bits which are interpreted by the operators that operate on them. Taking a 128-bits wide SIMD register, it can be interpreted as: - 16 signed/unsigned chars - 8 signed/unsigned shorts - 4 signed/unsigned ints - 4 floats - 2 signed/unsigned longs - 2 doubles as shown in the picture below. ![Register layout](img/register.png) ## Computation kernel We will explain the rewriting of the following kernel which uppercases ASCII letters only. @[INCLUDE_CODE:L7:L16](../../examples/tutorial.cpp) Here is the corresponding SIMD version. Explanations to follow. 
@[INCLUDE_CODE:L18:L39](../../examples/tutorial.cpp) ## Getting started with NSIMD All APIs of NSIMD core is available with this include: @[INCLUDE_CODE:L1:L1](../../examples/tutorial.cpp) For ease of programming with use the NSIMD namespace inside the `uppercase_simd` function. @[INCLUDE_CODE:L20:L20](../../examples/tutorial.cpp) ## SIMD vectors A `nsimd::pack` can be considered analogous to a SIMD register (on your or any other machine). Operations performed on packs - from elementary operations such as addition to complicated functions such as `nsimd::rsqrt11(x)` - will be performed using SIMD registers and operations if supported by your hardware. As shown below, data must be manually loaded into and stored from these registers. Again, for ease of programming we typedef a pack of T's. @[INCLUDE_CODE:L21:L21](../../examples/tutorial.cpp) NSIMD provides another type of pack called `nsimd::packl` which handles vectors of booleans. @[INCLUDE_CODE:L22:L22](../../examples/tutorial.cpp) This distinction between pack's and packl's is necessary ffor two reasons: - On recent hardware, SIMD vectors of booleans are handled by dedicated registers. - Pack and Packl must have different semantics as arithmetic operators on booleans have no sense as well as logical operators on Pack's. ## Loading data from memory One way to construct a `nsimd::pack` is to simply declare (default-construct) it. Such a pack may *not* be zero-initialized and thus may *contain arbitrary values*. Another way to construct a `nsimd::pack` is to fill it with a single value. This so-called splatting constructor takes one scalar value and replicates it in all elements of the pack. But most common usage to construct a `nsimd::pack` is by using the copy constructor from loading functions. @[INCLUDE_CODE:L27:L27](../../examples/tutorial.cpp) ## Aligned vs. unaligned memory Alignement of a given pointer `ptr` to memory to some value `A` means that `ptr % A == 0`. 
On older hardware loading data from unaligned memory can result in performance penalty. On recent hardware it is hard to exhibit a difference. NSIMD provides two versions of "load": - `loada` for loading data from aligned memory - `loadu` for loading data from unaligned momery Note that using `loada` on unaligned pointer may result in segfaults. As recent hardware have good support for unaligned memory we use `loadu`. @[INCLUDE_CODE:L27:L27](../../examples/tutorial.cpp) To ensure that data allocated by `std::vector` is aligned, NSIMD provide a C++ allocator. ```c++ std::vector > data; ``` When loading data from memory you must ensure that there is sufficient data in the block of memory you load from to fill a `nsimd::pack`. For example, on an `AVX` capable machine, a SIMD vector of `float` (32 bits) contains 8 elements. Therefore, there must be at least 8 floats in the memory block you load data from otherwise loading may result in segfaults. More on this below. ## Operations on pack's and packl's Once initialized, `nsimd::pack` instances can be used to perform arithmetic. Usual operations are provided by NSIMD such: - addition - substraction - multiplication - division - square root - bitwise and/or/xor - ... @[INCLUDE_CODE:L28:L29](../../examples/tutorial.cpp) C++ operators are also overloaded for pack's and packl's as well as between pack's and scalars or packl's and booleans. 
## SIMD branching NSIMD provide the `if_else` operator which fill the output, lane by lane, according to the lane value of its first argument: - if it is true, the output lane will be filled with the second argument's lane - if it is false, the output lane will be filled with the third argument's lane Therefore the branching: @[INCLUDE_CODE:L10:L14](../../examples/tutorial.cpp) will be rewritten as @[INCLUDE_CODE:L28:L30](../../examples/tutorial.cpp) or as a one liner @[INCLUDE_CODE:L36:L36](../../examples/tutorial.cpp) ## SIMD loops A SIMD loop is similar to its scalar counterpart except that instead of going through data one element at a time it goes 4 by 4 or 8 by 8 elements at a time. More precisely SIMD loops generally goes from steps equal to pack's length. Therefore the scalar loop @[INCLUDE_CODE:L9:L9](../../examples/tutorial.cpp) is rewritten as @[INCLUDE_CODE:L23:L26](../../examples/tutorial.cpp) Note that going step by step will only cover most of the data except maybe the tail of data in case that the number of elements is not a multiple of the Pack's length. Therefore to perform computations on the tail one has to load data from only `n` elements where `n < len()`. One can use `maskz_loadu` which will load data only on lanes that are marked as true by another argument to the function. @[INCLUDE_CODE:L35:L35](../../examples/tutorial.cpp) The mask can be computed manually but NSIMD provides a function for it. @[INCLUDE_CODE:L34:L34](../../examples/tutorial.cpp) Then the computation on the tail is exactly the same as within the loop. Put together it gives for the tail: @[INCLUDE_CODE:L34:L37](../../examples/tutorial.cpp) Then the entire loop reads as follows. @[INCLUDE_CODE:L25:L37](../../examples/tutorial.cpp) ## Compiling the Code Here is the complete listing of the code. @[INCLUDE_CODE](../../examples/tutorial.cpp) The compilation of a program using `nsimd` is like any other library. 
```bash c++ -O3 -DAVX2 -mavx2 -L/path/to/lib -lnsimd_avx2 -I/path/to/include tutorial.cpp ``` When compiling with NSIMD, you have to decide at compile time the targeted SIMD extensions, AVX2 in the example above. It is therefore necessary to give `-mavx2` to the compiler for it to emit AVX2 instructions. To tell NSIMD that AVX2 has to be used the `-DAVX2` has to be passed to the compiler. For an exhaustive list of defines controlling compilation see . There is a .so file for each SIMD extension, it is therefore necessary to link against the proper .so file. ## Runtime selection of SIMD extensions It is sometimes necessary to have several versions of a given algorithm for different SIMD extensions. This is rather to do with NSIMD. Basically the idea is to write the algorithm in a generic manner using pack's as shown above. It is then sufficient to compile the same soure file for different SIMD extensions and then link the resulting object files altogether. Suppose that a file named `uppercase.cpp` contains the following code: @[INCLUDE_CODE:L18:L38](../../examples/tutorial.cpp) This would give the following in a Makefile. ```makefile all: uppercase uppercase_sse2.o: uppercase.cpp c++ -O3 -DSSE2 -msse2 -c $? -o $@ uppercase_sse42.o: uppercase.cpp c++ -O3 -DSSE42 -msse4.2 -c $? -o $@ uppercase_avx.o: uppercase.cpp c++ -O3 -DAVX -mavx -c $? -o $@ uppercase_avx2.o: uppercase.cpp c++ -O3 -DAVX2 -mavx2 -c $? -o $@ uppercase: uppercase_sse2.o \ uppercase_sse42.o \ uppercase_avx.o \ uppercase_avx2.o main.cpp c++ $? -lnsimd_avx2 -o $@ ``` Note that `libnsimd_avx2` contains all the functions for SSE 2, SSE 4.2, AVX and AVX2. This is a consequence of the retrocompatiblity of Intel SIMD extensions. The situation is the same on ARM where `libnsimd_sve.so` will contain functions for AARCH64. There is a small caveat. The symbol name corresponding to the `uppercase_simd` function will be same for all the object files which will result in error when linking together all objects. 
To avoid this situation one can use function overloading as follows: ```c++ template void uppercase_simd(NSIMD_SIMD, T *dst, const T *src, int n) { // ... } ``` The macro `NSIMD_SIMD` will be expanded to a type containing the information on the SIMD extension currently requested by the user. This techniques is called tag dispatching and does not require *any* modification of the algorithm inside the function. Finally in `main` one has to do dispatching by using either `cpuid` of by another mean. ```c++ int main() { // what follows is pseudo-code switch(cpuid()) { case cpuid_sse2: uppercase(nsimd::sse2, dst, src, n); break; case cpuid_sse42: uppercase(nsimd::sse42, dst, src, n); break; case cpuid_avx: uppercase(nsimd::avx, dst, src, n); break; case cpuid_avx2: uppercase(nsimd::avx2, dst, src, n); break; } return 0; } ``` ================================================ FILE: doc/md2html.cpp ================================================ /* Copyright (c) 2020 Agenium Scale Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ #include #include #include #include #include // ---------------------------------------------------------------------------- // Extract lines form strings like ":L7:L42" // Returns -1 if fails std::pair extract_lines(std::string const &s) { std::pair r(-1, -1); std::vector lines = ns2::split(s, ":L"); if (lines.size() == 3 && lines[0] == "") { try { r.first = std::stoi(lines[1]); r.second = std::stoi(lines[2]); } catch (std::exception const &) { r.first = -1; r.second = -1; } } return r; } // ---------------------------------------------------------------------------- std::string callback_input_filename = ""; std::string callback_macro(std::string const &label, std::string const &url, ns2::markdown_infos_t const &markdown_infos) { std::string filename; if (ns2::startswith(label, "INCLUDE")) { filename = ns2::join_path(ns2::dirname(callback_input_filename), url); } std::string lang; if (ns2::startswith(label, "INCLUDE_CODE")) { std::string const ext = ns2::splitext(filename).second; if (ext == "sh") { lang = "Bash"; } else if (ext == "c" || ext == "h") { lang = "C"; } else if (ext == "cpp" || ext == "hpp") { lang = "C++"; } else if (ext == "py") { lang = "Python"; } } if (ns2::startswith(label, "INCLUDE_CODE:")) { std::string const lines_str = label.substr(label.find(':')); std::pair const l_first_last = extract_lines(lines_str); if (l_first_last.first == -1) { throw std::runtime_error("cannot extract first line number"); } if (l_first_last.second == -1) { throw std::runtime_error("cannot extract last line number"); } std::string out; std::string lines; { ns2::ifile_t in(filename); int num_line = 1; std::string line; while (std::getline(in, line)) { if (num_line == l_first_last.second) { lines += line; } else 
if (num_line < l_first_last.second) { if (num_line >= l_first_last.first) { lines += line + "\n"; } } else { break; } ++num_line; } } ns2::compile_markdown("```" + lang + "\n" + ns2::deindent(lines) + "\n```\n", &out, markdown_infos); return out; } if (ns2::startswith(label, "INCLUDE_CODE")) { std::string out; ns2::compile_markdown("```" + lang + "\n" + ns2::read_file(filename) + "\n```\n", &out, markdown_infos); return out; } if (ns2::startswith(label, "INCLUDE")) { ns2::ifile_t in(filename); std::ostringstream out; ns2::compile_markdown(&in, &out, markdown_infos); return out.str(); } return ""; } // ---------------------------------------------------------------------------- std::pair callback_link(std::string const &label, std::string const &url, ns2::markdown_infos_t const &markdown_infos) { if (markdown_infos.output_format != ns2::HTML) { return std::pair("", false); } std::pair root_basename_ext = ns2::splitext(url); if (root_basename_ext.second == "md") { return std::pair( ns2::html_href(root_basename_ext.first + ".html", label), true); } else { return std::pair("", false); } } // ---------------------------------------------------------------------------- int main(int argc, char **argv) { if (argc != 3) { std::cout << "Usage: " << argv[0] << " " << std::endl; return 1; } std::string const input_filename = argv[1]; std::string const output_filename = argv[2]; ns2::ifile_t input_file(input_filename); ns2::ofile_t output_file(output_filename); std::cout << "Convert \"" << input_filename << "\" to \"" << output_filename << "\"" << std::endl; callback_input_filename = input_filename; ns2::markdown_infos_t markdown_infos(ns2::HTML, callback_macro, callback_link, true); ns2::compile_markdown(&input_file, &output_file, markdown_infos); return 0; } ================================================ FILE: doc/what_is_wrapped.cpp ================================================ /* Copyright (c) 2021 Agenium Scale Permission is hereby granted, free of charge, to any 
person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ /* This little C++ program reads and parses files from NSIMD wrapping intrinsics in order to build a markdown page describing in a table which operators are just intrinsics wrapper and which one are more complicated. We only to parse C code so no need for complicated stuff. Moreover what we doo is really simple and a C parser is not needed. We replace all C delimiters by spaces, then split the resulting string into words and we get a vector of strings. 
Then search in it the function that we want (say nsimd_add_sse2_f32) along with its opening curly and closing brakets and finally: - if there is only one token then it must be an intrinsic - if there is a for then it must use emulation - if there are several tokens but no for it must be a trick using other intrinsics The produced markdown contains: - E for emulation - T for trick with other intrinsics - NOOP for noop - a link to the Intel/Arm documentation about the intrinsic otherwise Well all that to say that a few hundreds of simple C++ code is more that enough for our need and we don't need to depend on some C/C++ parser such as Clang. Note that using a real parser will be counter productive as some intrinsics are implemented as macros to compiler builtin which then appear in the AST instead of the documented intrinsics. This code is completely non-optimized and we don't care because it does not take time to execute and it is not our purpose to optimize this code. */ // ---------------------------------------------------------------------------- #include #include #include #include // ---------------------------------------------------------------------------- #define MAX_LEN (11 * 11) typedef std::map table_t; std::string type_names_str("i8,u8,i16,u16,i32,u32,i64,u64,f16,f32,f64"); std::vector types_list(ns2::split(type_names_str, ",")); const size_t not_found = ~((size_t)0); // ---------------------------------------------------------------------------- int nbits(std::string const &typ) { if (typ == "i8" || typ == "u8") { return 8; } else { return (10 * (typ[1] - '0')) + (typ[2] - '0'); } } // ---------------------------------------------------------------------------- std::vector get_types_names(std::string const &output) { std::vector const& list = types_list; if (output == "same") { return list; } std::vector ret; for (size_t i = 0; i < list.size(); i++) { for (size_t j = 0; j < list.size(); j++) { if ((output == "same_size" && nbits(list[j]) == 
nbits(list[i])) || (output == "bigger_size" && nbits(list[j]) == 2 * nbits(list[i])) || (output == "lesser_size" && 2 * nbits(list[j]) == nbits(list[i]))) { ret.push_back(list[j] + "_" + list[i]); } } } return ret; } // ---------------------------------------------------------------------------- size_t find(std::vector const &haystack, std::string const &needle, size_t i0 = 0) { for (size_t i = i0; i < haystack.size(); i++) { if (haystack[i] == needle) { return i; } } return not_found; } // ---------------------------------------------------------------------------- size_t find_by_prefix(std::vector const &needles, std::string const &haystack) { for (size_t i = 0; i < needles.size(); i++) { if (ns2::startswith(haystack, needles[i])) { return i; } } return not_found; } // ---------------------------------------------------------------------------- int is_number(std::string const &s) { for (size_t i = 0; i < s.size(); i++) { if (s[i] != 'x' && s[i] != 'l' && s[i] != 'L' && s[i] != 'u' && s[i] != 'U' && !(s[i] >= '0' && s[i] <= '9')) { return false; } } return true; } // ---------------------------------------------------------------------------- int is_macro(std::string const &s) { for (size_t i = 0; i < s.size(); i++) { if (s[i] != '_' || !(s[i] >= 'A' && s[i] <= 'Z')) { return false; } } return true; } // ---------------------------------------------------------------------------- void parse_file(std::string const &input_vars, std::string const &simd_ext, std::vector const &types_names, std::string const &op_name, std::string const &filename, table_t *table_) { table_t &table = *table_; std::string content(ns2::read_file(filename)); // replace all C delimiters by spaces except {} for (size_t i = 0; i < content.size(); i++) { const char delims[] = "()[];,:+-*/%&|!%\n\t\r"; for (size_t j = 0; j < sizeof(delims); j++) { if (content[i] == delims[j]) { content[i] = ' '; break; } } } // replace '{' by ' { ' and same for '}' in case there are some code // just 
before/after it content = ns2::replace(ns2::replace(content, "}", " } "), "{", " { "); // now split string on spaces and removes some tokens std::vector to_be_removed( ns2::split("return,signed,unsigned,char,short,int,long,float,double," "const,void,__vector,__bool,bool,vector" + type_names_str + "," + input_vars, ',')); std::vector to_be_removed_by_prefix(ns2::split( "_mm_cast,_mm256_cast,_mm512_cast,vreinterpret,svreinterpret,svptrue_", ',')); std::vector tokens; { // to free tokens0 afterwards std::vector tokens0 = ns2::split(content, ' '); for (size_t i = 0; i < tokens0.size(); i++) { // We also remove svptrue_* as they are everywhere for SVE and all // casts as they incur no opcode and are often used for intrinsics // not supporting certain types if (tokens0[i].size() == 0 || is_number(tokens0[i]) || is_macro(tokens0[i]) || find_by_prefix(to_be_removed_by_prefix, tokens0[i]) != not_found || find(to_be_removed, tokens0[i]) != not_found) { continue; } tokens.push_back(tokens0[i]); } } // finally search for intrinsics for (size_t typ = 0; typ < types_names.size(); typ++) { std::string func_name("nsimd_" + op_name + "_" + simd_ext + "_" + types_names[typ]); // find func_name size_t pos = find(tokens, func_name); if (pos == not_found) { table[op_name][typ] = "NA"; continue; } // find opening { size_t i0 = find(tokens, "{", pos); if (i0 == not_found) { std::cerr << "WARNING: cannot find opening '{' for '" << func_name << "' in '" << filename << "'\n"; table[op_name][typ] = "NA"; continue; } // find closing } size_t i1 = i0; int nest = 0; for (i1 = i0; i1 < tokens.size(); i1++) { if (tokens[i1] == "{") { nest++; } else if (tokens[i1] == "}") { nest--; } if (nest == 0) { break; } } // if there is no token inside {} then it must be a noop // if there is only one token inside {} then it must be the intrinsic // if there is a for loop then it must be emulation // if there are several tokens but no for then it must be a trick if (i0 + 1 == i1) { table[op_name][typ] = 
"NOOP"; } else if (i0 + 2 == i1 && !ns2::startswith(tokens[i0 + 1], "nsimd_")) { table[op_name][typ] = "[`" + tokens[i0 + 1] + "`]"; if (simd_ext == "neon128" || simd_ext == "aarch64") { table[op_name][typ] += "(https://developer.arm.com/architectures/instruction-sets/" "intrinsics/" + tokens[i0 + 1] + ")"; } else if (ns2::startswith(simd_ext, "sve")) { table[op_name][typ] += "(https://developer.arm.com/documentation/100987/0000)"; } else if (simd_ext == "sse2" || simd_ext == "sse42" || simd_ext == "avx" || simd_ext == "avx2" || simd_ext == "avx512_knl" || simd_ext == "avx512_skylake") { table[op_name][typ] += "(https://software.intel.com/sites/landingpage/" "IntrinsicsGuide/#text=" + tokens[i0 + 1] + ")"; } else if (simd_ext == "vsx" || simd_ext == "vmx") { table[op_name][typ] += "(https://www.ibm.com/docs/en/xl-c-aix/13.1.3?topic=functions-" + ns2::replace(tokens[i0 + 1], "_", "-") + ")"; } } else { if (find(std::vector(tokens.begin() + i0, tokens.begin() + (i1 + 1)), "for") != not_found) { table[op_name][typ] = "E"; } else { table[op_name][typ] = "T"; } } } } // ---------------------------------------------------------------------------- std::string md_row(int nb_col, std::string const &cell_content) { std::string ret("|"); for (int i = 0; i < nb_col; i++) { ret += cell_content + "|"; } return ret; } // ---------------------------------------------------------------------------- int main(int argc, char **argv) { if ((argc % 2) != 0 || argc <= 5) { std::cout << "Usage: " << argv[0] << " a0,a1,a2 simd_ext output_type operator1 file1 operator2 file2 " "...\n" << "where output_type is (same|same_size|bigger_size|lesser_size)" << std::endl; return 1; } std::string input_vars(argv[1]); std::string simd_ext(argv[2]); std::string output_type(argv[3]); std::vector types_names = get_types_names(output_type); table_t table; for (int i = 4; i < argc; i += 2) { parse_file(input_vars, simd_ext, types_names, argv[i], argv[i + 1], &table); } for (table_t::const_iterator it = 
table.begin(); it != table.end(); it++) { std::cout << "## " << it->first << "\n\n"; if (output_type == "same") { const std::string(&row)[MAX_LEN] = it->second; for (size_t i = 0; i < types_list.size(); i++) { std::cout << "- " << it->first << " on **" << types_list[i] << "**: " << row[i] << "\n"; } std::cout << "\n\n"; } else { const std::string(&row)[MAX_LEN] = it->second; for (size_t i = 0; i < types_list.size(); i++) { for (size_t j = 0; j < types_list.size(); j++) { std::string cell_content; std::string typ(types_list[j] + "_" + types_list[i]); for (size_t k = 0; k < types_names.size(); k++) { if (typ == types_names[k]) { cell_content = row[k]; break; } } if (cell_content.size() > 0) { std::cout << "- " << it->first << " from **" << types_list[i] << "** to **" << types_list[j] << "**: " << cell_content << "\n"; } } std::cout << "\n"; } std::cout << "\n"; } } return 0; } ================================================ FILE: egg/__init__.py ================================================ # Copyright (c) 2019 Agenium Scale # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal # in the Software without restriction, including without limitation the rights # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell # copies of the Software, and to permit persons to whom the Software is # furnished to do so, subject to the following conditions: # # The above copyright notice and this permission notice shall be included in all # copies or substantial portions of the Software. # # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. from . import operators ================================================ FILE: egg/common.py ================================================ # Use utf-8 encoding # -*- coding: utf-8 -*- # Copyright (c) 2020 Agenium Scale # # permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal # in the Software without restriction, including without limitation the rights # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell # copies of the Software, and to permit persons to whom the Software is # furnished to do so, subject to the following conditions: # # The above copyright notice and this permission notice shall be included in all # copies or substantial portions of the Software. # # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. # ----------------------------------------------------------------------------- # What does this script? # ---------------------- # # This is only a python module that holds what is shared by `generate.py`, # the `platform_*.py` files and all other python code in `egg`. If contains # the list of supported types, functions, operators, and some useful helper # functions such as the python equivalent of `mkdir -p`. 
# ----------------------------------------------------------------------------- # Import section import math import os import sys import io import collections import platform import string import shutil import math # ----------------------------------------------------------------------------- # print def myprint(opts, obj): if opts.list_files: return print('-- {}'.format(obj)) # ----------------------------------------------------------------------------- # check if file exists def can_create_filename(opts, filename): if opts.list_files: print(filename) return False if opts.verbose: sys.stdout.write('-- {}: '.format(filename)) if os.path.isfile(filename) and not opts.force: if opts.verbose: sys.stdout.write('skipping\n') return False elif opts.force: if opts.verbose: sys.stdout.write('creating (forced)\n') return True else: if opts.verbose: sys.stdout.write('creating (missing)\n') return True # ----------------------------------------------------------------------------- # open with UTF8 encoding def open_utf8(opts, filename): dummy, ext = os.path.splitext(filename) if ext.lower() in ['.c', '.h', '.cpp', '.hpp', '.cc', '.cxx', '.hxx', '.hpp']: begin_comment = '/*' end_comment = '*/' elif ext.lower() in ['.md', '.htm', '.html']: begin_comment = '' else: begin_comment = None with io.open(filename, mode='w', encoding='utf-8') as fout: if begin_comment is not None: if opts.simple_license: fout.write('''{} Copyright (c) 2021 Agenium Scale {} '''.format(begin_comment, end_comment)) else: fout.write('''{} Copyright (c) 2021 Agenium Scale Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The 
above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. {} '''.format(begin_comment, end_comment)) fout.write('{} This file has been auto-generated {}\n\n'.\ format(begin_comment, end_comment)) return io.open(filename, mode='a', encoding='utf-8') # ----------------------------------------------------------------------------- # clang-format def clang_format(opts, filename, cuda=False): with io.open(filename, 'a', encoding='utf-8') as fout: fout.write('\n') if not opts.enable_clang_format: # TODO: not sure if needed to implement a smarter call to clang-format if cuda: os.system('clang-format -style="{{ Standard: Cpp11 }}" -i {}'. \ format(filename)) else: os.system('clang-format -style="{{ Standard: Cpp03 }}" -i {}'. 
\ format(filename)) if cuda: shutil.copyfile(filename, filename[:-4] + '.cu') # ----------------------------------------------------------------------------- # Not implemented response NOT_IMPLEMENTED = 'abort();' # ----------------------------------------------------------------------------- # C/C++ comment hbar hbar = '/* ' + ('-' * 73) + ' */' # ----------------------------------------------------------------------------- # Convert constants for operators OUTPUT_TO_SAME_TYPE = 0 OUTPUT_TO_SAME_SIZE_TYPES = 1 OUTPUT_TO_UP_TYPES = 2 OUTPUT_TO_DOWN_TYPES = 3 # ----------------------------------------------------------------------------- # SIMD type x86_simds = [ 'sse2', 'sse42', 'avx', 'avx2', 'avx512_knl', 'avx512_skylake', ] arm_simds = [ 'neon128', 'aarch64', 'sve', 'sve128', 'sve256', 'sve512', 'sve1024', 'sve2048' ] ppc_simds = [ 'vmx', 'vsx', ] simds = ['cpu'] + x86_simds + arm_simds + ppc_simds simds_deps = { 'cpu': ['cpu'], 'sse2': ['cpu', 'sse2'], 'sse42': ['cpu', 'sse2', 'sse42'], 'avx': ['cpu', 'sse2', 'sse42', 'avx'], 'avx2': ['cpu', 'sse2', 'sse42', 'avx', 'avx2'], 'fma4': [], 'avx512_knl': ['cpu', 'sse2', 'sse42', 'avx', 'avx2', 'avx512_knl'], 'avx512_skylake': ['cpu', 'sse2', 'sse42', 'avx', 'avx2', 'avx512_skylake'], 'neon128': ['cpu', 'neon128'], 'aarch64': ['cpu', 'aarch64'], 'sve': ['cpu', 'aarch64', 'sve'], 'sve128': ['cpu', 'aarch64', 'sve128'], 'sve256': ['cpu', 'aarch64', 'sve256'], 'sve512': ['cpu', 'aarch64', 'sve512'], 'sve1024': ['cpu', 'aarch64', 'sve1024'], 'sve2048': ['cpu', 'aarch64', 'sve2048'], 'vmx': ['cpu', 'vmx'], 'vsx': ['cpu', 'vmx', 'vsx'] } ftypes = ['f64', 'f32', 'f16'] ftypes_no_f16 = ['f64', 'f32'] itypes = ['i64', 'i32', 'i16', 'i8'] utypes = ['u64', 'u32', 'u16', 'u8'] iutypes = itypes + utypes types = ftypes + iutypes def logical(typ): return 'l{}'.format(typ) signed_type = { 'i8': 'i8', 'u8': 'i8', 'i16': 'i16', 'u16': 'i16', 'i32': 'i32', 'u32': 'i32', 'i64': 'i64', 'u64': 'i64', 'f16': 'f16', 'f32': 'f32', 'f64': 
'f64' } bitfield_type = { 'i8': 'u8', 'u8': 'u8', 'i16': 'u16', 'u16': 'u16', 'i32': 'u32', 'u32': 'u32', 'i64': 'u64', 'u64': 'u64', 'f16': 'u16', 'f32': 'u32', 'f64': 'u64' } in0 = 'a0' in1 = 'a1' in2 = 'a2' in3 = 'a3' in4 = 'a4' in5 = 'a5' CPU_NBITS = 128 if CPU_NBITS != 128: raise ValueError('CPU_NBITS must be 128') def get_arg(i): fmtspec = { 'in0': in0, 'in1': in1, 'in2': in2, 'in3': in3, 'in4': in4, 'in5': in5 } return '{{in{}}}'.format(i).format(**fmtspec) def get_args(n): fmtspec = { 'in0': in0, 'in1': in1, 'in2': in2, 'in3': in3, 'in4': in4, 'in5': in5 } return ', '.join(['{{in{}}}'.format(i).format(**fmtspec) \ for i in range(0, n)]) def get_simds_deps_from_opts(opts): simds = set() for simd1 in opts.simd: for simd2 in simds_deps[simd1]: simds.add(simd2) return simds def bitsize(typ): if not (typ in types): raise ValueError('Unknown type "{}"'.format(typ)) return int(typ[1:]) def sizeof(typ): return bitsize(typ) // 8 def ilog2(x): if x <= 0: return None for i in range(0, x): if 2 ** (i + 1) > x: return i #def get_same_size_types(typ): # nbits = typ[1:] # if typ in ['i8' ,'u8']: # return ['i8', 'u8'] # else: # return ['i' + nbits, 'u' + nbits, 'f' + nbits] def get_output_types(from_typ, output_to): if output_to == OUTPUT_TO_SAME_TYPE: return [from_typ] else: nbits = from_typ[1:] if output_to == OUTPUT_TO_SAME_SIZE_TYPES: if from_typ in ['i8' ,'u8']: return ['i8', 'u8'] else: return ['i' + nbits, 'u' + nbits, 'f' + nbits] elif output_to == OUTPUT_TO_UP_TYPES: if nbits == '64': raise ValueError('No uptype for ' + from_typ) else: n = str(int(nbits) * 2) return ['i' + n, 'u' + n, 'f' + n] elif output_to == OUTPUT_TO_DOWN_TYPES: n = str(int(nbits) // 2) if nbits == '8': raise ValueError('No downtype for ' + from_typ) elif nbits == '16': return ['i' + n, 'u' + n] else: return ['i' + n, 'u' + n, 'f' + n] else: raise ValueError('Invalid argument for "output_to": {}'. 
\ format(output_to)) # ----------------------------------------------------------------------------- # mkdir -p (avoid a dependency for just one function) def mkdir_p(path): if os.path.isdir(path): return path head, tail = os.path.split(path) if head != '': mkdir_p(head) os.mkdir(path) return path # ----------------------------------------------------------------------------- # Replacement of enumerate def enum(l): ret = [] for i in range(0, len(l)): ret.append([i, l[i]]) return ret # ----------------------------------------------------------------------------- # List of supported SIMD operators/functions # v = SIMD vector parameter # vi = SIMD vector of signed integers parameter # vx2 = struct of 2 SIMD vector parameters # vx3 = struct of 3 SIMD vector parameters # vx4 = struct of 4 SIMD vector parameters # l = SIMD vector of logicals parameter # s = Scalar parameter # * = Pointer to scalar parameter # c* = Pointer to const scalar parameter # _ = void (only for return type) # p = Parameter (int) # ----------------------------------------------------------------------------- # Type generators def get_one_type_generic(param, typ): if param == '_': return 'void' elif param == 'p': return 'int' elif param == 's': return typ elif param == '*': return '{}*'.format(typ) elif param == 'c*': return '{} const*'.format(typ) elif param == 'vi': return 'vi{}'.format(typ[1:]) elif param == 'v': return 'v{}'.format(typ) elif param == 'vx2': return 'v{}x2'.format(typ) elif param == 'vx3': return 'v{}x3'.format(typ) elif param == 'vx4': return 'v{}x4'.format(typ) elif param == 'l': return 'vl{}'.format(typ) else: raise ValueError("Unknown param '{}'".format(param)) def get_one_type_specific(param, ext, typ): if param == '_': return 'void' elif param == 'p': return 'int' elif param == 's': return typ elif param == '*': return '{}*'.format(typ) elif param == 'c*': return '{} const*'.format(typ) elif param == 'vi': return 'nsimd_{}_vi{}'.format(ext, typ[1:]) elif param == 'v': return 
'nsimd_{}_v{}'.format(ext, typ) elif param == 'vx2': return 'nsimd_{}_v{}x2'.format(ext, typ) elif param == 'vx3': return 'nsimd_{}_v{}x3'.format(ext, typ) elif param == 'vx4': return 'nsimd_{}_v{}x4'.format(ext, typ) elif param == 'l': return 'nsimd_{}_vl{}'.format(ext, typ) else: raise ValueError("Unknown param '{}'".format(param)) def get_one_type_pack(param, inout, N): if param == '_': return 'void' if param == 'p': return 'int' if param == '*': return 'T*' if param == 'c*': return 'T const*' if param == 's': return 'T' if param in ['v', 'vx2', 'vx3', 'vx4']: if inout == 0: return 'pack const&'.format(N) else: return 'pack'.format(N) if param == 'vi': if inout == 0: return 'pack::itype, {}, SimdExt> const&'. \ format(N) else: return 'pack::itype, {}, SimdExt>'.format(N) if param == 'l': if inout == 0: return 'packl const&'.format(N) else: return 'packl'.format(N) raise ValueError("Unknown param '{}'".format(param)) def get_one_type_generic_adv_cxx(param, T, N): if param == '_': return 'void' elif param == 'p': return 'int' elif param == '*': return '{}*'.format(T) elif param == 'c*': return '{} const*'.format(T) elif param == 's': return T elif param == 'v': return 'pack<{}, {}, SimdExt>'.format(T, N) elif param == 'vi': return 'pack'.format(T[1:], N) elif param == 'vx2': return 'packx2<{}, {}, SimdExt>'.format(T, N) elif param == 'vx3': return 'packx3<{}, {}, SimdExt>'.format(T, N) elif param == 'vx4': return 'packx4<{}, {}, SimdExt>'.format(T, N) elif param == 'l': return 'packl<{}, {}, SimdExt>'.format(T, N) else: raise ValueError('Unknown param: "{}"'.format(param)) def get_one_type_scalar(param, t): if param == '_': return 'void' elif param in ['p', 'l']: return 'int' elif param in ['s', 'v']: return t else: raise ValueError('Unknown param: "{}"'.format(param)) def get_first_discriminating_type(params): for i in range(len(params)): if params[i] in ['v', 'l', 'vx2', 'vx3', 'vx4']: return i return -1 # 
----------------------------------------------------------------------------- # Formats def pprint_lines(what): return '\n'.join(what) def pprint_commas(what): return ', '.join(what) def pprint_includes(what): return pprint_lines('#include {}'.format(i) for i in what) # ----------------------------------------------------------------------------- # Function parsing signatures def parse_signature(signature): l = signature.split(' '); name = l[1] if len(l) > 2: params = [l[0]] + l[2:] else: params = [l[0]] return (name, params) # ----------------------------------------------------------------------------- # Load platforms def get_platforms(opts): if opts.platforms_list != None: return opts.platforms_list ret = dict() path = opts.script_dir myprint(opts, 'Searching platforms in "{}"'.format(path)) for mod_file in os.listdir(path): if mod_file[-3:] == '.py' and mod_file[0:9] == 'platform_': mod_name = mod_file[:-3] myprint(opts, 'Found new platform: {}'.format(mod_name[9:])) ret[mod_name[9:]] = __import__(mod_name) opts.platforms_list = ret return ret # ----------------------------------------------------------------------------- # Find modules def get_modules(opts): if opts.modules_list != None: return opts.modules_list ret = dict() # We have one module by directory path = os.path.join(opts.script_dir, 'modules') myprint(opts, 'Searching modules in "{}"'.format(path)) for module_dir in os.listdir(path): if (not os.path.isdir(os.path.join(path, module_dir))) or \ module_dir == '.' or module_dir == '..' 
or \ (not os.path.exists(os.path.join(path, module_dir, 'hatch.py'))): continue myprint(opts, 'Found new module: {}'.format(module_dir)) mod = __import__('modules.{}.hatch'.format(module_dir)) ret[module_dir] = mod opts.modules_list = ret return ret # ----------------------------------------------------------------------------- # Integer limits per type using macros defined in or limits = { 'i8': {'min': 'NSIMD_I8_MIN', 'max': 'NSIMD_I8_MAX' }, 'i16': {'min': 'NSIMD_I16_MIN', 'max': 'NSIMD_I16_MAX' }, 'i32': {'min': 'NSIMD_I32_MIN', 'max': 'NSIMD_I32_MAX' }, 'i64': {'min': 'NSIMD_I64_MIN', 'max': 'NSIMD_I64_MAX' }, 'u8': {'min': 'NSIMD_U8_MIN', 'max': 'NSIMD_U8_MAX' }, 'u16': {'min': 'NSIMD_U16_MIN', 'max': 'NSIMD_U16_MAX' }, 'u32': {'min': 'NSIMD_U32_MIN', 'max': 'NSIMD_U32_MAX' }, 'u64': {'min': 'NSIMD_U64_MIN', 'max': 'NSIMD_U64_MAX' } } # ----------------------------------------------------------------------------- # Misc def ext_from_lang(lang): return 'c' if lang == 'c_base' else 'cpp' def nsimd_category(category): return 'nsimd_' + category # ------------------------------------------------------------------------------ # Doc common def to_filename(op_name): valid = string.ascii_letters + string.digits ret = '' for c in op_name: ret += '-' if c not in valid else c return ret def get_markdown_dir(opts): return os.path.join(opts.script_dir, '..', 'doc', 'markdown') def get_markdown_api_file(opts, name, module=''): root = get_markdown_dir(opts) op_name = to_filename(name) if module == '': return os.path.join(root, 'api_{}.md'.format(op_name)) else: return os.path.join(root, 'module_{}_api_{}.md'.format(module, op_name)) def get_markdown_file(opts, name, module=''): root = get_markdown_dir(opts) op_name = to_filename(name) if module == '': return os.path.join(root, '{}.md'.format(op_name)) else: return os.path.join(root, 'module_{}_{}.md'.format(module, op_name)) ================================================ FILE: egg/cuda.py 
================================================ # Copyright (c) 2020 Agenium Scale # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal # in the Software without restriction, including without limitation the rights # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell # copies of the Software, and to permit persons to whom the Software is # furnished to do so, subject to the following conditions: # # The above copyright notice and this permission notice shall be included in all # copies or substantial portions of the Software. # # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. import common import scalar fmtspec = dict() # ----------------------------------------------------------------------------- # NVIDIA doc on f16 can be found at # https://docs.nvidia.com/cuda/cuda-math-api/group__CUDA__MATH__INTRINSIC__HALF.html def get_impl_f16(operator, totyp, typ): if operator.name == 'round_to_even': arch53_code = 'return hrint({in0});'.format(**fmtspec) elif operator.name in ['rec', 'rec8', 'rec11']: arch53_code = 'return hrcp({in0});'.format(**fmtspec) elif operator.name in ['rsqrt8', 'rsqrt11']: arch53_code = 'return hrsqrt({in0});'.format(**fmtspec) elif operator.name in ['fma', 'fms', 'fnma', 'fnms']: neg = '-' if operator.name in ['fnma, fnms'] else '' op = '-' if operator.name in ['fnms, fms'] else '' arch53_code = 'return __hfma({neg}{in0}, {in1}, {op}{in2});'. 
\ format(neg=neg, op=op, **fmtspec) elif operator.name in ['min', 'max']: intr = '__hlt' if operator.name == 'min' else '__hgt' arch53_code = '''if ({intr}) {{ return {in0}; }} else {{ return {in1}; }}'''.format(intr=intr, **fmtspec) elif operator.name in ['adds', 'subs']: arch53_code = 'return __h{op}({in0}, {in1});'. \ format(op=operator.name[:-1], **fmtspec) else: args = ', '.join(['{{in{}}}'.format(i).format(**fmtspec) \ for i in range(len(operator.params[1:]))]) # Some f16 functions are not prefixed by `__` not_prefixed = ['ceil', 'floor', 'trunc', 'sqrt'] if operator.name in not_prefixed: arch53_code = 'return h{}({});'.format(operator.name, args) else: arch53_code = 'return __h{}({});'.format(operator.name, args) args = ', '.join(['__half2float({{in{}}})'.format(i).format(**fmtspec) \ for i in range(len(operator.params[1:]))]) if operator.params[0] == 'l': emul = 'return gpu_{}({});'.format(operator.name, args) else: emul = 'return __float2half(gpu_{}({}));'.format(operator.name, args) return '''#if __CUDA_ARCH__ >= 530 {arch53_code} #else {emul} #endif'''.format(arch53_code=arch53_code, emul=emul) # ----------------------------------------------------------------------------- # Reinterprets on CUDA have intrinsics def reinterpret(totyp, typ): if typ == totyp: return 'return {in0};'.format(**fmtspec) cuda_typ = { 'i16': 'short', 'u16': 'ushort', 'f16': 'half', 'i32': 'int', 'u32': 'uint', 'f32': 'float', 'f64': 'double', 'i64': 'longlong' } if typ in cuda_typ and totyp in cuda_typ and \ ((typ in common.ftypes and totyp in common.iutypes) or \ (typ in common.iutypes and totyp in common.ftypes)): return 'return __{typ2}_as_{totyp2}({in0});'. 
def get_impl(operator, totyp, typ):
    # Returns the C (CUDA device) code implementing `operator` for scalar
    # type `typ`. For non-closed operators (reinterpret, cvt, to_mask,
    # to_logical) `totyp` is the output type, otherwise it equals `typ`.
    global fmtspec

    fmtspec = {
        'in0': common.in0,
        'in1': common.in1,
        'in2': common.in2,
        'typ': typ,
        'totyp': totyp,
        'typnbits': typ[1:]
    }

    # src operators: map nsimd's Sleef-suffixed names onto CUDA libm names
    if operator.src:
        cuda_ops = {
            'sin_u35': 'sin',
            'cos_u35': 'cos',
            'tan_u35': 'tan',
            'asin_u35': 'asin',
            'acos_u35': 'acos',
            'atan_u35': 'atan',
            'atan2_u35': 'atan2',
            'log_u35': 'log',
            'cbrt_u35': 'cbrt',
            'sin_u10': 'sin',
            'cos_u10': 'cos',
            'tan_u10': 'tan',
            'asin_u10': 'asin',
            'acos_u10': 'acos',
            'atan_u10': 'atan',
            'atan2_u10': 'atan2',
            'log_u10': 'log',
            'cbrt_u10': 'cbrt',
            'exp_u10': 'exp',
            'pow_u10': 'pow',
            'sinh_u10': 'sinh',
            'cosh_u10': 'cosh',
            'tanh_u10': 'tanh',
            'sinh_u35': 'sinh',
            'cosh_u35': 'cosh',
            'tanh_u35': 'tanh',
            'asinh_u10': 'asinh',
            'acosh_u10': 'acosh',
            'atanh_u10': 'atanh',
            'exp2_u10': 'exp2',
            'exp2_u35': 'exp2',
            'exp10_u10': 'exp10',
            'exp10_u35': 'exp10',
            'expm1_u10': 'expm1',
            'log10_u10': 'log10',
            'log2_u10': 'log2',
            'log2_u35': 'log2',
            'log1p_u10': 'log1p',
            'sinpi_u05': 'sinpi',
            'cospi_u05': 'cospi',
            'hypot_u05': 'hypot',
            'hypot_u35': 'hypot',
            'remainder': 'remainder',
            'fmod': 'fmod',
            'lgamma_u10': 'lgamma',
            'tgamma_u10': 'tgamma',
            'erf_u10': 'erf',
            'erfc_u15': 'erfc'
        }
        args = common.get_args(len(operator.params[1:]))
        cuda_op = cuda_ops[operator.name]
        if typ == 'f16':
            # For f16 CUDA offers only a few operator
            # (the listed ones are all unary so `args` is a single argument
            # and can be spliced into __half2float() directly)
            if cuda_op in ['cos', 'exp', 'exp10', 'exp2', 'log', 'log10',
                           'log2', 'sin']:
                return '''#if __CUDA_ARCH__ >= 530
                            return h{}({});
                          #else
                            return __float2half(gpu_{}(__half2float({})));
                          #endif'''.format(cuda_op, args, operator.name, args)
            else:
                # no f16 intrinsic available: promote every argument to f32,
                # compute, then convert the result back to f16
                args = ', '.join('__half2float({})'.format(common.get_arg(i)) \
                                 for i in range(len(operator.params[1:])))
                return 'return __float2half(gpu_{}({}));'. \
                       format(operator.name, args)
        elif typ == 'f32':
            return 'return {}f({});'.format(cuda_op, args)
        else:
            return 'return {}({});'.format(cuda_op, args)

    # bool first, no special treatment for f16's
    bool_operators = {
        'andl': 'return {in0} && {in1};',
        'orl': 'return {in0} || {in1};',
        'xorl': 'return {in0} ^ {in1};',
        'andnotl': 'return {in0} && (!{in1});',
        'notl': 'return !{in0};',
    }

    if operator.name in bool_operators:
        return bool_operators[operator.name].format(**fmtspec)

    # infix operators that needs type punning, no special treatment for f16's
    def pun_code(code, arity, typ):
        # Bitwise ops work directly on unsigned types; for other types the
        # arguments are memcpy'd into same-width unsigned buffers first.
        if typ in common.utypes:
            return 'return ' + code.format(**fmtspec) + ';'
        utyp = common.bitfield_type[typ]
        to_utyp = '\n'.join(
                  ['''{utyp} buf{i};
                      memcpy(&buf{i}, &{{in{i}}}, sizeof({{in{i}}}));'''. \
                      format(i=i, utyp=utyp).format(**fmtspec) \
                      for i in range(arity)])
        return '''{to_utyp}
                  {utyp} tmp = {code};
                  {typ} ret;
                  memcpy(&ret, &tmp, sizeof(tmp));
                  return ret;'''.format(to_utyp=to_utyp, utyp=utyp, typ=typ,
                                        code=code.format(in0='buf0',
                                                         in1='buf1'))

    pun_operators = {
        'orb': lambda: pun_code('{in0} | {in1}', 2, typ),
        'andb': lambda: pun_code('{in0} & {in1}', 2, typ),
        'andnotb': lambda: pun_code('{in0} & (~{in1})', 2, typ),
        'notb': lambda: pun_code('~{in0}', 1, typ),
        'xorb': lambda: pun_code('{in0} ^ {in1}', 2, typ),
    }

    if operator.name in pun_operators:
        return pun_operators[operator.name]()

    # reinterpret
    if operator.name == 'reinterpret':
        return reinterpret(totyp, typ)

    # cvt
    if operator.name == 'cvt':
        return 'return ({totyp}){in0};'.format(**fmtspec)

    # to_mask
    if operator.name == 'to_mask':
        if typ in common.utypes:
            return 'return ({typ})({in0} ? -1 : 0);'.format(**fmtspec)
        return 'return gpu_reinterpret({typ}(), ({utyp})({in0} ? -1 : 0));'. \
               format(utyp=common.bitfield_type[typ], **fmtspec)

    # to_logical
    if operator.name == 'to_logical':
        if typ in common.iutypes:
            return 'return {in0} == ({typ})0 ? false : true;'. \
                   format(**fmtspec)
        return '''return gpu_reinterpret({utyp}(), {in0}) == ({utyp})0 ?
                         false : true ;'''. \
                         format(utyp=common.bitfield_type[typ], **fmtspec)

    # for all other operators, f16 has a special treatment
    if typ == 'f16':
        return get_impl_f16(operator, totyp, typ)

    # then deal with f32's operators

    # first infix operators
    c_operators = {
        'add': 'return ({typ})({in0} + {in1});',
        'sub': 'return ({typ})({in0} - {in1});',
        'mul': 'return ({typ})({in0} * {in1});',
        'div': 'return ({typ})({in0} / {in1});',
        'neg': 'return ({typ})(-{in0});',
        'rec': 'return 1.0{f} / {in0};',
        'rec8': 'return 1.0{f} / {in0};',
        'rec11': 'return 1.0{f} / {in0};',
        'lt': 'return {in0} < {in1};',
        'gt': 'return {in0} > {in1};',
        'le': 'return {in0} <= {in1};',
        'ge': 'return {in0} >= {in1};',
        'ne': 'return {in0} != {in1};',
        'eq': 'return {in0} == {in1};',
        'shl': 'return ({typ})({in0} << {in1});',
    }

    if operator.name in c_operators:
        return c_operators[operator.name]. \
               format(f='f' if typ == 'f32' else '', **fmtspec)

    # right shifts: on signed types shr goes through the unsigned
    # representation (logical shift) and shra re-creates the sign bits
    if operator.name in ['shr', 'shra']:
        if typ in common.utypes:
            return 'return ({typ})({in0} >> {in1});'.format(**fmtspec)
        if operator.name == 'shr':
            return \
            '''return gpu_reinterpret({typ}(), ({utyp})(
                          gpu_reinterpret({utyp}(), {in0}) >> {in1}));'''. \
                          format(utyp=common.bitfield_type[typ], **fmtspec)
        # getting here means shra on signed types
        return \
        '''if ({in1} == 0) {{
             return {in0};
           }}
           if ({in0} >= 0) {{
             return gpu_reinterpret({typ}(), ({utyp})(
                        gpu_reinterpret({utyp}(), {in0}) >> {in1}));
           }} else {{
             {utyp} mask = ({utyp})((({utyp})-1) << ({typnbits} - {in1}));
             return gpu_reinterpret({typ}(), (({utyp})(mask |
                      ({utyp})(gpu_reinterpret({utyp}(), {in0}) >>
                               {in1}))));
           }}'''.format(utyp=common.bitfield_type[typ], **fmtspec)

    # adds
    if operator.name == 'adds':
        if typ in common.ftypes:
            return c_operators['add'].format(**fmtspec)
        else:
            return scalar.get_impl(operator, totyp, typ)

    # subs
    if operator.name == 'subs':
        if typ in common.ftypes:
            return c_operators['sub'].format(**fmtspec)
        elif typ in common.utypes:
            return scalar.get_impl(operator, totyp, typ)
        else:
            return 'return nsimd::gpu_adds({in0}, ({typ})(-{in1}));'. \
                   format(**fmtspec)

    # fma's
    if operator.name in ['fma', 'fms', 'fnma', 'fnms']:
        # Bug fix: these membership tests used single-element lists
        # ['fnma, fnms'] and ['fnms, fms'] (one comma-containing string),
        # so they were always False and fms/fnma/fnms all generated the
        # code of plain fma.
        neg = '-' if operator.name in ['fnma', 'fnms'] else ''
        op = '-' if operator.name in ['fnms', 'fms'] else ''
        if typ in common.ftypes:
            return 'return fma{f}({neg}{in0}, {in1}, {op}{in2});'. \
                   format(f='f' if typ == 'f32' else '', neg=neg, op=op,
                          **fmtspec)
        else:
            return 'return {neg}{in0} * {in1} + ({op}{in2});'. \
                   format(neg=neg, op=op, **fmtspec)

    # other operators
    if typ in common.iutypes:
        # integer types: rounding is the identity, min/max/abs are ternaries
        if operator.name in ['round_to_even', 'ceil', 'floor', 'trunc']:
            return 'return {in0};'.format(**fmtspec)
        elif operator.name == 'min':
            return 'return ({typ})({in0} < {in1} ? {in0} : {in1});'. \
                   format(**fmtspec)
        elif operator.name == 'max':
            return 'return ({typ})({in0} > {in1} ? {in0} : {in1});'. \
                   format(**fmtspec)
        elif operator.name == 'abs':
            return 'return ({typ})({in0} > 0 ? {in0} : -{in0});'. \
                   format(**fmtspec)
        # NOTE(review): any other operator on integer types falls through
        # and returns None — presumably unreachable; verify against callers.
    else:
        # floating types: defer to the CUDA libm function, renaming the few
        # operators whose nsimd name differs from the CUDA one
        cuda_name = {
            'round_to_even': 'rint',
            'min': 'fmin',
            'max': 'fmax',
            'abs': 'fabs',
            'ceil': 'ceil',
            'floor': 'floor',
            'trunc': 'trunc',
            'rsqrt8': 'rsqrt',
            'rsqrt11': 'rsqrt'
        }
        args = ', '.join(['{{in{}}}'.format(i).format(**fmtspec) \
                          for i in range(len(operator.args))])
        return 'return {name}{f}({args});'. \
               format(name=cuda_name[operator.name] \
                           if operator.name in cuda_name \
                           else operator.name,
                      f='f' if typ == 'f32' else '',
                      args=args)
import os

# This experiment script parses SLEEF's funcproto.h and prints, for every
# function we care about, a ready-to-paste operator class stub for nsimd's
# egg/operators.py. It expects the SLEEF sources to have been fetched into
# _deps-sleef at the repository root (two levels above egg/experiments/).
script_dir = os.path.dirname(os.path.realpath(__file__))
sleef_dir = os.path.join(script_dir, '..', '..', '_deps-sleef')
sleef_version = '3.5.1'
funcproto = os.path.join(sleef_dir, 'sleef-{}'.format(sleef_version),
                         'src', 'libm', 'funcproto.h')

# SLEEF "ulp" field -> suffix used in SLEEF symbol names.
ulp_suffix = {
    '0' : '',
    '1' : '_u1',
    '2' : '_u05',
    '3' : '_u35',
    '4' : '_u15',
    '5' : '_u3500'
}

# SLEEF "funcType" field -> nsimd signature template; the '{}' is later
# filled with the operator name ('v' = vector, 'p' = scalar parameter,
# 'vx2' = pair of vectors).
func_type = {
    '0' : 'v {} v',
    '1' : 'v {} v v',
    '2' : 'vx2 {} v',
    '3' : 'v {} v p',
    '4' : 'v {} v',
    '5' : 'v {} v v v',
    '6' : 'vx2 {} v',
    '7' : 'p {} p',
    '8' : '* {} p'
}

# Function name -> [human-readable name, doc category, math domain].
props = {
    'cos' : ['cosine', 'DocTrigo', 'R'],
    'sin' : ['sine', 'DocTrigo', 'R'],
    'fastcos' : ['cosine', 'DocTrigo', 'R'],
    'fastsin' : ['sine', 'DocTrigo', 'R'],
    'cospi' : ['cosine of multiple of pi argument', 'DocTrigo', 'R'],
    'sinpi' : ['sine of multiple of pi argument', 'DocTrigo', 'R'],
    'tan' : ['tangent', 'DocTrigo', 'R\{(z+0.5)*pi}'],
    'acos' : ['arc cosine', 'DocTrigo', '(-1,1)'],
    'asin' : ['arc sine', 'DocTrigo', '(-1,1)'],
    'atan' : ['arc tangent', 'DocTrigo', 'R'],
    'atan2' : ['arc tangent', 'DocTrigo', 'RxR'],
    'log' : ['natural logarithmic', 'DocExpLog', '(0,Inf)'],
    'log2' : ['base-2 logarithmic', 'DocExpLog', '(0,Inf)'],
    'log10' : ['base-10 logarithmic', 'DocExpLog', '(0,Inf)'],
    'log1p' : ['logarithm of one plus argument', 'DocExpLog', '(-1,Inf)'],
    'exp' : ['exponential', 'DocExpLog', 'R'],
    'exp2' : ['base-2 exponential', 'DocExpLog', 'R'],
    'exp10' : ['base-10 exponential', 'DocExpLog', 'R'],
    'expm1' : ['exponential minus 1', 'DocExpLog', 'R'],
    'pow' : ['power', 'DocExpLog', 'RxR'],
    'fastpow' : ['power', 'DocExpLog', 'RxR'],
    'cbrt' : ['cubic root', 'DocBasicArithmetic', 'R'],
    'hypot' : ['hypotenuse', 'DocBasicArithmetic', 'RxR'],
    'sinh': ['hyperbolic sine', 'DocHyper', 'R'],
    'cosh': ['hyperbolic cosine', 'DocHyper', 'R'],
    'tanh': ['hyperbolic tangent', 'DocHyper', 'R'],
    'asinh': ['hyperbolic arc sine', 'DocHyper', 'R'],
    'acosh': ['hyperbolic arc cosine', 'DocHyper', '(1,Inf)'],
    'atanh': ['hyperbolic arc tangent', 'DocHyper', '(-1,1)'],
    'lgamma' : ['log gamma', 'DocMisc', 'R\{-n}'],
    'tgamma' : ['gamma', 'DocMisc', 'R\{-n}'],
    'erf' : ['error function', 'DocMisc', 'R'],
    'erfc' : ['complementary error function', 'DocMisc', 'R']
}

with open(funcproto, 'r') as fin:
    for line in fin:
        # Entries of interest look like:  { "name", ulp, ..., funcType, ... }
        if not (line.find('{') != -1 and line.find('}') != -1):
            continue
        items = [item.strip() for item in line.strip(' \n\r{},').split(',')]
        items[0] = items[0].strip('"')
        # The table is NULL-terminated.
        if items[0] == 'NULL':
            break
        if items[0] not in props:
            continue
        name = items[0] + '_u' + items[1]
        symbol = 'nsimd_sleef_{}'.format(name)
        prop = props[items[0]]
        # NOTE(review): the generated text says 'Class' (capital C); this is
        # not valid Python and presumably needs a manual lowercase before
        # pasting into operators.py — verify intentional.
        print('Class {}{}(SrcOperator):'. \
              format(name[0].upper(), name[1:]))
        print(' full_name = \'{}\''.format(prop[0]))
        # Double format: the first wraps the signature template in quotes,
        # the second fills its '{}' with the operator name.
        print(' signature = \'{}\''.format(func_type[items[3]]) \
              .format(name))
        print(' sleef_symbol_prefix = \'{}\''.format(symbol))
        print(' domain = Domain(\'{}\')'.format(prop[2]))
        print(' categories = [{}]'.format(prop[1]))
        # items[1] is the precision in tenths of ulps. NOTE(review): the
        # 'visit .' text looks like a URL was lost during extraction of this
        # file — confirm against the repository original.
        print(' desc = \'Compute the {} of its argument{} with ' \
              'a precision of {} ulps. For more informations visit ' \
              '.\''.format(prop[0],
                           's' if items[3] in ['1', '3', '5'] else '',
                           float(items[1]) / 10.0))
        print('')
// armclang -march=armv8+sve egg/experiments/upcvt-sve.c -o ../build/a.out // --- int len32() { return (int)svcntp_b32(svptrue_b32(), svptrue_b32()); } void print32(FILE *out, const char *var, svfloat32_t a) { float buf[2048]; svst1_f32(svptrue_b32(), buf, a); fprintf(out, "%s = ", var); for (int i = 0; i < len32(); i++) { if (i > 0) { fputs(", ", out); } fprintf(out, "%f", (double)buf[i]); } fputc('\n', stdout); } svfloat32_t iota32(float i0) { float buf[2048]; for (int i = 0; i < len32(); i++) { buf[i] = i0 + (float)i; } return svld1(svptrue_b32(), buf); } // --- int len64() { return (int)svcntp_b64(svptrue_b64(), svptrue_b64()); } void print64(FILE *out, const char *var, svfloat64_t a) { double buf[2048]; svst1_f64(svptrue_b64(), buf, a); fprintf(out, "%s = ", var); for (int i = 0; i < len64(); i++) { if (i > 0) { fputs(", ", out); } fprintf(out, "%f", buf[i]); } fputc('\n', stdout); } // --- int main() { svfloat32_t a = iota32(0.0f); svfloat32_t b = iota32(8.0f); svfloat64_t c = svcvt_f64_f32_z(svptrue_b32(), svzip1_f32(a, a)); print32(stdout, "a ", a); print32(stdout, "aa", svzip1_f32(a, a)); print64(stdout, "c ", c); return 0; } ================================================ FILE: egg/gen_adv_c_api.py ================================================ # Copyright (c) 2021 Agenium Scale # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal # in the Software without restriction, including without limitation the rights # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell # copies of the Software, and to permit persons to whom the Software is # furnished to do so, subject to the following conditions: # # The above copyright notice and this permission notice shall be included in all # copies or substantial portions of the Software. 
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

import common
import os
import operators

# -----------------------------------------------------------------------------
# Construct C11 types

def get_c11_types(simd_ext):
    # Builds, for the given SIMD extension (and its dependencies), the C
    # struct typedefs nsimd_pack*_{typ}_{se}, their make_* constructors and
    # the _Generic dispatch macros the C11 advanced API relies on.
    ret = ''
    for se in common.simds_deps[simd_ext]:
        # plain packs: one SIMD vector wrapped in a struct
        ret += '\n\n'.join([
        '''typedef NSIMD_STRUCT nsimd_pack_{typ}_{se} {{
             nsimd_{se}_v{typ} v;
           }} nsimd_pack_{typ}_{se};

           NSIMD_INLINE nsimd_pack_{typ}_{se}
           nsimd_make_pack_{typ}_{se}(nsimd_{se}_v{typ} v) {{
             return (nsimd_pack_{typ}_{se}){{ v }};
           }}'''.format(typ=typ, se=se) for typ in common.types])
        ret += '\n\n'
        # logical packs: one SIMD vector of logicals wrapped in a struct
        ret += '\n\n'.join([
        '''typedef NSIMD_STRUCT nsimd_packl_{typ}_{se} {{
             nsimd_{se}_vl{typ} v;
           }} nsimd_packl_{typ}_{se};

           NSIMD_INLINE nsimd_packl_{typ}_{se}
           nsimd_make_packl_{typ}_{se}(nsimd_{se}_vl{typ} v) {{
             return (nsimd_packl_{typ}_{se}){{ v }};
           }}'''.format(typ=typ, se=se) for typ in common.types])
        # degree 2/3/4 packs holding `deg` plain packs named v0..v{deg-1}
        for deg in [2, 3, 4]:
            vs = ', '.join(['v{}'.format(i) for i in range(deg)])
            # `avs` expands to literal C braces: { {a0.v0}, {a0.v1}, ... }
            # i.e. each member pack is brace-initialized from a0's vectors
            avs = ', '.join(['{{a0.v{}}}'.format(i) for i in range(deg)])
            ret += '\n\n'
            ret += '\n\n'.join([
            '''typedef NSIMD_STRUCT nsimd_packx{deg}_{typ}_{se} {{
                 nsimd_pack_{typ}_{se} {vs};
               }} nsimd_packx{deg}_{typ}_{se};

               NSIMD_INLINE nsimd_packx{deg}_{typ}_{se}
               nsimd_make_packx{deg}_{typ}_{se}
                   (nsimd_{se}_v{typ}x{deg} a0) {{
                 return (nsimd_packx{deg}_{typ}_{se}){{ {avs} }};
               }}
            '''. \
            format(typ=typ, se=se, vs=vs, deg=deg, avs=avs) \
            for typ in common.types])
    ret += '\n\n'
    # generic constructor: dispatch nsimd_make_pack on the pack type of `var`
    ret += '#define nsimd_make_pack(var, func) ' \
           '_Generic(var, \\\n'
    ret += '\n'.join([
           'nsimd_pack_{typ}_{se}: nsimd_make_pack_{typ}_{se}, \\'. \
           format(typ=typ, se=se) for typ in common.types \
                                  for se in common.simds_deps[simd_ext]])
    ret += '\n'
    ret += '\n'.join([
           'nsimd_packl_{typ}_{se}: nsimd_make_packl_{typ}_{se}, \\'. \
           format(typ=typ, se=se) for typ in common.types \
                                  for se in common.simds_deps[simd_ext]])
    ret += '\n'
    ret += '\n'.join([
           'nsimd_packx{d}_{typ}_{se}: nsimd_make_packx{d}_{typ}_{se}, \\'. \
           format(typ=typ, se=se, d=d) for typ in common.types \
                                       for d in [2, 3, 4] \
                                       for se in common.simds_deps[simd_ext]])
    ret += '\ndefault: nsimd_c11_type_unsupported)(func)'
    ret += '\n\n'
    # short aliases bound to the current (top-level) SIMD extension
    ret += '\n'.join([
           'typedef nsimd_pack_{typ}_{simd_ext} nsimd_pack_{typ};'. \
           format(typ=typ, simd_ext=simd_ext) for typ in common.types])
    ret += '\n\n'
    ret += '\n'.join([
           'typedef nsimd_packl_{typ}_{simd_ext} nsimd_packl_{typ};'. \
           format(typ=typ, simd_ext=simd_ext) for typ in common.types])
    ret += '\n\n'
    ret += '\n'.join([
           'typedef nsimd_packx{d}_{typ}_{simd_ext} nsimd_packx{d}_{typ};'. \
           format(typ=typ, simd_ext=simd_ext, d=d) \
           for typ in common.types for d in [2, 3, 4]])
    ret += '\n\n'
    # type-mapping macros: given an expression of one pack kind, produce a
    # (never-evaluated) expression of the corresponding other pack kind via
    # a call through a NULL function pointer inside _Generic
    ret += '#define nsimd_c11_pack(var) _Generic((var), \\\n'
    ret += '\n'.join([
           'nsimd_packl_{typ}_{se}: ' \
           '((nsimd_pack_{typ}_{se} (*)())NULL)(), \\'. \
           format(typ=typ, se=se) for typ in common.types \
                                  for se in common.simds_deps[simd_ext]])
    ret += '\ndefault: NULL)'
    ret += '\n\n'
    ret += '#define nsimd_c11_packl(var) _Generic((var), \\\n'
    ret += '\n'.join([
           'nsimd_pack_{typ}_{se}: ' \
           '((nsimd_packl_{typ}_{se} (*)())NULL)(), \\'. \
           format(typ=typ, se=se) for typ in common.types \
                                  for se in common.simds_deps[simd_ext]])
    ret += '\ndefault: NULL)'
    ret += '\n\n'
    ret += '#define nsimd_c11_packx2(var) _Generic((var), \\\n'
    ret += '\n'.join([
           'nsimd_pack_{typ}_{se}: ' \
           '((nsimd_packx2_{typ}_{se} (*)())NULL)(), \\'. \
           format(typ=typ, se=se) for typ in common.types \
                                  for se in common.simds_deps[simd_ext]])
    ret += '\ndefault: NULL)'
    return ret

# -----------------------------------------------------------------------------
# Construct C11 overloads

def get_c11_overloads(op, simd_ext):
    # Builds the _Generic-based macro overloads for operator `op`. Dispatch
    # is driven by the first argument whose type discriminates the overload.
    if common.get_first_discriminating_type(op.params) == -1:
        # Only the len operator should go here
        assert op.name == 'len'
        # len is dispatched on a *type name* (token pasting), not on a value
        ret = '\n\n'.join([
        '''#define NSIMD_C11_LEN_nsimd_pack_{typ}_{se}() \\
               nsimd_len_{se}_{typ}()
           #define NSIMD_C11_LEN_nsimd_packl_{typ}_{se}() \\
               nsimd_len_{se}_{typ}()
           #define NSIMD_C11_LEN_nsimd_packx2_{typ}_{se}() \\
               (2 * nsimd_len_{se}_{typ}())
           #define NSIMD_C11_LEN_nsimd_packx3_{typ}_{se}() \\
               (3 * nsimd_len_{se}_{typ}())
           #define NSIMD_C11_LEN_nsimd_packx4_{typ}_{se}() \\
               (4 * nsimd_len_{se}_{typ}())'''.format(typ=typ, se=se) \
           for typ in op.types for se in common.simds_deps[simd_ext]])
        ret += '\n\n'
        ret += '\n\n'.join([
        '''#define NSIMD_C11_LEN_nsimd_pack_{typ}() \\
               nsimd_len_{simd_ext}_{typ}()
           #define NSIMD_C11_LEN_nsimd_packl_{typ}() \\
               nsimd_len_{simd_ext}_{typ}()
           #define NSIMD_C11_LEN_nsimd_packx2_{typ}() \\
               (2 * nsimd_len_{simd_ext}_{typ}())
           #define NSIMD_C11_LEN_nsimd_packx3_{typ}() \\
               (3 * nsimd_len_{simd_ext}_{typ}())
           #define NSIMD_C11_LEN_nsimd_packx4_{typ}() \\
               (4 * nsimd_len_{simd_ext}_{typ}())'''. \
           format(typ=typ, simd_ext=simd_ext) for typ in common.types])
        ret += '\n\n'
        ret += '#define nsimd_len(type) \\\n' \
               'NSIMD_PP_CAT_2(NSIMD_C11_LEN_, type)()\n\n'
        return ret

    def get_c11_arg(param, name):
        # Maps a macro argument to the expression passed to the base C API:
        # pointers/scalars go through unchanged, packs are unwrapped (.v).
        if param in ['*', 'c*', 's', 'p']:
            return name
        elif param in ['v', 'l', 'vi']:
            return '({}).v'.format(name)

    args = op.params[1:]
    i0 = common.get_first_discriminating_type(args)
    if i0 == -1:
        # no argument discriminates: dispatch on an explicit `type` macro
        # argument via token pasting (like len above)
        if op.params[0] == 'v':
            pack = 'pack'
        elif op.params[0] == 'l':
            pack = 'packl'
        elif op.params[0] == 'vx2':
            pack = 'packx2'
        elif op.params[0] == 'vx3':
            pack = 'packx3'
        elif op.params[0] == 'vx4':
            pack = 'packx4'
        macro_args = ', '.join(['a{}'.format(i) for i in range(len(args))])
        ret = '\n\n'.join([
        '''#define NSIMD_C11_{OP_NAME}_nsimd_{pack}_{typ}_{se}({macro_args}) \\
               nsimd_make_{pack}_{typ}_{se}( \\
                   nsimd_{op_name}_{se}_{typ}({macro_args}))'''. \
                   format(OP_NAME=op.name.upper(), se=se,
                          macro_args=macro_args, op_name=op.name, typ=typ,
                          pack=pack) \
                   for typ in op.types \
                   for se in common.simds_deps[simd_ext]])
        ret += '\n\n'
        ret += '\n\n'.join([
        '''#define NSIMD_C11_{OP_NAME}_nsimd_{pack}_{typ}({macro_args}) \\
               nsimd_make_{pack}_{typ}_{simd_ext}( \\
                   nsimd_{op_name}_{simd_ext}_{typ}({macro_args}))'''. \
                   format(OP_NAME=op.name.upper(), simd_ext=simd_ext,
                          macro_args=macro_args, op_name=op.name, typ=typ,
                          pack=pack) for typ in op.types])
        ret += '\n\n'
        type_args = ', '.join(['type'] + \
                              ['a{}'.format(i) for i in range(len(args))])
        call_args = ', '.join([get_c11_arg(args[i], 'a{}'.format(i)) \
                               for i in range(len(args))])
        ret += '\n\n#define nsimd_{op_name}({type_args})' \
               ' NSIMD_PP_CAT_2(NSIMD_C11_{OP_NAME}_, type)({call_args})'. \
               format(op_name=op.name, OP_NAME=op.name.upper(),
                      call_args=call_args, type_args=type_args)
        return ret

    # Getting here means that i0 >= 0 i.e. that overloads can be determined
    # by argument i0 of the operator which is in ['v', 'l', 'vx2', 'vx3',
    # 'vx4']
    macro_args = ['a{}'.format(i) for i in range(len(args))]
    call_args = ', '.join([get_c11_arg(args[i], 'a{}'.format(i)) \
                           for i in range(len(args))])
    if not op.closed:
        # non-closed operators take the destination type as first macro arg
        macro_args = ['to_type'] + macro_args
    macro_args = ', '.join(macro_args)
    if op.params[0] in ['v', 'l', 'vx2', 'vx3', 'vx4']:
        # result is a pack: wrap the base-API result with nsimd_make_pack,
        # whose first argument selects the right constructor
        if not op.closed:
            ret = '#define nsimd_{}({}) ' \
                  'nsimd_make_pack((((to_type (*)())NULL)()), ' \
                  '_Generic(({}), \\\n'. \
                  format(op.name, macro_args, 'a{}'.format(i0))
        else:
            if op.params[0] != args[i0]:
                # output pack kind differs from the dispatch argument's kind:
                # derive it with the nsimd_c11_pack* mapping macros
                if op.params[0] == 'v':
                    ctrl_expr = 'nsimd_c11_pack(a{})'.format(i0)
                elif op.params[0] == 'l':
                    ctrl_expr = 'nsimd_c11_packl(a{})'.format(i0)
                elif op.params[0] == 'vx2':
                    ctrl_expr = 'nsimd_c11_packx2(a{})'.format(i0)
            else:
                ctrl_expr = 'a{}'.format(i0)
            ret = '#define nsimd_{}({}) ' \
                  'nsimd_make_pack({}, _Generic(({}), \\\n'. \
                  format(op.name, macro_args, ctrl_expr,
                         'a{}'.format(i0))
    else:
        # result is scalar/void: plain _Generic dispatch, no wrapping
        ret = '#define nsimd_{}({}) _Generic(({}), \\\n'. \
              format(op.name, macro_args, 'a{}'.format(i0))
    suf = { 'v': '', 'l': 'l', 'vx2': 'x2', 'vx3': 'x3', 'vx4': 'x4'}
    arg = args[i0]
    typ_fmt = 'nsimd_pack{}_{{}}_{{}}'.format(suf[arg])
    for se in common.simds_deps[simd_ext]:
        for typ in op.types:
            ret += typ_fmt.format(typ, se) + ': '
            if op.closed:
                ret += 'nsimd_{}_{}_{}, \\\n'.format(op.name, se, typ)
                continue
            # non-closed: nest a second _Generic on the destination type
            ret += '_Generic(((to_type (*)())NULL)(), \\\n'
            for to_typ in common.get_output_types(typ, op.output_to):
                to_pack = 'nsimd_pack{}_{}_{}'. \
                          format(suf[op.params[0]], to_typ, se)
                ret += '  {}: nsimd_{}_{}_{}_{}, \\\n'. \
                       format(to_pack, op.name, se, to_typ, typ)
            ret += '  default: nsimd_c11_type_unsupported), \\\n'
    ret += 'default: nsimd_c11_type_unsupported)({})'.format(call_args)
    if op.params[0] in ['v', 'l', 'vx2', 'vx3', 'vx4']:
        # close the nsimd_make_pack( opened above
        ret += ')'
    return ret

# -----------------------------------------------------------------------------

def doit(opts):
    # Entry point: writes include/c_adv_api_functions.h with the C11 types
    # and overload macros for every enabled SIMD extension.
    common.myprint(opts, 'Generating advanced C API (requires C11)')
    filename = os.path.join(opts.include_dir, 'c_adv_api_functions.h')
    if not common.can_create_filename(opts, filename):
        return
    with common.open_utf8(opts, filename) as out:
        # NOTE(review): the '#include ' below has no header name — the
        # <...> target appears to have been lost during extraction of this
        # file; confirm against the repository original.
        out.write('''#ifndef NSIMD_C_ADV_API_FUNCTIONS_H
#define NSIMD_C_ADV_API_FUNCTIONS_H

#include 

''')
        for simd_ext in common.simds:
            out.write('''{hbar}
{hbar}
{hbar}

/* {SIMD_EXT} */

{hbar}
{hbar}
{hbar}

#ifdef NSIMD_{SIMD_EXT}

{types}

'''.format(hbar=common.hbar, types=get_c11_types(simd_ext),
           SIMD_EXT=simd_ext.upper()))
            for op_name, operator in operators.operators.items():
                out.write('/* {} */\n\n{}\n\n'. \
                          format(op_name,
                                 get_c11_overloads(operator, simd_ext)))
            out.write('\n\n#endif')
        out.write('\n\n{}\n\n#endif\n'.format(common.hbar))
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

import operators
import common
import os
from datetime import date
import sys

# -----------------------------------------------------------------------------
# Actual implementation

def get_cxx_advanced_generic(operator):
    # Builds the generic (template) C++ overloads of `operator` for the
    # advanced C++ API: a base-case function for packs of cardinal 1 and a
    # recursive case that processes the car and recurses on the cdr.
    def get_pack(param):
        # Maps a signature letter to the pack template name.
        # NOTE(review): 'l' also maps to 'pack' (not 'packl') here — verify
        # this is intentional and not a typo.
        if param in ['v', 'vi']:
            return 'pack'
        elif param == 'l':
            return 'pack'
        else:
            return 'pack{}'.format(param[1:])

    args_list = common.enum(operator.params[1:])
    # arguments whose type carries the pack template parameters
    inter = [i for i in ['v', 'vi', 'l', 'vx1', 'vx2', 'vx3', 'vx4'] \
             if i in operator.params[1:]]
    # when no argument is a pack, the pack type must come from an explicit
    # template argument on the call site
    need_tmpl_pack = get_pack(operator.params[0]) if inter == [] else None

    # Compute parameters passed to the base C++ API functions
    def var(arg, N):
        member = 'car' if N == '1' else 'cdr'
        if arg[1] in ['vi', 'v', 'l']:
            return 'a{}.{}'.format(arg[0], member)
        elif (arg[1] in ['*', 'c*']) and N != '1':
            # pointers advance by one base-pack length in the recursive call
            return 'a{} + len_'.format(arg[0])
        else:
            return 'a{}'.format(arg[0])

    vars1 = [var(i, '1') for i in args_list] + ['T()'] + \
            (['typename ToPackType::value_type()'] if not operator.closed \
             else []) + ['SimdExt()']
    varsN = [var(i, 'N') for i in args_list]
    other_varsN = ', '.join(['a{}'.format(i[0]) for i in args_list])
    if other_varsN != '':
        other_varsN = ', ' + other_varsN
    if not operator.closed:
        varsN = ['typename ToPackType::value_type()'] + varsN
    if need_tmpl_pack != None:
        varsN = ['{}()'.format(need_tmpl_pack)] + varsN
    vars1 = ', '.join(vars1)
    varsN = ', '.join(varsN)

    # Compute return type
    ret1 = 'ToPackType' if not operator.closed \
           else common.get_one_type_generic_adv_cxx(operator.params[0],
                                                    'T', '1')
    retN = 'ToPackType' if not operator.closed \
           else common.get_one_type_generic_adv_cxx(operator.params[0],
                                                    'T', 'N')

    # Dump C++
    if operator.params[0] in ['v', 'vi', 'l']:
        return_ret = 'return ret;'
        ret_car = 'ret.car = '
        ret_cdr = 'ret.cdr = '
        post_car = ''
        post_cdr = ''
        pack1_ret = '{} ret;'.format(ret1)
        packN_ret = '{} ret;'.format(retN)
    elif operator.params[0] in ['vx1', 'vx2', 'vx3', 'vx4']:
        num = operator.params[0][-1:]
        return_ret = 'return ret;'
        # NOTE(review): the 'simd_traits::' spellings below look truncated —
        # template arguments in <...> appear to have been lost during
        # extraction of this file; confirm against the repository original.
        if operator.closed:
            ret_car = \
            'typename simd_traits::simd_vectorx{} car = '. \
            format(num)
        else:
            ret_car = \
            '''typename simd_traits::simd_vectorx{} car = '''.format(num)
        ret_cdr = 'packx{} cdr = '.format(num)
        # multi-vector packs are filled member by member via set_car/set_cdr
        post_car = '; ret.set_car({})'.format(', '.join( \
                   ['car.v{}'.format(i) for i in range(0, int(num))]))
        post_cdr = '; ret.set_cdr({})'.format(', '.join( \
                   ['cdr.v{}'.format(i) for i in range(0, int(num))]))
        pack1_ret = '{} ret;'.format(ret1)
        packN_ret = '{} ret;'.format(retN)
    else:
        # no value returned (stores, ...): nothing to assemble
        return_ret = ''
        ret_car = ''
        ret_cdr = ''
        post_car = ''
        post_cdr = ''
        pack1_ret = ''
        packN_ret = ''

    if '*' in operator.params[1:] or 'c*' in operator.params[1:]:
        # store*[au] does not contain any packx* argument, therefore the offset
        # cannot be correctly computed
        if operator.name in ['store2u', 'store2a']:
            multiplier = '2 * '
        elif operator.name in ['store3u', 'store3a']:
            multiplier = '3 * '
        elif operator.name in ['store4u', 'store4a']:
            multiplier = '4 * '
        else:
            multiplier = ''
        int_len = 'int len_ = {}len({}());'. \
                  format(multiplier, get_pack(inter[0]) if inter != [] \
                                     else need_tmpl_pack)
    else:
        int_len = ''

    sig = operator.get_generic_signature('cxx_adv')
    for k in sig:
        sig[k] = sig[k][:-1]  # remove trailing ';'

    # Double-brace placeholders: the first .format fills the operator-
    # independent parts, the second (tmpl.format below) fills sig1/sigN and
    # cxx_name for each emitted overload.
    tmpl = '''{{sig1}} {{{{{pack1_ret}
{ret_car}{name}({vars1}){post_car};
{return_ret}}}}}

{{sigN}} {{{{{packN_ret}{int_len}
{ret_car}{name}({vars1}){post_car};
{ret_cdr}{{cxx_name}}({varsN}){post_cdr};
{return_ret}}}}}'''. \
    format(pack1_ret=pack1_ret, ret_car=ret_car, name=operator.name,
           vars1=vars1, return_ret=return_ret, retN=retN,
           packN_ret=packN_ret, int_len=int_len, ret_cdr=ret_cdr,
           varsN=varsN, post_car=post_car, post_cdr=post_cdr)

    ret = ''
    if operator.cxx_operator:
        # emit operatorX overloads in addition to the named function
        ret += tmpl.format(cxx_name='operator'+operator.cxx_operator,
                           sig1=sig['op1'], sigN=sig['opN']) + '\n\n'
    ret += tmpl.format(cxx_name=operator.name, sig1=sig['1'],
                       sigN=sig['N']) + '\n\n'
    if not operator.closed:
        # dispatch overload forwarding the destination pack type
        return_ins = 'return ' if operator.params[0] != '_' else ''
        ret += '\n\n'
        ret += '''{sig} {{
  {return_ins}{cxx_name}(ToPackType(){other_varsN});
}}'''. \
        format(cxx_name=operator.name, sig=sig['dispatch'],
               other_varsN=other_varsN, return_ins=return_ins)
    if need_tmpl_pack != None:
        # dispatch overload forwarding the explicit pack template argument
        ret += '\n\n'
        ret += '''{sig} {{
  return {cxx_name}(SimdVector(){other_varsN});
}}'''. \
        format(sig=sig['dispatch'], cxx_name=operator.name,
               other_varsN=other_varsN)
    return ret

# -----------------------------------------------------------------------------
# Generate assignments operator (+=, *=, &=, ...)

def gen_assignment_operators(op):
    # Placeholder: assignment operators are not generated yet.
    #return '''{sig} {{ }}'''
    return ''

# -----------------------------------------------------------------------------
# Generate advanced C++ API

def doit(opts):
    # Entry point: writes include/cxx_adv_api_functions.hpp with the generic
    # C++ overloads for every operator that opted into the advanced API.
    common.myprint(opts, 'Generating advanced C++ API')
    filename = os.path.join(opts.include_dir, 'cxx_adv_api_functions.hpp')
    if not common.can_create_filename(opts, filename):
        return
    with common.open_utf8(opts, filename) as out:
        out.write('''#ifndef NSIMD_CXX_ADV_API_FUNCTIONS_HPP
#define NSIMD_CXX_ADV_API_FUNCTIONS_HPP

namespace nsimd {

''')
        for op_name, operator in operators.operators.items():
            if not operator.autogen_cxx_adv:
                continue
            out.write('''{hbar}

{code}

'''.format(hbar=common.hbar,
           code=get_cxx_advanced_generic(operator)))
            if operator.cxx_operator and \
               (operator.args in [['v', 'v'], ['v', 'p']]):
                out.write('{hbar}\n{code}'. \
                          format(hbar=common.hbar,
                                 code=gen_assignment_operators(operator)))
        out.write('''{hbar}

}} // namespace nsimd

#endif'''.format(hbar=common.hbar))
    common.clang_format(opts, filename)
import operators
import common
import gen_adv_c_api
import os
from datetime import date
import sys

# -----------------------------------------------------------------------------
# Generate code for output

def get_simd_implementation_src(operator, simd_ext, from_typ, fmtspec):
    # Emit the C/C++ wrapper for `operator` on `simd_ext`/`from_typ` when the
    # operator is backed by compiled sources (operator.src is true).
    # Three cases: 'cpu' (per-lane scalar emulation), 'f16' (compute in f32
    # via upcvt/downcvt), general (declare and forward to the Sleef symbol).
    if simd_ext == 'cpu':
        # Lane count of the CPU emulation vector: register bits / type bits.
        vlen = common.CPU_NBITS // int(from_typ[1:])
        vasi = []
        params = operator.params[1:]
        for i in range(len(params)):
            if params[i] in ['v', 'l', 'vi']:
                # SIMD argument: keep {i} escaped so the lane index is
                # substituted later by vasi.format(i=i).
                vasi.append('a{}.v{{i}}'.format(i))
            else:
                vasi.append('a{}'.format(i))
        vasi = ', '.join(vasi)
        # f16 is emulated with the f32 scalar helpers.
        typ2 = 'f32' if from_typ == 'f16' else from_typ
        if operator.params[0] == '_':
            # No return value: one scalar call per lane.
            body = '\n'.join(
                ['nsimd_scalar_{op_name}_{typ2}({vasi});'. \
                 format(op_name=operator.name, typ2=typ2,
                        vasi=vasi.format(i=i)) for i in range(vlen)])
        else:
            body = 'nsimd_cpu_v{} ret;\n'.format(from_typ)
            body += '\n'.join(
                ['ret.v{i} = nsimd_scalar_{op_name}_{typ2}({vasi});'. \
                 format(i=i, op_name=operator.name, typ2=typ2,
                        vasi=vasi.format(i=i)) for i in range(vlen)])
            body += '\nreturn ret;\n'
        return \
        '''{hbar} NSIMD_INLINE {return_typ} NSIMD_VECTORCALL nsimd_{name}_{simd_ext}_{suf}({c_args}) {{ {body} }} #if NSIMD_CXX > 0 namespace nsimd {{ NSIMD_INLINE {return_typ} NSIMD_VECTORCALL {name}({cxx_args}) {{ {body} }} }} // namespace nsimd #endif '''.format(body=body, **fmtspec)
    if from_typ == 'f16':
        # Upconvert every f16 argument to a pair of f32 vectors, call the f32
        # implementation on both halves, then downconvert.
        n = len(operator.params[1:])
        f16_to_f32 = '\n'.join(
            ['nsimd_{simd_ext}_vf32x2 buf{i}' \
             ' = nsimd_upcvt_{simd_ext}_f32_f16({args});'. \
             format(i=i, args=common.get_arg(i), **fmtspec) \
             for i in range(n)])
        bufsv0 = ', '.join(['buf{}.v0'.format(i) for i in range(n)])
        bufsv1 = ', '.join(['buf{}.v1'.format(i) for i in range(n)])
        if operator.params[0] != '_':
            retv0 = 'nsimd_{simd_ext}_vf32 retv0 = '.format(**fmtspec)
            retv1 = 'nsimd_{simd_ext}_vf32 retv1 = '.format(**fmtspec)
            f32_to_f16 = \
                'return nsimd_downcvt_{simd_ext}_f16_f32(retv0, retv1);'. \
                format(**fmtspec)
        else:
            # void operator: no result variables, no downconversion.
            retv0 = ''
            retv1 = ''
            f32_to_f16 = ''
        retv0 += '{sleef_symbol_prefix}_{simd_ext}_f32({bufsv0});'. \
                 format(bufsv0=bufsv0, **fmtspec)
        retv1 += '{sleef_symbol_prefix}_{simd_ext}_f32({bufsv1});'. \
                 format(bufsv1=bufsv1, **fmtspec)
        return \
        '''{hbar} NSIMD_INLINE {return_typ} NSIMD_VECTORCALL nsimd_{name}_{simd_ext}_{suf}({c_args}) {{ {f16_to_f32} {retv0} {retv1} {f32_to_f16}}} #if NSIMD_CXX > 0 namespace nsimd {{ NSIMD_INLINE {return_typ} NSIMD_VECTORCALL {name}({cxx_args}) {{ {f16_to_f32} {retv0} {retv1} {f32_to_f16}}} }} // namespace nsimd #endif '''.format(f16_to_f32=f16_to_f32, retv0=retv0, retv1=retv1, f32_to_f16=f32_to_f16, **fmtspec)
    else:
        # General case: extern "C" declaration of the compiled Sleef symbol
        # plus inline C and C++ wrappers forwarding to it.
        return \
        '''{hbar} #if NSIMD_CXX > 0 extern "C" {{ #endif NSIMD_DLLSPEC {return_typ} NSIMD_VECTORCALL {sleef_symbol_prefix}_{simd_ext}_{suf}({c_args}); #if NSIMD_CXX > 0 }} // extern "C" #endif NSIMD_INLINE {return_typ} NSIMD_VECTORCALL nsimd_{name}_{simd_ext}_{suf}({c_args}) {{ {returns}{sleef_symbol_prefix}_{simd_ext}_{suf}({vas}); }} #if NSIMD_CXX > 0 namespace nsimd {{ NSIMD_INLINE {return_typ} NSIMD_VECTORCALL {name}({cxx_args}) {{ {returns}{sleef_symbol_prefix}_{simd_ext}_{suf}({vas}); }} }} // namespace nsimd #endif '''.format(**fmtspec)

# -----------------------------------------------------------------------------
# Generate code for output

def get_simd_implementation(opts, operator, mod, simd_ext):
    # Build the full implementation text for one operator on one SIMD
    # extension: one C + C++ wrapper pair per (from_typ, to_typ) combination.
    typ_pairs = []
    for t in operator.types:
        return_typs = common.get_output_types(t, operator.output_to)
        for tt in return_typs:
            typ_pairs.append([t, tt])
    if not operator.closed:
        # Non-closed operators: reorder pairs so same-kind conversions
        # (float->float, int->int, unsigned->unsigned) come first, then
        # signed<->unsigned, then int<->float conversions.
        tmp = [p for p in typ_pairs if p[0] in common.ftypes and \
               p[1] in common.ftypes]
        tmp += [p for p in typ_pairs if p[0] in common.itypes and \
                p[1] in common.itypes]
        tmp += [p for p in typ_pairs if p[0] in common.utypes and \
                p[1] in common.utypes]
        tmp += [p for p in typ_pairs \
                if (p[0] in common.utypes and p[1] in common.itypes) or \
                   (p[0] in common.itypes and p[1] in common.utypes)]
        tmp += [p for p in typ_pairs \
                if (p[0] in common.iutypes and p[1] in common.ftypes) or \
                   (p[0] in common.ftypes and p[1] in common.iutypes)]
        typ_pairs = tmp
    ret = ''
    for pair in typ_pairs:
        from_typ = pair[0]
        to_typ = pair[1]
        fmtspec = operator.get_fmtspec(from_typ, to_typ, simd_ext)
        if operator.src:
            # Implementation lives in compiled sources.
            ret += get_simd_implementation_src(operator, simd_ext, from_typ,
                                               fmtspec)
        else:
            # Header-only: the platform module provides the intrinsic body.
            ret += \
            '''{hbar} NSIMD_INLINE {return_typ} NSIMD_VECTORCALL nsimd_{name}_{simd_ext}_{suf}({c_args}) {{ {content} }} #if NSIMD_CXX > 0 namespace nsimd {{ NSIMD_INLINE {return_typ} NSIMD_VECTORCALL {name}({cxx_args}) {{ {returns}nsimd_{name}_{simd_ext}_{suf}({vas}); }} }} // namespace nsimd #endif '''.format(content=mod.get_impl(opts, operator.name, simd_ext, from_typ, to_typ), **fmtspec)
    # Drop the trailing separator characters.
    return ret[0:-2]

# -----------------------------------------------------------------------------
# Generate code for output

def gen_archis_write_put(opts, platform, simd_ext, simd_dir):
    # Write put.h for one SIMD extension: declarations of the debug printing
    # helpers nsimd_put_* / nsimd_put_*_l* plus C++ put/putl wrappers.
    filename = os.path.join(simd_dir, 'put.h')
    if not common.can_create_filename(opts, filename):
        return
    op = None  # NOTE(review): unused local, kept as-is.
    with common.open_utf8(opts, filename) as out:
        # NOTE(review): several '#include' targets in the template below look
        # truncated in this extraction — verify against the repository.
        out.write( \
        '''#ifndef NSIMD_{PLATFORM}_{SIMD_EXT}_PUT_H #define NSIMD_{PLATFORM}_{SIMD_EXT}_PUT_H {include_cpu_put}#include #include {hbar} '''.format(year=date.today().year, hbar=common.hbar, simd_ext=simd_ext, platform=platform, PLATFORM=platform.upper(), SIMD_EXT=simd_ext.upper(), include_cpu_put='#include \n' \
        if simd_ext != 'cpu' else ''))
        for typ in common.types:
            out.write( \
            '''#if NSIMD_CXX > 0 extern "C" {{ #endif NSIMD_DLLSPEC int NSIMD_VECTORCALL nsimd_put_{simd_ext}_{typ}(FILE *, const char *, nsimd_{simd_ext}_v{typ}); #if NSIMD_CXX > 0 }} // extern "C" #endif #if NSIMD_CXX > 0 namespace nsimd {{ NSIMD_INLINE int NSIMD_VECTORCALL put(FILE *out, const char *fmt, nsimd_{simd_ext}_v{typ} a0, {typ}, {simd_ext}) {{ return nsimd_put_{simd_ext}_{typ}(out, fmt, a0); }} }} // namespace nsimd #endif {hbar} #if NSIMD_CXX > 0 extern "C" {{ #endif NSIMD_DLLSPEC int NSIMD_VECTORCALL nsimd_put_{simd_ext}_l{typ}(FILE *, const char *, nsimd_{simd_ext}_vl{typ}); #if NSIMD_CXX > 0 }} // extern "C" #endif #if NSIMD_CXX > 0 namespace nsimd {{ NSIMD_INLINE int NSIMD_VECTORCALL putl(FILE *out, const char *fmt, nsimd_{simd_ext}_vl{typ} a0, {typ}, {simd_ext}) {{ return nsimd_put_{simd_ext}_l{typ}(out, fmt, a0); }} }} // namespace nsimd #endif {hbar} '''.format(simd_ext=simd_ext, hbar=common.hbar, typ=typ))
        out.write('#endif')
    common.clang_format(opts, filename)

# -----------------------------------------------------------------------------
# Generate code for architectures

def gen_archis_write_file(opts, op, platform, simd_ext, simd_dir):
    # Write <op.name>.h for one (platform, simd_ext): header guard, platform
    # includes, and the generated implementation for every type pair.
    filename = os.path.join(simd_dir, '{}.h'.format(op.name))
    if not common.can_create_filename(opts, filename):
        return
    mod = opts.platforms[platform]
    additional_include = mod.get_additional_include(op.name, platform,
                                                    simd_ext)
    if op.src:
        # NOTE(review): include targets truncated by extraction here as well.
        additional_include += \
        '''#include #include '''.format(platform=platform, simd_ext=simd_ext)
    with common.open_utf8(opts, filename) as out:
        out.write(
        '''#ifndef {guard} #define {guard} #include {additional_include} {code} {hbar} #endif '''.format(additional_include=additional_include, year=date.today().year, guard=op.get_header_guard(platform, simd_ext), platform=platform, simd_ext=simd_ext, func=op.name, hbar=common.hbar, code=get_simd_implementation(opts, op, mod, simd_ext)))
    common.clang_format(opts, filename)

def gen_archis_simd(opts, platform, simd_ext, simd_dir):
    # One header per operator plus the put.h debug helpers.
    for op_name, operator in operators.operators.items():
        gen_archis_write_file(opts, operator, platform, simd_ext, simd_dir)
    gen_archis_write_put(opts, platform, simd_ext, simd_dir)

def gen_archis_types(opts, simd_dir, platform, simd_ext):
    # Write types.h: native vector/logical typedefs, the x2/x3/x4 SoA structs
    # (native when the platform supports them, generic structs otherwise) and
    # the C++ simd_traits specializations.
    filename = os.path.join(simd_dir, 'types.h')
    if not common.can_create_filename(opts, filename):
        return
    mod = opts.platforms[platform]
    c_code = '\n'.join([mod.get_type(opts, simd_ext, t,
                                     'nsimd_{}_v{}'.format(simd_ext, t)) \
                        for t in common.types])
    c_code += '\n\n'
    c_code += '\n'.join([mod.get_logical_type(
                             opts, simd_ext, t, 'nsimd_{}_vl{}'. \
                             format(simd_ext, t)) for t in common.types])
    if mod.has_compatible_SoA_types(simd_ext):
        # The platform has native structure-of-arrays types (e.g. NEON).
        for deg in range(2, 5):
            c_code += '\n'.join([mod.get_SoA_type(simd_ext, typ, deg,
                                 'nsimd_{}_v{}x{}'.format(simd_ext, typ,
                                                          deg)) \
                                 for typ in common.types])
    else:
        # Fall back to plain structs of 2/3/4 vectors.
        c_code += '\n'.join([
        ''' typedef NSIMD_STRUCT nsimd_{simd_ext}_v{typ}x2 {{ nsimd_{simd_ext}_v{typ} v0; nsimd_{simd_ext}_v{typ} v1; }} nsimd_{simd_ext}_v{typ}x2; '''.format(simd_ext=simd_ext, typ=typ) for typ in common.types])
        c_code += '\n'.join([
        ''' typedef NSIMD_STRUCT nsimd_{simd_ext}_v{typ}x3 {{ nsimd_{simd_ext}_v{typ} v0; nsimd_{simd_ext}_v{typ} v1; nsimd_{simd_ext}_v{typ} v2; }} nsimd_{simd_ext}_v{typ}x3; '''.format(simd_ext=simd_ext, typ=typ) for typ in common.types])
        c_code += '\n'.join([
        ''' typedef NSIMD_STRUCT nsimd_{simd_ext}_v{typ}x4 {{ nsimd_{simd_ext}_v{typ} v0; nsimd_{simd_ext}_v{typ} v1; nsimd_{simd_ext}_v{typ} v2; nsimd_{simd_ext}_v{typ} v3; }} nsimd_{simd_ext}_v{typ}x4; '''.format(simd_ext=simd_ext, typ=typ) for typ in common.types])
    c_code += '\n\n'
    cxx_code = \
        '\n\n'.join(['''template <> struct simd_traits<{typ}, {simd_ext}> {{ typedef nsimd_{simd_ext}_v{typ} simd_vector; typedef nsimd_{simd_ext}_v{typ}x2 simd_vectorx2; typedef nsimd_{simd_ext}_v{typ}x3 simd_vectorx3; typedef nsimd_{simd_ext}_v{typ}x4 simd_vectorx4; typedef nsimd_{simd_ext}_vl{typ} simd_vectorl; }};'''.format(typ=t, simd_ext=simd_ext) for t in common.types])
    with common.open_utf8(opts, filename) as out:
        out.write('''#ifndef NSIMD_{platform}_{SIMD_EXT}_TYPES_H #define NSIMD_{platform}_{SIMD_EXT}_TYPES_H {c_code} #define NSIMD_{simd_ext}_NB_REGISTERS {nb_registers} #if NSIMD_CXX > 0 namespace nsimd {{ // defined in nsimd.h for C++20 concepts // struct {simd_ext} {{}}; {cxx_code} }} // namespace nsimd #endif #endif '''. \
        format(year=date.today().year, platform=platform.upper(), SIMD_EXT=simd_ext.upper(), simd_ext=simd_ext, c_code=c_code, cxx_code=cxx_code, nb_registers=mod.get_nb_registers(simd_ext)))
    common.clang_format(opts, filename)

def gen_archis_platform(opts, platform):
    # Generate types.h and all operator headers for every SIMD extension of
    # one platform that was selected on the command line.
    include_dir = os.path.join(opts.include_dir, platform);  # NOTE(review): stray semicolon, kept as-is.
    for s in opts.platforms[platform].get_simd_exts():
        common.myprint(opts, 'Found new SIMD extension: {}'.format(s))
        if s in opts.simd:
            simd_dir = os.path.join(include_dir, s)
            common.mkdir_p(simd_dir)
            gen_archis_types(opts, simd_dir, platform, s)
            gen_archis_simd(opts, platform, s, simd_dir)
        else:
            common.myprint(opts, ' Extension excluded by command line')

def doit(opts):
    # Entry point: generate the SIMD implementation headers for every
    # requested platform.
    common.myprint(opts, 'Generating SIMD implementations')
    opts.platforms = common.get_platforms(opts)
    for p in opts.platforms:
        common.mkdir_p(os.path.join(opts.include_dir, p))
        gen_archis_platform(opts, p)

================================================ FILE: egg/gen_base_apis.py ================================================
# Copyright (c) 2019 Agenium Scale
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
# IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

import operators
import common
import os
from datetime import date
import sys

# -----------------------------------------------------------------------------
# C base generic implem

def get_c_base_generic(operator):
    # Emit the C macro pair (v<name> / v<name>_e) that dispatches a generic
    # call to the concrete nsimd_<name>_<simd>_<type> symbol via token
    # pasting. Non-closed operators carry both from_type and to_type in the
    # pasted name.
    vas = common.get_args(len(operator.params) - 1)
    sig = operator.get_generic_signature('c_base')
    if not operator.closed:
        return \
        '''{sig} NSIMD_PP_CAT_6(nsimd_{name}_, NSIMD_SIMD, _, \\ to_type, _, from_type)({vas}) {sig_e} NSIMD_PP_CAT_6(nsimd_{name}_, simd_ext, _, \\ to_type, _, from_type)({vas})'''. \
        format(sig=sig[0], sig_e=sig[1], name=operator.name, vas=vas)
    else:
        return \
        '''{sig} NSIMD_PP_CAT_4(nsimd_{name}_, NSIMD_SIMD, _, type)({vas}) {sig_e} NSIMD_PP_CAT_4(nsimd_{name}_, simd_ext, _, type)({vas})'''. \
        format(sig=sig[0], sig_e=sig[1], name=operator.name, vas=vas)

# -----------------------------------------------------------------------------
# C++ base generic implem

def get_cxx_base_generic(operator):
    # Emit the C++ wrapper that forwards to the tag-dispatched overload,
    # appending NSIMD_SIMD() (and F()/T() type tags) to the argument list.
    returns = '' if operator.params[0] == '_' else 'return'
    temp = common.get_args(len(operator.params) - 1)
    temp += ', ' if temp != '' else ''
    # Non-closed operators need both the from-type tag F() and to-type tag T().
    args = temp + 'F(), T()' if not operator.closed else temp + 'T()'
    return \
    '''#if NSIMD_CXX > 0 namespace nsimd {{ {sig} {{ {returns} {name}({args}, NSIMD_SIMD()); }} }} // namespace nsimd #endif'''.format(name=operator.name, args=args, returns=returns, sig=operator.get_generic_signature('cxx_base')[:-1])

# -----------------------------------------------------------------------------
# Declarations for output

def get_put_decl():
    # Declarations of the vput/vput_e debug macros and the generic C++ put().
    # NOTE(review): the '#include' target in this literal looks truncated in
    # this extraction — verify against the repository.
    return \
    '''#include NSIMD_AUTO_INCLUDE(put.h) #define vput(out, fmt, a0, type) \ NSIMD_PP_CAT_4(nsimd_put_, NSIMD_SIMD, _, type)(out, fmt, a0) #define vput_e(out, fmt, a0, type, simd_ext) \ NSIMD_PP_CAT_4(nsimd_put_, simd_ext, _, type)(out, fmt, a0) #if NSIMD_CXX > 0 namespace nsimd { template int put(FILE *out, const char *fmt, A0 a0, T) { return put(out, fmt, a0, T(), NSIMD_SIMD()); } } // namespace nsimd #endif '''

# -----------------------------------------------------------------------------
# Generate base APIs

def doit(opts):
    # Write include/nsimd/functions.h: for every operator, the auto-include
    # of its per-architecture header plus its C and C++ generic dispatchers.
    common.myprint(opts, 'Generating base APIs')
    common.mkdir_p(opts.include_dir)
    filename = os.path.join(opts.include_dir, 'functions.h')
    if not common.can_create_filename(opts, filename):
        return
    with common.open_utf8(opts, filename) as out:
        out.write('''#ifndef NSIMD_FUNCTIONS_H #define NSIMD_FUNCTIONS_H '''.format(year=date.today().year))
        for op_name, operator in operators.operators.items():
            out.write('''{} #include NSIMD_AUTO_INCLUDE({}.h) {} {} '''.format(common.hbar, operator.name,
                   get_c_base_generic(operator),
                   get_cxx_base_generic(operator)))
        out.write('''{hbar} {put_decl} {hbar} #endif'''. \
                  format(hbar=common.hbar, put_decl=get_put_decl()))
    common.clang_format(opts, filename)

================================================ FILE: egg/gen_benches.py ================================================
# Copyright (c) 2019 Agenium Scale
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
# IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

import os
import sys
import common
import operators
from datetime import date
from collections import OrderedDict

# -----------------------------------------------------------------------------
# Sig

def sig_replace_name(sig, name):
    # A signature is a space-separated string: '<ret> <name> <params...>'.
    # Replace the function-name word (index 1).
    sig = sig.split(' ')
    sig[1] = name
    return ' '.join(sig)

def sig_translate(sig, translates, name=None):
    # Map return type and parameter types of `sig` through the `translates`
    # dict (identity for types not in the dict); optionally rename.
    sig = sig.split(' ')
    ## Translates a given type to another
    sig[0] = translates.get(sig[0], sig[0])
    ## Do not use sig[1] (the function name)
    for i, p in enumerate(sig[2:]):
        sig[2 + i] = translates.get(p, p)
    sig = ' '.join(sig)
    ## Redefine name if available
    if name:
        sig = sig_replace_name(sig, name)
    return sig

# -----------------------------------------------------------------------------
# Errors

class BenchError(RuntimeError):
    pass

# -----------------------------------------------------------------------------
# Markers

def asm_marker(simd, bench_name):
    # Emit an inline-asm call to a dummy symbol so benchmark sections can be
    # located in disassembly; guarded by #ifdef ASM_MARKER.
    r = ''
    r += '#ifdef ASM_MARKER'
    r += '\n'
    for_intel = '__asm__ __volatile__("callq __asm_marker__{bench_name}");'. \
                format(bench_name=bench_name)
    for_arm = '__asm__ __volatile__("bl __asm_marker__{bench_name}");'. \
              format(bench_name=bench_name)
    if simd in common.x86_simds:
        r += for_intel
    elif simd in common.arm_simds:
        r += for_arm
    elif simd == 'cpu':
        # 'cpu' builds can target either architecture: decide at compile time.
        r += '''#if defined(NSIMD_X86) {} #elif defined(NSIMD_ARM) {} #endif'''.format(for_intel, for_arm)
    elif simd in common.ppc_simds:
        #TODO
        # No marker on PPC yet: return an empty string.
        return ''.format(bench_name=bench_name)
    else:
        raise BenchError('Unable to write marker for SIMD: {}'.format(simd))
    r += '\n'
    r += '#endif'
    return r

# -----------------------------------------------------------------------------
# Metaclass
# Provides __static_init__ hook

class StaticInitMetaClass(type):
    # Calls the class-level __static_init__ hook right after class creation,
    # which lets every Type* subclass register itself in `types`.
    def __new__(cls, name, bases, dct):
        x = type.__new__(cls, name, bases, dct)
        x.__static_init__(x)
        return x

# -----------------------------------------------------------------------------
# Basic nsimd types

## Will be automatically populated thanks to the metaclass
types = {}

# -----------------------------------------------------------------------------

class TypeBase(object, metaclass=StaticInitMetaClass):
    # Root of the parameter-type hierarchy; concrete subclasses register a
    # singleton in `types` keyed by their `name` attribute.
    @staticmethod
    def __static_init__(c):
        ## Skip base class
        if c.__name__.endswith('Base'):
            return
        types[c.name] = c()
    def is_simd(self):
        return False
    def is_volatile(self):
        return False

class TypeVectorBase(TypeBase):
    def is_simd(self):
        return True

# -----------------------------------------------------------------------------

class TypeVoid(TypeBase):
    name = '_'
    def as_type(self, typ):
        return 'void'

# -----------------------------------------------------------------------------

class TypeScalar(TypeBase):
    name = 's'
    def as_type(self, typ):
        return typ
    def code_load(self, simd, typ, ptr):
        return '*({})'.format(ptr)
    def code_store(self, simd, typ, lhs, rhs):
        return '*({}) = {}'.format(lhs, rhs)

# -----------------------------------------------------------------------------

class TypeVolatileScalar(TypeScalar):
    name = 'volatile-s'
    def is_volatile(self):
        return True

# -----------------------------------------------------------------------------

class TypeLogicalScalar(TypeBase):
    name = 'ls'
    def as_type(self, typ):
        # Logical scalars are represented by the unsigned type of same width.
        return { 'i8': 'u8', 'i16': 'u16', 'i32': 'u32', 'i64': 'u64',
                 'f32': 'u32', 'f64': 'u64', }.get(typ, typ)
    def code_load(self, simd, typ, ptr):
        return '({})(*({}))'.format(self.as_type(typ), ptr)
    def code_store(self, simd, typ, lhs, rhs):
        return '*({}) = ({})({})'.format(lhs, typ, rhs)

# -----------------------------------------------------------------------------

class TypeVolatileLogicalScalar(TypeLogicalScalar):
    name = 'volatile-ls'
    def is_volatile(self):
        return True

# -----------------------------------------------------------------------------

class TypeInt(TypeScalar):
    name = 'p'
    def as_type(self, typ):
        return 'int'

# -----------------------------------------------------------------------------

class TypePtr(TypeBase):
    name = '*'
    def as_type(self, typ):
        return typ + '*'

# -----------------------------------------------------------------------------

class TypeConstPtr(TypeBase):
    name = 'c*'
    def as_type(self, typ):
        return 'const ' + typ + '*'

# -----------------------------------------------------------------------------

class TypeVector(TypeVectorBase):
    name = 'v'
    def as_type(self, typ):
        return 'v' + typ
    def code_load(self, simd, typ, ptr):
        return 'nsimd::loada({}, {}())'.format(ptr, typ)
    def code_store(self, simd, typ, ptr, expr):
        return 'nsimd::storea({}, {}, {}())'.format(ptr, expr, typ)

# -----------------------------------------------------------------------------

class TypeCPUVector(TypeVector):
    name = 'vcpu'
    def code_load(self, simd, typ, ptr):
        return 'nsimd::loada({}, {}(), nsimd::cpu())'.format(ptr, typ)
    def code_store(self, simd, typ, ptr, expr):
        return 'nsimd::storea({}, {}, {}(), nsimd::cpu())'.format(ptr, expr,
                                                                  typ)

# -----------------------------------------------------------------------------

class TypeUnrolledVectorBase(TypeVectorBase):
    def as_type(self, typ):
        # FIX: was `raise NotImplemented()` — NotImplemented is a singleton,
        # not an exception, and calling it raises TypeError. Subclasses do not
        # override as_type; reaching this is a programming error.
        raise NotImplementedError()
    def code_load(self, simd, typ, ptr):
        # NOTE(review): the template literal below appears truncated by the
        # extraction (lost '<...>' text) — verify against the repository.
        return 'nsimd::loada>({})'. \
               format(typ, self.unroll, ptr)
    def code_store(self, simd, typ, ptr, expr):
        return 'nsimd::storea({}, {})'.format(ptr, expr)

# -----------------------------------------------------------------------------

class TypeUnrolledVector1(TypeUnrolledVectorBase):
    name = 'vu1'
    unroll = 1

class TypeUnrolledVector2(TypeUnrolledVectorBase):
    name = 'vu2'
    unroll = 2

class TypeUnrolledVector3(TypeUnrolledVectorBase):
    name = 'vu3'
    unroll = 3

class TypeUnrolledVector4(TypeUnrolledVectorBase):
    name = 'vu4'
    unroll = 4

class TypeUnrolledVector5(TypeUnrolledVectorBase):
    name = 'vu5'
    unroll = 5

class TypeUnrolledVector6(TypeUnrolledVectorBase):
    name = 'vu6'
    unroll = 6

class TypeUnrolledVector7(TypeUnrolledVectorBase):
    name = 'vu7'
    unroll = 7

class TypeUnrolledVector8(TypeUnrolledVectorBase):
    name = 'vu8'
    unroll = 8

class TypeUnrolledVector9(TypeUnrolledVectorBase):
    name = 'vu9'
    unroll = 9

# -----------------------------------------------------------------------------

class TypeVectorX2(TypeVectorBase):
    name = 'vx2'
    def as_type(self, typ):
        return 'v' + typ + 'x2'

# -----------------------------------------------------------------------------

class TypeVectorX3(TypeVectorBase):
    name = 'vx3'
    def as_type(self, typ):
        return 'v' + typ + 'x3'

# -----------------------------------------------------------------------------

class TypeVectorX4(TypeVectorBase):
    name = 'vx4'
    def as_type(self, typ):
        return 'v' + typ + 'x4'

# -----------------------------------------------------------------------------

class TypeLogical(TypeVectorBase):
    name = 'l'
    def as_type(self, typ):
        return 'vl' + typ
    def code_load(self, simd, typ, ptr):
        return 'nsimd::loadla({}, {}())'.format(ptr, typ)
    def code_store(self, simd, typ, ptr, expr):
        return 'nsimd::storela({}, {}, {}())'.format(ptr, expr, typ)

# -----------------------------------------------------------------------------

class TypeCPULogical(TypeLogical):
    name = 'lcpu'
    def code_load(self, simd, typ, ptr):
        return 'nsimd::loadla({}, {}(), nsimd::cpu())'.format(ptr, typ)
    def code_store(self, simd, typ, ptr, expr):
        return 'nsimd::storela({}, {}, {}(), nsimd::cpu())'.format(ptr, expr,
                                                                   typ)

# -----------------------------------------------------------------------------

class TypeUnrolledLogicalBase(TypeVectorBase):
    def as_type(self, typ):
        # FIX: was `raise NotImplemented()` — see TypeUnrolledVectorBase.
        raise NotImplementedError()
    def code_load(self, simd, typ, ptr):
        # NOTE(review): template literal appears truncated by the extraction.
        return 'nsimd::loadla>({})'. \
               format(typ, self.unroll, ptr)
    def code_store(self, simd, typ, ptr, expr):
        return 'nsimd::storela({}, {})'.format(ptr, expr)

# -----------------------------------------------------------------------------

class TypeUnrolledLogical1(TypeUnrolledLogicalBase):
    name = 'lu1'
    unroll = 1

class TypeUnrolledLogical2(TypeUnrolledLogicalBase):
    name = 'lu2'
    unroll = 2

class TypeUnrolledLogical3(TypeUnrolledLogicalBase):
    name = 'lu3'
    unroll = 3

class TypeUnrolledLogical4(TypeUnrolledLogicalBase):
    name = 'lu4'
    unroll = 4

class TypeUnrolledLogical5(TypeUnrolledLogicalBase):
    name = 'lu5'
    unroll = 5

class TypeUnrolledLogical6(TypeUnrolledLogicalBase):
    name = 'lu6'
    unroll = 6

class TypeUnrolledLogical7(TypeUnrolledLogicalBase):
    name = 'lu7'
    unroll = 7

class TypeUnrolledLogical8(TypeUnrolledLogicalBase):
    name = 'lu8'
    unroll = 8

class TypeUnrolledLogical9(TypeUnrolledLogicalBase):
    name = 'lu9'
    unroll = 9

# -----------------------------------------------------------------------------

class TypeBoostSimdVector(TypeVectorBase):
    name = 'boost::simd::pack'
    def as_type(self, typ):
        return 'boost::simd::pack<{}>'.format(typ)
    def code_load(self, simd, typ, ptr):
        return '{}({})'.format(self.as_type(typ), ptr)
    def code_store(self, simd, typ, ptr, expr):
        return 'nsimd::storea({}, {}, {}())'.format(ptr, expr, typ)

# -----------------------------------------------------------------------------

class TypeBoostSimdLogicalVector(TypeVectorBase):
    name = 'boost::simd::lpack'
    def as_type(self, typ):
        # NOTE(review): literal appears truncated by the extraction (no
        # placeholder for `typ`) — verify against the repository.
        return 'boost::simd::pack>'.format(typ)
    def code_load(self, simd, typ, ptr):
        return '{}({})'.format(self.as_type(typ), ptr)
    def code_store(self, simd, typ, ptr, expr):
        return 'nsimd::storea({}, {}, {}())'.format(ptr, expr, typ)

# -----------------------------------------------------------------------------

class TypeMIPPReg(TypeVectorBase):
    name = 'mipp::reg'
    def as_type(self, typ):
        return 'mipp::Reg<{}>'.format(typ)
    def code_load(self, simd, typ, ptr):
        return 'mipp::load<{}>({})'.format(typ, ptr)
    def code_store(self, simd, typ, ptr, expr):
        return 'mipp::store({}, {})'.format(ptr, expr)

# -----------------------------------------------------------------------------

class TypeMIPPMsk(TypeVectorBase):
    name = 'mipp::msk'
    def as_type(self, typ):
        return 'mipp::Msk<{}>'.format(typ)
    def code_load(self, simd, typ, ptr):
        # AVX-512 masks are plain integers: load/store them directly.
        if simd in ['avx512_knl', 'avx512_skylake']:
            return '*({})'.format(ptr)
        else:
            return 'mipp::load<{}>({})'.format(typ, ptr)
    def code_store(self, simd, typ, ptr, expr):
        if simd in ['avx512_knl', 'avx512_skylake']:
            return '*({}) = {}'.format(ptr, expr)
        else:
            return 'mipp::store({}, reinterpret_cast({}))'.format(ptr, expr)

# -----------------------------------------------------------------------------

def type_of(param):
    # Look up the Type* singleton registered for a signature parameter code.
    if param in types:
        return types[param]
    else:
        raise BenchError("Unable to find corresponding type for: " + param)

def as_type(param, typ):
    return type_of(param).as_type(typ)

# -----------------------------------------------------------------------------
# Operator class needs to be reinforced for benches

class BenchOperator(object, metaclass=type):
    # Mixin layered over operators.Operator adding everything benchmark
    # generation needs (includes, signatures of competing libraries, codegen).
    def __init__(self):
        self.typed_params_ = []
        for p in self.params:
            self.typed_params_.append(type_of(p))
    @property
    def function_name(self):
        # Strip namespaces and template arguments: 'a::b<T>' -> 'b'.
        return self.name.split('::')[-1].split('<')[0]
    ## Generates list of includes to be included
    def gen_includes(self, lang):
        # NOTE(review): the include strings below appear emptied by the
        # extraction (lost '<...>' text) — verify against the repository.
        includes = []
        includes.append('')
        if lang == 'cxx_adv':
            includes.append('')
        if lang == 'c_base':
            includes += ['', '', '', '']
        else:
            includes += ['', '', '', '', '']
        return includes
    def match_sig(self, signature):
        # True when `signature` has exactly the same parameter codes.
        (name, params) = common.parse_signature(signature)
        if len(params) != len(self.params):
            return False
        for p1, p2 in zip(params, self.params):
            if p1 != p2:
                return False
        return True
    def bench_code_before(self, typ):
        return ''
    def bench_against_init(self):
        # Empty bench table: bench[simd][typ][category] with '*' wildcards.
        bench = {}
        for simd in ['*'] + common.simds:
            bench[simd] = OrderedDict()
            for typ in ['*'] + common.types:
                bench[simd][typ] = OrderedDict()
        return bench
    def bench_against_cpu(self):
        bench = self.bench_against_init()
        ## Enable bench against nsimd (cpu architecture)
        if self.bench_auto_against_cpu:
            bench['*']['*'][common.nsimd_category('cpu')] = \
                cpu_fun_from_sig(sig_translate(self.signature, {
                    's': 'volatile-s',
                    'v': 'vcpu',
                    'l': 'lcpu',
                }))
        return bench
    def bench_against_libs(self):
        bench = self.bench_against_init()
        ## Enable bench against all other libraries
        if self.bench_auto_against_mipp:
            for typ in self.bench_mipp_types():
                ## MIPP always requires template
                mipp_name = self.bench_mipp_name(typ)
                signature = sig_translate(self.signature, {
                    'v': 'mipp::reg',
                    'l': 'mipp::msk',
                }, name=mipp_name)
                if signature:
                    bench['*'][typ]['MIPP'] = signature
        if self.bench_auto_against_sleef:
            for simd in common.simds:
                for typ in self.bench_sleef_types():
                    if not common.sleef_support_type(simd, typ):
                        continue
                    sleef_name = self.bench_sleef_name(simd, typ)
                    if sleef_name is None:
                        continue
                    ## IMPORTANT:
                    ## If simd is cpu, then make the signature using scalar
                    if simd == 'cpu':
                        signature = sig_translate(self.signature, {
                            's': 'volatile-s',
                            'v': 'volatile-s',
                            'l': 'volatile-s',
                        }, sleef_name)
                    else:
                        signature = sig_translate(self.signature, {},
                                                  sleef_name)
                    if signature:
                        bench[simd][typ]['Sleef'] = signature
        if self.bench_auto_against_std:
            for simd in common.simds:
                for typ in self.bench_std_types():
                    std_name = self.bench_std_name(simd, typ)
                    # std:: versions are always scalar.
                    signature = sig_translate(self.signature, {
                        's': 'volatile-s',
                        'v': 'volatile-s',
                        'l': 'volatile-s',
                    }, std_name)
                    if signature:
                        if self.cxx_operator:
                            bench[simd][typ]['std'] = \
                                std_operator_from_sig(signature,
                                                      self.cxx_operator)
                        else:
                            bench[simd][typ]['std'] = \
                                std_fun_from_sig(signature)
        return bench
    def code_call(self, typ, args):
        return 'nsimd::{}({}, {}())'.format(self.name,
                                            common.pprint_commas(args), typ)
    def code_ptr_step(self, typ, simd):
        # SIMD operators advance by a full vector; scalar ones by one element.
        if any(p.is_simd() for p in self.typed_params_):
            return 'vlen_e({}, {})'.format(typ, simd)
        else:
            return '1'

class BenchOperatorWithNoMakers(BenchOperator):
    use_for_parsing = False
    # Classes that inherit from me do not have their name member
    # which is mandatory so I fill it for them here.
    def __init__(self):
        BenchOperator.__init__(self)
        (self.name, void) = common.parse_signature(self.signature)

# -----------------------------------------------------------------------------
# Make the list of all operators, they will inherit from the corresponding
# operators.Operator and then from BenchOperator

functions = {}

class dummy(operators.MAddToOperators):
    # Metaclass used to mint bench classes without re-registering them in
    # operators.operators.
    def __new__(cls, name, bases, dct):
        return type.__new__(cls, name, bases, dct)

for op_name, operator in operators.operators.items():
    if operator.load_store:
        # We do not bench loads/stores
        continue
    op_class = dummy(operator.__class__.__name__,
                     (operator.__class__, BenchOperator), {})
    functions[op_name] = op_class()

# -----------------------------------------------------------------------------
# Function helpers

def nsimd_unrolled_fun_from_sig(from_sig, unroll):
    # Build a bench operator calling the nsimd advanced API with unroll N.
    sig = sig_translate(from_sig, {
        'v': 'vu' + str(unroll),
        'l': 'lu' + str(unroll),
    })
    class InlineNSIMDUnrolledFun(operators.Operator,
                                 BenchOperatorWithNoMakers, metaclass=dummy):
        signature = sig
        def code_call(self, typ, args):
            return 'nsimd::{}({})'.format(self.name,
                                          common.pprint_commas(args))
        def code_ptr_step(self, typ, simd):
            return 'nsimd::len(nsimd::pack<{}, {}, nsimd::{}>())'.format(
                typ, unroll, simd)
    return InlineNSIMDUnrolledFun()

def fun_from_sig(from_sig):
    # Build a bench operator that calls a plain free function.
    class InlineFun(operators.Operator, BenchOperatorWithNoMakers,
                    metaclass=dummy):
        signature = from_sig
        def code_call(self, typ, args):
            return '{}({})'.format(self.name, common.pprint_commas(args))
    return InlineFun()
def std_fun_from_sig(from_sig): return fun_from_sig(from_sig) def std_operator_from_sig(from_sig, op): class InlineStdOperatorFun(operators.Operator, BenchOperatorWithNoMakers, metaclass=dummy): __metaclass__ = dummy signature = from_sig operator = op def code_call(self, typ, args): if len(args) == 1: return '{}({})'.format(self.operator, args[0]) elif len(args) == 2: return '{} {} {}'.format(args[0], self.operator, args[1]) else: raise BenchError('std:: operators requires 1 or 2 arguments!') return InlineStdOperatorFun() def cpu_fun_from_sig(from_sig): class InlineCPUFun(operators.Operator, BenchOperatorWithNoMakers, metaclass=dummy): signature = from_sig def code_call(self, typ, args): return 'nsimd::{}({}, {}(), nsimd::cpu())'. \ format(self.name, common.pprint_commas(args), typ) return InlineCPUFun() def sanitize_fun_name(name): return ''.join(map(lambda c: c if c.isalnum() else '_', name)) # ----------------------------------------------------------------------------- # Code def code_cast(typ, expr): return '({})({})'.format(typ, expr) def code_cast_ptr(typ, expr): return code_cast(typ + '*', expr) # ----------------------------------------------------------------------------- # Globals _opts = None _lang = 'cxx_adv' # ----------------------------------------------------------------------------- # Generates def TODO(f): if _opts.verbose: common.myprint(opts, '@@ TODO: ' + f.name) def gen_filename(f, simd, typ): ## Retrieve directory from global options benches_dir = common.mkdir_p(os.path.join(_opts.benches_dir, _lang)) ## Generate path (composed from: function name + type + extension) return os.path.join(benches_dir, '{}.{}.{}.{}'.format( f.name, simd, typ, common.ext_from_lang(_lang))) def gen_bench_name(category, name, unroll=None): bench_name = '{}_{}'.format(category, name) if unroll: bench_name += '_unroll{}'.format(unroll) return bench_name def gen_bench_from_code(f, typ, code, bench_with_timestamp): header = '' header += 
common.pprint_includes(f.gen_includes(_lang)) header += \ ''' // Required for random generation #include "../benches.hpp" // Google benchmark #ifndef DISABLE_GOOGLE_BENCHMARK #include #endif #include double timestamp_ns() { timespec ts; clock_gettime(CLOCK_MONOTONIC, &ts); return double(ts.tv_sec) * 1000000000.0 + double(ts.tv_nsec); } // std #include // #include #include // #include // Sleef #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wignored-qualifiers" #include #pragma GCC diagnostic pop // MIPP #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wconversion" #pragma GCC diagnostic ignored "-Wsign-conversion" #pragma GCC diagnostic ignored "-Wdouble-promotion" #pragma GCC diagnostic ignored "-Wunused-parameter" #if defined(__clang__) #pragma GCC diagnostic ignored "-Wzero-length-array" #endif #include #pragma GCC diagnostic pop ''' return \ '''{header} // ------------------------------------------------------------------------- static const int sz = 1024; template static {type}* make_data(int sz, Random r) {{ {type}* data = ({type}*)nsimd_aligned_alloc(sz * {sizeof}); for (int i = 0; i < sz; ++i) {{ data[i] = r(); }} return data; }} static {type}* make_data(int sz) {{ {type}* data = ({type}*)nsimd_aligned_alloc(sz * {sizeof}); for (int i = 0; i < sz; ++i) {{ data[i] = {type}(0); }} return data; }} {random_code} {code} int main(int argc, char** argv) {{ std::vector args(argv, argv + argc); if (std::find(args.begin(), args.end(), "--use_timestamp_ns") != args.end()) {{ {bench_with_timestamp} }} #ifndef DISABLE_GOOGLE_BENCHMARK else {{ ::benchmark::Initialize(&argc, argv); ::benchmark::RunSpecifiedBenchmarks(); }} #endif return 0; }} '''.format( name=f.name, type=typ, year=date.today().year, random_code=f.domain.code('rand_param', typ), code=code, bench_with_timestamp=bench_with_timestamp, sizeof=common.sizeof(typ), header=header, ) def gen_bench_info_from(f, simd, typ): bench_args_init = [] bench_args_decl = [] bench_args_call = [] ## 
Generate code for parameters for i, arg in enumerate(f.args): p = type_of(arg) qualifiers = '' if p.is_volatile(): qualifiers += 'volatile ' bench_args_init.append('make_data(sz, &rand_param{n})'.format(n=i)) bench_args_decl.append('{} {}* _{}'.format(qualifiers, typ, i)) bench_args_call.append(p.code_load(simd, typ, '_{} + i'.format(i))) ## Generate code for bench (using function return type) r = type_of(f.get_return()) bench_call = r.code_store(simd, typ, '_r + i', f.code_call(typ, bench_args_call)) return bench_args_init, bench_args_decl, bench_args_call, bench_call def gen_bench_asm_function(f, simd, typ, category): bench_args_init, bench_args_decl, \ bench_args_call, bench_call = gen_bench_info_from(f, simd, typ) ## Add function that can easily be parsed to get assembly and plain code return \ ''' void {bench_name}__asm__({type}* _r, {bench_args_decl}, int sz) {{ __asm__ __volatile__("nop"); __asm__ __volatile__("nop"); __asm__ __volatile__("nop"); __asm__ __volatile__("nop"); __asm__ __volatile__("nop"); __asm__ __volatile__("nop"); // code:{{ int n = {step}; #if defined(NSIMD_IS_GCC) #pragma GCC unroll 1 #elif defined(NSIMD_IS_CLANG) #pragma clang loop unroll(disable) #elif defined(NSIMD_IS_ICC) #pragma unroll(1) #endif for (int i = 0; i < sz; i += n) {{ {bench_call}; }} // code:}} __asm__ __volatile__("nop"); __asm__ __volatile__("nop"); __asm__ __volatile__("nop"); __asm__ __volatile__("nop"); __asm__ __volatile__("nop"); __asm__ __volatile__("nop"); }} '''.format( bench_name=gen_bench_name(category, f.function_name), type=typ, step=f.code_ptr_step(typ, simd), bench_call=bench_call, bench_args_decl=common.pprint_commas(bench_args_decl) ) def gen_bench_from_basic_fun(f, simd, typ, category, unroll=None): bench_args_init, bench_args_decl, bench_args_call, bench_call = \ gen_bench_info_from(f, simd, typ) bench_name = gen_bench_name(category, f.function_name, unroll) code_timestamp_ns = \ ''' void {bench_name}({type}* _r, {bench_args_decl}, int sz) {{ // 
Normalize size depending on the step so that we're not going out of boundaies // (Required when the size is'nt a multiple of `n`, like for unrolling benches) sz = (sz / {step}) * {step}; std::cout << "{bench_name}({type}), sz = " << sz << std::endl; {asm_marker} // code: {bench_name} int n = {step}; #if defined(NSIMD_IS_GCC) #pragma GCC unroll 1 #elif defined(NSIMD_IS_CLANG) #pragma clang loop unroll(disable) #elif defined(NSIMD_IS_ICC) #pragma unroll(1) #endif for (int i = 0; i < sz; i += n) {{ {bench_call}; }} // code: {bench_name} {asm_marker} }} ''' return \ ''' // ----------------------------------------------------------------------------- {code_before} extern "C" {{ void __asm_marker__{bench_name}() {{}} }} #ifndef DISABLE_GOOGLE_BENCHMARK void {bench_name}(benchmark::State& state, {type}* _r, {bench_args_decl}, int sz) {{ // Normalize size depending on the step so that we're not going out of boundaies // (Required when the size is'nt a multiple of `n`, like for unrolling benches) sz = (sz / {step}) * {step}; try {{ for (auto _ : state) {{ {asm_marker} // code: {bench_name} int n = {step}; #if defined(NSIMD_IS_GCC) #pragma GCC unroll 1 #elif defined(NSIMD_IS_CLANG) #pragma clang loop unroll(disable) #elif defined(NSIMD_IS_ICC) #pragma unroll(1) #endif for (int i = 0; i < sz; i += n) {{ {bench_call}; }} // code: {bench_name} {asm_marker} }} }} catch (std::exception const& e) {{ std::string message("ERROR: "); message += e.what(); state.SkipWithError(message.c_str()); }} }} BENCHMARK_CAPTURE({bench_name}, {type}, make_data(sz), {bench_args_init}, sz); #endif '''.format( bench_name=bench_name, type=typ, step=f.code_ptr_step(typ, simd), bench_call=bench_call, bench_args_init=common.pprint_commas(bench_args_init), bench_args_decl=common.pprint_commas(bench_args_decl), bench_args_call=common.pprint_commas(bench_args_call), code_before=f.bench_code_before(typ), asm_marker=asm_marker(simd, bench_name) ) def gen_code(f, simd, typ, category): code = None if 
f.returns_any_type: return TODO(f) ## TODO: We have to refactor this, it's annoying to add every possible signatures... if f.match_sig('v * v v') or f.match_sig('v * v v v') \ or f.match_sig('l * v v') or f.match_sig('l * l l') \ or f.match_sig('l * l') or f.match_sig('v * v') \ or f.match_sig('s * s') \ or f.match_sig('s * s s') \ or f.match_sig('s * s s s') \ or f.match_sig('vcpu * vcpu') \ or f.match_sig('vcpu * vcpu vcpu') \ or f.match_sig('vcpu * vcpu vcpu vcpu') \ or f.match_sig('lcpu * lcpu') \ or f.match_sig('lcpu * lcpu lcpu') \ or f.match_sig('lcpu * vcpu vcpu') \ or f.match_sig('vcpu * lcpu vcpu vcpu') \ or f.match_sig('volatile-s * volatile-s') \ or f.match_sig('volatile-s * volatile-s volatile-s') \ or f.match_sig('volatile-s * volatile-s volatile-s volatile-s') \ or f.match_sig('volatile-ls * volatile-s') \ or f.match_sig('volatile-ls * volatile-s volatile-s') \ or f.match_sig('volatile-ls * volatile-ls') \ or f.match_sig('volatile-ls * volatile-ls volatile-ls') \ or f.match_sig('volatile-s * volatile-ls volatile-s volatile-s') \ or f.match_sig('boost::simd::pack * boost::simd::pack') \ or f.match_sig('boost::simd::pack * boost::simd::pack boost::simd::pack') \ or f.match_sig('boost::simd::pack * boost::simd::pack boost::simd::pack boost::simd::pack') \ or f.match_sig('boost::simd::lpack * boost::simd::pack') \ or f.match_sig('boost::simd::lpack * boost::simd::pack boost::simd::pack') \ or f.match_sig('mipp::reg * mipp::reg') \ or f.match_sig('mipp::reg * mipp::reg mipp::reg') \ or f.match_sig('mipp::msk * mipp::reg') \ or f.match_sig('mipp::msk * mipp::reg mipp::reg') \ or f.match_sig('v * l v v'): code = gen_bench_from_basic_fun(f, simd, typ, category=category) if f.match_sig('p * l'): return TODO(f) if f.match_sig('l * p'): return TODO(f) if f.match_sig('v * s'): return TODO(f) if f.match_sig('l * p'): return TODO(f) if f.match_sig('p *'): return TODO(f) if f.match_sig('v * v p'): return TODO(f) if code is None: raise BenchError('Unable to generate 
bench for signature: ' + \ f.signature) return code def gen_bench_unrolls(f, simd, typ, category): code = '' sig = f.signature for unroll in [2, 3, 4]: f = nsimd_unrolled_fun_from_sig(sig, unroll) code += gen_bench_from_basic_fun(f, simd, typ, category=category, unroll=unroll) return code def gen_bench_against(f, simd, typ, against): code = '' # "against" dict looks like: { simd: { type: { name: sig } } } for s in [simd, '*']: if not s in against: continue for t in [typ, '*']: if not t in against[s]: continue for category, f in against[s][t].items(): # Allow function to be simple str (you use this most of the # time) if isinstance(f, str): f = fun_from_sig(f) # Now that we have a `Fun` type, we can generate code code += gen_code(f, simd, typ, category=category) return code def gen_bench_with_timestamp(f, simd, typ, category, unroll=None): code = '' bench_args_init, bench_args_decl, bench_args_call, bench_call = \ gen_bench_info_from(f, simd, typ) bench_name = gen_bench_name(category, f.function_name, unroll) bench_args_decl = '' bench_args_call = '' for i, arg in enumerate(f.args): bench_args_decl += typ + ' * data' + str(i) + ' = make_data(sz, &rand_param' + str(i) + ');' + '\n' if i != 0: bench_args_call += ', ' bench_args_call += 'data' + str(i) code += \ ''' {{ // Bench {typ} * r = make_data(sz); {bench_args_decl} double elapsed_times_ns[nb_runs] = {{ }}; // Must be at least 10000 {typ} sum = {{ }}; for (size_t run = 0; run < nb_runs; ++run) {{ double const t0 = timestamp_ns(); {bench_name}(r, {bench_args_call}, 1000); double const t1 = timestamp_ns(); elapsed_times_ns[run] = (t1 - t0) / double(sz); // Compute sum if (rand() % 2) {{ sum += std::accumulate(r, r + sz, {typ}()); }} else {{ sum -= std::accumulate(r, r + sz, {typ}()); }} }} // Save sum and elapsed time std::sort(elapsed_times_ns, elapsed_times_ns + nb_runs); size_t const i_start = nb_runs / 2 - 10; size_t const i_end = nb_runs / 2 + 10; sums["{bench_name}"] = std::make_pair(sum, 
std::accumulate(elapsed_times_ns + i_start, elapsed_times_ns + i_end, 0.0) / double(i_end - i_start)); // Number of elapsed times std::map nb_per_elapsed_time; for (size_t run = 0; run < nb_runs; ++run) {{ ++nb_per_elapsed_time[(i64(elapsed_times_ns[run] * 100)) / 100.0]; }} // Draw gnuplot std::system("mkdir -p gnuplot"); std::string const dat_filename = "gnuplot/benches.cxx_adv.{bench_name}.dat"; std::ofstream dat_file(dat_filename); for (auto const & elapsed_time_nb : nb_per_elapsed_time) {{ dat_file << elapsed_time_nb.first << " " << elapsed_time_nb.second << "\\n"; }} std::string const gnuplot_filename = "gnuplot/benches.cxx_adv.{bench_name}.gnuplot"; std::ofstream gnuplot_file(gnuplot_filename); gnuplot_file << "set term svg" << "\\n"; gnuplot_file << "set output \\"benches.cxx_adv.{bench_name}.svg\\"" << "\\n"; gnuplot_file << "set xlabel \\"Time in nanoseconds (lower is better)\\"" << "\\n"; gnuplot_file << "set ylabel \\"Number of runs\\"" << "\\n"; gnuplot_file << "\\n"; gnuplot_file << "set style line 1 \\\\" << "\\n"; gnuplot_file << " linecolor rgb '#db284c' \\\\" << "\\n"; gnuplot_file << " linetype 1 linewidth 2" << "\\n"; gnuplot_file << "\\n"; gnuplot_file << "plot '" << dat_filename << "' with linespoints linestyle 1" << "\\n"; std::system(("cd gnuplot && gnuplot \\"" + gnuplot_filename + "\\"").c_str()); }} '''.format(bench_name=bench_name, typ=typ, bench_args_decl=bench_args_decl, bench_args_call=bench_args_call, ) return code def gen_bench_unrolls_with_timestamp(f, simd, typ, category): code = '' for unroll in [2, 3, 4]: code += gen_bench_with_timestamp(f, simd, typ, category=category, unroll=unroll) return code def gen_bench_against_with_timestamp(f, simd, typ, against): code = '' # "against" dict looks like: { simd: { type: { name: sig } } } for s in [simd, '*']: if not s in against: continue for t in [typ, '*']: if not t in against[s]: continue for category, f in against[s][t].items(): # Allow function to be simple str (you use this most of 
the # time) if isinstance(f, str): f = fun_from_sig(f) # Now that we have a `Fun` type, we can generate code code += gen_bench_with_timestamp(f, simd, typ, category) return code def gen_bench(f, simd, typ): ## TODO path = gen_filename(f, simd, typ) ## Check if we need to create the file if not common.can_create_filename(_opts, path): return ## Generate specific code for the bench category = common.nsimd_category(simd) code = gen_code(f, simd, typ, category=category) if code is None: return ## Now aggregate every parts bench = '' #bench += gen_bench_asm_function(f, typ, category) bench += gen_bench_against(f, 'cpu', typ, f.bench_against_cpu()) bench += code bench += gen_bench_unrolls(f, simd, typ, category) bench += gen_bench_against(f, simd, typ, f.bench_against_libs()) ## bench_with_timestamp bench_with_timestamp = '' bench_with_timestamp += 'std::map> sums;' + '\n' bench_with_timestamp += 'size_t const nb_runs = 10 * 1000;' + '\n' bench_with_timestamp += gen_bench_against_with_timestamp(f, 'cpu', typ, f.bench_against_cpu()) bench_with_timestamp += gen_bench_with_timestamp(f, simd, typ, category) bench_with_timestamp += gen_bench_unrolls_with_timestamp(f, simd, typ, category) bench_with_timestamp += gen_bench_against_with_timestamp(f, simd, typ, f.bench_against_libs()) bench_with_timestamp += ''' std::string json = ""; json += "{{\\n"; json += " \\"benchmarks\\": [\\n"; for (auto const & bench_name_sum_time : sums) {{ std::string const & bench_name = bench_name_sum_time.first; {typ} const & sum = bench_name_sum_time.second.first; double const & elapsed_time_ns = bench_name_sum_time.second.second; json += " {{" "\\n"; json += " \\"name\\": \\"" + bench_name + "/{typ}\\"," + "\\n"; json += " \\"real_time\\": " + std::to_string(elapsed_time_ns) + "," + "\\n"; json += " \\"sum\\": " + std::string(std::isfinite(sum) ? "" : "\\"") + std::to_string(sum) + std::string(std::isfinite(sum) ? 
"" : "\\"") + "," + "\\n"; json += " \\"time_unit\\": \\"ns\\"\\n"; json += " }}"; if (&bench_name_sum_time != &*sums.rbegin()) {{ json += ","; }} json += "\\n"; }} json += " ]\\n"; json += "}}\\n"; std::cout << json << std::flush; '''.format(typ=typ) ## Finalize code code = gen_bench_from_code(f, typ, bench, '') # bench_with_timestamp ## Write file with common.open_utf8(_opts, path) as f: f.write(code) ## Clang-format it! common.clang_format(_opts, path) # ----------------------------------------------------------------------------- # Entry point def doit(opts): global _opts _opts = opts common.myprint(opts, 'Generating benches') for f in functions.values(): if not f.do_bench: if opts.verbose: common.myprint(opts, 'Skipping bench: {}'.format(f.name)) continue # WE MUST GENERATE CODE FOR EACH SIMD EXTENSION AS OTHER LIBRARY # USUALLY DO NOT PROPOSE A GENERIC INTERFACE for simd in _opts.simd: ## FIXME if simd in ['neon128', 'cpu']: continue for typ in f.types: ## FIXME if typ == 'f16': continue ## Skip non-matching benches if opts.match and not opts.match.match(f.name): continue ## FIXME if f.name in ['gamma', 'lgamma', 'ziplo', 'ziphi', 'unziphi', 'unziplo']: continue gen_bench(f, simd, typ) ================================================ FILE: egg/gen_doc.py ================================================ # Use utf-8 encoding # -*- coding: utf-8 -*- # Copyright (c) 2021 Agenium Scale # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal # in the Software without restriction, including without limitation the rights # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell # copies of the Software, and to permit persons to whom the Software is # furnished to do so, subject to the following conditions: # # The above copyright notice and this permission notice shall be included in all # copies or substantial portions of the Software. 
# # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. import os import platform import io import sys import subprocess import common import collections import operators import re import string categories = operators.categories operators = operators.operators # ----------------------------------------------------------------------------- # Get output of command def get_command_output(args): p = subprocess.Popen(args, stdout=subprocess.PIPE) lines = p.communicate()[0].split('\n')[0:-1] return '\n'.join([' {}'.format(l) for l in lines]) # ----------------------------------------------------------------------------- def gen_overview(opts): filename = common.get_markdown_file(opts, 'overview') if not common.can_create_filename(opts, filename): return with common.open_utf8(opts, filename) as fout: fout.write('''# Overview ## NSIMD scalar types Their names follow the following pattern: `Sxx` where - `S` is `i` for signed integers, `u` for unsigned integer or `f` for floatting point number. - `xx` is the number of bits taken to represent the number. Full list of scalar types: ''') for t in common.types: fout.write('- `{}`\n'.format(t)) fout.write(''' ## NSIMD generic SIMD vector types In NSIMD, we call a platform an architecture e.g. Intel, ARM, POWERPC. We call SIMD extension a set of low-level functions and types provided by hardware vendors to access SIMD units. Examples include SSE2, SSE42, AVX, ... When compiling the generic SIMD vector types represents a SIMD register of the target. 
Examples are a `__m128` for Intel SSE, `__m512` for Intel AVX-512 or `svfloat32_t` for Arm SVE. Their names follow the following pattern: - C base API: `vSCALAR` where `SCALAR` is a one of scalar type listed above. - C advanced API: `nsimd_pack_SCALAR` where `SCALAR` is a one of scalar type listed above. - C++ advanced API: `nsimd::pack` where `SCALAR` is a one of scalar type listed above. Full list of SIMD vector types: | Base type | C base API | C advanced API | C++ advanced API | |-----------|------------|----------------|------------------| ''') fout.write('\n'.join([ '| `{typ}` | `v{typ}` | `nsimd_pack_{typ}` | `nsimd::pack<{typ}>` |'. \ format(typ=typ) for typ in common.types])) fout.write(''' ## C/C++ base APIs These come automatically when you include `nsimd/nsimd.h`. You do *not* need to include a header file for having a function. Here is a list of supported platforms and their corresponding SIMD extensions. ''') platforms = common.get_platforms(opts) for p in platforms: fout.write('- Platform `{}`\n'.format(p)) for s in platforms[p].get_simd_exts(): fout.write(' - `{}`\n'.format(s)) fout.write(''' Each simd extension has its own set of SIMD types and functions. Types follow the pattern: `nsimd_SIMDEXT_vSCALAR` where - `SIMDEXT` is the SIMD extensions. - `SCALAR` is one of scalar types listed above. There are also logical types associated to each SIMD vector type. These types are used, for example, to represent the result of a comparison of SIMD vectors. They are usually bit masks. Their name follow the pattern: `nsimd_SIMDEXT_vlSCALAR` where - `SIMDEXT` is the SIMD extensions. - `SCALAR` is one of scalar types listed above. Note 1: Platform `cpu` is a 128 bits SIMD emulation fallback when no SIMD extension has been specified or is supported on a given compilation target. Note 2: as all SIMD extensions of all platforms are different there is no need to put the name of the platform in each identifier. 
Function names follow the pattern: `nsimd_SIMDEXT_FUNCNAME_SCALAR` where - `SIMDEXT` is the SIMD extensions. - `FUNCNAME` is the name of a function e.g. `add` or `sub`. - `SCALAR` is one of scalar types listed above. ### Generic identifier In the base C API, genericity is achieved using macros. - `vec(SCALAR)` is a type to represent a SIMD vector containing SCALAR elements. SCALAR must be one of scalar types listed above. - `vecl(SCALAR)` is a type to represent a SIMD vector of logicals for SCALAR elements. SCALAR must be one of scalar types listed above. - `vec_a(SCALAR, SIMDEXT)` is a type to represent a SIMD vector containing SCALAR elements for the simd extension SIMDEXT. SCALAR must be one of scalar types listed above and SIMDEXT must be a valid SIMD extension. - `vecl_a(SCALAR, SIMDEXT)` is a type to represent a SIMD vector of logicals for SCALAR elements for the simd extension SIMDEXT. SCALAR must be one of scalar types listed above and SIMDEXT must be a valid SIMD extension. - `vFUNCNAME` takes as input the above types to access the operator FUNCNAME e.g. `vadd`, `vsub`. In C++98 and C++03, type traits are available. - `nsimd::simd_traits::vector` is the SIMD vector type for platform SIMDEXT containing SCALAR elements. SIMDEXT is one of SIMD extension listed above, SCALAR is one of scalar type listed above. - `nsimd::simd_traits::vectorl` is the SIMD vector of logicals type for platform SIMDEXT containing SCALAR elements. SIMDEXT is one of SIMD extensions listed above, SCALAR is one of scalar type listed above. In C++11 and beyond, type traits are still available but typedefs are also provided. - `nsimd::vector` is a typedef to `nsimd::simd_traits::vector`. - `nsimd::vectorl` is a typedef to `nsimd::simd_traits::vectorl`. The C++20 API does not bring different types for SIMD registers nor other way to access the other SIMD types. It only brings concepts instead of usual `typename`s. For more informations cf. . 
Note that all macro and functions available in plain C are still available in C++. ### List of operators provided by the base APIs In the documentation we use interchangeably the terms "function" and "operator". For each operator FUNCNAME a C function (also available in C++) named `nsimd_SIMDEXT_FUNCNAME_SCALAR` is available for each SCALAR type unless specified otherwise. For each FUNCNAME, a C macro (also available in C++) named `vFUNCNAME` is available and takes as its last argument a SCALAR type. For each FUNCNAME, a C macro (also available in C++) named `vFUNCNAME_a` is available and takes as its two last argument a SCALAR type and a SIMDEXT. For each FUNCNAME, a C++ function in namespace `nsimd` named `FUNCNAME` is available. It takes as its last argument the SCALAR type and can optionnally take the SIMDEXT as its last last argument. For example, for the addition of two SIMD vectors `a` and `b` here are the possibilities: ```c++ c = nsimd_add_avx_f32(a, b); // use AVX c = nsimd::add(a, b, f32()); // use detected SIMDEXT c = nsimd::add(a, b, f32(), avx()); // force AVX even if detected SIMDEXT is not AVX c = vadd(a, b, f32); // use detected SIMDEXT c = vadd_e(a, b, f32, avx); // force AVX even if detected SIMDEXT is not AVX ``` Here is a list of available FUNCNAME. ''') for op_name, operator in operators.items(): return_typ = common.get_one_type_generic(operator.params[0], 'SCALAR') func = operator.name args = ', '.join([common.get_one_type_generic(p, 'SCALAR') + \ ' a' + str(count) for count, p in \ enumerate(operator.params[1:])]) fout.write('- `{} {}({});` \n'.format(return_typ, func, args)) if len(operator.types) < len(common.types): typs = ', '.join(['{}'.format(t) for t in operator.types]) fout.write(' Only available for {}\n'.format(typs)) fout.write(''' ## C advanced API (only available in C11) The C advanced API takes advantage of the C11 `_Generic` keyword to provide function overloading. 
Unlike the base API described above there is no need to pass as arguments the base type of the SIMD extension. The informations are contained in the types provided by this API. - `nsimd_pack_SCALAR_SIMDEXT` represents a SIMD vectors containing SCALAR elements of SIMD extension SIMDEXT. - `nsimd::packl_SCALAR_SIMDEXT` represents a SIMD vectors of logicals for SCALAR elements of SIMD extension SIMDEXT. There are versions of the above type without SIMDEXT for which the targeted SIMD extension is automatically chosen. - `nsimd_pack_SCALAR` represents a SIMD vectors containing SCALAR elements. - `nsimd::packl_SCALAR` represents a SIMD vectors of logicals for SCALAR elements. Generic types are also available: - `nsimd_pack(SCALAR)` is a type to represent a SIMD vector containing SCALAR elements. SCALAR must be one of scalar types listed above. - `nsimd_packl(SCALAR)` is a type to represent a SIMD vector of logicals for SCALAR elements. SCALAR must be one of scalar types listed above. - `nsimd_pack_a(SCALAR, SIMDEXT)` is a type to represent a SIMD vector containing SCALAR elements for the simd extension SIMDEXT. SCALAR must be one of scalar types listed above and SIMDEXT must be a valid SIMD extension. - `nsimd_packl_a(SCALAR, SIMDEXT)` is a type to represent a SIMD vector of logicals for SCALAR elements for the simd extension SIMDEXT. SCALAR must be one of scalar types listed above and SIMDEXT must be a valid SIMD extension. Finally, operators are follow the naming: `nsimd_FUNCNAME` e.g. `nsimd_add`, `nsimd_sub`. ## C++ advanced API The C++ advanced API is called advanced not because it requires C++11 or above but because it makes use of the particular implementation of ARM SVE by ARM in their compiler. We do not know if GCC (and possibly MSVC in the distant future) will use the same approach. Anyway the current implementation allows us to put SVE SIMD vectors inside some kind of structs that behave like standard structs. 
If you want to be sure to write portable code do *not* use this API. Two new types are available. - `nsimd::pack` represents `N` SIMD vectors containing SCALAR elements of SIMD extension SIMDEXT. You can specify only the first template argument. The second defaults to 1 while the third defaults to the detected SIMDEXT. - `nsimd::packl` represents `N` SIMD vectors of logical type containing SCALAR elements of SIMD extension SIMDEXT. You can specify only the first template argument. The second defaults to 1 while the third defaults to the detected SIMDEXT. Use N > 1 when declaring packs to have an unroll of N. This is particularily useful on ARM. Functions that takes packs do not take any other argument unless specified otherwise e.g. the load family of funtions. It is impossible to determine the kind of pack (unroll and SIMDEXT) from the type of a pointer. Therefore in this case, the last argument must be a pack and this same type will then return. Also some functions are available as C++ operators. They follow the naming: `nsimd::FUNCNAME`. 
''') # ----------------------------------------------------------------------------- def gen_doc(opts): common.myprint(opts, 'Generating doc for each function') # Build tree for api.md api = dict() for _, operator in operators.items(): for c in operator.categories: if c not in api: api[c] = [operator] else: api[c].append(operator) # api.md # filename = os.path.join(opts.script_dir, '..','doc', 'markdown', 'api.md') filename = common.get_markdown_file(opts, 'api') if common.can_create_filename(opts, filename): with common.open_utf8(opts, filename) as fout: fout.write('# General API\n\n') fout.write('- [Memory function](memory.md)\n') fout.write('- [Float16 related functions](fp16.md)\n') fout.write('- [Defines provided by NSIMD](defines.md)\n') fout.write('- [NSIMD pack and related functions](pack.md)\n\n') fout.write('- [NSIMD C++20 concepts](concepts.md)\n\n') fout.write('# SIMD operators\n') for c, ops in api.items(): if len(ops) == 0: continue fout.write('\n## {}\n\n'.format(c.title)) for op in ops: Full_name = op.full_name[0].upper() + op.full_name[1:] fout.write('- [{} ({})](api_{}.md)\n'.format( Full_name, op.name, common.to_filename(op.name))) # helper to get list of function signatures def to_string(var): sigs = [var] if type(var) == str or not hasattr(var, '__iter__') \ else list(var) for i in range(0, len(sigs)): sigs[i] = re.sub('[ \n\t\r]+', ' ', sigs[i]) return '\n'.join(sigs) # Operators (one file per operator) # dirname = os.path.join(opts.script_dir, '..','doc', 'markdown') dirname = common.get_markdown_dir(opts) common.mkdir_p(dirname) for op_name, operator in operators.items(): # Skip non-matching doc if opts.match and not opts.match.match(op_name): continue # filename = os.path.join(dirname, 'api_{}.md'.format(common.to_filename( # operator.name))) filename = common.get_markdown_api_file(opts, operator.name) if not common.can_create_filename(opts, filename): continue Full_name = operator.full_name[0].upper() + operator.full_name[1:] with 
common.open_utf8(opts, filename) as fout: fout.write('# {}\n\n'.format(Full_name)) fout.write('## Description\n\n') fout.write(operator.desc) fout.write('\n\n## C base API (generic)\n\n') fout.write('```c\n') fout.write(to_string(operator.get_generic_signature('c_base'))) fout.write('\n```\n\n') fout.write('\n\n## C advanced API (generic, requires C11)\n\n') fout.write('```c\n') fout.write(to_string(operator.get_generic_signature('c_adv'))) fout.write('\n```\n\n') fout.write('## C++ base API (generic)\n\n') fout.write('```c++\n') fout.write(to_string(operator.get_generic_signature('cxx_base'))) fout.write('\n```\n\n') fout.write('## C++ advanced API\n\n') fout.write('```c++\n') fout.write(to_string(operator.get_generic_signature('cxx_adv'). \ values())) fout.write('\n```\n\n') fout.write('## C base API (architecture specifics)') for simd_ext in opts.simd: fout.write('\n\n### {}\n\n'.format(simd_ext.upper())) fout.write('```c\n') for typ in operator.types: fout.write(operator.get_signature(typ, 'c_base', simd_ext)) fout.write(';\n') fout.write('```') fout.write('\n\n## C++ base API (architecture specifics)') for simd_ext in opts.simd: fout.write('\n\n### {}\n\n'.format(simd_ext.upper())) fout.write('```c\n') for typ in operator.types: fout.write(operator.get_signature(typ, 'cxx_base', simd_ext)) fout.write(';\n') fout.write('```') # ----------------------------------------------------------------------------- def gen_modules_md(opts): common.myprint(opts, 'Generating modules.md') mods = common.get_modules(opts) ndms = [] for mod in mods: name = eval('mods[mod].{}.hatch.name()'.format(mod)) desc = eval('mods[mod].{}.hatch.desc()'.format(mod)) ndms.append([name, desc, mod]) filename = common.get_markdown_file(opts, 'modules') if not common.can_create_filename(opts, filename): return with common.open_utf8(opts, filename) as fout: fout.write('''# Modules NSIMD comes with several additional modules. 
A module provides a set of functionnalities that are usually not at the same level as SIMD intrinsics and/or that do not provide all C and C++ APIs. These functionnalities are given with the library because they make heavy use of NSIMD core which abstract SIMD intrinsics. Below is the exhaustive list of modules. ''') for ndm in ndms: fout.write('- [{}](module_{}_overview.md) \n'.format(ndm[0], ndm[2])) fout.write('\n'.join([' {}'.format(line.strip()) \ for line in ndm[1].split('\n')])) fout.write('\n\n') # ----------------------------------------------------------------------------- def build_exe_for_doc(opts): if not opts.list_files: doc_dir = os.path.join(opts.script_dir, '..', 'doc') if platform.system() == 'Windows': code = os.system('cd {} && nmake /F Makefile.win'. \ format(os.path.normpath(doc_dir))) else: code = os.system('cd {} && make -f Makefile.nix'. \ format(os.path.normpath(doc_dir))) if code == 0: common.myprint(opts, 'Build successful') else: common.myprint(opts, 'Build failed') # ----------------------------------------------------------------------------- def gen_what_is_wrapped(opts): common.myprint(opts, 'Generating "which intrinsics are wrapped"') build_exe_for_doc(opts) wrapped = 'what_is_wrapped.exe' if platform.system() == 'Windows' \ else 'what_is_wrapped' doc_dir = os.path.join(opts.script_dir, '..', 'doc') full_path_wrapped = os.path.join(doc_dir, wrapped) if not os.path.isfile(full_path_wrapped): common.myprint(opts, '{} not found'.format(wrapped)) return # Content for indexing files created in this function index = '# Intrinsics that are wrapped\n' # Build command line cmd0 = '{} {},{},{},{},{},{}'.format(full_path_wrapped, common.in0, common.in1, common.in2, common.in3, common.in4, common.in5) # For now we only list Intel, Arm and POWERPC intrinsics simd_exts = common.x86_simds + common.arm_simds + common.ppc_simds for p in common.get_platforms(opts): index_simds = '' for simd_ext in opts.platforms_list[p].get_simd_exts(): if simd_ext 
not in simd_exts: continue md = os.path.join(common.get_markdown_dir(opts), 'wrapped_intrinsics_for_{}.md'.format(simd_ext)) index_simds += '- [{}](wrapped_intrinsics_for_{}.md)\n'. \ format(simd_ext.upper(), simd_ext) ops = [[], [], [], []] for op_name, operator in operators.items(): if operator.src: continue c_src = os.path.join(opts.include_dir, p, simd_ext, '{}.h'.format(op_name)) ops[operator.output_to].append('{} "{}"'. \ format(op_name, c_src)) if not common.can_create_filename(opts, md): continue with common.open_utf8(opts, md) as fout: fout.write('# Intrinsics wrapped for {}\n\n'. \ format(simd_ext.upper())) fout.write('Notations are as follows:\n' '- `T` for trick usually using other intrinsics\n' '- `E` for scalar emulation\n' '- `NOOP` for no operation\n' '- `NA` means the operator does not exist for ' 'the given type\n' '- `intrinsic` for the actual wrapped intrinsic\n' '\n') cmd = '{} {} same {} >> "{}"'.format(cmd0, simd_ext, ' '.join(ops[common.OUTPUT_TO_SAME_TYPE]), md) if os.system(cmd) != 0: common.myprint(opts, 'Unable to generate markdown for ' '"same"') continue cmd = '{} {} same_size {} >> "{}"'.format(cmd0, simd_ext, ' '.join(ops[common.OUTPUT_TO_SAME_SIZE_TYPES]), md) if os.system(cmd) != 0: common.myprint(opts, 'Unable to generate markdown for ' '"same_size"') continue cmd = '{} {} bigger_size {} >> "{}"'.format(cmd0, simd_ext, ' '.join(ops[common.OUTPUT_TO_UP_TYPES]), md) if os.system(cmd) != 0: common.myprint(opts, 'Unable to generate markdown for ' '"bigger_size"') continue cmd = '{} {} lesser_size {} >> "{}"'.format(cmd0, simd_ext, ' '.join(ops[common.OUTPUT_TO_DOWN_TYPES]), md) if os.system(cmd) != 0: common.myprint(opts, 'Unable to generate markdown for ' '"lesser_size"') continue if index_simds != '': index += '\n## Platform {}\n\n'.format(p) index += index_simds md = os.path.join(common.get_markdown_dir(opts), 'wrapped_intrinsics.md') if common.can_create_filename(opts, md): with common.open_utf8(opts, md) as fout: 
fout.write(index) # ----------------------------------------------------------------------------- def get_html_dir(opts): return os.path.join(opts.script_dir, '..', 'doc', 'html') def get_html_api_file(opts, name, module=''): root = get_html_dir(opts) op_name = to_filename(name) if module == '': return os.path.join(root, 'api_{}.html'.format(op_name)) else: return os.path.join(root, 'module_{}_api_{}.html'. \ format(module, op_name)) def get_html_file(opts, name, module=''): root = get_html_dir(opts) op_name = to_filename(name) if module == '': return os.path.join(root, '{}.html'.format(op_name)) else: return os.path.join(root, 'module_{}_{}.html'.format(module, op_name)) doc_header = '''\ {}

NSIMD documentation
{} ''' doc_footer = '''\ ''' def get_html_header(opts, title, filename): # check if filename is part of a module doc for mod in opts.modules_list: if filename.startswith('module_{}_'.format(mod)): links = eval('opts.modules_list[mod].{}.hatch.doc_menu()'. \ format(mod)) name = eval('opts.modules_list[mod].{}.hatch.name()'.format(mod)) html = '
\n' html += '{} module documentation\n'.format(name) if len(links) > 0: html += '
\n' html += \ '
\n' html += ' | '.join(['{}'. \ format(mod, href, label) \ for label, href in links.items()]) html += '\n
\n
\n' return doc_header.format(title, html) return doc_header.format(title, '') def get_html_footer(): return doc_footer # ----------------------------------------------------------------------------- def gen_doc_html(opts, title): if not opts.list_files: build_exe_for_doc(opts) md2html = 'md2html.exe' if platform.system() == 'Windows' \ else 'md2html' doc_dir = os.path.join(opts.script_dir, '..', 'doc') full_path_md2html = os.path.join(doc_dir, md2html) if not os.path.isfile(full_path_md2html): common.myprint(opts, '{} not found'.format(md2html)) return # get all markdown files md_dir = common.get_markdown_dir(opts) html_dir = get_html_dir(opts) if not os.path.isdir(html_dir): mkdir_p(html_dir) doc_files = [] for filename in os.listdir(md_dir): name = os.path.basename(filename) if name.endswith('.md'): doc_files.append(os.path.splitext(name)[0]) if opts.list_files: ## list gen files for filename in doc_files: input_name = os.path.join(md_dir, filename + '.md') output_name = os.path.join(html_dir, filename + '.html') print(output_name) else: ## gen html files footer = get_html_footer() tmp_file = os.path.join(doc_dir, 'tmp.html') for filename in doc_files: header = get_html_header(opts, title, filename) input_name = os.path.join(md_dir, filename + '.md') output_name = os.path.join(html_dir, filename + '.html') os.system('{} "{}" "{}"'.format(full_path_md2html, input_name, tmp_file)) with common.open_utf8(opts, output_name) as fout: fout.write(header) with io.open(tmp_file, mode='r', encoding='utf-8') as fin: fout.write(fin.read()) fout.write(footer) def gen_html(opts): common.myprint(opts, 'Generating HTML documentation') gen_doc_html(opts, 'NSIMD documentation') # ----------------------------------------------------------------------------- def copy_github_file_to_doc(opts, github_filename, doc_filename): common.myprint(opts, 'Copying {} ---> {}'. 
\ format(github_filename, doc_filename)) if not common.can_create_filename(opts, doc_filename): return with io.open(github_filename, mode='r', encoding='utf-8') as fin: file_content = fin.read() # we replace all links to doc/... by nsimd/... file_content = file_content.replace('doc/markdown/', 'nsimd/') file_content = file_content.replace('doc/', 'nsimd/') # we do not use common.open_utf8 as the copyright is already in content with io.open(doc_filename, mode='w', encoding='utf-8') as fout: fout.write(file_content) # ----------------------------------------------------------------------------- def doit(opts): gen_overview(opts) gen_doc(opts) gen_modules_md(opts) gen_what_is_wrapped(opts) root_dir = os.path.join(opts.script_dir, '..') copy_github_file_to_doc(opts, os.path.join(root_dir, 'README.md'), common.get_markdown_file(opts, 'index')) copy_github_file_to_doc(opts, os.path.join(root_dir, 'CONTRIBUTING.md'), common.get_markdown_file(opts, 'contribute')) gen_html(opts) # This must be last ================================================ FILE: egg/gen_friendly_but_not_optimized.py ================================================ # Copyright (c) 2019 Agenium Scale # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal # in the Software without restriction, including without limitation the rights # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell # copies of the Software, and to permit persons to whom the Software is # furnished to do so, subject to the following conditions: # # The above copyright notice and this permission notice shall be included in all # copies or substantial portions of the Software. # # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. import common import operators import os from datetime import date import sys # ----------------------------------------------------------------------------- # Generate advanced C++ API def get_impl(operator): if operator.params == ['v', 'v', 'v'] or \ operator.params == ['l', 'v', 'v']: return \ '''template pack{l} operator{cxx_op}(pack const &v, S s) {{ return {op_name}(v, pack(T(s))); }} template pack{l} operator{cxx_op}(S s, pack const &v) {{ return {op_name}(pack(T(s)), v); }}'''.format(l='l' if operator.params[0] == 'l' else '', cxx_op=operator.cxx_operator, op_name=operator.name) if operator.params == ['l', 'l', 'l']: return \ '''template packl operator{cxx_op}(packl const &v, S s) {{ return {op_name}(v, packl(bool(s))); }} template packl operator{cxx_op}(S s, packl const &v) {{ return {op_name}(pack(bool(s)), v); }} template packl operator{cxx_op}(packl const &v, packl const &w) {{ return {op_name}(v, reinterpretl >(w)); }}'''.format(cxx_op=operator.cxx_operator, op_name=operator.name) # ----------------------------------------------------------------------------- # Generate advanced C++ API def doit(opts): common.myprint(opts, 'Generating friendly but not optimized advanced ' 'C++ API') filename = os.path.join(opts.include_dir, 'friendly_but_not_optimized.hpp') if not common.can_create_filename(opts, filename): return with common.open_utf8(opts, filename) as out: out.write('''#ifndef NSIMD_FRIENDLY_BUT_NOT_OPTIMIZED_HPP #define NSIMD_FRIENDLY_BUT_NOT_OPTIMIZED_HPP #include #include namespace nsimd {{ '''.format(year=date.today().year)) for op_name, operator in operators.operators.items(): if operator.cxx_operator == None or len(operator.params) != 3 or \ operator.name in ['shl', 'shr']: 
continue out.write('''{hbar} {code} '''.format(hbar=common.hbar, code=get_impl(operator))) out.write('''{hbar} }} // namespace nsimd #endif'''.format(hbar=common.hbar)) common.clang_format(opts, filename) ================================================ FILE: egg/gen_modules.py ================================================ # Copyright (c) 2019 Agenium Scale # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal # in the Software without restriction, including without limitation the rights # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell # copies of the Software, and to permit persons to whom the Software is # furnished to do so, subject to the following conditions: # # The above copyright notice and this permission notice shall be included in all # copies or substantial portions of the Software. # # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. 
import os import common def doit(opts): mods = common.get_modules(opts) for mod in mods: exec('mods[mod].{}.hatch.doit(opts)'.format(mod)) ================================================ FILE: egg/gen_scalar_utilities.py ================================================ # Copyright (c) 2019 Agenium Scale # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal # in the Software without restriction, including without limitation the rights # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell # copies of the Software, and to permit persons to whom the Software is # furnished to do so, subject to the following conditions: # # The above copyright notice and this permission notice shall be included in all # copies or substantial portions of the Software. # # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. 
import os import common import operators import scalar import cuda import rocm import oneapi # ----------------------------------------------------------------------------- def get_gpu_impl(gpu_sig, cuda_impl, rocm_impl, oneapi_sig, oneapi_impl): if cuda_impl == rocm_impl: return '''#if defined(NSIMD_CUDA) || defined(NSIMD_ROCM) inline {gpu_sig} {{ {cuda_impl} }} #elif defined(NSIMD_ONEAPI) inline {oneapi_sig} {{ {oneapi_impl} }} #endif'''.format(gpu_sig=gpu_sig, cuda_impl=cuda_impl, oneapi_sig=oneapi_sig, oneapi_impl=oneapi_impl) else: return '''#if defined(NSIMD_CUDA) inline {gpu_sig} {{ {cuda_impl} }} #elif defined(NSIMD_ROCM) inline {gpu_sig} {{ {rocm_impl} }} #elif defined(NSIMD_ONEAPI) inline {oneapi_sig} {{ {oneapi_impl} }} #endif'''.format(gpu_sig=gpu_sig, cuda_impl=cuda_impl, rocm_impl=rocm_impl, oneapi_sig=oneapi_sig, oneapi_impl=oneapi_impl) # ----------------------------------------------------------------------------- def doit(opts): common.myprint(opts, 'Generating scalar implementation for CPU and GPU') filename = os.path.join(opts.include_dir, 'scalar_utilities.h') if not common.can_create_filename(opts, filename): return with common.open_utf8(opts, filename) as out: # we declare reinterprets now as we need them scalar_tmp = [] gpu_tmp = [] oneapi_tmp = [] for t in operators.Reinterpret.types: for tt in common.get_output_types( t, operators.Reinterpret.output_to): scalar_tmp += [operators.Reinterpret(). \ get_scalar_signature('cpu', t, tt, 'c')] gpu_tmp += [operators.Reinterpret(). \ get_scalar_signature('gpu', t, tt, 'cxx')] oneapi_tmp += [operators.Reinterpret(). 
\ get_scalar_signature('oneapi', t, tt, 'cxx')] scalar_reinterpret_decls = '\n'.join(['NSIMD_INLINE ' + sig + ';' \ for sig in scalar_tmp]) gpu_reinterpret_decls = '\n'.join(['inline ' + sig + ';' \ for sig in gpu_tmp]) oneapi_reinterpret_decls = '\n'.join(['inline ' + sig + ';' \ for sig in oneapi_tmp]) sleef_decls = '' for op in operators.operators.values(): if 'sleef_symbol_prefix' in op.__class__.__dict__: sleef_decls += 'f32 {}_scalar_f32({});\n'. \ format(op.sleef_symbol_prefix, ', '.join(['f32'] * len(op.params[1:]))) sleef_decls += 'f64 {}_scalar_f64({});\n'. \ format(op.sleef_symbol_prefix, ', '.join(['f64'] * len(op.params[1:]))) out.write( '''#ifndef NSIMD_SCALAR_UTILITIES_H #define NSIMD_SCALAR_UTILITIES_H #if NSIMD_CXX > 0 #include #include #else #include #include #endif #ifdef NSIMD_NATIVE_FP16 #if defined(NSIMD_IS_GCC) #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wdouble-promotion" #elif defined(NSIMD_IS_CLANG) #pragma clang diagnostic push #pragma clang diagnostic ignored "-Wdouble-promotion" #endif #endif {hbar} #if NSIMD_CXX > 0 extern "C" {{ #endif {sleef_decls} #if NSIMD_CXX > 0 }} // extern "C" #endif {hbar} {scalar_reinterpret_decls} #if defined(NSIMD_CUDA) || defined(NSIMD_ROCM) || \ defined(NSIMD_ONEAPI) namespace nsimd {{ #if defined(NSIMD_CUDA) || defined(NSIMD_ROCM) {gpu_reinterpret_decls} #elif defined(NSIMD_ONEAPI) {oneapi_reinterpret_decls} #endif }} // namespace nsimd #endif '''. 
\ format(hbar=common.hbar, sleef_decls=sleef_decls, scalar_reinterpret_decls=scalar_reinterpret_decls, gpu_reinterpret_decls=gpu_reinterpret_decls, oneapi_reinterpret_decls=oneapi_reinterpret_decls)) for op_name, operator in operators.operators.items(): if not operator.has_scalar_impl: continue if operator.params == ['l'] * len(operator.params): out.write('\n\n' + common.hbar + '\n\n') out.write( '''NSIMD_INLINE {c_sig} {{ {scalar_impl} }} #if NSIMD_CXX > 0 namespace nsimd {{ NSIMD_INLINE {cxx_sig} {{ return nsimd_scalar_{op_name}({c_args}); }} {gpu_impl} }} // namespace nsimd #endif'''.format( c_sig=operator.get_scalar_signature('cpu', '', '', 'c'), cxx_sig=operator.get_scalar_signature('cpu', '', '', 'cxx'), op_name=op_name, c_args=', '.join(['a{}'.format(i - 1) \ for i in range(1, len(operator.params))]), scalar_impl=scalar.get_impl(operator, tt, t), gpu_impl=get_gpu_impl( operator.get_scalar_signature('gpu', t, tt, 'cxx'), cuda.get_impl(operator, tt, t), rocm.get_impl(operator, tt, t), operator.get_scalar_signature('oneapi', t, tt, 'cxx'), oneapi.get_impl(operator, tt, t)))) continue for t in operator.types: tts = common.get_output_types(t, operator.output_to) for tt in tts: out.write('\n\n' + common.hbar + '\n\n') out.write( '''NSIMD_INLINE {c_sig} {{ {scalar_impl} }} #if NSIMD_CXX > 0 namespace nsimd {{ NSIMD_INLINE {cxx_sig} {{ return nsimd_scalar_{op_name}_{suffix}({c_args}); }} {gpu_impl} }} // namespace nsimd #endif'''.format( c_sig=operator.get_scalar_signature('cpu', t, tt, 'c'), cxx_sig=operator.get_scalar_signature('cpu', t, tt, 'cxx'), op_name=op_name, suffix=t if operator.closed else '{}_{}'.format(tt, t), c_args=', '.join(['a{}'.format(i - 1) \ for i in range(1, len(operator.params))]), scalar_impl=scalar.get_impl(operator, tt, t), gpu_impl=get_gpu_impl( operator.get_scalar_signature('gpu', t, tt, 'cxx'), cuda.get_impl(operator, tt, t), rocm.get_impl(operator, tt, t), operator.get_scalar_signature('oneapi', t, tt, 'cxx'), oneapi.get_impl(operator, 
tt, t)))) out.write(''' {hbar} #ifdef NSIMD_NATIVE_FP16 #if defined(NSIMD_IS_GCC) #pragma GCC diagnostic pop #elif defined(NSIMD_IS_CLANG) #pragma clang diagnostic pop #endif #endif #endif'''.format(hbar=common.hbar)) common.clang_format(opts, filename) ================================================ FILE: egg/gen_src.py ================================================ # Copyright (c) 2021 Agenium Scale # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal # in the Software without restriction, including without limitation the rights # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell # copies of the Software, and to permit persons to whom the Software is # furnished to do so, subject to the following conditions: # # The above copyright notice and this permission notice shall be included in all # copies or substantial portions of the Software. # # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. 
import common import operators import os from datetime import date import sys # ----------------------------------------------------------------------------- # Implementations for output def get_put_impl(simd_ext): args = { 'i8' : ['"%d"', '(int)buf[i]'], 'u8' : ['"%d"', '(int)buf[i]'], 'i16': ['"%d"', '(int)buf[i]'], 'u16': ['"%d"', '(int)buf[i]'], 'i32': ['"%d"', 'buf[i]'], 'u32': ['"%u"', 'buf[i]'], 'i64': ['"%lld"', '(nsimd_longlong)buf[i]'], 'u64': ['"%llu"', '(nsimd_ulonglong)buf[i]'], 'f16': ['"%e"', '(double)nsimd_f16_to_f32(buf[i])'], 'f32': ['"%e"', '(double)buf[i]'], 'f64': ['"%e"', 'buf[i]'], } ret = '''#ifdef NSIMD_LONGLONG_IS_EXTENSION #if defined(NSIMD_IS_GCC) || defined(NSIMD_IS_CLANG) #pragma GCC diagnostic ignored "-Wformat" #endif #endif #include extern "C" { ''' for typ in common.types: fmt = \ '''NSIMD_DLLEXPORT int NSIMD_VECTORCALL nsimd_put_{simd_ext}_{l}{typ}(FILE *out, const char *fmt, nsimd_{simd_ext}_v{l}{typ} v) {{ using namespace nsimd; {typ} buf[NSIMD_MAX_LEN({typ})]; int n = len({typ}(), {simd_ext}()); store{l}u(buf, v, {typ}(), {simd_ext}()); if (fputs("{{ ", out) == EOF) {{ return -1; }} int ret = 2; for (int i = 0; i < n; i++) {{ int code; if (fmt != NULL) {{ code = fprintf(out, fmt, {val}); }} else {{ code = fprintf(out, {fmt}, {val}); }} if (code < 0) {{ return -1; }} ret += code; if (i < n - 1) {{ if (fputs(", ", out) == EOF) {{ return -1; }} ret += 2; }} }} if (fputs(" }}", out) == EOF) {{ return -1; }} return ret + 2; }} {hbar} ''' ret += fmt.format(typ=typ, l='', simd_ext=simd_ext, hbar=common.hbar, fmt=args[typ][0], val=args[typ][1]) ret += fmt.format(typ=typ, l='l', simd_ext=simd_ext, hbar=common.hbar, fmt=args[typ][0], val=args[typ][1]) ret += '} // extern "C"\n' return ret # ----------------------------------------------------------------------------- # Generate base APIs def write_cpp(opts, simd_ext, emulate_fp16): filename = os.path.join(opts.src_dir, 'api_{}.cpp'.format(simd_ext)) if not 
common.can_create_filename(opts, filename): return with common.open_utf8(opts, filename) as out: out.write('''#define NSIMD_INSIDE #include #include '''.format(year=date.today().year)) out.write(get_put_impl(simd_ext)) common.clang_format(opts, filename) def doit(opts): common.mkdir_p(opts.src_dir) common.myprint(opts, 'Generating source for binary') opts.platforms = common.get_platforms(opts) for platform in opts.platforms: mod = opts.platforms[platform] for simd_ext in mod.get_simd_exts(): write_cpp(opts, simd_ext, mod.emulate_fp16(simd_ext)) ================================================ FILE: egg/gen_tests.py ================================================ # Copyright (c) 2021 Agenium Scale # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal # in the Software without restriction, including without limitation the rights # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell # copies of the Software, and to permit persons to whom the Software is # furnished to do so, subject to the following conditions: # # The above copyright notice and this permission notice shall be included in all # copies or substantial portions of the Software. # # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. 
import os import math import sys import common import operators from datetime import date # ----------------------------------------------------------------------------- # Helper functions def should_i_do_the_test(operator, tt='', t=''): if operator.name == 'cvt' and t in common.ftypes and tt in common.iutypes: # When converting from float to int to float then we may not # get the initial result because of roundings. As tests are usually # done by going back and forth then both directions get tested in the # end return False if operator.name == 'reinterpret' and t in common.iutypes and \ tt in common.ftypes: # When reinterpreting from int to float we may get NaN or infinities # and no ones knows what this will give when going back to ints # especially when float16 are emulated. Again as tests are done by # going back and forth both directions get tested in the end. return False if operator.name in ['notb', 'andb', 'andnotb', 'xorb', 'orb'] and \ t == 'f16': # Bit operations on float16 are hard to check because they are # emulated in most cases. Therefore going back and forth with # reinterprets for doing bitwise operations make the bit in the last # place to wrong. This is normal but makes testing real hard. So for # now we do not test them on float16. return False if operator.name in ['len', 'set1', 'set1l', 'mask_for_loop_tail', 'loadu', 'loada', 'storeu', 'storea', 'loadla', 'loadlu', 'storela', 'storelu', 'if_else1']: # These functions are used in almost every tests so we consider # that they are extensively tested. return False if operator.name in ['store2a', 'store2u', 'store3a', 'store3u', 'store4a', 'store4u', 'scatter', 'scatter_linear', 'downcvt', 'to_logical']: # These functions are tested along with their load counterparts. 
# downcvt is tested along with upcvt and to_logical is tested with # to_mask return False return True # ----------------------------------------------------------------------------- # CBPRNG def cbprng_impl(typ, domain_, for_cpu, only_int = False): code = '((((unsigned int)(1 + i) * 69342380u + 414585u) ' \ '^ ((unsigned int)(1 + j) * 89375027u + 952905u))' \ '% 1000000u)' def c_code(a0_, a1_): if a1_ < a0_: raise ValueError("a0 must be lesser than a1") if typ in common.utypes and a0_ < 0.0 and a1_ < 0.0: raise ValueError("a0 and a1 must be positive") if typ in common.ftypes: a0 = a0_ a1 = a1_ else: a0 = 0 if typ in common.utypes and a0_ < 0 else math.ceil(a0_) a1 = math.floor(a1_) if a1 < a0: raise ValueError("a0 and a1 must be positive after filtering") if typ in common.iutypes: return 'return ({})({} + (f32)((i32){} % {}));'. \ format(typ, a0, code, a1 - a0 + 1) elif typ == 'f16': return \ 'return {}({}(((f32){} + (f32){} * (f32)({}) / 1000000.0f)));'. \ format('(f16)' if not for_cpu else 'nsimd_f32_to_f16', '(f32)(i32)' if only_int else '', a0, a1 - a0, code) elif typ in ['f32', 'f64']: return \ 'return {}(({}){} + ({}){} * ({}){} / ({})1000000);'. 
\ format('({})({})'.format(typ, 'i' + typ[1:]) if only_int else '', typ, a0, typ, a1 - a0, typ, code, typ) if typ not in common.utypes: domain = domain_ domain = [] for i in range(len(domain_) // 2): if domain_[2 * i + 1] > 0: domain.append(domain_[2 * i]) domain.append(domain_[2 * i + 1]) if len(domain) == 0: raise ValueError('domain {} is empty after filtering'.format(domain_)) nb_intervals = len(domain) // 2 if nb_intervals == 1: return ' {}'.format(c_code(domain[0], domain[1])) ret = 'int piece = ((1 + i) * (1 + j)) % {};'.format(nb_intervals) for i in range(nb_intervals - 1): ret += '\nif (piece == {}) {{\n'.format(i) ret += ' {}\n'.format(c_code(domain[2 * i], domain[2 * i + 1])) ret += '} else ' ret += '{\n' ret += ' {}\n'.format(c_code(domain[-2], domain[-1])) ret += '}' return ret def cbprng(typ, operator, target, gpu_params = None): if target not in ['cpu', 'cuda', 'hip', 'oneapi']: raise ValueError('Unsupported target, must be cpu, cuda, hip or ' 'oneapi') arity = len(operator.params[1:]) ret = '{}{} random_impl(int i, int j) {{\n'. \ format('' if target in ['cpu', 'oneapi'] else '__device__ ', typ) for_cpu = (target == 'cpu') if arity == 1: ret += cbprng_impl(typ, operator.domain[0], for_cpu, operator.tests_on_integers_only) else: for i in range(arity - 1): ret += 'if (j == {}) {{\n {}\n}} else '. \ format(i, cbprng_impl(typ, operator.domain[i], for_cpu, operator.tests_on_integers_only)) ret += '{{\n{}\n}} '. 
\ format(cbprng_impl(typ, operator.domain[-1], for_cpu, operator.tests_on_integers_only)) ret += '\n}\n\n' if target == 'cpu': ret += '''void random({} *dst, unsigned int n, int j) {{ unsigned int i; for (i = 0; i < n; i++) {{ dst[i] = random_impl((int)i, j); }} }}'''.format(typ) elif target == 'cuda': ret += '''__global__ void random_kernel({typ} *dst, int n, int j) {{ int i = threadIdx.x + blockIdx.x * blockDim.x; if (i < n) {{ dst[i] = random_impl((int)i, j); }} }} void random({typ} *dst, unsigned int n, int j) {{ random_kernel<<<{gpu_params}>>>(dst, (int)n, j); }}'''.format(typ=typ, gpu_params=gpu_params) elif target == 'hip': ret += '''__global__ void random_kernel({typ} *dst, size_t n, int j) {{ size_t i = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x; if (i < n) {{ dst[i] = random_impl((int)i, j); }} }} void random({typ} *dst, unsigned int n, int j) {{ hipLaunchKernelGGL(random_kernel, {gpu_params}, 0, 0, dst, n, j); }}'''.format(typ=typ, gpu_params=gpu_params) elif target == 'oneapi': ret += '''inline void random_kernel({typ} *dst, unsigned int n, int j, sycl::nd_item<1> item) {{ size_t i = item.get_global_id().get(0); if (i < n) {{ dst[i] = random_impl((int)i, j); }} }} void random({typ} *dst, unsigned int n, int j) {{ size_t nt = (size_t)nsimd_kernel_param({n}, {tpb}); sycl::queue q_ = nsimd::oneapi::default_queue(); q_.parallel_for(sycl::nd_range<1>(sycl::range<1>(nt), sycl::range<1>({tpb})), [=](sycl::nd_item<1> item){{ random_kernel(dst, n, j, item); }}).wait_and_throw(); }}'''.format(typ=typ, n=gpu_params[0], tpb=gpu_params[1]) return ret # ----------------------------------------------------------------------------- posix_c_source = \ '''#if !defined(_POSIX_C_SOURCE) #define _POSIX_C_SOURCE 200112L #elif _POSIX_C_SOURCE < 200112L #error "_POSIX_C_SOURCE defined by third-party but must be >= 200112L" #endif''' msvc_c4334_warning = \ '''#ifdef NSIMD_IS_MSVC // MSVC wrongly emits warning C4333 on the following pieces of code: // (i64)(1 << (rand() % 
4)) // (u64)(1 << (rand() % 4)) // so we deactive it for now #pragma warning( disable : 4334 ) #endif''' # ----------------------------------------------------------------------------- # Get filename for test def get_filename(opts, op, typ, lang, custom_name=''): tests_dir = os.path.join(opts.tests_dir, lang) common.mkdir_p(tests_dir) ext = { 'c_base': 'prec11.c', 'c_adv': 'c' } if not custom_name: filename = os.path.join(tests_dir, '{}.{}.{}'.format(op.name, typ, ext[lang] if lang in ['c_base', 'c_adv'] else 'cpp')) else: filename = os.path.join(tests_dir, '{}_{}.{}.{}'.format(op.name, custom_name, typ, ext[lang] if lang in ['c_base', 'c_adv'] else 'cpp')) if common.can_create_filename(opts, filename): return filename else: return None # ----------------------------------------------------------------------------- # Get standard includes def get_includes(lang): ret = '#include \n' if lang == 'cxx_adv': ret += '#include \n' if lang == 'c_adv': ret += '#include \n' if lang in ['c_base', 'c_adv']: ret += '''#include #include #include #include #include #include ''' else: ret += '''#include #include #include #include #include #include ''' return ret # ----------------------------------------------------------------------------- # Function to compute number of common bits between two floatting points # numbers distance_int = ''' int distance({typ} a, {typ} b) {{ {typ} d = (a > b ? a - b : b - a); return (int)((u64)d > (u64)INT_MAX) ? 
(u64)INT_MAX : (u64)d); }} ''' distance_float = ''' int distance({typ} a, {typ} b) {{ if (nsimd_isnan_{typ}(a) && nsimd_isnan_{typ}(b)) {{ return 0; }} if (nsimd_isnan_{typ}(a) || nsimd_isnan_{typ}(b)) {{ return -1; }} if (nsimd_isinf_{typ}(a) && nsimd_isinf_{typ}(b)) {{ return 0; }} if (nsimd_isinf_{typ}(a) || nsimd_isinf_{typ}(b)) {{ return -1; }} return nsimd_ufp_{typ}(a, b); }} /* ------------------------------------------------------------------------- */ ''' distance = { 'i8': distance_int.format(typ='i8'), 'u8': distance_int.format(typ='u8'), 'i16': distance_int.format(typ='i16'), 'u16': distance_int.format(typ='u16'), 'i32': distance_int.format(typ='i32'), 'u32': distance_int.format(typ='u32'), 'i64': distance_int.format(typ='i64'), 'u64': distance_int.format(typ='u64'), 'f16': distance_float.format(typ='f16'), 'f32': distance_float.format(typ='f32'), 'f64': distance_float.format(typ='f64') } # ----------------------------------------------------------------------------- # Template for a lot of tests template = \ '''{includes} #define SIZE (2048 / {sizeof}) #define STATUS "test of {op_name} over {typ}" #define CHECK(a) {{ \\ errno = 0; \\ if (!(a)) {{ \\ fprintf(stderr, "ERROR: " #a ":%d: %s\\n", \\ __LINE__, strerror(errno)); \\ fflush(stderr); \\ exit(EXIT_FAILURE); \\ }} \\ }} /* ------------------------------------------------------------------------- */ {extra_code} int comp_function({typ} ref_out, {typ} nsimd_out) {{ {comp}; }} int main(void) {{ int vi, i, step; {typ} *vout_ref, *vout_nsimd; {vin_defi} CHECK(vout_ref = ({typ}*)nsimd_aligned_alloc(SIZE * {sizeof})); CHECK(vout_nsimd = ({typ}*)nsimd_aligned_alloc(SIZE * {sizeof})); step = vlen({typ}); fprintf(stdout, STATUS "...\\n"); fflush(stdout); /* Fill input vector(s) with random values */ {vin_rand} /* We ensure that inputs are normal numbers */ for (i = 0; i < SIZE; i++) {{ {denormalize_inputs} }} /* Fill vout_ref output vector with reference values */ for (i = 0; i < SIZE; i += {cpu_step}) {{ 
/* This is a call directly to the cpu API of nsimd to ensure that we call the scalar version of the function */ {vout_ref_comp} }} /* Fill vout_nsimd output vector with computed values */ for (i = 0; i < SIZE; i += step) {{ {vout_nsimd_comp} }} {dnz_flush_to_zero} /* Compare results */ for (vi = 0; vi < SIZE; vi += step) {{ for (i = vi; i < vi + step; i++) {{ if (comp_function(vout_ref[i], vout_nsimd[i])) {{ fprintf(stdout, STATUS "... FAIL\\n"); fflush(stdout); return -1; }} }} }} fprintf(stdout, STATUS "... OK\\n"); fflush(stdout); return 0; }}''' # ----------------------------------------------------------------------------- # Common to most of the tests def get_content(op, typ, lang): cast = 'f32' if typ in ['f16', 'f32'] else 'f64' zero = 'nsimd_f32_to_f16(0.0f)' if typ == 'f16' else '({})0'.format(typ) # By default we use emulation functions ("cpu" architecture) for testing # in which case increment is given by nsimd_cpu_len() cpu_step = 'nsimd_len_cpu_{}()'.format(typ) nargs = range(1, len(op.params)) if typ in common.ftypes: code = ['''if (!nsimd_isnormal_{typ}(vin{i}[i])) {{ vin{i}[i] = {zero}; }}'''.format(typ=typ, i=i, zero=zero) for i in nargs] denormalize_inputs = '\n'.join(code) else: denormalize_inputs = '' # Depending on function parameters, generate specific input, ... if all(e == 'v' for e in op.params) or all(e == 'l' for e in op.params): logical = 'l' if op.params[0] == 'l' else '' # Make vin_defi code = ['{} *vin{};'.format(typ, i) for i in nargs] code += ['CHECK(vin{} = ({}*)nsimd_aligned_alloc(SIZE * {}));'. format(i, typ, common.sizeof(typ)) for i in nargs] vin_defi = '\n'.join(code) vin_rand = '\n'.join(['random(vin{}, SIZE, {});'.format(i, i - 1) \ for i in nargs]) # Make vout_ref_comp args = ', '.join(['va{}'.format(i) for i in nargs]) code = ['nsimd_cpu_v{}{} {}, vc;'.format(logical, typ, args)] code += ['va{} = nsimd_load{}u_cpu_{}(&vin{}[i]);'. 
format(i, logical, typ, i) for i in nargs] code += ['vc = nsimd_{}_cpu_{}({});'.format(op.name, typ, args)] code += ['nsimd_store{}u_cpu_{}(&vout_ref[i], vc);'. \ format(logical, typ)] vout_ref_comp = '\n'.join(code) # Make vout_nsimd_comp args = ', '.join(['va{}'.format(i) for i in nargs]) if lang == 'c_base': code = ['vec{}({}) {}, vc;'.format(logical, typ, args)] code += ['va{} = vload{}u(&vin{}[i], {});'. format(i, logical, i, typ) for i in nargs] code += ['vc = v{}({}, {});'.format(op.name, args, typ)] code += ['vstore{}u(&vout_nsimd[i], vc, {});'.format(logical, typ)] vout_nsimd_comp = '\n'.join(code) if lang == 'c_adv': code = ['nsimd_pack{}_{} {}, vc;'.format(logical, typ, args)] code += ['va{} = nsimd_load{}u(nsimd_pack{}_{}, &vin{}[i]);'. format(i, logical, logical, typ, i) for i in nargs] code += ['vc = nsimd_{}({});'.format(op.name, args)] code += ['nsimd_store{}u(&vout_nsimd[i], vc);'. \ format(logical, typ)] vout_nsimd_comp = '\n'.join(code) if lang == 'cxx_base': code = ['vec{}({}) {}, vc;'.format(logical, typ, args)] code += ['va{} = nsimd::load{}u(&vin{}[i], {}());'. format(i, logical, i, typ) for i in nargs] code += ['vc = nsimd::{}({}, {}());'.format(op.name, args, typ)] code += ['nsimd::store{}u(&vout_nsimd[i], vc, {}());'. \ format(logical, typ)] vout_nsimd_comp = '\n'.join(code) if lang == 'cxx_adv': code = ['nsimd::pack{}<{}> {}, vc;'.format(logical, typ, args)] code += ['''va{i} = nsimd::load{logical}u< nsimd::pack{logical}<{typ}> >( &vin{i}[i]);'''. format(i=i, logical=logical, typ=typ) for i in nargs] if op.cxx_operator: if len(op.params[1:]) == 1: code += ['vc = {}va1;'. format(op.cxx_operator)] if len(op.params[1:]) == 2: code += ['vc = va1 {} va2;'. format(op.cxx_operator)] else: code += ['vc = nsimd::{}({});'.format(op.name, args)] code += ['nsimd::store{}u(&vout_nsimd[i], vc);'. 
\ format(logical, typ)] vout_nsimd_comp = '\n'.join(code) elif op.params == ['l', 'v', 'v']: vin_defi = \ '''{typ} *vin1, *vin2; CHECK(vin1 = ({typ}*)nsimd_aligned_alloc(SIZE * {sizeof})); CHECK(vin2 = ({typ}*)nsimd_aligned_alloc(SIZE * {sizeof}));'''. \ format(typ=typ, sizeof=common.sizeof(typ)) code = ['random(vin{}, SIZE, {});'.format(i, i - 1) for i in nargs] vin_rand = '\n'.join(code) vout_ref_comp = '''nsimd_cpu_v{typ} va1, va2; nsimd_cpu_vl{typ} vc; va1 = nsimd_loadu_cpu_{typ}(&vin1[i]); va2 = nsimd_loadu_cpu_{typ}(&vin2[i]); vc = nsimd_{op_name}_cpu_{typ}(va1, va2); nsimd_storelu_cpu_{typ}(&vout_ref[i], vc);'''. \ format(typ=typ, op_name=op.name) if lang == 'c_base': vout_nsimd_comp = '''vec({typ}) va1, va2; vecl({typ}) vc; va1 = vloadu(&vin1[i], {typ}); va2 = vloadu(&vin2[i], {typ}); vc = v{op_name}(va1, va2, {typ}); vstorelu(&vout_nsimd[i], vc, {typ});'''. \ format(typ=typ, op_name=op.name) if lang == 'c_adv': vout_nsimd_comp = '''nsimd_pack_{typ} va1, va2; nsimd_packl_{typ} vc; va1 = nsimd_loadu(nsimd_pack_{typ}, &vin1[i]); va2 = nsimd_loadu(nsimd_pack_{typ}, &vin2[i]); vc = nsimd_{op_name}(va1, va2); nsimd_storelu(&vout_nsimd[i], vc);'''. \ format(typ=typ, op_name=op.name) if lang == 'cxx_base': vout_nsimd_comp = \ '''vec({typ}) va1, va2; vecl({typ}) vc; va1 = nsimd::loadu(&vin1[i], {typ}()); va2 = nsimd::loadu(&vin2[i], {typ}()); vc = nsimd::{op_name}(va1, va2, {typ}()); nsimd::storelu(&vout_nsimd[i], vc, {typ}());'''. \ format(typ=typ, op_name=op.name) if lang == 'cxx_adv': if op.cxx_operator: do_computation = 'vc = va1 {} va2;'. \ format(op.cxx_operator) else: do_computation = 'vc = nsimd::{}(va1, va2, {}());'. \ format(op.name, typ) vout_nsimd_comp = \ '''nsimd::pack<{typ}> va1, va2; nsimd::packl<{typ}> vc; va1 = nsimd::loadu >(&vin1[i]); va2 = nsimd::loadu >(&vin2[i]); {do_computation} nsimd::storelu(&vout_nsimd[i], vc);'''. 
\ format(typ=typ, op_name=op.name, do_computation=do_computation) elif op.params == ['v', 'v', 'p']: vin_defi = \ '''{typ} *vin1; CHECK(vin1 = ({typ}*)nsimd_aligned_alloc(SIZE * {sizeof}));'''. \ format(typ=typ, sizeof=common.sizeof(typ)) vin_rand = 'random(vin1, SIZE, 0);' vout_ref_comp = \ '''nsimd_cpu_v{typ} va1, vc; va1 = nsimd_loadu_cpu_{typ}(&vin1[i]); vc = nsimd_{op_name}_cpu_{typ}(va1, (i / step) % {typnbytes}); nsimd_storeu_cpu_{typ}(&vout_ref[i], vc);'''. \ format(typ=typ, op_name=op.name, typnbytes=typ[1:]) if lang == 'c_base': vout_nsimd_comp = \ '''vec({typ}) va1, vc; va1 = vloadu(&vin1[i], {typ}); vc = v{op_name}(va1, (i / step) % {typnbytes}, {typ}); vstoreu(&vout_nsimd[i], vc, {typ});'''. \ format(typ=typ, op_name=op.name, typnbytes=typ[1:]) if lang == 'c_adv': vout_nsimd_comp = \ '''nsimd_pack_{typ} va1, vc; va1 = nsimd_loadu(nsimd_pack_{typ}, &vin1[i]); vc = nsimd_{op_name}(va1, (i / step) % {typnbytes}); nsimd_storeu(&vout_nsimd[i], vc);'''. \ format(typ=typ, op_name=op.name, typnbytes=typ[1:]) if lang == 'cxx_base': vout_nsimd_comp = \ '''vec({typ}) va1, vc; va1 = nsimd::loadu(&vin1[i], {typ}()); vc = nsimd::{op_name}(va1, (i / step) % {typnbytes}, {typ}()); nsimd::storeu(&vout_nsimd[i], vc, {typ}());'''. \ format(typ=typ, op_name=op.name, typnbytes=typ[1:]) if lang == 'cxx_adv': if op.cxx_operator: do_computation = 'vc = va1 {} ((i / step) % {typnbytes});'. \ format(op.cxx_operator, typnbytes=typ[1:]) else: do_computation = \ 'vc = nsimd::{}(va1, (i / step) % {typnbytes});'. \ format(op.name, typnbytes=typ[1:]) vout_nsimd_comp = \ '''nsimd::pack<{typ}> va1, vc; va1 = nsimd::loadu >(&vin1[i]); {do_computation} nsimd::storeu(&vout_nsimd[i], vc);'''. \ format(typ=typ, do_computation=do_computation) else: raise ValueError('No test available for operator "{}" on type "{}"'. 
format(op.name, typ)) return { 'vin_defi': vin_defi, 'vin_rand': vin_rand, 'cpu_step': cpu_step, 'vout_ref_comp': vout_ref_comp, 'vout_nsimd_comp': vout_nsimd_comp, 'denormalize_inputs': denormalize_inputs } # ----------------------------------------------------------------------------- # Generate test in C, C++ (base API) and C++ (advanced API) for almost all # tests def gen_test(opts, op, typ, lang): filename = get_filename(opts, op, typ, lang) if filename == None: return content = get_content(op, typ, lang) extra_code = cbprng(typ, op, 'cpu') if op.name in ['notb', 'andb', 'orb', 'xorb', 'andnotb']: comp = 'return nsimd_scalar_reinterpret_{uT}_{typ}(ref_out) != ' \ 'nsimd_scalar_reinterpret_{uT}_{typ}(nsimd_out)'. \ format(typ=typ, uT=common.bitfield_type[typ]) elif op.name in ['max', 'min'] and typ in common.ftypes: comp = 'return nsimd_scalar_ne_{}(ref_out, nsimd_out);'.format(typ) else: if typ in common.ftypes: comp = 'return distance(ref_out, nsimd_out) < {}'. \ format(op.ufp[typ]) extra_code += distance[typ] else: comp = 'return nsimd_scalar_ne_{}(ref_out, nsimd_out);'. \ format(typ) includes = get_includes(lang) if typ in common.ftypes: dnz_flush_to_zero = \ '''/* We flush subnormal numbers to zero because support for it */ /* can be disabled, some intrinsics do not support them, */ /* execution of 32-bits code on 64-bits system may have different */ /* ways of handling them. 
*/ for (i = 0; i < SIZE; i++) {{ if (!nsimd_isnormal_{typ}(vout_ref[i])) {{ vout_ref[i] = {zero}; }} if (!nsimd_isnormal_{typ}(vout_nsimd[i])) {{ vout_nsimd[i] = {zero}; }} }}'''.format(typ=typ, zero='({})0'.format(typ) if typ != 'f16' \ else 'nsimd_f32_to_f16(0.0f)') else: dnz_flush_to_zero = '' with common.open_utf8(opts, filename) as out: out.write(template.format( includes=includes, sizeof=common.sizeof(typ), typ=typ, op_name=op.name, year=date.today().year, comp=comp, dnz_flush_to_zero=dnz_flush_to_zero, extra_code=extra_code, **content)) common.clang_format(opts, filename) # ----------------------------------------------------------------------------- # Tests for addv def gen_addv(opts, op, typ, lang): filename = get_filename(opts, op, typ, lang) if filename == None: return if typ == 'f16': rand = 'nsimd_f32_to_f16((f32)(rand() % 3) - 1.0f)' zero = 'nsimd_f32_to_f16(0.0f)' comp = 'nsimd_f16_to_f32(vout[i]) != nsimd_f16_to_f32(vref[i])' else: rand = '({})((int)(rand() % 3) - 1)'.format(typ) zero = '({})0'.format(typ) comp = 'vout[i] != vref[i]' if lang == 'c_base': nsimd = 'vaddv(vloada(vin + (i * step), {typ}), {typ})'. \ format(typ=typ) elif lang == 'c_adv': nsimd = 'nsimd_addv(nsimd_loada(nsimd_pack_{}, vin + (i * step)))'. \ format(typ) elif lang == 'cxx_base': nsimd = 'nsimd::addv(nsimd::loada(vin + (i * step), {}()), {}())'. 
\ format(typ, typ) elif lang == 'cxx_adv': nsimd = 'nsimd::addv(nsimd::loada >' \ '(vin + (i * step)))'.format(typ) with common.open_utf8(opts, filename) as out: out.write( '''{posix_c_source} {includes} #define CHECK(a) {{ \\ errno = 0; \\ if (!(a)) {{ \\ fprintf(stderr, "ERROR: " #a ":%d: %s\\n", \\ __LINE__, strerror(errno)); \\ fflush(stderr); \\ exit(EXIT_FAILURE); \\ }} \\ }} #define STATUS "test of addv over {typ}" int main() {{ int step = vlen({typ}); int size = 2048; int i; {typ} *vin, *vref, *vout; CHECK(vin = ({typ} *)nsimd_aligned_alloc(size * {sizeof} * step)); CHECK(vref = ({typ} *)nsimd_aligned_alloc(size * {sizeof})); CHECK(vout = ({typ} *)nsimd_aligned_alloc(size * {sizeof})); fprintf(stdout, STATUS "...\\n"); fflush(stdout); for (i = 0; i < step * size; i++) {{ vin[i] = {rand}; }} for (i = 0; i < size; i++) {{ int j; {typ} acc = {zero}; for (j = step * i; j < step * i + step; j++) {{ acc = nsimd_scalar_add_{typ}(acc, vin[j]); }} vref[i] = acc; }} for (i = 0; i < size; i++) {{ vout[i] = {nsimd}; }} for (i = 0; i < size; i++) {{ if ({comp}) {{ fprintf(stdout, STATUS "... FAIL\\n"); fflush(stdout); return -1; }} }} fprintf(stdout, STATUS "... 
OK\\n"); fflush(stdout); return 0; }} '''.format(typ=typ, sizeof=common.sizeof(typ), zero=zero, rand=rand, comp=comp, nsimd=nsimd, posix_c_source=posix_c_source, includes=get_includes(lang))) common.clang_format(opts, filename) # ----------------------------------------------------------------------------- # General tests helpers for adds/subs def aligned_alloc_error(): return ''' #define CHECK(a) \\ {{ \\ errno = 0; \\ if (!(a)) \\ {{ \\ fprintf(stderr, \"ERROR: \" #a \":%d: %s\\n\", \\ __LINE__, strerror(errno)); \\ fflush(stderr); \\ exit(EXIT_FAILURE); \\ }} \\ }} ''' def equal(typ): return ''' int equal({typ} expected_result, {typ} computed_result) {{ return expected_result == computed_result; }} '''.format(typ=typ) def adds_subs_check_case(): return ''' #define CHECK_CASE(test_output, which_test) \\ {{ \\ if(0 == (test_output)) \\ {{ \\ fprintf(stdout, STATUS \" ... \" which_test \" check FAIL\\n\"); \\ fflush(stdout); \\ return -1; \\ }} \\ }} ''' def random_sign_flip(): return ''' int random_sign_flip(void) {{ return 2 * (rand() % 2) - 1; }} ''' def zero_out_arrays(typ): return ''' void zero_out_arrays({typ} vin1[], {typ} vin2[], {typ} vout_expected[], {typ} vout_computed[]) {{ int ii = 0; for(ii = 0; ii < SIZE; ++ii) {{ vin1[ii] = ({typ})0; vin2[ii] = ({typ})0; vout_expected[ii] = ({typ})0; vout_computed[ii] = ({typ})0; }} }} '''.format(typ=typ) def compute_op_given_language(typ, op, language): if 'c_base' == language: return \ '''vec({typ}) va1, va2, vc; va1 = vloadu(&vin1[outer], {typ}); va2 = vloadu(&vin2[outer], {typ}); vc = v{op}(va1, va2, {typ}); vstoreu(&vout_computed[outer], vc, {typ});'''. \ format(typ=typ, op=op) elif 'c_adv' == language: return \ '''nsimd_pack_{typ} va1, va2, vc; va1 = nsimd_loadu(nsimd_pack_{typ}, &vin1[outer]); va2 = nsimd_loadu(nsimd_pack_{typ}, &vin2[outer]); vc = nsimd_{op}(va1, va2); nsimd_storeu(&vout_computed[outer], vc);'''. 
\ format(typ=typ, op=op) elif 'cxx_base' == language: return \ '''vec({typ}) va1, va2, vc; va1 = nsimd::loadu(&vin1[outer], {typ}()); va2 = nsimd::loadu(&vin2[outer], {typ}()); vc = nsimd::{op}(va1, va2, {typ}()); nsimd::storeu(&vout_computed[outer], vc, {typ}());'''. \ format(typ=typ, op=op) else: return \ '''nsimd::pack<{typ}> va1, va2, vc; va1 = nsimd::loadu >(&vin1[outer]); va2 = nsimd::loadu >(&vin2[outer]); vc = nsimd::{op}(va1, va2); nsimd::storeu(&vout_computed[outer], vc);'''. \ format(typ=typ, op=op) def compare_expected_vs_computed(typ, op, language): values_computation = compute_op_given_language(typ, op, language) return ''' int compare_expected_vs_computed(const {typ}* vin1, const {typ}* vin2, const {typ}* vout_expected, {typ} vout_computed[]) {{ const int step = vlen({typ}); int outer = 0; int inner = 0; for (outer = 0; outer < SIZE; outer += step) {{ /* Fill vout_computed with computed values */ {values_computation} /* Compare results */ for (inner = outer; inner < outer + step; ++inner) {{ if (! 
equal(vout_expected[inner], vout_computed[inner])) {{ return 0; }} }} }} return 1; }} '''.format(typ=typ, values_computation=values_computation) def test_signed_neither_overflow_nor_underflow(typ, min_, max_, operator, check): return ''' int test_neither_overflow_nor_underflow({typ} vin1[], {typ} vin2[], {typ} vout_expected[], {typ} vout_computed[]) {{ int ii = 0; while(ii < SIZE) {{ {typ} a = ({typ})((random_sign_flip() * rand()) % {max_} % {min_}); {typ} b = ({typ})((random_sign_flip() * rand()) % {max_} % {min_}); if({check}(a, b)) {{ vin1[ii] = a; vin2[ii] = b; vout_expected[ii] = ({typ})(a {operator} b); ++ ii; }} }} assert(ii == SIZE); /* Test: if (neither overflow nor underflow) {{ vout_expected[ii] == a {operator} b; }} */ return compare_expected_vs_computed(vin1, vin2, vout_expected, vout_computed); }} '''.format(typ=typ, min_=min_, max_=max_, operator=operator, check=check) def test_signed_all_cases(typ, min_, max_, oper, oper_is_overflow, oper_is_underflow): return ''' int test_all_cases({typ} vin1[], {typ} vin2[], {typ} vout_expected[], {typ} vout_computed[]) {{ int ii = 0; for(ii = 0; ii < SIZE; ++ii) {{ vin1[ii] = ({typ})((random_sign_flip() * rand()) % {max_} % {min_}); vin2[ii] = ({typ})((random_sign_flip() * rand()) % {max_} % {min_}); if({oper_is_overflow}(vin1[ii], vin2[ii])) {{ vout_expected[ii] = {max_}; }} else if({oper_is_underflow}(vin1[ii], vin2[ii])) {{ vout_expected[ii] = {min_}; }} else {{ vout_expected[ii] = ({typ})(vin1[ii] {oper} vin2[ii]); }} }} /* Test all cases */ return compare_expected_vs_computed(vin1, vin2, vout_expected, vout_computed); }} ''' .format(typ=typ, min_=min_, max_=max_, oper=oper, oper_is_overflow=oper_is_overflow, oper_is_underflow=oper_is_underflow) # ----------------------------------------------------------------------------- # Tests helpers for adds - is overflow/underflow/neither overflow nor underflow def adds_is_overflow(typ, max_): return ''' int adds_is_overflow(const {typ} a, const {typ} b) {{ return (a 
> 0) && (b > {max_} - a); }} '''.format(typ=typ, max_=max_) def adds_signed_is_underflow(typ, min_): return ''' int adds_signed_is_underflow(const {typ} a, const {typ} b) {{ return (a < 0) && (b < {min_} - a); }} '''.format(typ=typ, min_=min_) def adds_signed_is_neither_overflow_nor_underflow(typ): return ''' int adds_signed_is_neither_overflow_nor_underflow(const {typ} a, const {typ} b) {{ return ! adds_is_overflow(a, b) && ! adds_signed_is_underflow(a, b); }} '''.format(typ=typ) # ----------------------------------------------------------------------------- # Tests helpers for adds with integer types # test integer overflow def test_adds_overflow(typ, max_): rand_ = '({typ})rand()'.format(typ=typ) \ if typ in common.utypes else 'rand()' return ''' int test_overflow({typ} vin1[], {typ} vin2[], {typ} vout_expected[], {typ} vout_computed[]) {{ /* if ((vin1[ii] > 0) && (vin2[ii] > {max_} - vin1[ii])) {{ overflow }} */ int ii = 0; /* vin1[ii] > 0 */ for(ii = 0; ii < SIZE; ++ii) {{ {typ} rand_val = ({typ})({rand_} % {max_}); vin1[ii] = (rand_val == 0 ? 1 : rand_val); }} /* vin2[ii] > {max_} - vin1[ii] vin2[ii] = {max_} - vin1[ii] + rand_val s.t.: 0 < rand_val <= vin1[ii] */ for(ii = 0; ii < SIZE; ++ii) {{ {typ} rand_val = ({typ})({rand_} % (vin1[ii] + 1)); rand_val = (rand_val == 0 ? 
1 : rand_val); vin2[ii] = ({typ})({max_} - vin1[ii] + rand_val); vout_expected[ii] = {max_}; }} /* Test: if ((vin1[ii] > 0) && (vin2[ii] > {max_} - vin1[ii])) {{ vout_expected[ii] == {max_}; }} */ return compare_expected_vs_computed(vin1, vin2, vout_expected, vout_computed); }} '''.format(typ=typ, max_=max_, rand_=rand_) # ----------------------------------------------------------------------------- # Tests helpers for adds with signed integer types # test signed underflow def test_adds_signed_underflow(typ, min_): return ''' int test_underflow({typ} vin1[], {typ} vin2[], {typ} vout_expected[], {typ} vout_computed[]) {{ /* if ((vin1[ii] < 0) && (vin2[ii] < {min_} - vin1[ii])) {{ underflow }} */ int ii = 0; /* vin1[ii] < 0 */ for(ii = 0; ii < SIZE; ++ii) {{ {typ} rand_val = ({typ})((- rand()) % {min_}); vin1[ii] = (rand_val == 0 ? - 1 : rand_val); }} /* vin1[ii] < 0 vin2[ii] < {min_} - vin1[ii] vin2[ii] = {min_} - vin1[ii] - rand_val s.t.: 0 < rand_val < - vin1[ii] */ for(ii = 0; ii < SIZE; ++ii) {{ {typ} rand_val = ({typ})((rand()) % (- vin1[ii])); rand_val = (rand_val == 0 ? 
1 : rand_val); vin2[ii] = ({typ})({min_} - vin1[ii] - rand_val); vout_expected[ii] = {min_}; }} /* Test: if ((vin1[ii] < 0) && (vin2[ii] < {min_} - vin1[ii])) {{ vout_expected[ii] == {min_}; }} */ return compare_expected_vs_computed(vin1, vin2, vout_expected, vout_computed); }} '''.format(typ=typ, min_=min_) # test signed neither overflow nor underflow def test_adds_signed_neither_overflow_nor_underflow(typ, min_, max_): return \ test_signed_neither_overflow_nor_underflow(typ, min_, max_, '+', 'adds_signed_is_neither_overflow_nor_underflow') # test signed all cases def test_adds_signed_all_cases(typ, min_, max_): return test_signed_all_cases(typ, min_, max_, '+', 'adds_is_overflow', 'adds_signed_is_underflow') # all signed tests def tests_adds_signed(): return''' zero_out_arrays(vin1, vin2, vout_expected, vout_computed); CHECK_CASE(test_overflow(vin1, vin2, vout_expected, vout_computed), "overflow"); zero_out_arrays(vin1, vin2, vout_expected, vout_computed); CHECK_CASE(test_underflow(vin1, vin2, vout_expected, vout_computed), "underflow"); zero_out_arrays(vin1, vin2, vout_expected, vout_computed); CHECK_CASE(test_neither_overflow_nor_underflow(vin1, vin2, vout_expected, vout_computed), "neither underflow nor overflow"); zero_out_arrays(vin1, vin2, vout_expected, vout_computed); CHECK_CASE(test_all_cases(vin1, vin2, vout_expected, vout_computed), "all cases"); ''' # ----------------------------------------------------------------------------- # Tests helper for adds with unsigned types # test signed neither overflow nor underflow def test_adds_unsigned_no_overflow(typ, max_): return ''' int test_no_overflow({typ} vin1[], {typ} vin2[], {typ} vout_expected[], {typ} vout_computed[]) {{ int ii = 0; while(ii < SIZE) {{ {typ} a = ({typ})(({typ})rand() % {max_}); {typ} b = ({typ})(({typ})rand() % {max_}); if(! 
adds_is_overflow(a, b)) {{ vin1[ii] = a; vin2[ii] = b; vout_expected[ii] = ({typ})(a + b); ++ ii; }} }} assert(ii == SIZE); /* Test: if (not adds is overflow) {{ vout_expected[ii] == a + b; }} */ return compare_expected_vs_computed(vin1, vin2, vout_expected, vout_computed); }} '''.format(typ=typ, max_=max_) # test unsigned all cases def test_adds_unsigned_all_cases(typ, max_): return ''' int test_all_cases({typ} vin1[], {typ} vin2[], {typ} vout_expected[], {typ} vout_computed[]) {{ int ii = 0; for(ii = 0; ii < SIZE; ++ii) {{ vin1[ii] = ({typ})(({typ})rand() % {max_}); vin2[ii] = ({typ})(({typ})rand() % {max_}); if(adds_is_overflow(vin1[ii], vin2[ii])) {{ vout_expected[ii] = {max_}; }} else {{ vout_expected[ii] = ({typ})(vin1[ii] + vin2[ii]); }} }} /* Test all cases: */ return compare_expected_vs_computed(vin1, vin2, vout_expected, vout_computed); }} '''.format(typ=typ, max_=max_) # all unsigned tests def tests_adds_unsigned(): return''' zero_out_arrays(vin1, vin2, vout_expected, vout_computed); CHECK_CASE(test_overflow(vin1, vin2, vout_expected, vout_computed), "overflow"); zero_out_arrays(vin1, vin2, vout_expected, vout_computed); CHECK_CASE(test_no_overflow(vin1, vin2, vout_expected, vout_computed), "no overflow"); zero_out_arrays(vin1, vin2, vout_expected, vout_computed); CHECK_CASE(test_all_cases(vin1, vin2, vout_expected, vout_computed), "all cases"); ''' # ------------------------------------------------------------------------------ # Get adds tests given type def get_adds_tests_cases_for_signed_types(typ, min_, max_): helpers = ''' {test_adds_overflow} {test_adds_signed_underflow} {adds_is_overflow} {adds_signed_is_underflow} {adds_signed_is_neither_overflow_nor_underflow} {test_adds_signed_neither_overflow_nor_underflow} {test_adds_signed_all_cases} ''' .format(test_adds_overflow=test_adds_overflow(typ, max_), test_adds_signed_underflow=test_adds_signed_underflow( typ, min_), adds_is_overflow=adds_is_overflow(typ, max_), 
adds_signed_is_underflow=adds_signed_is_underflow( typ, min_), adds_signed_is_neither_overflow_nor_underflow=adds_signed_is_neither_overflow_nor_underflow( typ), test_adds_signed_neither_overflow_nor_underflow=test_adds_signed_neither_overflow_nor_underflow( typ, min_=min_, max_=max_), test_adds_signed_all_cases=test_adds_signed_all_cases( typ, min_=min_, max_=max_) ) return {'helpers': helpers, 'tests': tests_adds_signed()} def get_adds_tests_cases_for_unsigned_types(typ, max_): helpers = ''' {test_adds_overflow} {adds_is_overflow} {test_adds_unsigned_no_overflow} {test_adds_unsigned_all_cases} ''' .format(test_adds_overflow=test_adds_overflow(typ, max_), adds_is_overflow=adds_is_overflow(typ, max_), test_adds_unsigned_no_overflow=test_adds_unsigned_no_overflow( typ, max_), test_adds_unsigned_all_cases=test_adds_unsigned_all_cases(typ, max_) ) return {'helpers': helpers, 'tests': tests_adds_unsigned()} def get_adds_tests_cases_given_type(typ): if typ in common.iutypes: type_limits = common.limits[typ] min_ = type_limits['min'] max_ = type_limits['max'] if typ in common.itypes: return get_adds_tests_cases_for_signed_types(typ=typ, min_=min_, max_=max_) if typ in common.utypes: return get_adds_tests_cases_for_unsigned_types(typ=typ, max_=max_) else: msg = '{typ} not implemented'.format(typ=typ) raise TypeError(msg) # ----------------------------------------------------------------------------- # gen_adds def gen_adds(opts, op, typ, lang): # Do not test for floats since adds(floats) == add(floats) if typ in common.ftypes: return filename = get_filename(opts, op, typ, lang) if filename == None: return sizeof = common.sizeof(typ) head = ''' {includes} #include #define SIZE (2048 / {sizeof}) #define STATUS "test of {op_name} over {typ}" {aligned_alloc_error} {adds_subs_check_case} ''' .format(includes=get_includes(lang), op_name=op.name, typ=typ, sizeof=sizeof, aligned_alloc_error=aligned_alloc_error(), adds_subs_check_case=adds_subs_check_case()) with 
common.open_utf8(opts, filename) as out: out.write( ''' \ {head} /* ------------------------------------------------------------------------- */ {random_sign_flip} {zero_out_arrays} {equal} {compare_expected_vs_computed} {tests_helpers} int main(void) {{ const int mem_aligned_size = SIZE * {sizeof}; {typ} *vin1; {typ} *vin2; {typ} *vout_expected; {typ} *vout_computed; CHECK(vin1 = ({typ} *)nsimd_aligned_alloc(mem_aligned_size)); CHECK(vin2 = ({typ} *)nsimd_aligned_alloc(mem_aligned_size)); CHECK(vout_expected = ({typ} *)nsimd_aligned_alloc(mem_aligned_size)); CHECK(vout_computed = ({typ} *)nsimd_aligned_alloc(mem_aligned_size)); {tests} fprintf(stdout, STATUS "... OK\\n"); fflush(stdout); return EXIT_SUCCESS; }} ''' .format(head=head, compare_expected_vs_computed=\ compare_expected_vs_computed(typ, op.name, lang), random_sign_flip='' if typ in common.utypes \ else random_sign_flip(), zero_out_arrays=zero_out_arrays(typ), equal=equal(typ), tests_helpers=\ get_adds_tests_cases_given_type(typ)['helpers'], tests=get_adds_tests_cases_given_type(typ)['tests'], op_name = op.name, typ=typ, sizeof = sizeof) ) common.clang_format(opts, filename) # ----------------------------------------------------------------------------- # Tests helpers for subs - is overflow/underflow/neither overflow nor underflow # subs signed def subs_signed_is_overflow(typ, max_): return ''' int subs_signed_is_overflow(const {typ} a, const {typ} b) {{ return (b < 0) && (a > {max_} + b); }} '''.format(typ=typ, max_=max_) def subs_signed_is_underflow(typ, min_): return ''' int subs_signed_is_underflow(const {typ} a, const {typ} b) {{ return (b > 0) && (a < {min_} + b); }} '''.format(typ=typ, min_=min_) def subs_signed_is_neither_overflow_nor_underflow(typ): return ''' int subs_signed_is_neither_overflow_nor_underflow(const {typ} a, const {typ} b) {{ return !subs_signed_is_overflow(a, b) && !subs_signed_is_underflow(a, b); }} '''.format(typ=typ) # subs unsigned def subs_unsigned_is_underflow(typ): 
return ''' int subs_unsigned_is_underflow(const {typ} a, const {typ} b) {{ return a < b; }} '''.format(typ=typ) # ----------------------------------------------------------------------------- # Tests helpers for subs with signed types # test signed integer overflow def test_subs_signed_overflow(typ, min_, max_): return ''' int test_overflow({typ} vin1[], {typ} vin2[], {typ} vout_expected[], {typ} vout_computed[]) {{ /* if ((vin2[ii] < 0) && (vin1[ii] > {max_} + vin2[ii])) {{ overflow }} */ int ii = 0; /* vin2[ii] < 0 */ for(ii = 0; ii < SIZE; ++ii) {{ {typ} rand_val = ({typ})((- rand()) % {min_}); vin2[ii] = (rand_val == 0 ? - 1 : rand_val); }} /* vin1[ii] - vin2[ii] > {max_} vin1[ii] > {max_} + vin2[ii] vin1[ii] = {max_} + vin2[ii] + rand_val s.t.: 0 < rand_val <= - vin2[ii] (- TYPE_MIN) overflows if vin2[ii] == -1 --> rand() % -(vin2[ii] + 1) --> rand() % 0 Therefore check if vin2[ii] == -1 --> if True --> set rand_val == 1 */ for(ii = 0; ii < SIZE; ++ii) {{ {typ} rand_val = 0; if(-1 == vin2[ii]){{ rand_val = 1; }} else{{ rand_val = ({typ})(rand() % -(vin2[ii] + 1)); rand_val = (rand_val == 0 ? 1 : rand_val); }} vin1[ii] = ({typ})({max_} + vin2[ii] + rand_val); vout_expected[ii] = {max_}; }} /* Test: if ((vin2[ii] < 0) && (vin1[ii] > {max_} + vin2[ii])) {{ vout_expected[ii] == {max_}; }} */ return compare_expected_vs_computed(vin1, vin2, vout_expected, vout_computed); }} '''.format(typ=typ, min_=min_, max_=max_) # test signed underflow def test_subs_signed_underflow(typ, min_, max_): return ''' int test_underflow({typ} vin1[], {typ} vin2[], {typ} vout_expected[], {typ} vout_computed[]) {{ /* if ((vin2[ii] > 0) && (vin1[ii] < {min_} + vin2[ii])) {{ underflow }} */ int ii = 0; /* vin2[ii] > 0 */ for(ii = 0; ii < SIZE; ++ii) {{ {typ} rand_val = ({typ})(rand() % {max_}); vin2[ii] = (rand_val == 0 ? 
1 : rand_val); }} /* vin1[ii] < {min_} + vin2[ii] vin1[ii] = {min_} + vin2[ii] - rand_val s.t.: 0 < rand_val < vin2[ii] */ for(ii = 0; ii < SIZE; ++ii) {{ {typ} rand_val = ({typ})(rand() % vin2[ii]); rand_val = (rand_val == 0 ? 1 : rand_val); vin1[ii] = ({typ})({min_} + vin2[ii] - rand_val); vout_expected[ii] = {min_}; }} /* Test: if ((vin2[ii] > 0) && (vin1[ii] < {min_} + vin2[ii])) {{ vout_expected[ii] == {min_}; }} */ return compare_expected_vs_computed(vin1, vin2, vout_expected, vout_computed); }} '''.format(typ=typ, min_=min_, max_=max_) # test signed neither overflow nor underflow def test_subs_signed_neither_overflow_nor_underflow(typ, min_, max_): return \ test_signed_neither_overflow_nor_underflow(typ, min_, max_, '-', 'subs_signed_is_neither_overflow_nor_underflow') # test signed all cases def test_subs_signed_all_cases(typ, min_, max_): return test_signed_all_cases(typ, min_, max_, '-', 'subs_signed_is_overflow', 'subs_signed_is_underflow') # all signed tests def tests_subs_signed(): return ''' zero_out_arrays(vin1, vin2, vout_expected, vout_computed); CHECK_CASE(test_overflow(vin1, vin2, vout_expected, vout_computed), "overflow"); zero_out_arrays(vin1, vin2, vout_expected, vout_computed); CHECK_CASE(test_underflow(vin1, vin2, vout_expected, vout_computed), "underflow"); zero_out_arrays(vin1, vin2, vout_expected, vout_computed); CHECK_CASE(test_neither_overflow_nor_underflow(vin1, vin2, vout_expected, vout_computed), "neither underflow nor overflow"); zero_out_arrays(vin1, vin2, vout_expected, vout_computed); CHECK_CASE(test_all_cases(vin1, vin2, vout_expected, vout_computed), "all cases"); ''' # ----------------------------------------------------------------------------- # Tests helpers for subs with unsigned types # test unsigned underflow def test_subs_unsigned_underflow(typ, min_, max_): return ''' int test_underflow({typ} vin1[], {typ} vin2[], {typ} vout_expected[], {typ} vout_computed[]) {{ /* if (vin1[ii] < vin2[ii]) {{ underflow }} */ int ii = 
0; /* vin1[ii] */ for(ii = 0; ii < SIZE; ++ii) {{ vin1[ii] = ({typ})(({typ})rand() % {max_}); }} /* vin1[ii] < vin2[ii] vin2[ii] = vin1[ii] + rand_val s.t.: 0 < rand_val < {max_} - vin1[ii] */ for(ii = 0; ii < SIZE; ++ii) {{ {typ} rand_val = ({typ})(({typ})rand() % ({max_} - vin1[ii])); rand_val = (rand_val == 0 ? 1 : rand_val); vin2[ii] = ({typ})(vin1[ii] + rand_val); vout_expected[ii] = ({typ}){min_}; }} /* Test: if (vin1[ii] < vin2[ii]) {{ vout_expected[ii] == {min_}; }} */ return compare_expected_vs_computed(vin1, vin2, vout_expected, vout_computed); }} '''.format(typ=typ, min_=min_, max_=max_) # test unsigned no underflow def test_subs_unsigned_no_underflow(typ, max_): return ''' int test_no_underflow({typ} vin1[], {typ} vin2[], {typ} vout_expected[], {typ} vout_computed[]) {{ /* if (vin1[ii] >= vin2[ii]) {{ no underflow }} */ int ii = 0; /* vin1[ii] */ for(ii = 0; ii < SIZE; ++ii) {{ vin1[ii] = ({typ})(({typ})rand() % {max_}); }} /* vin1[ii] >= vin2[ii] vin2 = vin1 - rand_val s.t. 
0 <= rand_val <= vin1 */ for(ii = 0; ii < SIZE; ++ii) {{ {typ} rand_val = ({typ})(({typ})rand() % (vin1[ii] + 1)); vin2[ii] = ({typ})(vin1[ii] - rand_val); vout_expected[ii] = ({typ})(vin1[ii] - vin2[ii]); }} /* Test: if (vin1[ii] >= vin2[ii]) {{ vout_expected[ii] == vin1[ii] - vin2[ii]; }} */ return compare_expected_vs_computed(vin1, vin2, vout_expected, vout_computed); }} '''.format(typ=typ, max_=max_)

# test unsigned all cases
def test_subs_unsigned_all_cases(typ, min_, max_):
    # Returns the C source of a helper that exercises saturated subtraction
    # (subs) on random unsigned inputs of type `typ`; whenever the reference
    # computation would underflow the expected value saturates to `min_`.
    return ''' int test_all_cases({typ} vin1[], {typ} vin2[], {typ} vout_expected[], {typ} vout_computed[]) {{ int ii = 0; for(ii = 0; ii < SIZE; ++ii) {{ vin1[ii] = ({typ})(({typ})rand() % {max_}); vin2[ii] = ({typ})(({typ})rand() % {max_}); if(subs_unsigned_is_underflow(vin1[ii], vin2[ii])) {{ vout_expected[ii] = ({typ}){min_}; }} else {{ vout_expected[ii] = ({typ})(vin1[ii] - vin2[ii]); }} }} /* Test all cases: */ return compare_expected_vs_computed(vin1, vin2, vout_expected, vout_computed); }} '''.format(typ=typ, min_=min_, max_=max_)

# all unsigned tests
def tests_subs_unsigned():
    # C statements run from main(): each case zeroes the work buffers and
    # CHECK_CASEs one of the helpers emitted above.
    return ''' zero_out_arrays(vin1, vin2, vout_expected, vout_computed); CHECK_CASE(test_underflow(vin1, vin2, vout_expected, vout_computed), "underflow"); zero_out_arrays(vin1, vin2, vout_expected, vout_computed); CHECK_CASE(test_no_underflow(vin1, vin2, vout_expected, vout_computed), "no underflow"); zero_out_arrays(vin1, vin2, vout_expected, vout_computed); CHECK_CASE(test_all_cases(vin1, vin2, vout_expected, vout_computed), "all cases"); '''

# ------------------------------------------------------------------------------
# Get subs tests given type

def get_subs_tests_cases_for_signed_types(typ, min_, max_):
    # Bundle all signed-subs C helpers plus the main()-body test statements.
    # Returns a dict with keys 'helpers' (C function definitions) and
    # 'tests' (C statements calling them).
    helpers = ''' {test_subs_signed_overflow} {test_subs_signed_underflow} {subs_signed_is_overflow} {subs_signed_is_underflow} {subs_signed_is_neither_overflow_nor_underflow} {test_subs_signed_neither_overflow_nor_underflow} {test_subs_signed_all_cases} '''.format(
        test_subs_signed_overflow=\
            test_subs_signed_overflow(typ, min_, max_),
        test_subs_signed_underflow=\
            test_subs_signed_underflow(typ, min_, max_),
        subs_signed_is_overflow=\
            subs_signed_is_overflow(typ, max_),
        subs_signed_is_underflow=\
            subs_signed_is_underflow(typ, min_),
        subs_signed_is_neither_overflow_nor_underflow=\
            subs_signed_is_neither_overflow_nor_underflow(typ),
        test_subs_signed_neither_overflow_nor_underflow=\
            test_subs_signed_neither_overflow_nor_underflow(
                typ, min_=min_, max_=max_),
        test_subs_signed_all_cases=\
            test_subs_signed_all_cases(typ, min_=min_, max_=max_))
    return {'helpers': helpers, 'tests': tests_subs_signed()}

def get_subs_tests_cases_for_unsigned_types(typ, min_, max_):
    # Unsigned counterpart of get_subs_tests_cases_for_signed_types:
    # unsigned subs can only underflow, so fewer helpers are needed.
    helpers = ''' {test_subs_unsigned_underflow} {test_subs_unsigned_no_underflow} {subs_unsigned_is_underflow} {test_subs_unsigned_all_cases} '''.format(
        test_subs_unsigned_underflow=\
            test_subs_unsigned_underflow(typ, min_, max_),
        test_subs_unsigned_no_underflow=\
            test_subs_unsigned_no_underflow(typ, max_),
        subs_unsigned_is_underflow=\
            subs_unsigned_is_underflow(typ),
        test_subs_unsigned_all_cases=\
            test_subs_unsigned_all_cases(typ, min_, max_))
    return {'helpers': helpers, 'tests': tests_subs_unsigned()}

def get_subs_tests_cases_given_type(typ):
    # Dispatch on the scalar type: signed vs unsigned integers get different
    # helper sets; any other type (floats are filtered out by the caller)
    # raises TypeError.
    if typ in common.iutypes:
        type_limits = common.limits[typ]
        min_ = type_limits['min']
        max_ = type_limits['max']
        if typ in common.itypes:
            return get_subs_tests_cases_for_signed_types(
                typ=typ, min_=min_, max_=max_)
        if typ in common.utypes:
            return get_subs_tests_cases_for_unsigned_types(
                typ=typ, min_=min_, max_=max_)
    else:
        msg = '{typ} not implemented'.format(typ=typ)
        raise TypeError(msg)

# -----------------------------------------------------------------------------
# gen_subs

def gen_subs(opts, op, typ, lang):
    # Emit a standalone C/C++ test program for the `subs` (saturated
    # subtraction) operator over `typ` in API flavor `lang`, then
    # clang-format it.
    # Do not test for floats since subs(floats) == sub(floats)
    if typ in common.ftypes:
        return
    filename = get_filename(opts, op, typ, lang)
    if filename == None:
        return
    sizeof = common.sizeof(typ)
    # NOTE(review): the '#include <...>' header name below was lost during
    # text extraction (angle-bracket content stripped) — restore it from the
    # upstream egg/gen_tests.py before using this file.
    head = \
    '''{includes} #include #define SIZE (2048 / {sizeof})
#define STATUS "test of {op_name} over {typ}" {aligned_alloc_error} {adds_subs_check_case}'''. \
    format(includes=get_includes(lang), op_name=op.name, typ=typ,
           sizeof=sizeof, aligned_alloc_error=aligned_alloc_error(),
           adds_subs_check_case=adds_subs_check_case())
    with common.open_utf8(opts, filename) as out:
        # Full test program: four aligned buffers, the type-specific helper
        # functions, then the CHECK_CASE statements.
        out.write(''' {head} {hbar} {random_sign_flip} {zero_out_arrays} {equal} {compare_expected_vs_computed} {tests_helpers} int main(void) {{ const int mem_aligned_size = SIZE * {sizeof}; {typ} *vin1; {typ} *vin2; {typ} *vout_expected; {typ} *vout_computed; CHECK(vin1 = ({typ} *)nsimd_aligned_alloc(mem_aligned_size)); CHECK(vin2 = ({typ} *)nsimd_aligned_alloc(mem_aligned_size)); CHECK(vout_expected = ({typ} *)nsimd_aligned_alloc(mem_aligned_size)); CHECK(vout_computed = ({typ} *)nsimd_aligned_alloc(mem_aligned_size)); {tests} fprintf(stdout, STATUS "... OK\\n"); fflush(stdout); return EXIT_SUCCESS; }} '''.format(head=head,
                   compare_expected_vs_computed=\
                       compare_expected_vs_computed(typ, op.name, lang),
                   # sign flipping only makes sense for signed types
                   random_sign_flip='' if typ in common.utypes \
                                       else random_sign_flip(),
                   zero_out_arrays=zero_out_arrays(typ), equal=equal(typ),
                   tests_helpers=\
                       get_subs_tests_cases_given_type(typ)['helpers'],
                   tests=get_subs_tests_cases_given_type(typ)['tests'],
                   op_name=op.name, typ=typ, hbar=common.hbar,
                   sizeof=sizeof))
    common.clang_format(opts, filename)

# -----------------------------------------------------------------------------
# Tests for all and any

def gen_all_any(opts, op, typ, lang):
    # Emit a test for the `all`/`any` reductions: a logical vector with all
    # lanes true, all lanes false, and exactly one lane true must reduce to
    # the expected boolean (`any` inverts the single-lane expectation).
    filename = get_filename(opts, op, typ, lang)
    if filename == None:
        return
    if lang == 'c_base':
        op_test = 'v{}(vloadla(buf, {}), {})'.format(op.name, typ, typ)
    elif lang == 'c_adv':
        op_test = 'nsimd_{}(nsimd_loadla(nsimd_packl_{}, buf))'. \
                  format(op.name, typ)
    elif lang == 'cxx_base':
        op_test = 'nsimd::{}(nsimd::loadla(buf, {}()), {}())'. \
                  format(op.name, typ, typ)
    else:
        # NOTE(review): the template argument of nsimd::loadla (presumably
        # nsimd::packl<{typ}>) was lost in extraction — restore from
        # upstream; the second format() argument is currently unused here.
        op_test = 'nsimd::{}(nsimd::loadla >(buf))'. \
                  format(op.name, typ)
    if typ == 'f16':
        scalar0 = 'nsimd_f32_to_f16(0)'
        scalar1 = 'nsimd_f32_to_f16(1)'
    else:
        scalar0 = '({})0'.format(typ)
        scalar1 = '({})1'.format(typ)
    with common.open_utf8(opts, filename) as out:
        out.write(
            '''{includes} #define CHECK(a) {{ \\ errno = 0; \\ if (!(a)) {{ \\ fprintf(stderr, "ERROR: " #a ":%d: %s\\n", \\ __LINE__, strerror(errno)); \\ fflush(stderr); \\ exit(EXIT_FAILURE); \\ }} \\ }} int main(void) {{ int i; {typ} *buf; int len = vlen({typ}); fprintf(stdout, "test of {op_name} over {typ}...\\n"); CHECK(buf = ({typ}*)nsimd_aligned_alloc(len * {sizeof})); /* Test with all elements to true */ for (i = 0; i < len; i++) {{ buf[i] = {scalar1}; }} if (!{op_test}) {{ exit(EXIT_FAILURE); }} /* Test with all elements set to false */ for (i = 0; i < len; i++) {{ buf[i] = {scalar0}; }} if ({op_test}) {{ exit(EXIT_FAILURE); }} /* Test with only one element set to true */ if (len > 1) {{ buf[0] = {scalar1}; if ({notl}{op_test}) {{ exit(EXIT_FAILURE); }} }} fprintf(stdout, "test of {op_name} over {typ}... OK\\n"); return EXIT_SUCCESS; }}'''.format(includes=get_includes(lang), op_name=op.name,
                        typ=typ, op_test=op_test, year=date.today().year,
                        # `any` must be true with one lane set, `all` false
                        notl='!' if op.name == 'any' else '',
                        scalar0=scalar0, scalar1=scalar1,
                        sizeof=common.sizeof(typ)))
    common.clang_format(opts, filename)

# -----------------------------------------------------------------------------
# Tests for load/store of degrees 2, 3 and 4

def gen_load_store(opts, op, typ, lang):
    # Emit a round-trip test for loadN{a,u}/storeN{a,u} (N = 2, 3, 4):
    # random data is loaded deinterleaved and stored back; the output buffer
    # must equal the input buffer.
    filename = get_filename(opts, op, typ, lang)
    if filename == None:
        return
    # degree and alignment letter are encoded in the operator name,
    # e.g. 'load3u' -> deg '3', align 'u'
    if op.name.startswith('load'):
        deg = op.name[4]
        align = op.name[5]
    elif op.name.startswith('store'):
        deg = op.name[5]
        align = op.name[6]
    variables = ', '.join(['v.v{}'.format(i) for i in range(0, int(deg))])
    if lang == 'c_base':
        load_store = \
            '''vecx{deg}({typ}) v = vload{deg}{align}(&vin[i], {typ}); vstore{deg}{align}(&vout[i], {variables}, {typ});'''. \
            format(deg=deg, typ=typ, align=align, variables=variables)
    elif lang == 'c_adv':
        load_store = \
            '''nsimd_packx{deg}_{typ} v = nsimd_load{deg}{align}(nsimd_packx{deg}_{typ}, &vin[i]); nsimd_store{deg}{align}(&vout[i], {variables});'''. \
            format(deg=deg, typ=typ, align=align, variables=variables)
    elif lang == 'cxx_base':
        load_store = \
            '''vecx{deg}({typ}) v = nsimd::load{deg}{align}(&vin[i], {typ}()); nsimd::store{deg}{align}(&vout[i], {variables}, {typ}());'''. \
            format(deg=deg, typ=typ, align=align, variables=variables)
    else:
        load_store = \
            '''nsimd::packx{deg}<{typ}> v = nsimd::load{deg}{align}< nsimd::packx{deg}<{typ}> >(&vin[i]); nsimd::store{deg}{align}(&vout[i], {variables});'''. \
            format(deg=deg, typ=typ, align=align, variables=variables)
    if typ == 'f16':
        # f16 values are filled/compared through their u16 bit patterns
        rand = '*((u16*)vin + i) = nsimd_f32_to_u16((float)(rand() % 10));'
        comp = '*((u16*)vin + i) != *((u16 *)vout + i)'
    else:
        rand = 'vin[i] = ({})(rand() % 10);'.format(typ)
        comp = 'vin[i] != vout[i]'
    # unaligned variants offset the pointer by one element to force
    # misalignment
    if align=='u':
        unalign = '+1'
    else:
        unalign = ''
    # NOTE(review): in the generated allocation below, `{unalign}` adds 1 to
    # the BYTE count inside nsimd_aligned_alloc(...) but 1 ELEMENT to the
    # resulting pointer, so for types wider than one byte the buffer is
    # over-read/written by sizeof(typ)-1 bytes — confirm and fix upstream.
    with common.open_utf8(opts, filename) as out:
        out.write('''{includes} #define SIZE (2048 / {sizeof}) #define STATUS "test of {op_name} over {typ}" #define CHECK(a) {{ \\ errno = 0; \\ if (!(a)) {{ \\ fprintf(stderr, "ERROR: " #a ":%d: %s\\n", \\ __LINE__, strerror(errno)); \\ fflush(stderr); \\ exit(EXIT_FAILURE); \\ }} \\ }} int main(void) {{ int i, vi; {typ} *vin, *vout; int len = vlen({typ}); int n = SIZE * {deg} * len; fprintf(stdout, "test of {op_name} over {typ}...\\n"); CHECK(vin = ({typ}*)nsimd_aligned_alloc( n * {sizeof} {unalign}) {unalign}); CHECK(vout = ({typ}*)nsimd_aligned_alloc( n * {sizeof} {unalign}) {unalign}); /* Fill with random data */ for (i = 0; i < n; i++) {{ {rand} }} /* Load and put back data into vout */ for (i = 0; i < n; i += {deg} * len) {{ {load_store} }} /* Compare results */ for (vi = 0; vi < SIZE; vi += len) {{ for (i = vi; i < vi + len; i++) {{ if ({comp}) {{ fprintf(stdout, STATUS "... FAIL\\n"); fflush(stdout); return -1; }} }} }} fprintf(stdout, "test of {op_name} over {typ}... OK\\n"); return EXIT_SUCCESS; }}'''.format(includes=get_includes(lang), op_name=op.name,
                          typ=typ, rand=rand, year=date.today().year,
                          deg=deg, sizeof=common.sizeof(typ),
                          load_store=load_store, comp=comp,
                          unalign=unalign))
    common.clang_format(opts, filename)

# -----------------------------------------------------------------------------
# Tests for gather/scatter

def gen_gather_scatter(opts, op, typ, lang):
    # Emit a test for gather/scatter (and the *_linear variants): vin/vout
    # hold 0 1 0 1 ...; odd-offset elements of vin are gathered and
    # scattered to even offsets of vout, so afterwards vout must be all 0.
    filename = get_filename(opts, op, typ, lang)
    if filename == None:
        return
    # offsets are computed in the same-width signed integer type
    ityp = 'i' + typ[1:]
    if lang == 'c_base':
        if op.name == 'gather_linear':
            gather_scatter = '''vscatter_linear(vout + 1, 2, vgather_linear( vin, 2, {typ}), {typ});'''.format(typ=typ)
        else:
            gather_scatter = \
                '''vec({ityp}) offsets = vmul(viota({ityp}), vset1(({ityp})2, {ityp}), {ityp}); vec({typ}) v = vgather(vin, offsets, {typ}); offsets = vadd(offsets, vset1(({ityp})1, {ityp}), {ityp}); vscatter(vout, offsets, v, {typ});'''. \
                format(typ=typ, ityp=ityp)
    elif lang == 'c_adv':
        if op.name == 'gather_linear':
            gather_scatter = \
                '''nsimd_scatter_linear( vout + 1, 2, nsimd_gather_linear( nsimd_pack_{}, vin, 2));'''.format(typ)
        else:
            gather_scatter = \
                '''nsimd_pack_{ityp} offsets = nsimd_mul(nsimd_iota( nsimd_pack_{ityp}), nsimd_set1( nsimd_pack_{ityp}, ({ityp})2)); nsimd_pack_{typ} v = nsimd_gather( nsimd_pack_{typ}, vin, offsets); offsets = nsimd_add(offsets, nsimd_set1(nsimd_pack_{ityp}, ({ityp})1)); nsimd_scatter(vout, offsets, v);'''. \
                format(typ=typ, ityp=ityp)
    elif lang == 'cxx_base':
        if op.name == 'gather_linear':
            gather_scatter = '''nsimd::scatter_linear(vout + 1, 2, nsimd::gather_linear( vin, 2, {typ}()), {typ}());'''. \
                format(typ=typ)
        else:
            gather_scatter = \
                '''vec({ityp}) offsets = nsimd::mul(nsimd::iota({ityp}()), nsimd::set1(({ityp})2, {ityp}()), {ityp}()); vec({typ}) v = nsimd::gather(vin, offsets, {typ}()); offsets = nsimd::add(offsets, nsimd::set1(({ityp})1, {ityp}()), {ityp}()); nsimd::scatter(vout, offsets, v, {typ}());'''. \
                format(typ=typ, ityp=ityp)
    else:
        if op.name == 'gather_linear':
            # NOTE(review): the template argument of nsimd::gather_linear
            # (presumably nsimd::pack<{typ}>) was lost in extraction —
            # restore from upstream.
            gather_scatter = '''nsimd::scatter_linear(vout + 1, 2, nsimd::gather_linear >( vin, 2));'''.format(typ=typ)
        else:
            gather_scatter = \
                '''typedef nsimd::pack<{typ}> pack; typedef nsimd::pack<{ityp}> ipack; ipack offsets = nsimd::mul(nsimd::iota(), nsimd::set1(({ityp})2)); pack v = nsimd::gather(vin, offsets); offsets = nsimd::add(offsets, nsimd::set1(({ityp})1)); nsimd::scatter(vout, offsets, v);'''. \
                format(typ=typ, ityp=ityp)
    if typ == 'f16':
        one = 'nsimd_f32_to_f16(1.0f)'
        zero = 'nsimd_f32_to_f16(0.0f)'
        comp = 'nsimd_f16_to_f32(vout[i]) != 0.0f'
    else:
        one = '({typ})1'.format(typ=typ)
        zero = '({typ})0'.format(typ=typ)
        comp = 'vout[i] != ({typ})0'.format(typ=typ)
    with common.open_utf8(opts, filename) as out:
        out.write(
            '''{includes} #define STATUS "test of {op_name} over {typ}" int main(void) {{ int n = 2 * vlen({typ}); int i; {typ} vin[2 * NSIMD_MAX_LEN({typ})]; {typ} vout[2 * NSIMD_MAX_LEN({typ})]; fprintf(stdout, "test of {op_name} over {typ}...\\n"); /* Fill input and output with 0 1 0 1 0 1 ... */ for (i = 0; i < n; i++) {{ if ((i % 2) == 1) {{ vin[i] = {one}; vout[i] = {one}; }} else {{ vin[i] = {zero}; vout[i] = {zero}; }} }} /* We gather odd offsets elements from vin and put then at even */ /* offsets. */ {{ {gather_scatter} }} /* Compare results */ for (i = 0; i < n; i++) {{ if ({comp}) {{ fprintf(stdout, STATUS "... FAIL\\n"); fflush(stdout); return -1; }} }} fprintf(stdout, "test of {op_name} over {typ}... OK\\n"); return EXIT_SUCCESS; }}'''.format(includes=get_includes(lang), ityp=ityp, comp=comp,
                        typ=typ, year=date.today().year, op_name=op.name,
                        gather_scatter=gather_scatter, zero=zero, one=one))
    common.clang_format(opts, filename)

# -----------------------------------------------------------------------------
# Tests for masked scatter

def gen_mask_scatter(opts, op, typ, lang):
    # Emit a test for mask_scatter: for every tail length i, scatter 2's at
    # even offsets under a loop-tail mask; the first i even slots must
    # become 2, the rest stay 0, odd slots stay 1.
    filename = get_filename(opts, op, typ, lang)
    if filename == None:
        return
    ityp = 'i' + typ[1:]
    if typ == 'f16':
        two = 'nsimd_f32_to_f16(2.0f)'
        one = 'nsimd_f32_to_f16(1.0f)'
        zero = 'nsimd_f32_to_f16(0.0f)'
        comp_with_0 = 'nsimd_f16_to_f32(vout[2 * k]) != 0.0f'
        comp_with_1 = 'nsimd_f16_to_f32(vout[2 * k + 1]) != 1.0f'
        comp_with_2 = 'nsimd_f16_to_f32(vout[2 * k]) != 2.0f'
    else:
        two = '({typ})2'.format(typ=typ)
        one = '({typ})1'.format(typ=typ)
        zero = '({typ})0'.format(typ=typ)
        comp_with_0 = 'vout[2 * k] != ({typ})0'.format(typ=typ)
        comp_with_1 = 'vout[2 * k + 1] != ({typ})1'.format(typ=typ)
        comp_with_2 = 'vout[2 * k] != ({typ})2'.format(typ=typ)
    # NOTE(review): the 'c_adv' branch below is a plain `if` while the
    # others chain with elif; harmless (branches are exclusive) but
    # inconsistent.
    if lang == 'c_base':
        mask_scatter = \
            '''vec({ityp}) offsets = vmul(viota({ityp}), vset1(({ityp})2, {ityp}), {ityp}); vecl({typ}) mask = vmask_for_loop_tail(0, i, {typ}); vmask_scatter(mask, vout, offsets, vset1({two}, {typ}), {typ});'''.format(two=two, typ=typ, ityp=ityp)
    if lang == 'c_adv':
        mask_scatter = \
            '''nsimd_pack_{ityp} offsets = nsimd_mul(nsimd_iota( nsimd_pack_{ityp}), nsimd_set1( nsimd_pack_{ityp}, ({ityp})2)); nsimd_packl_{typ} mask = nsimd_mask_for_loop_tail( nsimd_pack_{typ}, 0, i); nsimd_mask_scatter(mask, vout, offsets, nsimd_set1( nsimd_pack_{typ}, {two}));'''. \
            format(two=two, typ=typ, ityp=ityp)
    elif lang == 'cxx_base':
        mask_scatter = \
            '''vec({ityp}) offsets = nsimd::mul(nsimd::iota({ityp}()), nsimd::set1(({ityp})2, {ityp}()), {ityp}()); vecl({typ}) mask = nsimd::mask_for_loop_tail(0, i, {typ}()); nsimd::mask_scatter(mask, vout, offsets, nsimd::set1( {two}, {typ}()), {typ}());'''. \
            format(two=two, typ=typ, ityp=ityp)
    else:
        mask_scatter = \
            '''typedef nsimd::pack<{typ}> pack; typedef nsimd::pack<{ityp}> ipack; typedef nsimd::packl<{typ}> packl; ipack offsets = nsimd::mul(nsimd::iota(), nsimd::set1(({ityp})2)); packl mask = nsimd::mask_for_loop_tail(0, i); nsimd::mask_scatter(mask, vout, offsets, nsimd::set1({two}));'''. \
            format(two=two, typ=typ, ityp=ityp)
    with common.open_utf8(opts, filename) as out:
        out.write(
            '''{includes} #define STATUS "test of {op_name} over {typ}" int main(void) {{ int n = 2 * vlen({typ}); int i, j, k; {typ} vout[2 * NSIMD_MAX_LEN({typ})]; fprintf(stdout, "test of {op_name} over {typ}...\\n"); for (i = 0; i < n / 2; i++) {{ /* Fill output with 0 1 0 1 0 1 ... */ for (j = 0; j < n; j++) {{ vout[j] = (j % 2 == 0 ? {zero} : {one}); }} {{ {mask_scatter} }} /* Check results */ for (k = 0; k < n / 2; k++) {{ if ({comp_with_1}) {{ goto error; }} }} for (k = 0; k < i; k++) {{ if ({comp_with_2}) {{ goto error; }} }} for (; k < n / 2; k++) {{ if ({comp_with_0}) {{ goto error; }} }} }} fprintf(stdout, "test of {op_name} over {typ}... OK\\n"); fflush(stdout); return EXIT_SUCCESS; error: fprintf(stdout, STATUS "... FAIL\\n"); fflush(stdout); return EXIT_FAILURE; }}'''.format(includes=get_includes(lang), ityp=ityp, two=two,
                        typ=typ, year=date.today().year, op_name=op.name,
                        mask_scatter=mask_scatter, zero=zero, one=one,
                        comp_with_0=comp_with_0, comp_with_2=comp_with_2,
                        comp_with_1=comp_with_1))
    common.clang_format(opts, filename)

# -----------------------------------------------------------------------------
# Tests for masked gather

def gen_maskoz_gather(opts, op, typ, lang):
    # Emit a test for masko_gather / maskz_gather: masked-off lanes must
    # read 3 (the `other` vector, masko) or 0 (maskz); active lanes gather
    # the 1's sitting at even offsets of vin.
    filename = get_filename(opts, op, typ, lang)
    if filename == None:
        return
    ityp = 'i' + typ[1:]
    if typ == 'f16':
        three = 'nsimd_f32_to_f16(3.0f)'
        two = 'nsimd_f32_to_f16(2.0f)'
        one = 'nsimd_f32_to_f16(1.0f)'
        zero = 'nsimd_f32_to_f16(0.0f)'
        comp_with_1 = 'nsimd_f16_to_f32(vout[k]) != 1.0f'
        if op.name == 'maskz_gather':
            comp_with_0_or_3 = 'nsimd_f16_to_f32(vout[k]) != 0.0f'
        else:
            comp_with_0_or_3 = 'nsimd_f16_to_f32(vout[k]) != 3.0f'
    else:
        three = '({typ})3'.format(typ=typ)
        two = '({typ})2'.format(typ=typ)
        one = '({typ})1'.format(typ=typ)
        zero = '({typ})0'.format(typ=typ)
        comp_with_1 = 'vout[k] != ({typ})1'.format(typ=typ)
        if op.name == 'maskz_gather':
            comp_with_0_or_3 = 'vout[k] != ({typ})0'.format(typ=typ)
        else:
            comp_with_0_or_3 = 'vout[k] != ({typ})3'.format(typ=typ)
    # suffix selecting the operator variant in the generated code
    oz = 'o' if op.name == 'masko_gather' else 'z'
    if lang == 'c_base':
        # `ta` is the trailing `other` argument, only present for masko
        ta = ', vset1({three}, {typ})'.format(three=three, typ=typ) \
             if op.name == 'masko_gather' else ''
        maskoz_gather = \
            '''vec({ityp}) offsets = vmul(viota({ityp}), vset1(({ityp})2, {ityp}), {ityp}); vecl({typ}) mask = vmask_for_loop_tail(0, i, {typ}); vstoreu(vout, vmask{oz}_gather(mask, vin, offsets{ta}, {typ}), {typ});'''. \
            format(typ=typ, ityp=ityp, ta=ta, oz=oz)
    if lang == 'c_adv':
        ta = ', nsimd_set1(nsimd_pack_{typ}, {three})'. \
             format(three=three, typ=typ) if op.name == 'masko_gather' else ''
        maskoz_gather = \
            '''nsimd_pack_{ityp} offsets = nsimd_mul(nsimd_iota( nsimd_pack_{ityp}), nsimd_set1( nsimd_pack_{ityp}, ({ityp})2)); nsimd_packl_{typ} mask = nsimd_mask_for_loop_tail( nsimd_pack_{typ}, 0, i); nsimd_storeu(vout, nsimd_mask{oz}_gather( mask, vin, offsets{ta}));'''. \
            format(typ=typ, ityp=ityp, ta=ta, oz=oz)
    elif lang == 'cxx_base':
        ta = ', nsimd::set1({three}, {typ}())'.format(three=three, typ=typ) \
             if op.name == 'masko_gather' else ''
        maskoz_gather = \
            '''vec({ityp}) offsets = nsimd::mul(nsimd::iota({ityp}()), nsimd::set1(({ityp})2, {ityp}()), {ityp}()); vecl({typ}) mask = nsimd::mask_for_loop_tail(0, i, {typ}()); nsimd::storeu(vout, nsimd::mask{oz}_gather( mask, vin, offsets{ta}, {typ}()), {typ}());'''. \
            format(typ=typ, ityp=ityp, ta=ta, oz=oz)
    else:
        # NOTE(review): the template argument of nsimd::set1 (presumably
        # nsimd::pack<{typ}>) was lost in extraction — restore from
        # upstream.
        ta = ', nsimd::set1 >({three})'. \
             format(three=three, typ=typ) if op.name == 'masko_gather' else ''
        maskoz_gather = \
            '''typedef nsimd::pack<{ityp}> ipack; typedef nsimd::packl<{typ}> packl; ipack offsets = nsimd::mul(nsimd::iota(), nsimd::set1(({ityp})2)); packl mask = nsimd::mask_for_loop_tail(0, i); nsimd::storeu(vout, nsimd::mask{oz}_gather( mask, vin, offsets{ta}));'''. \
            format(ta=ta, oz=oz, typ=typ, ityp=ityp)
    with common.open_utf8(opts, filename) as out:
        out.write(
            '''{includes} #define STATUS "test of {op_name} over {typ}" int main(void) {{ int n = 2 * vlen({typ}); int i, j, k; {typ} vin[2 * NSIMD_MAX_LEN({typ})]; {typ} vout[NSIMD_MAX_LEN({typ})]; fprintf(stdout, "test of {op_name} over {typ}...\\n"); for (i = 0; i < n / 2; i++) {{ /* Fill input with 1 0 1 0 1 0 ... */ for (j = 0; j < n; j++) {{ vin[j] = (j % 2 == 1 ? {zero} : {one}); }} /* Fill output with 2's ... */ for (j = 0; j < n / 2; j++) {{ vout[j] = {two}; }} {{ {maskoz_gather} }} /* Check results */ for (k = 0; k < i; k++) {{ if ({comp_with_1}) {{ goto error; }} }} for (; k < n / 2; k++) {{ if ({comp_with_0_or_3}) {{ goto error; }} }} }} fprintf(stdout, "test of {op_name} over {typ}... OK\\n"); fflush(stdout); return EXIT_SUCCESS; error: fprintf(stdout, STATUS "... FAIL\\n"); fflush(stdout); return EXIT_FAILURE; }}'''.format(includes=get_includes(lang), ityp=ityp, two=two,
                        typ=typ, year=date.today().year, op_name=op.name,
                        maskoz_gather=maskoz_gather, zero=zero, one=one,
                        comp_with_0_or_3=comp_with_0_or_3, three=three,
                        comp_with_1=comp_with_1))
    common.clang_format(opts, filename)

# -----------------------------------------------------------------------------
# Tests for masked loads

def gen_mask_load(opts, op, typ, lang):
    # Emit a test for masko_load{a,u}1 / maskz_load{a,u}1: active lanes must
    # read vin[j] == j; masked-off lanes must read -1 (masko's `other`) or
    # 0 (maskz).
    filename = get_filename(opts, op, typ, lang)
    if filename == None:
        return
    if typ == 'f16':
        fill_vin = 'vin[i] = nsimd_f32_to_f16((f32)i);'
        m1 = 'nsimd_f32_to_f16(-1.0f)'
        comp1 = 'nsimd_f16_to_f32(vout[j]) != (f32)j'
    else:
        fill_vin = 'vin[i] = ({typ})i;'.format(typ=typ)
        m1 = '({typ})-1'.format(typ=typ)
        comp1 = 'vout[j] != ({typ})j'.format(typ=typ)
    if op.name in ['masko_loada1', 'masko_loadu1']:
        # masko variants take an `other` vector for masked-off lanes
        if lang == 'c_base':
            test = \
                '''vecl({typ}) mask = vmask_for_loop_tail(0, i, {typ}); vec({typ}) other = vset1({m1}, {typ}); vstoreu(vout, v{op_name}(mask, vin, other, {typ}), {typ});'''. \
                format(typ=typ, op_name=op.name, m1=m1)
        elif lang == 'c_adv':
            test = \
                '''nsimd_packl_{typ} mask = nsimd_mask_for_loop_tail( nsimd_packl_{typ}, 0, i); nsimd_pack_{typ} other = nsimd_set1(nsimd_pack_{typ}, {m1}); nsimd_storeu(vout, nsimd_{op_name}(mask, vin, other));'''. \
                format(typ=typ, op_name=op.name, m1=m1)
        elif lang == 'cxx_base':
            test = \
                '''vecl({typ}) mask = nsimd::mask_for_loop_tail(0, i, {typ}()); vec({typ}) other = nsimd::set1({m1}, {typ}()); nsimd::storeu(vout, nsimd::{op_name}( mask, vin, other, {typ}()), {typ}());'''. \
                format(typ=typ, op_name=op.name, m1=m1)
        elif lang == 'cxx_adv':
            # NOTE(review): template arguments of mask_for_loop_tail/set1
            # (presumably nsimd::packl<{typ}> / nsimd::pack<{typ}>) were
            # lost in extraction — restore from upstream.
            test = \
                '''nsimd::packl<{typ}> mask = nsimd::mask_for_loop_tail >(0, i); nsimd::pack<{typ}> other = nsimd::set1 >( {m1}); nsimd::storeu(vout, nsimd::{op_name}(mask, vin, other));'''. \
                format(typ=typ, op_name=op.name, m1=m1)
        comp2 = 'vout[j] != ({typ})-1'.format(typ=typ) if typ != 'f16' else \
                'nsimd_f16_to_f32(vout[j]) != -1.0f'
    else:
        # maskz variants zero masked-off lanes
        if lang == 'c_base':
            test = \
                '''vecl({typ}) mask = vmask_for_loop_tail(0, i, {typ}); vstoreu(vout, v{op_name}(mask, vin, {typ}), {typ});'''. \
                format(typ=typ, op_name=op.name, m1=m1)
        elif lang == 'c_adv':
            test = \
                '''nsimd_packl_{typ} mask = nsimd_mask_for_loop_tail( nsimd_packl_{typ}, 0, i); nsimd_storeu(vout, nsimd_{op_name}(mask, vin));'''. \
                format(typ=typ, op_name=op.name, m1=m1)
        elif lang == 'cxx_base':
            test = \
                '''vecl({typ}) mask = nsimd::mask_for_loop_tail(0, i, {typ}()); nsimd::storeu(vout, nsimd::{op_name}( mask, vin, {typ}()), {typ}());'''. \
                format(typ=typ, op_name=op.name, m1=m1)
        elif lang == 'cxx_adv':
            test = \
                '''nsimd::packl<{typ}> mask = nsimd::mask_for_loop_tail >(0, i); nsimd::storeu(vout, nsimd::{op_name}(mask, vin));'''. \
                format(typ=typ, op_name=op.name, m1=m1)
        # comparing against -0.0f is equivalent to 0.0f in C
        comp2 = 'vout[j] != ({typ})0'.format(typ=typ) if typ != 'f16' else \
                'nsimd_f16_to_f32(vout[j]) != -0.0f'
    if op.name in ['masko_loadu1', 'maskz_loadu1']:
        unalign = '\nvin += 1;'
    else:
        unalign = ''
    # NOTE(review): the generated allocation below requests `2 * len` BYTES
    # but the test reads `len` elements of {typ} (plus the +1 element
    # unalign offset) — likely should be `2 * len * sizeof(typ)`; confirm
    # against upstream.
    with common.open_utf8(opts, filename) as out:
        out.write(
            '''{includes} #define STATUS "test of {op_name} over {typ}" #define CHECK(a) {{ \\ errno = 0; \\ if (!(a)) {{ \\ fprintf(stderr, "ERROR: " #a ":%d: %s\\n", \\ __LINE__, strerror(errno)); \\ fflush(stderr); \\ exit(EXIT_FAILURE); \\ }} \\ }} int main(void) {{ int i, j; {typ} *vin; {typ} vout[NSIMD_MAX_LEN({typ})]; int len = vlen({typ}); fprintf(stdout, "test of {op_name} over {typ}...\\n"); CHECK(vin = ({typ}*)nsimd_aligned_alloc(2 * len));{unalign} /* Fill with data */ for (i = 0; i < len; i++) {{ {fill_vin} }} /* Load and put back data into vout */ for (i = 0; i < len; i++) {{ {test} for (j = 0; j < i; j++) {{ if ({comp1}) {{ fprintf(stdout, STATUS "... FAIL\\n"); fflush(stdout); return -1; }} }} for (; j < len; j++) {{ if ({comp2}) {{ fprintf(stdout, STATUS "... FAIL\\n"); fflush(stdout); return -1; }} }} }} fprintf(stdout, "test of {op_name} over {typ}... OK\\n"); return EXIT_SUCCESS; }}'''.format(includes=get_includes(lang), op_name=op.name,
                        typ=typ, year=date.today().year, test=test,
                        comp1=comp1, comp2=comp2, unalign=unalign,
                        fill_vin=fill_vin))
    common.clang_format(opts, filename)

# -----------------------------------------------------------------------------
# Tests for masked stores

def gen_mask_store(opts, op, typ, lang):
    # Emit a test for mask_store{a,u}1: storing 1's under a loop-tail mask
    # into a zeroed buffer must set exactly the first i lanes.
    filename = get_filename(opts, op, typ, lang)
    if filename == None:
        return
    if typ == 'f16':
        fill_vout = 'vout[i] = nsimd_f32_to_f16((f32)0);'
        one = 'nsimd_f32_to_f16(1.0f)'
        comp1 = 'nsimd_f16_to_f32(vout[j]) != (f32)1'
        comp2 = 'nsimd_f16_to_f32(vout[j]) != (f32)0'
    else:
        fill_vout = 'vout[i] = ({typ})0;'.format(typ=typ)
        one = '({typ})1'.format(typ=typ)
        comp1 = 'vout[j] != ({typ})1'.format(typ=typ)
        comp2 = 'vout[j] != ({typ})0'.format(typ=typ)
    if lang == 'c_base':
        test = \
            '''vecl({typ}) mask = vmask_for_loop_tail(0, i, {typ}); v{op_name}(mask, vout, vset1({one}, {typ}), {typ});'''. \
            format(typ=typ, op_name=op.name, one=one)
    elif lang == 'c_adv':
        test = \
            '''nsimd_packl_{typ} mask = nsimd_mask_for_loop_tail( nsimd_packl_{typ}, 0, i); nsimd_{op_name}(mask, vout, nsimd_set1( nsimd_pack_{typ}, {one}));'''. \
            format(typ=typ, op_name=op.name, one=one)
    elif lang == 'cxx_base':
        test = \
            '''vecl({typ}) mask = nsimd::mask_for_loop_tail(0, i, {typ}()); nsimd::{op_name}(mask, vout, nsimd::set1({one}, {typ}()), {typ}());'''.format(typ=typ, op_name=op.name, one=one)
    elif lang == 'cxx_adv':
        # NOTE(review): template arguments of mask_for_loop_tail/set1 were
        # lost in extraction — restore from upstream.
        test = \
            '''nsimd::packl<{typ}> mask = nsimd::mask_for_loop_tail >(0, i); nsimd::{op_name}(mask, vout, nsimd::set1 >({one}));'''. \
            format(typ=typ, op_name=op.name, one=one)
    if op.name == 'mask_storeu1':
        unalign = '\nvout += 1;'
    else:
        unalign = ''
    # NOTE(review): the generated C comment "Fill vout with zeors" has a
    # typo ("zeros") — it is part of the emitted string, left untouched
    # here.
    with common.open_utf8(opts, filename) as out:
        out.write(
            '''{includes} #define STATUS "test of {op_name} over {typ}" #define CHECK(a) {{ \\ errno = 0; \\ if (!(a)) {{ \\ fprintf(stderr, "ERROR: " #a ":%d: %s\\n", \\ __LINE__, strerror(errno)); \\ fflush(stderr); \\ exit(EXIT_FAILURE); \\ }} \\ }} int main(void) {{ int i, j; {typ} *vout; int len = vlen({typ}); fprintf(stdout, "test of {op_name} over {typ}...\\n"); CHECK(vout = ({typ}*)nsimd_aligned_alloc({sizeof} * len));{unalign} /* Fill vout with zeors */ for (i = 0; i < len; i++) {{ {fill_vout} }} /* Store data into vout */ for (i = 0; i < len; i++) {{ {test} for (j = 0; j < i; j++) {{ if ({comp1}) {{ fprintf(stdout, STATUS "... FAIL\\n"); fflush(stdout); return -1; }} }} for (; j < len; j++) {{ if ({comp2}) {{ fprintf(stdout, STATUS "... FAIL\\n"); fflush(stdout); return -1; }} }} }} fprintf(stdout, "test of {op_name} over {typ}... OK\\n"); return EXIT_SUCCESS; }}'''.format(includes=get_includes(lang), op_name=op.name,
                        typ=typ, year=date.today().year, test=test,
                        comp1=comp1, comp2=comp2, unalign=unalign,
                        fill_vout=fill_vout, sizeof=common.sizeof(typ)))
    common.clang_format(opts, filename)

# -----------------------------------------------------------------------------
# Tests that load/store of degrees 2, 3 and 4 ravels vectors correctly

def gen_load_store_ravel(opts, op, typ, lang):
    # This test only the libs internal, not the API, so we only generate test
    # for c
    filename = get_filename(opts, op, typ, lang, 'ravel')
    if filename == None:
        return
    deg = op.name[4]
    align = op.name[5]
    if typ=='f16':
        convert_to='nsimd_f32_to_f16((f32)'
    else:
        convert_to='({typ})('.format(typ=typ)
    # one vne/vany check per sub-vector v.v0 .. v.v{deg-1}
    check = '\n'.join([''' comp = vset1({convert_to}{i}+1), {typ}); err = err || vany(vne(v.v{i}, comp, {typ}), {typ}); '''.format(typ=typ, i=i, convert_to=convert_to) \
        for i in range (0, int(deg))])
    # NOTE(review): severe extraction loss below — the generated template is
    # truncated at "for (i=0; i" and jumps into the middle of the NEXT
    # generator (the upcvt/downcvt branch of the cvt/reinterpret test,
    # whose `def` line and leading `if op.name == ...` branch are missing).
    # Both functions must be restored from the upstream egg/gen_tests.py;
    # the text is reproduced as found.
    with common.open_utf8(opts, filename) as out:
        out.write(
            '''{includes} #define SIZE (2048 / {sizeof}) #define STATUS "test raveling of {op_name} over {typ}" #define CHECK(a) {{ \\ errno = 0; \\ if (!(a)) {{ \\ fprintf(stderr, "ERROR: " #a ":%d: %s\\n", \\ __LINE__, strerror(errno)); \\ fflush(stderr); \\ exit(EXIT_FAILURE); \\ }} \\ }} int main(void) {{ {typ}* vin; {typ}* vout; int i; int len = vlen({typ}); int n = {deg} * len; int err=0; vec({typ}) comp; vecx{deg}({typ}) v; fprintf(stdout, "test raveling of {op_name} over {typ}...\\n"); CHECK(vin = ({typ}*)nsimd_aligned_alloc(n * {sizeof})); CHECK(vout = ({typ}*)nsimd_aligned_alloc(n * {sizeof})); /* Fill in the vectors */ for (i=0; i tmp = nsimd::upcvt< nsimd::pack{logical}x2<{to_typ}> >(nsimd::load{logical}a< nsimd::pack{logical}<{from_typ}> >(in)); nsimd::store{logical}a(out, nsimd::downcvt< nsimd::pack{logical}<{from_typ}> >(tmp.v0, tmp.v1));'''. \
            format(op_name=op.name, from_typ=from_typ,
                   to_typ=to_typ, logical=logical)
    elif op.name == 'to_mask':
        # NOTE(review): the template argument of nsimd::loadla was lost in
        # extraction — restore from upstream.
        comp = '''nsimd::storela(out, nsimd::to_logical(nsimd::to_mask( nsimd::loadla >(in))));'''. \
            format(from_typ)
    else:
        # round-trip through the target type and back; result must equal the
        # input
        comp = \
            '''nsimd::store{logical}a(out, nsimd::{op_name}< nsimd::pack{logical}<{from_typ}> >(nsimd::{op_name}< nsimd::pack{logical}<{to_typ}> >(nsimd::load{logical}a< nsimd::pack{logical}<{from_typ}> >(in))));'''. \
            format(op_name=op.name, from_typ=from_typ, to_typ=to_typ,
                   logical=logical)
    if logical == 'l':
        rand = '(rand() % 2)'
    else:
        if op.name == 'reinterpret' and to_typ == 'f16' and \
           from_typ in ['i16', 'u16']:
            rand = '(15360 /* no denormal */ | (1 << (rand() % 4)))'
        else:
            if to_typ in common.utypes or from_typ in common.utypes:
                rand = '(1 << (rand() % 4))'
            else:
                rand = '((2 * (rand() % 2) - 1) * (1 << (rand() % 4)))'
    if from_typ == 'f16':
        # f16 compared bitwise through u16; note the trailing ';' here
        # yields a harmless double semicolon in the generated 'in[j] = ...;'
        rand = 'nsimd_f32_to_f16((f32){});'.format(rand)
        neq_test = '(*(u16*)&in[j]) != (*(u16*)&out[j])'
    else:
        rand = '({}){}'.format(from_typ, rand)
        neq_test = 'in[j] != out[j]'
    with common.open_utf8(opts, filename) as out:
        out.write(
            '''{includes} {msvc_c4334_warning} #define CHECK(a) {{ \\ errno = 0; \\ if (!(a)) {{ \\ fprintf(stderr, "ERROR: " #a ":%d: %s\\n", \\ __LINE__, strerror(errno)); \\ fflush(stderr); \\ exit(EXIT_FAILURE); \\ }} \\ }} int main(void) {{ int i, j; {from_typ} *in, *out; int len = vlen({from_typ}); fprintf(stdout, "test of {op_name} from {from_typ} to {to_typ}...\\n"); CHECK(in = ({from_typ}*)nsimd_aligned_alloc(len * {sizeof})); CHECK(out = ({from_typ}*)nsimd_aligned_alloc(len * {sizeof})); for (i = 0; i < 100; i++) {{ for (j = 0; j < len; j++) {{ in[j] = {rand}; }} {comp} for (j = 0; j < len; j++) {{ if ({neq_test}) {{ exit(EXIT_FAILURE); }} }} }} fprintf(stdout, "test of {op_name} from {from_typ} to {to_typ}... OK\\n"); return EXIT_SUCCESS; }}'''.format(includes=get_includes(lang), op_name=op.name,
                        to_typ=to_typ, from_typ=from_typ, comp=comp,
                        year=date.today().year, rand=rand,
                        neq_test=neq_test, sizeof=common.sizeof(from_typ),
                        msvc_c4334_warning=msvc_c4334_warning \
                            if from_typ in ['i64', 'u64'] else ''))
    common.clang_format(opts, filename)

# -----------------------------------------------------------------------------
# Shuffle

def gen_reverse(opts, op, typ, lang):
    # Emit a test for `reverse`: storing the reversed load of 1..len must
    # give out[len - 1 - i] == in[i] for every lane.
    filename = get_filename(opts, op, typ, lang)
    if filename == None:
        return
    if lang == 'c_base':
        test_code = \
            'vstorea(out, vreverse(vloada(in, {typ}), {typ}), {typ});'. \
            format(typ=typ)
    elif lang == 'c_adv':
        test_code = '''nsimd_storea(out, nsimd_reverse(nsimd_loada( nsimd_pack_{typ}, in)));'''.format(typ=typ)
    elif lang == 'cxx_base':
        test_code = \
            'nsimd::storea(out, nsimd::reverse(nsimd::loada(in, {typ}()), ' \
            '{typ}()), {typ}());'.format(typ=typ)
    elif lang == 'cxx_adv':
        # NOTE(review): the template argument of nsimd::loada was lost in
        # extraction — restore from upstream.
        test_code = \
            'nsimd::storea(out, nsimd::reverse(' \
            'nsimd::loada >(in)));'.format(typ=typ)
    if typ == 'f16':
        init = 'in[ i ] = nsimd_f32_to_f16((float)(i + 1));'
        comp = 'ok &= nsimd_f16_to_f32(out[len - 1 - i]) == ' \
               'nsimd_f16_to_f32(in[i]);'
    else:
        init = 'in[ i ] = ({typ})(i + 1);'.format(typ=typ)
        comp = 'ok &= out[len - 1 - i] == in[i];'
    with common.open_utf8(opts, filename) as out:
        out.write(
            '''{includes} #define CHECK(a) {{ \\ errno = 0; \\ if (!(a)) {{ \\ fprintf(stderr, "ERROR: " #a ":%d: %s\\n", \\ __LINE__, strerror(errno)); \\ fflush(stderr); \\ exit(EXIT_FAILURE); \\ }} \\ }} int main(void) {{ unsigned char i; int ok; {typ} * in; {typ} * out; int len = vlen({typ}); fprintf(stdout, "test of {op_name} over {typ}...\\n"); CHECK(in = ({typ}*)nsimd_aligned_alloc(len * {sizeof})); CHECK(out = ({typ}*)nsimd_aligned_alloc(len * {sizeof})); for( i = 0 ; i < len ; ++i ) {{ {init} }} {test_code} ok = 1; for( i = 0 ; i < len ; ++i ) {{ {comp} }} if( ok ) {{ fprintf(stdout, "test of {op_name} over {typ}... OK\\n"); }} else {{ fprintf(stderr, "test of {op_name} over {typ}... FAIL\\n"); exit(EXIT_FAILURE); }} nsimd_aligned_free( in ); nsimd_aligned_free( out ); return EXIT_SUCCESS; }}'''.format(includes=get_includes(lang), op_name=op.name,
                        typ=typ, test_code=test_code,
                        year=date.today().year, sizeof=common.sizeof(typ),
                        init=init, comp=comp))
    common.clang_format(opts, filename)

# -----------------------------------------------------------------------------
# Unpack half

def gen_unpack_half(opts, op, typ, lang):
    # Emit a test for ziplo/ziphi/unziplo/unziphi: interleave or
    # deinterleave two random vectors and check lane placement against the
    # inputs.
    filename = get_filename(opts, op, typ, lang)
    if filename == None:
        return
    # NOTE(review): left/right below appear unused in the visible body —
    # possibly leftovers; confirm against upstream.
    if typ == 'f16':
        left = '(double)nsimd_f16_to_f32(ref_out)'
        right = '(double)nsimd_f16_to_f32(nsimd_out)'
    elif typ == 'f32':
        left = '(double)ref_out'
        right = '(double)nsimd_out'
    else:
        left = 'ref_out'
        right = 'nsimd_out'
    if lang == 'c_base':
        typ_nsimd = 'vec({typ})'.format(typ=typ)
        vout1_comp = '''vec({typ}) va1, va2, vc; va1 = vloadu(&vin1[i], {typ}); va2 = vloadu(&vin2[i], {typ}); vc = v{op_name}(va1, va2, {typ}); vstoreu(&vout[i], vc, {typ});'''. \
            format(typ=typ, op_name=op.name)
    if lang == 'c_adv':
        typ_nsimd = 'nsimd_pack_{typ}'.format(typ=typ)
        vout1_comp = '''nsimd_pack_{typ} va1, va2, vc; va1 = nsimd_loadu(nsimd_pack_{typ}, &vin1[i]); va2 = nsimd_loadu(nsimd_pack_{typ}, &vin2[i]); vc = nsimd_{op_name}(va1, va2); nsimd_storeu(&vout[i], vc);'''. \
            format(typ=typ, op_name=op.name)
    if lang == 'cxx_base':
        typ_nsimd = 'vec({typ})'.format(typ=typ)
        vout1_comp = '''vec({typ}) va1, va2, vc; va1 = nsimd::loadu(&vin1[i], {typ}()); va2 = nsimd::loadu(&vin2[i], {typ}()); vc = nsimd::{op_name}(va1, va2, {typ}()); nsimd::storeu(&vout[i], vc, {typ}());'''. \
            format(typ=typ, op_name=op.name)
    if lang == 'cxx_adv':
        # NOTE(review): template arguments of nsimd::loadu were lost in
        # extraction — restore from upstream.
        typ_nsimd = 'nsimd::pack<{typ}>'.format(typ=typ)
        vout1_comp = '''nsimd::pack<{typ}> va1, va2, vc; va1 = nsimd::loadu >(&vin1[i]); va2 = nsimd::loadu >(&vin2[i]); vc = nsimd::{op_name}(va1, va2); nsimd::storeu(&vout[i], vc);'''. \
            format(typ=typ, op_name=op.name)
    op_test = 'step/(2 * nb_lane)'
    if op.name in['ziphi', 'ziplo']:
        # ziphi starts reading its lanes from the second half of the vector
        offset = 'int offset = {val};'.format(val= '0' \
                 if op.name == 'ziplo' else 'vlen({typ}) / 2'.format(typ=typ))
    else:
        offset = ''
    if op.name in ['unziplo', 'unziphi']:
        # unzip: even (lo) or odd (hi) lanes of each input end up in each
        # half of the output
        if typ == 'f16':
            comp_unpack = ''' (nsimd_f16_to_f32(vout[i]) != nsimd_f16_to_f32(vin1[vi + 2 * j + {i}])) || (nsimd_f16_to_f32(vout[i + step / 2]) != nsimd_f16_to_f32(vin2[vi + 2 * j + {i}])) '''.format(i = '0' if op.name == 'unziplo' else '1')
        else:
            comp_unpack = '''\
 (vout[i] != vin1[vi + 2 * j + {i}]) || (vout[i + step / 2] != vin2[vi + 2 * j + {i}]) '''.format(i = '0' if op.name == 'unziplo' else '1')
    else:
        if typ == 'f16':
            comp_unpack ='''(nsimd_f16_to_f32(vout[i]) != nsimd_f16_to_f32(vin1[j])) || (nsimd_f16_to_f32(vout[i + 1]) != nsimd_f16_to_f32(vin2[j]))'''
        else:
            comp_unpack ='''(vout[i] != vin1[j]) || (vout[i + 1] != vin2[j])'''
    # NOTE(review): nbits appears unused in the visible body; the two
    # '#include' header names in `head` below were lost in extraction.
    nbits = {'f16': '10', 'f32': '21', 'f64': '48'}
    head = '''{posix_c_source} {includes} #include #include {msvc_c4334_warning} #define SIZE (2048 / {sizeof}) #define CHECK(a) {{ \\ errno = 0; \\ if (!(a)) {{ \\ fprintf(stderr, "ERROR: " #a ":%d: %s\\n", \\ __LINE__, strerror(errno)); \\ fflush(stderr); \\ exit(EXIT_FAILURE); \\ }} \\ }} /* {simd} */ '''.format(year=date.today().year, typ=typ,
                posix_c_source=posix_c_source,
                includes=get_includes(lang), comp_unpack=comp_unpack,
                sizeof=common.sizeof(typ), simd=opts.simd,
                msvc_c4334_warning=msvc_c4334_warning \
                    if typ in ['i64', 'u64'] else '')
    if typ == 'f16':
        rand = '''nsimd_f32_to_f16((f32)(2 * (rand() % 2) - 1) * (f32)(1 << (rand() % 4)) / (f32)(1 << (rand() % 4)))'''
    else:
        rand = '''({typ})(({typ})(2 * (rand() % 2) - 1) * ({typ})(1 << (rand() % 4)) / ({typ})(1 << (rand() % 4)))'''.format(typ=typ)
    with common.open_utf8(opts, filename) as out:
        out.write(
            '''{head} int main(void) {{ int vi, i, j, step; {typ} *vin1, *vin2; {typ} *vout; CHECK(vin1 = ({typ} *)nsimd_aligned_alloc(SIZE * {sizeof})); CHECK(vin2 = ({typ} *)nsimd_aligned_alloc(SIZE * {sizeof})); CHECK(vout = ({typ} *)nsimd_aligned_alloc(SIZE * {sizeof})); step = vlen({typ}); fprintf(stdout, "test of {op_name} over {typ}...\\n"); /* Fill input vector(s) with random */ for (i = 0; i < SIZE; i++) {{ vin1[i] = {rand}; vin2[i] = {rand}; }} /* Fill output vector with computed values */ for (i = 0; i < SIZE; i += step) {{ {vout1_comp} }} /* Compare results */ if (step != 1) {{ {offset} for (vi = 0; vi < SIZE; vi += step){{ j = {init_j}; for (i = vi; i < {cond}; {inc}) {{ if({comp_unpack}) {{ fprintf(stderr, "test of {op_name} over {typ}... FAIL\\n"); exit(EXIT_FAILURE); }} j++; }} }} }} fprintf(stdout, "test of {op_name} over {typ}... OK\\n"); fflush(stdout); return EXIT_SUCCESS; }} '''.format(includes=get_includes(lang), op_name=op.name, typ=typ,
                       year=date.today().year,sizeof=common.sizeof(typ),
                       rand=rand, head=head, comp_unpack=comp_unpack,
                       vout1_comp= vout1_comp, op_test=op_test,
                       typ_nsimd=typ_nsimd, offset=offset,
                       cond='vi + step' if op.name in['ziplo', 'ziphi'] \
                            else 'vi + step / 2',
                       init_j='vi + offset' if op.name in['ziplo', 'ziphi'] \
                              else '0',
                       inc='i += 2' if op.name in['ziphi', 'ziplo'] \
                           else 'i++',
                       pos='0' if op.name in ['ziplo', 'unziplo', 'unziphi'] \
                           else op_test))
    common.clang_format(opts, filename)

# ------------------------------------------------------------------------------
# Unpack

def gen_unpack(opts, op, typ, lang):
    # Emit a test for the full zip/unzip (both halves at once, returning a
    # degree-2 structure). NOTE: this function runs past the end of this
    # chunk; its tail is not visible here.
    filename = get_filename(opts, op, typ, lang)
    if filename == None:
        return
    # NOTE(review): left/right appear unused in the visible body — confirm
    # against upstream.
    if typ == 'f16':
        left = '(double)nsimd_f16_to_f32(ref_out)'
        right = '(double)nsimd_f16_to_f32(nsimd_out)'
    elif typ == 'f32':
        left = '(double)ref_out'
        right = '(double)nsimd_out'
    else:
        left = 'ref_out'
        right = 'nsimd_out'
    if lang == 'c_base':
        typ_nsimd = 'vec({typ})'.format(typ=typ)
        vout1_comp = \
            '''vec({typ}) va1, va2; vecx2({typ}) vc; va1 = vloadu(&vin1[i], {typ}); va2 = vloadu(&vin2[i], {typ}); vc = v{op_name}(va1, va2, {typ}); vstoreu(&vout[2 * i], vc.v0, {typ}); vstoreu(&vout[2 * i + vlen({typ})], vc.v1, {typ});'''. \
            format(typ=typ, op_name=op.name)
    if lang == 'c_adv':
        typ_nsimd = 'nsimd_pack_{typ}'.format(typ=typ)
        vout1_comp = \
            '''nsimd_pack_{typ} va1, va2; nsimd_packx2_{typ} vc; va1 = nsimd_loadu(nsimd_pack_{typ}, &vin1[i]); va2 = nsimd_loadu(nsimd_pack_{typ}, &vin2[i]); vc = nsimd_{op_name}(va1, va2); nsimd_storeu(&vout[2 * i], vc.v0); nsimd_storeu(&vout[2 * i + nsimd_len(nsimd_pack_{typ})], vc.v1);'''.format(typ=typ, op_name=op.name)
    if lang == 'cxx_base':
        typ_nsimd = 'vec({typ})'.format(typ=typ)
        vout1_comp = \
            '''vec({typ}) va1, va2; vecx2({typ}) vc; va1 = nsimd::loadu(&vin1[i], {typ}()); va2 = nsimd::loadu(&vin2[i], {typ}()); vc = nsimd::{op_name}(va1, va2, {typ}()); nsimd::storeu(&vout[2 * i], vc.v0, {typ}()); nsimd::storeu(&vout[2 * i + vlen({typ})], vc.v1, {typ}());'''. \
            format(typ=typ, op_name=op.name)
    if lang == 'cxx_adv':
        # NOTE(review): template arguments of nsimd::loadu were lost in
        # extraction — restore from upstream.
        typ_nsimd = 'nsimd::pack<{typ}>'.format(typ=typ)
        vout1_comp = \
            '''nsimd::pack<{typ}> va1, va2; nsimd::packx2<{typ}> vc; va1 = nsimd::loadu >(&vin1[i]); va2 = nsimd::loadu >(&vin2[i]); vc = nsimd::{op_name}(va1, va2); nsimd::storeu(&vout[2 * i], vc.v0); nsimd::storeu(&vout[2 * i + nsimd::len({typ}())], vc.v1);'''. \
            format(typ=typ, op_name=op.name)
    # NOTE(review): the two '#include' header names below were lost in
    # extraction.
    head = '''{posix_c_source} {includes} #include #include {msvc_c4334_warning} #define SIZE (2048 / {sizeof}) #define CHECK(a) {{ \\ errno = 0; \\ if (!(a)) {{ \\ fprintf(stderr, "ERROR: " #a ":%d: %s\\n", \\ __LINE__, strerror(errno)); \\ fflush(stderr); \\ exit(EXIT_FAILURE); \\ }} \\ }} /* {simd} */ '''.format(year=date.today().year, typ=typ,
                posix_c_source=posix_c_source,
                includes=get_includes(lang), sizeof=common.sizeof(typ),
                simd= opts.simd,
                msvc_c4334_warning=msvc_c4334_warning \
                    if typ in ['i64', 'u64'] else '')
    if typ == 'f16':
        rand = 'nsimd_f32_to_f16((f32)(2 * (rand() % 2) - 1) * ' \
               '(f32)(1 << (rand() % 4)) / (f32)(1 << (rand() % 4)))'
    else:
        rand = '({typ})(({typ})(2 * (rand() % 2) - 1) * ' \
               '({typ})(1 << (rand() % 4)) / ({typ})(1 << (rand() % 4)))'.
\ format(typ=typ) if op.name == 'zip': scalar_code = '''for(i = 0; i < step; i ++) {{ out_ptr[2 * i] = vin1_ptr[i]; out_ptr[2 * i + 1] = vin2_ptr[i]; }} ''' else: scalar_code = \ '''for(i = 0; i < step / 2; i++) {{ out_ptr[i] = vin1_ptr[2 * i]; out_ptr[step / 2 + i] = vin2_ptr[2 * i]; out_ptr[step + i] = vin1_ptr[2 * i + 1]; out_ptr[step + step / 2 + i] = vin2_ptr[2 * i + 1]; }} ''' if typ == 'f16': comp = 'nsimd_f16_to_f32(vout[vi]) != nsimd_f16_to_f32(vout_ref[vi])' else: comp = 'vout[vi] != vout_ref[vi]' with common.open_utf8(opts, filename) as out: out.write( '''{head} int main(void){{ int i, vi, step; {typ} *vin1, *vin2; {typ} *vout; {typ} *vout_ref; CHECK(vin1 = ({typ} *)nsimd_aligned_alloc(SIZE * {sizeof})); CHECK(vin2 = ({typ} *)nsimd_aligned_alloc(SIZE * {sizeof})); CHECK(vout = ({typ} *)nsimd_aligned_alloc(2 * SIZE * {sizeof})); CHECK(vout_ref = ({typ} *)nsimd_aligned_alloc(2 * SIZE * {sizeof})); step = vlen({typ}); fprintf(stdout, "test of {op_name} over {typ}...\\n"); /* Fill input vector(s) with random */ for (i = 0; i < SIZE; i++) {{ vin1[i] = {rand}; vin2[i] = {rand}; }} /* Compute a scalar reference version */ for(vi = 0; vi < SIZE; vi += step) {{ {typ} *out_ptr = vout_ref + 2 * vi; {typ} *vin1_ptr = vin1 + vi; {typ} *vin2_ptr = vin2 + vi; {scalar_code} }} /* Fill output vector with computed values */ for (i = 0; i < SIZE; i += step) {{ {vout1_comp} }} /* Compare results */ for(vi = 0; vi < SIZE; vi++) {{ if({comp}) {{ fprintf(stderr, "test of {op_name} over {typ}... FAIL\\n"); exit(EXIT_FAILURE); }} }} fprintf(stdout, "test of {op_name} over {typ}... 
OK\\n"); fflush(stdout); return EXIT_SUCCESS; }} '''.format(includes=get_includes(lang), op_name=op.name, typ=typ, year=date.today().year,sizeof=common.sizeof(typ), rand=rand, head=head, scalar_code=scalar_code, comp=comp, vout1_comp= vout1_comp, typ_nsimd=typ_nsimd)) common.clang_format(opts, filename) # ----------------------------------------------------------------------------- # Entry point def doit(opts): common.myprint(opts, 'Generating tests') for op_name, operator in operators.operators.items(): # Skip non-matching tests if opts.match and not opts.match.match(op_name): continue for typ in operator.types: if not should_i_do_the_test(operator, '', typ): continue elif operator.name == 'nbtrue': gen_nbtrue(opts, operator, typ, 'c_base') gen_nbtrue(opts, operator, typ, 'c_adv') gen_nbtrue(opts, operator, typ, 'cxx_base') gen_nbtrue(opts, operator, typ, 'cxx_adv') elif operator.name == 'addv': if typ in common.ftypes: gen_addv(opts, operator, typ, 'c_base') gen_addv(opts, operator, typ, 'c_adv') gen_addv(opts, operator, typ, 'cxx_base') gen_addv(opts, operator, typ, 'cxx_adv') elif operator.name == 'adds': gen_adds(opts, operator, typ, 'c_base') gen_adds(opts, operator, typ, 'c_adv') gen_adds(opts, operator, typ, 'cxx_base') gen_adds(opts, operator, typ, 'cxx_adv') elif operator.name == 'subs': gen_subs(opts, operator, typ, 'c_base') gen_subs(opts, operator, typ, 'c_adv') gen_subs(opts, operator, typ, 'cxx_base') gen_subs(opts, operator, typ, 'cxx_adv') elif operator.name in ['all', 'any']: gen_all_any(opts, operator, typ, 'c_base') gen_all_any(opts, operator, typ, 'c_adv') gen_all_any(opts, operator, typ, 'cxx_base') gen_all_any(opts, operator, typ, 'cxx_adv') elif operator.name == 'iota': gen_iota(opts, operator, typ, 'c_base') gen_iota(opts, operator, typ, 'c_adv') gen_iota(opts, operator, typ, 'cxx_base') gen_iota(opts, operator, typ, 'cxx_adv') elif operator.name in ['reinterpret', 'reinterpretl', 'cvt', 'upcvt', 'to_mask']: for to_typ in 
common.get_output_types(typ, operator.output_to): if not should_i_do_the_test(operator, to_typ, typ): continue gen_reinterpret_convert(opts, operator, typ, to_typ, 'c_base') gen_reinterpret_convert(opts, operator, typ, to_typ, 'c_adv') gen_reinterpret_convert(opts, operator, typ, to_typ, 'cxx_base') gen_reinterpret_convert(opts, operator, typ, to_typ, 'cxx_adv') elif operator.name in ['load2a', 'load2u', 'load3a', 'load3u', 'load4a', 'load4u']: gen_load_store(opts, operator, typ, 'c_base') gen_load_store(opts, operator, typ, 'c_adv') gen_load_store(opts, operator, typ, 'cxx_base') gen_load_store(opts, operator, typ, 'cxx_adv') gen_load_store_ravel(opts, operator, typ, 'c_base') elif operator.name in ['gather', 'gather_linear']: gen_gather_scatter(opts, operator, typ, 'c_base') gen_gather_scatter(opts, operator, typ, 'c_adv') gen_gather_scatter(opts, operator, typ, 'cxx_base') gen_gather_scatter(opts, operator, typ, 'cxx_adv') elif operator.name == 'mask_scatter': gen_mask_scatter(opts, operator, typ, 'c_base') gen_mask_scatter(opts, operator, typ, 'c_adv') gen_mask_scatter(opts, operator, typ, 'cxx_base') gen_mask_scatter(opts, operator, typ, 'cxx_adv') elif operator.name in ['maskz_gather', 'masko_gather']: gen_maskoz_gather(opts, operator, typ, 'c_base') gen_maskoz_gather(opts, operator, typ, 'c_adv') gen_maskoz_gather(opts, operator, typ, 'cxx_base') gen_maskoz_gather(opts, operator, typ, 'cxx_adv') elif operator.name in ['masko_loada1', 'masko_loadu1', 'maskz_loada1', 'maskz_loadu1']: gen_mask_load(opts, operator, typ, 'c_base') gen_mask_load(opts, operator, typ, 'c_adv') gen_mask_load(opts, operator, typ, 'cxx_base') gen_mask_load(opts, operator, typ, 'cxx_adv') elif operator.name in ['mask_storea1', 'mask_storeu1']: gen_mask_store(opts, operator, typ, 'c_base') gen_mask_store(opts, operator, typ, 'c_adv') gen_mask_store(opts, operator, typ, 'cxx_base') gen_mask_store(opts, operator, typ, 'cxx_adv') elif operator.name == 'reverse': gen_reverse(opts, operator, 
typ, 'c_base'); gen_reverse(opts, operator, typ, 'c_adv'); gen_reverse(opts, operator, typ, 'cxx_base'); gen_reverse(opts, operator, typ, 'cxx_adv'); elif operator.name in ['ziplo', 'ziphi', 'unziplo', 'unziphi']: gen_unpack_half(opts, operator, typ, 'c_base') gen_unpack_half(opts, operator, typ, 'c_adv') gen_unpack_half(opts, operator, typ, 'cxx_base') gen_unpack_half(opts, operator, typ, 'cxx_adv') elif operator.name in ['zip', 'unzip']: gen_unpack(opts, operator, typ, 'c_base') gen_unpack(opts, operator, typ, 'c_adv') gen_unpack(opts, operator, typ, 'cxx_base') gen_unpack(opts, operator, typ, 'cxx_adv') else: gen_test(opts, operator, typ, 'c_base') gen_test(opts, operator, typ, 'c_adv') gen_test(opts, operator, typ, 'cxx_base') gen_test(opts, operator, typ, 'cxx_adv') ================================================ FILE: egg/get_sleef_code.py ================================================ # Copyright (c) 2021 Agenium Scale # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal # in the Software without restriction, including without limitation the rights # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell # copies of the Software, and to permit persons to whom the Software is # furnished to do so, subject to the following conditions: # # The above copyright notice and this permission notice shall be included in all # copies or substantial portions of the Software. # # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. 
import common
import shutil
import requests
import zipfile
import os

# -----------------------------------------------------------------------------

def doit(opts):
    # Download the pinned Sleef release, copy the sources NSIMD needs into
    # opts.src_dir and patch them so they build inside NSIMD: symbol aliases
    # commented out, cpuid runtime detection stubbed, force-inline removed,
    # and exported symbols renamed per SIMD extension and precision (ULPs).
    common.myprint(opts, 'Copy native Sleef version {}'. \
                   format(opts.sleef_version))
    # First download Sleef
    sleef_dir = os.path.join(opts.script_dir, '..', '_deps-sleef')
    common.mkdir_p(sleef_dir)
    url = 'https://github.com/shibatch/sleef/archive/refs/tags/{}.zip'. \
          format(opts.sleef_version)
    r = requests.get(url, allow_redirects=True)
    sleef_zip = os.path.join(sleef_dir, 'sleef.zip')
    with open(sleef_zip, 'wb') as fout:
        fout.write(r.content)
    # Unzip sleef
    with zipfile.ZipFile(sleef_zip, 'r') as fin:
        fin.extractall(path=sleef_dir)
    # Copy helper function
    def copy(filename):
        # Flatten the Sleef tree: copy `filename` to the top of opts.src_dir.
        dst_filename = os.path.basename(filename)
        shutil.copyfile(os.path.join(sleef_dir,
                                     'sleef-{}'.format(opts.sleef_version),
                                     filename),
                        os.path.join(opts.src_dir, dst_filename))
    # Copy files
    copy('src/libm/sleefsimddp.c')
    copy('src/libm/sleefsimdsp.c')
    copy('src/libm/sleefdp.c')
    copy('src/libm/sleefsp.c')
    copy('src/common/misc.h')
    copy('src/libm/estrin.h')
    copy('src/libm/dd.h')
    copy('src/libm/df.h')
    copy('src/libm/rempitab.c')
    copy('src/arch/helpersse2.h')
    copy('src/arch/helperavx.h')
    copy('src/arch/helperavx2.h')
    copy('src/arch/helperavx512f.h')
    copy('src/arch/helperneon32.h')
    copy('src/arch/helperadvsimd.h')
    copy('src/arch/helperpower_128.h')
    copy('src/arch/helpersve.h')
    # Sleef uses aliases but we don't need those so we comment them
    def comment_DALIAS_lines(filename):
        # Rewrite `filename` in place, turning every line that starts with
        # 'DALIAS_' into a C comment.
        src = os.path.join(opts.src_dir, filename)
        dst = os.path.join(opts.src_dir, 'tmp.c')
        with open(src, 'r') as fin, open(dst, 'w') as fout:
            for line in fin:
                if line.startswith('DALIAS_'):
                    fout.write('/* {} */\n'.format(line.strip()))
                else:
                    fout.write(line)
        shutil.copyfile(dst, src)
        os.remove(dst)
    comment_DALIAS_lines('sleefsimdsp.c')
    comment_DALIAS_lines('sleefsimddp.c')
    # Sleef provides runtime SIMD detection via cpuid but we don't need it
    def replace_x86_cpuid(filename):
        # Replace the line declaring Sleef_x86CpuID with a static stub that
        # reports all features as present.
        src = os.path.join(opts.src_dir, filename)
        dst = os.path.join(opts.src_dir, 'tmp.c')
        with open(src, 'r') as fin, open(dst, 'w') as fout:
            for line in fin:
                if line.startswith('void Sleef_x86CpuID'):
                    fout.write(
'''static inline void Sleef_x86CpuID(int32_t out[4], uint32_t eax,
                                  uint32_t ecx) {
  /* We don't care for cpuid detection */
  out[0] = 0xFFFFFFFF;
  out[1] = 0xFFFFFFFF;
  out[2] = 0xFFFFFFFF;
  out[3] = 0xFFFFFFFF;
}
''')
                else:
                    fout.write(line)
        shutil.copyfile(dst, src)
        os.remove(dst)
    replace_x86_cpuid('helpersse2.h')
    replace_x86_cpuid('helperavx.h')
    replace_x86_cpuid('helperavx2.h')
    replace_x86_cpuid('helperavx512f.h')
    # Sleef uses force inline through its INLINE macro defined in misc.h
    # We modify it to avoid warnings and because force inline has been a pain
    # in the past. We also rename some exported symbols.
    with open(os.path.join(opts.src_dir, 'misc.h'), 'a') as fout:
        fout.write(
'''

/* NSIMD specific */
#ifndef NSIMD_SLEEF_MISC_H
#define NSIMD_SLEEF_MISC_H

#ifdef INLINE
#undef INLINE
#endif
#define INLINE inline

#define Sleef_rempitabdp nsimd_sleef_rempitab_f64
#define Sleef_rempitabsp nsimd_sleef_rempitab_f32

#endif
''')
    # Sleef functions must be renamed properly for each SIMD extensions.
    # Moreover their name must contain their precision (in ULPs). This
    # precision is not the same for all functions and some functions can have
    # differents flavours (or precisions). The "database" is contained within
    # src/libm/funcproto.h. So we parse it and produce names
    # in headers "rename[SIMD ext].h" to avoid modifying Sleef C files.
    funcproto = os.path.join(sleef_dir,
                             'sleef-{}'.format(opts.sleef_version),
                             'src', 'libm', 'funcproto.h')
    defines = []
    # Maps funcproto's ULP-suffix code (items[2]) to Sleef's name suffix.
    ulp_suffix = {
        '0' : '',
        '1' : '_u1',
        '2' : '_u05',
        '3' : '_u35',
        '4' : '_u15',
        '5' : '_u3500'
    }
    with open(funcproto, 'r') as fin:
        for line in fin:
            if (line.find('{') != -1 and line.find('}') != -1):
                items = [item.strip() \
                         for item in line.strip(' \n\r{},').split(',')]
                items[0] = items[0].strip('"')
                # The table is NULL-terminated.
                if items[0] == 'NULL':
                    break
                sleef_name_f64 = items[0] + ulp_suffix[items[2]]
                sleef_name_f32 = items[0] + 'f' + ulp_suffix[items[2]]
                # items[1] is the precision in tenths of ULPs; '5' means 0.5.
                items[1] = items[1] if items[1] != '5' else '05'
                if items[1] == '-1':
                    # -1: exact function, no ULP number in the NSIMD name.
                    nsimd_name_f64 = 'nsimd_sleef_{}_{{nsimd_ext}}_f64'. \
                                     format(items[0])
                    nsimd_name_f32 = 'nsimd_sleef_{}_{{nsimd_ext}}_f32'. \
                                     format(items[0])
                else:
                    # {det} and {nsimd_ext} are filled later per extension.
                    nsimd_name_f64 = \
                        'nsimd_sleef_{}_u{}{{det}}_{{nsimd_ext}}_f64'. \
                        format(items[0], items[1])
                    nsimd_name_f32 = \
                        'nsimd_sleef_{}_u{}{{det}}_{{nsimd_ext}}_f32'. \
                        format(items[0], items[1])
                defines.append('#define x{} {}'.format(sleef_name_f64,
                                                       nsimd_name_f64))
                defines.append('#define x{} {}'.format(sleef_name_f32,
                                                       nsimd_name_f32))
    defines = '\n'.join(defines)
    # One Sleef extension can back several NSIMD extensions.
    sleef_to_nsimd = {
        '': ['scalar'],
        'sse2': ['sse2'],
        'sse4': ['sse42'],
        'avx': ['avx'],
        'avx2': ['avx2'],
        'avx512f': ['avx512_knl', 'avx512_skylake'],
        'neon32': ['neon128'],
        'advsimd': ['aarch64'],
        'sve': ['sve128', 'sve256', 'sve512', 'sve1024', 'sve2048'],
        'vsx': ['vmx', 'vsx']
    }
    for simd_ext in ['', 'sse2', 'sse4', 'avx', 'avx2', 'avx512f',
                     'neon32', 'advsimd', 'sve', 'vsx']:
        renameheader = os.path.join(opts.src_dir,
                                    'rename{}.h'.format(simd_ext))
        se = simd_ext if simd_ext != '' else 'scalar'
        with open(renameheader, 'w') as fout:
            fout.write(
'''#ifndef RENAME{SIMD_EXT}_H
#define RENAME{SIMD_EXT}_H

'''.format(SIMD_EXT=se.upper()))
            for nse in sleef_to_nsimd[simd_ext]:
                # Scalar has no guard; SIMD renames are guarded by the
                # corresponding NSIMD_* macro.
                ifdef = '' if simd_ext == '' \
                        else '#ifdef NSIMD_{}'.format(nse.upper())
                endif = '' if simd_ext == '' else '#endif'
                # NOTE(review): the defines_det_f64/defines_nondet_f64
                # kwargs below do not appear as placeholders in the template
                # -- confirm whether they are leftovers.
                fout.write(
'''{hbar}

/* Naming of functions {nsimd_ext} */

{ifdef}

#ifdef DETERMINISTIC

{defines_det_f32}

#else

{defines_nondet_f32}

#endif

#define rempi nsimd_sleef_rempi_{nsimd_ext}
#define rempif nsimd_sleef_rempif_{nsimd_ext}
#define rempisub nsimd_sleef_rempisub_{nsimd_ext}
#define rempisubf nsimd_sleef_rempisubf_{nsimd_ext}

#define gammak nsimd_gammak_{nsimd_ext}
#define gammafk nsimd_gammafk_{nsimd_ext}

{endif}

'''.format(NSIMD_EXT=nse.upper(), nsimd_ext=nse, hbar=common.hbar,
           ifdef=ifdef, endif=endif,
           defines_det_f32=defines.format(det='d', nsimd_ext=nse),
           defines_nondet_f32=defines.format(det='', nsimd_ext=nse),
           defines_det_f64=defines.format(det='d', nsimd_ext=nse),
           defines_nondet_f64=defines.format(det='', nsimd_ext=nse)))
            fout.write('\n\n#endif\n\n')
        common.clang_format(opts, renameheader)


================================================
FILE: egg/hatch.py
================================================
# Copyright (c) 2021 Agenium Scale
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

# What does this script?
# ----------------------
#
# This script generates code for each architecture, the base C/C++ APIs and
# the advanced C++ API. Each part to be generated is handled by a
# `gen_*.py` file. This script simply calls the `doit` function of each
# `gen_*.py` module. Names are self-explanatory.
#
# -----------------------------------------------------------------------------

# First thing we do is check whether python3 is used
import sys
if sys.version_info[0] < 3:
    print('Only Python 3 is supported')
    sys.exit(1)

# -----------------------------------------------------------------------------
# Imports

import argparse
import os
import re
import common
import gen_archis
import gen_base_apis
import gen_adv_cxx_api
import gen_adv_c_api
import gen_tests
import gen_src
import gen_doc
import gen_friendly_but_not_optimized
import gen_modules
import gen_scalar_utilities
import get_sleef_code

# Dir of this script
script_dir = os.path.dirname(__file__)
if script_dir == '':
    script_dir = '.'

# -----------------------------------------------------------------------------
# Arguments parsing

def parse_args(args):
    # Parse the command line and derive every option the generators rely on
    # (per-part booleans, default directories, pinned Sleef version).
    def parse_simd(value):
        # Accept a platform alias (x86/arm/ppc/all) or a comma-separated
        # list of SIMD extension names; expand to the dependency closure.
        ## Split .simd now
        values = {
            'x86': common.x86_simds,
            'arm': common.arm_simds,
            'ppc': common.ppc_simds,
            'all': common.simds,
        }.get(value, value.split(','))
        ## Check that all simd are valid
        ret = []
        for simd in values:
            if simd not in common.simds:
                raise argparse.ArgumentTypeError(
                    "SIMD '{}' not found in {}".format(simd, common.simds))
            ret += common.simds_deps[simd]
        return list(set(ret))
    def parse_match(value):
        # Compile the operator-name filter regex; None means no filtering.
        if value is None:
            return None
        else:
            return re.compile(value)
    # In pratice, we either generate all or all except tests and we never
    # change default directories for code generation. So we remove unused
    # options and regroup some into --library.
    parser = argparse.ArgumentParser(
                 description='This is NSIMD generation script.')
    parser.add_argument('--force', '-f', action='store_true',
        help='Generate all files even if they already exist')
    parser.add_argument('--list-files', '-L', action='store_true',
        default=False,
        help='List files that will be created by hatch.py')
    parser.add_argument('--all', '-A', action='store_true',
        help='Generate code for the library and its tests')
    parser.add_argument('--library', '-l', action='store_true',
        help='Generate code of the library (C and C++ APIs)')
    parser.add_argument('--sleef', '-s', action='store_true', default=False,
        help='Compile Sleef')
    parser.add_argument('--tests', '-t', action='store_true',
        help='Generate tests in C and C++')
    parser.add_argument('--doc', '-d', action='store_true',
        help='Generate all documentation')
    # NOTE(review): this flag uses store_false, so passing
    # --enable-clang-format/-F actually turns clang-format OFF, as the help
    # text says -- the option name is misleading; confirm before renaming.
    parser.add_argument('--enable-clang-format', '-F', action='store_false',
        default=True,
        help='Disable Clang Format (mainly for speed on Windows)')
    parser.add_argument('--sve-emulate-bool', action='store_true',
        default=False,
        help='Use normal SVE vector to emulate predicates.')
    parser.add_argument('--simd', '-D', type=parse_simd, default='all',
        help='List of SIMD extensions (separated by a comma)')
    parser.add_argument('--match', '-m', type=parse_match, default=None,
        help='Regex used to filter generation on operator names')
    parser.add_argument('--verbose', '-v', action = 'store_true',
        default=None,
        help='Enable verbose mode')
    parser.add_argument('--simple-license', action='store_true',
        default=False,
        help='Put a simple copyright statement instead of the whole license')
    opts = parser.parse_args(args)
    # When -L has been chosen, we want to list all files and so we have to
    # turn to True other parameters
    if opts.list_files:
        opts.library = True
        opts.tests = True
        opts.force = True
        opts.doc = True
    # We set variables here because all the code depends on them + we do want
    # to keep the possibility to change them in the future
    opts.archis = opts.library
    opts.base_apis = opts.library
    opts.adv_cxx_api = opts.library
    opts.adv_c_api = opts.library
    opts.friendly_but_not_optimized = opts.library
    opts.src = opts.library
    opts.scalar_utilities = opts.library
    opts.sleef_version = '3.5.1'
    opts.include_dir = os.path.join(script_dir, '..', 'include', 'nsimd')
    opts.tests_dir = os.path.join(script_dir, '..', 'tests')
    opts.src_dir = os.path.join(script_dir, '..', 'src')
    return opts

# -----------------------------------------------------------------------------
# Entry point

def main():
    # Run every generator selected on the command line, in dependency order.
    opts = parse_args(sys.argv[1:])
    opts.script_dir = script_dir
    opts.modules_list = None
    opts.platforms_list = None
    ## Gather all SIMD dependencies
    opts.simd = common.get_simds_deps_from_opts(opts)
    common.myprint(opts, 'List of SIMD: {}'.format(', '.join(opts.simd)))
    if opts.archis == True or opts.all == True:
        gen_archis.doit(opts)
    if opts.base_apis == True or opts.all == True:
        gen_base_apis.doit(opts)
    if opts.adv_cxx_api == True or opts.all == True:
        gen_adv_cxx_api.doit(opts)
    if opts.adv_c_api == True or opts.all == True:
        gen_adv_c_api.doit(opts)
    if opts.tests == True or opts.all == True:
        gen_tests.doit(opts)
    if opts.src == True or opts.all == True:
        gen_src.doit(opts)
    if opts.sleef == True or opts.all == True:
        get_sleef_code.doit(opts)
    if opts.scalar_utilities == True or opts.all == True:
        gen_scalar_utilities.doit(opts)
    if opts.friendly_but_not_optimized == True or opts.all == True:
        gen_friendly_but_not_optimized.doit(opts)
    gen_modules.doit(opts) # this must be here after all NSIMD
    if opts.doc == True or opts.all == True:
        gen_doc.doit(opts)

if __name__ == '__main__':
    main()


================================================
FILE: egg/modules/fixed_point/gen_doc.py
================================================
# Use utf-8 encoding
# -*- coding: utf-8 -*-

# Copyright (c) 2019 Agenium Scale
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software
without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

import os
import platform
import io
import sys
import subprocess
import collections
import re
import string

import common
import operators

# ------------------------------------------------------------------------------

def gen_overview(opts):
    # Write the fixed_point module overview markdown page.
    # NOTE(review): several C++ template argument lists in the markdown
    # below look truncated (e.g. "template struct pack;") in this copy of
    # the file -- confirm against upstream.
    filename = common.get_markdown_file(opts, 'overview', 'fixed_point')
    with common.open_utf8(opts, filename) as fout:
        fout.write('''
# NSIMD fixed point module

## Description

This module implements a fixed-point numbers support for the `nsimd` library.
Fixed-point numbers are integer types used to represent decimal numbers. A
number `lf` of bits are used to encode its integer part, and `rt` bits are
used to encode its fractional part.

The fixed_point module uses the templated type `nsimd::fixed_point::fp_t` to
represent a fixed_point number. All the basic floating-point arithmetic
operaors have been defined, therefore fp_t elements can be manipulated as
normal numbers.
The fixed_point module will use a `i8`, `i16`, or `i32` integer type for
storage, depending on the value of `lf + 2 * rt`.

All the functions of the module are under the namespace
`nsimd::fixed_point`, and match the same interface than `nsimd` C++ .

The `fp_t` struct type is defined in `fixed.hpp`, and the associated simd
`fpsimd_t` struct type are defined in `simd.hpp`.

The modules redefines the `nsimd` pack type for fixed-point numbers,
templated with `lf` and `rt` :

```C++
namespace nsimd {
namespace fixed_point {
template struct pack;
} // namespace fixed_point
} // namespace nsimd
```

Then, the pack can be manipulated as an `nsimd` pack like other scalar types.

## Compatibility

The fixed point module is a C++ only API, compatible with the C++98
standard. It has the same compilers and hardware support than the main
`nsimd` API (see the [API index](index.md)).

## Example

Here is a minimal example([main.cpp](../../examples/module_fixed_point.cpp)):

@[INCLUDE_CODE:L21:L61](../../examples/module_fixed_point.cpp)

To test with avx2 run :
```bash
export NSIMD_ROOT=
g++ -o main -I$NSIMD_ROOT/include -mavx2 -DNSIMD_AVX2 main.cpp
./main
```

The console output will look like this :
```console
$>./main
1.35938 | -0.421875 | 0.9375
1.13281 | 1.19531 | 2.32812
1.64844 | -1.21094 | 0.4375
-0.660156 | 1.07422 | 0.414062
-0.890625 | 0.214844 | -0.675781
-0.0898438 | 0.515625 | 0.425781
-0.539062 | 0.0546875 | -0.484375
1.80859 | 1.66406 | 3.47266
```
''')

# Markdown skeleton of one operator's documentation page; filled by gen_doc.
api_template = '''\
# {full_name}

{desc}

## Template parameter type for T:

When using the following typedef :
```c++
typedef nsimd::fixed_point::fp_t fp_t
```
The T template parameter is one of the following types depending on the
operator:
- `set1`, `loadu` and `loada`:
```c++
nsimd::fixed_point::pack
```
- `loadlu`, `loadla`:
```c++
nsimd::fixed_point::packl
```
- Other operators:
```c++
nsimd::fixed_point::fp_t
```

## C++ API

```c++
{decl}
```
'''

# One C++ declaration line inside the API code block.
decl_template = '''\
template {ret}{op}({args});\n\n'''

# -----------------------------------------------------------------------------

def get_type(param, return_typ=False):
    # Map an operator signature letter to the C++ type shown in the doc;
    # `return_typ` selects the by-value form over the const-ref form.
    # Returns None for unknown letters.
    if param == '_':
        return 'void'
    elif param == '*':
        return 'typename T::value_type *'
    elif param == 'c*':
        return 'const typename T::value_type *'
    elif param == 's':
        return 'typename T::value_type'
    elif param in 'v':
        # NOTE(review): `param in 'v'` also matches the empty string;
        # presumably `param == 'v'` was meant -- confirm.
        return 'pack' if return_typ else 'const pack &'
    elif param == 'l':
        return 'packl' if return_typ else 'const packl &'
    elif param == 'p':
        return 'int '
    else:
        return None

# -----------------------------------------------------------------------------

def gen_decl(op):
    # Build the C++ declaration(s) of operator `op` for its doc page:
    # the named function, plus the overloaded operator when one exists,
    # wrapped in the nsimd::fixed_point namespace.
    sig = '{}{} {{}}({});'.format(
              'template ' \
              if 'v' not in op.params[1:] and \
                 'l' not in op.params[1:] else '',
              get_type(op.params[0], True),
              ', '.join(['{} {}'.format(
                  get_type(op.params[i + 1]), common.get_arg(i)) \
                  for i in range(len(op.params[1:]))])
          )
    ret = 'namespace nsimd {\n' \
          'namespace fixed_point {\n\n' + sig.format(op.name) + '\n\n'
    if op.cxx_operator != None:
        ret += sig.format('operator' + op.cxx_operator) + '\n\n'
    ret += '} // namespace fixed_point\n' \
           '} // namespace nsimd'
    return ret

# -----------------------------------------------------------------------------

def gen_api(opts, op_list):
    # Write the fixed_point API index page: operators grouped by category,
    # each linking to its own page.
    api = dict()
    for _, operator in operators.operators.items():
        if operator.name not in op_list:
            continue
        for c in operator.categories:
            if c not in api:
                api[c] = [operator]
            else:
                api[c].append(operator)
    filename = common.get_markdown_file(opts, 'api', 'fixed_point')
    with common.open_utf8(opts, filename) as fout:
        fout.write('''# NSIMD fixed point API\n''')
        for c, ops in api.items():
            if len(ops) == 0:
                continue
            fout.write('\n## {}\n\n'.format(c.title))
            for op in ops:
                fout.write('- [{} ({})](module_fixed_point_api_{}.md)\n'. \
                           format(op.full_name, op.name,
                                  common.to_filename(op.name)))

# -----------------------------------------------------------------------------

def gen_doc(opts, op_list):
    # Write one markdown page per operator in `op_list` using api_template.
    for _, op in operators.operators.items():
        if op.name not in op_list:
            continue
        filename = common.get_markdown_api_file(opts, op.name, 'fixed_point')
        with common.open_utf8(opts, filename) as fout:
            fout.write(api_template.format(full_name=op.full_name,
                                           desc=op.desc,
                                           decl=gen_decl(op)))

# -----------------------------------------------------------------------------

def doit(opts, op_list):
    # Entry point: generate overview, API index and per-operator pages.
    common.myprint(opts, 'Generating doc for module fixed_point')
    gen_overview(opts)
    gen_api(opts, op_list)
    gen_doc(opts, op_list)


================================================
FILE: egg/modules/fixed_point/gen_tests.py
================================================
# Copyright (c) 2019 Agenium Scale
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
import os import sys import common # ------------------------------------------------------------------------------- def get_filename(opts, op, lf, rt): tests_dir = os.path.join(opts.tests_dir, "modules/fixed_point") common.mkdir_p(tests_dir) filename = os.path.join(tests_dir, '{}.fp_{}_{}.cpp'.format(op, lf, rt)) if os.path.exists(filename): os.remove(filename) if common.can_create_filename(opts, filename): return filename else: return None includes = """ #include #include #include #include #include #include #include #include """ arithmetic_aliases = """ typedef nsimd::fixed_point::fp_t<{lf}, {rt}> fp_t; typedef nsimd::fixed_point::pack vec_t; typedef nsimd::fixed_point::packl vecl_t; typedef nsimd::fixed_point::pack::value_type raw_t; typedef nsimd::fixed_point::packl::value_type log_t; const size_t v_size = (size_t) nsimd::fixed_point::len(fp_t()); """ # ------------------------------------------------------------------------------ # Utility functions check = """ #define CHECK(a) {{ \\ if (!(a)) {{ \\ fprintf(stderr, "ERROR: " #a ":%s: %d\\n", __FILE__, __LINE__); \\ fflush(stderr); \\ exit(EXIT_FAILURE); \\ }} \\ }} """ limits = """ template static double __get_numeric_precision() { return (double)ldexpf(1.0, -(int)rt); } """ comparison_fp = """ template bool __compare_values(nsimd::fixed_point::fp_t val, double ref){ return nsimd_scalar_abs_f64(double(val) - ref) <= __get_numeric_precision(); } """ comparison_log = """ template bool __check_logical_val(T val, nsimd::fixed_point::fp_t v0, nsimd::fixed_point::fp_t v1) {{ return (((v0._raw {op_val} v1._raw) && (val != 0)) || (!(v0._raw {op_val} v1._raw) && (val == 0))); }} """ gen_random_val = """ template nsimd::fixed_point::fp_t __gen_random_val() {{ float tmp = (float) rand() / (float) RAND_MAX; return nsimd::fixed_point::fp_t(0.5f * tmp + 1.0f); }} """ # ------------------------------------------------------------------------------ # Template for arithmetic binary operators arithmetic_test_template = """ 
{includes} // ----------------------------------------------------------------------------- {decls} // ----------------------------------------------------------------------------- int main() {{ typedef nsimd::fixed_point::fp_t<{lf}, {rt}> fp_t; typedef nsimd::fixed_point::pack vec_t; const size_t v_size = (size_t) nsimd::fixed_point::len(fp_t()); // FP vectors fp_t *tab0_fp = (fp_t *) malloc(v_size * sizeof(fp_t)); fp_t *tab1_fp = (fp_t *) malloc(v_size * sizeof(fp_t)); fp_t *res_fp = (fp_t *) malloc(v_size * sizeof(fp_t)); // Floating point equivalent double *tab0_f = (double *) malloc(v_size * sizeof(double)); double *tab1_f = (double *) malloc(v_size * sizeof(double)); double *res_f = (double *) malloc(v_size * sizeof(double)); for (size_t i = 0; i < v_size; i++) {{ tab0_fp[i] = __gen_random_val<{lf}, {rt}>(); tab1_fp[i] = __gen_random_val<{lf}, {rt}>(); tab0_f[i] = double(tab0_fp[i]); tab1_f[i] = double(tab1_fp[i]); }} vec_t v0_fp = nsimd::fixed_point::loadu(tab0_fp); vec_t v1_fp = nsimd::fixed_point::loadu(tab1_fp); vec_t vres_fp = nsimd::fixed_point::{op_name}(v0_fp, v1_fp); nsimd::fixed_point::storeu(res_fp, vres_fp); for (size_t i = 0; i < v_size; i++) {{ res_f[i] = tab0_f[i] {op_val} tab1_f[i]; }} for(size_t i = 0; i < v_size; i++) {{ CHECK(__compare_values(res_fp[i], res_f[i])); }} fprintf(stdout, \"test of {op_name} over fp_t<{lf},{rt}>... 
OK\\n\"); return EXIT_SUCCESS; }} """ arithmetic_ops = [("add", "+"), ("sub", "-"), ("mul", "*"), ("div","/")] def gen_arithmetic_ops_tests(lf, rt, opts): for op_name, op_val in arithmetic_ops: decls = check + limits + comparison_fp + gen_random_val content_src = arithmetic_test_template.format( op_name=op_name, op_val=op_val, lf=lf, rt=rt, includes=includes, decls=decls) filename = get_filename(opts, op_name, lf, rt) if filename == None: continue with common.open_utf8(opts, filename) as fp: fp.write(content_src) common.clang_format(opts, filename) # ------------------------------------------------------------------------------ # Min max operators template minmax_test_template = """ {includes} #define op_min(a, b) ((a) < (b) ?(a) : (b)) #define op_max(a, b) ((a) > (b) ?(a) : (b)) // ----------------------------------------------------------------------------- {decls} // ----------------------------------------------------------------------------- int main() {{ typedef nsimd::fixed_point::fp_t<{lf}, {rt}> fp_t; typedef nsimd::fixed_point::pack vec_t; const size_t v_size = (size_t) nsimd::fixed_point::len(fp_t()); // FP vectors fp_t *tab0_fp = (fp_t *) malloc(v_size * sizeof(fp_t)); fp_t *tab1_fp = (fp_t *) malloc(v_size * sizeof(fp_t)); fp_t *res_fp = (fp_t *) malloc(v_size * sizeof(fp_t)); int *res_ref = (int *) malloc(v_size * sizeof(int)); for (size_t i = 0; i < v_size; i++) {{ tab0_fp[i] = __gen_random_val<{lf}, {rt}>(); tab1_fp[i] = __gen_random_val<{lf}, {rt}>(); }} vec_t v0_fp = nsimd::fixed_point::loadu(tab0_fp); vec_t v1_fp = nsimd::fixed_point::loadu(tab1_fp); vec_t vres_fp = nsimd::fixed_point::{op_name}(v0_fp, v1_fp); nsimd::fixed_point::storeu(res_fp, vres_fp); for (size_t i = 0; i < v_size; i++) {{ res_ref[i] = op_{op_name}((int) tab0_fp[i]._raw, (int) tab1_fp[i]._raw); }} for(size_t i = 0; i < v_size; i++) {{ CHECK(res_fp[i]._raw == res_ref[i]); }} fprintf(stdout, \"test of {op_name} over fp_t<{lf},{rt}>... 
OK\\n\"); return EXIT_SUCCESS; }} """ minmax_ops = ["min", "max"] def gen_minmax_ops_tests(lf, rt, opts): for op_name in minmax_ops: decls = check + limits + comparison_fp + gen_random_val content_src = minmax_test_template.format( op_name=op_name, lf=lf, rt=rt, includes=includes, decls=decls) filename = get_filename(opts, op_name, lf, rt) if filename == None: continue with common.open_utf8(opts, filename) as fp: fp.write(content_src) common.clang_format(opts, filename) # ------------------------------------------------------------------------------ # Ternary ops (FMA and co) ternary_ops_template = """ {includes} // ----------------------------------------------------------------------------- {decls} // ----------------------------------------------------------------------------- int main() {{ typedef nsimd::fixed_point::fp_t<{lf}, {rt}> fp_t; typedef nsimd::fixed_point::pack vec_t; const size_t v_size = (size_t) nsimd::fixed_point::len(fp_t()); // FP vectors fp_t *tab0_fp = (fp_t *) malloc(v_size * sizeof(fp_t)); fp_t *tab1_fp = (fp_t *) malloc(v_size * sizeof(fp_t)); fp_t *tab2_fp = (fp_t *) malloc(v_size * sizeof(fp_t)); fp_t *res_fp = (fp_t *) malloc(v_size * sizeof(fp_t)); // Floating point equivalent double *tab0_f = (double *) malloc(v_size * sizeof(double)); double *tab1_f = (double *) malloc(v_size * sizeof(double)); double *tab2_f = (double *) malloc(v_size * sizeof(double)); double *res_f = (double *) malloc(v_size * sizeof(double)); for (size_t i = 0; i < v_size; i++) {{ tab0_fp[i] = __gen_random_val<{lf}, {rt}>(); tab1_fp[i] = __gen_random_val<{lf}, {rt}>(); tab2_fp[i] = __gen_random_val<{lf}, {rt}>(); tab0_f[i] = double(tab0_fp[i]); tab1_f[i] = double(tab1_fp[i]); tab2_f[i] = double(tab2_fp[i]); }} vec_t v0_fp = nsimd::fixed_point::loadu(tab0_fp); vec_t v1_fp = nsimd::fixed_point::loadu(tab1_fp); vec_t v2_fp = nsimd::fixed_point::loadu(tab2_fp); vec_t vres_fp = nsimd::fixed_point::{op_name}(v0_fp, v1_fp, v2_fp); nsimd::fixed_point::storeu(res_fp, 
vres_fp); for(size_t i = 0; i < v_size; i++) {{ const double a = tab0_f[i]; const double b = tab1_f[i]; const double c = tab2_f[i]; {check_statement} }} for(size_t i = 0; i < v_size; i++) {{ CHECK(__compare_values(res_fp[i], res_f[i])); }} fprintf(stdout, \"test of {op_name} over fp_t<{lf},{rt}>... OK\\n\"); return EXIT_SUCCESS; }} """ ternary_ops = [("fma", "res_f[i] = (a * b) + c;")] def gen_ternary_ops_tests(lf, rt, opts): for op_name, statement in ternary_ops: decls = check + limits + comparison_fp + gen_random_val content_src = ternary_ops_template.format( op_name=op_name, check_statement=statement.format(lf=lf, rt=rt), lf=lf, rt=rt,includes=includes, decls=decls) filename = get_filename(opts, op_name, lf, rt) if filename == None: continue with common.open_utf8(opts, filename) as fp: fp.write(content_src) common.clang_format(opts, filename) # ------------------------------------------------------------------------------ # Template for math operators rec_reference = """ // Rec operator on floating points (avoids to write a particular test for rec) static inline double rec(const double x) {{ return 1.0 / x; }} """ math_test_template = """ {includes} // ----------------------------------------------------------------------------- {decls} // ----------------------------------------------------------------------------- int main() {{ typedef nsimd::fixed_point::fp_t<{lf}, {rt}> fp_t; typedef nsimd::fixed_point::pack vec_t; const size_t v_size = (size_t) nsimd::fixed_point::len(fp_t()); // FP vectors fp_t *tab0_fp= (fp_t *) malloc(v_size * sizeof(fp_t)); fp_t *res_fp = (fp_t *) malloc(v_size * sizeof(fp_t)); // Floating point equivalent double *tab0_f = (double *) malloc(v_size * sizeof(double)); double *res_f = (double *) malloc(v_size * sizeof(double)); for (size_t i = 0; i < v_size; i++) {{ tab0_fp[i] = __gen_random_val<{lf}, {rt}>(); tab0_f[i] = double(tab0_fp[i]); }} vec_t v0_fp = nsimd::fixed_point::loadu(tab0_fp); vec_t vres_fp = 
nsimd::fixed_point::{op_name}(v0_fp); nsimd::fixed_point::storeu(res_fp, vres_fp); for (size_t i = 0; i < v_size; i++) {{ res_f[i] = {ref_op_name}(tab0_f[i]); }} for(size_t i = 0; i < v_size; i++) {{ CHECK(__compare_values(res_fp[i], res_f[i])); }} fprintf(stdout, \"test of {op_name} over fp_t<{lf},{rt}>... OK\\n\"); return EXIT_SUCCESS; }} """ math_ops = ["rec", "abs"] def gen_math_functions_tests(lf, rt, opts): for op_name in math_ops: decls = check + limits + comparison_fp + gen_random_val if op_name == "rec": decls += rec_reference ref_op_name = 'rec' else: ref_op_name = 'nsimd_scalar_abs_f64' content_src = math_test_template.format(op_name=op_name, lf=lf, rt=rt, ref_op_name=ref_op_name, includes=includes, decls=decls) filename = get_filename(opts, op_name, lf, rt) if filename == None: continue with common.open_utf8(opts, filename) as fp: fp.write(content_src) common.clang_format(opts, filename) # ------------------------------------------------------------------------------ # Comparison operators comparison_test_template = """ {includes} // ----------------------------------------------------------------------------- {decls} // ----------------------------------------------------------------------------- int main(){{ typedef nsimd::fixed_point::fp_t<{lf}, {rt}> fp_t; typedef nsimd::fixed_point::pack vec_t; typedef nsimd::fixed_point::packl vecl_t; typedef nsimd::fixed_point::packl::value_type log_t; const size_t v_size = (size_t) nsimd::fixed_point::len(fp_t()); // FP vectors fp_t *tab0_fp = (fp_t *) malloc(v_size * sizeof(fp_t)); fp_t *tab1_fp = (fp_t *) malloc(v_size * sizeof(fp_t)); log_t *resl_fp = (log_t *) malloc(v_size * sizeof(log_t)); for(size_t i = 0; i < v_size; i++) {{ tab0_fp[i] = __gen_random_val<{lf}, {rt}>(); tab1_fp[i] = __gen_random_val<{lf}, {rt}>(); }} // Be sure there is at least one equality to test all the cases. 
tab0_fp[0] = tab1_fp[0]; vec_t v0_fp = nsimd::fixed_point::loadu(tab0_fp); vec_t v1_fp = nsimd::fixed_point::loadu(tab1_fp); vecl_t vres_fp = nsimd::fixed_point::{op_name}(v0_fp, v1_fp); nsimd::fixed_point::storelu(resl_fp, vres_fp); for(size_t i = 0; i < v_size; i++) {{ CHECK((__check_logical_val( resl_fp[i], tab0_fp[i], tab1_fp[i]))); }} fprintf(stdout, \"test of {op_name} over fp_t<{lf},{rt}>... OK\\n\"); return EXIT_SUCCESS; }} """ comparison_ops = [("eq","=="), ("ne","!="), ("le","<="), ("lt","<"), ("ge",">="), ("gt",">")] def gen_comparison_tests(lf, rt, opts): for op_name, op_val in comparison_ops: decls = check + limits + comparison_log.format(op_val=op_val) + gen_random_val content_src = comparison_test_template.format( op_name=op_name, op_val=op_val, lf=lf, rt=rt, includes=includes, decls=decls) filename = get_filename(opts, op_name, lf, rt) if filename == None: continue with common.open_utf8(opts, filename) as fp: fp.write(content_src) common.clang_format(opts, filename) # ------------------------------------------------------------------------------ # Bitwise binary operators bitwise_binary_test_template = """ {includes} #include // ----------------------------------------------------------------------------- {decls} // ----------------------------------------------------------------------------- int main() {{ typedef nsimd::fixed_point::fp_t<{lf}, {rt}> fp_t; typedef nsimd::fixed_point::pack{l} vec{l}_t; typedef nsimd::fixed_point::pack{l}::value_type raw_t; const size_t v_size = (size_t) nsimd::fixed_point::len(fp_t()); raw_t *tab0 = (raw_t *) malloc(v_size * sizeof(raw_t)); raw_t *tab1 = (raw_t *) malloc(v_size * sizeof(raw_t)); raw_t *res = (raw_t *) malloc(v_size * sizeof(raw_t)); for(size_t i = 0; i < v_size; i++) {{ tab0[i] = {rand_statement} tab1[i] = {rand_statement} }} // Be sure there is at least one equality to test all the cases. 
tab0[0] = tab1[0]; vec{l}_t v0 = nsimd::fixed_point::load{l}u(tab0); vec{l}_t v1 = nsimd::fixed_point::load{l}u(tab1); vec{l}_t v_res = nsimd::fixed_point::{op_name}{term}(v0, v1); nsimd::fixed_point::store{l}u(res, v_res); for(size_t i = 0; i < v_size; i++) {{ raw_t a = tab0[i]; raw_t b = tab1[i]; raw_t c = res[i]; CHECK({test_statement}); }} fprintf(stdout, \"test of {op_name}{term} over fp_t<{lf},{rt}>... OK\\n\"); return EXIT_SUCCESS; }} """ bitwise_binary_ops = [("and", "c._raw == (a._raw & b._raw)", "c == (a & b)"), ("andnot", "c._raw == (a._raw & ~b._raw)", "c == (a & ~b)"), ("or", "c._raw == (a._raw | b._raw)", "c == (a | b)"), ("xor","c._raw == ((~a._raw & b._raw) | (a._raw & ~b._raw))", "c == ((~a & b) | (a & ~b))")] def gen_bitwise_ops_tests(lf, rt, opts): for op_name, s0, s1 in bitwise_binary_ops: # {op}b decls = check + limits + gen_random_val content_src = bitwise_binary_test_template.format( op_name=op_name, lf=lf, rt=rt, includes=includes, decls=decls, rand_statement="__gen_random_val<{lf}, {rt}>();".format(lf=lf, rt=rt), test_statement=s0, l="", term="b") filename = get_filename(opts, op_name + "b", lf, rt) if filename != None: with common.open_utf8(opts, filename) as fp: fp.write(content_src) common.clang_format(opts, filename) # {op}l content_src = bitwise_binary_test_template.format( op_name=op_name, lf=lf, rt=rt, includes=includes, decls=decls, rand_statement="(raw_t)(rand() % 2);".format(lf=lf, rt=rt), test_statement=s1, l="l", term="l") filename = get_filename(opts, op_name + "l", lf, rt) if filename != None: with common.open_utf8(opts, filename) as fp: fp.write(content_src) common.clang_format(opts, filename) # ------------------------------------------------------------------------------ # Bitwise unary operators bitwise_unary_test_template = """ {includes} // ----------------------------------------------------------------------------- {decls} // ----------------------------------------------------------------------------- int main() {{ 
typedef nsimd::fixed_point::fp_t<{lf}, {rt}> fp_t; typedef nsimd::fixed_point::pack{l} vec{l}_t; typedef nsimd::fixed_point::pack{l}::value_type raw_t; const size_t v_size = (size_t) nsimd::fixed_point::len(fp_t()); raw_t *tab0 = (raw_t *) malloc(v_size * sizeof(raw_t));; raw_t *res = (raw_t *) malloc(v_size * sizeof(raw_t));; for(size_t i = 0; i < v_size; i++) {{ tab0[i] = {rand_statement} }} vec{l}_t v0 = nsimd::fixed_point::load{l}u(tab0); vec{l}_t v_res = nsimd::fixed_point::{op_name}{term}(v0); nsimd::fixed_point::store{l}u(res, v_res); for(size_t i = 0; i < v_size; i++) {{ raw_t a = tab0[i]; raw_t b = res[i]; CHECK({test_statement}); }} fprintf(stdout, \"test of {op_name}{term} over fp_t<{lf},{rt}>... OK\\n\"); return EXIT_SUCCESS; }} """ bitwise_unary_ops = [("not", "b._raw == ~a._raw", "((b == 0) && (a == 1)) | ((b == 1) && (a == 0))")] def gen_unary_ops_tests(lf, rt, opts): for op_name, s0, s1 in bitwise_unary_ops: decls = check + limits + gen_random_val # {op}b content_src = bitwise_unary_test_template.format( op_name=op_name, lf=lf, rt=rt, includes=includes, decls=decls, rand_statement="__gen_random_val<{lf}, {rt}>();".format(lf=lf, rt=rt), test_statement=s0, l="", term="b") filename = get_filename(opts, op_name + "b", lf, rt) if filename != None: with common.open_utf8(opts, filename) as fp: fp.write(content_src) common.clang_format(opts, filename) # {op}l content_src = bitwise_unary_test_template.format( op_name=op_name, lf=lf, rt=rt, includes=includes, decls=decls, rand_statement="(raw_t)(rand() % 2);".format(lf=lf, rt=rt), test_statement=s1, l="l", term="l") filename = get_filename(opts, op_name + "l", lf, rt) if filename != None: with common.open_utf8(opts, filename) as fp: fp.write(content_src) common.clang_format(opts, filename) # ----------------------------------------------------------------------------- # if_else if_else_test_template = """ {includes} // ----------------------------------------------------------------------------- {decls} // 
----------------------------------------------------------------------------- int main() {{ typedef nsimd::fixed_point::fp_t<{lf}, {rt}> fp_t; typedef nsimd::fixed_point::pack vec_t; typedef nsimd::fixed_point::packl vecl_t; typedef nsimd::fixed_point::packl::value_type log_t; const size_t v_size = (size_t) nsimd::fixed_point::len(fp_t()); fp_t *tab0_fp = (fp_t *) malloc(v_size * sizeof(fp_t)); fp_t *tab1_fp = (fp_t *) malloc(v_size * sizeof(fp_t)); fp_t *res_fp = (fp_t *) malloc(v_size * sizeof(fp_t)); log_t *mask = (log_t *) malloc(v_size * sizeof(log_t)); for(size_t i = 0; i < v_size; i++) {{ tab0_fp[i] = __gen_random_val<{lf}, {rt}>(); tab1_fp[i] = __gen_random_val<{lf}, {rt}>(); mask[i] = (log_t) (rand() % 2); }} vec_t v0 = nsimd::fixed_point::loadu(tab0_fp); vec_t v1 = nsimd::fixed_point::loadu(tab1_fp); vecl_t vl = nsimd::fixed_point::loadlu(mask); vec_t v_res = nsimd::fixed_point::if_else1(vl, v0, v1); nsimd::fixed_point::storeu(res_fp, v_res); for(size_t i = 0; i < v_size; i++) {{ fp_t ref = mask[i] ? tab0_fp[i] : tab1_fp[i]; CHECK(ref._raw == res_fp[i]._raw); }} fprintf(stdout, \"test of if_else1 over fp_t<{lf},{rt}>... 
OK\\n\"); return EXIT_SUCCESS; }} """ def gen_if_else_tests(lf, rt, opts): decls = check + limits + comparison_fp + gen_random_val content_src = if_else_test_template.format( lf=lf, rt=rt, includes=includes, decls=decls) filename = get_filename(opts, "if_else", lf, rt) if filename == None: return with common.open_utf8(opts, filename) as fp: fp.write(content_src) common.clang_format(opts, filename) # ------------------------------------------------------------------------------- load_ops = ["loadu", "loadlu", "loada", "loadla"] store_ops = ["storeu", "storelu", "storea", "storela"] # ------------------------------------------------------------------------------- # Entry point lf_vals = ["4", "8", "16"] rt_vals = ["1", "2", "3", "4", "5", "6", "7", "8"] def doit(opts): common.myprint(opts, 'Generating tests for module fixed_point') for lf in lf_vals: for rt in rt_vals: ## Arithmetic operators gen_arithmetic_ops_tests(lf, rt, opts) ## Min and max operators gen_minmax_ops_tests(lf, rt, opts) ## Ternary_operators gen_ternary_ops_tests(lf, rt, opts) ## Math functions gen_math_functions_tests(lf, rt, opts) ## Comparison operators gen_comparison_tests(lf, rt, opts) ## Bitwise binary operators gen_bitwise_ops_tests(lf, rt, opts) ## Bitwise unary operators gen_unary_ops_tests(lf, rt, opts) ## If_else gen_if_else_tests(lf, rt, opts) ================================================ FILE: egg/modules/fixed_point/hatch.py ================================================ # Copyright (c) 2019 Agenium Scale # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal # in the Software without restriction, including without limitation the rights # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell # copies of the Software, and to permit persons to whom the Software is # furnished to do so, subject to the following conditions: # # The above copyright notice and 
this permission notice shall be included in all # copies or substantial portions of the Software. # # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. ## ----------------------------------------------------------------------------- op_list = [ 'len', 'set1', 'loadu', 'loada', 'loadlu', 'loadla', 'storeu', 'storea', 'storelu', 'storela', 'add', 'sub', 'mul', 'div', 'fma', 'min', 'max', 'abs', 'rec', 'eq', 'ne', 'le', 'lt', 'ge', 'gt', 'ifelse1', 'andb', 'andnotb', 'notb', 'orb', 'xorb', 'andl', 'andnotl', 'notl', 'orl', 'xorl' ] # ----------------------------------------------------------------------------- # Imports import modules.fixed_point.gen_tests import modules.fixed_point.gen_doc # ----------------------------------------------------------------------------- def name(): return 'Fixed-point arithmetic' def desc(): return '''This module provides vectorized fixed-point arithmetic through a C++98 API. The programmer can choose the integral type and the place of the coma for representing its fixed-point numbers. 
A number of operators are also provided.''' def doc_menu(): return {'Overview': 'overview', 'API reference': 'api'} # ----------------------------------------------------------------------------- # Entry point def doit(opts): if opts.tests == True or opts.all == True: modules.fixed_point.gen_tests.doit(opts) if opts.doc == True or opts.all == True: modules.fixed_point.gen_doc.doit(opts, op_list) ================================================ FILE: egg/modules/memory_management/hatch.py ================================================ # Copyright (c) 2020 Agenium Scale # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal # in the Software without restriction, including without limitation the rights # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell # copies of the Software, and to permit persons to whom the Software is # furnished to do so, subject to the following conditions: # # The above copyright notice and this permission notice shall be included in all # copies or substantial portions of the Software. # # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. import common # ----------------------------------------------------------------------------- def name(): return 'Memory management' def desc(): return '''This module provides C-style memory managmenent functions: malloc, calloc, free, copy to/from devices, etc... 
Its purpose is to facilitate the use of data buffers in a portable way for systems with CPUs only and for systems with CPUs and GPUs.''' def doc_menu(): return dict() # ----------------------------------------------------------------------------- def doit(opts): common.myprint(opts, 'Generating module memory_management') if not opts.doc: return filename = common.get_markdown_file(opts, 'overview', 'memory_management') if not common.can_create_filename(opts, filename): return with common.open_utf8(opts, filename) as fout: fout.write('''# Overview This module provides C-style memory managmenent functions. Its purpose is not to become a fully feature container library. It is to provide portable malloc, memcpy and free functions with a little helpers to copy data from and to the devices. # API reference ## Equivalents of malloc, calloc, memcpy and free for devices Note that the below functions simply wraps the corresponding C functions when targeting a CPU. - `template T *device_malloc(size_t sz)`{br} Allocates `sz * sizeof(T)` bytes of memory on the device. On error NULL is returned. - `template T *device_calloc(size_t sz)`{br} Allocates `sz * sizeof(T)` bytes of memory on the device and set the allocated memory to zero. On error NULL is returned. - `template void device_free(T *ptr)`{br} Free the memory pointed to by the given pointer. - `template void copy_to_device(T *device_ptr, T *host_ptr, size_t sz)`{br} Copy data to from host to device. - `template void copy_to_host(T *host_ptr, T *device_ptr, size_t sz)`{br} Copy data to from device to host. - `#define nsimd_fill_dev_mem_func(func_name, expr)`{br} Create a device function that will fill data with `expr`. To call the created function one simply does `func_name(ptr, sz)`. The `expr` argument represents some simple C++ expression that can depend only on `i` the i-th element in the vector as shown in the example below. 
```c++ nsimd_fill_dev_mem_func(prng, ((i * 1103515245 + 12345) / 65536) % 32768) int main() {{ prng(ptr, 1000); return 0; }} ``` ## Pairs of pointers It is often useful to allocate a pair of data buffers: one on the host and one on the devices to perform data transfers. The below functions provides quick ways to malloc, calloc, free and memcpy pointers on host and devices at once. Note that when targeting CPUs the pair of pointers is reduced to one pointer that ponit the a single data buffer in which case memcpy's are not performed. Note also that there is no implicit synchronization of data between both data buffers. It is up to the programmer to triggers memcpy's. ```c++ template struct paired_pointers_t {{ T *device_ptr, *host_ptr; size_t sz; }}; ``` Members of the above structure are not to be modified but can be passed as arguments for reading/writing data from/to memory they point to. - `template paired_pointers_t pair_malloc(size_t sz)`{br} Allocate `sz * sizeof(T)` bytes of memory on the host and on the device. If an error occurs both pointers are NULL. - `template paired_pointers_t pair_malloc_or_exit(size_t sz)`{br} Allocate `sz * sizeof(T)` bytes of memory on the host and on the device. If an error occurs, prints an error message on stderr and exit(3). - `template paired_pointers_t pair_calloc(size_t sz)`{br} Allocate `sz * sizeof(T)` bytes of memory on the host and on the device. Write both data buffers with zeros. If an error occurs both pointers are NULL. - `template paired_pointers_t pair_calloc_or_exit(size_t sz)`{br} Allocate `sz * sizeof(T)` bytes of memory on the host and on the device. Write both data buffers with zeros. If an error occurs, prints an error message on stderr and exit(3). - `template void pair_free(paired_pointers_t p)`{br} Free data buffers on the host and the device. - `template void copy_to_device(paired_pointers_t p)`{br} Copy data from the host buffer to its corresponding device buffer. 
- `template void copy_to_host(paired_pointers_t p)`{br} Copy data from the device buffer to its corresponding host buffer. '''.format(br=' ')) ================================================ FILE: egg/modules/random/hatch.py ================================================ # Copyright (c) 2020 Agenium Scale # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal # in the Software without restriction, including without limitation the rights # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell # copies of the Software, and to permit persons to whom the Software is # furnished to do so, subject to the following conditions: # # The above copyright notice and this permission notice shall be included in all # copies or substantial portions of the Software. # # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. 
import os import common import collections # ----------------------------------------------------------------------------- rand_functions = list() class MAddToRands(type): def __new__(cls, name, bases, dct): ret = type.__new__(cls, name, bases, dct) if name != 'Rand': rand_functions.append(ret()) return ret class Rand(object, metaclass=MAddToRands): def gen_function_name(self, nwords, word_size, nrounds): return '{}_{}x{}_{}'.format(self.name, nwords, word_size, nrounds) def gen_headers(self, opts): res = '' for word_size, nwords_nrounds in self.wordsize_nwords_nrounds.items(): for nwords, list_nrounds in nwords_nrounds.items(): for nrounds in list_nrounds: res += self.gen_signature(nwords, word_size, nrounds)+';' return res def gen_tests(self, opts, nrounds, word_size, nwords): key_size = self.get_key_size(nwords) key_initialization = 'nsimd::packx{} key_pack;'. \ format(key_size, word_size) for i in range (0, key_size): key_initialization += ''' i = {i}; for (int j = 0; j < len; j++) {{ key[j + i * len] = (u{word_size})(j + i * len); }} key_pack.v{i} = nsimd::loadu(&key[i*len], u{word_size}()); '''.format(i=i, word_size=word_size) input_initilization = \ 'memset(in, 0, sizeof(u{}) * {} * ulen);\n'. \ format(word_size, nwords) for i in range (0, nwords): input_initilization += 'in_pack.v{} = nsimd::pack(0);'. 
\ format(i, word_size) compare = '' for i in range (0, nwords): compare += ''' if (i=={i}) {{ nsimd::storeu(out_nsimd, out_pack.v{i}); }} '''.format(i=i) l = 'll' if word_size == 64 else '' cast = '(nsimd_ulonglong)' if word_size == 64 else '' res = ''' #include #include "reference.hpp" #include #ifdef NSIMD_LONGLONG_IS_EXTENSION #if defined(NSIMD_IS_GCC) || defined(NSIMD_IS_CLANG) #pragma GCC diagnostic ignored "-Wformat" #endif #endif int main() {{ int res = EXIT_SUCCESS; printf("Test of {function_name} ...\\n"); nsimd::packx{nwords} in_pack; nsimd::packx{nwords} out_pack; const int len = nsimd::len(u{word_size}()); const unsigned int ulen = (unsigned int)len; u{word_size} *key = (u{word_size}*)malloc(ulen * sizeof(u{word_size}) * {key_size}); u{word_size} *in = (u{word_size}*)malloc(ulen * sizeof(u{word_size}) * {nwords}); u{word_size} *out = (u{word_size}*)malloc(ulen * sizeof(u{word_size}) * {nwords}); u{word_size} *out_nsimd = (u{word_size}*)malloc(ulen * sizeof(u{word_size})); tab{word_size}x{nwords}_t in_ref; tab{word_size}x{key_size}_t key_ref; tab{word_size}x{nwords}_t out_ref; int i; // Keys {key_initialization} {input_initilization} for (int cpt=0; cpt < 100000; ++cpt) {{ out_pack = nsimd::random::{function_name}(in_pack, key_pack); for (int i=0; i a, pack b, pack *low, pack *high) { nsimd::packx2 a64 = nsimd::upcvt(nsimd::packx2(), a); nsimd::packx2 b64 = nsimd::upcvt(nsimd::packx2(), b); nsimd::packx2 product; product.v0 = a64.v0 * b64.v0; product.v1 = a64.v1 * b64.v1; *high = nsimd::downcvt(nsimd::pack(), product.v0 >> 32, product.v1 >> 32); *low = nsimd::downcvt(nsimd::pack(), product.v0, product.v1); } #else void mulhilo32(pack a, pack b, pack *low, pack *high) { nsimd::pack ah = nsimd::shr(a, 16); nsimd::pack bh = nsimd::shr(b, 16); nsimd::pack al = nsimd::shr(nsimd::shl(a, 16), 16); nsimd::pack bl = nsimd::shr(nsimd::shl(b, 16), 16); nsimd::pack ahbh = ah * bh; nsimd::pack ahbl = ah * bl; nsimd::pack albh = al * bh; nsimd::pack albl = al * bl; 
nsimd::pack tmp1 = nsimd::shl(albh, 16); nsimd::pack tmp2 = nsimd::shl(ahbl, 16); nsimd::pack tmp3 = tmp1 + tmp2; nsimd::pack _1 = nsimd::set1(nsimd::pack(), 1u); nsimd::pack _0 = nsimd::set1(nsimd::pack(), 0u); nsimd::pack carry = nsimd::if_else1((tmp3 < tmp1) || (tmp3 < tmp2), _1, _0); *low = tmp3 + albl; carry = carry + nsimd::if_else1((*low < tmp3) || (*low < albl), _1, _0); *high = ahbh + nsimd::shr(albh, 16) + nsimd::shr(ahbl, 16) + carry; } #endif #if 0 void mulhilo64(pack a, pack b, pack *low, pack *high) { u64 a_buf[8]; u64 b_buf[8]; u64 low_buf[8]; u64 high_buf[8]; nsimd::storeu(a_buf, a); nsimd::storeu(b_buf, b); for (int i = 0; i < nsimd::len(u64()); ++i) { __uint128_t product = ((__uint128_t)a_buf[i]) * ((__uint128_t)b_buf[i]); high_buf[i] = (u64)(product >> 64); low_buf[i] = (u64)product; } *high = nsimd::loadu(high_buf, u64()); *low = nsimd::loadu(low_buf, u64()); } #else void mulhilo64(pack a, pack b, pack *low, pack *high) { nsimd::pack ah = nsimd::shr(a, 32); nsimd::pack bh = nsimd::shr(b, 32); nsimd::pack al = nsimd::shr(nsimd::shl(a, 32), 32); nsimd::pack bl = nsimd::shr(nsimd::shl(b, 32), 32); nsimd::pack ahbh = ah * bh; nsimd::pack ahbl = ah * bl; nsimd::pack albh = al * bh; nsimd::pack albl = al * bl; nsimd::pack tmp1 = nsimd::shl(albh, 32); nsimd::pack tmp2 = nsimd::shl(ahbl, 32); nsimd::pack tmp3 = tmp1 + tmp2; nsimd::pack _1 = nsimd::set1(nsimd::pack(), (u64)1); nsimd::pack _0 = nsimd::set1(nsimd::pack(), (u64)0); nsimd::pack carry = nsimd::if_else1((tmp3 < tmp1) || (tmp3 < tmp2), _1, _0); *low = tmp3 + albl; carry = carry + nsimd::if_else1((*low < tmp3) || (*low < albl), _1, _0); *high = ahbh + nsimd::shr(albh, 32) + nsimd::shr(ahbl, 32) + carry; } #endif ''' def gen_signature(self, nwords, word_size, nrounds): return ('nsimd::packx{nwords} {fun_name}' \ '(nsimd::packx{nwords} in, ' \ 'nsimd::packx{key_size} key)'). 
\ format(nwords = nwords, word_size = word_size, fun_name = self.gen_function_name(nwords, word_size, nrounds), key_size = self.get_key_size(nwords)) def get_key_size(self, nwords): return int(nwords/2) def gen_func(self, opts, nrounds, word_size, nwords): if nwords == 2: bump_keys_init = \ 'nsimd::pack bump = ' \ 'nsimd::set1(nsimd::pack(), {bump});'.\ format(word_size=word_size, bump = '(u64)0x9E3779B97F4A7C15ULL' \ if word_size == 64 else '(u32)0x9E3779B9U') bump_keys = 'key.v0 = key.v0 + bump;' round_init = ''' nsimd::pack mul = nsimd::set1(nsimd::pack(), {mul}); nsimd::pack high, low;'''. \ format(word_size=word_size, mul='(u64)0xD2B74407B1CE6E93ULL' \ if word_size == 64 else '(u32)0xD256D193U') round=''' mulhilo{word_size}(mul, in.v0, &low, &high); in.v0 = high ^ key.v0 ^ in.v1; in.v1 = low; '''.format(word_size=word_size) elif nwords == 4: bump_keys_init = ''' nsimd::pack bump0 = nsimd::set1(nsimd::pack(), {bump0}); nsimd::pack bump1 = nsimd::set1(nsimd::pack(), {bump1});'''.\ format(word_size=word_size, bump0 = '(u64)0x9E3779B97F4A7C15ULL' \ if word_size == 64 else '(u32)0x9E3779B9U', bump1 = '(u64)0xBB67AE8584CAA73BULL' \ if word_size == 64 else '(u32)0xBB67AE85U') bump_keys = 'key.v0 = key.v0 + bump0;\nkey.v1 = key.v1 + bump1;' round_init = ''' nsimd::pack mul0 = nsimd::set1(nsimd::pack(), {mul0}); nsimd::pack mul1 = nsimd::set1(nsimd::pack(), {mul1}); nsimd::pack low0, high0, low1, high1; '''.format(word_size=word_size, mul0='(u64)0xD2E7470EE14C6C93ULL' \ if word_size == 64 else '(u32)0xD2511F53U', mul1='(u64)0xCA5A826395121157ULL' \ if word_size == 64 else '(u32)0xCD9E8D57U') round=''' mulhilo{word_size}(mul0, in.v0, &low0, &high0); mulhilo{word_size}(mul1, in.v2, &low1, &high1); in.v0 = high1 ^ key.v0 ^ in.v1; in.v1 = low1; in.v2 = high0 ^ key.v1 ^ in.v3; in.v3 = low0;'''.format(word_size=word_size) res = self.gen_signature (nwords, word_size, nrounds) res += ' {{ nsimd::packx{} out;'.format(nwords, word_size) res += bump_keys_init res += round_init # 
Round 0: res += round; for i in range(1, nrounds): res += bump_keys res += round res+=''' return in; } ''' return res def generate(self, opts): res = self.mullohi for word_size, nwords_nrounds in self.wordsize_nwords_nrounds.items(): for nwords, list_nrounds in nwords_nrounds.items(): for nrounds in list_nrounds: res += self.gen_func(opts, nrounds, word_size, nwords) return res class ThreeFry(Rand): name = 'threefry' enums=''' enum enum_threefry32x2_rotations { Rot_32x2_0 = 13, Rot_32x2_1 = 15, Rot_32x2_2 = 26, Rot_32x2_3 = 6, Rot_32x2_4 = 17, Rot_32x2_5 = 29, Rot_32x2_6 = 16, Rot_32x2_7 = 24 }; enum enum_threefry32x4_rotations { Rot_32x4_0_0 = 10, Rot_32x4_0_2 = 26, Rot_32x4_1_0 = 11, Rot_32x4_1_2 = 21, Rot_32x4_2_0 = 13, Rot_32x4_2_2 = 27, Rot_32x4_3_0 = 23, Rot_32x4_3_2 = 5, Rot_32x4_4_0 = 6, Rot_32x4_4_2 = 20, Rot_32x4_5_0 = 17, Rot_32x4_5_2 = 11, Rot_32x4_6_0 = 25, Rot_32x4_6_2 = 10, Rot_32x4_7_0 = 18, Rot_32x4_7_2 = 20 }; enum enum_threefry64x2_rotations { Rot_64x2_0 = 16, Rot_64x2_1 = 42, Rot_64x2_2 = 12, Rot_64x2_3 = 31, Rot_64x2_4 = 16, Rot_64x2_5 = 32, Rot_64x2_6 = 24, Rot_64x2_7 = 21 }; enum enum_threefry64x4_rotations { Rot_64x4_0_0 = 14, Rot_64x4_0_2 = 16, Rot_64x4_1_0 = 52, Rot_64x4_1_2 = 57, Rot_64x4_2_0 = 23, Rot_64x4_2_2 = 40, Rot_64x4_3_0 = 5, Rot_64x4_3_2 = 37, Rot_64x4_4_0 = 25, Rot_64x4_4_2 = 33, Rot_64x4_5_0 = 46, Rot_64x4_5_2 = 12, Rot_64x4_6_0 = 58, Rot_64x4_6_2 = 22, Rot_64x4_7_0 = 32, Rot_64x4_7_2 = 32 }; ''' # Following macros should not be changed to function : gcc can't inline them rotations=''' #define SHIFT_MOD_32(x, N) ((x << (N & 31)) | (x >> ((32 - N) & 31))) #define SHIFT_MOD_64(x, N) ((x << (N & 63)) | (x >> ((64 - N) & 63))) ''' undef_macro=''' #undef SHIFT_MOD_32 #undef SHIFT_MOD_64 ''' wordsize_nwords_nrounds = {32: {2: [12, 20, 32], 4: [12, 20, 72]}, 64: {2: [13, 20, 32], 4: [12, 20, 72]}} def gen_signature(self, nwords, word_size, nrounds): return '''nsimd::packx{nwords} \ {fun_name} \ (nsimd::packx{nwords} in, \ 
nsimd::packx{nwords} key)'''. \ format(nwords=nwords, word_size = word_size, fun_name=self.gen_function_name(nwords, word_size, nrounds)) def get_key_size(self, nwords): return nwords def gen_body(self, opts, nrounds, word_size, nwords): if word_size == 32: initialize_keys = '''nsimd::pack ks{nwords} = nsimd::set1(nsimd::pack(), 0x1BD11BDAU);'''. \ format(nwords=nwords) elif word_size == 64: initialize_keys = '''nsimd::pack ks{nwords} = nsimd::set1(nsimd::pack(), (u64)0x1BD11BDAA9FC1A22ULL);'''. \ format(nwords=nwords) res = self.gen_signature(nwords, word_size, nrounds) res += ' {{ nsimd::packx{} out;'.format(nwords, word_size) res += initialize_keys initialisation_keys = ''' nsimd::pack ks{i}; ks{i} = key.v{i}; out.v{i} = in.v{i}; ks{nwords} = ks{nwords} ^ key.v{i}; out.v{i} = out.v{i} + key.v{i}; ''' for i in range(0,nwords): res += initialisation_keys.format(i=i, nwords=nwords, word_size=word_size) for i in range(0, nrounds): if nwords == 4: indexes= [1 if i%2==0 else 3, 1 if i%2==1 else 3] res += ''' out.v0 = out.v0 + out.v{index0}; out.v{index0} = SHIFT_MOD_{word_size}(out.v{index0}, Rot_{word_size}x{nwords}_{i_mod}_0); out.v{index0} = out.v{index0} ^ out.v0; out.v2 = out.v2 + out.v{index1}; out.v{index1} = SHIFT_MOD_{word_size}(out.v{index1}, Rot_{word_size}x{nwords}_{i_mod}_2); out.v{index1} = out.v{index1} ^ out.v2; '''.format(index0=indexes[0], index1=indexes[1], i_mod=i%8, word_size=word_size, nwords=nwords) elif nwords == 2: res += ''' out.v0 = out.v0 + out.v1; out.v1 = SHIFT_MOD_{word_size}(out.v1, Rot_{word_size}x{nwords}_{i_mod}); out.v1 = out.v1 ^ out.v0;'''. \ format(i_mod=i % 8, word_size=word_size, nwords=nwords) #if (i % nwords) == nwords - 1: if (i % 4) == 3: d = int(i / 4 + 1) res += '\n' for j in range(0, nwords): res += 'out.v{j} = out.v{j} + ks{calc};\n'. \ format(j=j, calc=str(int((d+j)%(nwords+1)))) res += 'out.v{n} = out.v{n} + ' \ 'nsimd::pack({d});\n'. 
\ format(d=d, n=nwords-1, word_size=word_size) res+=''' return out; } ''' return res def generate(self, opts): res = '' res += self.enums res += self.rotations for word_size, nwords_nrounds in self.wordsize_nwords_nrounds.items(): for nwords, list_nrounds in nwords_nrounds.items(): for nrounds in list_nrounds: res += self.gen_body(opts, nrounds, word_size, nwords) res += self.undef_macro return res def gen_functions(opts): ## Write source files #dirname = os.path.join(opts.include_dir, 'modules', 'random') #common.mkdir_p(dirname) #filename = os.path.join(dirname, 'functions.cpp') #print(filename) #with common.open_utf8(opts, filename) as out: # out.write('#include "functions.hpp"\n') # out.write('{}\n\n'.format(common.hbar)) # out.write(gen(opts)) # out.write('#endif\n') #common.clang_format(opts, filename) # Write headers dirname = os.path.join(opts.include_dir, 'modules', 'random') common.mkdir_p(dirname) filename = os.path.join(dirname, 'functions.hpp') with common.open_utf8(opts, filename) as out: out.write( '''#ifndef NSIMD_MODULES_RANDOM_FUNCTIONS_HPP #define NSIMD_MODULES_RANDOM_FUNCTIONS_HPP #include #include #include #ifdef NSIMD_LONGLONG_IS_EXTENSION #if defined(NSIMD_IS_GCC) /* Not emitting the warning -Wlong-long is not possible */ /* with GCC <= 12. It is a bug. A workaround is to tell GCC */ /* to consider this header file as a system header file so */ /* that all warnings are not emitted. This is not satisfying */ /* but necessary for the moment. 
*/ #pragma GCC system_header #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wlong-long" #elif defined(NSIMD_IS_CLANG) #pragma clang diagnostic push #pragma clang diagnostic ignored "-Wlong-long" #endif #endif namespace nsimd { namespace random { ''') out.write('{}\n\n'.format(common.hbar)) for func in rand_functions: out.write(func.gen_headers(opts)) out.write(func.generate(opts)) out.write( '''#ifdef NSIMD_LONGLONG_IS_EXTENSION #if defined(NSIMD_IS_GCC) #pragma GCC diagnostic pop #elif defined(NSIMD_IS_CLANG) #pragma clang diagnostic pop #endif #endif } // namespace nsimd } // namespace random #endif ''') common.clang_format(opts, filename) def gen_tests(opts): for func in rand_functions: for word_size, nwords_nrounds in func.wordsize_nwords_nrounds.items(): for nwords, list_nrounds in nwords_nrounds.items(): for nrounds in list_nrounds: # Write headers dirname = os.path.join(opts.tests_dir, 'modules', 'random') common.mkdir_p(dirname) filename = os.path.join(dirname, '{}.cpp'. \ format(func.gen_function_name(nwords, word_size, nrounds))) with common.open_utf8(opts, filename) as out: out.write(func.gen_tests(opts, nrounds, word_size, nwords)) common.clang_format(opts, filename) # ----------------------------------------------------------------------------- def name(): return 'Random number generators' def desc(): return \ 'This module define functions that generate pseudorandom numbers using' \ 'algorithms described in Parallel Random Numbers: As Easy as 1,2,3, by' \ 'John K. Salmon, Mark A. Moraes, Ron O. Dror and David E.Shaw.' def gen_doc(opts): api = '' for func in rand_functions: for word_size, nwords_nrounds in func.wordsize_nwords_nrounds.items(): for nwords, list_nrounds in nwords_nrounds.items(): for nrounds in list_nrounds: api += '- `' + func.gen_signature(nwords, word_size, nrounds) + '`; \n' api += ' Returns a random number using the ' \ '{func_name} generator\n\n'. 
\ format(func_name=func.name) res = ''' # NSIMD Random module overview {desc} Two different algorithms are proposed : threefry and philox. Both should give high quality random number. Threefry is quicker on CPU, while philox is best used on GPU. Both algorithms are counter based pseudorandom number generator, meaning that they need two parameters: - a key, each key will generate an unique sequence, - a counter, which will give the different numbers in the sequence. # NSIMD Random API reference {api} '''.format(desc = desc(), api=api) filename = common.get_markdown_file(opts, 'overview', 'random') if not common.can_create_filename(opts, filename): return with common.open_utf8(opts, filename) as fout: fout.write(res) def doc_menu(): return dict() # ----------------------------------------------------------------------------- def doit(opts): common.myprint(opts, 'Generating module random') if opts.library: gen_functions(opts) if opts.tests: gen_tests(opts) if opts.doc: gen_doc(opts) ================================================ FILE: egg/modules/spmd/hatch.py ================================================ # Copyright (c) 2020 Agenium Scale # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal # in the Software without restriction, including without limitation the rights # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell # copies of the Software, and to permit persons to whom the Software is # furnished to do so, subject to the following conditions: # # The above copyright notice and this permission notice shall be included in all # copies or substantial portions of the Software. # # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. import os import operators import common import gen_scalar_utilities import gen_tests as nsimd_tests # ----------------------------------------------------------------------------- # CUDA: default number of threads per block tpb = 128 gpu_params = '(n + {}) / {}, {}'.format(tpb, tpb - 1, tpb) # ----------------------------------------------------------------------------- # helpers def append(s1, s2): if s1 == '': return s2 if s2 == '': return s1 return s1 + ', ' + s2 k_typ = {'i': 'k_int', 'u': 'k_uint', 'f': 'k_float'} def get_signature(op): args = ', '.join(['a{}'.format(i - 1) for i in range(1, len(op.params))]) if op.output_to == common.OUTPUT_TO_SAME_SIZE_TYPES or \ op.name == 'to_mask': args = append('to_type', args) return '#define k_{}({})'.format(op.name, args) # ----------------------------------------------------------------------------- def gen_doc_overview(opts): filename = common.get_markdown_file(opts, 'overview', 'spmd') if not common.can_create_filename(opts, filename): return with common.open_utf8(opts, filename) as fout: fout.write('''# Overview ## What is SPMD? SPMD stands for *Single Program Multiple Data*. It is a programming paradigm. It is used by NVIDIA CUDA. Its strengh lies in writing computation kernels. Basically you concentrate your attention on the kernel itself and not on how to run it. An example is worth more than a long speech, let's take vector addition of `float`'s. ```c++ spmd_kernel_1d(add, float *dst, float *a, float *b) k_store(dst, k_load(a) + k_load(b)); spmd_kernel_end ``` It would be written as follows for CUDA (assuming that the vector lenghts are multiples of block's sizes). 
```c++ __global__ add(float *dst, float *a, float *b) { int i = blockIdx.x * blockDim.x + threadIdx.x; dst[i] = a[i] + b[i]; } ``` NSIMD's SPMD is a small DSL in standard C++98 that can be used to write computation kernels for GPUs (NVIDIA's and AMD's) and any SIMD units supported by NSIMD. On a more technical side, the DSL keywords are macros that: - translates to C-ish keywords for GPUs and - use masks for CPUs as Intel ISPC (). The difference between NSIMD's SPMD is that a single code can be compiled to target GPUs and CPUs whereas: - NVIDIA CUDA only targets NVIDIA GPUs - AMD HIP only targets NVIDIA and AMD GPUs - INTEL ICP only targets Intel SIMD units and ARM NEON ## Writing kernels and device functions As for CUDA kernels you can write templated and non-templated CUDA kernels. Declaring a kernel function and launching it is straight forward: ```c++ spmd_kernel_1d(kernel_name, arguments) // kernel code spmd_kernel_end int main() { spmd_launch_kernel_1d(kernel_name, bit_width, param, vector_size, arguments); return 0; } ``` The `bit_width` argument indicates the types width in bits that will be available inside kernels. The `param` argument indicates the unroll factor for CPUs and the number of threads per block for GPUs. The `vector_size` argument indicates the vectors length passed as arguments. Device functions can also been implemented. They are functions that will only run on the device. As for kernels, they have the same restrictions. ```c++ spmd_dev_func(k_float device_func, k_float a, k_float b) // Device function code spmd_dev_func_end spmd_kernel_1d(kernel, arguments) // ... spmd_call_dev_func(device_func, a, b); // ... spmd_kernel_end ``` The caveat with `spmd_dev_func` is that its first argument must be the return type followed by the device function name. It is also possible to write templated kernels. Due to C++ `__VA_ARGS__` limitations the number of template argument is limited to one of kind `typename`. 
If more types or integers are to be passed to device kernels or functions they have to be boxed inside a struct. ```c++ struct mul_t { spmd_dev_func(static k_float dev_impl, k_float a, k_float b) return a * b; spmd_dev_func_end }; struct add_t { spmd_dev_func(static k_float dev_impl, k_float a, k_float b) return a + b; spmd_dev_func_end }; // Op is the template argument (typename Op in C++ code) spmd_tmpl_dev_func(k_float trampoline, Op, k_float a, k_float b) return Op::template spmd_call_dev_func(dev_impl, a, b); spmd_dev_func_end // Op is the template argument (typename Op in C++ code) spmd_tmpl_kernel_1d(tmpl_kernel, Op, arguments) // ... spmd_call_tmpl_dev_func(trampoline, Op, a, b); // ... spmd_kernel_end int main() { // Kernel call for addition spmd_launch_tmpl_kernel_1d(tmpl_kernel, add_t, 32, 1, N, arguments); // Kernel call for multiplication spmd_launch_tmpl_kernel_1d(tmpl_kernel, mul_t, 32, 1, N, arguments); return 0; } ``` ## The NSIMD SPMD C++ DSL The DSL is of course constraint by C++ syntax and constructs. This implies some strange syntax and the impossibility to use infix operator `=`. For now (2020/05/16) the NSIMD SPMD DSL does only supports `if`'s, while-loops and `returns`. It seems that for-loops and do-while-loops cannot be nicely proposed, i.e. with a nice syntax, the switch-case keywords cannot be implemented with a good conformence to the semantic of their C++ counterparts. Goto's also cannot be implemented properly. ### Variables types available in kernels and device functions The following self-explanatory variable types are available inside kernels and devices functions: - `k_int` for signed integers - `k_uint` for unsigned integers - `k_float` for floatting point numbers - `k_bool` for booleans As explained above the bit-width of the above types are determined by the launch kernel function. Note that `k_float` does not exists for 8-bits types. 
### Load/store from/to memory Given a pointer, the proper way to load data is to use `k_load(ptr)`. For storing a value to memory `k_store` is to be used. ```c++ k_store(ptr, value); k_store(ptr, expression); ``` As explained above, there is no need to compute the offset to apply to pointers. This is hidden from the programmer. ### Assignment operator (`operator=`) Due to C++ ADL () and the need for keeping things simple for the compiler (which does not always mean simple for the programmer) the use of infix operator `=` will not produce a copmilation error but will give incorrect result. You should use `k_set`. ```c++ k_set(var, value); k_set(var, expression); ``` As written above, `k_set` assign value or the result of an expression to a variable. ### if, then, else You should not use plan C++ `if`'s or `else`'s. This will not cause compilation error but will produce incorrect results at runtime. You should use `k_if`, `k_else`, `k_elseif` and `k_endif` instead. they have the same semantic as their C++ counterparts. ```c++ spmd_kernel_1d(if_elseif_else, float *dst, float *a_ptr) k_float a, ret; k_set(a, k_load(a_ptr)); k_if (a > 15.0f) k_set(ret, 15.0f); k_elseif ( a > 10.0f) k_set(ret, 10.0f); k_elseif ( a > 5.0f) k_set(ret, 5.0f); k_else k_set(ret, 0.0f); k_endif k_store(dst, ret); spmd_kernel_end ``` ### while loops You should not use plan C++ `while`'s, `break`'s and `continue`'s. This will not cause compilation error but will produce incorrect results at runtime. You should use `k_while`, `k_break`, `k_continue` and `k_endif` instead. They have the same semantic as their C++ counterparts. 
```c++ spmd_kernel_1d(binpow, float *dst, float *a_ptr, int *p_ptr) k_float a, ret; k_set(a, k_load(a_ptr)); k_set(ret, 1.0f); k_int p; k_set(p, k_load(p_ptr)); k_while(p > 0) k_if ((p & 1) != 0) k_set(ret, ret * a); k_endif k_set(a, a * a); k_set(p, p >> 1); k_endwhile k_store(dst, ret); spmd_kernel_end ``` ### Returns Returns cannot be implemented as macros overloading is not possible in a standard way with an overload taking zero arguments. So returning has to be done correctly. The `k_return` keyword has the same semantic as the C++ `return` keyword without arguments and can be used at will for kernels (as kernels return type is always `void`) and for device functions returning `void`. For device functions returning a value it is recommanded to proceed this way: 1. Declare a variable, say `ret`, to store the return value. 2. Whereever you need to return, set the variable appropriately with `k_set` and return with `k_return`. 3. At the end of the function use `return ret;`. ```c++ spmd_dev_func(k_int func, k_int a) k_float ret; k_if (a == 0) k_set(ret, 0); k_return; k_endif k_if (a == 1) k_set(ret, -1); k_return; k_endif k_set(ret, a); return ret; spmd_dev_func_end ``` ## Advanced techniques and functions This paragraph applies mainly when targeting CPUs. Using techniques described below won't affect GPUs. If you are familiar with the SIMD technique of masking to emulate loops and if's you may know that `k_set` and `k_store` are implemented using respectively `nsimd::if_else` and `nsimd::maskz_storeu` which may incur performance penalties. When you know that a simple assignment or store is sufficient you may use the unmasked variants: - `k_unmasked_set` translates into a C++ assignment. - `k_unmasked_store` translates into a C++ SIMD store. Their arguments are exactly the same as `k_set` and `k_store`. 
Unmasked operations can usually be used at the beginning of device functions and also inside loops, on temporary variables, knowing that the result of the latter won't be needed later. You may also use C++ standard keywords and constructs. But be aware that doing so will apply all the same treatment too all SIMD lanes. This can be useful when the operations involved are independant of the processed data as in the example below. ```c++ spmd_dev_func(k_float newton_raphson_sqrt, k_float a, k_float x0) k_float ret; for (int i = 0; i < 6; i++) { k_unmasked_set(ret, (ret + ret * a) / 2.0f); } return ret; spmd_dev_func_end ``` ''') # ----------------------------------------------------------------------------- def gen_doc_api(opts): filename = common.get_markdown_file(opts, 'api', 'spmd') if not common.can_create_filename(opts, filename): return # Build tree for api.md api = dict() for _, operator in operators.operators.items(): if not operator.has_scalar_impl: continue for c in operator.categories: if c not in api: api[c] = [operator] else: api[c].append(operator) with common.open_utf8(opts, filename) as fout: fout.write( '''# NSIMD SPMD API reference This page contains the exhaustive API of the SPMD module. Note that most operators names follow the simple naming `k_[NSIMD name]` and have the same semantics. This page is light, you may use CTRL+F to find the operator you are looking for. For genericity on the base type you should use operator names instead of infix operators, e.g. `k_add` instead of `+`. Indeed for `f16`'s NVIDIA CUDA and NSIMD do not provide overloads and therefore code using `+` will fail to compile. Note that all operators accept literals and scalars. For example you may write `k_add(a, 1)` or `float s; k_add(a, s);`. This also applies when using infix operators. But note that literals or scalars must have the same type as the other operands. 
''') for c, ops in api.items(): if len(ops) == 0: continue fout.write('\n## {}\n\n'.format(c.title)) for op in ops: fout.write('- `{}` \n'.format(get_signature(op))) if op.cxx_operator != None: fout.write(' Infix operator: `{}` ' \ '(*for certain types only*) \n'.\ format(op.cxx_operator)) fout.write(' {}\n\n'.format(op.desc)) # ----------------------------------------------------------------------------- def gen_tests_for_shifts(opts, t, operator): op_name = operator.name dirname = os.path.join(opts.tests_dir, 'modules', 'spmd') common.mkdir_p(dirname) filename = os.path.join(dirname, '{}.{}.cpp'.format(op_name, t)) if not common.can_create_filename(opts, filename): return with common.open_utf8(opts, filename) as out: out.write( '''#include #include #include #include "../common.hpp" #if defined(NSIMD_CUDA) __global__ void kernel({typ} *dst, {typ} *a0, int n, int s) {{ int i = threadIdx.x + blockIdx.x * blockDim.x; if (i < n) {{ dst[i] = nsimd::gpu_{op_name}(a0[i], s); }} }} void compute_result({typ} *dst, {typ} *a0, unsigned int n, int s) {{ kernel<<<{gpu_params}>>>(dst, a0, int(n), s); }} {cbprng_cuda} #elif defined(NSIMD_ROCM) __global__ void kernel({typ} *dst, {typ} *a0, size_t n, int s) {{ size_t i = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x; if (i < n) {{ dst[i] = nsimd::gpu_{op_name}(a0[i], s); }} }} void compute_result({typ} *dst, {typ} *a0, size_t n, int s) {{ hipLaunchKernelGGL(kernel, {gpu_params}, 0, 0, dst, a0, n, s); }} {cbprng_hip} #elif defined(NSIMD_ONEAPI) inline void kernel({typ} *dst, {typ} *a0, const size_t n, const int s, sycl::nd_item<1> item) {{ const size_t ii = item.get_global_id().get(0); if (ii < n){{ dst[ii] = nsimd::gpu_{op_name}(a0[ii], s); }} }} void compute_result({typ} *dst, {typ} *a0, size_t n, int s) {{ size_t total_num_threads = (size_t)nsimd_kernel_param((int)n, {tpb}); sycl::queue q_ = nsimd::oneapi::default_queue(); q_.parallel_for(sycl::nd_range<1>(sycl::range<1>(total_num_threads), sycl::range<1>({tpb})), 
[=](sycl::nd_item<1> item){{ kernel(dst, a0, n, s, item); }}).wait_and_throw(); }} {cbprng_oneapi} #else void compute_result({typ} *dst, {typ} *a0, unsigned int n, int s) {{ for (unsigned int i = 0; i < n; i++) {{ dst[i] = nsimd::scalar_{op_name}(a0[i], s); }} }} {cbprng_cpu} #endif // clang-format off spmd_kernel_1d(kernel, {typ} *dst, {typ} *a0, int s) k_store(dst, k_{op_name}(k_load(a0), s)); spmd_kernel_end // clang-format on int main() {{ unsigned int n_[3] = {{ 10, 1001, 10001 }}; for (int i = 0; i < (int)(sizeof(n_) / sizeof(int)); i++) {{ unsigned int n = n_[i]; for (int s = 0; s < {typnbits}; s++) {{ int ret = 0; {typ} *a0 = nsimd::device_calloc<{typ}>(n); random(a0, n, 0); {typ} *ref = nsimd::device_calloc<{typ}>(n); {typ} *out = nsimd::device_calloc<{typ}>(n); spmd_launch_kernel_1d(kernel, {typnbits}, 1, n, out, a0, s); compute_result(ref, a0, n, s); if (!cmp(ref, out, n)) {{ ret = -1; }} nsimd::device_free(a0); nsimd::device_free(ref); nsimd::device_free(out); if (ret != 0) {{ return ret; }} }} }} return 0; }} '''.format(typ=t, op_name=op_name, typnbits=t[1:], tpb=tpb, cbprng_cpu=nsimd_tests.cbprng(t, operator, 'cpu'), cbprng_cuda=nsimd_tests.cbprng(t, operator, 'cuda', gpu_params), cbprng_hip=nsimd_tests.cbprng(t, operator, 'hip', gpu_params), cbprng_oneapi=nsimd_tests.cbprng(t, operator, 'oneapi', ['(int)n', str(tpb)]), gpu_params=gpu_params)) common.clang_format(opts, filename, cuda=True) # ----------------------------------------------------------------------------- def gen_tests_for_cvt_reinterpret(opts, tt, t, operator): op_name = operator.name dirname = os.path.join(opts.tests_dir, 'modules', 'spmd') common.mkdir_p(dirname) filename = os.path.join(dirname, '{}.{}_{}.cpp'.format(op_name, t, tt)) if not common.can_create_filename(opts, filename): return with common.open_utf8(opts, filename) as out: out.write( '''#include #include #include #include "../common.hpp" #if defined(NSIMD_CUDA) __global__ void kernel({typ} *dst, {typ} *a0, int n) {{ int i 
= threadIdx.x + blockIdx.x * blockDim.x; if (i < n) {{ dst[i] = nsimd::gpu_{op_name}({typ}(), nsimd::gpu_{op_name}( {totyp}(), a0[i])); }} }} void compute_result({typ} *dst, {typ} *a0, unsigned int n) {{ kernel<<<{gpu_params}>>>(dst, a0, int(n)); }} {cbprng_cuda} #elif defined(NSIMD_ROCM) __global__ void kernel({typ} *dst, {typ} *a0, size_t n) {{ size_t i = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x; if (i < n) {{ dst[i] = nsimd::gpu_{op_name}({typ}(), nsimd::gpu_{op_name}( {totyp}(), a0[i])); }} }} void compute_result({typ} *dst, {typ} *a0, size_t n) {{ hipLaunchKernelGGL(kernel, {gpu_params}, 0, 0, dst, a0, n); }} {cbprng_hip} #elif defined(NSIMD_ONEAPI) inline void kernel({typ} *dst, {typ} *a0, const size_t n, sycl::nd_item<1> item) {{ const size_t ii = item.get_global_id().get(0); if (ii < n){{ dst[ii] = nsimd::gpu_{op_name}({typ}(), nsimd::gpu_{op_name}( {totyp}(), a0[ii])); }} }} void compute_result({typ} *dst, {typ} *a0, size_t n) {{ size_t total_num_threads = (size_t)nsimd_kernel_param((int)n, {tpb}); sycl::queue q_ = nsimd::oneapi::default_queue(); q_.parallel_for(sycl::nd_range<1>(sycl::range<1>(total_num_threads), sycl::range<1>({tpb})), [=](sycl::nd_item<1> item){{ kernel(dst, a0, n, item); }}).wait_and_throw(); }} {cbprng_oneapi} #else void compute_result({typ} *dst, {typ} *a0, unsigned int n) {{ for (unsigned int i = 0; i < n; i++) {{ dst[i] = nsimd::scalar_{op_name}({typ}(), nsimd::scalar_{op_name}( {totyp}(), a0[i])); }} }} {cbprng_cpu} #endif // clang-format off spmd_kernel_1d(kernel, {typ} *dst, {typ} *a0) k_store(dst, k_{op_name}({k_typ}, k_{op_name}({k_totyp}, k_load(a0)))); spmd_kernel_end // clang-format on int main() {{ unsigned int n_[3] = {{ 10, 1001, 10001 }}; for (int i = 0; i < (int)(sizeof(n_) / sizeof(int)); i++) {{ unsigned int n = n_[i]; int ret = 0; {typ} *a0 = nsimd::device_calloc<{typ}>(n); random(a0, n, 0); {typ} *ref = nsimd::device_calloc<{typ}>(n); {typ} *out = nsimd::device_calloc<{typ}>(n); spmd_launch_kernel_1d(kernel, 
{typnbits}, 1, n, out, a0); compute_result(ref, a0, n); if (!cmp(ref, out, n)) {{ ret = -1; }} nsimd::device_free(a0); nsimd::device_free(ref); nsimd::device_free(out); if (ret != 0) {{ return ret; }} }} return 0; }} '''.format(typ=t, totyp=tt, op_name=op_name, typnbits=t[1:],
               gpu_params=gpu_params, k_typ=k_typ[t[0]], tpb=tpb,
               cbprng_cpu=nsimd_tests.cbprng(t, operator, 'cpu'),
               # NOTE(review): unlike the hip/oneapi calls here and unlike
               # gen_tests_for below, the cuda call is made without
               # gpu_params -- confirm this is intentional.
               cbprng_cuda=nsimd_tests.cbprng(t, operator, 'cuda'),
               cbprng_hip=nsimd_tests.cbprng(t, operator, 'hip', gpu_params),
               cbprng_oneapi=nsimd_tests.cbprng(t, operator, 'oneapi',
                                                ['(int)n', str(tpb)]),
               k_totyp=k_typ[tt[0]]))
    common.clang_format(opts, filename, cuda=True)

# -----------------------------------------------------------------------------

def gen_tests_for(opts, t, operator):
    '''Generate the SPMD test file "<op_name>.<t>.cpp" for one operator and
    one base type.

    The generated C++ contains a reference implementation selected by
    preprocessor (CUDA, ROCm, oneAPI, or a scalar CPU fallback) plus an spmd
    kernel built from k_* primitives; its main() compares both results on
    three buffer sizes.'''
    op_name = operator.name
    dirname = os.path.join(opts.tests_dir, 'modules', 'spmd')
    common.mkdir_p(dirname)
    filename = os.path.join(dirname, '{}.{}.cpp'.format(op_name, t))
    if not common.can_create_filename(opts, filename):
        return
    # One input buffer a<i> per operator input parameter (params[0] is the
    # return kind).
    arity = len(operator.params[1:])
    k_args = ', '.join(['{} *a{}'.format(t, i) for i in range(arity)])
    k_call_args = ', '.join(['a{}'.format(i) for i in range(arity)])
    fill_tabs = '\n'.join(['{typ} *a{i} = nsimd::device_calloc<{typ}>(n);\n' \
                           'random(a{i}, n, {i});'.format(typ=t, i=i) \
                           for i in range(arity)])
    free_tabs = '\n'.join(['nsimd::device_free(a{i});'. \
                           format(typ=t, i=i) for i in range(arity)])

    # spmd: build the k_* expression for the spmd kernel.
    def get_cte_spmd(typ, cte):
        # f16 constants must be built through the f32 -> f16 helper.
        if typ == 'f16':
            return 'k_f32_to_f16((f32){})'.format(cte)
        else:
            return '({}){}'.format(typ, cte)

    def spmd_load_code(param, typ, i):
        # 'l' (logical) inputs are synthesized as the predicate "a<i> < 4".
        if param == 'l':
            return 'k_lt(k_load(a{}), {})'.format(i, get_cte_spmd(typ, 4))
        if param == 'v':
            return 'k_load(a{})'.format(i)

    args = ', '.join([spmd_load_code(operator.params[i + 1], t, i) \
                      for i in range(arity)])
    if op_name == 'to_mask':
        args = k_typ[t[0]] + ', ' + args
    if operator.params[0] == 'v':
        k_code = 'k_store(dst, k_{}({}));'.format(op_name, args)
    else:
        # Logical result: store 1 or 0 depending on the predicate.
        k_code = '''k_if (k_{}({})) k_store(dst, 1); k_else k_store(dst, 0); k_endif'''.format(op_name, args)

    # gpu: the same expression for the CUDA/ROCm and oneAPI reference kernels.
    def get_cte_gpu(typ, cte, target):
        # CUDA/ROCm halves are made with __float2half; other cases use a
        # plain C cast.
        if typ == 'f16' and target == 'cuda_rocm':
            return '__float2half((f32){})'.format(cte)
        else:
            return '({}){}'.format(typ, cte)

    def gpu_load_code(param, typ, i, target):
        if param == 'l':
            return 'nsimd::gpu_lt(a{}[i], {})'. \
                   format(i, get_cte_gpu(typ, 4, target))
        if param == 'v':
            return 'a{}[i]'.format(i)

    args_cuda_rocm = ', '.join([gpu_load_code(operator.params[i + 1], t, i,
                                              'cuda_rocm') \
                                for i in range(arity)])
    args_oneapi = ', '.join([gpu_load_code(operator.params[i + 1], t, i,
                                           'oneapi') for i in range(arity)])
    if op_name == 'to_mask':
        args_cuda_rocm = t + '(), ' + args_cuda_rocm
        args_oneapi = t + '(), ' + args_oneapi
    if operator.params[0] == 'v':
        cuda_rocm_kernel = 'dst[i] = nsimd::gpu_{}({});'. \
                           format(op_name, args_cuda_rocm)
        oneapi_kernel = 'dst[i] = nsimd::gpu_{}({});'. \
                        format(op_name, args_oneapi)
    else:
        # Logical result on GPU: branch on the predicate, store 1/0.
        tmpl = '''if (nsimd::gpu_{}({{}})) {{{{ dst[i] = {{}}; }}}} else {{{{ dst[i] = {{}}; }}}}'''.format(op_name)
        cuda_rocm_kernel = tmpl.format(args_cuda_rocm,
                                       get_cte_gpu(t, 1, 'cuda_rocm'),
                                       get_cte_gpu(t, 0, 'cuda_rocm'))
        oneapi_kernel = tmpl.format(args_oneapi, get_cte_gpu(t, 1, 'oneapi'),
                                    get_cte_gpu(t, 0, 'oneapi'))

    # cpu: scalar fallback used when no GPU backend is enabled.
    def get_cte_cpu(typ, cte):
        if typ == 'f16':
            return 'nsimd_f32_to_f16((f32){})'.format(cte)
        else:
            return '({}){}'.format(typ, cte)

    def cpu_load_code(param, typ, i):
        if param == 'l':
            return 'nsimd::scalar_lt(a{}[i], {})'. \
                   format(i, get_cte_cpu(typ, 4))
        if param == 'v':
            return 'a{}[i]'.format(i)

    args = ', '.join([cpu_load_code(operator.params[i + 1], t, i) \
                      for i in range(arity)])
    if op_name == 'to_mask':
        args = t + '(), ' + args
    if operator.params[0] == 'v':
        cpu_kernel = 'dst[i] = nsimd::scalar_{}({});'.format(op_name, args)
    else:
        cpu_kernel = '''if (nsimd::scalar_{op_name}({args})) {{ dst[i] = {one}; }} else {{ dst[i] = {zero}; }}'''.format(op_name=op_name, args=args, one=get_cte_cpu(t, 1), zero=get_cte_cpu(t, 0))
    # Integer types are compared exactly; floating point types use the
    # operator's ulp budget (operator.ufp).
    comp = '!cmp(ref, out, n{})'.format('' if t in common.iutypes \
                                        else ', {}'.format(operator.ufp[t]))
    with common.open_utf8(opts, filename) as out:
        out.write(
            '''#include #include #include #include "../common.hpp" #if defined(NSIMD_CUDA) __global__ void kernel({typ} *dst, {k_args}, int n) {{ int i = threadIdx.x + blockIdx.x * blockDim.x; if (i < n) {{ {cuda_rocm_kernel} }} }} void compute_result({typ} *dst, {k_args}, unsigned int n) {{ kernel<<<{gpu_params}>>>(dst, {k_call_args}, int(n)); }} {cbprng_cuda} #elif defined(NSIMD_ROCM) __global__ void kernel({typ} *dst, {k_args}, size_t n) {{ size_t i = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x; if (i < n) {{ {cuda_rocm_kernel} }} }} void compute_result({typ} *dst, {k_args}, size_t n) {{ hipLaunchKernelGGL(kernel, {gpu_params}, 0, 0, dst, {k_call_args}, n); }} {cbprng_hip} #elif defined(NSIMD_ONEAPI) inline void kernel({typ} *dst, {k_args}, const 
size_t n, sycl::nd_item<1> item) {{ const size_t i = item.get_global_id().get(0); if(i < n){{ {oneapi_kernel} }} }} void compute_result({typ} *dst, {k_args}, size_t n) {{ size_t total_num_threads = (size_t)nsimd_kernel_param((int)n, {tpb}); sycl::queue q_ = nsimd::oneapi::default_queue(); q_.parallel_for(sycl::nd_range<1>(sycl::range<1>(total_num_threads), sycl::range<1>({tpb})), [=](sycl::nd_item<1> item){{ kernel(dst, {k_call_args}, n, item); }}).wait_and_throw(); }} {cbprng_oneapi} #else void compute_result({typ} *dst, {k_args}, unsigned int n) {{ for (unsigned int i = 0; i < n; i++) {{ {cpu_kernel} }} }} {cbprng_cpu} #endif // clang-format off spmd_kernel_1d(kernel, {typ} *dst, {k_args}) {k_code} spmd_kernel_end // clang-format on #if defined(NSIMD_CUDA) || defined(NSIMD_ROCM) || defined(NSIMD_ONEAPI) #define THREADS_PER_BLOCK 128 #else #define THREADS_PER_BLOCK 1 #endif int main() {{ unsigned int n_[3] = {{ 10, 1001, 10001 }}; for (int i = 0; i < (int)(sizeof(n_) / sizeof(int)); i++) {{ unsigned int n = n_[i]; int ret = 0; {fill_tabs} {typ} *ref = nsimd::device_calloc<{typ}>(n); {typ} *out = nsimd::device_calloc<{typ}>(n); spmd_launch_kernel_1d(kernel, {typnbits}, THREADS_PER_BLOCK, n, out, {k_call_args}); compute_result(ref, {k_call_args}, n); if ({comp}) {{ ret = -1; }} nsimd::device_free(ref); nsimd::device_free(out); {free_tabs} if (ret != 0) {{ return ret; }} }} return 0; }} '''.format(typ=t, free_tabs=free_tabs, fill_tabs=fill_tabs,
           k_code=k_code, k_call_args=k_call_args, k_args=k_args,
           cpu_kernel=cpu_kernel, comp=comp,
           cuda_rocm_kernel=cuda_rocm_kernel, oneapi_kernel=oneapi_kernel,
           cbprng_cpu=nsimd_tests.cbprng(t, operator, 'cpu'),
           cbprng_cuda=nsimd_tests.cbprng(t, operator, 'cuda', gpu_params),
           cbprng_hip=nsimd_tests.cbprng(t, operator, 'hip', gpu_params),
           cbprng_oneapi=nsimd_tests.cbprng(t, operator, 'oneapi',
                                            ['(int)n', str(tpb)]),
           gpu_params=gpu_params, typnbits=t[1:], tpb=tpb))
    common.clang_format(opts, filename, cuda=True)

def gen_tests(opts):
    '''Generate every SPMD test: one C++ file per (operator, type) pair.'''
    for \
op_name, operator in operators.operators.items():
        if not operator.has_scalar_impl:
            continue
        # NOTE(review): not_closed is computed here but never used in this
        # function -- looks like dead code, confirm before removing.
        not_closed = (operator.output_to == common.OUTPUT_TO_SAME_SIZE_TYPES \
                      or ('v' not in operator.params[1:] and
                          'l' not in operator.params[1:]))
        for t in operator.types:
            tts = common.get_output_types(t, operator.output_to)
            for tt in tts:
                if not nsimd_tests.should_i_do_the_test(operator, tt, t):
                    continue
                # Shifts and type-changing operators need dedicated drivers.
                if operator.name in ['shl', 'shr', 'shra']:
                    gen_tests_for_shifts(opts, t, operator)
                elif operator.name in ['cvt', 'reinterpret', 'reinterpretl']:
                    gen_tests_for_cvt_reinterpret(opts, tt, t, operator)
                else:
                    gen_tests_for(opts, t, operator)

# -----------------------------------------------------------------------------

def gen_functions(opts):
    '''Emit include/nsimd/modules/spmd/functions.hpp.

    For each operator with a scalar implementation this writes a k_<op>
    entry point: on GPU targets (CUDA/ROCm/oneAPI) it maps directly to
    nsimd::gpu_<op>; on CPU it dispatches through an <op>_helper struct to
    either the scalar or the SIMD (pack) implementation.'''
    functions = ''
    for op_name, operator in operators.operators.items():
        if not operator.has_scalar_impl:
            continue
        if operator.params[0] == 'l':
            s_ret_typ = 'bool'
            v_ret_typ = \
                'nsimd::packl::type, N>'
        else:
            s_ret_typ = 'T'
            v_ret_typ = 'nsimd::pack::type, N>'

        # Scalar (one lane) signature pieces: 'p' params are plain ints,
        # 'v' become T, 'l' become bool.
        def s_typ(typ):
            if typ == 'p':
                return 'int'
            if typ == 'v':
                return 'T'
            if typ == 'l':
                return 'bool'
        s_args = ', '.join(['{} a{}'.format(s_typ(operator.params[i]), i - 1) \
                            for i in range(1, len(operator.params))])
        s_call_args = ', '.join(['a{}'.format(i - 1) \
                                 for i in range(1, len(operator.params))])
        s_tmpl = 'typename T' if 'v' in operator.params[1:] else ''

        # SIMD (pack) signature pieces: each vector/logical argument gets its
        # own template parameter A<i>.
        def v_typ(typ, i):
            if typ == 'p':
                return 'int'
            if typ in ['v', 'l']:
                return 'A{}'.format(i)
        v_args = ', '.join(['{} a{}'. \
                            format(v_typ(operator.params[i], i - 1), i - 1) \
                            for i in range(1, len(operator.params))])
        def v_call_arg(typ, i):
            if typ == 'p':
                return '(int)a{}'.format(i)
            if typ == 'v':
                return 'spmd::to_pack(a{})'.format(i)
            if typ == 'l':
                return 'spmd::to_packl(a{})'.format(i)
        v_call_args = ', '.join([v_call_arg(operator.params[i], i - 1) \
                                 for i in range(1, len(operator.params))])
        v_tmpl = ', '.join(['typename A{}'.format(i - 1) \
                            for i in range(1, len(operator.params)) \
                            if operator.params[i] != 'p'])
        m_call_args_cpu = s_call_args
        m_call_args_gpu = s_call_args
        to_type = ''
        ToType = ''
        v_op_name = op_name
        s_op_name = op_name
        template = ''
        # Override for non closed operators
        if operator.output_to == common.OUTPUT_TO_SAME_SIZE_TYPES or \
           op_name == 'to_mask':
            s_ret_typ = 'ToType'
            s_tmpl = append('typename ToType', s_tmpl)
            m_call_args_gpu = append('to_type()', s_call_args)
            s_call_args = append('ToType()', s_call_args)
            v_tmpl = append('typename ToType', v_tmpl)
            to_type = ''
            template = 'template '
            v_ret_typ = 'ToType'
            ToType = ''
            # special case for to_mask
            if op_name == 'to_mask':
                v_op_name = 'reinterpret'
                v_call_args = 'to_mask({})'.format(v_call_args)
        if v_tmpl != '':
            v_tmpl = 'template <{}>'.format(v_tmpl)
        if s_tmpl != '':
            s_tmpl = 'template <{}>'.format(s_tmpl)
        functions += \
            '''#if defined(NSIMD_CUDA) || defined(NSIMD_ROCM) || \ defined(NSIMD_ONEAPI) {signature} nsimd::gpu_{s_op_name}({m_call_args_gpu}) #else template struct {op_name}_helper {{}}; template struct {op_name}_helper {{ {s_tmpl} static {s_ret_typ} impl({s_args}) {{ return nsimd::scalar_{s_op_name}({s_call_args}); }} }}; template struct {op_name}_helper {{ {v_tmpl} static {v_ret_typ} impl({v_args}) {{ typedef typename spmd::base_type::type T; return nsimd::{v_op_name}{ToType}({v_call_args}); }} }}; {signature} \\ spmd::{op_name}_helper::{template}impl{to_type}( \\ {m_call_args_cpu}) #endif {hbar} '''.format(hbar=common.hbar, s_op_name=s_op_name, s_tmpl=s_tmpl,
           s_ret_typ=s_ret_typ, s_args=s_args, v_args=v_args,
           v_call_args=v_call_args, s_call_args=s_call_args, v_tmpl=v_tmpl,
           v_ret_typ=v_ret_typ, ToType=ToType,
           m_call_args_cpu=m_call_args_cpu, to_type=to_type,
           v_op_name=v_op_name, op_name=op_name, template=template,
           m_call_args_gpu=m_call_args_gpu,
           signature=get_signature(operator))

    # Write the code to file
    dirname = os.path.join(opts.include_dir, 'modules', 'spmd')
    common.mkdir_p(dirname)
    filename = os.path.join(dirname, 'functions.hpp')
    if not common.can_create_filename(opts, filename):
        return
    with common.open_utf8(opts, filename) as out:
        out.write('#ifndef NSIMD_MODULES_SPMD_FUNCTIONS_HPP\n')
        out.write('#define NSIMD_MODULES_SPMD_FUNCTIONS_HPP\n\n')
        out.write('namespace spmd {\n\n')
        out.write('{}\n\n'.format(common.hbar))
        out.write(functions)
        out.write('} // namespace spmd\n\n')
        out.write('#endif\n')
    common.clang_format(opts, filename)

# -----------------------------------------------------------------------------

def name():
    '''Human-readable module name used by the documentation generator.'''
    return 'SPMD programming'

def desc():
    '''Short module description used by the documentation generator.'''
    return '''SPMD programming allows the programmer to focus on kernels and the compiler to vectorize kernel code more effectively. 
Basically this module provides a "à la CUDA" programming C++ DSL to targets CPU SIMD as well as Intel, NVIDIA and AMD GPUs.'''

def doc_menu():
    '''Doc menu entries: title -> markdown file basename.'''
    return {'Overview': 'overview', 'API reference': 'api'}

# -----------------------------------------------------------------------------

def doit(opts):
    '''Module entry point: generate library headers, tests and/or docs as
    requested by opts.'''
    common.myprint(opts, 'Generating module spmd')
    if opts.library:
        gen_functions(opts)
    if opts.tests:
        gen_tests(opts)
    if opts.doc:
        gen_doc_api(opts)
        gen_doc_overview(opts)



================================================
FILE: egg/modules/tet1d/hatch.py
================================================
# Copyright (c) 2021 Agenium Scale
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
import os import operators import common import gen_scalar_utilities import gen_tests as nsimd_tests # ----------------------------------------------------------------------------- # CUDA: default number of threads per block tpb = 128 gpu_params = '(n + {}) / {}, {}'.format(tpb, tpb - 1, tpb) def is_not_closed(operator): return (operator.output_to == common.OUTPUT_TO_SAME_SIZE_TYPES \ or ('v' not in operator.params[1:] and 'l' not in operator.params[1:])) # ----------------------------------------------------------------------------- def gen_doc_overview(opts): filename = common.get_markdown_file(opts, 'overview', 'tet1d') if not common.can_create_filename(opts, filename): return with common.open_utf8(opts, filename) as fout: fout.write('''# Overview ## What are expression templates? Expression templates are a C++ template metaprogramming technique that essentially allows high level programming for loop fusion. Take the following exemple. ```c++ std::vector operator+(std::vector const &a, std::vector const &b) {{ std::vector ret(a.size()); for (size_t i = 0; i < a.size(); i++) {{ ret[i] = a[i] + b[i]; }} return ret; }} int main() {{ std::vector a, b, c, d, sum; ... sum = a + b + c + d; ... return 0; }} ``` The expression `a + b + c + d` involves three calls to `operator+` and at least nine memory passes are necessary. This can be optimized as follows. ```c++ int main() {{ std::vector a, b, c, d, sum; ... for (size_t i = 0; i < a.size(); i++) {{ ret[i] = a[i] + b[i] + c[i] + d[i]; }} ... return 0; }} ``` The rewriting above requires only four memory passes which is of course better but as humans we prefer the writing `a + b + c + d`. Expression templates solves exactly this problem and allows the programmer to write `a + b + c + d` and the compiler to see the loop written above. ## Expressions templates with NSIMD This module provides expression templates on top of NSIMD core. 
As a consequence the loops seen by the compiler deduced from the high-level expressions are optimized using SIMD instructions. Note also that NVIDIA and AMD GPUs are supported through CUDA and ROCm/HIP. The API for expression templates in NSIMD is C++98 compatible and is able to work with any container as its only requirement for data is that it must be contiguous. All inputs to an expression must be declared using `tet1d::in` while the output must be declared using `tet1d::out`. ```c++ int main() {{ std::vector a, b, c; ... tet1d::out(a) = tet1d::in(&a[0], a.size()) + tet1d::in(&b[0], b.size()); ... return 0; }} ``` - `template inline node in(const T *data, I sz);`{nl} Construct an input for expression templates starting at address `data` and containing `sz` elements. The return type of this functin `node` can be used with the help of the `TET1D_IN(T)` macro where `T` if the underlying type of data (ints, floats, doubles...). - `template node out(T *data);`{nl} Construct an output for expression templates starting at address `data`. Note that memory must be allocated by the user before passing it to the expression template engine. The output type can be used with the `TET1D_OUT(T)` where `T` is the underlying type (ints, floats, doubles...). Note that it is possible to pass parameters to the expression template engine to specify the number of threads per block for GPUs or the SIMD extension to use... - `template node out(T *data, int threads_per_block, void *stream);`{nl} Construct an output for expression templates starting at address `data`. Note that memory must be allocated by the user before passing it to the expression template engine. The `Pack` parameter is useful when compiling for CPUs. The type is `nsimd::pack<...>` allowing the developper to specify all details about the NSIMD packs that will be used by the expression template engine. The `threads_per_block` and `stream` arguments are used only when compiling for GPUs. 
Their meaning is contained in their names. The output type can be used with the `TET1D_OUT_EX(T, N, SimdExt)` where `T` is the underlying type (ints, floats, doubles...), `N` is the unroll factor and `SimdExt` the SIMD extension. Moreover a MATLAB-like syntax is provided. One can select a subrange of given input. Indexes are understood as for Python: -1 represents the last element. The contant `tet1d::end = -1` allows one to write portable code. ```c++ int main() {{ std::vector a, b, c; ... TET1D_IN(float) va = tet1d::in(&a[0], a.size()); TET1D_IN(float) vb = tet1d::in(&b[0], b.size()); tet1d::out(c) = va(10, tet1d::end - 10) + vb; ... return 0; }} ``` One can also specify which elements of the output must be rewritten with the following syntax. ```c++ int main() {{ std::vector a, b, c; ... TET1D_IN(float) va = tet1d::in(&a[0], a.size()); TET1D_IN(float) vb = tet1d::in(&b[0], b.size()); TET1D_OUT(float) vc = tet1d::out(&c[0]); vc(va >= 10 && va < 20) = vb; ... return 0; }} ``` In the exemple above, element `i` in `vc` is written only if `va[i] >= 10` and `va[i] < 20`. The expression appearing in the parenthesis can contain arbitrary expression templates as soon as the underlying type is `bool`. ## Warning using `auto` Using auto can lead to surprising results. We advice you never to use auto when dealing with expression templates. Indeed using `auto` will make the variable an obscure type representing the computation tree of the expression template. This implies that you won't be able to get data from this variable i.e. get the `.data` member for exemple. Again this variable or its type cannot be used in template arguments where you need it. 
'''.format(nl=' ')) # ----------------------------------------------------------------------------- def gen_doc_api(opts): filename = common.get_markdown_file(opts, 'api', 'tet1d') if not common.can_create_filename(opts, filename): return # Build tree for api.md api = dict() for _, operator in operators.operators.items(): if not operator.has_scalar_impl: continue for c in operator.categories: if c not in api: api[c] = [operator] else: api[c].append(operator) def get_signature(op): def get_type(typ): if typ == 'p': return 'int' elif typ == 'v': return 'ExprNumber' elif typ == 'l': return 'ExprBool' ret = get_type(op.params[0]) + ' ' + op.name + '(' if is_not_closed(op): ret += 'ToType' + (', ' if len(op.params[1:]) > 0 else '') ret += ', '.join(['{{t}} {{in{i}}}'.format(i=i). \ format(t=get_type(op.params[i + 1]), in0=common.in0, in1=common.in1, in2=common.in2, in3=common.in3) \ for i in range(len(op.params[1:]))]) ret += ');' return ret with common.open_utf8(opts, filename) as fout: fout.write( '''# NSIMD TET1D API reference This page contains the exhaustive API of the TET1D module. Note that most operators names follow their NSIMD counterparts and have the same semantics. This page is light, you may use CTRL+F to find the operator you are looking for. Note that all operators accept literals and scalars. For example you may write `tet1d::add(a, 1)`. This also applies when using infix operators. Note that literals or scalars of different types can be used with expression involving other types. In all signature below the following pseudo types are used for simplification: - `ExprNumber` to designate an existing expression template on signed, unsigned integers of floatting point types or a scalar of signed, unsigned integers or floatting point types. - `ExprBool` to designate an existing expression template over booleans or a boolean. 
- `ToType` to designate a base type (signed, unsigned integers or floatting point types) and is used when a change in type is requested for example when converting data. ''')
        # One markdown section per category, one bullet per operator.
        for c, ops in api.items():
            if len(ops) == 0:
                continue
            fout.write('\n## {}\n\n'.format(c.title))
            for op in ops:
                fout.write('- `{}` \n'.format(get_signature(op)))
                if op.cxx_operator != None:
                    # cxx_operator is e.g. "operator+"; [8:] keeps only the
                    # symbol after the "operator" prefix.
                    fout.write(' Infix operator: `{}` \n'. \
                               format(op.cxx_operator[8:]))
                fout.write(' {}\n\n'.format(op.desc))

# -----------------------------------------------------------------------------

def gen_tests_for_shifts(opts, t, operator):
    '''Generate the tet1d test file "<op_name>.<t>.cpp" for a shift operator
    (shl/shr/shra).

    The generated main() loops over every shift amount s in [0, typnbits)
    and compares the tet1d expression against a per-backend reference
    implementation.'''
    op_name = operator.name
    dirname = os.path.join(opts.tests_dir, 'modules', 'tet1d')
    common.mkdir_p(dirname)
    filename = os.path.join(dirname, '{}.{}.cpp'.format(op_name, t))
    if not common.can_create_filename(opts, filename):
        return
    with common.open_utf8(opts, filename) as out:
        out.write(
            '''#include #include #include "../common.hpp" #if defined(NSIMD_CUDA) __global__ void kernel({t} *dst, {t} *tab0, int n, int s) {{ int i = threadIdx.x + blockIdx.x * blockDim.x; if (i < n) {{ dst[i] = nsimd::gpu_{op_name}(tab0[i], s); }} }} void compute_result({t} *dst, {t} *tab0, unsigned int n, int s) {{ kernel<<<{gpu_params}>>>(dst, tab0, int(n), s); }} {cbprng_cuda} #elif defined(NSIMD_ROCM) __global__ void kernel({t} *dst, {t} *tab0, size_t n, int s) {{ size_t i = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x; if (i < n) {{ dst[i] = nsimd::gpu_{op_name}(tab0[i], s); }} }} void compute_result({t} *dst, {t} *tab0, size_t n, int s) {{ hipLaunchKernelGGL(kernel, {gpu_params}, 0, 0, dst, tab0, n, s); }} {cbprng_hip} #elif defined(NSIMD_ONEAPI) inline void kernel({t} *dst, {t} *tab0, const size_t n, const int s, sycl::nd_item<1> item) {{ size_t ii = item.get_global_id().get(0); if (ii < n){{ dst[ii] = nsimd::gpu_{op_name}(tab0[ii], s); }} }} void compute_result({t} *dst, {t} *tab0, size_t n, int s) {{ size_t total_num_threads = (size_t)nsimd_kernel_param((int)n, {tpb}); sycl::queue q_ = 
nsimd::oneapi::default_queue(); q_.parallel_for(sycl::nd_range<1>(sycl::range<1>(total_num_threads), sycl::range<1>({tpb})), [=](sycl::nd_item<1> item){{ kernel(dst, tab0, n, s, item); }}).wait_and_throw(); }} {cbprng_oneapi} #else void compute_result({t} *dst, {t} *tab0, unsigned int n, int s) {{ for (unsigned int i = 0; i < n; i++) {{ dst[i] = nsimd_scalar_{op_name}_{t}(tab0[i], s); }} }} {cbprng_cpu} #endif int main() {{ unsigned int n_[3] = {{ 10, 1001, 10001 }}; for (int i = 0; i < (int)(sizeof(n_) / sizeof(int)); i++) {{ unsigned int n = n_[i]; for (int s = 0; s < {typnbits}; s++) {{ int ret = 0; {t} *tab0 = nsimd::device_calloc<{t}>(n); random(tab0, n, 0); {t} *ref = nsimd::device_calloc<{t}>(n); {t} *out = nsimd::device_calloc<{t}>(n); compute_result(ref, tab0, n, s); tet1d::out(out) = tet1d::{op_name}(tet1d::in(tab0, n), s); if (!cmp(ref, out, n)) {{ ret = -1; }} nsimd::device_free(ref); nsimd::device_free(out); nsimd::device_free(tab0); if (ret != 0) {{ return ret; }} }} }} return 0; }} '''.format(gpu_params=gpu_params, op_name=op_name, t=t, typnbits=t[1:],
           tpb=tpb,
           cbprng_cpu=nsimd_tests.cbprng(t, operator, 'cpu'),
           cbprng_cuda=nsimd_tests.cbprng(t, operator, 'cuda', gpu_params),
           cbprng_hip=nsimd_tests.cbprng(t, operator, 'hip', gpu_params),
           cbprng_oneapi=nsimd_tests.cbprng(t, operator, 'oneapi',
                                            ['(int)n', str(tpb)])))
    common.clang_format(opts, filename, cuda=True)

def gen_tests_for(opts, tt, t, operator):
    '''Generate the tet1d test file for one operator on type t (and output
    type tt for non-closed operators such as cvt/reinterpret).'''
    op_name = operator.name
    dirname = os.path.join(opts.tests_dir, 'modules', 'tet1d')
    common.mkdir_p(dirname)
    # Non-closed operators get a "<t>_<tt>" suffix in the file name.
    filename = os.path.join(dirname,
                            '{}.{}.cpp'.format(op_name,
                                               t if t == tt \
                                               else '{}_{}'.format(t, tt)))
    if not common.can_create_filename(opts, filename):
        return
    # One input buffer tab<i> per operator input parameter.
    arity = len(operator.params[1:])
    args_tabs = ', '.join(['{typ} *tab{i}'.format(typ=t, i=i) \
                           for i in range(arity)])
    args_tabs_call = ', '.join(['tab{i}'.format(i=i) \
                                for i in range(arity)])
    args_tabs_i_call = ', '.join(['tab{i}[i]'.format(i=i) \
                                  for i in range(arity)])
    args_in_tabs_call \
= ', '.join(['tet1d::in(tab{i}, n)'. \
                 format(i=i) \
                 for i in range(arity)])
    fill_tabs = '\n'.join(['{typ} *tab{i} = nsimd::device_calloc<{typ}>(n);\n' \
                           'random(tab{i}, n, {i});'.format(typ=t, i=i) \
                           for i in range(arity)])
    free_tabs = '\n'.join(['nsimd::device_free(tab{i});'. \
                           format(typ=t, i=i) for i in range(arity)])
    # f16 constants/comparisons go through the {f32_to_f16}/{f16_to_f32}
    # placeholders so each backend can substitute its own conversion helper.
    zero = '{}(0)'.format(t) if t != 'f16' else '{f32_to_f16}(0.0f)'
    one = '{}(1)'.format(t) if t != 'f16' else '{f32_to_f16}(1.0f)'
    comp_tab0_to_1 = 'tab0[i] == {}(1)'.format(t) if t != 'f16' else \
                     '{f16_to_f32}(tab0[i]) == 1.0f'
    comp_tab1_to_1 = 'tab1[i] == {}(1)'.format(t) if t != 'f16' else \
                     '{f16_to_f32}(tab1[i]) == 1.0f'
    # Build the tet1d expression under test and the matching per-element
    # reference kernel ({{p}} is later replaced by "scalar" or "gpu").
    if op_name == 'cvt':
        # Round-trip: cvt to tt then back to t must be the identity.
        tet1d_code = \
            '''tet1d::out(out) = tet1d::cvt<{t}>(tet1d::cvt<{tt}>( tet1d::in(tab0, n)));'''. \
            format(t=t, tt=tt)
        compute_result_kernel = \
            '''dst[i] = nsimd::{{p}}_cvt({t}(), nsimd::{{p}}_cvt( {tt}(), tab0[i]));'''.format(t=t, tt=tt)
    elif op_name == 'reinterpret':
        # Round-trip reinterpret is also the identity.
        tet1d_code = \
            '''tet1d::out(out) = tet1d::reinterpret<{t}>( tet1d::reinterpret<{tt}>(tet1d::in( tab0, n)));'''.format(t=t, tt=tt)
        compute_result_kernel = \
            '''dst[i] = nsimd::{{p}}_reinterpret({t}(), nsimd::{{p}}_reinterpret({tt}(), tab0[i]));'''.format(t=t, tt=tt)
    elif op_name in ['to_mask', 'to_logical']:
        tet1d_code = \
            '''tet1d::out(out) = tet1d::to_mask(tet1d::to_logical(tet1d::in( tab0, n)));'''
        compute_result_kernel = \
            '''dst[i] = nsimd::{{p}}_to_mask({t}(), nsimd::{{p}}_to_logical(tab0[i]));'''. \
            format(t=t)
    elif operator.params == ['v'] * len(operator.params):
        # Purely vector operator: use the infix C++ operator when available.
        compute_result_kernel = \
            'dst[i] = nsimd::{{p}}_{op_name}({args_tabs_i_call});'. \
            format(op_name=op_name, args_tabs_i_call=args_tabs_i_call)
        if operator.cxx_operator != None:
            if len(operator.params[1:]) == 1:
                tet1d_code = 'tet1d::out(out) = {cxx_op}tet1d::in(tab0, n);'. \
                             format(cxx_op=operator.cxx_operator)
            else:
                tet1d_code = 'tet1d::out(out) = tet1d::in(tab0, n) {cxx_op} ' \
                             'tet1d::in(tab1, n);'. \
                             format(cxx_op=operator.cxx_operator)
        else:
            tet1d_code = \
                'tet1d::out(out) = tet1d::{op_name}({args_in_tabs_call});'. \
                format(op_name=op_name, args_in_tabs_call=args_in_tabs_call)
    elif operator.params == ['l', 'v', 'v']:
        # Comparison operator: select output elements through a mask.
        if operator.cxx_operator != None:
            cond = 'A {} B'.format(operator.cxx_operator)
        else:
            cond = 'tet1d::{}(A, B)'.format(op_name)
        tet1d_code = \
            '''TET1D_OUT({typ}) Z = tet1d::out(out); TET1D_IN({typ}) A = tet1d::in(tab0, n); TET1D_IN({typ}) B = tet1d::in(tab1, n); Z({cond}) = 1;'''.format(cond=cond, typ=t)
        compute_result_kernel = \
            '''if (nsimd::{{p}}_{op_name}(tab0[i], tab1[i])) {{{{ dst[i] = {one}; }}}} else {{{{ dst[i] = {zero}; }}}}'''.format(op_name=op_name, typ=t, one=one, zero=zero)
    elif operator.params == ['l'] * len(operator.params):
        # Logical-only operator: inputs are synthesized as "tab == 1".
        if len(operator.params[1:]) == 1:
            if operator.cxx_operator != None:
                cond = '{}(A == 1)'.format(operator.cxx_operator)
            else:
                cond = 'tet1d::{}(A == 1)'.format(op_name)
            tet1d_code = \
                '''TET1D_OUT({typ}) Z = tet1d::out(out); TET1D_IN({typ}) A = tet1d::in(tab0, n); Z({cond}) = 1;'''.format(cond=cond, typ=t)
            compute_result_kernel = \
                '''if (nsimd::{{p}}_{op_name}({comp_tab0_to_1})) {{{{ dst[i] = {one}; }}}} else {{{{ dst[i] = {zero}; }}}}'''.format(op_name=op_name, typ=t, one=one, zero=zero, comp_tab0_to_1=comp_tab0_to_1)
        if len(operator.params[1:]) == 2:
            if operator.cxx_operator != None:
                cond = '(A == 1) {} (B == 1)'.format(operator.cxx_operator)
            else:
                cond = 'tet1d::{}(A == 1, B == 1)'.format(op_name)
            tet1d_code = \
                '''TET1D_OUT({typ}) Z = tet1d::out(out); TET1D_IN({typ}) A = tet1d::in(tab0, n); TET1D_IN({typ}) B = tet1d::in(tab1, n); Z({cond}) = 1;'''.format(cond=cond, typ=t)
            compute_result_kernel = \
                '''if (nsimd::{{p}}_{op_name}({comp_tab0_to_1}, {comp_tab1_to_1})) {{{{ dst[i] = {one}; }}}} else {{{{ dst[i] = {zero}; }}}}'''.format(op_name=op_name, typ=t, one=one, zero=zero, comp_tab0_to_1=comp_tab0_to_1, comp_tab1_to_1=comp_tab1_to_1)
    else:
        raise Exception('Unsupported operator: "{}"'.format(op_name))
    # Specialize the reference kernel for each backend.
    cpu_kernel = compute_result_kernel.format(p='scalar',
                                              f32_to_f16='nsimd_f32_to_f16',
                                              f16_to_f32='nsimd_f16_to_f32')
    cuda_rocm_kernel = compute_result_kernel.format(p='gpu',
                                                    f32_to_f16='__float2half',
                                                    f16_to_f32='__half2float')
    oneapi_kernel = compute_result_kernel.format(p='gpu', f32_to_f16='(f16)',
                                                 f16_to_f32='(f32)')
    # Integer types compare exactly; floats use the operator's ulp budget.
    comp = '!cmp(ref, out, n{})'.format('' if t in common.iutypes \
                                        else ', {}'.format(operator.ufp[t]))
    with common.open_utf8(opts, filename) as out:
        out.write(
            '''#include #include #include "../common.hpp" #if defined(NSIMD_CUDA) __global__ void kernel({typ} *dst, {args_tabs}, int n) {{ int i = threadIdx.x + blockIdx.x * blockDim.x; if (i < n) {{ {cuda_rocm_kernel} }} }} void compute_result({typ} *dst, {args_tabs}, unsigned int n) {{ kernel<<<{gpu_params}>>>(dst, {args_tabs_call}, int(n)); }} {cbprng_cuda} #elif defined(NSIMD_ROCM) __global__ void kernel({typ} *dst, {args_tabs}, size_t n) {{ size_t i = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x; if (i < n) {{ {cuda_rocm_kernel} }} }} void compute_result({typ} *dst, {args_tabs}, size_t n) {{ hipLaunchKernelGGL(kernel, {gpu_params}, 0, 0, dst, {args_tabs_call}, n); }} {cbprng_hip} #elif defined(NSIMD_ONEAPI) inline void kernel({typ} *dst, {args_tabs}, const size_t n, sycl::nd_item<1> item) {{ size_t i = item.get_global_id().get(0); if (i < n) {{ {oneapi_kernel} }} }} void compute_result({typ} *dst, {args_tabs}, const size_t n) {{ size_t total_num_threads = (size_t)nsimd_kernel_param((int)n, {tpb}); sycl::queue q_ = nsimd::oneapi::default_queue(); q_.parallel_for(sycl::nd_range<1>(sycl::range<1>(total_num_threads), sycl::range<1>({tpb})), [=](sycl::nd_item<1> item){{ kernel(dst, {args_tabs_call}, n, item); }}).wait_and_throw(); }} {cbprng_oneapi} #else void compute_result({typ} *dst, {args_tabs}, unsigned int n) {{ for (unsigned int i = 0; i < n; i++) {{ {cpu_kernel} }} }} {cbprng_cpu} #endif int main() {{ unsigned int n_[3] = {{ 10, 1001, 10001 }}; for (int i = 0; i < (int)(sizeof(n_) / sizeof(int)); 
i++) {{ unsigned int n = n_[i]; int ret = 0; {fill_tabs} {typ} *ref = nsimd::device_calloc<{typ}>(n); {typ} *out = nsimd::device_calloc<{typ}>(n); compute_result(ref, {args_tabs_call}, n); {tet1d_code} if ({comp}) {{ ret = -1; }} nsimd::device_free(ref); nsimd::device_free(out); {free_tabs} if (ret != 0) {{ return ret; }} }} return 0; }} '''.format(typ=t, args_tabs=args_tabs, fill_tabs=fill_tabs,
           args_tabs_call=args_tabs_call, gpu_params=gpu_params,
           free_tabs=free_tabs, tet1d_code=tet1d_code, comp=comp,
           cpu_kernel=cpu_kernel, tpb=tpb,
           cuda_rocm_kernel=cuda_rocm_kernel, oneapi_kernel=oneapi_kernel,
           cbprng_cpu=nsimd_tests.cbprng(t, operator, 'cpu'),
           cbprng_cuda=nsimd_tests.cbprng(t, operator, 'cuda', gpu_params),
           cbprng_hip=nsimd_tests.cbprng(t, operator, 'hip', gpu_params),
           cbprng_oneapi=nsimd_tests.cbprng(t, operator, 'oneapi',
                                            ['(int)n', str(tpb)])))
    common.clang_format(opts, filename, cuda=True)

def gen_tests(opts):
    '''Generate every tet1d test: one C++ file per (operator, type) pair,
    with a dedicated driver for the shift operators.'''
    for op_name, operator in operators.operators.items():
        if not operator.has_scalar_impl:
            continue
        for t in operator.types:
            tts = common.get_output_types(t, operator.output_to)
            for tt in tts:
                if not nsimd_tests.should_i_do_the_test(operator, tt, t):
                    continue
                if operator.name in ['shl', 'shr', 'shra']:
                    gen_tests_for_shifts(opts, t, operator)
                else:
                    gen_tests_for(opts, tt, t, operator)

# -----------------------------------------------------------------------------

def gen_functions(opts):
    '''Emit the tet1d expression-template node types (continues past this
    view).'''
    functions = ''
    for op_name, operator in operators.operators.items():
        if not operator.has_scalar_impl:
            continue
        not_closed = is_not_closed(operator)
        not_closed_tmpl_args = 'typename ToType, ' if not_closed else ''
        not_closed_tmpl_params = 'ToType' if not_closed else 'none_t'
        if op_name in ['shl', 'shr', 'shra']:
            tmpl_args = 'typename Left'
            tmpl_params = 'Left, none_t, none_t'
            size = 'return left.size();'
            args = 'Left const &left, int s'
            members = 'Left left; int s;'
            members_assignment = 'ret.left = to_node(left); ret.s = s;'
            to_node_type = 'typename to_node_t::type, none_t, 
none_t' elif len(operator.params) == 2: tmpl_args = not_closed_tmpl_args + 'typename Left' tmpl_params = 'Left, none_t, ' + not_closed_tmpl_params size = 'return left.size();' args = 'Left const &left' members = 'Left left;' members_assignment = 'ret.left = to_node(left);' to_node_type = 'typename to_node_t::type, none_t, none_t' elif len(operator.params) == 3: tmpl_args = 'typename Left, typename Right' tmpl_params = 'Left, Right, none_t' size = 'return compute_size(left.size(), right.size());' args = 'Left const &left, Right const &right' members = 'Left left;\nRight right;' members_assignment = '''ret.left = to_node(left); ret.right = to_node(right);''' to_node_type = 'typename to_node_t::type, ' \ 'typename to_node_t::type, none_t' elif len(operator.params) == 4: tmpl_args = 'typename Left, typename Right, typename Extra' tmpl_params = 'Left, Right, Extra' size = \ 'return compute_size(left.size(), right.size(), extra.size());' args = 'Left const &left, Right const &right, Extra const &extra' members = 'Left left;\nRight right;\nExtra extra;' members_assignment = '''ret.left = to_node(left); ret.right = to_node(right); ret.extra = to_node(extra);''' to_node_type = 'typename to_node_t::type, ' \ 'typename to_node_t::type, ' \ 'typename to_node_t::type' if operator.returns == 'v': to_pack = 'to_pack_t' return_type = 'out_type' else: to_pack = 'to_packl_t' return_type = 'bool' if not_closed: to_typ_arg = 'out_type(), ' to_typ_tmpl_arg = '::type>'. 
\ format(to_pack=to_pack) in_out_typedefs = '''typedef typename Left::out_type in_type; typedef ToType out_type;''' to_node_type = 'typename to_node_t::type, none_t, ToType' else: to_typ_arg = '' if op_name != 'to_mask' else 'out_type(), ' to_typ_tmpl_arg = '' in_out_typedefs = '''typedef typename Left::out_type in_type; typedef typename Left::out_type out_type;''' impl_args = 'left.{cpu_gpu}_get{tmpl}(i)' if (len(operator.params[1:]) >= 2): if operator.params[2] == 'p': impl_args += ', s' else: impl_args += ', right.{cpu_gpu}_get{tmpl}(i)' if (len(operator.params[1:]) >= 3): impl_args += ', extra.{cpu_gpu}_get{tmpl}(i)' impl_scalar = 'return nsimd::scalar_{}({}{});'. \ format(op_name, to_typ_arg, impl_args.format(cpu_gpu='scalar', tmpl='')) impl_gpu = 'return nsimd::gpu_{}({}{});'. \ format(op_name, to_typ_arg, impl_args.format(cpu_gpu='gpu', tmpl='')) impl_simd = 'return nsimd::{}{}({});'. \ format(op_name, to_typ_tmpl_arg, impl_args.format(cpu_gpu='template simd', tmpl='')) functions += \ '''struct {op_name}_t {{}}; template <{tmpl_args}> struct node<{op_name}_t, {tmpl_params}> {{ {in_out_typedefs} {members} nsimd::nat size() const {{ {size} }} #if defined(NSIMD_CUDA) || defined(NSIMD_ROCM) __device__ {return_type} gpu_get(nsimd::nat i) const {{ {impl_gpu} }} #elif defined(NSIMD_ONEAPI) {return_type} gpu_get(nsimd::nat i) const {{ {impl_gpu} }} #else {return_type} scalar_get(nsimd::nat i) const {{ {impl_scalar} }} template typename {to_pack}::type simd_get(nsimd::nat i) const {{ {impl_simd} }} #endif }}; template<{tmpl_args}> node<{op_name}_t, {to_node_type}> {op_name}({args}) {{ node<{op_name}_t, {to_node_type}> ret; {members_assignment} return ret; }}'''.format(op_name=op_name, tmpl_args=tmpl_args, size=size, tmpl_params=tmpl_params, return_type=return_type, args=args, to_pack=to_pack, to_node_type=to_node_type, members=members, members_assignment=members_assignment, in_out_typedefs=in_out_typedefs, impl_gpu=impl_gpu, impl_scalar=impl_scalar, 
impl_simd=impl_simd) if operator.cxx_operator != None and len(operator.params) == 2: functions += \ ''' template node<{op_name}_t, node, none_t, none_t> operator{cxx_operator}(node const &node) {{ return tet1d::{op_name}(node); }}'''.format(op_name=op_name, cxx_operator=operator.cxx_operator); if operator.cxx_operator != None and len(operator.params) == 3: functions += ''' template node<{op_name}_t, node, node::in_type>, none_t> operator{cxx_operator}(node const &node, T a) {{ typedef typename tet1d::node::in_type S; return tet1d::{op_name}(node, literal_to::impl(a)); }} template node<{op_name}_t, node::in_type>, node, none_t> operator{cxx_operator}(T a, node const &node) {{ typedef typename tet1d::node::in_type S; return tet1d::{op_name}(literal_to::impl(a), node); }} template node<{op_name}_t, node, node, none_t> operator{cxx_operator}(node const &left, node const &right) {{ return tet1d::{op_name}(left, right); }}'''.format(op_name=op_name, cxx_operator=operator.cxx_operator); functions += '\n\n{}\n\n'.format(common.hbar) # Write the code to file dirname = os.path.join(opts.include_dir, 'modules', 'tet1d') common.mkdir_p(dirname) filename = os.path.join(dirname, 'functions.hpp') if not common.can_create_filename(opts, filename): return with common.open_utf8(opts, filename) as out: out.write('#ifndef NSIMD_MODULES_TET1D_FUNCTIONS_HPP\n') out.write('#define NSIMD_MODULES_TET1D_FUNCTIONS_HPP\n\n') out.write('namespace tet1d {\n\n') out.write('{}\n\n'.format(common.hbar)) out.write(functions) out.write('} // namespace tet1d\n\n') out.write('#endif\n') common.clang_format(opts, filename) # ----------------------------------------------------------------------------- def name(): return 'Tiny expression templates 1D' def desc(): return '''This module provide a thin layer of expression templates above NSIMD core. It also allows the programmer to target Intel, NVIDIA and AMD GPUs. 
def doit(opts):
    """Entry point of the tet1d module generator.

    Runs the generation passes selected on the command line: the header
    library, the tests and/or the documentation.
    """
    common.myprint(opts, 'Generating module tet1d')
    # (flag, passes-to-run) pairs, executed in the historical order.
    passes = [(opts.library, (gen_functions,)),
              (opts.tests, (gen_tests,)),
              (opts.doc, (gen_doc_api, gen_doc_overview))]
    for enabled, funcs in passes:
        if not enabled:
            continue
        for f in funcs:
            f(opts)
def get_impl_f16(operator, totyp, typ):
    """Return the C++ body of the oneAPI scalar implementation of
    `operator` for the f16 (sycl::half) type.

    The returned string is a C++ snippet whose `{in0}`/`{in1}`/`{in2}`
    placeholders are resolved through the module-level `fmtspec` dict
    (filled in by get_impl() before dispatching here). The code emitted
    falls into one of five cases depending on what SYCL provides for
    half precision.

    NOTE(review): the `<f32>` template arguments of the static_cast's
    below had been lost; they are restored here so that the generated
    C++ compiles (the surrounding comments document that conversions go
    through f32).
    """
    # Case 1: rounding functions
    #   no sycl function available for half type
    #   sycl function available for f32
    #   use sycl defined conversions half --> f32 , f32 --> half
    # Case 2: no sycl function available for half type
    #   sycl function available for f32
    #   use nsimd casts f32-->f16 + sycl function + f16-->f32
    no_sycl_avail_f16_cast_use_sycl_f32 = \
        ['fma', 'fms', 'fnma', 'fnms', 'min', 'max', 'abs']
    # Case 3: sycl provides functions supporting half type
    sycl_avail_functions_f16 = \
        ['rec', 'rec8', 'rec11', 'rsqrt8', 'rsqrt11', 'rsqrt', 'sqrt']
    # Case 4: sycl half's type provided comparison operators
    # Note:
    # not documented in the book
    # source: sycl half type (f16) API:
    # https://mmha.github.io/syclreference/libraries/types/half/
    sycl_avail_cmp_op_f16 = {
        'lt': 'return {in0} < {in1};',
        'gt': 'return {in0} > {in1};',
        'le': 'return {in0} <= {in1};',
        'ge': 'return {in0} >= {in1};',
        'ne': 'return {in0} != {in1};',
        'eq': 'return {in0} == {in1};'
    }
    # Case 5: no sycl function available for any type
    # use nsimd_scalar_[operator]_f16

    # Dispatch
    # Case 1
    if operator.name in ['floor', 'ceil', 'trunc']:
        return 'return f16(sycl::{op}(static_cast<f32>({in0})));'.\
               format(op=operator.name, **fmtspec)
    elif operator.name == 'round_to_even':
        return 'return f16(sycl::rint(static_cast<f32>({in0})));'.\
               format(**fmtspec)
    # Case 2
    elif operator.name in no_sycl_avail_f16_cast_use_sycl_f32:
        if operator.name in ['fma', 'fms', 'fnma', 'fnms']:
            # fnma/fnms negate the product, fms/fnms negate the addend.
            neg = '-' if operator.name in ['fnma', 'fnms'] else ''
            op = '-' if operator.name in ['fnms', 'fms'] else ''
            return '''// cl::sycl::half::operator float
                      f32 x0 = static_cast<f32>({in0});
                      f32 x1 = static_cast<f32>({in1});
                      f32 x2 = static_cast<f32>({in2});
                      f32 res = sycl::fma({neg}x0, x1, {op}x2);
                      // cl::sycl::half::half(const float& f)
                      return f16(res);'''.format(neg=neg, op=op, **fmtspec)
        elif operator.name in ['min', 'max']:
            op = 'fmin' if operator.name == 'min' else 'fmax'
            return '''// cl::sycl::half::operator float
                      f32 x0 = static_cast<f32>({in0});
                      f32 x1 = static_cast<f32>({in1});
                      f32 res = sycl::{op}(x0, x1);
                      // cl::sycl::half::half(const float& f)
                      return f16(res);'''.format(op=op, **fmtspec)
        elif operator.name == 'abs':
            return '''// cl::sycl::half::operator float
                      f32 x0 = static_cast<f32>({in0});
                      f32 res = sycl::fabs(x0);
                      // cl::sycl::half::half(const float& f)
                      return f16(res);'''.format(**fmtspec)
    # Case 3
    elif operator.name in sycl_avail_functions_f16:
        if operator.name in ['rec8', 'rec11', 'rec']:
            return '''// sycl::recip available in native form only
                      // availability in half-precision
                      return f16(1.0f / {in0});'''.format(**fmtspec)
        elif operator.name in ['rsqrt8', 'rsqrt11', 'rsqrt']:
            return 'return sycl::rsqrt({in0});'.format(**fmtspec)
        elif operator.name == 'sqrt':
            return 'return sycl::sqrt({in0});'.format(**fmtspec)
    # Case 4
    elif operator.name in sycl_avail_cmp_op_f16:
        return sycl_avail_cmp_op_f16[operator.name].format(**fmtspec)
    # Case 5
    else:
        args = ', '.join(['{{in{}}}'.format(i).format(**fmtspec) \
                          for i in range(len(operator.params[1:]))])
        return 'return nsimd_scalar_{op}_f16({args});'.\
               format(op=operator.name, args=args)
def get_impl(operator, totyp, typ):
    """Return the C++ body of the oneAPI scalar implementation of
    `operator` from type `typ` to type `totyp`.

    Fills the module-level `fmtspec` dict (also read by the helpers
    get_impl_f16() and reinterpret()) and dispatches on the operator
    name / type to pick between native SYCL built-ins and the generic
    nsimd_scalar_* fallbacks.
    """
    global fmtspec
    fmtspec = {
        'in0': common.in0,
        'in1': common.in1,
        'in2': common.in2,
        'typ': typ,
        'totyp': totyp,
        'typnbits': typ[1:]
    }
    # src operators: math functions normally generated from Sleef are
    # mapped to their SYCL equivalents (accuracy suffixes are dropped).
    if operator.src:
        oneapi_ops = {
            'sin_u35': 'sin', 'cos_u35': 'cos', 'tan_u35': 'tan',
            'asin_u35': 'asin', 'acos_u35': 'acos', 'atan_u35': 'atan',
            'atan2_u35': 'atan2', 'log_u35': 'log', 'cbrt_u35': 'cbrt',
            'sin_u10': 'sin', 'cos_u10': 'cos', 'tan_u10': 'tan',
            'asin_u10': 'asin', 'acos_u10': 'acos', 'atan_u10': 'atan',
            'atan2_u10': 'atan2', 'log_u10': 'log', 'cbrt_u10': 'cbrt',
            'exp_u10': 'exp', 'pow_u10': 'pow', 'sinh_u10': 'sinh',
            'cosh_u10': 'cosh', 'tanh_u10': 'tanh', 'sinh_u35': 'sinh',
            'cosh_u35': 'cosh', 'tanh_u35': 'tanh',
            'fastsin_u3500': 'sin', 'fastcos_u3500': 'cos',
            'fastpow_u3500': 'pow', 'asinh_u10': 'asinh',
            'acosh_u10': 'acosh', 'atanh_u10': 'atanh',
            'exp2_u10': 'exp2', 'exp2_u35': 'exp2',
            'exp10_u10': 'exp10', 'exp10_u35': 'exp10',
            'expm1_u10': 'expm1', 'log10_u10': 'log10',
            'log2_u10': 'log2', 'log2_u35': 'log2',
            'log1p_u10': 'log1p', 'sinpi_u05': 'sinpi',
            'cospi_u05': 'cospi', 'hypot_u05': 'hypot',
            'hypot_u35': 'hypot', 'remainder': 'remainder',
            'fmod': 'fmod', 'lgamma_u10': 'lgamma',
            'tgamma_u10': 'tgamma', 'erf_u10': 'erf', 'erfc_u15': 'erfc'
        }
        return 'return cl::sycl::{}({});'.format(
                   oneapi_ops[operator.name],
                   common.get_args(len(operator.params[1:])))
    # bool first, no special treatment for f16's
    bool_operators = ['andl', 'orl', 'xorl', 'andnotl', 'notl']
    if operator.name in bool_operators:
        if operator.name == 'notl':
            return 'return nsimd_scalar_{op}({in0});'.\
                   format(op=operator.name, **fmtspec)
        else:
            return 'return nsimd_scalar_{op}({in0}, {in1});'.\
                   format(op=operator.name, **fmtspec)
    # infix operators no special treatment for f16's
    infix_operators = ['orb', 'andb', 'andnotb', 'notb', 'xorb']
    if operator.name in infix_operators:
        if operator.name == 'notb':
            return 'return nsimd_scalar_{op}_{typ}({in0});'.\
                   format(op=operator.name, **fmtspec)
        else:
            return 'return nsimd_scalar_{op}_{typ}({in0}, {in1});'.\
                   format(op=operator.name, **fmtspec)
    # reinterpret
    if operator.name == 'reinterpret':
        return reinterpret(totyp, typ)
    # cvt
    if operator.name == 'cvt':
        if 'f16' == totyp:
            # conversion op: takes in a 32 bit float and converts it to
            # 16 bits. NOTE(review): the <f32> template argument of the
            # static_cast had been lost; restored so the generated C++
            # compiles.
            return 'return sycl::half(static_cast<f32>({in0}));'. \
                   format(**fmtspec)
        else:
            return 'return nsimd_scalar_cvt_{totyp}_{typ}({in0});'. \
                   format(**fmtspec)
    # to_mask
    if operator.name == 'to_mask':
        return 'return nsimd_scalar_to_mask_{totyp}({in0});'.format(**fmtspec)
    # to_logical
    if operator.name == 'to_logical':
        return 'return nsimd_scalar_to_logical_{typ}({in0});'.format(**fmtspec)
    # for all other operators, f16 has a special treatment
    if typ == 'f16':
        return get_impl_f16(operator, totyp, typ)
    # infix operators - rec - f32, f64
    infix_op_rec_ftypes = ['rec', 'rec8', 'rec11']
    if typ in common.ftypes_no_f16 and operator.name in infix_op_rec_ftypes:
        return '''// sycl::recip available in native form only
                  return 1.0{f} / {in0};'''. \
               format(f='f' if typ == 'f32' else '', **fmtspec)
    # infix operators - cmp - f32, f64
    # sycl::is* return int for f64 (vector relational semantics), hence
    # the cast back to int for the f64 case only.
    infix_op_cmp_f32_f64 = {
        'lt': 'return {cast_to_int}sycl::isless({in0}, {in1});',
        'gt': 'return {cast_to_int}sycl::isgreater({in0}, {in1});',
        'le': 'return {cast_to_int}sycl::islessequal({in0}, {in1});',
        'ge': 'return {cast_to_int}sycl::isgreaterequal({in0}, {in1});',
        'ne': 'return {cast_to_int}sycl::isnotequal({in0}, {in1});',
        'eq': 'return {cast_to_int}sycl::isequal({in0}, {in1});'
    }
    if typ in common.ftypes_no_f16 and operator.name in infix_op_cmp_f32_f64:
        return infix_op_cmp_f32_f64[operator.name]. \
               format(cast_to_int='(int)' if typ == 'f64' else '', **fmtspec)
    # infix operators - cmp - integer types
    infix_op_cmp_iutypes = ['lt', 'gt', 'le', 'ge', 'ne', 'eq']
    if operator.name in infix_op_cmp_iutypes:
        return 'return nsimd_scalar_{op}_{typ}({in0},{in1});'.\
               format(op=operator.name, **fmtspec)
    # infix operators f32, f64 + integers
    # ref: see Data Parallel C++ book, pages 480, 481, 482
    # TODO: do the functions below call instrinsics/built-in
    #       functions on the device?
    #       'add': 'return std::plus<{typ}>()({in0}, {in1});',
    #       'sub': 'return std::minus<{typ}>()({in0}, {in1});',
    #       'mul': 'return std::multiplies<{typ}>()({in0}, {in1});',
    #       'div': 'return std::divides<{typ}>()({in0}, {in1});',
    infix_op_t = ['add', 'sub', 'mul', 'div']
    if operator.name in infix_op_t:
        return 'return nsimd_scalar_{op}_{typ}({in0}, {in1});'. \
               format(op=operator.name, **fmtspec)
    # neg
    # ref: see Data Parallel C++ book, pages 480, 481, 482
    # TODO: does the function below call an instrinsic/built-in
    #       function on the device?
    #       'neg': 'return std::negate<{typ}>()({in0});'
    if operator.name == 'neg':
        return 'return nsimd_scalar_{op}_{typ}({in0});'. \
               format(op=operator.name, **fmtspec)
    # shifts
    shifts_op_ui_t = ['shl', 'shr', 'shra']
    if operator.name in shifts_op_ui_t and typ in common.iutypes:
        return 'return nsimd_scalar_{op}_{typ}({in0}, {in1});'. \
               format(op=operator.name, **fmtspec)
    # adds: saturated add exists only for integer types in SYCL; floats
    # fall back to the plain nsimd scalar add.
    if operator.name == 'adds':
        if typ in common.ftypes:
            return 'return nsimd_scalar_add_{typ}({in0}, {in1});'. \
                   format(**fmtspec)
        else:
            return 'return sycl::add_sat({in0}, {in1});'.format(**fmtspec)
    # subs: same policy as adds.
    if operator.name == 'subs':
        if typ in common.ftypes:
            return 'return nsimd_scalar_sub_{typ}({in0}, {in1});'. \
                   format(**fmtspec)
        else:
            return 'return sycl::sub_sat({in0}, {in1});'.format(**fmtspec)
    # fma's: fnma/fnms negate the product, fms/fnms negate the addend.
    if operator.name in ['fma', 'fms', 'fnma', 'fnms']:
        if typ in common.ftypes:
            neg = '-' if operator.name in ['fnma', 'fnms'] else ''
            op = '-' if operator.name in ['fnms', 'fms'] else ''
            return 'return sycl::fma({neg}{in0}, {in1}, {op}{in2});'. \
                   format(op=op, neg=neg, **fmtspec)
        else:
            return 'return nsimd_scalar_{op}_{typ}({in0}, {in1}, {in2});'. \
                   format(op=operator.name, **fmtspec)
    # other operators
    # round_to_even, ceil, floor, trunc, min, max, abs, sqrt

    # round_to_even: integers are already rounded.
    if operator.name == 'round_to_even':
        if typ in common.ftypes_no_f16:
            return 'return sycl::rint({in0});'.format(**fmtspec)
        else:
            return 'return {in0};'.format(**fmtspec)
    # other rounding operators
    other_rounding_ops = ['ceil', 'floor', 'trunc']
    if operator.name in other_rounding_ops:
        if typ in common.iutypes:
            return 'return nsimd_scalar_{op}_{typ}({in0});'. \
                   format(op=operator.name, **fmtspec)
        else:
            return 'return sycl::{op}({in0});'. \
                   format(op=operator.name, **fmtspec)
    # min/max
    if operator.name in ['min', 'max']:
        if typ in common.iutypes:
            return 'return sycl::{op}({in0}, {in1});'.\
                   format(op=operator.name, **fmtspec)
        else:
            op = 'sycl::fmin' if operator.name == 'min' else 'sycl::fmax'
            return 'return {op}({in0}, {in1});'.format(op=op, **fmtspec)
    # abs: sycl::abs on signed ints returns the unsigned type, hence the
    # cast back; unsigned abs is the identity via the nsimd scalar.
    if operator.name == 'abs':
        if typ in common.itypes:
            return 'return ({typ})sycl::abs({in0});'.format(**fmtspec)
        elif typ in common.utypes:
            return 'return nsimd_scalar_abs_{typ}({in0});'.format(**fmtspec)
        else:
            return 'return sycl::fabs({in0});'.format(**fmtspec)
    # sqrt
    if operator.name == 'sqrt' and typ in common.ftypes:
        return 'return sycl::sqrt({in0});'.format(**fmtspec)
    # rsqrt
    if operator.name in ['rsqrt8', 'rsqrt11', 'rsqrt'] and typ in common.ftypes:
        return 'return sycl::rsqrt({in0});'.format(**fmtspec)
# Metaclass that registers every concrete documentation-category class
# into the module-level `categories` ordered dict, keyed by class name.
class MAddToCategories(type):
    def __new__(cls, name, bases, dct):
        # The abstract base `DocCategory` carries no title and is not a
        # category itself: only concrete subclasses are checked.
        if name != 'DocCategory':
            if 'title' not in dct:
                raise Exception('No member title provided for class {}'. \
                                format(name))
        # Every class (base included) gets its name and a doc-site id
        # injected as class attributes before creation.
        dct['name'] = name
        dct['id'] = '/categories/{}'.format(name)
        ret = type.__new__(cls, name, bases, dct)
        if name != 'DocCategory':
            # Register a singleton instance of the new category class.
            categories[name] = ret()
        return ret
MAddToOperators(type): def __new__(cls, name, bases, dct): def member_is_defined(member): if member in dct: return True for bc in range(len(bases)): if member in bases[bc].__dict__: return True return False def get_member_value(member): if member in dct: return dct[member] for bc in range(len(bases)): if member in bases[bc].__dict__: return bases[bc].__dict__[member] raise Exception('Member does not exists in class {}'.format(name)) # We don't care about the parent class if name == 'Operator' or name == 'SrcOperator': return type.__new__(cls, name, bases, dct) # Mandatory members mm = ['categories', 'signature'] for m in mm: if m not in dct: raise Exception('Mandatory member "{}" not given in "{}"'. \ format(m, name)) # Check that all items in categories exists for c in dct['categories']: if type(c) == str: raise Exception( \ 'Category "{}" must not be a string for operator "{}"'. \ format(c, name)) if not hasattr(c, 'name'): raise Exception( \ 'Category "{}" does not exist for operator "{}"'. \ format(c.__class__.__name__, name)) if c.name not in categories: raise Exception( \ 'Category "{}" does not exist for operator "{}"'. \ format(c.__class__.__name__, name)) # Some defaults, that are fixed by the implementation (dct['name'], dct['params']) = common.parse_signature(dct['signature']) if 'output_to' in dct: if dct['output_to'] == common.OUTPUT_TO_SAME_TYPE: dct['closed'] = True else: dct['closed'] = False else: dct['closed'] = True dct['output_to'] = common.OUTPUT_TO_SAME_TYPE # If the operator takes as inputs vectors and returns a scalar, then # by default we cannot autogenerate the C++ advanced API because we # cannot guess how to combine pieces of a unrolled pack if 'autogen_cxx_adv' not in dct: if dct['params'][0] in ['p', 's']: dct['autogen_cxx_adv'] = False else: dct['autogen_cxx_adv'] = True # By default tests are done on random numbers depending on the type # but sometimes one needs to produce only integers even if the # type is a floating point type. 
if 'tests_on_integers_only' not in dct: dct['tests_on_integers_only'] = False; # Fill domain, default is [-20 ; +20] if 'domain' not in dct: dct['domain'] = [[-20, 20], [-20, 20], [-20, 20]] # Number of UFP (cf. documentation) for testing if 'ufp' not in dct: dct['ufp'] = {'f16': 8, 'f32': 18, 'f64': 45} # Check that params is not empty if len(dct['params']) == 0: raise Exception('"params" is empty for operator "{}"'. \ format(name)) # Fill full_name, default is same as name if 'full_name' not in dct: dct['full_name'] = name # Fill desc, default is a basic sentence using full_name if 'desc' not in dct: arg = 'arguments' if len(dct['params']) > 2 else 'argument' if dct['params'][0] == '_': dct['desc'] = '{} the {}.'. \ format(dct['full_name'].capitalize(), arg) else: dct['desc'] = 'Returns the {} of the {}.'.\ format(dct['full_name'], arg) # Fill src, default is operator is in header not in source if not member_is_defined('src'): dct['src'] = False # Fill load_store, default is operator is not for loading/storing if 'load_store' not in dct: dct['load_store'] = False # Fill has_scalar_impl, default is based on several properties if 'has_scalar_impl' not in dct: if DocShuffle in dct['categories'] or \ DocMisc in dct['categories'] or \ 'vx2' in dct['params'] or \ 'vx3' in dct['params'] or \ 'vx4' in dct['params'] or \ dct['output_to'] in [common.OUTPUT_TO_UP_TYPES, common.OUTPUT_TO_DOWN_TYPES] or \ dct['load_store']: dct['has_scalar_impl'] = False else: dct['has_scalar_impl'] = True ret = type.__new__(cls, name, bases, dct) operators[dct['name']] = ret() return ret class Operator(object, metaclass=MAddToOperators): # Default values (for general purpose) cxx_operator = None autogen_cxx_adv = True output_to = common.OUTPUT_TO_SAME_TYPE types = common.types params = [] aliases = [] signature = '' # Enable bench by default do_bench = True # Default values (for documentation) desc = '' # Defaults values (for benches) returns_any_type = False bench_auto_against_cpu = True 
bench_auto_against_mipp = False bench_auto_against_sleef = False bench_auto_against_std = False use_for_parsing = True @property def returns(self): return self.params[0] @property def args(self): return self.params[1:] def __init__(self): (self.name, self.params) = common.parse_signature(self.signature) super(Operator, self).__init__() def get_return(self): return self.params[0] def tests_mpfr_name(self): return 'mpfr_' + self.name def bench_mipp_name(self, typ): return 'mipp::{}<{}>'.format(self.name, typ) def bench_mipp_types(self): return common.ftypes_no_f16 def bench_sleef_name(self, simd, typ): return common.sleef_name(self.name, simd, typ) def bench_sleef_types(self): return common.ftypes_no_f16 def bench_std_name(self, simd, typ): return 'std::{}'.format(self.name) def bench_std_types(self): return self.types # TODO: move to gen_archis.py def get_header_guard(self, platform, simd_ext): return 'NSIMD_{}_{}_{}_H'.format(platform.upper(), simd_ext.upper(), self.name.upper()) def get_fmtspec(self, t, tt, simd_ext): ret = {} return_typ = common.get_one_type_specific(self.params[0], simd_ext, tt) ret['return_typ'] = return_typ ret['returns'] = '' if return_typ == 'void' else 'return ' args_list = common.enum([common.get_one_type_specific(p, simd_ext, t) for p in self.params[1:]]) if len(args_list) > 0: ret['c_args'] = ', '.join(['{} a{}'.format(i[1], i[0]) for i in args_list]) ret['cxx_args'] = ret['c_args'] + ', ' else: ret['c_args'] = 'void' ret['cxx_args'] = '' if self.closed: ret['cxx_args'] += '{}, {}'.format(t, simd_ext) else: ret['cxx_args'] += '{}, {}, {}'.format(t, tt, simd_ext) ret['vas'] = ', '.join(['a{}'.format(i[0]) for i in args_list]) ret['suf'] = tt if self.closed else '{}_{}'.format(tt, t) ret['name'] = self.name ret['hbar'] = common.hbar ret['simd_ext'] = simd_ext if self.src and 'sleef_symbol_prefix' in self.__class__.__dict__: ret['sleef_symbol_prefix'] = self.sleef_symbol_prefix return ret def get_generic_signature(self, lang): if lang == 
'c_base': vas = common.get_args(len(self.params) - 1) args = vas + (', ' if vas != '' else '') args += 'from_type, to_type' if not self.closed else 'type' return ['#define v{name}({args})'.format(name=self.name, args=args), '#define v{name}_e({args}, simd_ext)'. \ format(name=self.name, args=args)] elif lang == 'c_adv': args = ['a{}'.format(i - 1) for i in range(1, len(self.params))] if not self.closed: args = ['to_type'] + args args = ', '.join(args) return '#define nsimd_{}({})'.format(self.name, args) elif lang == 'cxx_base': def get_type(param, typename): if param == '_': return 'void' elif param == 'p': return 'int' elif param == 's': return typename elif param == '*': return '{}*'.format(typename) elif param == 'c*': return '{} const*'.format(typename) elif param == 'vi': return 'typename simd_traits::itype,' \ ' NSIMD_SIMD>::simd_vector'.format(typename) elif param == 'l': return \ 'typename simd_traits<{}, NSIMD_SIMD>::simd_vectorl'. \ format(typename) elif param.startswith('v'): return \ 'typename simd_traits<{}, NSIMD_SIMD>::simd_vector{}'. 
\ format(typename, param[1:]) else: raise ValueError("Unknown param '{}'".format(param)) return_typ = get_type(self.params[0], 'T') args_list = common.enum(self.params[1:]) if not self.closed : tmpl_args = 'NSIMD_CONCEPT_VALUE_TYPE F, ' \ 'NSIMD_CONCEPT_VALUE_TYPE T' typename = 'F' else: tmpl_args = 'NSIMD_CONCEPT_VALUE_TYPE T' typename = 'T' temp = ', '.join(['{} a{}'.format(get_type(a[1], typename), a[0]) for a in args_list]) temp += ', ' if temp != '' else '' if not self.closed: func_args = temp + 'F, T' if self.output_to == common.OUTPUT_TO_SAME_SIZE_TYPES: cxx20_require = \ 'NSIMD_REQUIRES(sizeof_v == sizeof_v) ' elif self.output_to == common.OUTPUT_TO_UP_TYPES: cxx20_require = \ 'NSIMD_REQUIRES(2 * sizeof_v == sizeof_v) ' else: cxx20_require = \ 'NSIMD_REQUIRES(sizeof_v == 2 * sizeof_v) ' else: func_args = temp + 'T' cxx20_require = '' return 'template <{tmpl_args}> {cxx20_require}{return_typ} ' \ 'NSIMD_VECTORCALL {name}({func_args});'. \ format(return_typ=return_typ, tmpl_args=tmpl_args, func_args=func_args, name=self.name, cxx20_require=cxx20_require) elif lang == 'cxx_adv': def get_type(param, typename, N): if param == '_': return 'void' elif param == 'p': return 'int' elif param == 's': return typename elif param == '*': return '{}*'.format(typename) elif param == 'c*': return '{} const*'.format(typename) elif param == 'vi': return 'pack::itype, {}, SimdExt>'. \ format(typename, N) elif param == 'l': return 'packl<{}, {}, SimdExt>'.format(typename, N) elif param.startswith('v'): return 'pack{}<{}, {}, SimdExt>'. \ format(param[1:], typename, N) else: raise ValueError("Unknown param '{}'".format(param)) args_list = common.enum(self.params[1:]) # Do we need tag dispatching on pack<>? e.g. 
len, set1 and load* inter = [i for i in ['v', 'l', 'vi', 'vx2', 'vx3', 'vx4'] \ if i in self.params[1:]] tag_dispatching = (inter == []) # Compute template arguments tmpl_args1 = ['NSIMD_CONCEPT_VALUE_TYPE T', 'NSIMD_CONCEPT_SIMD_EXT SimdExt'] tmpl_argsN = ['NSIMD_CONCEPT_VALUE_TYPE T', 'int N', 'NSIMD_CONCEPT_SIMD_EXT SimdExt'] def get_PACK(arg): if arg == 'l': return 'PACKL' elif arg == 'v': return 'PACK' else: return 'PACK{}'.format(arg[1:].upper()) if not self.closed: tmpl = 'NSIMD_CONCEPT_{} ToPackType'. \ format(get_PACK(self.params[0])) tmpl_args1 = [tmpl] + tmpl_args1 tmpl_argsN = [tmpl] + tmpl_argsN tmpl_args1 = ', '.join(tmpl_args1) tmpl_argsN = ', '.join(tmpl_argsN) # Compute function arguments def arg_type(arg, typename, N): if arg in ['v', 'vi', 'vx2', 'vx3', 'vx4', 'l']: return '{} const&'.format(get_type(arg, typename, N)) else: return get_type(arg, typename, N) args1 = ['{} a{}'.format(arg_type(i[1], 'T', '1'), i[0]) \ for i in args_list] argsN = ['{} a{}'.format(arg_type(i[1], 'T', 'N'), i[0]) \ for i in args_list] # Arguments without tag dispatching on pack other_argsN = ', '.join(argsN) # If we need tag dispatching, then the first argument type # is the output type: # 1. If not closed, then the output type is ToPackType # 2. 
If closed, then the output type is pack if not self.closed: args1 = ['ToPackType const&'] + args1 argsN = ['ToPackType const&'] + argsN elif tag_dispatching: args1 = [arg_type(self.params[0], 'T', '1')] + args1 argsN = [arg_type(self.params[0], 'T', 'N')] + argsN args1 = ', '.join(args1) argsN = ', '.join(argsN) # Compute return type if not self.closed: ret1 = 'ToPackType' retN = 'ToPackType' else: ret1 = get_type(self.params[0], 'T', '1') retN = get_type(self.params[0], 'T', 'N') # For non closed operators that need tag dispatching we have a # require clause cxx20_require = '' if not self.closed: tmpl = 'NSIMD_REQUIRES((' \ '{}sizeof_v == ' \ '{}sizeof_v && ' \ 'ToPackType::unroll == {{}} && '\ 'std::is_same_v))' if self.output_to == common.OUTPUT_TO_SAME_SIZE_TYPES: cxx20_require = tmpl.format('', '') elif self.output_to == common.OUTPUT_TO_UP_TYPES: cxx20_require = tmpl.format('', '2 * ') else: cxx20_require = tmpl.format('2 * ', '') ret = { \ '1': 'template <{tmpl_args1}> {cxx20_require}{ret1} ' \ '{cxx_name}({args1});'. \ format(tmpl_args1=tmpl_args1, cxx20_require=cxx20_require.format('1'), ret1=ret1, args1=args1, cxx_name=self.name), 'N': 'template <{tmpl_argsN}> {cxx20_require}{retN} ' \ '{cxx_name}({argsN});'. \ format(tmpl_argsN=tmpl_argsN, cxx20_require=cxx20_require.format('N'), retN=retN, argsN=argsN, cxx_name=self.name) } if self.cxx_operator: ret.update({ \ 'op1': '''template <{tmpl_args1}> {ret1} operator{cxx_name}({args1});'''. \ format(tmpl_args1=tmpl_args1, ret1=ret1, args1=args1, cxx_name=self.cxx_operator), 'opN': '''template <{tmpl_argsN}> {retN} operator{cxx_name}({argsN});'''. \ format(tmpl_argsN=tmpl_argsN, retN=retN, argsN=argsN, cxx_name=self.cxx_operator) }) if not self.closed: ret['dispatch'] = \ 'template <{tmpl_argsN}> {cxx20_require}{retN} ' \ '{cxx_name}({other_argsN});'. 
\ format(tmpl_argsN=tmpl_argsN, cxx20_require=cxx20_require.format('N'), other_argsN=other_argsN, retN=retN, cxx_name=self.name) elif tag_dispatching: if [i for i in ['s', '*', 'c*'] if i in self.params[1:]] == []: tmpl_T = '' requires = '' else: tmpl_T = ', NSIMD_CONCEPT_VALUE_TYPE T' requires = 'NSIMD_REQUIRES((' \ 'std::is_same_v))' ret['dispatch'] = \ '''template {requires} SimdVector {cxx_name}({other_argsN});'''.format( PACK=get_PACK(self.params[0]), requires=requires, other_argsN=other_argsN, cxx_name=self.name, tmpl_T=tmpl_T) return ret else: raise Exception('Lang must be one of c_base, cxx_base, cxx_adv') def get_signature(self, typename, lang, simd_ext): # Check that the type is available for this function if typename not in self.types: raise Exception('Type {} not supported for function {}'. \ format(typename, self.name)) fmtspec = self.get_fmtspec(typename, typename, simd_ext) if lang == 'c_base': sig = '{return_typ} NSIMD_VECTORCALL ' \ 'nsimd_{name}_{simd_ext}_{suf}({c_args})'.format(**fmtspec) elif lang == 'cxx_base': sig = '{return_typ} NSIMD_VECTORCALL ' \ '{name}({cxx_args})'.format(**fmtspec) elif lang == 'cxx_adv': sig = '' raise Exception('TODO cxx_adv for {}'.format(lang)) else: raise Exception('Unknown langage {}'.format(lang)) return sig def get_scalar_signature(self, cpu_gpu, t, tt, lang): sig = '__device__ ' if cpu_gpu == 'gpu' else '' sig += common.get_one_type_scalar(self.params[0], tt) + ' ' func_name = 'nsimd_' if lang == 'c' else '' func_name += 'gpu_' if cpu_gpu in ['gpu', 'oneapi'] else 'scalar_' func_name += self.name operator_on_logicals = (self.params == ['l'] * len(self.params)) if lang == 'c' and not operator_on_logicals: func_name += '_{}_{}'.format(tt, t) if not self.closed \ else '_{}'.format(t) sig += func_name args_list = common.enum([common.get_one_type_scalar(p, t) for p in self.params[1:]]) args = ['{} a{}'.format(i[1], i[0]) for i in args_list] if lang == 'cxx' and (not self.closed or \ ('v' not in self.params[1:] and 
not operator_on_logicals)): args = [tt] + args sig += '(' + ', '.join(args) + ')' return sig class SrcOperator(Operator): src = True types = common.ftypes # ----------------------------------------------------------------------------- # List of functions/operators class Len(Operator): full_name = 'vector length' signature = 'p len' categories = [DocMisc] class Set1(Operator): full_name = 'value broadcast' signature = 'v set1 s' categories = [DocMisc] desc = 'Returns a vector whose all elements are set to the given value.' class Set1l(Operator): full_name = 'logical value broadcast' signature = 'l set1l p' categories = [DocMisc] desc = 'Returns a vector whose all elements are set to the given ' \ 'boolean value: zero means false and nonzero means true.' class Loadu(Operator): signature = 'v loadu c*' load_store = True categories = [DocLoadStore] desc = 'Load data from unaligned memory.' class MaskoLoadu1(Operator): signature = 'v masko_loadu1 l c* v' load_store = True categories = [DocLoadStore] desc = 'Load data from unaligned memory corresponding to True elements.' class MaskzLoadu1(Operator): signature = 'v maskz_loadu1 l c*' load_store = True categories = [DocLoadStore] desc = 'Load data from unaligned memory corresponding to True elements.' class Load2u(Operator): full_name = 'load array of structure' signature = 'vx2 load2u c*' load_store = True categories = [DocLoadStore] desc = 'Load array of structures of 2 members from unaligned memory.' class Load3u(Operator): full_name = 'load array of structure' signature = 'vx3 load3u c*' load_store = True categories = [DocLoadStore] desc = 'Load array of structures of 3 members from unaligned memory.' class Load4u(Operator): full_name = 'load array of structure' signature = 'vx4 load4u c*' load_store = True categories = [DocLoadStore] desc = 'Load array of structures of 4 members from unaligned memory.' 
# Aligned-memory loads (plain, masked, AoS de-interleaving and logical),
# plus the first unaligned stores. Each class only *describes* an operator;
# code generation is driven by `signature`, `categories` and `desc`.

class Loada(Operator):
    signature = 'v loada c*'
    load_store = True
    categories = [DocLoadStore]
    desc = 'Load data from aligned memory.'

class MaskoLoada(Operator):
    signature = 'v masko_loada1 l c* v'
    load_store = True
    categories = [DocLoadStore]
    # Fixed: previously said just 'Load data from aligned memory.', omitting
    # the masking semantics; now matches MaskoLoadu1/MaskzLoada wording.
    desc = 'Load data from aligned memory corresponding to True elements.'

class MaskzLoada(Operator):
    signature = 'v maskz_loada1 l c*'
    load_store = True
    categories = [DocLoadStore]
    desc = 'Load data from aligned memory corresponding to True elements.'

class Load2a(Operator):
    full_name = 'load array of structure'
    signature = 'vx2 load2a c*'
    load_store = True
    categories = [DocLoadStore]
    desc = 'Load array of structures of 2 members from aligned memory.'

class Load3a(Operator):
    full_name = 'load array of structure'
    signature = 'vx3 load3a c*'
    load_store = True
    categories = [DocLoadStore]
    desc = 'Load array of structures of 3 members from aligned memory.'

class Load4a(Operator):
    full_name = 'load array of structure'
    signature = 'vx4 load4a c*'
    load_store = True
    categories = [DocLoadStore]
    desc = 'Load array of structures of 4 members from aligned memory.'

class Loadlu(Operator):
    full_name = 'load vector of logicals'
    signature = 'l loadlu c*'
    load_store = True
    categories = [DocLoadStore]
    desc = 'Load data from unaligned memory and interpret it as booleans. ' + \
           'Zero is interpreted as False and nonzero as True.'

class Loadla(Operator):
    full_name = 'load vector of logicals'
    signature = 'l loadla c*'
    load_store = True
    categories = [DocLoadStore]
    desc = 'Load data from aligned memory and interpret it as booleans. ' + \
           'Zero is interpreted as False and nonzero as True.'

class Storeu(Operator):
    signature = '_ storeu * v'
    load_store = True
    categories = [DocLoadStore]
    desc = 'Store SIMD vector into unaligned memory.'

class MaskStoreu1(Operator):
    signature = '_ mask_storeu1 l * v'
    load_store = True
    categories = [DocLoadStore]
    desc = 'Store active SIMD vector elements into unaligned memory.'
# Array-of-structures stores (2/3/4 members) to unaligned and aligned
# memory, plus the plain/masked aligned stores.

class Store2u(Operator):
    # Fixed: full_name was missing here although every sibling
    # (Store3u/Store4u/Store2a/Store3a/Store4a) declares it.
    full_name = 'store into array of structures'
    signature = '_ store2u * v v'
    load_store = True
    categories = [DocLoadStore]
    desc = 'Store 2 SIMD vectors as array of structures of 2 members into ' + \
           'unaligned memory.'

class Store3u(Operator):
    full_name = 'store into array of structures'
    signature = '_ store3u * v v v'
    load_store = True
    categories = [DocLoadStore]
    desc = 'Store 3 SIMD vectors as array of structures of 3 members into ' + \
           'unaligned memory.'

class Store4u(Operator):
    full_name = 'store into array of structures'
    signature = '_ store4u * v v v v'
    load_store = True
    categories = [DocLoadStore]
    desc = 'Store 4 SIMD vectors as array of structures of 4 members into ' + \
           'unaligned memory.'

class Storea(Operator):
    signature = '_ storea * v'
    load_store = True
    categories = [DocLoadStore]
    desc = 'Store SIMD vector into aligned memory.'

class MaskStorea1(Operator):
    signature = '_ mask_storea1 l * v'
    load_store = True
    categories = [DocLoadStore]
    desc = 'Store active SIMD vector elements into aligned memory.'

class Store2a(Operator):
    full_name = 'store into array of structures'
    signature = '_ store2a * v v'
    load_store = True
    categories = [DocLoadStore]
    desc = 'Store 2 SIMD vectors as array of structures of 2 members into ' + \
           'aligned memory.'

class Store3a(Operator):
    full_name = 'store into array of structures'
    signature = '_ store3a * v v v'
    load_store = True
    categories = [DocLoadStore]
    desc = 'Store 3 SIMD vectors as array of structures of 3 members into ' + \
           'aligned memory.'

class Store4a(Operator):
    full_name = 'store into array of structures'
    signature = '_ store4a * v v v v'
    load_store = True
    categories = [DocLoadStore]
    desc = 'Store 4 SIMD vectors as array of structures of 4 members into ' + \
           'aligned memory.'
# Gather operators: offset-based (SIMD vector of offsets) and linear
# (scalar step). Masked variants are kept commented out, pending support.

class Gather(Operator):
    full_name = 'gather elements from memory into a SIMD vector'
    signature = 'v gather c* vi'
    load_store = True
    categories = [DocLoadStore]
    types = common.ftypes + ['i16', 'u16', 'u32', 'i32', 'i64', 'u64']
    desc = 'Gather elements from memory with base address given as first ' \
           'argument and offsets given as second argument.'

class GatherLinear(Operator):
    full_name = 'gather elements from memory into a SIMD vector'
    signature = 'v gather_linear c* p'
    load_store = True
    categories = [DocLoadStore]
    types = common.types
    # Fixed: desc was garbled ('This operator using a SIMD register.');
    # wording now mirrors ScatterLinear: the step is a scalar, so no SIMD
    # register is needed for the offsets.
    desc = 'Gather elements from memory with base address given as first ' \
           'argument and steps given as second argument. This operator ' \
           'avoids using a SIMD register.'

#class MaskzGather(Operator):
#    full_name = 'gather active elements from SIMD vector to memory and put ' \
#                'zeros in inactive elements.'
#    signature = 'v maskz_gather l * vi'
#    load_store = True
#    categories = [DocLoadStore]
#    types = common.ftypes + ['i16', 'u16', 'u32', 'i32', 'i64', 'u64']
#    desc = 'Gather elements from memory with base address given as second ' \
#           'argument and offsets given as third argument. Inactive elements ' \
#           '(first argument) are set to zero.'

#class MaskoGather(Operator):
#    full_name = 'gather active elements from SIMD vector to memory and put ' \
#                'zeros in inactive elements.'
#    signature = 'v masko_gather l * vi v'
#    load_store = True
#    categories = [DocLoadStore]
#    types = common.ftypes + ['i16', 'u16', 'u32', 'i32', 'i64', 'u64']
#    desc = 'Gather elements from memory with base address given as second ' \
#           'argument and offsets given as third argument. Inactive elements ' \
#           '(first argument) are set to corresponding elements from fourth ' \
#           'argument.'
# Scatter operators (offset-based and linear-step) and stores of vectors
# of logicals. The commented-out masked scatter is kept for reference.

class Scatter(Operator):
    full_name = 'scatter elements from SIMD vector to memory'
    signature = '_ scatter * vi v'
    load_store = True
    categories = [DocLoadStore]
    types = common.ftypes + ['i16', 'u16', 'u32', 'i32', 'i64', 'u64']
    desc = 'Scatter elements from third argument to memory with base ' \
           'address given as first argument and offsets given as second ' \
           'argument.'

class ScatterLinear(Operator):
    full_name = 'scatter elements from SIMD vector to memory'
    signature = '_ scatter_linear * p v'
    load_store = True
    categories = [DocLoadStore]
    types = common.types
    desc = 'Scatter elements from third argument to memory with base ' \
           'address given as first argument and steps given as second ' \
           'argument. This operator avoids using a SIMD register.'

#class MaskScatter(Operator):
#    full_name = 'scatter active elements from SIMD vector to memory'
#    signature = '_ mask_scatter l * vi v'
#    load_store = True
#    categories = [DocLoadStore]
#    types = common.ftypes + ['i16', 'u16', 'u32', 'i32', 'i64', 'u64']
#    desc = 'Scatter active (first argument) elements from fourth argument ' \
#           'to memory with base address given as second argument and ' \
#           'offsets given as third argument.'

class Storelu(Operator):
    full_name = 'store vector of logicals'
    signature = '_ storelu * l'
    load_store = True
    categories = [DocLoadStore]
    desc = 'Store SIMD vector of booleans into unaligned memory. True is ' + \
           'stored as 1 and False as 0.'

class Storela(Operator):
    full_name = 'store vector of logicals'
    signature = '_ storela * l'
    load_store = True
    categories = [DocLoadStore]
    desc = 'Store SIMD vector of booleans into aligned memory. True is ' + \
           'stored as 1 and False as 0.'
# Bitwise operators, logical (boolean-vector) operators and basic
# arithmetic. Classes carrying `cxx_operator` also get a C++ operator
# overload emitted by the generators.

class Orb(Operator):
    full_name = 'bitwise or'
    signature = 'v orb v v'
    cxx_operator = '|'
    categories = [DocBitsOperators]

class Andb(Operator):
    full_name = 'bitwise and'
    signature = 'v andb v v'
    cxx_operator = '&'
    categories = [DocBitsOperators]

class Andnotb(Operator):
    full_name = 'bitwise andnot'
    signature = 'v andnotb v v'
    categories = [DocBitsOperators]
    desc = 'Returns the bitwise andnot of its arguments, more precisely ' \
           '"arg1 and (not arg2)"'

class Notb(Operator):
    full_name = 'bitwise not'
    signature = 'v notb v'
    cxx_operator = '~'
    categories = [DocBitsOperators]

class Xorb(Operator):
    full_name = 'bitwise xor'
    signature = 'v xorb v v'
    cxx_operator = '^'
    categories = [DocBitsOperators]

class Orl(Operator):
    full_name = 'logical or'
    signature = 'l orl l l'
    cxx_operator = '||'
    categories = [DocLogicalOperators]

class Andl(Operator):
    full_name = 'logical and'
    signature = 'l andl l l'
    cxx_operator = '&&'
    categories = [DocLogicalOperators]

class Andnotl(Operator):
    full_name = 'logical andnot'
    signature = 'l andnotl l l'
    categories = [DocLogicalOperators]
    desc = 'Returns the logical andnot of its arguments, more precisely ' \
           '"arg1 and (not arg2)"'

class Xorl(Operator):
    full_name = 'logical xor'
    signature = 'l xorl l l'
    categories = [DocLogicalOperators]

class Notl(Operator):
    full_name = 'logical not'
    signature = 'l notl l'
    cxx_operator = '!'
    categories = [DocLogicalOperators]
    bench_auto_against_std = True

class Add(Operator):
    full_name = 'addition'
    signature = 'v add v v'
    cxx_operator = '+'
    categories = [DocBasicArithmetic]
    bench_auto_against_std = True
    bench_auto_against_mipp = True

class Sub(Operator):
    full_name = 'subtraction'
    signature = 'v sub v v'
    cxx_operator = '-'
    categories = [DocBasicArithmetic]
    bench_auto_against_std = True
    bench_auto_against_mipp = True

class Addv(Operator):
    full_name = 'horizontal sum'
    signature = 's addv v'
    categories = [DocMisc]
    desc = 'Returns the sum of all the elements contained in v'
    do_bench = False
    types = common.ftypes

class Mul(Operator):
    full_name = 'multiplication'
    signature = 'v mul v v'
    cxx_operator = '*'
    categories = [DocBasicArithmetic]

class Div(Operator):
    full_name = 'division'
    signature = 'v div v v'
    cxx_operator = '/'
    # Divisor domain excludes a neighborhood of zero.
    domain = [[-20, 20], [0.5, 20]]
    categories = [DocBasicArithmetic]

class Neg(Operator):
    full_name = 'opposite'
    signature = 'v neg v'
    cxx_operator = '-'
    categories = [DocBasicArithmetic]

class Min(Operator):
    full_name = 'minimum'
    signature = 'v min v v'
    categories = [DocBasicArithmetic]

class Max(Operator):
    full_name = 'maximum'
    signature = 'v max v v'
    categories = [DocBasicArithmetic]

class Shr(Operator):
    full_name = 'right shift in zeros'
    signature = 'v shr v p'
    types = common.iutypes
    cxx_operator = '>>'
    categories = [DocBitsOperators]

class Shl(Operator):
    full_name = 'left shift'
    signature = 'v shl v p'
    types = common.iutypes
    cxx_operator = '<<'
    categories = [DocBitsOperators]

class Shra(Operator):
    full_name = 'arithmetic right shift'
    signature = 'v shra v p'
    types = common.iutypes
    categories = [DocBitsOperators]
    desc = 'Performs a right shift operation with sign extension.'
# Comparisons, blend, fused multiply ops, roundings, reductions over
# logicals and bit-preserving reinterpretation.

class Eq(Operator):
    full_name = 'compare for equality'
    signature = 'l eq v v'
    cxx_operator = '=='
    categories = [DocComparison]

class Ne(Operator):
    full_name = 'compare for inequality'
    signature = 'l ne v v'
    cxx_operator = '!='
    categories = [DocComparison]
    desc = 'Compare the inputs for inequality.'

class Gt(Operator):
    full_name = 'compare for greater-than'
    signature = 'l gt v v'
    cxx_operator = '>'
    categories = [DocComparison]
    desc = 'Compare the inputs for greater-than.'

class Ge(Operator):
    full_name = 'compare for greater-or-equal-than'
    signature = 'l ge v v'
    cxx_operator = '>='
    categories = [DocComparison]
    desc = 'Compare the inputs for greater-or-equal-than.'

class Lt(Operator):
    full_name = 'compare for lesser-than'
    signature = 'l lt v v'
    cxx_operator = '<'
    categories = [DocComparison]
    desc = 'Compare the inputs for lesser-than.'

class Le(Operator):
    full_name = 'compare for lesser-or-equal-than'
    signature = 'l le v v'
    cxx_operator = '<='
    categories = [DocComparison]
    desc = 'Compare the inputs for lesser-or-equal-than.'

class If_else1(Operator):
    full_name = 'blend'
    signature = 'v if_else1 l v v'
    categories = [DocMisc]
    # Fixed: the else-branch of the description wrongly said 'elements of
    # the second input are taken' for both cases; the false case takes the
    # THIRD input.
    desc = 'Blend the inputs using the vector of logicals as the first ' + \
           'argument. Elements of the second input are taken when the ' + \
           'corresponding elements from the vector of logicals are true, ' + \
           'otherwise elements of the third input are taken.'

class Abs(Operator):
    full_name = 'absolute value'
    signature = 'v abs v'
    categories = [DocBasicArithmetic]

class Fma(Operator):
    full_name = 'fused multiply-add'
    signature = 'v fma v v v'
    categories = [DocBasicArithmetic]
    desc = 'Multiply the first and second inputs and then adds the third ' + \
           'input.'
    # Float FMA results differ from mul+add reference, so test only ints.
    tests_on_integers_only = True

class Fnma(Operator):
    full_name = 'fused negate-multiply-add'
    signature = 'v fnma v v v'
    categories = [DocBasicArithmetic]
    desc = 'Multiply the first and second inputs, negate the intermediate ' + \
           'result and then adds the third input.'
    tests_on_integers_only = True

class Fms(Operator):
    full_name = 'fused multiply-substract'
    signature = 'v fms v v v'
    categories = [DocBasicArithmetic]
    # Fixed: previous wording ('Substracts the third input to
    # multiplication the first and second inputs.') was garbled.
    desc = 'Subtracts the third input from the product of the first and ' + \
           'second inputs.'
    tests_on_integers_only = True

class Fnms(Operator):
    full_name = 'fused negate-multiply-substract'
    signature = 'v fnms v v v'
    categories = [DocBasicArithmetic]
    desc = 'Multiply the first and second inputs, negate the intermediate ' + \
           'result and then substracts the third input to the ' + \
           'intermediate result.'
    tests_on_integers_only = True

class Ceil(Operator):
    full_name = 'rounding up to integer value'
    signature = 'v ceil v'
    categories = [DocRounding]

class Floor(Operator):
    full_name = 'rounding down to integer value'
    signature = 'v floor v'
    categories = [DocRounding]

class Trunc(Operator):
    full_name = 'rounding towards zero to integer value'
    signature = 'v trunc v'
    categories = [DocRounding]

class Round_to_even(Operator):
    full_name = 'rounding to nearest integer value, tie to even'
    signature = 'v round_to_even v'
    categories = [DocRounding]

class All(Operator):
    full_name = 'check all elements'
    signature = 'p all l'
    categories = [DocMisc]
    desc = 'Return true if and only if all elements of the inputs are true.'

class Any(Operator):
    full_name = 'check for one true elements'
    signature = 'p any l'
    categories = [DocMisc]
    desc = 'Return true if and only if at least one element of the inputs ' + \
           'is true.'

class Nbtrue(Operator):
    full_name = 'count true elements'
    signature = 'p nbtrue l'
    categories = [DocMisc]
    desc = 'Return the number of true elements in the input.'

class Reinterpret(Operator):
    full_name = 'reinterpret vector'
    signature = 'v reinterpret v'
    output_to = common.OUTPUT_TO_SAME_SIZE_TYPES
    categories = [DocConversion]
    desc = 'Reinterpret input vector into a different vector type ' + \
           'preserving all bits.'
# Conversion operators: logical reinterpretation, element-wise value
# conversion and widening/narrowing conversions.

class Reinterpretl(Operator):
    full_name = 'reinterpret vector of logicals'
    signature = 'l reinterpretl l'
    categories = [DocConversion]
    output_to = common.OUTPUT_TO_SAME_SIZE_TYPES
    has_scalar_impl = False
    desc = 'Reinterpret input vector of logicals into a different vector ' + \
           'type of logicals preserving all elements values. The output ' + \
           'type must have same length as input type.'

class Cvt(Operator):
    full_name = 'convert vector'
    signature = 'v cvt v'
    output_to = common.OUTPUT_TO_SAME_SIZE_TYPES
    categories = [DocConversion]
    desc = 'Convert input vector into a different vector type. The output ' + \
           'type must have same length as input type.'

class Upcvt(Operator):
    full_name = 'convert vector to larger type'
    signature = 'vx2 upcvt v'
    output_to = common.OUTPUT_TO_UP_TYPES
    # No 64-bit types here: there is no wider type to convert to.
    types = ['i8', 'u8', 'i16', 'u16', 'f16', 'i32', 'u32', 'f32']
    categories = [DocConversion]
    desc = 'Convert input vector into a different larger vector type. The ' + \
           'output type must be twice as large as the input type.'

class Downcvt(Operator):
    full_name = 'convert vector to narrow type'
    signature = 'v downcvt v v'
    output_to = common.OUTPUT_TO_DOWN_TYPES
    # No 8-bit types here: there is no narrower type to convert to.
    types = ['i16', 'u16', 'f16', 'i32', 'u32', 'f32', 'i64', 'u64', 'f64']
    categories = [DocConversion]
    # Fixed: 'twice as less as the input type' was broken English.
    desc = 'Convert input vector into a different narrow vector type. The ' + \
           'output type must be half as large as the input type.'
# Reciprocal, square root and (reciprocal) square-root approximations
# (ufp = units of precision in the fraction part, used by the tests),
# plus half-vector zip operators.

class Rec(Operator):
    full_name = 'reciprocal'
    signature = 'v rec v'
    types = common.ftypes
    # Domain excludes a neighborhood of zero.
    domain = [[-20, -0.5, 0.5, 20]]
    categories = [DocBasicArithmetic]

class Rec11(Operator):
    full_name = 'reciprocal with relative error at most $2^{-11}$'
    signature = 'v rec11 v'
    types = common.ftypes
    categories = [DocBasicArithmetic]
    domain = [[-20, -0.5, 0.5, 20]]
    ufp = { 'f16': 10, 'f32': 10, 'f64': 10 }

class Rec8(Operator):
    full_name = 'reciprocal with relative error at most $2^{-8}$'
    signature = 'v rec8 v'
    types = common.ftypes
    categories = [DocBasicArithmetic]
    domain = [[-20, -0.5, 0.5, 20]]
    ufp = { 'f16': 7, 'f32': 7, 'f64': 7 }

class Sqrt(Operator):
    full_name = 'square root'
    signature = 'v sqrt v'
    types = common.ftypes
    domain = [[0, 20]]
    categories = [DocBasicArithmetic]

class Rsqrt11(Operator):
    # Fixed: full_name said 'square root' although rsqrt computes the
    # RECIPROCAL square root (plain square root is the Sqrt operator).
    full_name = 'reciprocal square root with relative error at most $2^{-11}$'
    signature = 'v rsqrt11 v'
    types = common.ftypes
    domain = [[0.5, 20]]
    ufp = { 'f16': 10, 'f32': 10, 'f64': 10 }
    categories = [DocBasicArithmetic]

class Rsqrt8(Operator):
    # Fixed: same full_name correction as Rsqrt11.
    full_name = 'reciprocal square root with relative error at most $2^{-8}$'
    signature = 'v rsqrt8 v'
    types = common.ftypes
    domain = [[0.5, 20]]
    ufp = { 'f16': 7, 'f32': 7, 'f64': 7 }
    categories = [DocBasicArithmetic]

class Ziplo(Operator):
    full_name = 'zip low halves'
    signature = 'v ziplo v v'
    types = common.types
    categories = [DocShuffle]
    desc = 'Construct a vector where elements of the first low half input ' + \
           'are followed by the corresponding element of the second low ' + \
           'half input.'

class Ziphi(Operator):
    full_name = 'zip high halves'
    signature = 'v ziphi v v'
    types = common.types
    categories = [DocShuffle]
    desc = 'Construct a vector where elements of the first high half ' + \
           'input are followed by the corresponding element of the second ' + \
           'high half input.'
# Unzip/zip shuffles, mask construction helpers and loop-tail utilities.

class Unziplo(Operator):
    full_name = 'unziplo'
    signature = 'v unziplo v v'
    types = common.types
    categories = [DocShuffle]

class Unziphi(Operator):
    full_name = 'unziphi'
    signature = 'v unziphi v v'
    types = common.types
    categories = [DocShuffle]

class Zip(Operator):
    full_name = 'zip'
    signature = 'vx2 zip v v'
    types = common.types
    categories = [DocShuffle]

class Unzip(Operator):
    full_name = 'unzip'
    signature = 'vx2 unzip v v'
    types = common.types
    categories = [DocShuffle]

class ToMask(Operator):
    full_name = 'build mask from logicals'
    signature = 'v to_mask l'
    categories = [DocLogicalOperators]
    desc = 'Returns a mask consisting of all ones for true elements and ' + \
           'all zeros for false elements.'

class ToLogical(Operator):
    full_name = 'build logicals from data'
    signature = 'l to_logical v'
    categories = [DocLogicalOperators]
    desc = 'Returns a vector of logicals. Set true when the corresponding ' + \
           'elements are non zero (at least one bit to 1) and false ' + \
           'otherwise.'

class Iota(Operator):
    full_name = 'fill vector with increasing values'
    signature = 'v iota'
    categories = [DocMisc]
    # Fixed: typo 'Returns a vectors'.
    desc = 'Returns a vector whose first element is zero, the second is ' \
           'one and so on.'

class MaskForLoopTail(Operator):
    full_name = 'build mask for ending loops'
    signature = 'l mask_for_loop_tail p p'
    categories = [DocMisc]
    desc = 'Returns a mask for loading/storing data at loop tails by ' \
           'setting the first elements to True and the last to False. ' \
           'The first argument is index in a loop whose number of elements ' \
           'is given by the second argument.'
# Saturated arithmetic, then the Sleef-backed math operators (SrcOperator:
# implemented in compiled sources, not generated inline). The _u35/_u10
# suffix is the maximum error in ulps (3.5 / 1.0).

class Adds(Operator):
    full_name = 'addition using saturation'
    signature = 'v adds v v'
    categories = [DocBasicArithmetic]
    desc = 'Returns the saturated sum of the two vectors given as arguments'

class Subs(Operator):
    full_name = 'subtraction using saturation'
    signature = 'v subs v v'
    categories = [DocBasicArithmetic]
    desc = 'Returns the saturated subtraction of the two vectors given as ' \
           'arguments'

class Sin_u35(SrcOperator):
    full_name = 'sine'
    signature = 'v sin_u35 v'
    sleef_symbol_prefix = 'nsimd_sleef_sin_u35'
    categories = [DocTrigo]
    # Fixed: the Sleef URL had been lost ('visit .'); restored below (and
    # in all sibling Sleef operators).
    desc = 'Compute the sine of its argument with a precision of 3.5 ulps. ' \
           'For more informations visit https://sleef.org/.'

class Cos_u35(SrcOperator):
    full_name = 'cosine'
    signature = 'v cos_u35 v'
    sleef_symbol_prefix = 'nsimd_sleef_cos_u35'
    categories = [DocTrigo]
    desc = 'Compute the cosine of its argument with a precision of ' \
           '3.5 ulps. ' \
           'For more informations visit https://sleef.org/.'

class Tan_u35(SrcOperator):
    full_name = 'tangent'
    signature = 'v tan_u35 v'
    sleef_symbol_prefix = 'nsimd_sleef_tan_u35'
    # Avoid the poles at +/- pi/2 (and their images).
    domain = [[-4.7, -1.6, -1.5, 1.5, 1.6, 4.7]]
    categories = [DocTrigo]
    desc = 'Compute the tangent of its argument with a precision of ' \
           '3.5 ulps. ' \
           'For more informations visit https://sleef.org/.'

class Asin_u35(SrcOperator):
    full_name = 'arcsine'
    signature = 'v asin_u35 v'
    sleef_symbol_prefix = 'nsimd_sleef_asin_u35'
    domain = [[-0.9, 0.9]]
    categories = [DocTrigo]
    desc = 'Compute the arcsine of its argument with a precision of ' \
           '3.5 ulps. ' \
           'For more informations visit https://sleef.org/.'

class Acos_u35(SrcOperator):
    full_name = 'arccosine'
    signature = 'v acos_u35 v'
    sleef_symbol_prefix = 'nsimd_sleef_acos_u35'
    domain = [[-0.9, 0.9]]
    categories = [DocTrigo]
    desc = 'Compute the arccosine of its argument with a ' \
           'precision of 3.5 ulps. ' \
           'For more informations visit https://sleef.org/.'
# Sleef-backed operators, continued (3.5-ulp family end, 1.0-ulp start).
# The lost 'visit .' URL reference is restored throughout.

class Atan_u35(SrcOperator):
    full_name = 'arctangent'
    signature = 'v atan_u35 v'
    sleef_symbol_prefix = 'nsimd_sleef_atan_u35'
    categories = [DocTrigo]
    desc = 'Compute the arctangent of its argument with a ' \
           'precision of 3.5 ulps. ' \
           'For more informations visit https://sleef.org/.'

class Atan2_u35(SrcOperator):
    full_name = 'arctangent'
    signature = 'v atan2_u35 v v'
    sleef_symbol_prefix = 'nsimd_sleef_atan2_u35'
    domain = [[-20, 20], [-20, -0.5, 0.5, 20]]
    categories = [DocTrigo]
    desc = 'Compute the arctangent of its argument with a ' \
           'precision of 3.5 ulps. ' \
           'For more informations visit https://sleef.org/.'

class Log_u35(SrcOperator):
    full_name = 'natural logarithm'
    signature = 'v log_u35 v'
    sleef_symbol_prefix = 'nsimd_sleef_log_u35'
    domain = [[0.5, 20]]
    categories = [DocExpLog]
    desc = 'Compute the natural logarithm of its argument with a ' \
           'precision of 3.5 ulps. ' \
           'For more informations visit https://sleef.org/.'

class Cbrt_u35(SrcOperator):
    full_name = 'cube root'
    signature = 'v cbrt_u35 v'
    sleef_symbol_prefix = 'nsimd_sleef_cbrt_u35'
    categories = [DocBasicArithmetic]
    desc = 'Compute the cube root of its argument with a precision of ' \
           '3.5 ulps. ' \
           'For more informations visit https://sleef.org/.'

class Sin_u10(SrcOperator):
    full_name = 'sine'
    signature = 'v sin_u10 v'
    sleef_symbol_prefix = 'nsimd_sleef_sin_u10'
    categories = [DocTrigo]
    desc = 'Compute the sine of its argument with a precision of 1.0 ulps. ' \
           'For more informations visit https://sleef.org/.'

class Cos_u10(SrcOperator):
    full_name = 'cosine'
    signature = 'v cos_u10 v'
    sleef_symbol_prefix = 'nsimd_sleef_cos_u10'
    categories = [DocTrigo]
    desc = 'Compute the cosine of its argument with a precision of ' \
           '1.0 ulps. ' \
           'For more informations visit https://sleef.org/.'

class Tan_u10(SrcOperator):
    full_name = 'tangent'
    signature = 'v tan_u10 v'
    sleef_symbol_prefix = 'nsimd_sleef_tan_u10'
    domain = [[-4.7, -1.6, -1.5, 1.5, 1.6, 4.7]]
    categories = [DocTrigo]
    desc = 'Compute the tangent of its argument with a precision of ' \
           '1.0 ulps. ' \
           'For more informations visit https://sleef.org/.'
# Sleef-backed 1.0-ulp inverse trigonometry and logarithms. The lost
# 'visit .' URL reference is restored throughout.

class Asin_u10(SrcOperator):
    full_name = 'arcsine'
    signature = 'v asin_u10 v'
    sleef_symbol_prefix = 'nsimd_sleef_asin_u10'
    domain = [[-0.9, 0.9]]
    categories = [DocTrigo]
    desc = 'Compute the arcsine of its argument with a precision of ' \
           '1.0 ulps. ' \
           'For more informations visit https://sleef.org/.'

class Acos_u10(SrcOperator):
    full_name = 'arccosine'
    signature = 'v acos_u10 v'
    sleef_symbol_prefix = 'nsimd_sleef_acos_u10'
    domain = [[-0.9, 0.9]]
    categories = [DocTrigo]
    desc = 'Compute the arccosine of its argument with a precision of ' \
           '1.0 ulps. ' \
           'For more informations visit https://sleef.org/.'

class Atan_u10(SrcOperator):
    full_name = 'arctangent'
    signature = 'v atan_u10 v'
    sleef_symbol_prefix = 'nsimd_sleef_atan_u10'
    categories = [DocTrigo]
    desc = 'Compute the arctangent of its argument with a precision of ' \
           '1.0 ulps. ' \
           'For more informations visit https://sleef.org/.'

class Atan2_u10(SrcOperator):
    full_name = 'arctangent'
    signature = 'v atan2_u10 v v'
    sleef_symbol_prefix = 'nsimd_sleef_atan2_u10'
    domain = [[-20, 20], [-20, -0.5, 0.5, 20]]
    categories = [DocTrigo]
    desc = 'Compute the arctangent of its argument with a precision of ' \
           '1.0 ulps. ' \
           'For more informations visit https://sleef.org/.'

class Log_u10(SrcOperator):
    full_name = 'natural logarithm'
    signature = 'v log_u10 v'
    sleef_symbol_prefix = 'nsimd_sleef_log_u10'
    domain = [[0.5, 20]]
    categories = [DocExpLog]
    desc = 'Compute the natural logarithm of its argument with a ' \
           'precision of 1.0 ulps. ' \
           'For more informations visit https://sleef.org/.'

class Cbrt_u10(SrcOperator):
    full_name = 'cube root'
    signature = 'v cbrt_u10 v'
    sleef_symbol_prefix = 'nsimd_sleef_cbrt_u10'
    categories = [DocBasicArithmetic]
    desc = 'Compute the cube root of its argument with a precision of ' \
           '1.0 ulps. ' \
           'For more informations visit https://sleef.org/.'
# Sleef-backed exponentials, power and hyperbolic functions. The lost
# 'visit .' URL reference is restored throughout.

class Exp_u10(SrcOperator):
    full_name = 'base-e exponential'
    signature = 'v exp_u10 v'
    sleef_symbol_prefix = 'nsimd_sleef_exp_u10'
    domain = [[-20, 5]]
    categories = [DocExpLog]
    desc = 'Compute the base-e exponential of its argument with a ' \
           'precision of 1.0 ulps. ' \
           'For more informations visit https://sleef.org/.'

class Pow_u10(SrcOperator):
    full_name = 'power'
    signature = 'v pow_u10 v v'
    sleef_symbol_prefix = 'nsimd_sleef_pow_u10'
    domain = [[0, 5], [-5, 5]]
    categories = [DocExpLog]
    desc = 'Compute the power of its argument with a precision of 1.0 ulps. ' \
           'For more informations visit https://sleef.org/.'

class Sinh_u10(SrcOperator):
    full_name = 'hyperbolic sine'
    signature = 'v sinh_u10 v'
    sleef_symbol_prefix = 'nsimd_sleef_sinh_u10'
    categories = [DocHyper]
    desc = 'Compute the hyperbolic sine of its argument with a ' \
           'precision of 1.0 ulps. ' \
           'For more informations visit https://sleef.org/.'

class Cosh_u10(SrcOperator):
    full_name = 'hyperbolic cosine'
    signature = 'v cosh_u10 v'
    sleef_symbol_prefix = 'nsimd_sleef_cosh_u10'
    categories = [DocHyper]
    desc = 'Compute the hyperbolic cosine of its argument with a ' \
           'precision of 1.0 ulps. ' \
           'For more informations visit https://sleef.org/.'

class Tanh_u10(SrcOperator):
    full_name = 'hyperbolic tangent'
    signature = 'v tanh_u10 v'
    sleef_symbol_prefix = 'nsimd_sleef_tanh_u10'
    categories = [DocHyper]
    desc = 'Compute the hyperbolic tangent of its argument with a ' \
           'precision of 1.0 ulps. ' \
           'For more informations visit https://sleef.org/.'

class Sinh_u35(SrcOperator):
    full_name = 'hyperbolic sine'
    signature = 'v sinh_u35 v'
    sleef_symbol_prefix = 'nsimd_sleef_sinh_u35'
    categories = [DocHyper]
    desc = 'Compute the hyperbolic sine of its argument with a ' \
           'precision of 3.5 ulps. ' \
           'For more informations visit https://sleef.org/.'

class Cosh_u35(SrcOperator):
    full_name = 'hyperbolic cosine'
    signature = 'v cosh_u35 v'
    sleef_symbol_prefix = 'nsimd_sleef_cosh_u35'
    categories = [DocHyper]
    desc = 'Compute the hyperbolic cosine of its argument with a ' \
           'precision of 3.5 ulps. ' \
           'For more informations visit https://sleef.org/.'
# Sleef-backed inverse hyperbolic functions and base-2 exponentials. The
# lost 'visit .' URL reference is restored throughout.

class Tanh_u35(SrcOperator):
    full_name = 'hyperbolic tangent'
    signature = 'v tanh_u35 v'
    sleef_symbol_prefix = 'nsimd_sleef_tanh_u35'
    categories = [DocHyper]
    desc = 'Compute the hyperbolic tangent of its argument with a ' \
           'precision of 3.5 ulps. ' \
           'For more informations visit https://sleef.org/.'

class Asinh_u10(SrcOperator):
    full_name = 'inverse hyperbolic sine'
    signature = 'v asinh_u10 v'
    sleef_symbol_prefix = 'nsimd_sleef_asinh_u10'
    categories = [DocHyper]
    desc = 'Compute the inverse hyperbolic sine of its argument with a ' \
           'precision of 1.0 ulps. ' \
           'For more informations visit https://sleef.org/.'

class Acosh_u10(SrcOperator):
    full_name = 'inverse hyperbolic cosine'
    signature = 'v acosh_u10 v'
    sleef_symbol_prefix = 'nsimd_sleef_acosh_u10'
    categories = [DocHyper]
    # acosh is only defined for arguments >= 1.
    domain = [[1, 20]]
    desc = 'Compute the inverse hyperbolic cosine of its argument with a ' \
           'precision of 1.0 ulps. ' \
           'For more informations visit https://sleef.org/.'

class Atanh_u10(SrcOperator):
    full_name = 'inverse hyperbolic tangent'
    signature = 'v atanh_u10 v'
    sleef_symbol_prefix = 'nsimd_sleef_atanh_u10'
    # atanh diverges at +/- 1.
    domain = [[-0.9, 0.9]]
    categories = [DocHyper]
    desc = 'Compute the inverse hyperbolic tangent of its argument with a ' \
           'precision of 1.0 ulps. ' \
           'For more informations visit https://sleef.org/.'

class Exp2_u10(SrcOperator):
    full_name = 'base-2 exponential'
    signature = 'v exp2_u10 v'
    sleef_symbol_prefix = 'nsimd_sleef_exp2_u10'
    domain = [[-20, 5]]
    categories = [DocExpLog]
    desc = 'Compute the base-2 exponential of its argument with a ' \
           'precision of 1.0 ulps. ' \
           'For more informations visit https://sleef.org/.'

class Exp2_u35(SrcOperator):
    full_name = 'base-2 exponential'
    signature = 'v exp2_u35 v'
    sleef_symbol_prefix = 'nsimd_sleef_exp2_u35'
    domain = [[-20, 5]]
    categories = [DocExpLog]
    desc = 'Compute the base-2 exponential of its argument with a ' \
           'precision of 3.5 ulps. ' \
           'For more informations visit https://sleef.org/.'
# Sleef-backed base-10 exponentials, expm1 and base-10/base-2 logarithms.
# The lost 'visit .' URL reference is restored throughout.

class Exp10_u10(SrcOperator):
    full_name = 'base-10 exponential'
    signature = 'v exp10_u10 v'
    sleef_symbol_prefix = 'nsimd_sleef_exp10_u10'
    domain = [[-5, 3]]
    categories = [DocExpLog]
    desc = 'Compute the base-10 exponential of its argument with a ' \
           'precision of 1.0 ulps. ' \
           'For more informations visit https://sleef.org/.'

class Exp10_u35(SrcOperator):
    full_name = 'base-10 exponential'
    signature = 'v exp10_u35 v'
    sleef_symbol_prefix = 'nsimd_sleef_exp10_u35'
    domain = [[-5, 3]]
    categories = [DocExpLog]
    desc = 'Compute the base-10 exponential of its argument with a ' \
           'precision of 3.5 ulps. ' \
           'For more informations visit https://sleef.org/.'

class Expm1_u10(SrcOperator):
    full_name = 'exponential minus 1'
    signature = 'v expm1_u10 v'
    sleef_symbol_prefix = 'nsimd_sleef_expm1_u10'
    domain = [[-5, 3]]
    categories = [DocExpLog]
    desc = 'Compute the exponential minus 1 of its argument with a ' \
           'precision of 1.0 ulps. ' \
           'For more informations visit https://sleef.org/.'

class Log10_u10(SrcOperator):
    full_name = 'base-10 logarithm'
    signature = 'v log10_u10 v'
    sleef_symbol_prefix = 'nsimd_sleef_log10_u10'
    domain = [[0.5, 20]]
    categories = [DocExpLog]
    desc = 'Compute the base-10 logarithm of its argument with a precision ' \
           'of 1.0 ulps. ' \
           'For more informations visit https://sleef.org/.'

class Log2_u10(SrcOperator):
    full_name = 'base-2 logarithm'
    signature = 'v log2_u10 v'
    sleef_symbol_prefix = 'nsimd_sleef_log2_u10'
    domain = [[0.5, 20]]
    categories = [DocExpLog]
    desc = 'Compute the base-2 logarithm of its argument with a precision ' \
           'of 1.0 ulps. ' \
           'For more informations visit https://sleef.org/.'

class Log2_u35(SrcOperator):
    full_name = 'base-2 logarithm'
    signature = 'v log2_u35 v'
    sleef_symbol_prefix = 'nsimd_sleef_log2_u35'
    domain = [[0.5, 20]]
    categories = [DocExpLog]
    desc = 'Compute the base-2 logarithm of its argument with a ' \
           'precision of 3.5 ulps. ' \
           'For more informations visit https://sleef.org/.'
# SLEEF wrappers: log1p, pi-scaled trigonometry, hypot and remainder.
# Declarative metadata only; test domains avoid the functions' singular or
# undefined regions.

class Log1p_u10(SrcOperator):
    signature = 'v log1p_u10 v'
    full_name = 'logarithm of 1 plus argument'
    sleef_symbol_prefix = 'nsimd_sleef_log1p_u10'
    # log1p(x) requires x > -1.
    domain = [[-0.5, 19]]
    categories = [DocExpLog]
    desc = ('Compute the logarithm of 1 plus argument of its argument with '
            'a precision of 1.0 ulps. For more informations visit .')

class Sinpi_u05(SrcOperator):
    signature = 'v sinpi_u05 v'
    full_name = 'sine of pi times argument'
    sleef_symbol_prefix = 'nsimd_sleef_sinpi_u05'
    categories = [DocTrigo]
    desc = ('Compute the sine of pi times argument of its argument with a '
            'precision of 0.5 ulps. For more informations visit .')

class Cospi_u05(SrcOperator):
    signature = 'v cospi_u05 v'
    full_name = 'cosine of pi times argument'
    sleef_symbol_prefix = 'nsimd_sleef_cospi_u05'
    categories = [DocTrigo]
    desc = ('Compute the cosine of pi times argument of its argument with '
            'a precision of 0.5 ulps. For more informations visit .')

class Hypot_u05(SrcOperator):
    # Binary operator: signature takes two vector arguments.
    signature = 'v hypot_u05 v v'
    full_name = 'Euclidean distance'
    sleef_symbol_prefix = 'nsimd_sleef_hypot_u05'
    categories = [DocBasicArithmetic]
    desc = ('Compute the Euclidean distance of its argument with a '
            'precision of 0.5 ulps. For more informations visit .')

class Hypot_u35(SrcOperator):
    signature = 'v hypot_u35 v v'
    full_name = 'Euclidean distance'
    sleef_symbol_prefix = 'nsimd_sleef_hypot_u35'
    categories = [DocBasicArithmetic]
    desc = ('Compute the Euclidean distance of its argument with a '
            'precision of 3.5 ulps. For more informations visit .')

class Remainder(SrcOperator):
    signature = 'v remainder v v'
    full_name = 'floating-point remainder'
    sleef_symbol_prefix = 'nsimd_sleef_remainder'
    # Keep both operands away from zero to avoid the undefined y == 0 case.
    domain = [[1, 20], [1, 20]]
    categories = [DocBasicArithmetic]
    desc = ('Compute the floating-point remainder of its arguments. '
            'For more informations visit .')
# SLEEF wrappers: fmod, gamma and error functions. Declarative metadata read
# by the code generator (signature, SLEEF symbol, doc category, description).

class Fmod(SrcOperator):
    full_name = 'floating-point remainder'
    signature = 'v fmod v v'
    sleef_symbol_prefix = 'nsimd_sleef_fmod'
    # Keep both operands away from zero to avoid the undefined y == 0 case.
    domain = [[1, 20], [1, 20]]
    categories = [DocBasicArithmetic]
    # fmod is binary: "arguments" (plural) for consistency with Remainder.
    desc = 'Compute the floating-point remainder of its arguments. ' \
           'For more informations visit .'

class Lgamma_u10(SrcOperator):
    full_name = 'log gamma'
    signature = 'v lgamma_u10 v'
    sleef_symbol_prefix = 'nsimd_sleef_lgamma_u10'
    # Gamma has poles at non-positive integers; stay on the positive axis.
    domain = [[0.5, 20]]
    categories = [DocExpLog]
    desc = 'Compute the log gamma of its argument with a precision of ' \
           '1.0 ulps. ' \
           'For more informations visit .'

class Tgamma_u10(SrcOperator):
    full_name = 'true gamma'
    signature = 'v tgamma_u10 v'
    sleef_symbol_prefix = 'nsimd_sleef_tgamma_u10'
    # Tighter upper bound than lgamma: tgamma overflows quickly.
    domain = [[0.5, 5]]
    categories = [DocExpLog]
    desc = 'Compute the true gamma of its argument with a precision of ' \
           '1.0 ulps. ' \
           'For more informations visit .'

class Erf_u10(SrcOperator):
    # Was mislabeled 'complementary error' (copy-paste from Erfc_u15 below):
    # SLEEF's erf_u10 computes the error function erf(x), not erfc(x).
    full_name = 'error function'
    signature = 'v erf_u10 v'
    sleef_symbol_prefix = 'nsimd_sleef_erf_u10'
    categories = [DocExpLog]
    desc = 'Compute the error function of its argument with a ' \
           'precision of 1.0 ulps. ' \
           'For more informations visit .'

class Erfc_u15(SrcOperator):
    full_name = 'complementary error'
    signature = 'v erfc_u15 v'
    sleef_symbol_prefix = 'nsimd_sleef_erfc_u15'
    categories = [DocExpLog]
    desc = 'Compute the complementary error of its argument with a ' \
           'precision of 1.5 ulps. ' \
           'For more informations visit .'
================================================ FILE: egg/platform_arm.py ================================================ # Copyright (c) 2020 Agenium Scale # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal # in the Software without restriction, including without limitation the rights # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell # copies of the Software, and to permit persons to whom the Software is # furnished to do so, subject to the following conditions: # # The above copyright notice and this permission notice shall be included in all # copies or substantial portions of the Software. # # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. # This file gives the implementation of platform ARM, i.e. ARM SIMD. # Reading this file is rather straightforward. ARM SIMD extensions are rather # coherent and consistent. It implements the following architectures: # - ARMv7 -> 128 bits registers without f16 and f64 support # - Aarch32 -> 128 bits registers with optional f16 and without f64 support # - Aarch64 -> 128 bits registers with optional f16 and f64 support # - SVE -> up to 2048 bits registers # The first three SIMD extensions are collectively called NEON. Aarch32 and # Aarch64 correspond respectively to ARMv8 32 and 64 bits chips. 
Note that # the ARM documentation says that ARMv7, Aarch32 are different but it seems # that they differ by only a handful of intrinsics which are not in the scope # of NSIMD so we have implemented the following: # # - ARMv7 \ -> neon128 # - Aarch32 / # - Aarch64 -> aarch64 # - SVE -> sve import common # ----------------------------------------------------------------------------- # Helpers def neon_typ(typ): prefix = {'i': 'int', 'u': 'uint', 'f': 'float'} return '{}{}x{}_t'.format(prefix[typ[0]], typ[1:], 128 // int(typ[1:])) def half_neon64_typ(typ): prefix = {'i': 'int', 'u': 'uint', 'f': 'float'} return '{}{}x{}_t'.format(prefix[typ[0]], typ[1:], 64 // int(typ[1:])) def sve_typ(typ): prefix = {'i': 'svint', 'u': 'svuint', 'f': 'svfloat'} return '{}{}_t'.format(prefix[typ[0]], typ[1:]) def suf(typ): if typ[0] == 'i': return 's{}'.format(typ[1:]) else: return typ neon = ['neon128', 'aarch64'] fixed_sized_sve = ['sve128', 'sve256', 'sve512', 'sve1024', 'sve2048'] sve = ['sve'] + fixed_sized_sve fmtspec = {} def convert_from_predicate(opts, op): if opts.sve_emulate_bool: return '''svsel({op}, svdup_n_u{typnbits}_x({svtrue}, (u{typnbits})~0), svdup_n_u{typnbits}_x({svtrue}, 0))'''. 
\ format(op=op, **fmtspec) else: return op def convert_to_predicate(opts, op): if opts.sve_emulate_bool: # TODO: the casts are a workaround to avoid a bug in gcc trunk for sve # it needs to be deleted when the bug is corrected return '''svcmpeq({svtrue}, (svuint{typnbits}_t){op}, svdup_n_u{typnbits}_x({svtrue}, (u{typnbits})~0))'''.format(op=op, **fmtspec) else: return op # ----------------------------------------------------------------------------- # Implementation of mandatory functions for this module def get_simd_exts(): return ['neon128', 'aarch64', 'sve', 'sve128', 'sve256', 'sve512', 'sve1024', 'sve2048'] def get_prev_simd_ext(simd_ext): if simd_ext in ['neon128', 'aarch64']: return 'cpu' elif simd_ext in sve: return 'aarch64' raise ValueError('Unknown SIMD extension "{}"'.format(simd_ext)) def emulate_fp16(simd_ext): if not simd_ext in get_simd_exts(): raise ValueError('Unknown SIMD extension "{}"'.format(simd_ext)) if simd_ext in sve: return False else: return True def get_type(opts, simd_ext, typ, nsimd_typ): if simd_ext in neon: if typ == 'f64': if simd_ext == 'neon128': return 'typedef struct {{ double v0; double v1; }} {};'. \ format(nsimd_typ) else: return 'typedef {} {};'.format(neon_typ('f64'), nsimd_typ) elif typ == 'f16': return ''' #ifdef NSIMD_ARM_FP16 typedef float16x8_t {nsimd_typ}; #else typedef struct {{ float32x4_t v0; float32x4_t v1; }} {nsimd_typ}; #endif '''.format(nsimd_typ=nsimd_typ) # extra \n are necessary else: return 'typedef {} {};'.format(neon_typ(typ), nsimd_typ) elif simd_ext == 'sve': return 'typedef {} {};'.format(sve_typ(typ), nsimd_typ) elif simd_ext in fixed_sized_sve: return 'typedef {} {} __attribute__((arm_sve_vector_bits({})));'. 
\ format(sve_typ(typ), nsimd_typ, simd_ext[3:]) else: raise ValueError('Unknown SIMD extension "{}"'.format(simd_ext)) def get_logical_type(opts, simd_ext, typ, nsimd_typ): if typ not in common.types: raise ValueError('Unknown type "{}"'.format(typ)) if simd_ext not in get_simd_exts(): raise ValueError('Unknown SIMD extension "{}"'.format(simd_ext)) if typ in common.ftypes + common.itypes: typ2 = 'u{}'.format(typ[1:]); else: typ2 = typ if simd_ext == 'neon128': if typ == 'f16': return \ ''' #ifdef NSIMD_ARM_FP16 typedef uint16x8_t {nsimd_typ}; #else typedef struct {{ uint32x4_t v0; uint32x4_t v1; }} {nsimd_typ}; #endif '''.format(nsimd_typ=nsimd_typ) # extra \n are necessary elif typ == 'f64': return 'typedef struct {{ u64 v0; u64 v1; }} {};'.format(nsimd_typ) else: return get_type(opts, simd_ext, typ2, nsimd_typ) if simd_ext == 'aarch64': if typ == 'f16': return get_logical_type(opts, 'neon128', 'f16', nsimd_typ) else: return get_type(opts, simd_ext, typ2, nsimd_typ) elif simd_ext in sve: if opts.sve_emulate_bool: return get_type(opts, simd_ext, 'u' + typ[1:], nsimd_typ) elif simd_ext in fixed_sized_sve: return \ 'typedef svbool_t {} __attribute__((arm_sve_vector_bits({})));'. 
\ format(nsimd_typ, simd_ext[3:]) else: return 'typedef svbool_t {};'.format(nsimd_typ) def get_nb_registers(simd_ext): if simd_ext in neon: return '16' elif simd_ext in sve: return '32' else: raise ValueError('Unknown SIMD extension "{}"'.format(simd_ext)) def get_native_soa_typ(simd_ext, typ, deg): prefix = { 'i': 'int', 'u': 'uint', 'f': 'float' }[typ[0]] if simd_ext in sve: return 'sv{}x{}_t'.format(prefix + typ[1:], deg) else: return '{}{}x{}x{}_t'.format(prefix, typ[1:], 128 // int(typ[1:]), deg) def get_SoA_type(simd_ext, typ, deg, nsimd_typ): if simd_ext != 'sve': raise ValueError('SIMD extension must be "sve"') prefix = { 'i': 'int', 'u': 'uint', 'f': 'float' }[typ[0]] return 'typedef {} {};'.format(get_native_soa_typ(simd_ext, typ, deg), nsimd_typ) def has_compatible_SoA_types(simd_ext): if simd_ext not in neon + sve: raise ValueError('Unknown SIMD extension "{}"'.format(simd_ext)) return False # ----------------------------------------------------------------------------- def get_additional_include(func, platform, simd_ext): ret = '''#include '''.format(func) if simd_ext in sve: ret += '''#include '''.format(func) if func in ['load2u', 'load3u', 'load4u', 'load2a', 'load3a', 'load4a']: deg = func[4] ret += '''#if NSIMD_CXX > 0 extern "C" {{ #endif NSIMD_INLINE nsimd_{simd_ext}_vu16x{deg} nsimd_{func}_{simd_ext}_u16(const u16*); # if NSIMD_CXX > 0 }} // extern "C" #endif '''.format(func=func, deg=deg, simd_ext=simd_ext) if func in ['mask_storea1', 'mask_storeu1', 'masko_loada1', 'masko_loadu1', 'maskz_loada1', 'maskz_loadu1'] and \ simd_ext not in sve: ret += '''#include ''' if func == 'mask_for_loop_tail' and simd_ext not in sve: ret += '''#include #include #include #include '''.format(simd_ext=simd_ext) if simd_ext == 'neon128' and func == 'notl': ret += '''#include ''' if simd_ext in neon and func == 'ne': ret += '''#include # include '''.format(simd_ext=simd_ext) if simd_ext in neon and func in ['fms', 'fnms']: ret += '''#include #include #include 
'''.format(simd_ext=simd_ext) if func == 'shra': ret += '''#include '''.format(simd_ext=simd_ext) if func in ['loadlu', 'loadla']: ret += '''#include # include # include # include '''.format(load='load' + func[5], simd_ext=simd_ext) if func in ['storelu', 'storela']: ret += '''#include # include # include '''.format(store='store' + func[6], simd_ext=simd_ext) if func == 'to_logical': ret += '''#include #include ''' .format(simd_ext=simd_ext) if func == 'zip': ret += '''#include #include '''.format(simd_ext=simd_ext) if func == 'unzip': ret += '''#include #include '''.format(simd_ext=simd_ext) if func == 'adds': ret += '''#include '''.format(simd_ext=simd_ext) if func == 'subs': ret += '''#include '''.format(simd_ext=simd_ext) if func in ['gather', 'scatter'] and simd_ext == 'sve': ret += '''#include ''' return ret # ----------------------------------------------------------------------------- # Emulators def emulate_op1(op, simd_ext, typ): if simd_ext in neon: le = 128 // int(typ[1:]); return '''int i; {typ} buf[{le}]; vst1q_{suf}(buf, {in0}); for (i=0; i < {le}; i += nsimd_len_cpu_{typ}()) {{ nsimd_storeu_cpu_{typ}( & buf[i], nsimd_{op}_cpu_{typ}( nsimd_loadu_cpu_{typ}(&buf[i])));}} return vld1q_{suf}(buf); '''. \ format(op=op, le=le, **fmtspec) if simd_ext in sve: le = 2048 // int(typ[1:]); return '''int i; {typ} buf[{le}]; svst1_{suf}({svtrue}, buf, {in0}); for (i=0; i < simd_len_{simd_ext}_{typ}(); i += nsimd_len_cpu_{typ}()) {{ nsimd_storeu_cpu_{typ}( & buf[i], nsimd_{op}_cpu_{typ}( nsimd_loadu_cpu_{typ}(&buf[i])));}} return svld1_{suf}({svtrue}, buf); '''. \ format(op=op, le=le, **fmtspec) def emulate_op2(op, simd_ext, typ): if simd_ext in neon: le = 128 // int(typ[1:]); return '''int i; {typ} buf0[{le}], buf1[{le}]; vst1q_{suf}(buf0, {in0}); vst1q_{suf}(buf1, {in1}); for (i=0; i < {le}; i++) {{ buf0[i] = ({typ})(buf0[i] {op} buf1[i]);}} return vld1q_{suf}(buf0); '''. 
\ format(op=op, le=le, **fmtspec) if simd_ext in sve: le = 2048 // int(typ[1:]); return '''int i; {typ} buf0[{le}], buf1[{le}]; svst1_{suf}({svtrue}, buf0, {in0}); svst1_{suf}({svtrue}, buf1, {in1}); for (i=0; i < nsimd_len_{simd_ext}_{typ}(); i++) {{ buf0[i] = ({typ})(buf0[i] {op} buf1[i]);}} return svld1_{suf}({svtrue}, buf0); '''. \ format(op=op, le=le, **fmtspec) def emulate_lop2_neon(opts, op, simd_ext, typ): le = 128 // int(typ[1:]); ltyp = get_logical_type(opts, simd_ext, typ) lsuf = suf(ltyp) return '''int i; {ltyp} buf0[{le}], buf1[{le}]; vst1q_{lsuf}(buf0, {in0}); vst1q_{lsuf}(buf1, {in1}); for (i = 0; i < {le}; i++) {{ buf0[i] = buf0[i] {op} buf1[i] ? ({ltyp})-1 : 0; }} return vld1q_{lsuf}(buf0);'''. \ format(op=op, le=le, ltyp=ltyp, lsuf=lsuf, **fmtspec) def emulate_op3_neon(op, simd_ext, typ): le = 128 // int(typ[1:]); return '''int i; {typ} buf0[{le}], buf1[{le}], buf2[{le}]; vst1q_{suf}(buf0, {in0}); vst1q_{suf}(buf1, {in1}); vst1q_{suf}(buf2, {in2}); for (i = 0; i < {le}; i += nsimd_len_cpu_{typ}()) {{ nsimd_storeu_cpu_{typ}(&buf0[i], nsimd_{op}_cpu_{typ}( nsimd_loadu_cpu_{typ}(&buf0[i]), nsimd_loadu_cpu_{typ}(&buf1[i]), nsimd_loadu_cpu_{typ}(&buf2[i]))); }} return vld1q_{suf}(buf0);'''.format(op=op, le=le, **fmtspec) def emulate_f64_neon(simd_ext, op, params): fmtspec2 = fmtspec.copy() fmtspec2['op'] = op fmtspec2['buf_ret_decl'] = 'nsimd_cpu_{}f64 buf_ret;'. \ format('v' if params[0] == 'v' else 'vl') fmtspec2['buf_decl'] = '\n'.join(['nsimd_cpu_{}f64 buf{};'. \ format('v' if p[1] == 'v' else 'vl', p[0]) \ for p in common.enum(params[1:])]) fmtspec2['bufs'] = ','.join(['buf{}'.format(i) \ for i in range(0, len(params) - 1)]) fmtspec2['ret_decl'] = 'nsimd_{}_{}f64 ret;'. \ format(simd_ext, 'v' if params[0] == 'v' else 'vl') buf_set = '\n'.join('''buf{i}.v0 = {ini}.v0; buf{i}.v1 = {ini}.v1;'''. 
\ format(i=i, ini=fmtspec['in{}'.format(i)]) \ for i in range(0, len(params) - 1)) return '''{buf_ret_decl} {buf_decl} {ret_decl} {buf_set} buf_ret = nsimd_{op}_cpu_f64({bufs}); ret.v0 = buf_ret.v0; ret.v1 = buf_ret.v1; return ret;'''.format(buf_set=buf_set, **fmtspec2) # ----------------------------------------------------------------------------- def f16f64(simd_ext, typ, op, armop, arity, forced_intrinsics = ''): fmtspec2 = fmtspec.copy() tmpl = ', '.join(['{{in{}}}.v{{{{i}}}}'.format(i).format(**fmtspec) \ for i in range(0, arity)]) fmtspec2['args1'] = tmpl.format(i='0') fmtspec2['args2'] = tmpl.format(i='1') fmtspec2['armop'] = armop fmtspec2['op'] = op if simd_ext in neon and typ == 'f16': if forced_intrinsics != '': fmtspec2['intrinsics'] = forced_intrinsics else: temp = ', '.join(['{{in{}}}'.format(i).format(**fmtspec) \ for i in range(0, arity)]) fmtspec2['intrinsics'] = 'return v{}q_f16({});'.format(armop, temp) return '''#ifdef NSIMD_ARM_FP16 {intrinsics} #else nsimd_{simd_ext}_vf16 ret; ret.v0 = nsimd_{op}_{simd_ext}_f32({args1}); ret.v1 = nsimd_{op}_{simd_ext}_f32({args2}); return ret; #endif'''.format(**fmtspec2) elif simd_ext == 'neon128' and typ == 'f64': return emulate_f64_neon(simd_ext, op, ['v'] * (arity + 1)) return '' # ----------------------------------------------------------------------------- # Lenghts def max_len(simd_ext, typ): if simd_ext == 'sve': return 2048 // int(typ[1:]) elif simd_ext in fixed_sized_sve: return int(simd_ext[3:]) // int(typ[1:]) else: return 128 // int(typ[1:]) def real_len(simd_ext, typ): if simd_ext == 'sve': return 'nsimd_len_sve_{typ}()'.format(**fmtspec) else: return max_len(simd_ext, typ) # ----------------------------------------------------------------------------- # Loads of degree 1, 2, 3 and 4 def load1234(opts, simd_ext, typ, deg): if simd_ext in neon: if deg == 1: normal = 'return vld{deg}q_{suf}({in0});'. 
\ format(deg=deg, **fmtspec) if typ == 'f16': return \ '''#ifdef NSIMD_ARM_FP16 {normal} #else /* Note that we can do much better but is it useful? */ nsimd_{simd_ext}_vf16 ret; f32 buf[4]; buf[0] = nsimd_u16_to_f32(*(u16*){in0}); buf[1] = nsimd_u16_to_f32(*((u16*){in0} + 1)); buf[2] = nsimd_u16_to_f32(*((u16*){in0} + 2)); buf[3] = nsimd_u16_to_f32(*((u16*){in0} + 3)); ret.v0 = vld1q_f32(buf); buf[0] = nsimd_u16_to_f32(*((u16*){in0} + 4)); buf[1] = nsimd_u16_to_f32(*((u16*){in0} + 5)); buf[2] = nsimd_u16_to_f32(*((u16*){in0} + 6)); buf[3] = nsimd_u16_to_f32(*((u16*){in0} + 7)); ret.v1 = vld1q_f32(buf); return ret; #endif'''.format(normal=normal, **fmtspec) elif typ == 'f64' and simd_ext == 'neon128': return \ '''nsimd_neon128_vf64 ret; ret.v0 = *{in0}; ret.v1 = *({in0} + 1); return ret;'''.format(**fmtspec) else: return normal else: normal = \ '''nsimd_{simd_ext}_v{typ}x{deg} ret; {soa_typ} buf = vld{deg}q_{suf}({in0}); {assignment} return ret;'''. \ format(deg=deg, soa_typ=get_native_soa_typ(simd_ext, typ, deg), assignment='\n'.join(['ret.v{i} = buf.val[{i}];'. \ format(i=i) for i in range(0, deg)]), **fmtspec) if typ == 'f16': assignment = \ '''vst1q_u16(buf, temp.val[{{i}}]); ret.v{{i}} = nsimd_loadu_{simd_ext}_f16((f16 *)buf);'''. \ format(**fmtspec) return \ '''{soa_typ} temp = vld{deg}q_u16((u16 *){in0}); u16 buf[8]; nsimd_{simd_ext}_vf16x{deg} ret; {assignment} return ret;'''. \ format(deg=deg, assignment='\n'.join([assignment. \ format(i=i) for i in range(0, deg)]), soa_typ=get_native_soa_typ(simd_ext, 'u16', deg), **fmtspec) elif typ in 'f64' and simd_ext == 'neon128': return \ 'nsimd_neon128_vf64x{} ret;\n'.format(deg) + \ '\n'.join(['ret.v{i}.v0 = *({in0} + {i});'. \ format(i=i, **fmtspec) for i in range(0, deg)]) + \ '\n'.join(['ret.v{i}.v1 = *({in0} + {ipd});'. 
\ format(i=i, ipd=i + deg, **fmtspec) \ for i in range(0, deg)]) + \ '\nreturn ret;\n' elif typ in ['i64', 'u64'] and simd_ext == 'neon128': return \ '''nsimd_neon128_v{typ}x{deg} ret; {typ} buf[2];'''.format(deg=deg, **fmtspec) + \ '\n'.join(['''buf[0] = *({in0} + {i}); buf[1] = *({in0} + {ipd}); ret.v{i} = vld1q_{suf}(buf);'''. \ format(i=i, ipd=i + deg, **fmtspec) \ for i in range(0, deg)]) + \ '\nreturn ret;\n' else: return normal else: if deg == 1: return 'return svld{deg}_{suf}({svtrue}, {in0});'. \ format(deg=deg, **fmtspec) else: return \ '''nsimd_{simd_ext}_v{typ}x{deg} ret; {sve_typ} buf = svld{deg}_{suf}({svtrue}, {in0}); {assignment} return ret;'''.format(assignment=\ '\n'.join(['ret.v{i} = svget{deg}_{suf}(buf, {i});'. \ format(i=i, deg=deg, **fmtspec) \ for i in range(deg)]), sve_typ=get_native_soa_typ('sve', typ, deg), deg=deg, **fmtspec) # ----------------------------------------------------------------------------- # Mask loads def maskoz_load(oz, simd_ext, typ): if simd_ext in sve: return 'return svsel_{suf}({in0}, svld1_{suf}({in0}, {in1}), {oz});'. \ format(oz='{in2}'.format(**fmtspec) if oz == 'o' \ else 'svdup_n_{suf}(({typ})0)'.format(**fmtspec), **fmtspec) if typ == 'f64' and simd_ext == 'neon128': return '''nsimd_neon128_vf64 ret; if ({in0}.v0) {{ ret.v0 = {in1}[0]; }} else {{ ret.v0 = {oz0}; }} if ({in0}.v1) {{ ret.v1 = {in1}[1]; }} else {{ ret.v1 = {oz1}; }} return ret;'''.format( oz0 = '0.0f' if oz == 'z' else '{in2}.v0'.format(**fmtspec), oz1 = '0.0f' if oz == 'z' else '{in2}.v1'.format(**fmtspec), **fmtspec) le = 128 // int(typ[1:]) normal = '''int i; {typ} buf[{le}]; u{typnbits} mask[{le}]; vst1q_{suf}(buf, {oz}); vst1q_u{typnbits}(mask, {in0}); for (i = 0; i < {le}; i++) {{ if (mask[i]) {{ buf[i] = {in1}[i]; }} }} return vld1q_{suf}(buf);'''. 
\ format(oz='vdupq_n_{suf}(({typ})0)'.format(**fmtspec) \ if oz == 'z' else '{in2}'.format(**fmtspec), le=le, **fmtspec) if typ == 'f16': return '''#ifdef NSIMD_ARM_FP16 {normal} #else int i; nsimd_{simd_ext}_vf16 ret; f32 buf[8]; u32 mask[8]; vst1q_f32(buf, {oz0}); vst1q_f32(buf + 4, {oz1}); vst1q_u32(mask, {in0}.v0); vst1q_u32(mask + 4, {in0}.v1); for (i = 0; i < 8; i++) {{ if (mask[i]) {{ buf[i] = nsimd_f16_to_f32({in1}[i]); }} }} ret.v0 = vld1q_f32(buf); ret.v1 = vld1q_f32(buf + 4); return ret; #endif'''. \ format(oz0='vdupq_n_f32(0.0f)'.format(**fmtspec) \ if oz == 'z' else '{in2}.v0'.format(**fmtspec), oz1='vdupq_n_f32(0.0f)'.format(**fmtspec) \ if oz == 'z' else '{in2}.v1'.format(**fmtspec), normal=normal, **fmtspec) return normal # ----------------------------------------------------------------------------- # Stores of degree 1, 2, 3 and 4 def store1234(opts, simd_ext, typ, deg): if simd_ext in neon: if deg == 1: normal = 'vst{deg}q_{suf}({in0}, {in1});'. \ format(deg=deg, **fmtspec) if typ == 'f16': return \ '''#ifdef NSIMD_ARM_FP16 {normal} #else f32 buf[4]; vst1q_f32(buf, {in1}.v0); *((u16*){in0} ) = nsimd_f32_to_u16(buf[0]); *((u16*){in0} + 1) = nsimd_f32_to_u16(buf[1]); *((u16*){in0} + 2) = nsimd_f32_to_u16(buf[2]); *((u16*){in0} + 3) = nsimd_f32_to_u16(buf[3]); vst1q_f32(buf, {in1}.v1); *((u16*){in0} + 4) = nsimd_f32_to_u16(buf[0]); *((u16*){in0} + 5) = nsimd_f32_to_u16(buf[1]); *((u16*){in0} + 6) = nsimd_f32_to_u16(buf[2]); *((u16*){in0} + 7) = nsimd_f32_to_u16(buf[3]); #endif'''.format(normal=normal, **fmtspec) elif typ == 'f64' and simd_ext == 'neon128': return \ '''*{in0} = {in1}.v0; *({in0} + 1) = {in1}.v1;'''.format(**fmtspec) else: return normal else: normal = \ '''{soa_typ} buf; {assignment} vst{deg}q_{suf}({in0}, buf);'''. \ format(deg=deg, assignment='\n'.join([ 'buf.val[{{}}] = {{in{}}};'.format(i). 
\ format(i - 1, **fmtspec) for i in range(1, deg + 1)]), soa_typ=get_native_soa_typ(simd_ext, typ, deg), **fmtspec) if typ == 'f16': assignment = \ '''nsimd_storeu_{{simd_ext}}_f16((f16 *)buf, {{in{}}}); temp.val[{{}}] = vld1q_u16(buf);''' return \ '''#ifdef NSIMD_ARM_FP16 {normal} #else {soa_typ} temp; u16 buf[8]; {assignment} vst{deg}q_u16((u16 *){in0}, temp); #endif'''. \ format(assignment='\n'.join([assignment.format(i). \ format(i - 1, **fmtspec) \ for i in range(1, deg + 1)]), deg=deg, normal=normal, soa_typ=get_native_soa_typ(simd_ext, 'u16', deg), **fmtspec) elif typ == 'f64' and simd_ext == 'neon128': return \ '\n'.join(['*({{in0}} + {}) = {{in{}}}.v0;'. \ format(i - 1, i).format(**fmtspec) \ for i in range(1, deg + 1)]) + '\n' + \ '\n'.join(['*({{in0}} + {}) = {{in{}}}.v1;'. \ format(i + deg - 1, i).format(**fmtspec) \ for i in range(1, deg + 1)]) elif typ in ['i64', 'u64'] and simd_ext == 'neon128': return \ '{typ} buf[{biglen}];'.format(biglen=2 * deg, **fmtspec) + \ '\n'.join(['vst1q_{{suf}}(buf + {im1x2}, {{in{i}}});'. \ format(im1x2=2 * (i - 1), i=i).format(**fmtspec) \ for i in range(1, deg + 1)]) + \ '\n'.join(['''*({in0} + {i}) = buf[{ix2}]; *({in0} + {ipd}) = buf[{ix2p1}];'''. \ format(i=i, ipd=i + deg, ix2=i * 2, ix2p1=2 * i + 1, **fmtspec) \ for i in range(0, deg)]) else: return normal else: if deg == 1: return 'svst{deg}_{suf}({svtrue}, {in0}, {in1});'. \ format(deg=deg, **fmtspec) fill_soa_typ = \ '\n'.join(['tmp = svset{{deg}}_{{suf}}(tmp, {im1}, {{in{i}}});'. \ format(im1=i - 1, i=i).format(deg=deg, **fmtspec) \ for i in range(1, deg + 1)]) return \ '''{soa_typ} tmp = svundef{deg}_{suf}(); {fill_soa_typ} svst{deg}_{suf}({svtrue}, {in0}, tmp);'''. 
\ format(soa_typ=get_native_soa_typ('sve', typ, deg), deg=deg, fill_soa_typ=fill_soa_typ, **fmtspec) # ----------------------------------------------------------------------------- # Mask stores def mask_store(simd_ext, typ): if simd_ext in sve: return 'svst1_{suf}({in0}, {in1}, {in2});'.format(**fmtspec) if typ == 'f64' and simd_ext == 'neon128': return '''if ({in0}.v0) {{ {in1}[0] = {in2}.v0; }} if ({in0}.v1) {{ {in1}[1] = {in2}.v1; }}'''.format(**fmtspec) le = 128 // int(typ[1:]) normal = '''int i; {typ} buf[{le}]; u{typnbits} mask[{le}]; vst1q_{suf}(buf, {in2}); vst1q_u{typnbits}(mask, {in0}); for (i = 0; i < {le}; i++) {{ if (mask[i]) {{ {in1}[i] = buf[i]; }} }}'''.format(le=le, **fmtspec) if typ == 'f16': return \ '''#ifdef NSIMD_ARM_FP16 {normal} #else f32 buf[8]; u32 mask[8]; int i; vst1q_u32(mask, {in0}.v0); vst1q_u32(mask + 4, {in0}.v1); vst1q_f32(buf, {in2}.v0); vst1q_f32(buf + 4, {in2}.v1); for (i = 0; i < 8; i++) {{ if (mask[i]) {{ {in1}[i] = nsimd_f32_to_f16(buf[i]); }} }} #endif'''.format(normal=normal, **fmtspec) return normal # ----------------------------------------------------------------------------- # Length def len1(simd_ext, typ): if simd_ext in neon: return 'return {};'.format(128 // int(typ[1:])) elif simd_ext == 'sve': return 'return (int)svcntp_b{typnbits}({svtrue}, {svtrue});'. \ format(**fmtspec) elif simd_ext in fixed_sized_sve: return 'return {};'.format(int(simd_ext[3:]) // int(typ[1:])) # ----------------------------------------------------------------------------- # Add/sub def addsub(op, simd_ext, typ): ret = f16f64(simd_ext, typ, op, op, 2) if ret != '': return ret if simd_ext in neon: return 'return v{op}q_{suf}({in0}, {in1});'. \ format(op=op, **fmtspec) else: return 'return sv{op}_{suf}_x({svtrue}, {in0}, {in1});'. 
\ format(op=op, **fmtspec) # ----------------------------------------------------------------------------- # Multiplication def mul2(simd_ext, typ): ret = f16f64(simd_ext, typ, 'mul', 'mul', 2) if ret != '': return ret elif simd_ext in neon and typ in ['i64', 'u64']: return emulate_op2('*', simd_ext, typ) else: if simd_ext in neon: return 'return vmulq_{suf}({in0}, {in1});'.format(**fmtspec) else: return 'return svmul_{suf}_x({svtrue}, {in0}, {in1});'. \ format(**fmtspec) # ----------------------------------------------------------------------------- # Division def div2(simd_ext, typ): if simd_ext == 'aarch64' and typ in ['f32', 'f64']: return 'return vdivq_{suf}({in0}, {in1});'.format(**fmtspec) elif simd_ext in sve and \ typ in ['f16', 'f32', 'f64', 'i32', 'u32', 'i64', 'u64']: return 'return svdiv_{suf}_x({svtrue}, {in0}, {in1});'. \ format(**fmtspec) else: ret = f16f64(simd_ext, typ, 'div', 'div', 2) if ret != '': return ret return emulate_op2('/', simd_ext, typ) # ----------------------------------------------------------------------------- # Binary operators: and, or, xor, andnot def binop2(op, simd_ext, typ): armop = {'orb': 'orr', 'xorb': 'eor', 'andb': 'and', 'andnotb': 'bic'} if typ in common.iutypes: if simd_ext in neon: return 'return v{armop}q_{suf}({in0}, {in1});'. \ format(armop=armop[op], **fmtspec) else: return 'return sv{armop}_{suf}_x({svtrue}, {in0}, {in1});'. \ format(armop=armop[op], **fmtspec) # From here only float types if typ == 'f16': intrinsics = \ '''return vreinterpretq_f16_u16(v{armop}q_u16(vreinterpretq_u16_f16( {in0}), vreinterpretq_u16_f16({in1})));'''. \ format(armop=armop[op], **fmtspec) else: intrinsics = '' ret = f16f64(simd_ext, typ, op, armop[op], 2, intrinsics) if ret != '': return ret if simd_ext in neon: return \ '''return vreinterpretq_f{typnbits}_u{typnbits}(v{armop}q_u{typnbits}( vreinterpretq_u{typnbits}_f{typnbits}({in0}), vreinterpretq_u{typnbits}_f{typnbits}({in1})));'''. 
\ format(armop=armop[op], **fmtspec) else: return \ '''return svreinterpret_f{typnbits}_u{typnbits}( sv{armop}_u{typnbits}_x({svtrue}, svreinterpret_u{typnbits}_f{typnbits}({in0}), svreinterpret_u{typnbits}_f{typnbits}({in1})));'''. \ format(armop=armop[op], **fmtspec) # ----------------------------------------------------------------------------- # Binary not def not1(simd_ext, typ): if typ in common.iutypes: if simd_ext in neon: if typ in ['i8', 'u8', 'i16', 'u16', 'i32', 'u32']: return 'return vmvnq_{suf}({in0});'.format(**fmtspec) else: return \ '''return vreinterpretq_{suf}_u32(vmvnq_u32( vreinterpretq_u32_{suf}({in0})));'''. \ format(**fmtspec) if simd_ext in sve: return 'return svnot_{suf}_x({svtrue}, {in0});'.format(**fmtspec) # From here only float types if typ == 'f16': intrinsics = \ '''return vreinterpretq_f16_u16(vmvnq_u16(vreinterpretq_u16_f16( {in0})));'''.format(**fmtspec) else: intrinsics = '' ret = f16f64(simd_ext, typ, 'notb', 'mvn', 1, intrinsics) if ret != '': return ret if simd_ext in neon: return \ '''return vreinterpretq_{suf}_u32(vmvnq_u32( vreinterpretq_u32_{suf}({in0})));'''. \ format(**fmtspec) else: return \ '''return svreinterpret_{suf}_u{typnbits}(svnot_u{typnbits}_x( {svtrue}, svreinterpret_u{typnbits}_{suf}({in0})));'''. 
\ format(**fmtspec)

# -----------------------------------------------------------------------------
# Logical operators: and, or, xor, andnot

# Emit C source for the logical (mask) binary operators. On NEON the masks are
# plain unsigned vectors so the bitwise intrinsics (orr/eor/and/bic) apply; the
# f16 case needs an #ifdef because without native FP16 the mask is a pair of
# u32 vectors. On SVE, predicate intrinsics are used unless
# opts.sve_emulate_bool forces the unsigned-vector workaround.
def lop2(opts, op, simd_ext, typ):
    armop = {'orl': 'orr', 'xorl': 'eor', 'andl': 'and', 'andnotl': 'bic'}
    if simd_ext in neon:
        if typ == 'f16':
            return \
            '''#ifdef NSIMD_ARM_FP16
                 return v{armop}q_u16({in0}, {in1});
               #else
                 nsimd_{simd_ext}_vlf16 ret;
                 ret.v0 = v{armop}q_u32({in0}.v0, {in1}.v0);
                 ret.v1 = v{armop}q_u32({in0}.v1, {in1}.v1);
                 return ret;
               #endif'''.format(armop=armop[op], **fmtspec)
        elif simd_ext == 'neon128' and typ == 'f64':
            # neon128 has no f64 vector: the mask is a struct of two u64
            # scalars, so operate with plain C bitwise operators.
            if op == 'andnotl':
                return '''nsimd_{simd_ext}_vlf64 ret;
                          ret.v0 = {in0}.v0 & (~{in1}.v0);
                          ret.v1 = {in0}.v1 & (~{in1}.v1);
                          return ret;'''.format(**fmtspec)
            else:
                cpuop = {'orl': '|', 'xorl': '^', 'andl': '&'}
                return '''nsimd_{simd_ext}_vlf64 ret;
                          ret.v0 = {in0}.v0 {cpuop} {in1}.v0;
                          ret.v1 = {in0}.v1 {cpuop} {in1}.v1;
                          return ret;'''.format(cpuop=cpuop[op], **fmtspec)
        else:
            return 'return v{armop}q_u{typnbits}({in0}, {in1});'. \
                   format(armop=armop[op], **fmtspec)
    else:
        if opts.sve_emulate_bool:
            # TODO: the casts are a workaround to avoid a bug in gcc trunk for sve
            # it needs to be deleted when the bug is corrected
            return \
            '''return sv{armop}_x({svtrue}, (svuint{typnbits}_t){in0},
                                  (svuint{typnbits}_t){in1});'''. \
                                  format(armop=armop[op], **fmtspec)
        else:
            return '''return sv{armop}_z({svtrue}, {in0}, {in1});'''. \
                   format(armop=armop[op], **fmtspec)

# -----------------------------------------------------------------------------
# Logical not

# Emit C source for logical (mask) negation. NEON uses vmvnq on the unsigned
# mask vector; 64-bit lanes have no vmvnq so the value is viewed as u32,
# negated and viewed back. SVE uses svnot on the predicate (or the unsigned
# vector when bools are emulated).
def lnot1(opts, simd_ext, typ):
    if simd_ext in neon:
        if typ == 'f16':
            return \
            '''#ifdef NSIMD_ARM_FP16
                 return vmvnq_u16({in0});
               #else
                 nsimd_{simd_ext}_vlf16 ret;
                 ret.v0 = vmvnq_u32({in0}.v0);
                 ret.v1 = vmvnq_u32({in0}.v1);
                 return ret;
               #endif'''.format(**fmtspec)
        elif simd_ext == 'neon128' and typ == 'f64':
            return '''nsimd_neon128_vlf64 ret;
                      ret.v0 = ~{in0}.v0;
                      ret.v1 = ~{in0}.v1;
                      return ret;'''.format(**fmtspec)
        elif typ in ['i64', 'u64', 'f64']:
            # no vmvnq for 64-bit lanes: bounce through u32
            return '''return vreinterpretq_u{typnbits}_u32(vmvnq_u32(
                               vreinterpretq_u32_u{typnbits}({in0})));'''. \
                               format(**fmtspec)
        else:
            return 'return vmvnq_u{typnbits}({in0});'.format(**fmtspec)
    elif simd_ext in sve:
        if opts.sve_emulate_bool:
            # TODO: the cast is a workaround to avoid a bug in gcc trunk for sve
            # it needs to be deleted when the bug is corrected
            return 'return svnot_x({svtrue}, (svuint{typnbits}_t){in0});'.format(**fmtspec)
        else:
            return 'return svnot_z({svtrue}, {in0});'.format(**fmtspec)

# -----------------------------------------------------------------------------
# Square root

# Emit C source for sqrt. neon128 has no vsqrtq: f16/f64 go through the
# f16f64 emulation helper and other types through scalar emulation; aarch64
# has native vsqrtq; SVE has svsqrt.
def sqrt1(simd_ext, typ):
    if simd_ext == 'neon128':
        # NOTE(review): `typ in 'f16'` is a substring test, not an equality
        # test ('f' and '1' and '16' also match) — presumably `typ == 'f16'`
        # was intended; it works only because no other typ value is a
        # substring of 'f16'. TODO confirm and tighten.
        if typ in 'f16':
            return '''nsimd_neon128_vf16 ret;
                      ret.v0 = nsimd_sqrt_neon128_f32({in0}.v0);
                      ret.v1 = nsimd_sqrt_neon128_f32({in0}.v1);
                      return ret;'''.format(**fmtspec)
        elif typ == 'f64':
            return f16f64('neon128', 'f64', 'sqrt', 'sqrt', 1)
        else:
            return emulate_op1('sqrt', simd_ext, typ)
    elif simd_ext == 'aarch64':
        if typ == 'f16':
            return f16f64('aarch64', 'f16', 'sqrt', 'sqrt', 1)
        else:
            return 'return vsqrtq_{suf}({in0});'.format(**fmtspec)
    else:
        return 'return svsqrt_{suf}_x({svtrue}, {in0});'.format(**fmtspec)

# -----------------------------------------------------------------------------
# Shifts

# Emit C source for shl/shr by an immediate. NEON only has vshlq (shift by a
# signed vector), so right shifts pass a negated count; signed inputs are
# viewed as unsigned so that shr is logical. SVE uses lsl/lsr, with signed
# types reinterpreted to their bitfield (unsigned) type for lsr.
def shl_shr(op, simd_ext, typ):
    if simd_ext in neon:
        sign = '-' if op == 'shr' else ''
        if typ in common.utypes:
            return '''return vshlq_{suf}({in0}, vdupq_n_s{typnbits}(
                               (i{typnbits})({sign}{in1})));'''. \
                               format(sign=sign, **fmtspec)
        else:
            return \
            '''return vreinterpretq_s{typnbits}_u{typnbits}(vshlq_u{typnbits}(
                        vreinterpretq_u{typnbits}_s{typnbits}({in0}),
                        vdupq_n_s{typnbits}((i{typnbits})({sign}{in1}))));'''. \
                        format(sign=sign, **fmtspec)
    else:
        armop = 'lsl' if op == 'shl' else 'lsr'
        if op == 'shr' and typ in common.itypes:
            return \
            '''return svreinterpret_{suf}_{suf2}(sv{armop}_{suf2}_x({svtrue},
                        svreinterpret_{suf2}_{suf}({in0}),
                        svdup_n_u{typnbits}((u{typnbits}){in1})));'''. \
                        format(suf2=common.bitfield_type[typ], armop=armop,
                               **fmtspec)
        else:
            return '''return sv{armop}_{suf}_x({svtrue}, {in0},
                               svdup_n_u{typnbits}((u{typnbits}){in1}));'''. \
                               format(armop=armop, **fmtspec)

# Emit C source for arithmetic right shift. Unsigned types delegate to the
# logical shift; signed NEON uses vshlq with a negative count (which is an
# arithmetic shift on signed vectors); SVE uses svasr.
def shra(simd_ext, typ):
    if typ in common.utypes:
        return '''return nsimd_shr_{simd_ext}_{typ}({in0}, {in1});'''. \
               format(**fmtspec)
    if simd_ext in neon:
        return '''return vshlq_{suf}(
                    {in0}, vdupq_n_s{typnbits}((i{typnbits})-{in1}));'''.\
                    format(**fmtspec)
    elif simd_ext in sve:
        if typ[0] == 'i':
            return '''return svasr_n_{suf}_x({svtrue}, {in0},
                               (u{typnbits}){in1});'''.\
                               format(**fmtspec)
        elif typ[0] == 'u':
            # unreachable in practice: utypes already returned above
            return 'return svlsl_n_{suf}_x({svtrue}, {in0}, (u64){in1});'.\
                   format(**fmtspec)

# -----------------------------------------------------------------------------
# Set1

# Emit C source for broadcasting a scalar into all lanes (vdupq/svdup); the
# non-FP16 f16 fallback converts to f32 and fills both halves.
def set1(simd_ext, typ):
    if simd_ext in neon:
        if typ == 'f16':
            return '''#ifdef NSIMD_ARM_FP16
                        return vdupq_n_f16({in0});
                      #else
                        nsimd_{simd_ext}_vf16 ret;
                        f32 f = nsimd_f16_to_f32({in0});
                        ret.v0 = nsimd_set1_{simd_ext}_f32(f);
                        ret.v1 = nsimd_set1_{simd_ext}_f32(f);
                        return ret;
                      #endif'''.format(**fmtspec)
        elif simd_ext == 'neon128' and typ == 'f64':
            return '''nsimd_neon128_vf64 ret;
                      ret.v0 = {in0};
                      ret.v1 = {in0};
                      return ret;'''.format(**fmtspec)
        else:
            return 'return vdupq_n_{suf}({in0});'.format(**fmtspec)
    else:
        return 'return svdup_n_{suf}({in0});'.format(**fmtspec)

# -----------------------------------------------------------------------------
# Set1l

# Emit C source for broadcasting a boolean into a logical (mask) vector:
# all-ones or all-zeros lanes on NEON, ptrue/pfalse predicates on SVE.
def lset1(simd_ext, typ):
    if simd_ext in sve:
        return '''if ({in0}) {{
                    return svptrue_b{typnbits}();
                  }} else {{
                    return svpfalse_b();
                  }}'''.format(**fmtspec)
    # getting here means no NEON and AARCH64 only
    mask = 'vdupq_n_u{typnbits}((u{typnbits}){{}})'.format(**fmtspec)
    normal = '''if ({in0}) {{
                  return {ones};
                }} else {{
                  return {zeros};
                }}'''.format(ones=mask.format('-1'), zeros=mask.format('0'),
                             **fmtspec)
    if typ == 'f16':
        return '''#ifdef NSIMD_ARM_FP16
                    {normal}
                  #else
                    nsimd_{simd_ext}_vlf16 ret;
                    ret.v0 = nsimd_set1l_{simd_ext}_f32({in0});
                    ret.v1 = ret.v0;
                    return ret;
                  #endif'''.format(normal=normal, **fmtspec)
    if typ == 'f64' and simd_ext == 'neon128':
        return '''nsimd_neon128_vlf64 ret;
                  ret.v0 = (u64)({in0} ? -1 : 0);
                  ret.v1 = ret.v0;
                  return ret;'''.format(**fmtspec)
    return normal

# -----------------------------------------------------------------------------
# Comparison operators: ==, <, <=, >, >=

# Emit C source for the ordered comparisons. NEON uses vceqq/vcltq/…; 64-bit
# neon128 lanes are compared through memory with C operators. SVE compares
# into a predicate (converted to a vector when bools are emulated).
def cmp2(opts, op, simd_ext, typ):
    binop = {'eq': '==', 'lt': '<', 'le': '<=', 'gt': '>', 'ge': '>='}
    armop = {'eq': 'eq', 'lt': 'lt', 'le': 'le', 'gt': 'gt', 'ge': 'ge'}
    if simd_ext in neon:
        emul_f16 = '''nsimd_{simd_ext}_vlf16 ret;
                      ret.v0 = nsimd_{op}_{simd_ext}_f32({in0}.v0, {in1}.v0);
                      ret.v1 = nsimd_{op}_{simd_ext}_f32({in0}.v1, {in1}.v1);
                      return ret;'''.format(op=op, **fmtspec)
        normal = 'return vc{armop}q_{suf}({in0}, {in1});'. \
                 format(armop=armop[op], **fmtspec)
        if typ == 'f16':
            if simd_ext == 'neon128':
                return emul_f16
            else:
                return \
                '''#ifdef NSIMD_ARM_FP16
                     {}
                   #else
                     {}
                   #endif'''.format(normal, emul_f16)
        if simd_ext == 'neon128' and typ == 'f64':
            return '''nsimd_{simd_ext}_vl{typ} ret;
                      ret.v0 = {in0}.v0 {op} {in1}.v0 ? (u64)-1 : 0;
                      ret.v1 = {in0}.v1 {op} {in1}.v1 ? (u64)-1 : 0;
                      return ret;'''.format(op=binop[op], **fmtspec)
        elif simd_ext == 'neon128' and typ in ['i64', 'u64']:
            # no 64-bit compare intrinsics on neon128: go through memory
            return '''{typ} buf0[2], buf1[2];
                      u64 ret[2];
                      vst1q_{suf}(buf0, {in0});
                      vst1q_{suf}(buf1, {in1});
                      ret[0] = buf0[0] {op} buf1[0] ? (u64)-1 : 0;
                      ret[1] = buf0[1] {op} buf1[1] ? (u64)-1 : 0;
                      return vld1q_u64(ret);'''. \
                      format(op=binop[op], **fmtspec)
        else:
            return normal
    elif simd_ext in sve:
        if opts.sve_emulate_bool:
            # TODO: the casts are a workaround to avoid a bug in gcc trunk for sve
            # it needs to be deleted when the bug is corrected
            comp = 'svcmp{op}_{suf}({svtrue}, ({svetyp}){in0}, ({svetyp}){in1})'. \
                   format(op=armop[op], **fmtspec)
            return 'return {};'.format(convert_from_predicate(opts, comp))
        else:
            return 'return svcmp{op}_{suf}({svtrue}, {in0}, {in1});'. \
                   format(op=armop[op], **fmtspec)

# -----------------------------------------------------------------------------
# Not equal

# Emit C source for !=: NEON negates the eq mask; SVE uses svcmpne.
def neq2(opts, simd_ext, typ):
    if simd_ext in neon:
        return '''return nsimd_notl_{simd_ext}_{typ}(
                    nsimd_eq_{simd_ext}_{typ}({in0}, {in1}));'''. \
                    format(**fmtspec)
    elif simd_ext in sve:
        comp = 'svcmpne_{suf}({svtrue}, {in0}, {in1})'. \
               format(**fmtspec)
        return 'return {};'.format(convert_from_predicate(opts, comp))

# -----------------------------------------------------------------------------
# If_else

# Emit C source for lane-wise select: vbslq on NEON, svsel on SVE. The
# neon128 f64 case selects per scalar half on the u64 mask.
def if_else3(opts, simd_ext, typ):
    if simd_ext in neon:
        intrinsic = 'return vbslq_{suf}({in0}, {in1}, {in2});'. \
                    format(**fmtspec)
        if typ == 'f16':
            return \
            '''#ifdef NSIMD_ARM_FP16
                 {intrinsic}
               #else
                 nsimd_{simd_ext}_vf16 ret;
                 ret.v0 = nsimd_if_else1_{simd_ext}_f32(
                            {in0}.v0, {in1}.v0, {in2}.v0);
                 ret.v1 = nsimd_if_else1_{simd_ext}_f32(
                            {in0}.v1, {in1}.v1, {in2}.v1);
                 return ret;
               #endif'''.format(intrinsic=intrinsic, **fmtspec)
        elif simd_ext == 'neon128' and typ == 'f64':
            return '''nsimd_neon128_vf64 ret;
                      ret.v0 = {in0}.v0 != 0u ? {in1}.v0 : {in2}.v0;
                      ret.v1 = {in0}.v1 != 0u ? {in1}.v1 : {in2}.v1;
                      return ret;'''.format(**fmtspec)
        else:
            return intrinsic
    elif simd_ext in sve:
        if opts.sve_emulate_bool:
            # TODO: the casts are a workaround to avoid a bug in gcc trunk for sve
            # it needs to be deleted when the bug is corrected
            return 'return svsel_{suf}({cond}, ({svetyp}){in1}, ({svetyp}){in2});' \
                   .format(cond=convert_to_predicate(opts,
                                                    '{in0}'.format(**fmtspec)),
                           **fmtspec)
        else:
            return 'return svsel_{suf}({in0}, {in1}, {in2});' \
                   .format(**fmtspec)

# -----------------------------------------------------------------------------
# Minimum and maximum

# Emit C source for min/max. f16/f64 special cases are handled by f16f64;
# 64-bit NEON lanes go through memory; otherwise vminq/vmaxq or svmin/svmax.
def minmax2(op, simd_ext, typ):
    ret = f16f64(simd_ext, typ, op, op, 2)
    if ret != '':
        return ret
    if simd_ext in neon:
        if typ in ['i64', 'u64']:
            binop = '<' if op == 'min' else '>'
            return '''{typ} buf0[2], buf1[2];
                      vst1q_{suf}(buf0, {in0});
                      vst1q_{suf}(buf1, {in1});
                      buf0[0] = buf0[0] {binop} buf1[0] ? buf0[0] : buf1[0];
                      buf0[1] = buf0[1] {binop} buf1[1] ? buf0[1] : buf1[1];
                      return vld1q_{suf}(buf0);'''. \
                      format(binop=binop, **fmtspec)
        else:
            return 'return v{op}q_{suf}({in0}, {in1});'. \
                   format(op=op, **fmtspec)
    else:
        return 'return sv{op}_{suf}_x({svtrue}, {in0}, {in1});'. \
               format(op=op, **fmtspec)

# -----------------------------------------------------------------------------
# Abs

# Emit C source for absolute value: identity for unsigned types, vabsq where
# NEON provides it, emulation for neon128 i64, svabs on SVE.
def abs1(simd_ext, typ):
    if typ in common.utypes:
        return 'return {in0};'.format(**fmtspec)
    elif simd_ext in neon:
        if typ == 'f16':
            return f16f64(simd_ext, 'f16', 'abs', 'abs', 1)
        elif (typ in ['i8', 'i16', 'i32', 'f32']) or \
             (simd_ext == 'aarch64' and typ in ['i64', 'f64']):
            return 'return vabsq_{suf}({in0});'.format(**fmtspec)
        elif typ == 'i64':
            return emulate_op1('abs', 'neon128', 'i64')
        else:
            return f16f64(simd_ext, 'f64', 'abs', 'abs', 1)
    else:
        return 'return svabs_{suf}_x({svtrue}, {in0});'. \
               format(**fmtspec)

# -----------------------------------------------------------------------------
# Round, trunc, ceil and round_to_even

# Emit C source for the rounding family: identity on integers, vrnd* on
# aarch64, svrint* on SVE, emulation on neon128.
def round1(op, simd_ext, typ):
    if typ in common.iutypes:
        return 'return {in0};'.format(**fmtspec)
    armop = {'floor': 'rndm', 'ceil': 'rndp', 'trunc': 'rnd',
             'round_to_even': 'rndn'}
    if simd_ext == 'neon128':
        # NOTE(review): `armop=armop` formats the whole dict into the
        # intrinsic name instead of `armop[op]` — this looks like a bug.
        # It is harmless only if f16f64 ignores this argument on the
        # neon128 path; TODO confirm (compare with the aarch64 branch
        # below, which correctly uses armop[op]).
        ret = f16f64('neon128', typ, op, 'v{armop}q_{suf}'. \
                     format(armop=armop, **fmtspec), 1)
        if ret != '':
            return ret
        return emulate_op1(op, 'neon128', typ);
    elif simd_ext == 'aarch64':
        if typ == 'f16':
            return f16f64('aarch64', 'f16', op, armop[op], 1)
        else:
            return 'return v{armop}q_{suf}({in0});'. \
                   format(armop=armop[op], **fmtspec)
    else:
        armop = {'floor': 'rintm', 'ceil': 'rintp', 'trunc': 'rintz',
                 'round_to_even': 'rintn'}
        return 'return sv{armop}_{suf}_x({svtrue}, {in0});'. \
               format(armop=armop[op], **fmtspec)

# -----------------------------------------------------------------------------
# FMA and FNMA

# Emit C source for fused multiply-add (a0*a1+a2) and its negated-product
# variant. Note the NEON/SVE accumulator-first argument order: the generated
# call is (a2, a1, a0).
def fmafnma3(op, simd_ext, typ):
    if typ in common.ftypes and simd_ext == 'aarch64':
        armop = {'fma': 'fma', 'fnma': 'fms'}
    else:
        armop = {'fma': 'mla', 'fnma': 'mls'}
    if simd_ext in neon:
        normal = 'return v{armop}q_{suf}({in2}, {in1}, {in0});'. \
                 format(armop=armop[op], **fmtspec)
        emul = emulate_op3_neon(op, simd_ext, typ)
        if typ == 'f16':
            using_f32 = \
            '''nsimd_{simd_ext}_vf16 ret;
               ret.v0 = nsimd_{op}_{simd_ext}_f32({in0}.v0, {in1}.v0, {in2}.v0);
               ret.v1 = nsimd_{op}_{simd_ext}_f32({in0}.v1, {in1}.v1, {in2}.v1);
               return ret;'''.format(op=op, **fmtspec)
            if simd_ext == 'aarch64':
                # NOTE(review): the NSIMD_ARM_FP16 branch uses the scalar
                # emulation (emul), not `normal` — presumably because
                # vfmaq_f16 availability differs from the FP16 macro;
                # TODO confirm this is intentional.
                return \
                '''#ifdef NSIMD_ARM_FP16
                     {}
                   #else
                     {}
                   #endif'''.format(emul, using_f32)
            else:
                return using_f32
        elif simd_ext == 'neon128' and typ == 'f64':
            return emulate_f64_neon('neon128', op, ['v'] * 4)
        elif simd_ext == 'aarch64' and typ == 'f64':
            return normal
        elif typ in ['i64', 'u64']:
            return emul
        else:
            return normal
    else:
        return 'return sv{armop}_{suf}_x({svtrue}, {in2}, {in1}, {in0});'. \
               format(armop=armop[op], **fmtspec)

# -----------------------------------------------------------------------------
# FMS and FNMS

# Emit C source for fms/fnms by composing neg with fma/fnma (integers), by
# negating the addend on NEON, or with svnmla/svnmls on SVE.
def fmsfnms3(op, simd_ext, typ):
    if typ in common.iutypes:
        return \
        '''return nsimd_neg_{simd_ext}_{typ}(nsimd_{op2}_{simd_ext}_{typ}(
                    {in0}, {in1}, {in2}));'''. \
                    format(op2='fma' if op == 'fnms' else 'fnma', **fmtspec)
    if simd_ext in neon:
        return \
        '''return nsimd_{op2}_{simd_ext}_{typ}({in0}, {in1},
                    nsimd_neg_{simd_ext}_{typ}({in2}));'''. \
                    format(op2='fma' if op == 'fms' else 'fnma', **fmtspec)
    else:
        armop = {'fnms': 'nmla', 'fms': 'nmls'}
        return 'return sv{armop}_{suf}_x({svtrue}, {in2}, {in1}, {in0});'. \
               format(armop=armop[op], **fmtspec)

# -----------------------------------------------------------------------------
# Neg

# Emit C source for negation. Unsigned types are negated through the signed
# view (2's complement); neon128 lacks 64-bit vnegq so those are emulated.
def neg1(simd_ext, typ):
    if simd_ext in neon:
        normal = 'return vnegq_{suf}({in0});'.format(**fmtspec)
        if typ == 'f16':
            return f16f64(simd_ext, 'f16', 'neg', 'neg', 1)
        elif typ in ['i8', 'i16', 'i32', 'f32']:
            return normal
        elif typ in ['u8', 'u16', 'u32']:
            return \
            '''return vreinterpretq_{suf}_s{typnbits}(
                        vnegq_s{typnbits}(
                          vreinterpretq_s{typnbits}_{suf}({in0})));'''. \
                          format(**fmtspec)
        elif simd_ext == 'neon128' and typ in ['i64', 'u64']:
            return emulate_op1('neg', simd_ext, typ)
        elif simd_ext == 'neon128' and typ == 'f64':
            return \
            '''nsimd_neon128_vf64 ret;
               ret.v0 = -{in0}.v0;
               ret.v1 = -{in0}.v1;
               return ret;'''.format(**fmtspec)
        elif simd_ext == 'aarch64' and typ in ['f64', 'i64']:
            return normal
        elif simd_ext == 'aarch64' and typ == 'u64':
            return \
            '''return vreinterpretq_u64_s64(vnegq_s64(
                        vreinterpretq_s64_u64({in0})));'''. \
                        format(**fmtspec)
    else:
        if typ in common.utypes:
            return \
            '''return svreinterpret_{suf}_s{typnbits}(
                        svneg_s{typnbits}_x({svtrue},
                          svreinterpret_s{typnbits}_{suf}({in0})));'''. \
                          format(**fmtspec)
        else:
            return 'return svneg_{suf}_x({svtrue}, {in0});'.format(**fmtspec)

# -----------------------------------------------------------------------------
# Reciprocals

# Emit C source for the reciprocal family. rec/rec11/rsqrt11 are generated
# as full-precision divisions (see the comment block below for why); only
# rec8/rsqrt8 use the hardware estimate intrinsics (vrecpeq/vrsqrteq,
# svrecpe/svrsqrte).
def recs1(op, simd_ext, typ):
    cte = '({typ})1'.format(**fmtspec) if typ != 'f16' \
          else 'nsimd_f32_to_f16(1.0f)'
    if op in ['rec', 'rec11']:
        return \
        '''return nsimd_div_{simd_ext}_{typ}(
                    nsimd_set1_{simd_ext}_{typ}({cte}), {in0});'''. \
                    format(cte=cte, **fmtspec)
    elif op == 'rsqrt11':
        return \
        '''return nsimd_div_{simd_ext}_{typ}(
                    nsimd_set1_{simd_ext}_{typ}({cte}),
                    nsimd_sqrt_{simd_ext}_{typ}({in0}));'''. \
                    format(cte=cte, **fmtspec)
    elif op in ['rec8', 'rsqrt8']:
        armop = 'recpe' if op == 'rec8' else 'rsqrte'
        if simd_ext in sve:
            return 'return sv{armop}_{suf}({in0});'. \
                   format(armop=armop, **fmtspec)
        else:
            ret = f16f64(simd_ext, typ, op, armop, 1)
            if ret != '':
                return ret
            return 'return v{armop}q_{suf}({in0});'. \
                   format(armop=armop, **fmtspec)

# Rec11 and rsqrt11
# According to http://infocenter.arm.com/help/topic/com.arm.doc.faqs/ka14282.html
# reciprocal estimates only work when inputs is restrained in some small
# interval so we comment these for now and return full-precision reciprocals.

# def rec11rsqrt11(op, simd_ext, typ):
#     armop = {'rec11': 'recpe', 'rsqrt11': 'rsqrte'}
#     if simd_ext in neon:
#         ret = f16f64(simd_ext, typ, op, armop[op], 1)
#         if ret != '':
#             return ret
#         return 'return v{armop}q_{suf}({in0});'. \
#                format(armop=armop[op], **fmtspec)
#     else:
#         return 'return sv{armop}_{suf}({in0});'. \
#                format(armop=armop[op], **fmtspec)

# -----------------------------------------------------------------------------
# Load of logicals

# Emit C source for loading a logical vector: load values then compare
# against zero ("!= 0" realized as not(eq)).
def loadl(aligned, simd_ext, typ):
    return \
    '''/* This can surely be improved but it is not our priority. */
       return nsimd_notl_{simd_ext}_{typ}(nsimd_eq_{simd_ext}_{typ}(
                nsimd_load{align}_{simd_ext}_{typ}(
                  {in0}), nsimd_set1_{simd_ext}_{typ}({zero})));'''. \
       format(align='a' if aligned else 'u',
              zero='nsimd_f32_to_f16(0.0f)' if typ == 'f16'
              else '({})0'.format(typ), **fmtspec)

# -----------------------------------------------------------------------------
# Store of logicals

# Emit C source for storing a logical vector: select 1 or 0 per lane, then a
# normal store.
def storel(aligned, simd_ext, typ):
    return \
    '''/* This can surely be improved but it is not our priority. */
       nsimd_store{align}_{simd_ext}_{typ}({in0},
         nsimd_if_else1_{simd_ext}_{typ}({in1},
           nsimd_set1_{simd_ext}_{typ}({one}),
           nsimd_set1_{simd_ext}_{typ}({zero})));'''. \
       format(align='a' if aligned else 'u',
              one='nsimd_f32_to_f16(1.0f)' if typ == 'f16'
              else '({})1'.format(typ),
              zero='nsimd_f32_to_f16(0.0f)' if typ == 'f16'
              else '({})0'.format(typ), **fmtspec)

# -----------------------------------------------------------------------------
# All and any

# Emit C source for the all/any reductions on logical vectors: lane-by-lane
# on neon128, vminvq/vmaxvq on aarch64, svptest on SVE.
def allany1(opts, op, simd_ext, typ):
    binop = '&&' if op == 'all' else '||'
    if simd_ext == 'neon128':
        if typ == 'f16':
            return \
            '''return nsimd_{op}_neon128_f32({in0}.v0) {binop}
                      nsimd_{op}_neon128_f32({in0}.v1);'''. \
                      format(op=op, binop=binop, **fmtspec)
        elif typ == 'f64':
            return 'return {in0}.v0 {binop} {in0}.v1;'. \
                   format(binop=binop, **fmtspec)
        else:
            return 'return ' + \
            binop.join(['vgetq_lane_u{typnbits}({in0}, {i})'. \
                        format(i=i, **fmtspec) \
                        for i in range(0, 128 // int(fmtspec['typnbits']))]) + \
            ';'
    elif simd_ext == 'aarch64':
        # min-reduce != 0 <=> all lanes set; max-reduce != 0 <=> any lane set
        armop = {'all': 'min', 'any': 'max'}
        normal = 'return v{armop}vq_u{typnbits}({in0}) != 0;'. \
                 format(armop=armop[op], **fmtspec)
        if typ == 'f16':
            return \
            '''#ifdef NSIMD_ARM_FP16
                 {normal}
               #else
                 return nsimd_{op}_aarch64_f32({in0}.v0) {binop}
                        nsimd_{op}_aarch64_f32({in0}.v1);
               #endif'''.format(normal=normal, op=op, binop=binop, **fmtspec)
        elif typ in ['i64', 'u64', 'f64']:
            return \
            'return v{armop}vq_u32(vreinterpretq_u32_u64({in0})) != 0;'. \
            format(armop=armop[op], **fmtspec)
        else:
            return normal
    elif simd_ext in sve:
        if op == 'any':
            operand = convert_to_predicate(opts, '{in0}'.format(**fmtspec))
            return '''return svptest_any({svtrue}, {operand});'''. \
                   format(operand=operand, **fmtspec)
        else:
            # all(x) <=> !any(!x)
            operand = 'svnot_z({svtrue}, {op})'. \
                      format(op=convert_to_predicate(opts,
                                                     '{in0}'.format(**fmtspec)),
                             **fmtspec)
            return '''return !svptest_any({svtrue}, {operand});'''. \
                   format(operand=operand, **fmtspec)

# -----------------------------------------------------------------------------
# nbtrue

# Emit C source counting the number of set lanes in a logical vector. NEON
# logical lanes are all-ones (i.e. -1 as signed), so summing the signed view
# and negating yields the count. SVE uses svcntp.
def nbtrue1(opts, simd_ext, typ):
    if simd_ext == 'neon128':
        if typ == 'f16':
            return \
            '''return nsimd_nbtrue_neon128_f32({in0}.v0) +
                      nsimd_nbtrue_neon128_f32({in0}.v1);'''. \
                      format(**fmtspec)
        elif typ == 'f64':
            return 'return -(int)((i64){in0}.v0 + (i64){in0}.v1);'. \
                   format(**fmtspec)
        else:
            return \
            '''nsimd_neon128_vi{typnbits} temp =
                   vreinterpretq_s{typnbits}_u{typnbits}({in0});
               return -(int)('''.format(**fmtspec) + \
            '+'.join(['vgetq_lane_s{typnbits}(temp, {i})'. \
                      format(i=i, **fmtspec) \
                      for i in range(0, 128 // int(fmtspec['typnbits']))]) + \
            ');'
    elif simd_ext == 'aarch64':
        normal = \
        '''return -(int)vaddvq_s{typnbits}(
                    vreinterpretq_s{typnbits}_u{typnbits}({in0}));'''. \
                    format(**fmtspec)
        if typ == 'f16':
            return \
            '''#ifdef NSIMD_ARM_FP16
                 {normal}
               #else
                 return nsimd_nbtrue_aarch64_f32({in0}.v0) +
                        nsimd_nbtrue_aarch64_f32({in0}.v1);
               #endif'''.format(normal=normal, **fmtspec)
        elif typ in ['i64', 'u64', 'f64']:
            # each 64-bit lane contributes two -1 u32 halves: halve the sum
            return \
            '''return -(vaddvq_s32(vreinterpretq_s32_u64({in0})) >> 1);'''. \
            format(**fmtspec)
        else:
            return normal
    elif simd_ext in sve:
        return 'return (int)svcntp_b{typnbits}({svtrue}, {op});'. \
               format(op=convert_to_predicate(opts,
                                              '{in0}'.format(**fmtspec)),
                      **fmtspec)

# -----------------------------------------------------------------------------
# Reinterpret logical

# Emit C source reinterpreting a logical vector between element types of the
# same width. Only the f16 <-> u16 views need per-lane expansion (the non-FP16
# f16 mask is a pair of u32 vectors); everything else is a no-op.
def reinterpretl1(simd_ext, from_typ, to_typ):
    if from_typ == to_typ or simd_ext in sve:
        return 'return {in0};'.format(**fmtspec)
    to_f16_with_f32 = \
    '''nsimd_{simd_ext}_vlf16 ret;
       u32 buf[4];
       buf[0] = (vgetq_lane_u16({in0}, 0) ? (u32)-1 : 0);
       buf[1] = (vgetq_lane_u16({in0}, 1) ? (u32)-1 : 0);
       buf[2] = (vgetq_lane_u16({in0}, 2) ? (u32)-1 : 0);
       buf[3] = (vgetq_lane_u16({in0}, 3) ? (u32)-1 : 0);
       ret.v0 = vld1q_u32(buf);
       buf[0] = (vgetq_lane_u16({in0}, 4) ? (u32)-1 : 0);
       buf[1] = (vgetq_lane_u16({in0}, 5) ? (u32)-1 : 0);
       buf[2] = (vgetq_lane_u16({in0}, 6) ? (u32)-1 : 0);
       buf[3] = (vgetq_lane_u16({in0}, 7) ? (u32)-1 : 0);
       ret.v1 = vld1q_u32(buf);
       return ret;'''.format(**fmtspec)
    from_f16_with_f32 = \
    '''u16 buf[8];
       buf[0] = (vgetq_lane_u32({in0}.v0, 0) ? (u16)-1 : 0);
       buf[1] = (vgetq_lane_u32({in0}.v0, 1) ? (u16)-1 : 0);
       buf[2] = (vgetq_lane_u32({in0}.v0, 2) ? (u16)-1 : 0);
       buf[3] = (vgetq_lane_u32({in0}.v0, 3) ? (u16)-1 : 0);
       buf[4] = (vgetq_lane_u32({in0}.v1, 0) ? (u16)-1 : 0);
       buf[5] = (vgetq_lane_u32({in0}.v1, 1) ? (u16)-1 : 0);
       buf[6] = (vgetq_lane_u32({in0}.v1, 2) ? (u16)-1 : 0);
       buf[7] = (vgetq_lane_u32({in0}.v1, 3) ? (u16)-1 : 0);
       return vld1q_u16(buf);'''.format(**fmtspec)
    if simd_ext == 'neon128':
        if to_typ == 'f16':
            return to_f16_with_f32
        elif from_typ == 'f16':
            return from_f16_with_f32
        elif to_typ == 'f64':
            return '''nsimd_neon128_vlf64 ret;
                      ret.v0 = vgetq_lane_u64({in0}, 0);
                      ret.v1 = vgetq_lane_u64({in0}, 1);
                      return ret;'''.format(**fmtspec)
        elif from_typ == 'f64':
            return '''u64 buf[2];
                      buf[0] = {in0}.v0;
                      buf[1] = {in0}.v1;
                      return vld1q_u64(buf);'''.format(**fmtspec)
        else:
            return 'return {in0};'.format(**fmtspec)
    elif simd_ext == 'aarch64':
        if to_typ == 'f16':
            return '''#ifdef NSIMD_ARM_FP16
                        return {in0};
                      #else
                        {using_f32}
                      #endif'''.format(using_f32=to_f16_with_f32, **fmtspec)
        elif from_typ == 'f16':
            return '''#ifdef NSIMD_ARM_FP16
                        return {in0};
                      #else
                        {using_f32}
                      #endif'''.format(using_f32=from_f16_with_f32, **fmtspec)
        else:
            return 'return {in0};'.format(**fmtspec)

# -----------------------------------------------------------------------------
# Convert

# Emit C source for value conversion between same-width types: reinterpret
# for int <-> int, vcvtq/svcvt for int <-> float, per-lane copies for the
# f16/f64 emulated representations.
def convert1(simd_ext, from_typ, to_typ):
    fmtspec2 = fmtspec.copy()
    fmtspec2['to_suf'] = suf(to_typ)
    fmtspec2['from_suf'] = suf(from_typ)
    if from_typ == to_typ:
        return 'return {in0};'.format(**fmtspec)
    if from_typ in common.iutypes and to_typ in common.iutypes:
        if simd_ext in neon:
            return 'return vreinterpretq_{to_suf}_{from_suf}({in0});'. \
                   format(**fmtspec2)
        else:
            return 'return svreinterpret_{to_suf}_{from_suf}({in0});'. \
                   format(**fmtspec2)
    if simd_ext in sve:
        return 'return svcvt_{to_suf}_{from_suf}_x({svtrue}, {in0});'. \
               format(**fmtspec2)
    to_f16_with_f32 = \
    '''nsimd_{simd_ext}_vf16 ret;
       f32 buf[4];
       buf[0] = (f32)vgetq_lane_{from_suf}({in0}, 0);
       buf[1] = (f32)vgetq_lane_{from_suf}({in0}, 1);
       buf[2] = (f32)vgetq_lane_{from_suf}({in0}, 2);
       buf[3] = (f32)vgetq_lane_{from_suf}({in0}, 3);
       ret.v0 = vld1q_f32(buf);
       buf[0] = (f32)vgetq_lane_{from_suf}({in0}, 4);
       buf[1] = (f32)vgetq_lane_{from_suf}({in0}, 5);
       buf[2] = (f32)vgetq_lane_{from_suf}({in0}, 6);
       buf[3] = (f32)vgetq_lane_{from_suf}({in0}, 7);
       ret.v1 = vld1q_f32(buf);
       return ret;'''.format(**fmtspec2)
    from_f16_with_f32 = \
    '''{to_typ} buf[8];
       buf[0] = ({to_typ})vgetq_lane_f32({in0}.v0, 0);
       buf[1] = ({to_typ})vgetq_lane_f32({in0}.v0, 1);
       buf[2] = ({to_typ})vgetq_lane_f32({in0}.v0, 2);
       buf[3] = ({to_typ})vgetq_lane_f32({in0}.v0, 3);
       buf[4] = ({to_typ})vgetq_lane_f32({in0}.v1, 0);
       buf[5] = ({to_typ})vgetq_lane_f32({in0}.v1, 1);
       buf[6] = ({to_typ})vgetq_lane_f32({in0}.v1, 2);
       buf[7] = ({to_typ})vgetq_lane_f32({in0}.v1, 3);
       return vld1q_{to_suf}(buf);'''.format(**fmtspec2)
    if simd_ext == 'neon128':
        if to_typ == 'f16':
            return to_f16_with_f32
        elif from_typ == 'f16':
            return from_f16_with_f32
        elif to_typ == 'f64':
            return '''nsimd_neon128_vf64 ret;
                      ret.v0 = (f64)vgetq_lane_{from_suf}({in0}, 0);
                      ret.v1 = (f64)vgetq_lane_{from_suf}({in0}, 1);
                      return ret;'''.format(**fmtspec2)
        elif from_typ == 'f64':
            return '''{to_typ} buf[2];
                      buf[0] = ({to_typ}){in0}.v0;
                      buf[1] = ({to_typ}){in0}.v1;
                      return vld1q_{to_suf}(buf);'''.format(**fmtspec2)
        else:
            return 'return vcvtq_{to_suf}_{from_suf}({in0});'. \
                   format(**fmtspec2)
    elif simd_ext == 'aarch64':
        if to_typ == 'f16':
            return '''#ifdef NSIMD_ARM_FP16
                        return vcvtq_{to_suf}_{from_suf}({in0});
                      #else
                        {using_f32}
                      #endif'''.format(using_f32=to_f16_with_f32, **fmtspec2)
        elif from_typ == 'f16':
            return '''#ifdef NSIMD_ARM_FP16
                        return vcvtq_{to_suf}_{from_suf}({in0});
                      #else
                        {using_f32}
                      #endif'''.format(using_f32=from_f16_with_f32, **fmtspec2)
        else:
            return 'return vcvtq_{to_suf}_{from_suf}({in0});'. \
                   format(**fmtspec2)

# -----------------------------------------------------------------------------
# Reinterpret

# Emit C source for bit-level reinterpretation between same-width types.
# The emulated f16/f64 representations go through per-lane u16 views or a
# scalar union; otherwise vreinterpretq/svreinterpret.
def reinterpret1(simd_ext, from_typ, to_typ):
    fmtspec2 = fmtspec.copy()
    fmtspec2['to_suf'] = suf(to_typ)
    fmtspec2['from_suf'] = suf(from_typ)
    if from_typ == to_typ:
        return 'return {in0};'.format(**fmtspec)
    if simd_ext in sve:
        return 'return svreinterpret_{to_suf}_{from_suf}({in0});'. \
               format(**fmtspec2)
    to_f16_with_f32 = \
    '''nsimd_{simd_ext}_vf16 ret;
       f32 buf[4];
       buf[0] = nsimd_u16_to_f32((u16)vgetq_lane_{from_suf}({in0}, 0));
       buf[1] = nsimd_u16_to_f32((u16)vgetq_lane_{from_suf}({in0}, 1));
       buf[2] = nsimd_u16_to_f32((u16)vgetq_lane_{from_suf}({in0}, 2));
       buf[3] = nsimd_u16_to_f32((u16)vgetq_lane_{from_suf}({in0}, 3));
       ret.v0 = vld1q_f32(buf);
       buf[0] = nsimd_u16_to_f32((u16)vgetq_lane_{from_suf}({in0}, 4));
       buf[1] = nsimd_u16_to_f32((u16)vgetq_lane_{from_suf}({in0}, 5));
       buf[2] = nsimd_u16_to_f32((u16)vgetq_lane_{from_suf}({in0}, 6));
       buf[3] = nsimd_u16_to_f32((u16)vgetq_lane_{from_suf}({in0}, 7));
       ret.v1 = vld1q_f32(buf);
       return ret;'''.format(**fmtspec2)
    from_f16_with_f32 = \
    '''{to_typ} buf[8];
       buf[0] = ({to_typ})nsimd_f32_to_u16(vgetq_lane_f32({in0}.v0, 0));
       buf[1] = ({to_typ})nsimd_f32_to_u16(vgetq_lane_f32({in0}.v0, 1));
       buf[2] = ({to_typ})nsimd_f32_to_u16(vgetq_lane_f32({in0}.v0, 2));
       buf[3] = ({to_typ})nsimd_f32_to_u16(vgetq_lane_f32({in0}.v0, 3));
       buf[4] = ({to_typ})nsimd_f32_to_u16(vgetq_lane_f32({in0}.v1, 0));
       buf[5] = ({to_typ})nsimd_f32_to_u16(vgetq_lane_f32({in0}.v1, 1));
       buf[6] = ({to_typ})nsimd_f32_to_u16(vgetq_lane_f32({in0}.v1, 2));
       buf[7] = ({to_typ})nsimd_f32_to_u16(vgetq_lane_f32({in0}.v1, 3));
       return vld1q_{to_suf}(buf);'''.format(**fmtspec2)
    if simd_ext == 'neon128':
        if to_typ == 'f16':
            return to_f16_with_f32
        elif from_typ == 'f16':
            return from_f16_with_f32
        elif to_typ == 'f64':
            return '''nsimd_neon128_vf64 ret;
                      union {{ f64 to; {from_typ} from; }} buf;
                      buf.from = vgetq_lane_{from_suf}({in0}, 0);
                      ret.v0 = buf.to;
                      buf.from = vgetq_lane_{from_suf}({in0}, 1);
                      ret.v1 = buf.to;
                      return ret;'''.format(**fmtspec2)
        elif from_typ == 'f64':
            return '''union {{ f64 from; {to_typ} to; }} buf_;
                      {to_typ} buf[2];
                      buf_.from = {in0}.v0;
                      buf[0] = buf_.to;
                      buf_.from = {in0}.v1;
                      buf[1] = buf_.to;
                      return vld1q_{to_suf}(buf);'''.format(**fmtspec2)
        else:
            return 'return vreinterpretq_{to_suf}_{from_suf}({in0});'. \
                   format(**fmtspec2)
    elif simd_ext == 'aarch64':
        if to_typ == 'f16':
            return '''#ifdef NSIMD_ARM_FP16
                        return vreinterpretq_{to_suf}_{from_suf}({in0});
                      #else
                        {using_f32}
                      #endif'''.format(using_f32=to_f16_with_f32, **fmtspec2)
        elif from_typ == 'f16':
            return '''#ifdef NSIMD_ARM_FP16
                        return vreinterpretq_{to_suf}_{from_suf}({in0});
                      #else
                        {using_f32}
                      #endif'''.format(using_f32=from_f16_with_f32, **fmtspec2)
        else:
            return 'return vreinterpretq_{to_suf}_{from_suf}({in0});'. \
                   format(**fmtspec2)

# -----------------------------------------------------------------------------
# reverse

# Emit C source reversing lane order: svrev on SVE; on NEON, vrev64q within
# 64-bit halves followed by swapping the halves with vcombine/vget.
def reverse1(simd_ext, typ):
    armtyp = suf(typ)
    if simd_ext in sve:
        return '''return svrev_{suf}( {in0} );'''.format(**fmtspec)
    elif simd_ext == 'neon128' and typ == 'f64':
        return '''nsimd_neon128_vf64 ret;
                  ret.v0 = {in0}.v1;
                  ret.v1 = {in0}.v0;
                  return ret;'''.format(**fmtspec)
    elif typ in ['i64', 'u64', 'f64']:
        return '''return vcombine_{armtyp}(vget_high_{armtyp}({in0}),
                                           vget_low_{armtyp}({in0}));'''. \
                                           format(armtyp=armtyp, **fmtspec)
    elif typ == 'f16':
        # NOTE(review): this string hard-codes `a0` instead of using the
        # {in0} placeholder like every other branch — it works only because
        # the first argument happens to be named a0. TODO confirm and use
        # {in0} for consistency.
        return '''nsimd_{simd_ext}_vf16 ret;
                  ret.v0 = nsimd_reverse_{simd_ext}_f32(a0.v1);
                  ret.v1 = nsimd_reverse_{simd_ext}_f32(a0.v0);
                  return ret;'''.format(**fmtspec)
    else:
        return '''{in0} = vrev64q_{armtyp}({in0});
                  return vcombine_{armtyp}(vget_high_{armtyp}({in0}),
                                           vget_low_{armtyp}({in0}));'''. \
                                           format(armtyp=armtyp, **fmtspec)

# -----------------------------------------------------------------------------
# Horizontal sum

# Emit C source summing all lanes into a scalar: pairwise vadd/vext folding
# on neon128, vaddvq on aarch64, svaddv on SVE.
def addv(simd_ext, typ):
    if simd_ext == 'neon128':
        if typ == 'f64':
            return 'return ({typ})({in0}.v0 + {in0}.v1);'.format(**fmtspec)
        elif typ == 'f16':
            # NOTE(review): in the FP16 branch the vext shifts use lane
            # constants 3 and 0; a 4-lane halving reduction would normally
            # use 2 then 1 (as the f32 paths below do). TODO confirm this
            # produces the correct sum on real FP16 hardware.
            return \
            '''#ifdef NSIMD_ARM_FP16
                 {t} tmp = vadd_{suf}(vget_low_{suf}({in0}),
                                      vget_high_{suf}({in0}));
                 tmp = vadd_{suf}(tmp, vext_{suf}(tmp, tmp, 3));
                 tmp = vadd_{suf}(tmp, vext_{suf}(tmp, tmp, 0));
                 return vget_lane_{suf}(tmp, 0);
               #else
                 float32x2_t tmp0 = vadd_f32(vget_low_f32({in0}.v0),
                                             vget_high_f32({in0}.v0));
                 tmp0 = vadd_f32(tmp0, vext_f32(tmp0, tmp0, 1));
                 float32x2_t tmp1 = vadd_f32(vget_low_f32({in0}.v1),
                                             vget_high_f32({in0}.v1));
                 tmp1 = vadd_f32(tmp1, vext_f32(tmp1, tmp1, 1));
                 return nsimd_f32_to_f16(vget_lane_f32(tmp0, 0) +
                                         vget_lane_f32(tmp1, 0));
               #endif''' .format(t=half_neon64_typ(typ), **fmtspec)
        elif typ == 'f32':
            return \
            '''{t} tmp = vadd_{suf}(vget_low_{suf}({in0}),
                                    vget_high_{suf}({in0}));
               tmp = vadd_{suf}(tmp, vext_{suf}(tmp, tmp, 1));
               return vget_lane_{suf}(tmp, 0);'''. \
               format(t=half_neon64_typ(typ), **fmtspec)
        elif typ[0] in ['i', 'u']:
            le = 128 // int(typ[1:]);
            return \
            '''{typ} res = ({typ})0;
               {typ} buf[{le}];
               vst1q_{suf}(buf, {in0});
               for (int i = 0; i < {le}; i++) {{
                 res += buf[i];
               }}
               return res;'''. \
               format(le=le, **fmtspec)
    elif simd_ext == 'aarch64':
        # NOTE(review): integer types on aarch64 fall through every branch
        # here and return None — presumably addv is only generated for
        # float types on this path; TODO confirm against the operator list.
        if typ == 'f16':
            return \
            '''#ifdef NSIMD_ARM_FP16
                 {t} tmp = vadd_{suf}(vget_low_{suf}({in0}),
                                      vget_high_{suf}({in0}));
                 tmp = vadd_{suf}(tmp, vext_{suf}(tmp, tmp, 3));
                 tmp = vadd_{suf}(tmp, vext_{suf}(tmp, tmp, 0));
                 return vget_lane_{suf}(tmp, 0);
               #else
                 float32x2_t tmp0 = vadd_f32(vget_low_f32({in0}.v0),
                                             vget_high_f32({in0}.v0));
                 tmp0 = vadd_f32(tmp0, vext_f32(tmp0, tmp0, 1));
                 float32x2_t tmp1 = vadd_f32(vget_low_f32({in0}.v1),
                                             vget_high_f32({in0}.v1));
                 tmp1 = vadd_f32(tmp1, vext_f32(tmp1, tmp1, 1));
                 return nsimd_f32_to_f16(vget_lane_f32(tmp0, 0) +
                                         vget_lane_f32(tmp1, 0));
               #endif''' .format(t=half_neon64_typ(typ), **fmtspec)
        elif typ in ['f32', 'f64']:
            return 'return vaddvq_{suf}({in0});'.format(**fmtspec)
    elif simd_ext in sve:
        return 'return svaddv_{suf}({svtrue}, {in0});' .format(**fmtspec)

# -----------------------------------------------------------------------------
# Up convert

# Emit C source widening one vector into a pair of vectors of the next-wider
# type (vmovl/vcvt on NEON, svunpklo/hi + svcvt on SVE).
def upcvt1(simd_ext, from_typ, to_typ):
    # For integer upcast, due to 2's complement representation
    # _s : signed -> bigger signed
    # _s : signed -> bigger unsigned
    # _u : unsigned -> bigger signed
    # _u : unsigned -> bigger unsigned
    if simd_ext in neon:
        if from_typ == 'f16' and to_typ == 'f32':
            return \
            '''#ifdef NSIMD_ARM_FP16
                 nsimd_{simd_ext}_vf32x2 ret;
                 ret.v0 = vcvt_f32_f16(vget_low_{suf}({in0}));
                 ret.v1 = vcvt_f32_f16(vget_high_{suf}({in0}));
                 return ret;
               #else
                 nsimd_{simd_ext}_vf32x2 ret;
                 ret.v0 = {in0}.v0;
                 ret.v1 = {in0}.v1;
                 return ret;
               #endif'''.format(**fmtspec)
        elif from_typ == 'f32' and to_typ == 'f64':
            if simd_ext == 'neon128':
                return \
                '''nsimd_neon128_vf64x2 ret;
                   f32 buf[4];
                   vst1q_f32(buf, {in0});
                   ret.v0.v0 = (f64)buf[0];
                   ret.v0.v1 = (f64)buf[1];
                   ret.v1.v0 = (f64)buf[2];
                   ret.v1.v1 = (f64)buf[3];
                   return ret;'''.format(**fmtspec)
            else:
                return \
                '''nsimd_aarch64_vf64x2 ret;
                   ret.v0 = vcvt_f64_f32(vget_low_{suf}({in0}));
                   ret.v1 = vcvt_f64_f32(vget_high_{suf}({in0}));
                   return ret;'''.format(**fmtspec)
        elif (from_typ in common.itypes and to_typ in common.itypes) or \
             (from_typ in common.utypes and to_typ in common.utypes):
            return '''nsimd_{simd_ext}_v{to_typ}x2 ret;
                      ret.v0 = vmovl_{suf}(vget_low_{suf}({in0}));
                      ret.v1 = vmovl_{suf}(vget_high_{suf}({in0}));
                      return ret;'''.format(**fmtspec)
        elif (from_typ in common.itypes and to_typ in common.utypes) or \
             (from_typ in common.utypes and to_typ in common.itypes):
            return '''nsimd_{simd_ext}_v{to_typ}x2 ret;
                      ret.v0 = vreinterpretq_{suf_to_typ}_{suf_int_typ}(
                                 vmovl_{suf}(vget_low_{suf}({in0})));
                      ret.v1 = vreinterpretq_{suf_to_typ}_{suf_int_typ}(
                                 vmovl_{suf}(vget_high_{suf}({in0})));
                      return ret;'''. \
                      format(suf_to_typ=suf(to_typ),
                             suf_int_typ=suf(from_typ[0] + to_typ[1:]),
                             **fmtspec)
        else:
            # int <-> float width change: widen in the integer domain then
            # convert
            return \
            '''nsimd_{simd_ext}_v{to_typ}x2 ret;
               nsimd_{simd_ext}_v{int_typ}x2 tmp;
               tmp = nsimd_upcvt_{simd_ext}_{int_typ}_{from_typ}({in0});
               ret.v0 = nsimd_cvt_{simd_ext}_{to_typ}_{int_typ}(tmp.v0);
               ret.v1 = nsimd_cvt_{simd_ext}_{to_typ}_{int_typ}(tmp.v1);
               return ret;'''. \
               format(int_typ=from_typ[0] + to_typ[1:], **fmtspec)

    # Getting here means that we deal with SVE
    if (from_typ in common.itypes and to_typ in common.itypes) or \
       (from_typ in common.utypes and to_typ in common.utypes):
        return '''nsimd_{simd_ext}_v{to_typ}x2 ret;
                  ret.v0 = svunpklo_{suf_to_typ}({in0});
                  ret.v1 = svunpkhi_{suf_to_typ}({in0});
                  return ret;'''.format(suf_to_typ=suf(to_typ), **fmtspec)
    elif (from_typ in common.itypes and to_typ in common.utypes) or \
         (from_typ in common.utypes and to_typ in common.itypes):
        return \
        '''nsimd_{simd_ext}_v{to_typ}x2 ret;
           ret.v0 = svreinterpret_{suf_to_typ}_{suf_int_typ}(
                      svunpklo_{suf_int_typ}({in0}));
           ret.v1 = svreinterpret_{suf_to_typ}_{suf_int_typ}(
                      svunpkhi_{suf_int_typ}({in0}));
           return ret;'''. \
           format(suf_to_typ=suf(to_typ),
                  suf_int_typ=suf(from_typ[0] + to_typ[1:]), **fmtspec)
    elif from_typ in common.iutypes and to_typ in common.ftypes:
        return \
        '''nsimd_{simd_ext}_v{to_typ}x2 ret;
           ret.v0 = svcvt_{suf_to_typ}_{suf_int_typ}_x(
                        {svtrue}, svunpklo_{suf_int_typ}({in0}));
           ret.v1 = svcvt_{suf_to_typ}_{suf_int_typ}_x(
                        {svtrue}, svunpkhi_{suf_int_typ}({in0}));
           return ret;'''. \
           format(suf_to_typ=suf(to_typ),
                  suf_int_typ=suf(from_typ[0] + to_typ[1:]), **fmtspec)
    else:
        return \
        '''nsimd_{simd_ext}_v{to_typ}x2 ret;
           ret.v0 = svcvt_{suf_to_typ}_{suf}_x({svtrue}, svzip1_{suf}(
                      {in0}, {in0}));
           ret.v1 = svcvt_{suf_to_typ}_{suf}_x({svtrue}, svzip2_{suf}(
                      {in0}, {in0}));
           return ret;'''.format(suf_to_typ=suf(to_typ), **fmtspec)

# -----------------------------------------------------------------------------
# Down convert

# Emit C source narrowing two vectors into one vector of the next-narrower
# type (vmovn/vcvt + vcombine on NEON, svuzp1 + svcvt on SVE).
def downcvt1(simd_ext, from_typ, to_typ):
    if simd_ext in neon:
        if from_typ == 'f64' and to_typ == 'f32':
            if simd_ext == 'neon128':
                return '''f32 buf[4];
                          buf[0] = (f32){in0}.v0;
                          buf[1] = (f32){in0}.v1;
                          buf[2] = (f32){in1}.v0;
                          buf[3] = (f32){in1}.v1;
                          return vld1q_f32(buf);'''.format(**fmtspec)
            else:
                return '''return vcombine_f32(vcvt_f32_f64({in0}),
                                              vcvt_f32_f64({in1}));'''. \
                                              format(**fmtspec)
        elif from_typ == 'f32' and to_typ == 'f16':
            return '''#ifdef NSIMD_ARM_FP16
                        return vcombine_f16(vcvt_f16_f32({in0}),
                                            vcvt_f16_f32({in1}));
                      #else
                        nsimd_{simd_ext}_vf16 ret;
                        ret.v0 = {in0};
                        ret.v1 = {in1};
                        return ret;
                      #endif'''.format(**fmtspec)
        elif (from_typ in common.itypes and to_typ in common.itypes) or \
             (from_typ in common.utypes and to_typ in common.utypes):
            return '''return vcombine_{suf_to_typ}(vmovn_{suf}({in0}),
                               vmovn_{suf}({in1}));'''. \
                               format(suf_to_typ=suf(to_typ), **fmtspec)
        # NOTE(review): this elif repeats the exact condition of the branch
        # above, so it is dead code (its string also lacks a matching
        # closing parenthesis and a source-type suffix on vreinterpretq).
        # It was presumably meant to handle the mixed-signedness cases
        # (itypes -> utypes and vice versa), which currently fall through
        # to the cvt-based else branch. TODO confirm.
        elif (from_typ in common.itypes and to_typ in common.itypes) or \
             (from_typ in common.utypes and to_typ in common.utypes):
            return '''return vreinterpretq_{suf_to_typ}(
                               vcombine_{suf_to_typ}(vmovn_{suf}({in0}),
                                 vmovn_{suf}({in1}));'''. \
                                 format(suf_to_typ=suf(to_typ), **fmtspec)
        else:
            return \
            '''return nsimd_downcvt_{simd_ext}_{to_typ}_{int_typ}(
                        nsimd_cvt_{simd_ext}_{int_typ}_{from_typ}({in0}),
                        nsimd_cvt_{simd_ext}_{int_typ}_{from_typ}({in1}));'''.\
                        format(int_typ=to_typ[0] + from_typ[1:], **fmtspec)

    # Getting here means that we deal with SVE
    if from_typ in common.iutypes and to_typ in common.iutypes:
        return '''return svuzp1_{suf_to_typ}(
                           svreinterpret_{suf_to_typ}_{suf}({in0}),
                           svreinterpret_{suf_to_typ}_{suf}({in1}));'''. \
                           format(suf_to_typ=suf(to_typ), **fmtspec)
    elif from_typ in common.ftypes and to_typ in common.iutypes:
        return \
        '''return svuzp1_{suf_to_typ}(svreinterpret_{suf_to_typ}_{suf_int_typ}(
                    svcvt_{suf_int_typ}_{suf}_x({svtrue}, {in0})),
                    svreinterpret_{suf_to_typ}_{suf_int_typ}(
                      svcvt_{suf_int_typ}_{suf}_x({svtrue}, {in1})));'''. \
                      format(suf_to_typ=suf(to_typ),
                             suf_int_typ=suf(to_typ[0] + from_typ[1:]),
                             **fmtspec)
    else:
        return \
        '''return svuzp1_{suf_to_typ}(svcvt_{suf_to_typ}_{suf}_x(
                    {svtrue}, {in0}), svcvt_{suf_to_typ}_{suf}_x(
                      {svtrue}, {in1}));'''. \
                      format(suf_to_typ=suf(to_typ), **fmtspec)

# -----------------------------------------------------------------------------
# adds

# Emit C source for saturated addition: plain add for floats, vqaddq/svqadd
# for integers.
def adds(simd_ext, from_typ):
    if from_typ in common.ftypes:
        return 'return nsimd_add_{simd_ext}_{from_typ}({in0}, {in1});'. \
               format(**fmtspec)
    if simd_ext in neon:
        return 'return vqaddq_{suf}({in0}, {in1});'.format(**fmtspec)
    else:
        return 'return svqadd_{suf}({in0}, {in1});'.format(**fmtspec)

# -----------------------------------------------------------------------------
# subs

# Emit C source for saturated subtraction: plain sub for floats,
# vqsubq/svqsub for integers.
def subs(simd_ext, from_typ):
    if from_typ in common.ftypes:
        return 'return nsimd_sub_{simd_ext}_{from_typ}({in0}, {in1});'. \
               format(**fmtspec)
    elif simd_ext in neon:
        return 'return vqsubq_{suf}({in0}, {in1});'.format(**fmtspec)
    else:
        return 'return svqsub_{suf}({in0}, {in1});'.format(**fmtspec)

# -----------------------------------------------------------------------------
# to_mask

# Emit C source converting a logical vector into a value vector of all-ones /
# all-zeros lanes (reinterpret on NEON, svsel on SVE predicates).
def to_mask1(opts, simd_ext, typ):
    if typ in common.itypes + common.ftypes:
        normal = 'return vreinterpretq_{suf}_u{typnbits}({in0});'. \
                 format(**fmtspec)
    else:
        normal = 'return {in0};'.format(**fmtspec)
    emulate_f16 = '''nsimd_{simd_ext}_vf16 ret;
                     ret.v0 = nsimd_to_mask_{simd_ext}_f32({in0}.v0);
                     ret.v1 = nsimd_to_mask_{simd_ext}_f32({in0}.v1);
                     return ret;'''.format(**fmtspec)
    if simd_ext == 'neon128' and typ == 'f16':
        return emulate_f16
    elif simd_ext == 'neon128' and typ == 'f64':
        return '''nsimd_neon128_vf64 ret;
                  ret.v0 = nsimd_scalar_reinterpret_f64_u64({in0}.v0);
                  ret.v1 = nsimd_scalar_reinterpret_f64_u64({in0}.v1);
                  return ret;'''.format(**fmtspec)
    elif simd_ext == 'aarch64' and typ == 'f16':
        return '''#ifdef NSIMD_ARM_FP16
                    {normal}
                  #else
                    {emulate_f16}
                  #endif'''.format(normal=normal, emulate_f16=emulate_f16)
    elif simd_ext in sve:
        if opts.sve_emulate_bool:
            return 'return svreinterpret_{suf}_u{typnbits}({in0});'. \
                   format(**fmtspec)
        else:
            utyp = 'u{}'.format(fmtspec['typnbits'])
            return '''return svreinterpret_{suf}_{utyp}(svsel_{utyp}(
                               {in0}, svdup_n_{utyp}(({utyp})-1),
                               svdup_n_{utyp}(({utyp})0)));'''. \
                               format(utyp=utyp, **fmtspec)
    else:
        return normal

# -----------------------------------------------------------------------------
# iota

# Emit C source filling a vector with 0, 1, 2, …: svindex on SVE, a constant
# buffer load on NEON.
def iota(simd_ext, typ):
    if simd_ext in sve:
        if typ in common.iutypes:
            return 'return svindex_{suf}(0, 1);'.format(**fmtspec)
        else:
            return \
            '''return svcvt_{suf}_s{typnbits}_x({svtrue},
                        svindex_s{typnbits}(0, 1));'''.format(**fmtspec)
    if typ == 'f64' and simd_ext == 'neon128':
        return '''nsimd_neon128_vf64 ret;
                  ret.v0 = 0.0;
                  ret.v1 = 1.0;
                  return ret;'''.format(**fmtspec)
    typ2 = 'f32' if typ == 'f16' else typ
    le = 128 // int(typ[1:])
    iota = ', '.join(['({typ2}){i}'.format(typ2=typ2, i=i) \
                      for i in range(le)])
    normal = '''{typ} buf[{le}] = {{ {iota} }};
                return vld1q_{suf}(buf);'''. \
                format(le=le, iota=iota, **fmtspec)
    if typ == 'f16':
        return '''#ifdef NSIMD_ARM_FP16
                    {normal}
                  #else
                    f32 buf[8] = {{ {iota} }};
                    nsimd_{simd_ext}_vf16 ret;
                    ret.v0 = vld1q_f32(buf);
                    ret.v1 = vld1q_f32(buf + 4);
                    return ret;
                  #endif'''.format(iota=iota, normal=normal, **fmtspec)
    return normal

# -----------------------------------------------------------------------------
# mask_for_loop_tail

# Emit C source building a loop-tail mask: all-false when the range is empty,
# iota < remaining when a partial vector is left, all-true otherwise.
def mask_for_loop_tail(simd_ext, typ):
    if typ == 'f16':
        threshold = 'nsimd_f32_to_f16((f32)({in1} - {in0}))'.format(**fmtspec)
    else:
        threshold = '({typ})({in1} - {in0})'.format(**fmtspec)
    if simd_ext == 'sve':
        le = 'nsimd_len_sve_{typ}()'.format(**fmtspec)
    elif simd_ext in fixed_sized_sve:
        le = int(simd_ext[3:]) // int(typ[1:])
    else:
        le = 128 // int(typ[1:])
    return '''if ({in0} >= {in1}) {{
                return nsimd_set1l_{simd_ext}_{typ}(0);
              }}
              if ({in1} - {in0} < {le}) {{
                nsimd_{simd_ext}_v{typ} n =
                      nsimd_set1_{simd_ext}_{typ}({threshold});
                return nsimd_lt_{simd_ext}_{typ}(
                           nsimd_iota_{simd_ext}_{typ}(), n);
              }} else {{
                return nsimd_set1l_{simd_ext}_{typ}(1);
              }}'''.format(le=le, threshold=threshold, **fmtspec)

# -----------------------------------------------------------------------------
# to_logical

def to_logical1(opts, simd_ext, typ):
    if typ in common.iutypes:
        return '''return 
                      nsimd_ne_{simd_ext}_{typ}({in0},
                          nsimd_set1_{simd_ext}_{typ}(({typ})0));'''. \
               format(**fmtspec)
    # Floats: compare the bit pattern against 0 through the unsigned type.
    normal_fp = \
    '''return nsimd_reinterpretl_{simd_ext}_{suf}_{utyp}(
                  nsimd_ne_{simd_ext}_{utyp}(
                      nsimd_reinterpret_{simd_ext}_{utyp}_{typ}(
                          {in0}), nsimd_set1_{simd_ext}_{utyp}(({utyp})0)));'''. \
    format(utyp='u{}'.format(fmtspec['typnbits']), **fmtspec)
    if typ in ['f32', 'f64'] or (typ == 'f16' and simd_ext in sve):
        return normal_fp
    # f16 without native support: recurse on the two emulated f32 halves.
    emulate_fp16 = \
    '''nsimd_{simd_ext}_vlf16 ret;
       ret.v0 = nsimd_to_logical_{simd_ext}_f32({in0}.v0);
       ret.v1 = nsimd_to_logical_{simd_ext}_f32({in0}.v1);
       return ret;'''.format(**fmtspec)
    if simd_ext == 'aarch64':
        return '''#ifdef NSIMD_ARM_FP16
                    {normal_fp}
                  #else
                    {emulate_fp16}
                  #endif'''.format(normal_fp=normal_fp,
                                   emulate_fp16=emulate_fp16)
    elif simd_ext == 'neon128':
        return emulate_fp16

# -----------------------------------------------------------------------------
# unpack functions

def zip_unzip_half(func, simd_ext, typ):
    # func is one of 'zip1', 'zip2', 'uzp1', 'uzp2'; returns the C body of
    # the corresponding half zip/unzip.
    if simd_ext == 'aarch64' or simd_ext in sve:
        if typ == 'f16' and simd_ext == 'aarch64':
            # Without native FP16 the f16 vector is two f32 vectors (.v0/.v1).
            if func in ['zip1', 'zip2']:
                return '''\
#ifdef NSIMD_ARM_FP16
  return {s}v{op}{q}_{suf}({in0}, {in1});
#else
  nsimd_{simd_ext}_v{typ} ret;
  ret.v0 = {s}vzip1{q}_f32({in0}.v{i}, {in1}.v{i});
  ret.v1 = {s}vzip2{q}_f32({in0}.v{i}, {in1}.v{i});
  return ret;
#endif
'''.format(op=func, i='0' if func in ['zip1', 'uzp1'] else '1',
           s='s' if simd_ext in sve else '',
           q='' if simd_ext in sve else 'q', **fmtspec)
            else:
                return '''\
#ifdef NSIMD_ARM_FP16
  return {s}v{op}{q}_{suf}({in0}, {in1});
#else
  nsimd_{simd_ext}_v{typ} ret;
  ret.v0 = {s}v{func}{q}_f32({in0}.v0, {in0}.v1);
  ret.v1 = {s}v{func}{q}_f32({in1}.v0, {in1}.v1);
  return ret;
#endif'''.format(op=func, func=func,
                 s='s' if simd_ext in sve else '',
                 q='' if simd_ext in sve else 'q', **fmtspec)
        else:
            # aarch64/SVE expose zip1/zip2/uzp1/uzp2 intrinsics directly.
            return 'return {s}v{op}{q}_{suf}({in0}, {in1});'. \
                   format(op=func, s='s' if simd_ext in sve else '',
                          q='' if simd_ext in sve else 'q', **fmtspec)
    elif simd_ext == 'neon128':
        # ARMv7 only has vzipq/vuzpq which return both halves at once.
        armop = {'zip1': 'zipq', 'zip2': 'zipq', 'uzp1': 'uzpq',
                 'uzp2': 'uzpq'}
        prefix = {'i': 'int', 'u': 'uint', 'f': 'float'}
        neon_typ = '{}{}x{}x2_t'. \
                   format(prefix[typ[0]], typ[1:], 128 // int(typ[1:]))
        if typ == 'f16':
            if func in ['zip1', 'zip2']:
                return '''\
nsimd_{simd_ext}_v{typ} ret;
float32x4x2_t tmp = v{op}_f32({in0}.v{i}, {in1}.v{i});
ret.v0 = tmp.val[0];
ret.v1 = tmp.val[1];
return ret;
'''.format(i='0' if func == 'zip1' else '1', op=armop[func], **fmtspec)
            else:
                return '''\
nsimd_{simd_ext}_v{typ} ret;
float32x4x2_t tmp0 = vuzpq_f32({in0}.v0, {in0}.v1);
float32x4x2_t tmp1 = vuzpq_f32({in1}.v0, {in1}.v1);
ret.v0 = tmp0.val[{i}];
ret.v1 = tmp1.val[{i}];
return ret;
'''.format(i='0' if func == 'uzp1' else '1', **fmtspec)
        elif typ in ['i64', 'u64']:
            # No vzipq/vuzpq for 64-bit lanes: go through memory.
            return '''\
{typ} buf0[2], buf1[2];
{typ} ret[2];
vst1q_{suf}(buf0, {in0});
vst1q_{suf}(buf1, {in1});
ret[0] = buf0[{i}];
ret[1] = buf1[{i}];
return vld1q_{suf}(ret);'''. \
            format(**fmtspec, i='0' if func in ['zip1', 'uzp1'] else '1')
        elif typ == 'f64':
            # Emulated f64: just pick the right scalar lane of each input.
            return '''\
nsimd_{simd_ext}_v{typ} ret;
ret.v0 = {in0}.v{i};
ret.v1 = {in1}.v{i};
return ret;'''. \
            format(**fmtspec, i='0' if func in ['zip1', 'uzp1'] else '1')
        else:
            return '''\
{neon_typ} res;
res = v{op}_{suf}({in0}, {in1});
return res.val[{i}];'''. \
            format(neon_typ=neon_typ, op=armop[func], **fmtspec,
                   i='0' if func in ['zip1', 'uzp1'] else '1')

def zip_unzip(func, simd_ext, typ):
    # func is 'zip' or 'uzp'; returns both halves as a x2 structure.
    # Fallback built from the lo/hi operators above.
    lo_hi = '''\
nsimd_{simd_ext}_v{typ}x2 ret;
ret.v0 = nsimd_{func}lo_{simd_ext}_{typ}({in0}, {in1});
ret.v1 = nsimd_{func}hi_{simd_ext}_{typ}({in0}, {in1});
return ret;
'''.format(func='zip' if func == 'zip' else 'unzip', **fmtspec)
    if simd_ext == 'aarch64' or simd_ext in sve:
        content = '''\
nsimd_{simd_ext}_v{typ}x2 ret;
ret.v0 = {s}v{func}1{q}_{suf}({in0}, {in1});
ret.v1 = {s}v{func}2{q}_{suf}({in0}, {in1});
return ret;'''.format(s='s' if simd_ext in sve else '',
                      q='' if simd_ext in sve else 'q',
                      func=func, **fmtspec)
        if typ == 'f16':
            return '''\
#ifdef NSIMD_ARM_FP16
{c}
#else
{default}
#endif'''. \
            format(c=content, default=lo_hi,
                   s='s' if simd_ext in sve else '', **fmtspec)
        else:
            return content
    else:
        prefix = {'i': 'int', 'u': 'uint', 'f': 'float'}
        neon_typ = '{}{}x{}x2_t'. \
                   format(prefix[typ[0]], typ[1:], 128 // int(typ[1:]))
        content = '''\
nsimd_{simd_ext}_v{typ}x2 ret;
{neon_typ} tmp = v{func}q_{suf}({in0}, {in1});
ret.v0 = tmp.val[0];
ret.v1 = tmp.val[1];
return ret;''' \
        .format(func=func, neon_typ=neon_typ, **fmtspec)
        if typ in ['u64', 'i64', 'f64']:
            return lo_hi
        elif typ == 'f16':
            return '''\
#ifdef NSIMD_ARM_FP16
{content}
#else
{default}
#endif'''. \
            format(content=content, default=lo_hi,
                   f='zip' if func == 'zip' else 'unzip', **fmtspec)
        else:
            return content

# -----------------------------------------------------------------------------
# gather

def gather(simd_ext, typ):
    # C body loading lanes from {in0}[{in1}[k]] (indices are signed ints of
    # the same width as typ).
    le = max_len(simd_ext, typ)
    real_le = real_len(simd_ext, typ)
    if simd_ext in sve:
        # Scalar emulation through buffers; used for types with no native
        # gather (8/16-bit) and for f16.
        emul = '''int i;
                  {typ} buf[{le}];
                  i{typnbits} offset_buf[{le}];
                  svst1_s{typnbits}({svtrue}, offset_buf, {in1});
                  for (i = 0; i < {real_le}; i++) {{
                    buf[i] = {in0}[offset_buf[i]];
                  }}
                  return svld1_{suf}({svtrue}, buf);'''. \
                  format(le=le, real_le=real_le, **fmtspec)
    else:
        # NEON: lane-by-lane vgetq_lane/vsetq_lane emulation.
        emul = \
        '''nsimd_{simd_ext}_v{typ} ret;
           ret = vdupq_n_{suf}({in0}[vgetq_lane_s{typnbits}({in1}, 0)]);'''. \
        format(**fmtspec) + ''.join([
        '''ret = vsetq_lane_{suf}({in0}[
                     vgetq_lane_s{typnbits}({in1}, {i})], ret, {i});\n'''. \
        format(i=i, **fmtspec) for i in range(1, le)]) + \
        'return ret;'
    if typ == 'f16':
        if simd_ext in sve:
            return emul
        # Without native FP16 gather through an f32 buffer, 4 lanes per half.
        return '''#ifdef NSIMD_ARM_FP16
                  {emul}
                  #else
                  nsimd_{simd_ext}_vf16 ret;
                  f32 buf[8];
                  '''.format(emul=emul, **fmtspec) + \
               ''.join(['buf[{i}] = nsimd_f16_to_f32({in0}[' \
                        'vgetq_lane_s16({in1}, {i})]);\n'. \
                        format(i=i, **fmtspec) for i in range(4)]) + \
               ''.join(['buf[4 + {i}] = nsimd_f16_to_f32({in0}[' \
                        'vgetq_lane_s16({in1}, 4 + {i})]);\n'. \
                        format(i=i, **fmtspec) for i in range(4)]) + \
               '''
                  ret.v0 = vld1q_f32(buf);
                  ret.v1 = vld1q_f32(buf + 4);
                  return ret;
                  #endif'''.format(**fmtspec)
    if simd_ext == 'neon128' and typ == 'f64':
        return '''nsimd_neon128_vf64 ret;
                  i64 offset_buf[2];
                  vst1q_s64(offset_buf, {in1});
                  ret.v0 = {in0}[offset_buf[0]];
                  ret.v1 = {in0}[offset_buf[1]];
                  return ret;'''.format(**fmtspec)
    if simd_ext in neon or typ in ['i8', 'u8', 'i16', 'u16']:
        return emul
    # getting here means SVE
    return 'return svld1_gather_s{typnbits}index_{suf}({svtrue}, {in0}, ' \
           '{in1});'.format(**fmtspec)

# -----------------------------------------------------------------------------
# linear gather

def gather_linear(simd_ext, typ):
    # C body loading lanes from {in0}[k * {in1}] (constant stride).
    if simd_ext in sve:
        if typ in ['i8', 'u8', 'i16', 'u16', 'f16']:
            # No native strided load for these: buffer emulation.
            le = max_len(simd_ext, typ)
            real_le = real_len(simd_ext, typ)
            return '''{typ} buf[{le}];
                      int i;
                      for (i = 0; i < {real_le}; i++) {{
                        buf[i] = {in0}[i * {in1}];
                      }}
                      return svld1_{suf}({svtrue}, buf);'''. \
                   format(le=le, real_le=real_le, **fmtspec)
        else:
            # Strided load expressed as a gather with an svindex vector.
            return 'return svld1_gather_s{typnbits}index_{suf}({svtrue}, ' \
                   '{in0}, svindex_s{typnbits}(0, (i{typnbits}){in1}));'. \
                   format(**fmtspec)
    # getting here means neon128 and aarch64
    intrinsic = '''nsimd_{simd_ext}_v{typ} ret;
                   ret = vdupq_n_{suf}({in0}[0]);
                   '''.format(**fmtspec) + ''.join([
                'ret = vsetq_lane_{suf}({in0}[{i} * {in1}], ret, {i});\n'. \
                format(i=i, **fmtspec) \
                for i in range(1, 128 // int(fmtspec['typnbits']))]) + \
                '''return ret;'''
    if typ == 'f16':
        return '''#ifdef NSIMD_ARM_FP16
                  {intrinsic}
                  #else
                  nsimd_{simd_ext}_vf16 ret;
                  f32 buf[8];
                  int i;
                  for (i = 0; i < 8; i++) {{
                    buf[i] = nsimd_f16_to_f32({in0}[i * {in1}]);
                  }}
                  ret.v0 = vld1q_f32(buf);
                  ret.v1 = vld1q_f32(buf + 4);
                  return ret;
                  #endif'''.format(intrinsic=intrinsic, **fmtspec)
    if typ == 'f64' and simd_ext == 'neon128':
        return '''nsimd_neon128_vf64 ret;
                  ret.v0 = {in0}[0];
                  ret.v1 = {in0}[{in1}];
                  return ret;'''.format(**fmtspec)
    return intrinsic

# -----------------------------------------------------------------------------
# masked gather

def maskoz_gather(oz, simd_ext, typ):
    # oz == 'z': inactive lanes are zero; oz == 'o': they come from {in3}.
    le = max_len(simd_ext, typ)
    real_le = real_len(simd_ext, typ)
    if simd_ext in sve:
        utyp = 'u{typnbits}'.format(**fmtspec)
        # Spill the index vector, the mask (as 0/-1 integers) and the
        # default values, then gather scalar-wise below.
        store = '''svst1_s{typnbits}({svtrue}, offset_buf, {in2});
                   svst1_{utyp}({svtrue}, mask, svsel_{utyp}(
                       {in0}, svdup_n_{utyp}(({utyp})-1), svdup_n_{utyp}(
                       ({utyp})0)));
                   '''.format(utyp=utyp, **fmtspec)
        if oz == 'z':
            store += 'svst1_{suf}({svtrue}, buf, svdup_n_{suf}(({typ})0));'. \
                     format(**fmtspec)
        else:
            store += 'svst1_{suf}({svtrue}, buf, {in3});'.format(**fmtspec)
        load = 'svld1_{suf}({svtrue}, buf)'.format(**fmtspec)
    else:
        store = '''vst1q_s{typnbits}(offset_buf, {in2});
                   vst1q_u{typnbits}(mask, {in0});'''.format(**fmtspec)
        if oz == 'z':
            store += 'vst1q_{suf}(buf, vdupq_n_{suf}(({typ})0));'. \
                     format(**fmtspec)
        else:
            store += 'vst1q_{suf}(buf, {in3});'.format(**fmtspec)
        load = 'vld1q_{suf}(buf)'.format(**fmtspec)
    emul = '''int i;
              {typ} buf[{le}];
              u{typnbits} mask[{le}];
              i{typnbits} offset_buf[{le}];
              {store}
              for (i = 0; i < {real_le}; i++) {{
                if (mask[i]) {{
                  buf[i] = {in1}[offset_buf[i]];
                }}
              }}
              return {load};'''. \
           format(le=le, real_le=real_le, store=store, load=load, **fmtspec)
    if typ == 'f16':
        if simd_ext in sve:
            return emul
        if oz == 'z':
            oz0 = 'vdupq_n_f32(0.0f)'
            oz1 = oz0
        else:
            oz0 = '{in3}.v0'.format(**fmtspec)
            oz1 = '{in3}.v1'.format(**fmtspec)
        return '''#ifdef NSIMD_ARM_FP16
                  {emul}
                  #else
                  nsimd_{simd_ext}_vf16 ret;
                  int i;
                  f32 buf[{le}];
                  u32 mask[{le}];
                  i16 offset_buf[{le}];
                  vst1q_s16(offset_buf, {in2});
                  vst1q_f32(buf, {oz0});
                  vst1q_f32(buf + {leo2}, {oz1});
                  vst1q_u32(mask, {in0}.v0);
                  vst1q_u32(mask + {leo2}, {in0}.v1);
                  for (i = 0; i < {le}; i++) {{
                    if (mask[i]) {{
                      buf[i] = nsimd_f16_to_f32({in1}[offset_buf[i]]);
                    }}
                  }}
                  ret.v0 = vld1q_f32(buf);
                  ret.v1 = vld1q_f32(buf + {leo2});
                  return ret;
                  #endif'''.format(emul=emul, leo2=le // 2, le=le, oz0=oz0,
                                   oz1=oz1, **fmtspec)
    if simd_ext == 'neon128' and typ == 'f64':
        oz0 = '0.0' if oz == 'z' else '{in3}.v0'.format(**fmtspec)
        oz1 = '0.0' if oz == 'z' else '{in3}.v1'.format(**fmtspec)
        return '''nsimd_neon128_vf64 ret;
                  i64 offset_buf[2];
                  vst1q_s64(offset_buf, {in2});
                  if ({in0}.v0) {{
                    ret.v0 = {in1}[offset_buf[0]];
                  }} else {{
                    ret.v0 = {oz0};
                  }}
                  if ({in0}.v1) {{
                    ret.v1 = {in1}[offset_buf[1]];
                  }} else {{
                    ret.v1 = {oz1};
                  }}
                  return ret;'''.format(oz0=oz0, oz1=oz1, **fmtspec)
    if simd_ext in neon or typ in ['i8', 'u8', 'i16', 'u16']:
        return emul
    # getting here means SVE
    oz0 = 'svdup_n_{suf}(({typ})0)'.format(**fmtspec) if oz == 'z' \
          else '{in3}'.format(**fmtspec)
    return '''return svsel_{suf}({in0}, svld1_gather_s{typnbits}index_{suf}(
                  {in0}, {in1}, {in2}), {oz0});'''. \
           format(oz0=oz0, **fmtspec)

# -----------------------------------------------------------------------------
# scatter

def scatter(simd_ext, typ):
    # C body storing lane k of {in2} to {in0}[{in1}[k]].
    le = max_len(simd_ext, typ)
    real_le = real_len(simd_ext, typ)
    if simd_ext in sve:
        emul = '''int i;
                  {typ} buf[{le}];
                  i{typnbits} offset_buf[{le}];
                  svst1_s{typnbits}({svtrue}, offset_buf, {in1});
                  svst1_{suf}({svtrue}, buf, {in2});
                  for (i = 0; i < {real_le}; i++) {{
                    {in0}[offset_buf[i]] = buf[i];
                  }}'''.format(le=le, real_le=real_le, **fmtspec)
    else:
        emul = '\n'.join(['{in0}[vgetq_lane_s{typnbits}({in1}, {i})] = ' \
                          'vgetq_lane_{suf}({in2}, {i});\n'. \
                          format(i=i, **fmtspec) for i in range(int(le))])
    if typ == 'f16':
        if simd_ext in sve:
            return emul
        return '''#ifdef NSIMD_ARM_FP16
                  {emul}
                  #else
                  '''.format(emul=emul) + \
               '\n'.join(['{in0}[vgetq_lane_s16({in1}, {i})] = ' \
                          'nsimd_f32_to_f16(vgetq_lane_f32({in2}.v0, '
                          '{i}));\n'.format(i=i, **fmtspec) \
                          for i in range(4)]) + \
               '\n'.join(['{in0}[vgetq_lane_s16({in1}, 4 + {i})] = ' \
                          'nsimd_f32_to_f16(vgetq_lane_f32({in2}.v1, '
                          '{i}));\n'.format(i=i, **fmtspec) \
                          for i in range(4)]) + \
               '''
                  #endif'''
    if simd_ext == 'neon128' and typ == 'f64':
        return '''i64 offset_buf[2];
                  vst1q_s64(offset_buf, {in1});
                  {in0}[offset_buf[0]] = {in2}.v0;
                  {in0}[offset_buf[1]] = {in2}.v1;'''.format(**fmtspec)
    if simd_ext in neon or typ in ['i8', 'u8', 'i16', 'u16']:
        return emul
    # getting here means SVE
    return 'svst1_scatter_s{typnbits}index_{suf}({svtrue}, {in0}, ' \
           '{in1}, {in2});'.format(le=le, **fmtspec)

# -----------------------------------------------------------------------------
# linear scatter

def scatter_linear(simd_ext, typ):
    # C body storing lane k of {in2} to {in0}[k * {in1}] (constant stride).
    if simd_ext in sve:
        if typ in ['i8', 'u8', 'i16', 'u16', 'f16']:
            le = max_len(simd_ext, typ)
            real_le = real_len(simd_ext, typ)
            return '''{typ} buf[{le}];
                      int i;
                      svst1_{suf}({svtrue}, buf, {in2});
                      for (i = 0; i < {real_le}; i++) {{
                        {in0}[i * {in1}] = buf[i];
                      }}'''.format(le=le, real_le=real_le, **fmtspec)
        else:
            return 'svst1_scatter_s{typnbits}index_{suf}({svtrue}, {in0}, 
' \
                   'svindex_s{typnbits}(0, (i{typnbits}){in1}), {in2});'. \
                   format(**fmtspec)
    # getting here means neon128 and aarch64
    intrinsic = '\n'.join([
        '{in0}[{i} * {in1}] = vgetq_lane_{suf}({in2}, {i});'. \
        format(i=i, **fmtspec)
        for i in range(128 // int(fmtspec['typnbits']))])
    if typ == 'f16':
        return '''#ifdef NSIMD_ARM_FP16
                  {intrinsic}
                  #else
                  f32 buf[8];
                  int i;
                  vst1q_f32(buf, {in2}.v0);
                  vst1q_f32(buf + 4, {in2}.v1);
                  for (i = 0; i < 8; i++) {{
                    {in0}[i * {in1}] = nsimd_f32_to_f16(buf[i]);
                  }}
                  #endif'''.format(intrinsic=intrinsic, **fmtspec)
    if typ == 'f64' and simd_ext == 'neon128':
        return '''{in0}[0] = {in2}.v0;
                  {in0}[{in1}] = {in2}.v1;'''.format(**fmtspec)
    return intrinsic

# -----------------------------------------------------------------------------
# mask_scatter

def mask_scatter(simd_ext, typ):
    # C body storing lane k of {in3} to {in1}[{in2}[k]] when mask {in0} is
    # set for that lane.
    le = max_len(simd_ext, typ)
    real_le = real_len(simd_ext, typ)
    if simd_ext in sve:
        # Spill indices, mask (as 0/1 integers) and values, then store
        # scalar-wise below.
        store = '''svst1_s{typnbits}({svtrue}, offset_buf, {in2});
                   svst1_u{typnbits}({svtrue}, mask, svsel_u{typnbits}(
                       {in0}, svdup_n_u{typnbits}((u{typnbits})1),
                       svdup_n_u{typnbits}((u{typnbits})0)));
                   svst1_{suf}({svtrue}, buf, {in3});'''.format(**fmtspec)
    else:
        store = '''vst1q_s{typnbits}(offset_buf, {in2});
                   vst1q_{suf}(buf, {in3});
                   vst1q_u{typnbits}(mask, {in0});'''.format(**fmtspec)
    emul = '''int i;
              {typ} buf[{le}];
              u{typnbits} mask[{le}];
              i{typnbits} offset_buf[{le}];
              {store}
              for (i = 0; i < {real_le}; i++) {{
                if (mask[i]) {{
                  {in1}[offset_buf[i]] = buf[i];
                }}
              }}'''.format(le=le, real_le=real_le, store=store, **fmtspec)
    if typ == 'f16':
        if simd_ext in sve:
            return emul
        return '''#ifdef NSIMD_ARM_FP16
                  {emul}
                  #else
                  int i;
                  f32 buf[{le}];
                  u32 mask[{le}];
                  i16 offset_buf[{le}];
                  vst1q_s16(offset_buf, {in2});
                  vst1q_f32(buf, {in3}.v0);
                  vst1q_f32(buf + {leo2}, {in3}.v1);
                  vst1q_u32(mask, {in0}.v0);
                  vst1q_u32(mask + {leo2}, {in0}.v1);
                  for (i = 0; i < {le}; i++) {{
                    if (mask[i]) {{
                      {in1}[offset_buf[i]] = nsimd_f32_to_f16(buf[i]);
                    }}
                  }}
                  #endif'''.format(emul=emul, le=le, leo2=le // 2, **fmtspec)
    if simd_ext == 'neon128' and typ == 'f64':
        return '''i64 offset_buf[2];
                  vst1q_s64(offset_buf, {in2});
                  if ({in0}.v0) {{
                    {in1}[offset_buf[0]] = {in3}.v0;
                  }}
                  if ({in0}.v1) {{
                    {in1}[offset_buf[1]] = {in3}.v1;
                  }}'''.format(**fmtspec)
    if simd_ext in neon or typ in ['i8', 'u8', 'i16', 'u16']:
        return emul
    # getting here means SVE
    return 'svst1_scatter_s{typnbits}index_{suf}({in0}, {in1}, ' \
           '{in2}, {in3});'.format(le=le, **fmtspec)

# -----------------------------------------------------------------------------
# get_impl function

def get_impl(opts, func, simd_ext, from_typ, to_typ):
    # Entry point used by hatch.py: dispatch operator name -> generator.
    # fmtspec is a module-level dict refreshed on every call.
    global fmtspec

    # Fixed-size SVE extensions (sve128, ...) share the generic 'sve'
    # generators.
    simd_ext2 = simd_ext if not simd_ext in fixed_sized_sve else 'sve'

    fmtspec = {
        'simd_ext': simd_ext,
        'simd_ext2': simd_ext2,
        'typ': from_typ,
        'from_typ': from_typ,
        'to_typ': to_typ,
        'suf': suf(from_typ),
        'in0': common.in0,
        'in1': common.in1,
        'in2': common.in2,
        'in3': common.in3,
        'in4': common.in4,
        'in5': common.in5,
        'typnbits': from_typ[1:],
        'svtrue': 'svptrue_b{}()'.format(from_typ[1:]),
        'svetyp': sve_typ(from_typ),
    }

    impls = {
        'loada': lambda: load1234(opts, simd_ext, from_typ, 1),
        'masko_loada1': lambda: maskoz_load('o', simd_ext, from_typ),
        'maskz_loada1': lambda: maskoz_load('z', simd_ext, from_typ),
        'load2a': lambda: load1234(opts, simd_ext, from_typ, 2),
        'load3a': lambda: load1234(opts, simd_ext, from_typ, 3),
        'load4a': lambda: load1234(opts, simd_ext, from_typ, 4),
        'loadu': lambda: load1234(opts, simd_ext, from_typ, 1),
        'masko_loadu1': lambda: maskoz_load('o', simd_ext, from_typ),
        'maskz_loadu1': lambda: maskoz_load('z', simd_ext, from_typ),
        'load2u': lambda: load1234(opts, simd_ext, from_typ, 2),
        'load3u': lambda: load1234(opts, simd_ext, from_typ, 3),
        'load4u': lambda: load1234(opts, simd_ext, from_typ, 4),
        'storea': lambda: store1234(opts, simd_ext, from_typ, 1),
        'mask_storea1': lambda: mask_store(simd_ext, from_typ),
        'store2a': lambda: store1234(opts, simd_ext, from_typ, 2),
        'store3a': lambda: store1234(opts, simd_ext, from_typ, 3),
        'store4a': lambda: store1234(opts, simd_ext, from_typ, 4),
        'storeu': lambda: store1234(opts, simd_ext, from_typ, 1),
        'mask_storeu1': lambda: mask_store(simd_ext, from_typ),
        'store2u': lambda: store1234(opts, simd_ext, from_typ, 2),
        'store3u': lambda: store1234(opts, simd_ext, from_typ, 3),
        'store4u': lambda: store1234(opts, simd_ext, from_typ, 4),
        'gather': lambda: gather(simd_ext, from_typ),
        'gather_linear': lambda: gather_linear(simd_ext, from_typ),
        'maskz_gather': lambda: maskoz_gather('z', simd_ext, from_typ),
        'masko_gather': lambda: maskoz_gather('o', simd_ext, from_typ),
        'scatter': lambda: scatter(simd_ext, from_typ),
        'scatter_linear': lambda: scatter_linear(simd_ext, from_typ),
        'mask_scatter': lambda: mask_scatter(simd_ext, from_typ),
        'andb': lambda: binop2("andb", simd_ext2, from_typ),
        'xorb': lambda: binop2("xorb", simd_ext2, from_typ),
        'orb': lambda: binop2("orb", simd_ext2, from_typ),
        'andl': lambda: lop2(opts, "andl", simd_ext2, from_typ),
        'xorl': lambda: lop2(opts, "xorl", simd_ext2, from_typ),
        'orl': lambda: lop2(opts, "orl", simd_ext2, from_typ),
        'notb': lambda: not1(simd_ext2, from_typ),
        'notl': lambda: lnot1(opts, simd_ext2, from_typ),
        'andnotb': lambda: binop2("andnotb", simd_ext2, from_typ),
        'andnotl': lambda: lop2(opts, "andnotl", simd_ext2, from_typ),
        'add': lambda: addsub("add", simd_ext2, from_typ),
        'sub': lambda: addsub("sub", simd_ext2, from_typ),
        'adds': lambda: adds(simd_ext2, from_typ),
        'subs': lambda: subs(simd_ext2, from_typ),
        'div': lambda: div2(simd_ext2, from_typ),
        'sqrt': lambda: sqrt1(simd_ext2, from_typ),
        'len': lambda: len1(simd_ext, from_typ),
        'mul': lambda: mul2(simd_ext2, from_typ),
        'shl': lambda: shl_shr("shl", simd_ext2, from_typ),
        'shr': lambda: shl_shr("shr", simd_ext2, from_typ),
        'shra': lambda: shra(simd_ext2, from_typ),
        'set1': lambda: set1(simd_ext2, from_typ),
        'set1l': lambda: lset1(simd_ext2, from_typ),
        'eq': lambda: cmp2(opts, "eq", simd_ext2, from_typ),
        'lt': lambda: cmp2(opts, "lt", simd_ext2, from_typ),
        'le': lambda: cmp2(opts, "le", simd_ext2, from_typ),
        'gt': lambda: cmp2(opts, "gt", simd_ext2, from_typ),
        'ge': lambda: cmp2(opts, "ge", simd_ext2, from_typ),
        'ne': lambda: neq2(opts, simd_ext2, from_typ),
        'if_else1': lambda: if_else3(opts, simd_ext2, from_typ),
        'min': lambda: minmax2("min", simd_ext2, from_typ),
        'max': lambda: minmax2("max", simd_ext2, from_typ),
        'loadla': lambda: loadl(True, simd_ext2, from_typ),
        'loadlu': lambda: loadl(False, simd_ext2, from_typ),
        'storela': lambda: storel(True, simd_ext2, from_typ),
        'storelu': lambda: storel(False, simd_ext2, from_typ),
        'abs': lambda: abs1(simd_ext2, from_typ),
        'fma': lambda: fmafnma3("fma", simd_ext2, from_typ),
        'fnma': lambda: fmafnma3("fnma", simd_ext2, from_typ),
        'fms': lambda: fmsfnms3("fms", simd_ext2, from_typ),
        'fnms': lambda: fmsfnms3("fnms", simd_ext2, from_typ),
        'ceil': lambda: round1("ceil", simd_ext2, from_typ),
        'floor': lambda: round1("floor", simd_ext2, from_typ),
        'trunc': lambda: round1("trunc", simd_ext2, from_typ),
        'round_to_even': lambda: round1("round_to_even", simd_ext2,
                                        from_typ),
        'all': lambda: allany1(opts, "all", simd_ext2, from_typ),
        'any': lambda: allany1(opts, "any", simd_ext2, from_typ),
        'reinterpret': lambda: reinterpret1(simd_ext2, from_typ, to_typ),
        'reinterpretl': lambda: reinterpretl1(simd_ext2, from_typ, to_typ),
        'cvt': lambda: convert1(simd_ext2, from_typ, to_typ),
        'rec11': lambda: recs1("rec11", simd_ext2, from_typ),
        'rec8': lambda: recs1("rec8", simd_ext2, from_typ),
        'rsqrt11': lambda: recs1("rsqrt11", simd_ext2, from_typ),
        'rsqrt8': lambda: recs1("rsqrt8", simd_ext2, from_typ),
        'rec': lambda: recs1("rec", simd_ext2, from_typ),
        'neg': lambda: neg1(simd_ext2, from_typ),
        'nbtrue': lambda: nbtrue1(opts, simd_ext2, from_typ),
        'reverse': lambda: reverse1(simd_ext2, from_typ),
        'addv': lambda: addv(simd_ext2, from_typ),
        'upcvt': lambda: upcvt1(simd_ext2, from_typ, to_typ),
        'downcvt': lambda: downcvt1(simd_ext2, from_typ, to_typ),
        'to_logical': lambda: to_logical1(opts, simd_ext2, from_typ),
        'to_mask': lambda: to_mask1(opts, simd_ext2, from_typ),
        'ziplo': lambda: zip_unzip_half("zip1", simd_ext2, from_typ),
        'ziphi': lambda: zip_unzip_half("zip2", simd_ext2, from_typ),
        'unziplo': lambda: zip_unzip_half("uzp1", simd_ext2, from_typ),
        'unziphi': lambda: zip_unzip_half("uzp2", simd_ext2, from_typ),
        'zip' : lambda: zip_unzip("zip", simd_ext2, from_typ),
        'unzip' : lambda: zip_unzip("uzp", simd_ext2, from_typ),
        'mask_for_loop_tail': lambda : mask_for_loop_tail(simd_ext,
                                                          from_typ),
        'iota': lambda : iota(simd_ext2, from_typ)
    }
    if simd_ext not in get_simd_exts():
        raise ValueError('Unknown SIMD extension "{}"'.format(simd_ext))
    if not from_typ in common.types:
        raise ValueError('Unknown type "{}"'.format(from_typ))
    if not func in impls:
        return common.NOT_IMPLEMENTED
    else:
        return impls[func]()



================================================
FILE: egg/platform_cpu.py
================================================
# Copyright (c) 2020 Agenium Scale
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
# This file gives the implementation of platform CPU, i.e. scalar emulation. # Reading this file is straightforward. For each function, e.g. the addition, # code looks like: # # return 'return {} + {};'.format(common.in0, common.in1) # # with an 'if' before to handle the FP16 special case. import common import scalar # ----------------------------------------------------------------------------- # Emulation parameters # # When emulating, we need to choose a vector length to fit the philosophy of # SIMD. By default we choose 64 bits. It must be a multiple of 64 bits. NBITS = common.CPU_NBITS def get_nb_el(typ): return NBITS // int(typ[1:]) # ----------------------------------------------------------------------------- # Implementation of mandatory functions for this module def get_simd_exts(): return ['cpu'] def get_prev_simd_ext(simd_ext): if simd_ext != 'cpu': raise ValueError('Unknown SIMD extension "{}"'.format(simd_ext)) return '' def get_simd_strings(simd_ext): if simd_ext == 'cpu': return ['cpu'] else: raise ValueError('Unknown SIMD extension "{}"'.format(simd_ext)) def emulate_fp16(simd_ext): if simd_ext != 'cpu': raise ValueError('Unknown SIMD extension "{}"'.format(simd_ext)) return True def get_type(opts, simd_ext, typ, nsimd_typ): if simd_ext != 'cpu': raise ValueError('Unknown SIMD extension "{}"'.format(simd_ext)) if typ not in common.types: raise ValueError('Unknown type "{}"'.format(typ)) typ2 = typ if typ != 'f16' else 'f32' members = '\n'.join('{} v{};'.format(typ2, i) \ for i in range(0, get_nb_el(typ))) return 'typedef struct {{ {} }} {};'.format(members, nsimd_typ) def get_logical_type(opts, simd_ext, typ, nsimd_typ): if simd_ext != 'cpu': raise ValueError('Unknown SIMD extension "{}"'.format(simd_ext)) if typ not in common.types: raise ValueError('Unknown type "{}"'.format(typ)) members = '\n'.join('unsigned int v{};'.format(i) \ for i in range(0, get_nb_el(typ))) return 'typedef struct {{ {} }} {};'.format(members, nsimd_typ) def 
get_nb_registers(simd_ext): if simd_ext != 'cpu': raise ValueError('Unknown SIMD extension "{}"'.format(simd_ext)) return '1' def has_compatible_SoA_types(simd_ext): if simd_ext != 'cpu': raise ValueError('Unknown SIMD extension "{}"'.format(simd_ext)) return False def get_additional_include(func, platform, simd_ext): if func in ['adds', 'subs', 'orb', 'andb', 'andnotb', 'xorb', 'min', 'max' 'notb', 'sqrt', 'shr', 'shl', 'shra', 'abs', 'fma', 'fnma', 'fms', 'fnms', 'ceil', 'floor', 'trunc', 'round_to_even', 'rec11', 'rec8', 'rsqrt11', 'rsqrt8', 'rec', 'neg', 'lgamma_u10', 'tgamma_u10', 'erf_u10', 'erfc_u15']: return '''#include ''' elif func == 'zip': return '''#include #include ''' elif func == 'unzip': return '''#include #include ''' return '' # ----------------------------------------------------------------------------- # Returns C code for func fmtspec = {} def repeat_stmt(fmt, typ): return '\n'.join(fmt.format(i=i) for i in range(0, get_nb_el(typ))) # ----------------------------------------------------------------------------- def func_body(fmt, typ2, logical = False): return '''nsimd_cpu_v{logical}{typ2} ret; {content} return ret;'''.format(logical='l' if logical else '', typ2=typ2, content=repeat_stmt(fmt, typ2), **fmtspec) # ----------------------------------------------------------------------------- def op2(op, typ): return func_body('ret.v{{i}} = {cast}({in0}.v{{i}} {op} {in1}.v{{i}});'. \ format(cast='({})'.format(typ) if typ in common.iutypes \ else '', op=op, **fmtspec), typ) # ----------------------------------------------------------------------------- def lop2(op, typ): return func_body('ret.v{{i}} = {in0}.v{{i}} {op} {in1}.v{{i}};'. 
\ format(op=op, **fmtspec), typ, True) # ----------------------------------------------------------------------------- def landnot2(typ): return func_body('ret.v{{i}} = {in0}.v{{i}} & (~{in1}.v{{i}});'.\ format(**fmtspec), typ, True) # ----------------------------------------------------------------------------- def lnot1(typ): return func_body('ret.v{{i}} = ~{in0}.v{{i}};'.\ format(**fmtspec), typ, True) # ----------------------------------------------------------------------------- def scalar_impl(func, typ, arity): typ2 = 'f32' if typ == 'f16' else typ # special case for shl, shr, shra if func in ['shl', 'shr', 'shra']: args = '{in0}.v{{i}}, {in1}'.format(**fmtspec) else: args = ', '.join(['{{in{}}}'.format(i).format(**fmtspec) \ + '.v{i}' for i in range(arity)]) return func_body('ret.v{{i}} = nsimd_scalar_{func}_{typ2}({args});'. \ format(func=func, typ2=typ2, args=args, **fmtspec), typ) # ----------------------------------------------------------------------------- def cmp2(op, typ): return '''nsimd_cpu_vl{typ} ret; {content} return ret;'''.format(content=repeat_stmt( '''ret.v{{i}} = (u32)({in0}.v{{i}} {op} {in1}.v{{i}} ? -1 : 0);'''. \ format(op=op, **fmtspec), typ), **fmtspec) # ----------------------------------------------------------------------------- def set1(typ): if typ == 'f16': content = repeat_stmt('ret.v{{i}} = nsimd_f16_to_f32({in0});'. \ format(**fmtspec), typ) else: content = repeat_stmt('ret.v{{i}} = {in0};'.format(**fmtspec), typ) return '''nsimd_cpu_v{typ} ret; {content} return ret;'''.format(content=content, **fmtspec) # ----------------------------------------------------------------------------- def set1l(typ): return func_body('ret.v{{i}} = (u32)({in0} ? -1 : 0);'. \ format(**fmtspec), typ, True) # ----------------------------------------------------------------------------- def load(typ): if typ == 'f16': content = repeat_stmt( 'ret.v{{i}} = nsimd_u16_to_f32(((u16 *){in0})[{{i}}]);'. 
            format(**fmtspec), typ)
    else:
        content = repeat_stmt('ret.v{{i}} = {in0}[{{i}}];'.format(**fmtspec),
                              typ)
    return '''nsimd_cpu_v{typ} ret;
              {content}
              return ret;'''.format(content=content, **fmtspec)

# -----------------------------------------------------------------------------

def maskoz_load(oz, typ):
    # Masked load: oz == 'z' zeroes inactive lanes, oz == 'o' takes them
    # from {in2}.
    if typ == 'f16':
        else_value = '0.0f' if oz == 'z' else '{in2}.v{{i}}'.format(**fmtspec)
        content = repeat_stmt(
            '''ret.v{{i}} = {in0}.v{{i}}
                                ? nsimd_u16_to_f32(((u16 *){in1})[{{i}}])
                                : {else_value};'''. \
            format(else_value=else_value, **fmtspec), typ)
    else:
        else_value = '({typ})0'.format(**fmtspec) if oz == 'z' else \
                     '{in2}.v{{i}}'.format(**fmtspec)
        content = repeat_stmt(
            'ret.v{{i}} = {in0}.v{{i}} ? {in1}[{{i}}] : {else_value};'. \
            format(else_value=else_value, **fmtspec), typ)
    return '''nsimd_cpu_v{typ} ret;
              {content}
              return ret;'''.format(content=content, **fmtspec)

# -----------------------------------------------------------------------------

def load_deg234(typ, deg):
    # Deinterleaved load of degree deg (load2/load3/load4): lane i of member
    # j comes from memory slot deg * i + j.
    if typ == 'f16':
        buf = repeat_stmt(
            '''ret.v{{{{j}}}}.v{{i}} = nsimd_u16_to_f32(
                   ((u16 *){in0})[{deg} * {{i}} + {{{{j}}}}]);'''. \
            format(deg=deg, **fmtspec), typ)
    else:
        buf = repeat_stmt(
            'ret.v{{{{j}}}}.v{{i}} = {in0}[{deg} * {{i}} + {{{{j}}}}];'. \
            format(deg=deg, **fmtspec), typ)
    content = '\n'.join(buf.format(j=j) for j in range(0, deg))
    return '''nsimd_cpu_v{typ}x{deg} ret;
              {content}
              return ret;'''.format(deg=deg, content=content, **fmtspec)

# -----------------------------------------------------------------------------

def store_deg234(typ, deg):
    # Interleaved store of degree deg: member j of argument in{j+1} goes to
    # memory slot deg * i + j.
    content = ''
    for i in range(0, get_nb_el(typ)):
        for j in range(1, deg + 1):
            arg = fmtspec['in{}'.format(j)]
            if typ == 'f16':
                content += \
                '''((u16 *){in0})[{deg} * {i} + {j}] =
                       nsimd_f32_to_u16({arg}.v{i});\n'''. \
                format(deg=deg, i=i, j=j - 1, arg=arg, **fmtspec)
            else:
                content += \
                '{in0}[{deg} * {i} + {j}] = {arg}.v{i};\n'. \
                format(deg=deg, i=i, j=j - 1, arg=arg, **fmtspec)
    return content[:-1]

# -----------------------------------------------------------------------------

def loadl(typ):
    # Load a logical: non-zero memory values become all-ones lanes.
    if typ == 'f16':
        content = repeat_stmt(
            '''ret.v{{i}} = (u32)(nsimd_u16_to_f32(((u16 *){in0})[{{i}}])
                                      == 0.0f ? 0 : -1);'''. \
            format(**fmtspec), typ)
    else:
        content = repeat_stmt(
            '''ret.v{{i}} = (u32)({in0}[{{i}}] == ({typ})0 ? 0 : -1);'''. \
            format(**fmtspec), typ)
    return '''nsimd_cpu_vl{typ} ret;
              {content}
              return ret;'''.format(content=content, **fmtspec)

# -----------------------------------------------------------------------------

def store(typ):
    if typ == 'f16':
        return repeat_stmt(
            '((u16*){in0})[{{i}}] = nsimd_f32_to_u16({in1}.v{{i}});'. \
            format(**fmtspec), typ)
    else:
        return repeat_stmt('{in0}[{{i}}] = {in1}.v{{i}};'. \
                           format(**fmtspec), typ)

# -----------------------------------------------------------------------------

def mask_store(typ):
    # Store only the lanes whose mask ({in0}) is set.
    if typ == 'f16':
        return repeat_stmt(
            '''if ({in0}.v{{i}}) {{{{
                 ((u16*){in1})[{{i}}] = nsimd_f32_to_u16({in2}.v{{i}});
               }}}}'''.format(**fmtspec), typ)
    else:
        return repeat_stmt('''if ({in0}.v{{i}}) {{{{
                                {in1}[{{i}}] = {in2}.v{{i}};
                              }}}}'''.format(**fmtspec), typ)

# -----------------------------------------------------------------------------

def storel(typ):
    # Store a logical as 0/1 values of type typ.
    if typ == 'f16':
        content = repeat_stmt(
            '''((u16*){in0})[{{i}}] = (u16)({in1}.v{{i}} == (u32)0
                                  ? nsimd_f32_to_u16(0.0f)
                                  : nsimd_f32_to_u16(1.0f));'''. \
            format(**fmtspec), typ)
    else:
        content = repeat_stmt(
            '''{in0}[{{i}}] = ({typ})({in1}.v{{i}} == (u32)0
                                          ? ({typ})0 : ({typ})1);'''. \
            format(**fmtspec), typ)
    return content

# -----------------------------------------------------------------------------

def if_else1(typ):
    # Lane-wise select: mask ? {in1} : {in2}.
    typ2 = 'f32' if typ == 'f16' else typ
    return func_body(
        '''ret.v{{i}} = ({typ2})({in0}.v{{i}} != (u32)0
                                     ? {in1}.v{{i}} : {in2}.v{{i}});'''. \
        format(typ2=typ2, **fmtspec), typ)

# -----------------------------------------------------------------------------

def all_any(typ, func):
    # Reduce a logical to a C int: all/any lanes equal to (u32)-1.
    op = '&&' if func == 'all' else '||'
    if get_nb_el(typ) == 1:
        cond = '{in0}.v0 == (u32)-1'.format(**fmtspec)
    else:
        cond = op.join('({in0}.v{i} == (u32)-1)'.format(i=i, **fmtspec) \
                       for i in range(0, get_nb_el(typ)))
    return '''if ({cond}) {{
                return -1;
              }} else {{
                return 0;
              }}'''.format(cond=cond)

# -----------------------------------------------------------------------------

def reinterpret1(from_typ, to_typ):
    # Bit-preserving conversion: round-trip through a byte buffer.
    if from_typ == to_typ:
        return func_body('ret.v{{i}} = {in0}.v{{i}};'.format(**fmtspec),
                         to_typ)
    return '''char buf[{len}];
              nsimd_storeu_cpu_{from_typ}(({from_typ} *)buf, {in0});
              return nsimd_loadu_cpu_{to_typ}(({to_typ} *)buf);'''. \
           format(len=NBITS // 8, **fmtspec)

# -----------------------------------------------------------------------------

def reinterpretl1(from_typ, to_typ):
    # Logicals all have the same u32-per-lane layout: plain copy.
    return func_body('ret.v{{i}} = {in0}.v{{i}};'.format(**fmtspec),
                     to_typ, True);

# -----------------------------------------------------------------------------

def convert1(from_typ, to_typ):
    # Value-preserving conversion via a C cast.
    if to_typ == from_typ:
        return func_body('ret.v{{i}} = {in0}.v{{i}};'.format(**fmtspec),
                         to_typ)
    typ2 = 'f32' if to_typ == 'f16' else to_typ
    return func_body('ret.v{{i}} = ({typ2}){in0}.v{{i}};'. \
                     format(typ2=typ2, **fmtspec), to_typ)

# -----------------------------------------------------------------------------

def nbtrue1(typ):
    # Count lanes whose logical value is all-ones.
    acc_code = repeat_stmt('acc += {in0}.v{{i}} == (u32)-1 ? 1 : 0;'. \
                           format(**fmtspec), typ)
    return '''int acc = 0;
              {acc_code}
              return acc;'''.format(acc_code=acc_code)

# -----------------------------------------------------------------------------

def reverse1(typ):
    n = get_nb_el(typ)
    content = '\n'.join('ret.v{i} = {in0}.v{j}'.
\ format(i=i, j=n - i, **fmtspec) \ for i in range(0, n)) return '''nsimd_cpu_v{typ} ret; {content} return ret;'''.format(content=content, **fmtspec) # ----------------------------------------------------------------------------- def addv1(typ): content = '+'.join('{in0}.v{i}'.format(i=i, **fmtspec) \ for i in range(0, get_nb_el(typ))) if typ == 'f16': return 'return nsimd_f32_to_f16({});'.format(content) else: return 'return {};'.format(content) # ----------------------------------------------------------------------------- def upcvt1(from_typ, to_typ): n = get_nb_el(to_typ) to_typ2 = 'f32' if to_typ == 'f16' else to_typ lower_half = '\n'.join('ret.v0.v{i} = ({to_typ2}){in0}.v{i};'. \ format(i=i, to_typ2=to_typ2, **fmtspec) \ for i in range(0, n)) upper_half = '\n'.join('ret.v1.v{i} = ({to_typ2}){in0}.v{j};'. \ format(i=i, j=i + n, to_typ2=to_typ2, **fmtspec) \ for i in range(0, n)) return '''nsimd_cpu_v{to_typ}x2 ret; {lower_half} {upper_half} return ret;'''.format(lower_half=lower_half, upper_half=upper_half, **fmtspec) # ----------------------------------------------------------------------------- def downcvt2(from_typ, to_typ): n = get_nb_el(from_typ) to_typ2 = 'f32' if to_typ == 'f16' else to_typ lower_half = '\n'.join('ret.v{i} = ({to_typ2}){in0}.v{i};'. \ format(i=i, to_typ2=to_typ2, **fmtspec) \ for i in range(0, n)) upper_half = '\n'.join('ret.v{j} = ({to_typ2}){in1}.v{i};'. \ format(i=i, j=i + n, to_typ2=to_typ2, **fmtspec) \ for i in range(0, n)) return '''nsimd_cpu_v{to_typ} ret; {lower_half} {upper_half} return ret;'''.format(lower_half=lower_half, upper_half=upper_half, **fmtspec) # ----------------------------------------------------------------------------- def len1(typ): return 'return {};'.format(get_nb_el(typ)) # ----------------------------------------------------------------------------- def to_logical1(typ): unsigned_to_logical = \ 'ret.v{{i}} = ({in0}.v{{i}} == ({utyp})0 ? (u32)0 : (u32)-1);'. 
\ format(**fmtspec) if typ in common.utypes: return func_body(unsigned_to_logical, typ, True) else: unsigned_to_logical = \ 'ret.v{{i}} = (buf.v{{i}} == ({utyp})0 ? (u32)0 : (u32)-1);'. \ format(**fmtspec) return '''nsimd_cpu_vl{typ} ret; nsimd_cpu_vu{typnbits} buf; buf = nsimd_reinterpret_cpu_u{typnbits}_{typ}({in0}); {unsigned_to_logical} return ret;'''. \ format(unsigned_to_logical=repeat_stmt(unsigned_to_logical, typ), **fmtspec) # ----------------------------------------------------------------------------- def to_mask1(typ): logical_to_unsigned = \ 'ret.v{{i}} = ({utyp})({in0}.v{{i}} ? -1 : 0);'. \ format(**fmtspec) if typ in common.utypes: return func_body(logical_to_unsigned, typ) elif typ == 'f16': return '''union {{ f32 f; u32 u; }} buf; nsimd_cpu_vf16 ret; {u32_to_f32} return ret;'''. \ format(u32_to_f32=repeat_stmt( 'buf.u = {in0}.v{{i}}; ret.v{{i}} = buf.f;'. \ format(**fmtspec), 'f16'), **fmtspec) else: return '''nsimd_cpu_vu{typnbits} ret; {logical_to_unsigned} return nsimd_reinterpret_cpu_{typ}_u{typnbits}(ret);'''. \ format(logical_to_unsigned=repeat_stmt(logical_to_unsigned, typ), **fmtspec) # ----------------------------------------------------------------------------- def zip_half(func, typ): n = get_nb_el(typ) if func == "ziplo": content = '\n'.join('ret.v{j1} = {in0}.v{i}; ret.v{j2} = {in1}.v{i};'. \ format(i=i, j1=i*2, j2=i*2+1, **fmtspec) \ for i in range(0, int(n/2))) else : content = '\n'.join('ret.v{j1} = {in0}.v{i}; ret.v{j2} = {in1}.v{i};'. \ format(i=i+int(n/2), j1=i*2, j2=i*2+1, **fmtspec) \ for i in range(0, int(n/2))) return '''nsimd_cpu_v{typ} ret; {content} return ret;'''.format(content=content, **fmtspec) # ----------------------------------------------------------------------------- def unzip_half(func, typ): n = get_nb_el(typ) content = '' if func == "unziplo": content = '\n'.join('ret.v{i} = {in0}.v{j}; '. \ format(i=i, j=i*2, **fmtspec) \ for i in range(0, int(n/2))) content = content + '\n'.join('ret.v{i} = {in1}.v{j}; '. 
\ format(i=i, j=2*(i-int(n/2)), **fmtspec) \ for i in range(int(n/2), n)) else : content = '\n'.join('ret.v{i} = {in0}.v{j}; '. \ format(i=i, j=i*2+1, **fmtspec) \ for i in range(0, int(n/2))) content = content + '\n'.join('ret.v{i} = {in1}.v{j}; '. \ format(i=i, j=2*(i-int(n/2))+1, **fmtspec)\ for i in range(int(n/2), n)) return '''nsimd_cpu_v{typ} ret; {content} return ret;'''.format(content=content, **fmtspec) def zip(from_typ): return '''nsimd_{simd_ext}_v{typ}x2 ret; ret.v0 = nsimd_ziplo_cpu_{typ}({in0}, {in1}); ret.v1 = nsimd_ziphi_cpu_{typ}({in0}, {in1}); return ret;'''.format(**fmtspec) def unzip(from_typ): return '''nsimd_{simd_ext}_v{typ}x2 ret; ret.v0 = nsimd_unziplo_cpu_{typ}({in0}, {in1}); ret.v1 = nsimd_unziphi_cpu_{typ}({in0}, {in1}); return ret;'''.format(**fmtspec) # ----------------------------------------------------------------------------- def mask_for_loop_tail(typ): return func_body( 'ret.v{{i}} = {in0} + {{i}} < {in1} ? (u32)-1 : (u32)0;'. \ format(**fmtspec), typ, True) # ----------------------------------------------------------------------------- def iota(typ): typ2 = 'f32' if typ == 'f16' else typ return func_body('ret.v{{i}} = ({typ2}){{i}};'. \ format(typ2=typ2, **fmtspec), typ) # ----------------------------------------------------------------------------- def gather(typ): if typ == 'f16': return func_body( 'ret.v{{i}} = nsimd_f16_to_f32({in0}[{in1}.v{{i}}]);'. \ format(**fmtspec), typ) return func_body('ret.v{{i}} = {in0}[{in1}.v{{i}}];'. \ format(**fmtspec), typ) # ----------------------------------------------------------------------------- def gather_linear(typ): if typ == 'f16': return func_body( 'ret.v{{i}} = nsimd_f16_to_f32({in0}[{{i}} * {in1}]);'. \ format(**fmtspec), typ) return func_body('ret.v{{i}} = {in0}[{{i}} * {in1}];'. 
\ format(**fmtspec), typ) # ----------------------------------------------------------------------------- def maskoz_gather(op, typ): if typ == 'f16': oz = '0.0f' if op == 'z' else '{in3}.v{{i}}' return func_body( ('''if ({in0}.v{{i}}) {{{{ ret.v{{i}} = nsimd_f16_to_f32({in1}[{in2}.v{{i}}]); }}}} else {{{{ ret.v{{i}} = ''' + oz + '''; }}}}''').format(**fmtspec), typ) oz = '({typ})0' if op == 'z' else '{in3}.v{{i}}' return func_body(('''if ({in0}.v{{i}}) {{{{ ret.v{{i}} = {in1}[{in2}.v{{i}}]; }}}} else {{{{ ret.v{{i}} = ''' + oz + '''; }}}}''').format(**fmtspec), typ) # ----------------------------------------------------------------------------- def scatter(typ): if typ == 'f16': return repeat_stmt( '{in0}[{in1}.v{{i}}] = nsimd_f32_to_f16({in2}.v{{i}});'. \ format(**fmtspec), typ) return repeat_stmt('{in0}[{in1}.v{{i}}] = {in2}.v{{i}};'. \ format(**fmtspec), typ) # ----------------------------------------------------------------------------- def scatter_linear(typ): if typ == 'f16': return repeat_stmt( '{in0}[{{i}} * {in1}] = nsimd_f32_to_f16({in2}.v{{i}});'. \ format(**fmtspec), typ) return repeat_stmt('{in0}[{{i}} * {in1}] = {in2}.v{{i}};'. 
\ format(**fmtspec), typ) # ----------------------------------------------------------------------------- def mask_scatter(typ): if typ == 'f16': return repeat_stmt( '''if ({in0}.v{{i}}) {{{{ {in1}[{in2}.v{{i}}] = nsimd_f32_to_f16({in3}.v{{i}}); }}}}'''.format(**fmtspec), typ) return repeat_stmt('''if ({in0}.v{{i}}) {{{{ {in1}[{in2}.v{{i}}] = {in3}.v{{i}}; }}}}'''.format(**fmtspec), typ) # ----------------------------------------------------------------------------- def get_impl(opts, func, simd_ext, from_typ, to_typ=''): global fmtspec fmtspec = { 'simd_ext': simd_ext, 'typ': from_typ, 'from_typ': from_typ, 'to_typ': to_typ, 'utyp': common.bitfield_type[from_typ], 'in0': common.in0, 'in1': common.in1, 'in2': common.in2, 'in3': common.in3, 'in4': common.in4, 'typnbits': from_typ[1:] } impls = { 'loada': lambda: load(from_typ), 'maskz_loada1': lambda: maskoz_load('z', from_typ), 'masko_loada1': lambda: maskoz_load('o', from_typ), 'load2a': lambda: load_deg234(from_typ, 2), 'load3a': lambda: load_deg234(from_typ, 3), 'load4a': lambda: load_deg234(from_typ, 4), 'loadu': lambda: load(from_typ), 'maskz_loadu1': lambda: maskoz_load('z', from_typ), 'masko_loadu1': lambda: maskoz_load('o', from_typ), 'load2u': lambda: load_deg234(from_typ, 2), 'load3u': lambda: load_deg234(from_typ, 3), 'load4u': lambda: load_deg234(from_typ, 4), 'storea': lambda: store(from_typ), 'mask_storea1': lambda: mask_store(from_typ), 'store2a': lambda: store_deg234(from_typ, 2), 'store3a': lambda: store_deg234(from_typ, 3), 'store4a': lambda: store_deg234(from_typ, 4), 'storeu': lambda: store(from_typ), 'mask_storeu1': lambda: mask_store(from_typ), 'store2u': lambda: store_deg234(from_typ, 2), 'store3u': lambda: store_deg234(from_typ, 3), 'store4u': lambda: store_deg234(from_typ, 4), 'loadla': lambda: loadl(from_typ), 'loadlu': lambda: loadl(from_typ), 'gather': lambda: gather(from_typ), 'gather_linear': lambda: gather_linear(from_typ), 'maskz_gather': lambda: maskoz_gather('z', from_typ), 
'masko_gather': lambda: maskoz_gather('o', from_typ), 'scatter': lambda: scatter(from_typ), 'scatter_linear': lambda: scatter_linear(from_typ), 'mask_scatter': lambda: mask_scatter(from_typ), 'storela': lambda: storel(from_typ), 'storelu': lambda: storel(from_typ), 'add': lambda: op2('+', from_typ), 'mul': lambda: op2('*', from_typ), 'div': lambda: op2('/', from_typ), 'sub': lambda: op2('-', from_typ), 'adds' : lambda: scalar_impl('adds', from_typ, 2), 'subs' : lambda: scalar_impl('subs', from_typ, 2), 'orb': lambda: scalar_impl('orb', from_typ, 2), 'orl': lambda: lop2('|', from_typ), 'andb': lambda: scalar_impl('andb', from_typ, 2), 'andnotb': lambda: scalar_impl('andnotb', from_typ, 2), 'andnotl': lambda: landnot2(from_typ), 'andl': lambda: lop2('&', from_typ), 'xorb': lambda: scalar_impl('xorb', from_typ, 2), 'xorl': lambda: lop2('^', from_typ), 'min': lambda: scalar_impl('min', from_typ, 2), 'max': lambda: scalar_impl('max', from_typ, 2), 'notb': lambda: scalar_impl('notb', from_typ, 1), 'notl': lambda: lnot1(from_typ), 'sqrt': lambda: scalar_impl('sqrt', from_typ, 1), 'set1': lambda: set1(from_typ), 'set1l': lambda: set1l(from_typ), 'shr': lambda: scalar_impl('shr', from_typ, 2), 'shl': lambda: scalar_impl('shl', from_typ, 2), 'shra': lambda: scalar_impl('shra', from_typ, 2), 'eq': lambda: cmp2('==', from_typ), 'ne': lambda: cmp2('!=', from_typ), 'gt': lambda: cmp2('>', from_typ), 'ge': lambda: cmp2('>=', from_typ), 'lt': lambda: cmp2('<', from_typ), 'le': lambda: cmp2('<=', from_typ), 'len': lambda: len1(from_typ), 'if_else1': lambda: if_else1(from_typ), 'abs': lambda: scalar_impl('abs', from_typ, 1), 'fma': lambda: scalar_impl('fma', from_typ, 3), 'fnma': lambda: scalar_impl('fnma', from_typ, 3), 'fms': lambda: scalar_impl('fms', from_typ, 3), 'fnms': lambda: scalar_impl('fnms', from_typ, 3), 'ceil': lambda: scalar_impl('ceil', from_typ, 1), 'floor': lambda: scalar_impl('floor', from_typ, 1), 'trunc': lambda: scalar_impl('trunc', from_typ, 1), 
'round_to_even': lambda: scalar_impl('round_to_even', from_typ, 1), 'all': lambda: all_any(from_typ, 'all'), 'any': lambda: all_any(from_typ, 'any'), 'reinterpret': lambda: reinterpret1(from_typ, to_typ), 'reinterpretl': lambda: reinterpretl1(from_typ, to_typ), 'cvt': lambda: convert1(from_typ, to_typ), 'rec11': lambda: scalar_impl('rec11', from_typ, 1), 'rec8': lambda: scalar_impl('rec8', from_typ, 1), 'rsqrt11': lambda: scalar_impl('rsqrt11', from_typ, 1), 'rsqrt8': lambda: scalar_impl('rsqrt8', from_typ, 1), 'rec': lambda: scalar_impl('rec', from_typ, 1), 'neg': lambda: scalar_impl('neg', from_typ, 1), 'nbtrue': lambda: nbtrue1(from_typ), 'reverse': lambda: reverse1(from_typ), 'addv': lambda: addv1(from_typ), 'upcvt': lambda: upcvt1(from_typ, to_typ), 'downcvt': lambda: downcvt2(from_typ, to_typ), 'to_logical': lambda: to_logical1(from_typ), 'to_mask': lambda: to_mask1(from_typ), 'ziplo': lambda: zip_half('ziplo', from_typ), 'ziphi': lambda: zip_half('ziphi', from_typ), 'unziplo': lambda: unzip_half('unziplo', from_typ), 'unziphi': lambda: unzip_half('unziphi', from_typ), 'zip' : lambda : zip(from_typ), 'unzip' : lambda : unzip(from_typ), 'mask_for_loop_tail': lambda : mask_for_loop_tail(from_typ), 'iota': lambda : iota(from_typ) } if simd_ext != 'cpu': raise ValueError('Unknown SIMD extension "{}"'.format(simd_ext)) if not from_typ in common.types: raise ValueError('Unknown from_type "{}"'.format(from_typ)) if not func in impls: return common.NOT_IMPLEMENTED return impls[func]() ================================================ FILE: egg/platform_ppc.py ================================================ # Copyright (c) 2021 Agenium Scale # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal # in the Software without restriction, including without limitation the rights # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell # copies of the 
Software, and to permit persons to whom the Software is # furnished to do so, subject to the following conditions: # # The above copyright notice and this permission notice shall be included in all # copies or substantial portions of the Software. # # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. # This file gives the implementation for the Power PC platform. # This script tries to be as readable as possible. It implements VMX and VSX. # Documentation found from: # https://www.nxp.com/docs/en/reference-manual/ALTIVECPIM.pdf # https://www.ibm.com/docs/en/xl-c-and-cpp-linux/13.1.6?topic=functions-vector-built-in # https://gcc.gnu.org/onlinedocs/gcc-9.1.0/gcc/PowerPC-AltiVec-Built-in-Functions-Available-on-ISA-2_002e06.html import common fmtspec = {} # ----------------------------------------------------------------------------- # Helpers def has_to_be_emulated(simd_ext, typ): if typ == 'f16': return True if simd_ext == 'vmx' and typ in ['f64', 'i64', 'u64']: return True return False # Returns the power pc type corresponding to the nsimd type def native_type(typ): if typ == 'u8': return '__vector unsigned char' elif typ == 'i8': return '__vector signed char' elif typ == 'u16': return '__vector unsigned short' elif typ == 'i16': return '__vector signed short' elif typ == 'u32': return '__vector unsigned int' elif typ == 'u64': return '__vector unsigned long long' elif typ == 'i32': return '__vector signed int' elif typ == 'i64': return '__vector signed long long' elif typ == 'f32': return '__vector float' elif typ == 'f64': return '__vector double' 
else: raise ValueError('Type "{}" not supported'.format(typ)) # Returns the logical power pc type corresponding to the nsimd type def native_typel(typ): if typ in ['i8', 'u8']: return '__vector __bool char' elif typ in ['i16', 'u16']: return '__vector __bool short' elif typ in ['i32', 'u32', 'f32']: return '__vector __bool int' elif typ in ['f64', 'i64', 'u64']: return '__vector __bool long long' else: raise ValueError('Type "{}" not supported'.format(typ)) # Length of a vector with elements of type typ def get_len(typ): return 128 // int(typ[1:]) # Emulate 64 bits types for vmx only def emulate_64(op, typ, params): def arg(param, i): if param == 'v': return '{}.v{{i}}'.format(common.get_arg(i)) elif param == 'l': return '(int)({}.v{{i}} & ((u64)1))'.format(common.get_arg(i)) else: return common.get_arg(i) args = ', '.join(arg(params[i + 1], i) for i in range(len(params[1:]))) args0 = args.format(i=0) args1 = args.format(i=1) if params[0] == 'v': return '''nsimd_vmx_v{typ} ret; ret.v0 = nsimd_scalar_{op}_{typ}({args0}); ret.v1 = nsimd_scalar_{op}_{typ}({args1}); return ret;'''. \ format(typ=typ, op=op, args0=args0, args1=args1) else: return \ '''nsimd_vmx_vl{typ} ret; ret.v0 = (u64)(nsimd_scalar_{op}{suf}({args0}) ? -1 : 0); ret.v1 = (u64)(nsimd_scalar_{op}{suf}({args1}) ? -1 : 0); return ret;'''. \ format(suf='' if params == ['l'] * len(params) else '_' + typ, typ=typ, op=op, args0=args0, args1=args1) def emulate_f16(op, simd_ext, params): tmpl = ', '.join(['{{in{}}}.v{{{{i}}}}'.format(i).format(**fmtspec) \ for i in range(len(params[1:]))]) args1 = tmpl.format(i=0) args2 = tmpl.format(i=1) l = 'l' if params[0] == 'l' else '' return '''nsimd_{simd_ext}_v{l}f16 ret; ret.v0 = nsimd_{op}_{simd_ext}_f32({args1}); ret.v1 = nsimd_{op}_{simd_ext}_f32({args2}); return ret;'''. 
\ format(l=l, op=op, args1=args1, args2=args2, **fmtspec) def emulation_code(op, simd_ext, typ, params): if typ == 'f16': return emulate_f16(op, simd_ext, params) elif simd_ext == 'vmx' and typ in ['f64', 'i64', 'u64']: return emulate_64(op, typ, params) else: raise ValueError('Automatic emulation for {}/{}/{} is not supported'. \ format(func, simd_ext, typ)) def emulate_with_scalar(op, simd_ext, typ, params): def arg(param, i): if param == 'v': return 'vec_extract({}, {{i}})'.format(common.get_arg(i)) elif param == 'l': return '(int)(vec_extract({}, {{i}}) & ((u{})1))'. \ format(common.get_arg(i), typ[1:]) else: return common.get_arg(i) args = ', '.join(arg(params[i + 1], i) for i in range(len(params[1:]))) if params[0] == 'v': return '''nsimd_{simd_ext}_v{typ} ret; ret = vec_splats(nsimd_scalar_{op}_{typ}({args0})); '''.format(typ=typ, op=op, args0=args.format(i=0), simd_ext=simd_ext) + '\n' + \ '\n'.join('ret = vec_insert('\ 'nsimd_scalar_{op}_{typ}({argsi}), ret, {i});'. \ format(op=op, typ=typ, argsi=args.format(i=i), i=i) \ for i in range(1, get_len(typ))) + '\nreturn ret;' else: utyp = 'u' + typ[1:] return \ '''nsimd_{simd_ext}_vl{typ} ret; ret = ({ppc_typl})vec_splats(({utyp})( nsimd_scalar_{op}_{typ}({args0}) ? -1 : 0)); '''.format(typ=typ, op=op, args0=args.format(i=0), utyp=utyp, ppc_typl=native_typel(typ), simd_ext=simd_ext) + '\n' + \ '\n'.join( 'ret = ({ppc_typl})vec_insert(({utyp})(' \ 'nsimd_scalar_{op}_{typ}({argsi}) ? -1 : 0), ret, {i});'. 
\ format(op=op, typ=typ, utyp=utyp, argsi=args.format(i=i), ppc_typl=native_typel(typ), i=i) \ for i in range(1, get_len(typ))) + '\nreturn ret;' # ----------------------------------------------------------------------------- # Implementation of mandatory functions for this module def emulate_fp16(simd_ext): return True def get_simd_exts(): return ['vmx', 'vsx'] def get_type(opts, simd_ext, typ, nsimd_typ): if simd_ext not in get_simd_exts(): raise ValueError('Unknown SIMD extension "{}"'.format(simd_ext)) if typ not in common.types: raise ValueError('Unknown type "{}"'.format(typ)) if typ == 'f16': struct = 'struct {__vector float v0; __vector float v1;}' elif simd_ext == 'vmx' and typ in ['i64', 'u64', 'f64']: struct = 'struct {{ {} v0; {} v1; }}'.format(typ, typ) else: struct = native_type(typ) return 'typedef {} {};'.format(struct, nsimd_typ) def get_logical_type(opts, simd_ext, typ, nsimd_typ): if simd_ext not in get_simd_exts(): raise ValueError('Unknown SIMD extension "{}"'.format(simd_ext)) if typ not in common.types: raise ValueError('Unknown type "{}"'.format(typ)) if typ == 'f16': struct = 'struct {__vector __bool int v0; __vector __bool int v1;}' elif simd_ext == 'vmx' and typ in ['i64', 'u64', 'f64']: struct = 'struct { u64 v0; u64 v1; }' else: struct = native_typel(typ) return 'typedef {} {};'.format(struct, nsimd_typ) def get_nb_registers(simd_ext): if simd_ext == 'vsx': return '64' elif simd_ext == 'vmx': return '32' else: raise ValueError('Unknown SIMD extension "{}"'.format(simd_ext)) def has_compatible_SoA_types(simd_ext): if simd_ext in get_simd_exts(): return False else: raise ValueError('Unknown SIMD extension "{}"'.format(simd_ext)) def get_additional_include(func, platform, simd_ext): ret = '''#include '''.format(func) if simd_ext == 'vsx': ret += '''#include '''.format(func) if func == 'neq': ret += '''#include #include '''.format(simd_ext=simd_ext) elif func in ['loadlu', 'loadla']: ret += '''#include #include #include #include 
'''.format(load='load' + func[5], **fmtspec) elif func in ['storelu']: ret += '''#include #include '''.format(**fmtspec) elif func in ['shr', 'shl']: ret += '''#include '''.format(**fmtspec) elif func == "shra": ret += '''#include ''' elif func in ['zip', 'unzip']: ret += '''#include #include '''.format(unzip_prefix="" if func == "zip" else "un", **fmtspec) elif func in ['unziplo', 'unziphi']: ret += '''#include #include #include '''.format(**fmtspec) elif func[:5] in ['masko', 'maskz']: ret += '''#include ''' elif func == 'mask_for_loop_tail': ret += '''#include #include #include #include '''.format(simd_ext=simd_ext) elif func[:4] == 'load': ret += ''' #include #define NSIMD_PERMUTE_MASK_64(a, b) \ {{ (unsigned char)(8 * a), (unsigned char)(8 * a + 1), \ (unsigned char)(8 * b), (unsigned char)(8 * b + 1) }} #define NSIMD_PERMUTE_MASK_32(a, b, c, d) \ {{ (unsigned char)(4 * a), (unsigned char)(4 * a + 1), \ (unsigned char)(4 * a + 2), (unsigned char)(4 * a + 3), \ (unsigned char)(4 * b), (unsigned char)(4 * b + 1), \ (unsigned char)(4 * b + 2), (unsigned char)(4 * b + 3), \ (unsigned char)(4 * c), (unsigned char)(4 * c + 1), \ (unsigned char)(4 * c + 2), (unsigned char)(4 * c + 3), \ (unsigned char)(4 * d), (unsigned char)(4 * d + 1), \ (unsigned char)(4 * d + 2), (unsigned char)(4 * d + 3) }} #define NSIMD_PERMUTE_MASK_16(a, b, c, d, e, f, g, h) \ {{ (unsigned char)(2 * a + 0), (unsigned char)(2 * a + 1), \ (unsigned char)(2 * b + 0), (unsigned char)(2 * b + 1), \ (unsigned char)(2 * c + 0), (unsigned char)(2 * c + 1), \ (unsigned char)(2 * d + 0), (unsigned char)(2 * d + 1), \ (unsigned char)(2 * e + 0), (unsigned char)(2 * e + 1), \ (unsigned char)(2 * f + 0), (unsigned char)(2 * f + 1), \ (unsigned char)(2 * g + 0), (unsigned char)(2 * g + 1), \ (unsigned char)(2 * h + 0), (unsigned char)(2 * h + 1) }} #define NSIMD_PERMUTE_MASK_8(a, b, c, d, e, f, g, h, \ i, j, k, l, m, n, o, p) \ {{ (unsigned char)(a), (unsigned char)(b), \ (unsigned char)(c), (unsigned 
char)(d), \ (unsigned char)(e), (unsigned char)(f), \ (unsigned char)(g), (unsigned char)(h), \ (unsigned char)(i), (unsigned char)(j), \ (unsigned char)(k), (unsigned char)(l), \ (unsigned char)(m), (unsigned char)(n), \ (unsigned char)(o), (unsigned char)(p) }} '''.format(**fmtspec) return ret # ----------------------------------------------------------------------------- def printf2(*args0): """ debugging purposes decorate the function with it and when executed on test, it will print the environnements *args0 are the name of var to printf """ to_print = [] for arg in args0: if isinstance(arg, str): to_print.append(arg) def decorator(func): import inspect def wrapper(*args, **kwargs): func_args = inspect.signature(func).bind(*args, **kwargs).arguments func_args_str = '{} called on {}\\n'. \ format(func.__name__, fmtspec['typ']) + \ ', "'.join('{} = {!r}'.format(*item) \ for item in func_args.items()) ret = '' if not DEBUG: return func(*args) typ = '' if 'typ' in func_args: typ = func_args['typ'] else: typ = func_args['from_typ'] ret += 'int k;\n' if func.__name__ == 'store1234' and typ in ['f64', 'i64', 'u64']: ret += ''' printf("element to store: %ld %ld", {in1}{suf0}, {in1}{suf1}); printf("\\n"); '''.format(**fmtspec, **get_suf64(typ)) elif func.__name__ == 'store1234' and typ[1:] == '32': ret += ''' printf("element to store:"); for (k = 0; k < 4; k++) {{ printf(" %lx", {in1}[k]); }} printf("\\n"); '''.format(**fmtspec, nbits=get_len(typ)) #print var passed as parameter on printf2 for var in to_print: if ppc_is_vec_type(typ): ret += ''' printf("values of {var}:"); for (k = 0; k < {nbits}; k++) {{ printf(" %lld", {var}[k]); }} printf("\\n"); '''.format(var=var, **fmtspec, nbits=get_len(typ)) return ''' printf("\\n---------------\\n"); printf("{}.{} ( {} )\\n"); '''.format(func.__module__, func.__qualname__, func_args_str) + ret + func(*args) return wrapper return decorator # ----------------------------------------------------------------------------- # Loads of 
degree 1, 2, 3 and 4 # About unaligned loads/stores for Altivec: # https://developer.ibm.com/technologies/systems/articles/pa-dalign/ def load1234(simd_ext, typ, deg, aligned): if typ in ['f64', 'i64', 'u64']: if deg == 1: if simd_ext == 'vmx': return '''nsimd_{simd_ext}_v{typ} ret; ret.v0 = {in0}[0]; ret.v1 = {in0}[1]; return ret;'''.format(**fmtspec) else: return '''nsimd_{simd_ext}_v{typ} ret; ret = vec_splats({in0}[0]); ret = vec_insert({in0}[1], ret, 1); return ret;'''.format(**fmtspec) else: if simd_ext == 'vmx': return \ 'nsimd_{simd_ext}_v{typ}x{} ret;\n'.format(deg, **fmtspec) + \ '\n'.join(['ret.v{i}.v0 = *({in0} + {i});'. \ format(i=i, **fmtspec) \ for i in range(0, deg)]) + \ '\n'.join(['ret.v{i}.v1 = *({in0} + {ipd});'. \ format(i=i, ipd=i + deg, **fmtspec) \ for i in range(0, deg)]) + \ '\nreturn ret;' else: return \ 'nsimd_{simd_ext}_v{typ}x{} ret;\n'.format(deg, **fmtspec) + \ '\n'.join( 'ret.v{i} = vec_splats({in0}[{i}]);'.format(i=i, **fmtspec) \ for i in range(0, deg)) + \ '\n'.join( 'ret.v{i} = vec_insert({in0}[{ipd}], ret.v{i}, 1);'. \ format(i=i, ipd=i + deg, **fmtspec) for i in range(0, deg)) + \ '\nreturn ret;' if typ == 'f16': if deg == 1: return \ '''nsimd_{simd_ext}_vf16 ret; u16 *ptr = (u16 *){in0}; ret.v0 = vec_splats(nsimd_u16_to_f32(ptr[0])); ret.v0 = vec_insert(nsimd_u16_to_f32(ptr[1]), ret.v0, 1); ret.v0 = vec_insert(nsimd_u16_to_f32(ptr[2]), ret.v0, 2); ret.v0 = vec_insert(nsimd_u16_to_f32(ptr[3]), ret.v0, 3); ret.v1 = vec_splats(nsimd_u16_to_f32(ptr[4])); ret.v1 = vec_insert(nsimd_u16_to_f32(ptr[5]), ret.v1, 1); ret.v1 = vec_insert(nsimd_u16_to_f32(ptr[6]), ret.v1, 2); ret.v1 = vec_insert(nsimd_u16_to_f32(ptr[7]), ret.v1, 3); return ret;'''.format(**fmtspec) else: ret = '''nsimd_{simd_ext}_vf16x{deg} ret; u16 *ptr = (u16 *){in0}; '''.format(deg=deg, **fmtspec) for i in range(0, deg): for k in range(0, 2): ret += 'ret.v{}.v{} = vec_splats(' \ 'nsimd_u16_to_f32(ptr[{}]));\n'. 
\ format(i, k, i + k * 4 * deg) for j in range(1, 4): ret += 'ret.v{i}.v{k} = vec_insert(nsimd_u16_to_f32(' \ 'ptr[{o}]), ret.v{i}.v{k}, {j});\n'. \ format(i=i, k=k, j=j, o=i + k * 4 * deg + j * deg) ret += 'return ret;' return ret if deg == 1: if aligned: return 'return vec_ld(0, {in0});'.format(**fmtspec) else: return 'return *({ppc_typ}*){in0};'. \ format(ppc_typ=native_type(typ), **fmtspec) # From here deg >= 2 if aligned: load = 'nsimd_{simd_ext}_v{typ}x{deg} ret;\n'. \ format(deg=deg, **fmtspec) + \ '\n'.join( 'nsimd_{simd_ext}_v{typ} in{i} = vec_ld({o}, {in0});'. \ format(i=i, o=i * 16, **fmtspec) for i in range(deg)) else: load = \ 'nsimd_{simd_ext}_v{typ}x{deg} ret;\n'. \ format(deg=deg, **fmtspec) + \ '\n'.join( 'nsimd_{simd_ext}_v{typ} in{i} = *(({ppc_typ}*){in0} + {i});'. \ format(i=i, ppc_typ=native_type(typ), **fmtspec) \ for i in range(0, deg)) if deg == 2: return '''{load} ret = nsimd_unzip_{simd_ext}_{typ}(in0, in1); return ret;'''.format(load=load, **fmtspec) elif deg == 3: if typ in ['i32', 'u32', 'f32']: return \ '''__vector unsigned char perm1 = NSIMD_PERMUTE_MASK_32( 0, 3, 6, 0); {load} nsimd_{simd_ext}_v{typ} tmp0 = vec_perm(in0, in1, perm1); nsimd_{simd_ext}_v{typ} tmp1 = vec_perm(in1, in2, perm1); nsimd_{simd_ext}_v{typ} tmp2 = vec_perm(in2, in0, perm1); __vector unsigned char perm2 = NSIMD_PERMUTE_MASK_32( 0, 1, 2, 5); __vector unsigned char perm3 = NSIMD_PERMUTE_MASK_32( 5, 0, 1, 2); __vector unsigned char perm4 = NSIMD_PERMUTE_MASK_32( 2, 5, 0, 1); ret.v0 = vec_perm(tmp0, in2, perm2); ret.v1 = vec_perm(tmp1, in0, perm3); ret.v2 = vec_perm(tmp2, in1, perm4); return ret;'''.format(load=load, **fmtspec) elif typ in ['i16', 'u16']: return \ '''{load} __vector unsigned char permRAB = NSIMD_PERMUTE_MASK_16( 0, 3, 6, 9, 12, 15, 0, 0); __vector unsigned char permRDC = NSIMD_PERMUTE_MASK_16( 0, 1, 2, 3, 4, 5, 10, 13); nsimd_{simd_ext}_v{typ} tmp0 = vec_perm(in0, in1, permRAB); ret.v0 = vec_perm(tmp0, in2, permRDC); __vector unsigned char permGAB 
= NSIMD_PERMUTE_MASK_16( 1, 4, 7, 10, 13, 0, 0, 0); __vector unsigned char permGEC = NSIMD_PERMUTE_MASK_16( 0, 1, 2, 3, 4, 8, 11, 14); nsimd_{simd_ext}_v{typ} tmp1 = vec_perm(in0, in1, permGAB); ret.v1 = vec_perm(tmp1, in2, permGEC); __vector unsigned char permBAB = NSIMD_PERMUTE_MASK_16( 2, 5, 8, 11, 14, 0, 0, 0); __vector unsigned char permBFC = NSIMD_PERMUTE_MASK_16( 0, 1, 2, 3, 4, 9, 12, 15); nsimd_{simd_ext}_v{typ} tmp2 = vec_perm(in0, in1, permBAB); ret.v2 = vec_perm(tmp2, in2, permBFC); return ret;'''.format(load=load, **fmtspec) elif typ in ['i8', 'u8']: return \ '''{load} __vector unsigned char permRAB = NSIMD_PERMUTE_MASK_8( 0, 3, 6, 9, 12, 15, 18, 21, 24, 27, 30, 0, 0, 0, 0, 0); __vector unsigned char permRDC = NSIMD_PERMUTE_MASK_8( 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 17, 20, 23, 26, 29); nsimd_{simd_ext}_v{typ} tmp0 = vec_perm(in0, in1, permRAB); ret.v0 = vec_perm(tmp0, in2, permRDC); __vector unsigned char permGAB = NSIMD_PERMUTE_MASK_8( 1, 4, 7, 10, 13, 16, 19, 22, 25, 28, 31, 0, 0, 0, 0, 0); __vector unsigned char permGEC = NSIMD_PERMUTE_MASK_8( 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 18, 21, 24, 27, 30); nsimd_{simd_ext}_v{typ} tmp1 = vec_perm(in0, in1, permGAB); ret.v1 = vec_perm(tmp1, in2, permGEC); __vector unsigned char permBAB = NSIMD_PERMUTE_MASK_8( 2, 5, 8, 11, 14, 17, 20, 23, 26, 29, 0, 0, 0, 0, 0, 0); __vector unsigned char permBFC = NSIMD_PERMUTE_MASK_8( 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 16, 19, 22, 25, 28, 31); nsimd_{simd_ext}_v{typ} tmp2 = vec_perm(in0, in1, permBAB); ret.v2 = vec_perm(tmp2, in2, permBFC); return ret;'''.format(load=load, **fmtspec) else: if typ in ['i32', 'u32', 'f32']: return \ '''{load} nsimd_{simd_ext}_v{typ} tmp0 = vec_mergeh(in0, in2); nsimd_{simd_ext}_v{typ} tmp1 = vec_mergel(in0, in2); nsimd_{simd_ext}_v{typ} tmp2 = vec_mergeh(in1, in3); nsimd_{simd_ext}_v{typ} tmp3 = vec_mergel(in1, in3); ret.v0 = vec_mergeh(tmp0, tmp2); ret.v1 = vec_mergel(tmp0, tmp2); ret.v2 = vec_mergeh(tmp1, tmp3); ret.v3 = vec_mergel(tmp1, tmp3); 
# -----------------------------------------------------------------------------
# Stores of degree 1, 2, 3 and 4

def store1234(simd_ext, typ, deg, aligned):
    '''Emit C code storing `deg` interleaved vectors of `typ` to `in0`.

    64-bit types and f16 are emulated (scalar stores); other types build
    interleaved vectors ret0..ret{deg-1} with vec_perm/vec_merge and store
    them with vec_st (aligned) or a plain pointer store (unaligned).'''
    if typ in ['f64', 'i64', 'u64']:
        # vmx: the "vector" is a struct of two scalars (v0/v1);
        # vsx: a native 2-lane vector, read back with vec_extract.
        if simd_ext == 'vmx':
            first = ['{}[{}] = {}.v0;'.
                     format(common.in0, d, common.get_arg(d + 1))
                     for d in range(deg)]
            second = ['{}[{}] = {}.v1;'.
                      format(common.in0, d + deg, common.get_arg(d + 1))
                      for d in range(deg)]
        else:
            first = ['{}[{}] = vec_extract({}, 0);'.
                     format(common.in0, d, common.get_arg(d + 1))
                     for d in range(deg)]
            second = ['{}[{}] = vec_extract({}, 1);'.
                      format(common.in0, d + deg, common.get_arg(d + 1))
                      for d in range(deg)]
        return '\n'.join(first) + '\n' + '\n'.join(second)
    if typ == 'f16':
        # f16 vectors are emulated with two f32 vectors (v0/v1): convert
        # each lane back to u16 bits and store scalar by scalar,
        # interleaving on the fly when deg >= 2.
        if deg == 1:
            return \
            '''u16 *ptr = (u16 *){in0};
               ptr[0] = nsimd_f32_to_u16(vec_extract({in1}.v0, 0));
               ptr[1] = nsimd_f32_to_u16(vec_extract({in1}.v0, 1));
               ptr[2] = nsimd_f32_to_u16(vec_extract({in1}.v0, 2));
               ptr[3] = nsimd_f32_to_u16(vec_extract({in1}.v0, 3));
               ptr[4] = nsimd_f32_to_u16(vec_extract({in1}.v1, 0));
               ptr[5] = nsimd_f32_to_u16(vec_extract({in1}.v1, 1));
               ptr[6] = nsimd_f32_to_u16(vec_extract({in1}.v1, 2));
               ptr[7] = nsimd_f32_to_u16(vec_extract({in1}.v1, 3));'''. \
               format(**fmtspec)
        code = 'u16 *ptr = (u16 *){in0};\n'.format(**fmtspec)
        for d in range(deg):
            for half in range(2):
                for lane in range(4):
                    code += 'ptr[{o}] = nsimd_f32_to_u16(' \
                            'vec_extract({a}.v{k}, {j}));\n'. \
                            format(a=common.get_arg(d + 1), j=lane, k=half,
                                   o=d + half * 4 * deg + lane * deg,
                                   **fmtspec)
        return code
    if deg == 1:
        if aligned:
            return 'vec_st({in1}, 0, {in0});'.format(**fmtspec)
        return '*({ppc_typ} *){in0} = {in1};'. \
               format(ppc_typ=native_type(typ), **fmtspec)
    # From here deg >= 2: `store` writes the interleaved ret0..ret{deg-1}.
    if aligned:
        store = '\n'.join('vec_st(ret{i}, {o}, {in0});'.
                          format(i=d, o=d * 16, **fmtspec)
                          for d in range(deg))
    else:
        store = '\n'.join('*({ppc_typ} *)({in0} + {o}) = ret{i};'.
                          format(o=d * get_len(typ),
                                 ppc_typ=native_type(typ), i=d, **fmtspec)
                          for d in range(deg))
    if deg == 2:
        return \
        '''nsimd_{simd_ext}_v{typ} ret0 = vec_mergeh({in1}, {in2});
           nsimd_{simd_ext}_v{typ} ret1 = vec_mergel({in1}, {in2});
           {store}'''.format(store=store, **fmtspec)
    elif deg == 3:
        if typ in ['i32', 'u32', 'f32']:
            return \
            '''__vector unsigned char perm1 = NSIMD_PERMUTE_MASK_32(
                   0, 2, 4, 6);
               __vector unsigned char perm2 = NSIMD_PERMUTE_MASK_32(
                   0, 2, 5, 7);
               __vector unsigned char perm3 = NSIMD_PERMUTE_MASK_32(
                   1, 3, 5, 7);
               nsimd_{simd_ext}_v{typ} tmp0 = vec_perm({in1}, {in2}, perm1);
               nsimd_{simd_ext}_v{typ} tmp1 = vec_perm({in3}, {in1}, perm2);
               nsimd_{simd_ext}_v{typ} tmp2 = vec_perm({in2}, {in3}, perm3);
               nsimd_{simd_ext}_v{typ} ret0 = vec_perm(tmp0, tmp1, perm1);
               nsimd_{simd_ext}_v{typ} ret1 = vec_perm(tmp2, tmp0, perm2);
               nsimd_{simd_ext}_v{typ} ret2 = vec_perm(tmp1, tmp2, perm3);
               {store}'''.format(store=store, **fmtspec)
        elif typ in ['i16', 'u16']:
            return \
            '''__vector unsigned char permARG = NSIMD_PERMUTE_MASK_16(
                   0, 8, 0, 1, 9, 0, 2, 10);
               __vector unsigned char permAXB = NSIMD_PERMUTE_MASK_16(
                   0, 1, 8, 3, 4, 9, 6, 7);
               nsimd_{simd_ext}_v{typ} tmp0 = vec_perm({in1}, {in2}, permARG);
               nsimd_{simd_ext}_v{typ} ret0 = vec_perm(tmp0, {in3}, permAXB);
               __vector unsigned char permBRG = NSIMD_PERMUTE_MASK_16(
                   0, 3, 11, 0, 4, 12, 0, 5);
               __vector unsigned char permBYB = NSIMD_PERMUTE_MASK_16(
                   10, 1, 2, 11, 4, 5, 12, 7);
               nsimd_{simd_ext}_v{typ} tmp1 = vec_perm({in1}, {in2}, permBRG);
               nsimd_{simd_ext}_v{typ} ret1 = vec_perm(tmp1, {in3}, permBYB);
               __vector unsigned char permCRG = NSIMD_PERMUTE_MASK_16(
                   13, 0, 6, 14, 0, 7, 15, 0);
               __vector unsigned char permCZB = NSIMD_PERMUTE_MASK_16(
                   0, 13, 2, 3, 14, 5, 6, 15);
               nsimd_{simd_ext}_v{typ} tmp2 = vec_perm({in1}, {in2}, permCRG);
               nsimd_{simd_ext}_v{typ} ret2 = vec_perm(tmp2, {in3}, permCZB);
               {store}'''.format(store=store, **fmtspec)
        elif typ in ['i8', 'u8']:
            return \
            '''__vector unsigned char mARG = NSIMD_PERMUTE_MASK_8(
                   0, 16, 0, 1, 17, 0, 2, 18, 0, 3, 19, 0, 4, 20, 0, 5);
               __vector unsigned char mAXB = NSIMD_PERMUTE_MASK_8(
                   0, 1, 16, 3, 4, 17, 6, 7, 18, 9, 10, 19, 12, 13, 20, 15);
               nsimd_{simd_ext}_v{typ} tmp0 = vec_perm({in1}, {in2}, mARG);
               nsimd_{simd_ext}_v{typ} ret0 = vec_perm(tmp0, {in3}, mAXB);
               __vector unsigned char mBRG = NSIMD_PERMUTE_MASK_8(
                   21, 0, 6, 22, 0, 7, 23, 0, 8, 24, 0, 9, 25, 0, 10, 26);
               __vector unsigned char mBYB = NSIMD_PERMUTE_MASK_8(
                   0, 21, 2, 3, 22, 5, 6, 23, 8, 9, 24, 11, 12, 25, 14, 15);
               nsimd_{simd_ext}_v{typ} tmp1 = vec_perm({in1}, {in2}, mBRG);
               nsimd_{simd_ext}_v{typ} ret1 = vec_perm(tmp1, {in3}, mBYB);
               __vector unsigned char mCRG = NSIMD_PERMUTE_MASK_8(
                   0, 11, 27, 0, 12, 28, 0, 13, 29, 0, 14, 30, 0, 15, 31, 0);
               __vector unsigned char mCZB = NSIMD_PERMUTE_MASK_8(
                   26, 1, 2, 27, 4, 5, 28, 7, 8, 29, 10, 11, 30, 13, 14, 31);
               nsimd_{simd_ext}_v{typ} tmp2 = vec_perm({in1}, {in2}, mCRG);
               nsimd_{simd_ext}_v{typ} ret2 = vec_perm(tmp2, {in3}, mCZB);
               {store}'''.format(store=store, **fmtspec)
    else:
        # deg == 4: a two-level mergeh/mergel interleave works whatever the
        # lane width, so one template covers 8-, 16- and 32-bit types (the
        # original had three byte-identical branches).
        return \
        '''nsimd_{simd_ext}_v{typ} tmp0 = vec_mergeh({in1}, {in3});
           nsimd_{simd_ext}_v{typ} tmp1 = vec_mergel({in1}, {in3});
           nsimd_{simd_ext}_v{typ} tmp2 = vec_mergeh({in2}, {in4});
           nsimd_{simd_ext}_v{typ} tmp3 = vec_mergel({in2}, {in4});
           nsimd_{simd_ext}_v{typ} ret0 = vec_mergeh(tmp0, tmp2);
           nsimd_{simd_ext}_v{typ} ret1 = vec_mergel(tmp0, tmp2);
           nsimd_{simd_ext}_v{typ} ret2 = vec_mergeh(tmp1, tmp3);
           nsimd_{simd_ext}_v{typ} ret3 = vec_mergel(tmp1, tmp3);
           {store}'''.format(store=store, **fmtspec)

# -----------------------------------------------------------------------------
# Length

def len1(simd_ext, typ):
    # Registers are 128 bits wide whatever the type.
    return 'return {};'.format(128 // int(typ[1:]))

# -----------------------------------------------------------------------------
# Other helper functions

def simple_op2(op, simd_ext, typ):
    '''Binary operator whose intrinsic is literally vec_<op>.'''
    if has_to_be_emulated(simd_ext, typ):
        return emulation_code(op, simd_ext, typ, ['v', 'v', 'v'])
    return 'return vec_{op}({in0}, {in1});'.format(op=op, **fmtspec)

# Binary operators: and, or, xor, andnot
def binary_op2(op, simd_ext, typ):
    if has_to_be_emulated(simd_ext, typ):
        return emulation_code(op, simd_ext, typ, ['v', 'v', 'v'])
    # Map nsimd names to AltiVec names (andnot is "and with complement").
    ppcop = {'orb': 'or', 'xorb': 'xor', 'andb': 'and', 'andnotb': 'andc'}
    return 'return vec_{op}({in0}, {in1});'.format(op=ppcop[op], **fmtspec)

# Logical operators: and, or, xor, andnot
def logical_op2(op, simd_ext, typ):
    if has_to_be_emulated(simd_ext, typ):
        return emulation_code(op, simd_ext, typ, ['l', 'l', 'l'])
    ppcop = {'orl': 'or', 'xorl': 'xor', 'andl': 'and', 'andnotl': 'andc'}
    return 'return vec_{op}({in0}, {in1});'.format(op=ppcop[op], **fmtspec)

# -----------------------------------------------------------------------------

def div2(simd_ext, typ):
    '''Division: native for floats, lane-by-lane scalar for integers
    (AltiVec has no integer division).'''
    if has_to_be_emulated(simd_ext, typ):
        return emulation_code('div', simd_ext, typ, ['v', 'v', 'v'])
    elif typ in common.ftypes:
        return 'return vec_div({in0}, {in1});'.format(**fmtspec)
    elif typ in common.iutypes:
        # Lane 0 is filled by vec_splats, so inserts start at lane 1
        # (inserting lane 0 again would be redundant).
        head = '''nsimd_{simd_ext}_v{typ} ret;
                  ret = vec_splats(({typ})(vec_extract({in0}, 0) /
                                           vec_extract({in1}, 0)));
                  '''.format(**fmtspec)
        inserts = '\n'.join(
            '''ret = vec_insert(({typ})(vec_extract({in0}, {i}) /
                                vec_extract({in1}, {i})), ret, {i});'''.
            format(i=lane, **fmtspec) for lane in range(1, get_len(typ)))
        return head + inserts + '\nreturn ret;'
# -----------------------------------------------------------------------------

def not1(simd_ext, typ):
    '''Bitwise NOT, implemented as vec_nor(x, x).'''
    if has_to_be_emulated(simd_ext, typ):
        return emulation_code('notb', simd_ext, typ, ['v', 'v'])
    return 'return vec_nor({in0}, {in0});'.format(**fmtspec)

# -----------------------------------------------------------------------------

def lnot1(simd_ext, typ):
    '''Logical NOT on masks, also vec_nor(x, x).'''
    if has_to_be_emulated(simd_ext, typ):
        return emulation_code('notl', simd_ext, typ, ['l', 'l'])
    return 'return vec_nor({in0}, {in0});'.format(**fmtspec)

# -----------------------------------------------------------------------------

def sqrt1(simd_ext, typ):
    if has_to_be_emulated(simd_ext, typ):
        return emulation_code('sqrt', simd_ext, typ, ['v', 'v'])
    return 'return vec_sqrt({in0});'.format(**fmtspec)

# -----------------------------------------------------------------------------

def shift2(op, simd_ext, typ):
    '''Shifts by a scalar count: splat the count then use sl/sr/sra.'''
    if has_to_be_emulated(simd_ext, typ):
        return emulation_code(op, simd_ext, typ, ['v', 'v', 'p'])
    ppcop = {'shl': 'sl', 'shr': 'sr', 'shra': 'sra'}[op]
    return ('return vec_{ppcop}({in0}, '
            'vec_splats((u{typnbits}){in1}));').format(ppcop=ppcop, **fmtspec)

# -----------------------------------------------------------------------------

def set1(simd_ext, typ):
    '''Broadcast a scalar to all lanes.'''
    if typ == 'f16':
        # f16 is emulated as two f32 vectors: convert once, splat twice.
        return '''nsimd_{simd_ext}_vf16 ret;
                  f32 tmp = nsimd_f16_to_f32({in0});
                  ret.v0 = vec_splats(tmp);
                  ret.v1 = ret.v0;
                  return ret;'''.format(**fmtspec)
    elif has_to_be_emulated(simd_ext, typ):
        return '''nsimd_{simd_ext}_v{typ} ret;
                  ret.v0 = {in0};
                  ret.v1 = {in0};
                  return ret;'''.format(**fmtspec)
    else:
        return 'return vec_splats({in0});'.format(**fmtspec)

# -----------------------------------------------------------------------------

def lset1(simd_ext, typ):
    '''Broadcast a boolean to a mask (-1 for true, 0 for false).'''
    if typ == 'f16':
        return \
        '''nsimd_{simd_ext}_vlf16 ret;
           ret.v0 = (__vector __bool int)vec_splats((u32)({in0} ? -1 : 0));
           ret.v1 = ret.v0;
           return ret;'''.format(**fmtspec)
    elif has_to_be_emulated(simd_ext, typ):
        return '''nsimd_{simd_ext}_vl{typ} ret;
                  ret.v0 = (u64)({in0} ? -1 : 0);
                  ret.v1 = (u64)({in0} ? -1 : 0);
                  return ret;'''.format(**fmtspec)
    else:
        return '''if ({in0}) {{
                    return ({ppc_typ})vec_splats((u{typnbits})-1);
                  }} else {{
                    return {lzeros};
                  }}'''.format(ppc_typ=native_typel(typ), **fmtspec)

# -----------------------------------------------------------------------------

def cmp2(op, simd_ext, typ):
    '''Comparisons returning masks; integer "ne" is emulated by negating
    vec_cmpeq, floats go through scalar emulation.'''
    if has_to_be_emulated(simd_ext, typ):
        return emulation_code(op, simd_ext, typ, ['l', 'v', 'v'])
    elif typ in common.iutypes:
        if op == 'ne':
            # There is no vec_cmpne: compute eq then NOT it.
            return '''nsimd_{simd_ext}_vl{typ} tmp;
                      tmp = vec_cmpeq({in0}, {in1});
                      return vec_nor(tmp, tmp);'''.format(op=op, **fmtspec)
        return 'return vec_cmp{op}({in0}, {in1});'.format(op=op, **fmtspec)
    else:
        return emulate_with_scalar(op, simd_ext, typ, ['l', 'v', 'v'])

# -----------------------------------------------------------------------------

def if_else3(simd_ext, typ):
    '''Lane-wise select: vec_sel picks from in1 where mask in0 is set.'''
    if typ == 'f16':
        return emulate_f16('if_else1', simd_ext, ['v', 'l', 'v', 'v'])
    elif has_to_be_emulated(simd_ext, typ):
        return '''nsimd_{simd_ext}_v{typ} ret;
                  ret.v0 = ({in0}.v0 ? {in1}.v0 : {in2}.v0);
                  ret.v1 = ({in0}.v1 ? {in1}.v1 : {in2}.v1);
                  return ret;'''.format(**fmtspec)
    return 'return vec_sel({in2}, {in1}, {in0});'.format(**fmtspec)
# -----------------------------------------------------------------------------

def minmax2(op, simd_ext, typ):
    '''min/max map directly to vec_min/vec_max.'''
    if has_to_be_emulated(simd_ext, typ):
        return emulation_code(op, simd_ext, typ, ['v', 'v', 'v'])
    return 'return vec_{op}({in0}, {in1});'.format(op=op, **fmtspec)

# -----------------------------------------------------------------------------

def abs1(simd_ext, typ):
    '''abs is the identity on unsigned types, vec_abs otherwise.'''
    if typ in common.utypes:
        return 'return {in0};'.format(**fmtspec)
    elif has_to_be_emulated(simd_ext, typ):
        return emulation_code('abs', simd_ext, typ, ['v', 'v'])
    return 'return vec_abs({in0});'.format(**fmtspec)

# -----------------------------------------------------------------------------

def round1(op, simd_ext, typ):
    '''Rounding operators: identity on integers, scalar emulation for
    round_to_even, native vec_trunc/vec_ceil/vec_floor otherwise.'''
    if typ in common.iutypes:
        return 'return {in0};'.format(**fmtspec)
    elif has_to_be_emulated(simd_ext, typ):
        return emulation_code(op, simd_ext, typ, ['v', 'v'])
    if op == 'round_to_even':
        return emulate_with_scalar('round_to_even', simd_ext, typ, ['v', 'v'])
    # Remaining ops (trunc, ceil, floor) keep their name in AltiVec.
    return 'return vec_{op}({in0});'.format(op=op, **fmtspec)

# -----------------------------------------------------------------------------

def fma(op, simd_ext, typ):
    '''Fused multiply ops; integers are synthesized from mul/add/sub,
    floats use the native madd/msub/nmadd/nmsub.'''
    if has_to_be_emulated(simd_ext, typ):
        return emulation_code(op, simd_ext, typ, ['v', 'v', 'v', 'v'])
    elif typ in common.iutypes:
        if op == 'fma':
            return \
            'return vec_add(vec_mul({in0}, {in1}), {in2});'.format(**fmtspec)
        elif op == 'fms':
            return \
            'return vec_sub(vec_mul({in0}, {in1}), {in2});'.format(**fmtspec)
        elif op == 'fnma':
            return \
            'return vec_sub({in2}, vec_mul({in0}, {in1}));'.format(**fmtspec)
        elif op == 'fnms':
            return '''return vec_sub(nsimd_neg_{simd_ext}_{typ}({in2}),
                                     vec_mul({in0}, {in1}));'''. \
                   format(**fmtspec)
    elif typ in common.ftypes:
        # Note: fnma = c - a*b = vec_nmsub, fnms = -(a*b + c) = vec_nmadd.
        ppcop = {'fma': 'vec_madd', 'fms': 'vec_msub',
                 'fnms': 'vec_nmadd', 'fnma': 'vec_nmsub'}
        return 'return {ppcop}({in0}, {in1}, {in2});'. \
               format(ppcop=ppcop[op], **fmtspec)

# -----------------------------------------------------------------------------

def neg1(simd_ext, typ):
    '''Negation: vec_neg where available, 0 - x for unsigned types.'''
    if has_to_be_emulated(simd_ext, typ):
        return emulation_code('neg', simd_ext, typ, ['v', 'v'])
    elif typ in common.itypes or typ in common.ftypes:
        return 'return vec_neg({in0});'.format(**fmtspec)
    else:
        return 'return vec_sub({zeros}, {in0});'.format(**fmtspec)

# -----------------------------------------------------------------------------

def recs1(op, simd_ext, typ):
    '''Reciprocal family: exact rec via division, rec8/rec11 via the
    vec_re estimate, rsqrt8/rsqrt11 via vec_rsqrte.'''
    if has_to_be_emulated(simd_ext, typ):
        return emulation_code(op, simd_ext, typ, ['v', 'v'])
    elif op == 'rec':
        return 'return vec_div(vec_splats(({typ})1), {in0});'. \
               format(**fmtspec)
    elif op in ['rec8', 'rec11']:
        return 'return vec_re({in0});'.format(**fmtspec)
    elif op in ['rsqrt8', 'rsqrt11']:
        return 'return vec_rsqrte({in0});'.format(**fmtspec)

# -----------------------------------------------------------------------------

def loadl(aligned, simd_ext, typ):
    '''Load of logicals: load values then compare against zero.'''
    return \
    '''/* This can surely be improved but it is not our priority. */
       return nsimd_notl_{simd_ext}_{typ}(nsimd_eq_{simd_ext}_{typ}(
                  nsimd_load{align}_{simd_ext}_{typ}(
                      {in0}), nsimd_set1_{simd_ext}_{typ}({zero})));'''. \
    format(align='a' if aligned else 'u',
           zero='nsimd_f32_to_f16(0.0f)' if typ == 'f16'
                else '({})0'.format(typ), **fmtspec)

# -----------------------------------------------------------------------------

def storel(aligned, simd_ext, typ):
    '''Store of logicals: materialize 1/0 values then store them.'''
    return \
    '''/* This can surely be improved but it is not our priority. */
       nsimd_store{align}_{simd_ext}_{typ}({in0},
           nsimd_if_else1_{simd_ext}_{typ}({in1},
               nsimd_set1_{simd_ext}_{typ}({one}),
               nsimd_set1_{simd_ext}_{typ}({zero})));'''. \
    format(align='a' if aligned else 'u',
           one='nsimd_f32_to_f16(1.0f)' if typ == 'f16'
               else '({})1'.format(typ),
           zero='nsimd_f32_to_f16(0.0f)' if typ == 'f16'
                else '({})0'.format(typ), **fmtspec)
# -----------------------------------------------------------------------------

def allany1(op, simd_ext, typ):
    '''all/any reductions on masks via vec_all_ne/vec_any_ne.'''
    combine = '&&' if op == 'all' else '||'
    if typ == 'f16':
        return \
        '''return nsimd_{op}_{simd_ext}_f32({in0}.v0) {binop}
                  nsimd_{op}_{simd_ext}_f32({in0}.v1);'''. \
        format(op=op, binop=combine, **fmtspec)
    elif has_to_be_emulated(simd_ext, typ):
        return 'return {in0}.v0 {binop} {in0}.v1;'. \
               format(binop=combine, **fmtspec)
    return 'return vec_{op}_ne({in0}, ({lzeros}));'.format(op=op, **fmtspec)

# -----------------------------------------------------------------------------

def nbtrue1(simd_ext, typ):
    '''Count lanes whose mask is set.'''
    if typ == 'f16':
        return \
        '''return nbsimd_placeholder'''  # never emitted; see below
    return None
# -----------------------------------------------------------------------------

def convert1(simd_ext, from_typ, to_typ):
    '''Same-size value conversion between from_typ and to_typ.'''
    if from_typ == to_typ:
        return 'return {in0};'.format(**fmtspec)
    elif from_typ == 'f16' and to_typ == 'u16':
        return \
        '''return vec_pack((__vector unsigned int)vec_ctu({in0}.v0, 0),
                           (__vector unsigned int)vec_ctu({in0}.v1, 0));'''. \
        format(**fmtspec)
    elif from_typ == 'f16' and to_typ == 'i16':
        return \
        '''return vec_pack((__vector signed int)vec_cts({in0}.v0, 0),
                           (__vector signed int)vec_cts({in0}.v1, 0));'''. \
        format(**fmtspec)
    elif from_typ == 'u16' and to_typ == 'f16':
        return \
        '''nsimd_{simd_ext}_vf16 ret;
           /* Unpack extends the sign, we need to remove the extra 1s */
           __vector int mask = vec_splats((int)0xFFFF);
           ret.v0 = vec_ctf(vec_and(vec_unpackh((__vector short){in0}),
                                    mask), 0);
           ret.v1 = vec_ctf(vec_and(vec_unpackl((__vector short){in0}),
                                    mask), 0);
           return ret;'''.format(**fmtspec)
    elif from_typ == 'i16' and to_typ == 'f16':
        return '''nsimd_{simd_ext}_vf16 ret;
                  ret.v0 = vec_ctf(vec_unpackh({in0}), 0);
                  ret.v1 = vec_ctf(vec_unpackl({in0}), 0);
                  return ret;'''.format(**fmtspec)
    elif has_to_be_emulated(simd_ext, to_typ):
        return '''nsimd_{simd_ext}_v{to_typ} ret;
                  ret.v0 = nsimd_scalar_cvt_{to_typ}_{from_typ}({in0}.v0);
                  ret.v1 = nsimd_scalar_cvt_{to_typ}_{from_typ}({in0}.v1);
                  return ret;'''.format(**fmtspec)
    elif from_typ in ['f32', 'f64'] and to_typ in ['i32', 'i64']:
        return 'return vec_cts({in0}, 0);'.format(**fmtspec)
    elif from_typ in ['f32', 'f64'] and to_typ in ['u32', 'u64']:
        return 'return vec_ctu({in0}, 0);'.format(**fmtspec)
    elif from_typ in ['i32', 'i64', 'u32', 'u64'] and \
         to_typ in ['f32', 'f64']:
        return 'return vec_ctf({in0}, 0);'.format(**fmtspec)
    elif from_typ in common.iutypes and to_typ in common.iutypes:
        # Same-size integer conversions are a plain cast.
        return 'return ({ppctyp}){in0};'. \
               format(ppctyp=native_type(to_typ), **fmtspec)

# -----------------------------------------------------------------------------

def reinterpret1(simd_ext, from_typ, to_typ):
    '''Same-size bit reinterpretation; f16 cases go lane by lane through
    the scalar helpers.'''
    if from_typ == to_typ:
        return 'return {in0};'.format(**fmtspec)
    elif simd_ext == 'vmx' and from_typ in ['f64', 'i64', 'u64']:
        return \
        '''nsimd_{simd_ext}_v{to_typ} ret;
           ret.v0 = nsimd_scalar_reinterpret_{to_typ}_{from_typ}({in0}.v0);
           ret.v1 = nsimd_scalar_reinterpret_{to_typ}_{from_typ}({in0}.v1);
           return ret;'''.format(**fmtspec)
    elif from_typ == 'f16' and to_typ == 'u16':
        return \
        '''nsimd_{simd_ext}_vu16 ret;
           ret = vec_splats(nsimd_f32_to_u16(vec_extract({in0}.v0, 0)));
           ret = vec_insert(nsimd_f32_to_u16(vec_extract({in0}.v0, 1)),
                            ret, 1);
           ret = vec_insert(nsimd_f32_to_u16(vec_extract({in0}.v0, 2)),
                            ret, 2);
           ret = vec_insert(nsimd_f32_to_u16(vec_extract({in0}.v0, 3)),
                            ret, 3);
           ret = vec_insert(nsimd_f32_to_u16(vec_extract({in0}.v1, 0)),
                            ret, 4);
           ret = vec_insert(nsimd_f32_to_u16(vec_extract({in0}.v1, 1)),
                            ret, 5);
           ret = vec_insert(nsimd_f32_to_u16(vec_extract({in0}.v1, 2)),
                            ret, 6);
           ret = vec_insert(nsimd_f32_to_u16(vec_extract({in0}.v1, 3)),
                            ret, 7);
           return ret;'''.format(**fmtspec)
    elif from_typ == 'f16' and to_typ == 'i16':
        return \
        '''nsimd_{simd_ext}_vi16 ret;
           ret = vec_splats(nsimd_scalar_reinterpret_i16_u16(
                     nsimd_f32_to_u16(vec_extract({in0}.v0, 0))));
           ret = vec_insert(nsimd_scalar_reinterpret_i16_u16(
                     nsimd_f32_to_u16(vec_extract({in0}.v0, 1))), ret, 1);
           ret = vec_insert(nsimd_scalar_reinterpret_i16_u16(
                     nsimd_f32_to_u16(vec_extract({in0}.v0, 2))), ret, 2);
           ret = vec_insert(nsimd_scalar_reinterpret_i16_u16(
                     nsimd_f32_to_u16(vec_extract({in0}.v0, 3))), ret, 3);
           ret = vec_insert(nsimd_scalar_reinterpret_i16_u16(
                     nsimd_f32_to_u16(vec_extract({in0}.v1, 0))), ret, 4);
           ret = vec_insert(nsimd_scalar_reinterpret_i16_u16(
                     nsimd_f32_to_u16(vec_extract({in0}.v1, 1))), ret, 5);
           ret = vec_insert(nsimd_scalar_reinterpret_i16_u16(
                     nsimd_f32_to_u16(vec_extract({in0}.v1, 2))), ret, 6);
           ret = vec_insert(nsimd_scalar_reinterpret_i16_u16(
                     nsimd_f32_to_u16(vec_extract({in0}.v1, 3))), ret, 7);
           return ret;'''.format(**fmtspec)
    elif from_typ == 'u16' and to_typ == 'f16':
        return \
        '''nsimd_{simd_ext}_vf16 ret;
           ret.v0 = vec_splats(nsimd_u16_to_f32(vec_extract({in0}, 0)));
           ret.v0 = vec_insert(nsimd_u16_to_f32(vec_extract({in0}, 1)),
                               ret.v0, 1);
           ret.v0 = vec_insert(nsimd_u16_to_f32(vec_extract({in0}, 2)),
                               ret.v0, 2);
           ret.v0 = vec_insert(nsimd_u16_to_f32(vec_extract({in0}, 3)),
                               ret.v0, 3);
           ret.v1 = vec_splats(nsimd_u16_to_f32(vec_extract({in0}, 4)));
           ret.v1 = vec_insert(nsimd_u16_to_f32(vec_extract({in0}, 5)),
                               ret.v1, 1);
           ret.v1 = vec_insert(nsimd_u16_to_f32(vec_extract({in0}, 6)),
                               ret.v1, 2);
           ret.v1 = vec_insert(nsimd_u16_to_f32(vec_extract({in0}, 7)),
                               ret.v1, 3);
           return ret;'''.format(**fmtspec)
    elif from_typ == 'i16' and to_typ == 'f16':
        return \
        '''nsimd_{simd_ext}_vf16 ret;
           ret.v0 = vec_splats(nsimd_u16_to_f32(
                        nsimd_scalar_reinterpret_u16_i16(
                            vec_extract({in0}, 0))));
           ret.v0 = vec_insert(nsimd_u16_to_f32(
                        nsimd_scalar_reinterpret_u16_i16(
                            vec_extract({in0}, 1))), ret.v0, 1);
           ret.v0 = vec_insert(nsimd_u16_to_f32(
                        nsimd_scalar_reinterpret_u16_i16(
                            vec_extract({in0}, 2))), ret.v0, 2);
           ret.v0 = vec_insert(nsimd_u16_to_f32(
                        nsimd_scalar_reinterpret_u16_i16(
                            vec_extract({in0}, 3))), ret.v0, 3);
           ret.v1 = vec_splats(nsimd_u16_to_f32(
                        nsimd_scalar_reinterpret_u16_i16(
                            vec_extract({in0}, 4))));
           ret.v1 = vec_insert(nsimd_u16_to_f32(
                        nsimd_scalar_reinterpret_u16_i16(
                            vec_extract({in0}, 5))), ret.v1, 1);
           ret.v1 = vec_insert(nsimd_u16_to_f32(
                        nsimd_scalar_reinterpret_u16_i16(
                            vec_extract({in0}, 6))), ret.v1, 2);
           ret.v1 = vec_insert(nsimd_u16_to_f32(
                        nsimd_scalar_reinterpret_u16_i16(
                            vec_extract({in0}, 7))), ret.v1, 3);
           return ret;'''.format(**fmtspec)
    else:
        # All other same-size reinterprets are a plain vector cast.
        return 'return ({ppc_typ}){in0};'. \
               format(ppc_typ=native_type(to_typ), **fmtspec)
# -----------------------------------------------------------------------------

def reverse1(simd_ext, typ):
    '''Reverse lane order with a byte permutation per element width.'''
    if typ == 'f16':
        return emulate_f16('reverse', simd_ext, ['v', 'v'])
    elif has_to_be_emulated(simd_ext, typ):
        # Emulated 2-lane type: just swap the two halves.
        return '''nsimd_{simd_ext}_v{typ} ret;
                  ret.v0 = {in0}.v1;
                  ret.v1 = {in0}.v0;
                  return ret;'''.format(**fmtspec)
    elif typ in ['i8', 'u8']:
        return '''return vec_perm({in0}, {in0}, (__vector unsigned char)
                      {{ 15, 14, 13, 12, 11, 10, 9, 8,
                         7, 6, 5, 4, 3, 2, 1, 0 }});'''.format(**fmtspec)
    elif typ in ['i16', 'u16']:
        return '''return vec_perm({in0}, {in0}, (__vector unsigned char)
                      {{ 14, 15, 12, 13, 10, 11, 8, 9,
                         6, 7, 4, 5, 2, 3, 0, 1 }});'''.format(**fmtspec)
    elif typ in ['i32', 'u32', 'f32']:
        return '''return vec_perm({in0}, {in0}, (__vector unsigned char)
                      {{ 12, 13, 14, 15, 8, 9, 10, 11,
                         4, 5, 6, 7, 0, 1, 2, 3 }});'''.format(**fmtspec)
    elif typ in ['f64', 'i64', 'u64']:
        return '''return vec_perm({in0}, {in0}, (__vector unsigned char)
                      {{ 8, 9, 10, 11, 12, 13, 14, 15,
                         0, 1, 2, 3, 4, 5, 6, 7 }});'''.format(**fmtspec)

# -----------------------------------------------------------------------------

def addv(simd_ext, typ):
    '''Horizontal add of all lanes, returned as a scalar.'''
    if typ == 'f16':
        return '''return nsimd_f32_to_f16(
                      nsimd_addv_{simd_ext}_f32({in0}.v0) +
                      nsimd_addv_{simd_ext}_f32({in0}.v1));'''. \
               format(**fmtspec)
    elif has_to_be_emulated(simd_ext, typ):
        return 'return {in0}.v0 + {in0}.v1;'.format(**fmtspec)
    terms = ' + '.join('vec_extract({in0}, {i})'.format(i=lane, **fmtspec)
                       for lane in range(get_len(typ)))
    return 'return ({})({});'.format(typ, terms)

# -----------------------------------------------------------------------------

def add_sub_s(op, simd_ext, typ):
    '''Saturated add/sub: floats never saturate (plain vec_add/vec_sub),
    64-bit integers go through the scalar helpers, the rest uses the
    native saturated intrinsics.'''
    if has_to_be_emulated(simd_ext, typ):
        return emulation_code(op, simd_ext, typ, ['v', 'v', 'v'])
    if typ in common.ftypes:
        # op is 'adds'/'subs': strip the trailing 's' for floats.
        return 'return vec_{op}({in0}, {in1});'.format(op=op[:-1], **fmtspec)
    elif typ in ['i64', 'u64']:
        return '''nsimd_{simd_ext}_v{typ} ret;
                  ret = vec_splats(nsimd_scalar_{op}_{typ}(
                            vec_extract({in0}, 0), vec_extract({in1}, 0)));
                  ret = vec_insert(nsimd_scalar_{op}_{typ}(
                            vec_extract({in0}, 1), vec_extract({in1}, 1)),
                            ret, 1);
                  return ret;'''.format(op=op, **fmtspec)
    return 'return vec_{op}({in0}, {in1});'.format(op=op, **fmtspec)

# -----------------------------------------------------------------------------

def upcvt1(simd_ext, from_typ, to_typ):
    '''Widening conversion: one input vector becomes two output vectors.'''
    if from_typ in ['i8', 'u8'] and to_typ == 'f16':
        # Fix: use the {in0} placeholder like every other branch instead of
        # hard-coding the argument name a0.
        return '''nsimd_{simd_ext}_vf16x2 ret;
                  nsimd_{simd_ext}_vi16x2 tmp;
                  tmp = nsimd_upcvt_{simd_ext}_i16_{from_typ}({in0});
                  ret.v0 = nsimd_cvt_{simd_ext}_f16_i16(tmp.v0);
                  ret.v1 = nsimd_cvt_{simd_ext}_f16_i16(tmp.v1);
                  return ret;'''.format(**fmtspec)
    elif from_typ == 'f16' and to_typ == 'f32':
        return '''nsimd_{simd_ext}_v{to_typ}x2 ret;
                  ret.v0 = {in0}.v0;
                  ret.v1 = {in0}.v1;
                  return ret;'''.format(**fmtspec)
    elif from_typ == 'f16' and to_typ in ['i32', 'u32']:
        sign = 'u' if to_typ[0] == 'u' else 's'
        return '''nsimd_{simd_ext}_v{to_typ}x2 ret;
                  ret.v0 = vec_ct{sign}({in0}.v0, 0);
                  ret.v1 = vec_ct{sign}({in0}.v1, 0);
                  return ret;'''.format(sign=sign, **fmtspec)
    elif from_typ == 'f32' and to_typ in ['f64', 'i64', 'u64']:
        if simd_ext == 'vmx':
            return '''nsimd_vmx_v{to_typ}x2 ret;
                      ret.v0.v0 = ({to_typ})vec_extract({in0}, 0);
                      ret.v0.v1 = ({to_typ})vec_extract({in0}, 1);
                      ret.v1.v0 = ({to_typ})vec_extract({in0}, 2);
                      ret.v1.v1 = ({to_typ})vec_extract({in0}, 3);
                      return ret;'''.format(**fmtspec)
        else:
            return \
            '''nsimd_vsx_v{to_typ}x2 ret;
               ret.v0 = vec_splats(({to_typ})vec_extract({in0}, 0));
               ret.v0 = vec_insert(({to_typ})vec_extract({in0}, 1),
                                   ret.v0, 1);
               ret.v1 = vec_splats(({to_typ})vec_extract({in0}, 2));
               ret.v1 = vec_insert(({to_typ})vec_extract({in0}, 3),
                                   ret.v1, 1);
               return ret;'''.format(**fmtspec)
    elif (from_typ in ['i16', 'u16'] and to_typ == 'f32') or \
         (from_typ in ['i32', 'u32'] and to_typ == 'f64'):
        # Widen to the same-signedness integer first, then convert.
        return '''nsimd_{simd_ext}_v{to_typ}x2 ret;
                  nsimd_{simd_ext}_v{sto_typ}x2 tmp;
                  tmp = nsimd_upcvt_{simd_ext}_{sto_typ}_{from_typ}({in0});
                  ret.v0 = nsimd_cvt_{simd_ext}_{to_typ}_{sto_typ}(tmp.v0);
                  ret.v1 = nsimd_cvt_{simd_ext}_{to_typ}_{sto_typ}(tmp.v1);
                  return ret;'''. \
               format(sto_typ=from_typ[0] + to_typ[1:], **fmtspec)
    elif from_typ in ['u8', 'u16']:
        # vec_unpack sign-extends: mask off the extra 1 bits afterwards.
        mask = '(i{})0x{}'.format(to_typ[1:], 'F' * (int(from_typ[1:]) // 4))
        ppc_sto_typ = native_type('i' + to_typ[1:])
        ppc_sfrom_typ = '({})'.format(native_type('i' + from_typ[1:]))
        ppc_to_typ = '({})'.format(native_type(to_typ)) \
                     if to_typ in common.utypes else ''
        return '''nsimd_{simd_ext}_v{to_typ}x2 ret;
                  {ppc_sto_typ} mask = vec_splats({mask});
                  ret.v0 = {ppc_to_typ}vec_and(
                               vec_unpackh({ppc_sfrom_typ}{in0}), mask);
                  ret.v1 = {ppc_to_typ}vec_and(
                               vec_unpackl({ppc_sfrom_typ}{in0}), mask);
                  return ret;'''. \
               format(mask=mask, ppc_sto_typ=ppc_sto_typ,
                      ppc_sfrom_typ=ppc_sfrom_typ, ppc_to_typ=ppc_to_typ,
                      **fmtspec)
    elif from_typ in ['i8', 'i16']:
        ppc_to_typ = '({})'.format(native_type(to_typ)) \
                     if to_typ in common.utypes else ''
        return '''nsimd_{simd_ext}_v{to_typ}x2 ret;
                  ret.v0 = {ppc_to_typ}vec_unpackh({in0});
                  ret.v1 = {ppc_to_typ}vec_unpackl({in0});
                  return ret;'''.format(ppc_to_typ=ppc_to_typ, **fmtspec)
    elif from_typ in ['i32', 'u32']:
        if simd_ext == 'vmx':
            return '''nsimd_vmx_v{to_typ}x2 ret;
                      ret.v0.v0 = ({to_typ})vec_extract({in0}, 0);
                      ret.v0.v1 = ({to_typ})vec_extract({in0}, 1);
                      ret.v1.v0 = ({to_typ})vec_extract({in0}, 2);
                      ret.v1.v1 = ({to_typ})vec_extract({in0}, 3);
                      return ret;'''.format(**fmtspec)
        else:
            return \
            '''nsimd_vsx_v{to_typ}x2 ret;
               ret.v0 = vec_splats(({to_typ})vec_extract({in0}, 0));
               ret.v0 = vec_insert(({to_typ})vec_extract({in0}, 1),
                                   ret.v0, 1);
               ret.v1 = vec_splats(({to_typ})vec_extract({in0}, 2));
               ret.v1 = vec_insert(({to_typ})vec_extract({in0}, 3),
                                   ret.v1, 1);
               return ret;'''.format(**fmtspec)
# -----------------------------------------------------------------------------

def downcvt1(simd_ext, from_typ, to_typ):
    '''Return the C body implementing the nsimd "downcvt" operator:
    narrow two input vectors of from_typ into one vector of to_typ.
    Relies on the module-global fmtspec set up by get_impl.'''
    if from_typ in ['f64', 'i64', 'u64']:
        # 64-bit lanes: on vmx the 64-bit "vector" is an emulated struct of
        # two scalars; on vsx it is a native 2-lane vector, so lanes are
        # moved one by one with vec_extract/vec_insert.
        if simd_ext == 'vmx':
            return '''nsimd_vmx_v{to_typ} ret;
                      ret = vec_splats(({to_typ}){in0}.v0);
                      ret = vec_insert(({to_typ}){in0}.v1, ret, 1);
                      ret = vec_insert(({to_typ}){in1}.v0, ret, 2);
                      ret = vec_insert(({to_typ}){in1}.v1, ret, 3);
                      return ret;'''.format(**fmtspec)
        else:
            return \
            '''nsimd_vsx_v{to_typ} ret;
               ret = vec_splats(({to_typ})vec_extract({in0}, 0));
               ret = vec_insert(({to_typ})vec_extract({in0}, 1), ret, 1);
               ret = vec_insert(({to_typ})vec_extract({in1}, 0), ret, 2);
               ret = vec_insert(({to_typ})vec_extract({in1}, 1), ret, 3);
               return ret;'''.format(**fmtspec)
    elif from_typ in common.iutypes and to_typ in common.iutypes:
        # Integer -> integer narrowing maps directly onto vec_pack; a cast
        # is only needed when signedness changes.
        return 'return {cast}vec_pack({in0}, {in1});'. \
               format(cast='({})'.format(native_type(to_typ)) \
                           if from_typ[0] != to_typ[0] else '', **fmtspec)
    elif from_typ == 'f32' and to_typ == 'f16':
        # f16 vectors are emulated as a pair of f32 vectors: just repack.
        return '''nsimd_{simd_ext}_vf16 ret;
                  ret.v0 = {in0};
                  ret.v1 = {in1};
                  return ret;'''.format(**fmtspec)
    elif from_typ == 'f32' and to_typ in common.iutypes:
        # Convert float to (un)signed int (vec_cts / vec_ctu), then pack.
        return 'return vec_pack(vec_ct{s}({in0}, 0), vec_ct{s}({in1}, 0));'. \
               format(s='s' if to_typ == 'i16' else 'u', **fmtspec)
    elif from_typ in common.iutypes and to_typ == 'f16':
        # Integer halves converted independently to the two f32 vectors of
        # the emulated f16 pair.
        return '''nsimd_{simd_ext}_vf16 ret;
                  ret.v0 = vec_ctf({in0}, 0);
                  ret.v1 = vec_ctf({in1}, 0);
                  return ret;'''.format(**fmtspec)
    elif from_typ == 'f16':
        # f16 (pair of f32 vectors) down to 8-bit integers: two levels of
        # vec_pack after the float-to-int conversions.
        return \
        '''return vec_pack(vec_pack(vec_ct{s}({in0}.v0, 0),
                                    vec_ct{s}({in0}.v1, 0)),
                           vec_pack(vec_ct{s}({in1}.v0, 0),
                                    vec_ct{s}({in1}.v1, 0)));'''. \
        format(s='s' if to_typ == 'i8' else 'u', **fmtspec)
# -----------------------------------------------------------------------------

def to_mask(simd_ext, typ):
    '''Return the C body converting a logical (mask) vector into a regular
    vector of typ by reinterpreting its bits.'''
    if typ == 'f16':
        # Emulated f16: reinterpret each of the two underlying f32 vectors.
        return '''nsimd_{simd_ext}_vf16 ret;
                  ret.v0 = (__vector float){in0}.v0;
                  ret.v1 = (__vector float){in0}.v1;
                  return ret;'''.format(**fmtspec)
    if simd_ext == 'vmx' and typ in ['f64', 'i64']:
        # vmx 64-bit types are emulated as scalar pairs; the logical holds
        # u64 values, so reinterpret each scalar to the destination type.
        return '''nsimd_{simd_ext}_v{typ} ret;
                  ret.v0 = nsimd_scalar_reinterpret_{typ}_u64({in0}.v0);
                  ret.v1 = nsimd_scalar_reinterpret_{typ}_u64({in0}.v1);
                  return ret;'''.format(**fmtspec)
    elif simd_ext == 'vmx' and typ == 'u64':
        # Already u64: a plain member-wise copy suffices.
        return '''nsimd_{simd_ext}_vu64 ret;
                  ret.v0 = {in0}.v0;
                  ret.v1 = {in0}.v1;
                  return ret;'''.format(**fmtspec)
    # Native vectors: a C-style cast to the native vector type reinterprets
    # the mask bits in place.
    return 'return ({ppc_typ}){in0};'. \
           format(ppc_typ=native_type(typ), **fmtspec)
# -----------------------------------------------------------------------------

def gather_linear(simd_ext, typ):
    '''Return the C body for gather_linear: load lanes from {in0} at a
    constant stride {in1} (lane k reads {in0}[k * {in1}]).'''
    if typ == 'f16':
        # Emulated f16: convert each strided f16 element to f32 and insert
        # it into the proper lane of the two f32 halves.
        return \
        '''nsimd_{simd_ext}_v{typ} ret;
           ret.v0 = vec_splats(nsimd_f16_to_f32({in0}[0]));
           ret.v0 = vec_insert(nsimd_f16_to_f32({in0}[{in1}]), ret.v0, 1);
           ret.v0 = vec_insert(nsimd_f16_to_f32({in0}[2 * {in1}]), ret.v0, 2);
           ret.v0 = vec_insert(nsimd_f16_to_f32({in0}[3 * {in1}]), ret.v0, 3);
           ret.v1 = vec_splats(nsimd_f16_to_f32({in0}[4 * {in1}]));
           ret.v1 = vec_insert(nsimd_f16_to_f32({in0}[5 * {in1}]), ret.v1, 1);
           ret.v1 = vec_insert(nsimd_f16_to_f32({in0}[6 * {in1}]), ret.v1, 2);
           ret.v1 = vec_insert(nsimd_f16_to_f32({in0}[7 * {in1}]), ret.v1, 3);
           return ret;'''.format(**fmtspec)
    elif has_to_be_emulated(simd_ext, typ):
        # Emulated 64-bit pair: two strided scalar loads.
        return '''nsimd_{simd_ext}_v{typ} ret;
                  ret.v0 = {in0}[0];
                  ret.v1 = {in0}[{in1}];
                  return ret;'''.format(**fmtspec)
    # Native vector: splat lane 0 then insert the remaining strided loads,
    # one statement per lane.
    return '''nsimd_{simd_ext}_v{typ} ret;
              ret = vec_splats({in0}[0]);
              '''.format(**fmtspec) + \
           '\n'.join('ret = vec_insert({in0}[{in1} * {i}], ret, {i});'. \
                     format(i=i, **fmtspec) for i in range(1, get_len(typ))) + \
           '\nreturn ret;'
# -----------------------------------------------------------------------------

def maskoz_load(oz, simd_ext, typ):
    '''Return the C body for masked loads.  oz == 'z' zeroes out masked-off
    lanes (maskz_load*); any other value takes them from the fallback
    vector {in2} (masko_load*).'''
    if typ == 'f16':
        # NOTE: two chained .format calls are intentional.  The first pass
        # substitutes the ozN fallback snippets (which may themselves
        # contain {in2}) plus fmtspec; the second pass resolves the {in2}
        # left inside those snippets.
        return \
        '''nsimd_{simd_ext}_vf16 ret;
           ret.v0 = vec_splats(0.0f);
           ret.v0 = vec_insert(vec_extract({in0}.v0, 0) ?
                        nsimd_f16_to_f32({in1}[0]) : {oz0}, ret.v0, 0);
           ret.v0 = vec_insert(vec_extract({in0}.v0, 1) ?
                        nsimd_f16_to_f32({in1}[1]) : {oz1}, ret.v0, 1);
           ret.v0 = vec_insert(vec_extract({in0}.v0, 2) ?
                        nsimd_f16_to_f32({in1}[2]) : {oz2}, ret.v0, 2);
           ret.v0 = vec_insert(vec_extract({in0}.v0, 3) ?
                        nsimd_f16_to_f32({in1}[3]) : {oz3}, ret.v0, 3);
           ret.v1 = ret.v0;
           ret.v1 = vec_insert(vec_extract({in0}.v1, 0) ?
                        nsimd_f16_to_f32({in1}[4]) : {oz4}, ret.v1, 0);
           ret.v1 = vec_insert(vec_extract({in0}.v1, 1) ?
                        nsimd_f16_to_f32({in1}[5]) : {oz5}, ret.v1, 1);
           ret.v1 = vec_insert(vec_extract({in0}.v1, 2) ?
                        nsimd_f16_to_f32({in1}[6]) : {oz6}, ret.v1, 2);
           ret.v1 = vec_insert(vec_extract({in0}.v1, 3) ?
                        nsimd_f16_to_f32({in1}[7]) : {oz7}, ret.v1, 3);
           return ret;'''. \
        format(oz0='0.0f' if oz == 'z' else 'vec_extract({in2}.v0, 0)',
               oz1='0.0f' if oz == 'z' else 'vec_extract({in2}.v0, 1)',
               oz2='0.0f' if oz == 'z' else 'vec_extract({in2}.v0, 2)',
               oz3='0.0f' if oz == 'z' else 'vec_extract({in2}.v0, 3)',
               oz4='0.0f' if oz == 'z' else 'vec_extract({in2}.v1, 0)',
               oz5='0.0f' if oz == 'z' else 'vec_extract({in2}.v1, 1)',
               oz6='0.0f' if oz == 'z' else 'vec_extract({in2}.v1, 2)',
               oz7='0.0f' if oz == 'z' else 'vec_extract({in2}.v1, 3)',
               **fmtspec).format(**fmtspec)
    elif has_to_be_emulated(simd_ext, typ):
        # Emulated 64-bit pair: plain C conditionals per scalar member.
        if oz == 'z':
            return '''nsimd_{simd_ext}_v{typ} ret;
                      ret.v0 = {in0}.v0 ? {in1}[0] : ({typ})0;
                      ret.v1 = {in0}.v1 ? {in1}[1] : ({typ})0;
                      return ret;'''.format(**fmtspec)
        else:
            return '''nsimd_{simd_ext}_v{typ} ret;
                      ret.v0 = {in0}.v0 ? {in1}[0] : {in2}.v0;
                      ret.v1 = {in0}.v1 ? {in1}[1] : {in2}.v1;
                      return ret;'''.format(**fmtspec)
    # Native vectors: start from zeros, then per lane either load from
    # memory or insert the fallback ('z' -> literal zero, 'o' -> {in2} lane).
    return 'nsimd_{simd_ext}_v{typ} ret = {zeros};\n'.format(**fmtspec) + \
           '\n'.join(
           '''if (vec_extract({in0}, {i})) {{
                ret = vec_insert({in1}[{i}], ret, {i});
              }} else {{
                ret = vec_insert({v}, ret, {i});
              }}'''.format(i=i, v='({})0'.format(typ) if oz == 'z' \
                                else 'vec_extract({in2}, {i})'. \
                                     format(i=i, **fmtspec), **fmtspec) \
           for i in range(get_len(typ))) + \
           '\nreturn ret;'
# -----------------------------------------------------------------------------

def get_impl(opts, func, simd_ext, from_typ, to_typ):
    '''Entry point of this platform module: return the C implementation of
    operator `func` for SIMD extension `simd_ext` on `from_typ` (and, for
    conversions, `to_typ`).

    Side effect: (re)initializes the module-global `fmtspec` dict consumed
    by every generator in this file.

    Raises ValueError on an unknown SIMD extension or type; returns
    common.NOT_IMPLEMENTED when `func` has no generator here.
    '''
    global fmtspec
    fmtspec = {
        'simd_ext': simd_ext,
        'typ': from_typ,
        'styp': get_type(opts, simd_ext, from_typ, to_typ),
        'from_typ': from_typ,
        'to_typ': to_typ,
        'in0': common.in0,
        'in1': common.in1,
        'in2': common.in2,
        'in3': common.in3,
        'in4': common.in4,
        'in5': common.in5,
        'zeros': 'vec_splats(({})0)'.format(from_typ),
        # Logical all-zeros constant; only meaningful for types backed by a
        # native vector, hence the emulation guard.
        'lzeros': '({})vec_splats((u{})0)'. \
                  format(native_typel(from_typ), from_typ[1:]) \
                  if not has_to_be_emulated(simd_ext, from_typ) else '',
        'typnbits': from_typ[1:]
    }
    # Dispatch table: operator name -> zero-argument thunk producing the C
    # body.  Thunks keep evaluation lazy (only the requested generator
    # runs) while avoiding the previous eval()-of-strings pattern.
    impls = {
        'loada': lambda: load1234(simd_ext, from_typ, 1, True),
        'load2a': lambda: load1234(simd_ext, from_typ, 2, True),
        'load3a': lambda: load1234(simd_ext, from_typ, 3, True),
        'load4a': lambda: load1234(simd_ext, from_typ, 4, True),
        'loadu': lambda: load1234(simd_ext, from_typ, 1, False),
        'load2u': lambda: load1234(simd_ext, from_typ, 2, False),
        'load3u': lambda: load1234(simd_ext, from_typ, 3, False),
        'load4u': lambda: load1234(simd_ext, from_typ, 4, False),
        'storea': lambda: store1234(simd_ext, from_typ, 1, True),
        'store2a': lambda: store1234(simd_ext, from_typ, 2, True),
        'store3a': lambda: store1234(simd_ext, from_typ, 3, True),
        'store4a': lambda: store1234(simd_ext, from_typ, 4, True),
        'storeu': lambda: store1234(simd_ext, from_typ, 1, False),
        'store2u': lambda: store1234(simd_ext, from_typ, 2, False),
        'store3u': lambda: store1234(simd_ext, from_typ, 3, False),
        'store4u': lambda: store1234(simd_ext, from_typ, 4, False),
        'andb': lambda: binary_op2("andb", simd_ext, from_typ),
        'xorb': lambda: binary_op2("xorb", simd_ext, from_typ),
        'orb': lambda: binary_op2("orb", simd_ext, from_typ),
        'andl': lambda: logical_op2("andl", simd_ext, from_typ),
        'xorl': lambda: logical_op2("xorl", simd_ext, from_typ),
        'orl': lambda: logical_op2("orl", simd_ext, from_typ),
        'notb': lambda: not1(simd_ext, from_typ),
        'notl': lambda: lnot1(simd_ext, from_typ),
        'andnotb': lambda: binary_op2("andnotb", simd_ext, from_typ),
        'andnotl': lambda: logical_op2("andnotl", simd_ext, from_typ),
        'add': lambda: simple_op2("add", simd_ext, from_typ),
        'adds': lambda: add_sub_s("adds", simd_ext, from_typ),
        'sub': lambda: simple_op2("sub", simd_ext, from_typ),
        'subs': lambda: add_sub_s("subs", simd_ext, from_typ),
        'div': lambda: div2(simd_ext, from_typ),
        'sqrt': lambda: sqrt1(simd_ext, from_typ),
        'len': lambda: len1(simd_ext, from_typ),
        'mul': lambda: simple_op2("mul", simd_ext, from_typ),
        'shl': lambda: shift2("shl", simd_ext, from_typ),
        'shr': lambda: shift2("shr", simd_ext, from_typ),
        'shra': lambda: shift2("shra", simd_ext, from_typ),
        'set1': lambda: set1(simd_ext, from_typ),
        'set1l': lambda: lset1(simd_ext, from_typ),
        'eq': lambda: cmp2("eq", simd_ext, from_typ),
        'lt': lambda: cmp2("lt", simd_ext, from_typ),
        'le': lambda: cmp2("le", simd_ext, from_typ),
        'gt': lambda: cmp2("gt", simd_ext, from_typ),
        'ge': lambda: cmp2("ge", simd_ext, from_typ),
        'ne': lambda: cmp2("ne", simd_ext, from_typ),
        'if_else1': lambda: if_else3(simd_ext, from_typ),
        'min': lambda: minmax2("min", simd_ext, from_typ),
        'max': lambda: minmax2("max", simd_ext, from_typ),
        'loadla': lambda: loadl(True, simd_ext, from_typ),
        'loadlu': lambda: loadl(False, simd_ext, from_typ),
        'storela': lambda: storel(True, simd_ext, from_typ),
        'storelu': lambda: storel(False, simd_ext, from_typ),
        'abs': lambda: abs1(simd_ext, from_typ),
        'fma': lambda: fma("fma", simd_ext, from_typ),
        'fnma': lambda: fma("fnma", simd_ext, from_typ),
        'fms': lambda: fma("fms", simd_ext, from_typ),
        'fnms': lambda: fma("fnms", simd_ext, from_typ),
        'ceil': lambda: round1("ceil", simd_ext, from_typ),
        'floor': lambda: round1("floor", simd_ext, from_typ),
        'trunc': lambda: round1("trunc", simd_ext, from_typ),
        'round_to_even': lambda: round1("round_to_even", simd_ext, from_typ),
        'all': lambda: allany1("all", simd_ext, from_typ),
        'any': lambda: allany1("any", simd_ext, from_typ),
        'reinterpret': lambda: reinterpret1(simd_ext, from_typ, to_typ),
        'reinterpretl': lambda: reinterpretl1(simd_ext, from_typ, to_typ),
        'cvt': lambda: convert1(simd_ext, from_typ, to_typ),
        'rec8': lambda: recs1("rec8", simd_ext, from_typ),
        'rec11': lambda: recs1("rec11", simd_ext, from_typ),
        'rsqrt8': lambda: recs1("rsqrt8", simd_ext, from_typ),
        'rsqrt11': lambda: recs1("rsqrt11", simd_ext, from_typ),
        'rec': lambda: recs1("rec", simd_ext, from_typ),
        'neg': lambda: neg1(simd_ext, from_typ),
        'nbtrue': lambda: nbtrue1(simd_ext, from_typ),
        'reverse': lambda: reverse1(simd_ext, from_typ),
        'addv': lambda: addv(simd_ext, from_typ),
        'upcvt': lambda: upcvt1(simd_ext, from_typ, to_typ),
        'downcvt': lambda: downcvt1(simd_ext, from_typ, to_typ),
        'iota': lambda: iota(simd_ext, from_typ),
        'to_logical': lambda: to_logical(simd_ext, from_typ),
        'mask_for_loop_tail': lambda: mask_for_loop_tail(simd_ext, from_typ),
        'masko_loadu1': lambda: maskoz_load("o", simd_ext, from_typ),
        'maskz_loadu1': lambda: maskoz_load("z", simd_ext, from_typ),
        'masko_loada1': lambda: maskoz_load("o", simd_ext, from_typ),
        'maskz_loada1': lambda: maskoz_load("z", simd_ext, from_typ),
        'mask_storea1': lambda: mask_store(simd_ext, from_typ),
        'mask_storeu1': lambda: mask_store(simd_ext, from_typ),
        'gather': lambda: gather(simd_ext, from_typ),
        'scatter': lambda: scatter(simd_ext, from_typ),
        'gather_linear': lambda: gather_linear(simd_ext, from_typ),
        'scatter_linear': lambda: scatter_linear(simd_ext, from_typ),
        'to_mask': lambda: to_mask(simd_ext, from_typ),
        'ziplo': lambda: zip("ziplo", simd_ext, from_typ),
        'ziphi': lambda: zip("ziphi", simd_ext, from_typ),
        'zip': lambda: zip_unzip_basic("zip", simd_ext, from_typ),
        'unzip': lambda: zip_unzip_basic("unzip", simd_ext, from_typ),
        'unziplo': lambda: unzip("unziplo", simd_ext, from_typ),
        'unziphi': lambda: unzip("unziphi", simd_ext, from_typ)
    }
    if simd_ext not in get_simd_exts():
        raise ValueError('Unknown SIMD extension "{}"'.format(simd_ext))
    if not from_typ in common.types:
        raise ValueError('Unknown type "{}"'.format(from_typ))
    if not func in impls:
        return common.NOT_IMPLEMENTED
    else:
        return impls[func]()
# -----------------------------------------------------------------------------
# Helpers

sse = ['sse2', 'sse42']
avx = ['avx', 'avx2']
avx512 = ['avx512_knl', 'avx512_skylake']

# -----------------------------------------------------------------------------
# Implementation of mandatory functions for this module

def get_simd_exts():
    '''List every x86 SIMD extension supported by this module, oldest
    first.'''
    return sse + avx + avx512

def get_prev_simd_ext(simd_ext):
    '''Return the SIMD extension immediately below `simd_ext` in the x86
    hierarchy ('cpu' below sse2).  Raises ValueError on unknown input.'''
    below = {
        'sse2': 'cpu',
        'sse42': 'sse2',
        'avx': 'sse42',
        'avx2': 'avx',
        'avx512_knl': 'avx2',
        'avx512_skylake': 'avx2'
    }
    if simd_ext not in below:
        raise ValueError('Unknown SIMD extension "{}"'.format(simd_ext))
    return below[simd_ext]

def emulate_fp16(simd_ext):
    '''f16 is always emulated on x86 (pair of f32 vectors); only validate
    the extension name.'''
    if simd_ext not in get_simd_exts():
        raise ValueError('Unknown SIMD extension "{}"'.format(simd_ext))
    return True

def get_native_typ(simd_ext, typ):
    '''Return the Intel intrinsic vector type (__m128/__m256/__m512 family)
    backing `typ` on `simd_ext`.'''
    # Vector width in bits depends only on the extension family.
    if simd_ext in sse:
        bits = '128'
    elif simd_ext in avx:
        bits = '256'
    elif simd_ext in avx512:
        bits = '512'
    else:
        raise ValueError('Unknown SIMD extension "{}"'.format(simd_ext))
    # Floating types carry a suffix; every integer type shares '__m{bits}i'.
    float_suffix = {'f32': '', 'f64': 'd'}
    if typ in float_suffix:
        return '__m' + bits + float_suffix[typ]
    if typ in common.iutypes:
        return '__m' + bits + 'i'
return 'typedef struct {{{t} v0; {t} v1; }} {nsimd_typ};'. \ format(t=get_native_typ(simd_ext, 'f32'), nsimd_typ=nsimd_typ) else: return 'typedef {} {};'.format(get_native_typ(simd_ext, typ), nsimd_typ) def get_logical_type(opts, simd_ext, typ, nsimd_typ): if typ not in common.types: raise ValueError('Unknown type "{}"'.format(typ)) if simd_ext in sse + avx: return get_type(opts, simd_ext, typ, nsimd_typ) elif simd_ext in avx512: if typ == 'f16': return 'typedef struct {{ __mmask16 v0; __mmask16 v1; }} {};'. \ format(nsimd_typ) return 'typedef __mmask{} {};'. \ format(512 // common.bitsize(typ), nsimd_typ) else: raise ValueError('Unknown SIMD extension "{}"'.format(simd_ext)) def get_nb_registers(simd_ext): if simd_ext in sse + avx: return '16' elif simd_ext in avx512: return '32' else: raise ValueError('Unknown SIMD extension "{}"'.format(simd_ext)) def has_compatible_SoA_types(simd_ext): if simd_ext not in sse + avx + avx512: raise ValueError('Unknown SIMD extension "{}"'.format(simd_ext)) else: return False def get_additional_include(func, platform, simd_ext): ret = '' if simd_ext == 'sse2': ret += '''#include '''.format(func) elif simd_ext == 'sse42': ret += '''#include '''.format(func) elif simd_ext == 'avx': ret += '''#include '''.format(func) elif simd_ext == 'avx2': ret += '''#include '''.format(func) elif simd_ext == 'avx512_knl': ret += '''#include '''.format(func) elif simd_ext == 'avx512_skylake': ret += '''#include '''.format(func) if func == 'shra': ret += '''#include '''.format(simd_ext=simd_ext) if func in ['loadla', 'loadlu', 'storela', 'storelu']: ret += '''#include # include # include # include '''.format(simd_ext=simd_ext) if func in ['masko_loada1', 'masko_loadu1', 'maskz_loada1', 'maskz_loadu1', 'mask_storea1', 'mask_storeu1']: ret += '''#include ''' if func in ['notb']: ret += '''#include '''.format(simd_ext=simd_ext) if func in ['notl']: ret += '''#include # include '''.format(simd_ext=simd_ext) if func in ['min', 'max']: ret += '''#include 
'''.format(simd_ext=simd_ext) if func in ['lt']: ret += '''#include '''.format(simd_ext=simd_ext) if func in ['ge']: ret += '''#include '''.format(simd_ext=simd_ext) if func in ['if_else1']: ret += '''#include # include # include # include '''.format(simd_ext=simd_ext) if func in ['abs']: ret += '''#include # include '''.format(simd_ext=simd_ext) if func == 'reinterpretl' and simd_ext in ['sse', 'avx']: ret += '''#include # include '''.format(simd_ext=simd_ext) if func == 'upcvt': ret += '''#include '''.format(simd_ext=simd_ext) if func == 'ziplo' and simd_ext in ['avx512_knl', 'avx512_skylake']: ret += '''#include '''.format(simd_ext=simd_ext) if func == 'ziphi' and simd_ext in ['avx512_knl', 'avx512_skylake']: ret += '''#include '''.format(simd_ext=simd_ext) if func == 'zip': ret += '''#include #include '''.format(simd_ext=simd_ext) if func == 'unzip': ret += '''#include #include '''.format(simd_ext=simd_ext) if simd_ext in avx512 and func in ['loadlu', 'loadla']: ret += ''' # if NSIMD_CXX > 0 extern "C" {{ # endif NSIMD_INLINE nsimd_{simd_ext}_vlu16 NSIMD_VECTORCALL nsimd_{func}_{simd_ext}_u16(const u16*); # if NSIMD_CXX > 0 }} // extern "C" # endif '''.format(func=func, simd_ext=simd_ext) if func in ['load2u', 'load3u', 'load4u', 'load2a', 'load3a', 'load4a']: ret += ''' # include # include # if NSIMD_CXX > 0 extern "C" {{ # endif NSIMD_INLINE nsimd_{simd_ext}_vu16x{deg} NSIMD_VECTORCALL nsimd_{func}_{simd_ext}_u16(const u16*); # if NSIMD_CXX > 0 }} // extern "C" # endif '''.format(func=func, deg=func[4], simd_ext=simd_ext) if func in ['store2u', 'store3u', 'store4u', 'store2a', 'store3a', 'store4a']: deg = func[5] args = ','.join(['nsimd_{simd_ext}_vu16'.format(simd_ext=simd_ext) for i in range(1, int(deg) + 1)]) ret += ''' # include # include # if NSIMD_CXX > 0 extern "C" {{ # endif NSIMD_INLINE void NSIMD_VECTORCALL nsimd_{func}_{simd_ext}_u16(u16*, {args}); # if NSIMD_CXX > 0 }} // extern "C" # endif '''.format(func=func, deg=deg, args=args, 
simd_ext=simd_ext) if func == 'to_logical': ret += '''#include #include '''.format(simd_ext=simd_ext) if func == 'adds': ret += '''#include #include #include #include #include #include #include #include #if NSIMD_CXX > 0 #include #else #include #endif ''' .format(simd_ext=simd_ext) if simd_ext in avx512: ret += '''#include '''.format(simd_ext=simd_ext) if func == 'subs': ret += '''#include #include #include #include #include #include '''.format(simd_ext=simd_ext) if func == 'mask_for_loop_tail': ret += '''#include #include #include #include '''.format(simd_ext=simd_ext) return ret # ----------------------------------------------------------------------------- # Function prefixes and suffixes def pre(simd_ext): # Number of bits if simd_ext in sse: bits = '' elif simd_ext in avx: bits = '256' elif simd_ext in avx512: bits = '512' else: raise ValueError('Unknown SIMD extension "{}"'.format(simd_ext)) return '_mm{}_'.format(bits) def suf_ep(typ): if typ == 'f16': return '_ph' elif typ == 'f32': return '_ps' elif typ == 'f64': return '_pd' elif typ in common.iutypes: return '_epi{}'.format(typ[1:]) else: raise ValueError('Unknown type "{}"'.format(typ)) def nbits(simd_ext): if simd_ext in sse: return '128' elif simd_ext in avx: return '256' else: return '512' def suf_si(simd_ext, typ): if typ == 'f16': return '_ph' elif typ == 'f32': return '_ps' elif typ == 'f64': return '_pd' elif typ in common.iutypes: return '_si{}'.format(nbits(simd_ext)) else: raise ValueError('Unknown type "{}"'.format(typ)) # ----------------------------------------------------------------------------- # Other helper functions fmtspec = {} LO = 0 HI = 1 def castsi(simd_ext, typ): if typ in common.ftypes: return '' else: return '(__m{}i *)'.format(nbits(simd_ext)) def extract(simd_ext, typ, lohi, var): if simd_ext in avx: lohi_arg = '0' if lohi == LO else '1' if typ == 'f32': if lohi == LO: return '_mm256_castps256_ps128({})'.format(var) else: return '_mm256_extractf128_ps({}, 1)'.format(var) 
def set_lane(simd_ext, typ, var_name, scalar, i):
    '''Return a C statement setting lane `i` of vector `var_name` to the C
    expression `scalar`, for type `typ` on `simd_ext`.  Returns '' for f16
    (handled elsewhere as an emulated pair).'''
    # No code for f16's
    if typ == 'f16':
        return ''
    # Step 1 -- reinterpret the input operands as integers: the insert
    # intrinsics used below only exist for integer element types.
    # NOTE(review): the u8/u16 branch below is an `if` while the following
    # chain restarts with `if i8/i16` -- looks accidental, but is benign
    # since u8/u16 match none of the later branches, so vin0/vin1 survive.
    if typ in ['u8', 'u16']:
        vin0 = var_name
        if simd_ext in sse:
            vin1 = '(int)({})'.format(scalar)
        else:
            vin1 = scalar
    if typ in ['i8', 'i16']:
        vin0 = var_name
        vin1 = '(int)nsimd_scalar_reinterpret_{}_{}({})'. \
               format('u' + typ[1:], typ, scalar)
    elif typ in ['i32', 'i64']:
        vin0 = var_name
        vin1 = scalar
    elif typ in ['u32', 'f32', 'u64', 'f64']:
        if typ in ['u32', 'u64']:
            vin0 = var_name
        else:
            # Float vectors must first be cast to the integer vector type.
            vin0 = '{pre}cast{pspd}_si{nbits}({var_name})'. \
                   format(pspd='ps' if typ == 'f32' else 'pd',
                          var_name=var_name, **fmtspec)
        vin1 = 'nsimd_scalar_reinterpret_{}_{}({})'. \
               format('i' + typ[1:], typ, scalar)
    # Step 2 -- emit the insertion itself.
    if simd_ext == 'sse2':
        # sse2 only has _mm_insert_epi16, so 8/32/64-bit lanes are built
        # from 16-bit inserts plus masking/shifting (or unpacks for 64-bit).
        if typ[1:] == '8':
            if i % 2 == 0:
                # Even byte: replace the low half of the 16-bit lane.
                tmp = '_mm_insert_epi16({vin0}, ' \
                      '(_mm_extract_epi16({vin0}, {io2}) & 65280) | {vin1}, ' \
                      '{io2})'.format(vin0=vin0, vin1=vin1, io2=int(i // 2))
            else:
                # Odd byte: replace the high half of the 16-bit lane.
                tmp = '_mm_insert_epi16({vin0}, ' \
                      '(_mm_extract_epi16({vin0}, {io2}) & 255) | ' \
                      '({vin1} << 8), {io2})'. \
                      format(vin0=vin0, vin1=vin1, io2=int(i // 2))
        if typ[1:] == '16':
            tmp = '_mm_insert_epi16({}, {}, {})'.format(vin0, vin1, i)
        if typ[1:] == '32':
            # 32-bit lane = two consecutive 16-bit inserts (low then high).
            tmp = '_mm_insert_epi16(_mm_insert_epi16({vin0}, {vin1} & 65535,' \
                  ' {ix2}), (int)nsimd_scalar_reinterpret_u32_i32(' \
                  '{vin1}) >> 16, {ix2p1})'.format(vin0=vin0, vin1=vin1,
                                                   ix2=i * 2,
                                                   ix2p1=(i * 2) + 1)
        if typ[1:] == '64':
            # 64-bit lane via unpack with a vector holding the new value.
            if i == 0:
                tmp = '_mm_unpackhi_epi64(_mm_slli_si128(' \
                      '_mm_cvtsi64_si128({vin1}), 8), {vin0})'. \
                      format(vin0=vin0, vin1=vin1)
            elif i == 1:
                tmp = '_mm_unpacklo_epi64({vin0}, ' \
                      '_mm_cvtsi64_si128({vin1}))'.format(vin0=vin0,
                                                          vin1=vin1)
    elif simd_ext in ['sse42'] + avx:
        # sse42/avx/avx2 expose insert_epi for every width directly.
        tmp = '{pre}insert_epi{typnbits}({vin0}, {vin1}, {i})'. \
              format(vin0=vin0, vin1=vin1, i=i, **fmtspec)
    elif simd_ext in avx512:
        # avx512 has no direct 512-bit lane insert: pull out the 256-bit
        # half containing lane i, insert there, then put the half back.
        half = int(nbits(simd_ext)) // 2 // int(typ[1:])
        if i < half:
            tmp = '_mm512_inserti64x4({vin0}, _mm256_insert_epi{typnbits}(' \
                  '_mm512_castsi512_si256({vin0}), {vin1}, {i}), 0)'. \
                  format(vin0=vin0, vin1=vin1, i=i, **fmtspec)
        else:
            tmp = '_mm512_inserti64x4({vin0}, _mm256_insert_epi{typnbits}(' \
                  '_mm512_extracti64x4_epi64({vin0}, 1), {vin1}, {i}),' \
                  ' 1)'.format(vin0=vin0, vin1=vin1, i=i - half, **fmtspec)
    # Step 3 -- reinterpret the integer result back to the output type.
    if typ in common.iutypes:
        return '{} = {};'.format(var_name, tmp)
    elif typ in ['f32', 'f64']:
        return '{var_name} = {pre}castsi{nbits}_{pdps}({tmp});'. \
               format(var_name=var_name, pdps='ps' if typ == 'f32' else 'pd',
                      tmp=tmp, **fmtspec)
\ format(vin=vin, i=i, **fmtspec) else: half = int(nbits(simd_ext)) // 2 // int(typ[1:]) if i < half: ext_half = extract(simd_ext, 'i' + typ[1:], LO, vin) lane = '{}extract_epi{}({}, {})'.format( '_mm_' if simd_ext == 'avx' else '_mm256_', typ[1:], ext_half, i) else: ext_half = extract(simd_ext, 'i' + typ[1:], HI, vin) lane = '{}extract_epi{}({}, {})'.format( '_mm_' if simd_ext == 'avx' else '_mm256_', typ[1:], ext_half, i - half) # Then code for reinterpreting bits of output: # - For 8 and 16-bits types intrinsics returns an 32-bits int # - For 32 and 64-bits types intrinsics returns an int of that size if typ in ['u8', 'u16']: return '({})({})'.format(typ, lane) if typ in ['i8', 'i16']: return 'nsimd_scalar_reinterpret_{}_{}(({})({}))'. \ format(typ, 'u' + typ[1:], 'u' + typ[1:], lane) elif typ in ['i32', 'i64']: return lane elif typ in ['u32', 'f32', 'u64', 'f64']: return 'nsimd_scalar_reinterpret_{}_{}({})'. \ format(typ, 'i' + typ[1:], lane) def get_undefined(simd_ext, typ): if typ in ['f32', 'f64']: return '{pre}undefined{suf}()'.format(**fmtspec) elif typ in common.iutypes: if simd_ext in sse + avx: return '{pre}undefined{sufsi}()'.format(**fmtspec) elif simd_ext in avx512: return '{pre}undefined_epi32()'.format(**fmtspec) # Signature must be a list of 'v', 's' # 'v' means vector so code to extract has to be emitted # 's' means base type so no need to write code for extraction def get_emulation_code(func, signature, simd_ext, typ): # Trick using insert and extract trick = 'nsimd_{simd_ext}_v{typ} ret = {undef};\n'. \ format(undef=get_undefined(simd_ext, typ), **fmtspec) arity = len(signature) trick += typ + ' ' + \ ', '.join(['tmp{}'.format(i) \ for i in range(arity) if signature[i] == 'v']) + ';\n' args = ', '.join(['{{in{}}}'.format(i).format(**fmtspec) \ if signature[i] == 's' else 'tmp{}'.format(i) \ for i in range(arity)]) for i in range(fmtspec['le']): trick += '\n'.join(['tmp{} = {};'. 
\ format(j, get_lane(simd_ext, typ, '{{in{}}}'.format(j).format(**fmtspec), i)) \ for j in range(arity) if signature[j] == 'v']) + '\n' trick += set_lane(simd_ext, typ, 'ret', 'nsimd_scalar_{func}_{typ}({args})'. \ format(func=func, args=args, **fmtspec), i) + '\n' trick += 'return ret;' # but in 32-bits mode insert and extract instrinsics are almost never # available so we emulate emulation = 'int i;\n{typ} ret[{le}];\n'.format(**fmtspec) emulation += typ + ' ' + \ ', '.join(['buf{}[{}]'.format(i, fmtspec['le']) \ for i in range(arity) if signature[i] == 'v']) + \ ';\n' emulation += '\n'.join(['{{pre}}store{{sufsi}}({cast}buf{i}, {{in{i}}});'. \ format(i=i, cast=castsi(simd_ext, typ)). \ format(**fmtspec) \ for i in range(arity) if signature[i] == 'v']) + \ '\n' args = ', '.join(['{{in{}}}'.format(i).format(**fmtspec) \ if signature[i] == 's' else 'buf{}[i]'.format(i) \ for i in range(arity)]) emulation += '''for (i = 0; i < {le}; i++) {{ ret[i] = nsimd_scalar_{func}_{typ}({args}); }} return {pre}loadu{sufsi}({cast}ret);'''. \ format(args=args, cast=castsi(simd_ext, typ), func=func, **fmtspec) if simd_ext == 'sse42' and \ typ in ['i8', 'u8', 'i16', 'u16', 'i32', 'u32', 'f32']: return trick else: return '''#if NSIMD_WORD_SIZE == 32 {} #else {} #endif'''.format(emulation, trick) def how_it_should_be_op2(func, simd_ext, typ): if typ == 'f16': return '''nsimd_{simd_ext}_vf16 ret; ret.v0 = {pre}{func}_ps({in0}.v0, {in1}.v0); ret.v1 = {pre}{func}_ps({in0}.v1, {in1}.v1); return ret;'''.format(func=func, **fmtspec) else: return 'return {pre}{func}{suf}({in0}, {in1});'. \ format(func=func, **fmtspec) def split_opn(func, simd_ext, typ, n): simd_ext2 = 'sse42' if simd_ext in avx else 'avx2' inp = [common.in0, common.in1, common.in2] defi = '' for i in range(0, n): defi += \ '''nsimd_{simd_ext2}_v{typ} v{i}0 = {extract_loi}; nsimd_{simd_ext2}_v{typ} v{i}1 = {extract_hii};'''. 
\ format(simd_ext2=simd_ext2, typ=typ, i=i, extract_loi=extract(simd_ext, typ, LO, inp[i]), extract_hii=extract(simd_ext, typ, HI, inp[i])) vlo = ', '.join(['v{}0'.format(i) for i in range(0, n)]) vhi = ', '.join(['v{}1'.format(i) for i in range(0, n)]) return '''{defi} v00 = nsimd_{func}_{simd_ext2}_{typ}({vlo}); v01 = nsimd_{func}_{simd_ext2}_{typ}({vhi}); return {merge};'''. \ format(defi=defi, vlo=vlo, vhi=vhi, func=func, simd_ext2=simd_ext2, typ=typ, merge=setr(simd_ext, typ, 'v00', 'v01')) def split_op2(func, simd_ext, typ): return split_opn(func, simd_ext, typ, 2) def emulate_op2(opts, op, simd_ext, typ): func = {'/': 'div', '*': 'mul'} return get_emulation_code(func[op], ['v', 'v'], simd_ext, typ) def emulate_op1(opts, func, simd_ext, typ): return get_emulation_code(func, ['v'], simd_ext, typ) def split_cmp2(func, simd_ext, typ): simd_ext2 = 'sse42' if simd_ext in avx else 'avx2' leo2 = int(fmtspec['le']) // 2 if simd_ext in avx512: if typ in ['i8', 'u8', 'f32', 'f64']: merge = \ '''return (__mmask{le})(u32)_mm256_movemask{suf}( v00) | ((__mmask{le})(u32)_mm256_movemask{suf}( v01) << {leo2});'''. \ format(leo2=leo2, **fmtspec) elif typ in ['i32', 'u32', 'i64', 'u64']: ftyp = 'f{typnbits}'.format(**fmtspec) merge = \ '''return (__mmask{le})(u32)_mm256_movemask{fsuf}( _mm256_castsi256{suf}(v00)) | (((__mmask{le})(u32)_mm256_movemask{fsuf}( _mm256_castsi256{suf}(v01))) << {leo2});'''. 
\ format(fsuf=suf_ep(ftyp), leo2=leo2, **fmtspec) else: merge = \ '''v00 = _mm256_permute4x64_epi64(v00, 216); /* exchange middle qwords */ nsimd_avx2_vi16 lo1 = _mm256_unpacklo_epi16(v00, v00); nsimd_avx2_vi16 hi1 = _mm256_unpackhi_epi16(v00, v00); v01 = _mm256_permute4x64_epi64(v01, 216); /* exchange middle qwords */ nsimd_avx2_vi16 lo2 = _mm256_unpacklo_epi16(v01, v01); nsimd_avx2_vi16 hi2 = _mm256_unpackhi_epi16(v01, v01); return (__mmask32)(u32)_mm256_movemask_ps( _mm256_castsi256_ps(lo1)) | (__mmask32)((u32)_mm256_movemask_ps( _mm256_castsi256_ps(hi1)) << 8) | (__mmask32)((u32)_mm256_movemask_ps( _mm256_castsi256_ps(lo2)) << 16) | (__mmask32)((u32)_mm256_movemask_ps( _mm256_castsi256_ps(hi2)) << 24);'''. \ format(**fmtspec) else: merge = 'return {};'.format(setr(simd_ext, typ, 'v00', 'v01')) return '''nsimd_{simd_ext2}_v{typ} v00 = {extract_lo0}; nsimd_{simd_ext2}_v{typ} v01 = {extract_hi0}; nsimd_{simd_ext2}_v{typ} v10 = {extract_lo1}; nsimd_{simd_ext2}_v{typ} v11 = {extract_hi1}; v00 = nsimd_{func}_{simd_ext2}_{typ}(v00, v10); v01 = nsimd_{func}_{simd_ext2}_{typ}(v01, v11); {merge}'''. \ format(simd_ext2=simd_ext2, extract_lo0=extract(simd_ext, typ, LO, common.in0), extract_hi0=extract(simd_ext, typ, HI, common.in0), extract_lo1=extract(simd_ext, typ, LO, common.in1), extract_hi1=extract(simd_ext, typ, HI, common.in1), func=func, merge=merge, **fmtspec) def f16_cmp2(func, simd_ext): return '''nsimd_{simd_ext}_vlf16 ret; ret.v0 = nsimd_{func}_{simd_ext}_f32({in0}.v0, {in1}.v0); ret.v1 = nsimd_{func}_{simd_ext}_f32({in0}.v1, {in1}.v1); return ret;'''.format(func=func, **fmtspec) def cmp2_with_add(func, simd_ext, typ): cte = { 'u8': '0x80', 'u16': '0x8000', 'u32': '0x80000000', 'u64': '0x8000000000000000' } return \ '''nsimd_{simd_ext}_v{typ} cte = nsimd_set1_{simd_ext}_{typ}({cte}); return nsimd_{func}_{simd_ext}_{ityp}( {pre}add{suf}({in0}, cte), {pre}add{suf}({in1}, cte));'''. 
\ format(func=func, cte=cte[typ], ityp='i{}'.format(typ[1:]), **fmtspec) # ----------------------------------------------------------------------------- # Returns C code for func # Load def load(simd_ext, typ, aligned): align = '' if aligned else 'u' cast = castsi(simd_ext, typ) if typ == 'f16': if simd_ext in sse: return \ '''#ifdef NSIMD_FP16 nsimd_{simd_ext}_vf16 ret; __m128i v = _mm_load{align}_si128((__m128i*){in0}); ret.v0 = _mm_cvtph_ps(v); v = _mm_shuffle_epi32(v, 14); /* = (3 << 2) | (2 << 0) */ ret.v1 = _mm_cvtph_ps(v); return ret; #else /* Note that we can do much better but is it useful? */ nsimd_{simd_ext}_vf16 ret; f32 buf[4]; buf[0] = nsimd_u16_to_f32(*(u16*){in0}); buf[1] = nsimd_u16_to_f32(*((u16*){in0} + 1)); buf[2] = nsimd_u16_to_f32(*((u16*){in0} + 2)); buf[3] = nsimd_u16_to_f32(*((u16*){in0} + 3)); ret.v0 = _mm_loadu_ps(buf); buf[0] = nsimd_u16_to_f32(*((u16*){in0} + 4)); buf[1] = nsimd_u16_to_f32(*((u16*){in0} + 5)); buf[2] = nsimd_u16_to_f32(*((u16*){in0} + 6)); buf[3] = nsimd_u16_to_f32(*((u16*){in0} + 7)); ret.v1 = _mm_loadu_ps(buf); return ret; #endif'''.format(align=align, **fmtspec) elif simd_ext in avx: return '''#ifdef NSIMD_FP16 nsimd_{simd_ext}_vf16 ret; ret.v0 = _mm256_cvtph_ps(_mm_load{align}_si128( (__m128i*){in0})); ret.v1 = _mm256_cvtph_ps(_mm_load{align}_si128( (__m128i*){in0} + 1)); return ret; #else /* Note that we can do much better but is it useful? 
*/ nsimd_{simd_ext}_vf16 ret; f32 buf[8]; int i; for (i = 0; i < 8; i++) {{ buf[i] = nsimd_u16_to_f32(*((u16*){in0} + i)); }} ret.v0 = _mm256_loadu_ps(buf); for (i = 0; i < 8; i++) {{ buf[i] = nsimd_u16_to_f32(*((u16*){in0} + (8 + i))); }} ret.v1 = _mm256_loadu_ps(buf); return ret; #endif'''.format(align=align, **fmtspec) elif simd_ext in avx512: return '''nsimd_{simd_ext}_vf16 ret; ret.v0 = _mm512_cvtph_ps( _mm256_load{align}_si256((__m256i*){in0}) ); ret.v1 = _mm512_cvtph_ps( _mm256_load{align}_si256((__m256i*){in0} + 1) ); return ret; '''.format(align=align, **fmtspec) else: return 'return {pre}load{align}{sufsi}({cast}{in0});'. \ format(align=align, cast=cast, **fmtspec) # ----------------------------------------------------------------------------- # masked loads def maskoz_load(simd_ext, typ, oz, aligned): if typ == 'f16': le2 = fmtspec['le'] // 2 if simd_ext in sse + avx: store_mask = '''{pre}storeu_ps(mask, {in0}.v0); {pre}storeu_ps(mask + {le2}, {in0}.v1);'''. \ format(le2=le2, **fmtspec) else: store_mask = '''_mm512_storeu_ps(mask, _mm512_maskz_mov_ps( {in0}.v0, _mm512_set1_ps(1.0f))); _mm512_storeu_ps(mask + {le2}, _mm512_maskz_mov_ps( {in0}.v1, _mm512_set1_ps(1.0f)));'''. 
\ format(le2=le2, **fmtspec) return '''int i; nsimd_{simd_ext}_vf16 ret; f32 buf[{le}], mask[{le}]; {store_mask} {pre}storeu_ps(buf, {oz0}); {pre}storeu_ps(buf + {le2}, {oz1}); for (i = 0; i < {le}; i++) {{ if (nsimd_scalar_reinterpret_u32_f32(mask[i]) != (u32)0) {{ buf[i] = nsimd_f16_to_f32({in1}[i]); }} }} ret.v0 = {pre}loadu_ps(buf); ret.v1 = {pre}loadu_ps(buf + {le2}); return ret;'''.format(le2=fmtspec['le'] // 2, oz0 = '{pre}setzero_ps()'.format(**fmtspec) if oz == 'z' \ else '{in2}.v0'.format(**fmtspec), oz1 = '{pre}setzero_ps()'.format(**fmtspec) if oz == 'z' \ else '{in2}.v1'.format(**fmtspec), store_mask=store_mask, **fmtspec) if (typ in ['i8', 'u8', 'i16', 'u16'] and simd_ext != 'avx512_skylake') \ or (typ in ['i32', 'u32', 'f32', 'i64', 'u64', 'f64'] and \ simd_ext in sse): cast = castsi(simd_ext, typ) if simd_ext == 'avx512_knl': mask_decl = 'u64 mask;' store_mask = 'mask = (u64){in0};'.format(**fmtspec) cond = '(mask >> i) & 1' else: mask_decl = '{typ} mask[{le}];'.format(**fmtspec) store_mask = '{pre}storeu{sufsi}({cast}mask, {in0});'. \ format(cast=cast, **fmtspec) cond = 'nsimd_scalar_reinterpret_{utyp}_{typ}(mask[i]) != '\ '({utyp})0'.format(utyp='u' + typ[1:], **fmtspec) return \ '''int i; {typ} buf[{le}]; {mask_decl} {pre}storeu{sufsi}({cast}buf, {oz}); {store_mask} for (i = 0; i < {le}; i++) {{ if ({cond}) {{ buf[i] = {in1}[i]; }} }} return {pre}loadu{sufsi}({cast}buf);'''. \ format(cast=cast, mask_decl=mask_decl, store_mask=store_mask, cond=cond, oz='{in2}'.format(**fmtspec) if oz == 'o' else \ '{pre}setzero{sufsi}()'.format(**fmtspec), **fmtspec) # Here typ is 32 of 64-bits wide except if simd_ext in avx: suf2 = 'ps' if typ[1:] == '32' else 'pd' if typ in common.ftypes: maskload = \ '{pre}maskload{suf}({in1}, {pre}cast{suf2}_si256({in0}))'. \ format(suf2=suf2, **fmtspec) if oz == 'z': return 'return {};'.format(maskload) else: return \ 'return {pre}blendv{suf}({in2}, {maskload}, {in0});'. 
\ format(maskload=maskload, **fmtspec) else: if simd_ext == 'avx2': maskload = '{pre}maskload{suf}({cast}{in1}, {in0})'. \ format(cast='(nsimd_longlong *)' \ if typ in ['i64', 'u64'] else '(int *)', **fmtspec) if oz == 'z': return 'return {};'.format(maskload) else: return \ 'return {pre}blendv_epi8({in2}, {maskload}, {in0});'. \ format(maskload=maskload, **fmtspec) else: maskload = '{pre}maskload_{suf2}(({ftyp}*){in1}, {in0})'. \ format(suf2=suf2, ftyp='f' + typ[1:], **fmtspec) if oz == 'z': return 'return {pre}cast{suf2}_si256({maskload});'. \ format(maskload=maskload, suf2=suf2, **fmtspec) else: return \ '''return {pre}cast{suf2}_si256({pre}blendv_{suf2}( {pre}castsi256_{suf2}({in2}), {maskload}, {pre}castsi256_{suf2}({in0})));'''. \ format(suf2=suf2, maskload=maskload, **fmtspec) # getting here means avx512 with intrinsics mask = { 'z': 'return {pre}maskz_load{{}}{suf}({in0}, (void*){in1});'. \ format(**fmtspec), 'o': 'return {pre}mask_load{{}}{suf}({in2}, {in0}, (void*){in1});'. \ format(**fmtspec) } if typ in ['i32', 'u32', 'f32', 'i64', 'u64', 'f64']: return mask[oz].format('' if aligned else 'u') else: return mask[oz].format('u') # ----------------------------------------------------------------------------- # Loads of degree 2, 3 and 4 def load_deg234(simd_ext, typ, align, deg): if typ == 'f16': a = 'a' if align else 'u' code = '\n'.join([ \ '''nsimd_storeu_{simd_ext}_u16(buf, tmp.v{i}); ret.v{i} = nsimd_loadu_{simd_ext}_f16((f16 *)buf);'''. 
\ format(i=i, **fmtspec) for i in range(0, deg)]) return \ '''nsimd_{simd_ext}_v{typ}x{deg} ret; u16 buf[{le}]; nsimd_{simd_ext}_vu16x{deg} tmp = nsimd_load{deg}{a}_{simd_ext}_u16((u16*)a0); {code} return ret;'''.format(code=code, a=a, deg=deg, **fmtspec) if simd_ext in sse: if deg == 2: return ldst234.load2_sse(simd_ext, typ, align, fmtspec) if deg == 3: return ldst234.load3_sse(simd_ext, typ, align, fmtspec) if deg == 4: return ldst234.load4_sse(simd_ext, typ, align, fmtspec) if simd_ext in avx: if deg == 2: return ldst234.load2_avx(simd_ext, typ, align, fmtspec) if deg == 3: return ldst234.load3_avx(simd_ext, typ, align, fmtspec) if deg == 4: return ldst234.load4_avx(simd_ext, typ, align, fmtspec) if simd_ext in avx512: if deg == 2: return ldst234.load2_avx512(simd_ext, typ, align, fmtspec) if deg == 3: return ldst234.load3_avx512(simd_ext, typ, align, fmtspec) if deg == 4: return ldst234.load4_avx512(simd_ext, typ, align, fmtspec) return common.NOT_IMPLEMENTED # ----------------------------------------------------------------------------- # Stores of degree 2, 3 and 4 def store_deg234(simd_ext, typ, align, deg): if typ == 'f16': a = 'a' if align else 'u' variables = ', '.join(['v{}'.format(i) for i in range(0, deg)]) code = '\n'.join([ \ '''nsimd_storeu_{{simd_ext}}_f16((f16 *)buf, {{in{ip1}}}); v{i} = nsimd_loadu_{{simd_ext}}_u16((u16 *)buf);'''. \ format(i=i, ip1=i + 1).format(**fmtspec) \ for i in range(0, deg)]) return \ '''nsimd_{simd_ext}_vu16 {variables}; u16 buf[{le}]; {code} nsimd_store{deg}{a}_{simd_ext}_u16((u16 *){in0}, {variables});'''. 
\ format(variables=variables, code=code, a=a, deg=deg, **fmtspec) if simd_ext in sse: if deg == 2: return ldst234.store2(simd_ext, typ, align, fmtspec) if deg == 3: return ldst234.store3_sse(simd_ext, typ, align, fmtspec) if deg == 4: return ldst234.store4_sse(typ, align, fmtspec) if simd_ext in avx: if deg == 2: return ldst234.store2(simd_ext, typ, align, fmtspec) if deg == 3: return ldst234.store3_avx(simd_ext, typ, align, fmtspec) if deg == 4: return ldst234.store4_avx(simd_ext, typ, align, fmtspec) if simd_ext in avx512: if deg == 2: return ldst234.store2(simd_ext, typ, align, fmtspec) if deg == 3: return ldst234.store3_avx512(simd_ext, typ, align, fmtspec) if deg == 4: return ldst234.store4_avx512(simd_ext, typ, align, fmtspec) return common.NOT_IMPLEMENTED # ----------------------------------------------------------------------------- # Store def store(simd_ext, typ, aligned): align = '' if aligned else 'u' cast = castsi(simd_ext, typ) if typ == 'f16': if simd_ext in sse: return \ '''#ifdef NSIMD_FP16 __m128i v0 = _mm_cvtps_ph({in1}.v0, 4); __m128i v1 = _mm_cvtps_ph({in1}.v1, 4); __m128d v = _mm_shuffle_pd(_mm_castsi128_pd(v0), _mm_castsi128_pd(v1), 0 /* = (0 << 1) | (0 << 0) */); _mm_store{align}_pd((f64*){in0}, v); #else /* Note that we can do much better but is it useful? 
*/ f32 buf[4]; _mm_storeu_ps(buf, {in1}.v0); *((u16*){in0} ) = nsimd_f32_to_u16(buf[0]); *((u16*){in0} + 1) = nsimd_f32_to_u16(buf[1]); *((u16*){in0} + 2) = nsimd_f32_to_u16(buf[2]); *((u16*){in0} + 3) = nsimd_f32_to_u16(buf[3]); _mm_storeu_ps(buf, {in1}.v1); *((u16*){in0} + 4) = nsimd_f32_to_u16(buf[0]); *((u16*){in0} + 5) = nsimd_f32_to_u16(buf[1]); *((u16*){in0} + 6) = nsimd_f32_to_u16(buf[2]); *((u16*){in0} + 7) = nsimd_f32_to_u16(buf[3]); #endif'''.format(align=align, **fmtspec) elif simd_ext in avx: return \ '''#ifdef NSIMD_FP16 _mm_store{align}_si128((__m128i*){in0}, _mm256_cvtps_ph({in1}.v0, 4)); _mm_store{align}_si128((__m128i*){in0} + 1, _mm256_cvtps_ph({in1}.v1, 4)); #else /* Note that we can do much better but is it useful? */ int i; f32 buf[8]; _mm256_storeu_ps(buf, {in1}.v0); for (i = 0; i < 8; i++) {{ *((u16*){in0} + i) = nsimd_f32_to_u16(buf[i]); }} _mm256_storeu_ps(buf, {in1}.v1); for (i = 0; i < 8; i++) {{ *((u16*){in0} + (8 + i)) = nsimd_f32_to_u16(buf[i]); }} #endif'''.format(align=align, **fmtspec) elif simd_ext in avx512: return \ '''_mm256_store{align}_si256((__m256i*){in0}, _mm512_cvtps_ph({in1}.v0, 4)); _mm256_store{align}_si256((__m256i*){in0} + 1, _mm512_cvtps_ph({in1}.v1, 4));'''. \ format(align=align, **fmtspec) else: return '{pre}store{align}{sufsi}({cast}{in0}, {in1});'. \ format(align=align, cast=cast, **fmtspec) # masked store def mask_store(simd_ext, typ, aligned): if typ == 'f16': le2 = fmtspec['le'] // 2 if simd_ext in sse + avx: store_mask = '''{pre}storeu_ps(mask, {in0}.v0); {pre}storeu_ps(mask + {le2}, {in0}.v1);'''. \ format(le2=le2, **fmtspec) else: store_mask = '''_mm512_storeu_ps(mask, _mm512_maskz_mov_ps( {in0}.v0, _mm512_set1_ps(1.0f))); _mm512_storeu_ps(mask + {le2}, _mm512_maskz_mov_ps( {in0}.v1, _mm512_set1_ps(1.0f)));'''. 
\ format(le2=le2, **fmtspec) return '''f32 mask[{le}], buf[{le}]; int i; {store_mask} {pre}storeu_ps(buf, {in2}.v0); {pre}storeu_ps(buf + {le2}, {in2}.v1); for (i = 0; i < {le}; i++) {{ if (nsimd_scalar_reinterpret_u32_f32(mask[i]) != (u32)0) {{ {in1}[i] = nsimd_f32_to_f16(buf[i]); }} }}'''.format(store_mask=store_mask, le2=le2, **fmtspec) suf2 = 'ps' if typ[1:] == '32' else 'pd' if simd_ext in sse: if typ in common.iutypes: return '_mm_maskmoveu_si128({in2}, {in0}, (char *){in1});'. \ format(**fmtspec) else: return '''_mm_maskmoveu_si128(_mm_cast{suf2}_si128({in2}), _mm_cast{suf2}_si128({in0}), (char *){in1});'''. \ format(suf2=suf2, **fmtspec) if typ in ['i8', 'u8', 'i16', 'u16'] and simd_ext != 'avx512_skylake': if simd_ext == 'avx512_knl': return \ '''int i; u64 mask; {typ} buf[{le}]; {pre}storeu{sufsi}((__m512i *)buf, {in2}); mask = (u64){in0}; for (i = 0; i < {le}; i++) {{ if ((mask >> i) & 1) {{ {in1}[i] = buf[i]; }} }}'''.format(utyp='u' + typ[1:], **fmtspec) else: return \ '''nsimd_{op_name}_sse42_{typ}({mask_lo}, {in1}, {val_lo}); nsimd_{op_name}_sse42_{typ}({mask_hi}, {in1} + {le2}, {val_hi}); '''.format(le2=fmtspec['le'] // 2, op_name='mask_store{}1'.format('a' if aligned else 'u'), mask_lo=extract(simd_ext, typ, LO, common.in0), mask_hi=extract(simd_ext, typ, HI, common.in0), val_lo=extract(simd_ext, typ, LO, common.in2), val_hi=extract(simd_ext, typ, HI, common.in2), **fmtspec) # Here typ is 32 of 64-bits wide except if simd_ext in avx: if typ in common.ftypes: return '''{pre}maskstore{suf}({in1}, {pre}cast{suf2}_si256({in0}), {in2});'''. \ format(suf2=suf2, **fmtspec) else: if simd_ext == 'avx2': return '{pre}maskstore{suf}({cast}{in1}, {in0}, {in2});'. \ format(cast='(nsimd_longlong *)' \ if typ in ['i64', 'u64'] \ else '(int *)', **fmtspec) else: return '''{pre}maskstore_{suf2}(({ftyp}*){in1}, {in0}, {pre}castsi256_{suf2}({in2}));'''. 
\ format(suf2=suf2, ftyp='f' + typ[1:], **fmtspec) # getting here means avx512 with intrinsics code = '{pre}mask_store{{}}{suf}((void*){in1}, {in0}, {in2});'. \ format(**fmtspec) if typ in ['i32', 'u32', 'f32', 'i64', 'u64', 'f64']: return code.format('' if aligned else 'u') else: return code.format('u') # ----------------------------------------------------------------------------- # Code for binary operators: and, or, xor def binop2(func, simd_ext, typ, logical=False): logical = 'l' if logical else '' func = func[0:-1] if typ == 'f16': return \ '''nsimd_{simd_ext}_v{logi}f16 ret; ret.v0 = nsimd_{func}{logi2}_{simd_ext}_f32({in0}.v0, {in1}.v0); ret.v1 = nsimd_{func}{logi2}_{simd_ext}_f32({in0}.v1, {in1}.v1); return ret;'''.format(logi='l' if logical else '', func=func, logi2='l' if logical else 'b', **fmtspec) normal = 'return {pre}{func}{sufsi}({in0}, {in1});'. \ format(func=func, **fmtspec) if simd_ext in sse: return normal if simd_ext in avx: if simd_ext == 'avx2' or typ in ['f32', 'f64']: return normal else: return '''return _mm256_castpd_si256(_mm256_{func}_pd( _mm256_castsi256_pd({in0}), _mm256_castsi256_pd({in1})));'''. \ format(func=func, **fmtspec) if simd_ext in avx512: if simd_ext == 'avx512_skylake' or typ in common.iutypes: return normal else: return \ '''return _mm512_castsi512{suf}(_mm512_{func}_si512( _mm512_cast{typ2}_si512({in0}), _mm512_cast{typ2}_si512({in1})));'''. 
\ format(func=func, typ2=suf_ep(typ)[1:], **fmtspec) # ----------------------------------------------------------------------------- # Code for logical binary operators: andl, orl, xorl def binlop2(func, simd_ext, typ): op = { 'orl': '|', 'xorl': '^', 'andl': '&' } op_fct = { 'orl': 'kor', 'xorl': 'kxor', 'andl': 'kand' } if simd_ext not in avx512: if typ == 'f16': return binop2(func, simd_ext, typ, True) else: return binop2(func, simd_ext, typ) elif simd_ext == 'avx512_knl': if typ == 'f16': return '''nsimd_{simd_ext}_vlf16 ret; ret.v0 = _{op_fct}_mask16({in0}.v0, {in1}.v0); ret.v1 = _{op_fct}_mask16({in0}.v1, {in1}.v1); return ret;'''. \ format(op_fct=op_fct[func], **fmtspec) elif typ in ['f32', 'u32', 'i32']: return 'return _{op_fct}_mask16({in0}, {in1});'. \ format(op_fct=op_fct[func], **fmtspec) else: return 'return (__mmask{le})({in0} {op} {in1});'. \ format(op=op[func], **fmtspec) elif simd_ext == 'avx512_skylake': if typ == 'f16': return '''nsimd_{simd_ext}_vlf16 ret; #if defined(NSIMD_IS_GCC) || defined(NSIMD_IS_CLANG) ret.v0 = (__mmask16)({in0}.v0 {op} {in1}.v0); ret.v1 = (__mmask16)({in0}.v1 {op} {in1}.v1); #else ret.v0 = _{op_fct}_mask16({in0}.v0, {in1}.v0); ret.v1 = _{op_fct}_mask16({in0}.v1, {in1}.v1); #endif return ret;'''. 
\ format(op_fct=op_fct[func], op=op[func], **fmtspec) else: return '''#if defined(NSIMD_IS_GCC) || defined(NSIMD_IS_CLANG) return (__mmask{le})({in0} {op} {in1}); #else return _{op_fct}_mask{le}({in0}, {in1}); #endif'''.format(op_fct=op_fct[func], op=op[func], **fmtspec) # ----------------------------------------------------------------------------- # andnot def andnot2(simd_ext, typ, logical=False): if typ == 'f16': return \ '''nsimd_{simd_ext}_v{logi}f16 ret; ret.v0 = nsimd_andnot{logi2}_{simd_ext}_f32({in0}.v0, {in1}.v0); ret.v1 = nsimd_andnot{logi2}_{simd_ext}_f32({in0}.v1, {in1}.v1); return ret;'''.format(logi='l' if logical else '', logi2='l' if logical else 'b', **fmtspec) if simd_ext in sse: return 'return _mm_andnot{sufsi}({in1}, {in0});'.format(**fmtspec) if simd_ext in avx: if simd_ext == 'avx2' or typ in ['f32', 'f64']: return 'return _mm256_andnot{sufsi}({in1}, {in0});'. \ format(**fmtspec) else: return '''return _mm256_castpd_si256(_mm256_andnot_pd( _mm256_castsi256_pd({in1}), _mm256_castsi256_pd({in0})));'''. \ format(**fmtspec) if simd_ext in avx512: if simd_ext == 'avx512_skylake' or typ in common.iutypes: return 'return _mm512_andnot{sufsi}({in1}, {in0});'. \ format(**fmtspec) else: return '''return _mm512_castsi512{suf}(_mm512_andnot_si512( _mm512_cast{suf2}_si512({in1}), _mm512_cast{suf2}_si512({in0})));'''. 
\ format(suf2=fmtspec['suf'][1:], **fmtspec) # ----------------------------------------------------------------------------- # logical andnot def landnot2(simd_ext, typ): if simd_ext in avx512: if typ == 'f16': return '''nsimd_{simd_ext}_vlf16 ret; ret.v0 = (__mmask16)({in0}.v0 & (~{in1}.v0)); ret.v1 = (__mmask16)({in0}.v1 & (~{in1}.v1)); return ret;'''.format(**fmtspec) else: return 'return (__mmask{le})({in0} & (~{in1}));'.format(**fmtspec) return andnot2(simd_ext, typ, True) # ----------------------------------------------------------------------------- # Code for unary not def not1(simd_ext, typ, logical=False): if typ == 'f16': return \ '''nsimd_{simd_ext}_v{logi}f16 ret; nsimd_{simd_ext}_vf32 cte = {pre}castsi{nbits}_ps( {pre}set1_epi8(-1)); ret.v0 = nsimd_andnot{logi2}_{simd_ext}_f32(cte, {in0}.v0); ret.v1 = nsimd_andnot{logi2}_{simd_ext}_f32(cte, {in0}.v1); return ret;'''.format(logi='l' if logical else '', logi2='l' if logical else 'b', **fmtspec) elif typ in ['f32', 'f64']: return '''return nsimd_andnotb_{simd_ext}_{typ}( {pre}castsi{nbits}{suf}( {pre}set1_epi8(-1)), {in0});'''.format(**fmtspec) else: return '''return nsimd_andnotb_{simd_ext}_{typ}( {pre}set1_epi8(-1), {in0});'''.format(**fmtspec) # ----------------------------------------------------------------------------- # Code for unary logical lnot def lnot1(simd_ext, typ): if simd_ext in avx512: if typ == 'f16': return '''nsimd_{simd_ext}_vlf16 ret; ret.v0 = (__mmask16)(~{in0}.v0); ret.v1 = (__mmask16)(~{in0}.v1); return ret;'''.format(**fmtspec) else: return 'return (__mmask{le})(~{in0});'.format(**fmtspec) return not1(simd_ext, typ, True) # ----------------------------------------------------------------------------- # Addition and substraction def addsub(func, simd_ext, typ): if typ in common.ftypes or simd_ext in sse or \ (simd_ext in avx512 and typ in ['u32', 'i32', 'u64', 'i64']): return how_it_should_be_op2(func, simd_ext, typ) else: if simd_ext in ['avx2', 'avx512_skylake']: return 
how_it_should_be_op2(func, simd_ext, typ) else: return split_op2(func, simd_ext, typ) # ----------------------------------------------------------------------------- # Len def len1(simd_ext, typ): return 'return {le};'.format(**fmtspec) # ----------------------------------------------------------------------------- # Division def div2(opts, simd_ext, typ): if typ in common.ftypes: return how_it_should_be_op2('div', simd_ext, typ) return emulate_op2(opts, '/', simd_ext, typ) # ----------------------------------------------------------------------------- # Multiplication def mul2(opts, simd_ext, typ): emulate = emulate_op2(opts, '*', simd_ext, typ) split = split_op2('mul', simd_ext, typ) # Floats if typ in common.ftypes: return how_it_should_be_op2('mul', simd_ext, typ) # Integers 16, 32 on SSE if simd_ext in sse and typ in ['i16', 'u16']: return 'return _mm_mullo_epi16({in0}, {in1});'.format(**fmtspec) if simd_ext in sse and typ in ['i32', 'u32']: if simd_ext == 'sse42': return 'return _mm_mullo_epi32({in0}, {in1});'.format(**fmtspec) else: return emulate # Integers 16, 32 on AVX if simd_ext in avx and typ in ['i16', 'u16', 'i32', 'u32']: if simd_ext == 'avx2': return 'return _mm256_mullo{suf}({in0}, {in1});'.format(**fmtspec) else: return split # Integers 64 on SSE on AVX if simd_ext in sse + avx and typ in ['i64', 'u64']: return emulate_op2(opts, '*', simd_ext, typ) # Integers 16 on AVX512 if simd_ext in avx512 and typ in ['i16', 'u16']: if simd_ext == 'avx512_skylake': return 'return _mm512_mullo_epi16({in0}, {in1});'.format(**fmtspec) else: return split # Integers 32 on AVX512 if simd_ext in avx512 and typ in ['i32', 'u32']: return 'return _mm512_mullo_epi32({in1}, {in0});'.format(**fmtspec) # Integers 64 on AVX512 if simd_ext in avx512 and typ in ['i64', 'u64']: if simd_ext == 'avx512_skylake': return 'return _mm512_mullo_epi64({in0}, {in1});'.format(**fmtspec) else: return emulate # Integers 8 on SSE with_epi16 = '''nsimd_{simd_ext}_v{typ} lo = 
{pre}mullo_epi16({in0}, {in1}); nsimd_{simd_ext}_v{typ} hi = {pre}slli_epi16( {pre}mullo_epi16({pre}srli_epi16({in0}, 8), {pre}srli_epi16({in1}, 8)), 8); return {pre}or{sufsi}({pre}and{sufsi}( lo, {pre}set1_epi16(255)),hi);'''. \ format(**fmtspec) split_epi16 = split_op2('mul', simd_ext, typ) if simd_ext in sse and typ in ['i8', 'u8']: return with_epi16 if simd_ext in avx + avx512 and typ in ['i8', 'u8']: if simd_ext in ['avx2', 'avx512_skylake']: return with_epi16 else: return split_epi16 # ----------------------------------------------------------------------------- # Shift left and right def shl_shr(func, simd_ext, typ): if typ in ['f16', 'f32', 'f64']: return '' intrinsic = 'srl' if func == 'shr' else 'sll' simd_ext2 = 'sse42' if simd_ext in avx else 'avx2' split = '''nsimd_{simd_ext2}_v{typ} v0 = {extract_lo}; nsimd_{simd_ext2}_v{typ} v1 = {extract_hi}; v0 = nsimd_{func}_{simd_ext2}_{typ}(v0, {in1}); v1 = nsimd_{func}_{simd_ext2}_{typ}(v1, {in1}); return {merge};'''. \ format(simd_ext2=simd_ext2, func=func, extract_lo=extract(simd_ext, typ, LO, common.in0), extract_hi=extract(simd_ext, typ, HI, common.in0), merge=setr(simd_ext, typ, 'v0', 'v1'), **fmtspec) normal_16_32_64 = '''return {pre}{intrinsic}{suf}( {in0}, _mm_set1_epi64x({in1}));'''. \ format(intrinsic=intrinsic, **fmtspec) FFs = '0x' + ('F' * int((int(typ[1:]) // 4))) FFOOs = FFs + ('0' * int((int(typ[1:]) // 4))) with_2n_for_n = '''nsimd_{simd_ext}_v{typ} lo = {pre}and{sufsi}( {pre}{intrinsic}_epi{typ2nbits}( {in0}, _mm_set1_epi64x({in1})), nsimd_set1_{simd_ext}_u{typ2nbits}({masklo})); nsimd_{simd_ext}_v{typ} hi = {pre}{intrinsic}_epi{typ2nbits}({pre}and{sufsi}({in0}, nsimd_set1_{simd_ext}_u{typ2nbits}({maskhi})), _mm_set1_epi64x({in1})); return {pre}or{sufsi}(hi, lo);'''. 
\ format(intrinsic=intrinsic, typ2nbits=2 * int(typ[1:]), masklo=FFs if func == 'shl' else FFOOs, maskhi=FFOOs if func == 'shl' else FFs, **fmtspec) with_32_for_8 = '''nsimd_{simd_ext}_v{typ} masklo = nsimd_set1_{simd_ext}_u32(0xFF00FF); nsimd_{simd_ext}_v{typ} lo = {pre}and{sufsi}({pre}{intrinsic}_epi32( {pre}and{sufsi}({in0}, masklo), _mm_set1_epi64x({in1})), masklo); nsimd_{simd_ext}_v{typ} maskhi = nsimd_set1_{simd_ext}_u32(0xFF00FF00); nsimd_{simd_ext}_v{typ} hi = {pre}and{sufsi}({pre}{intrinsic}_epi32( {pre}and{sufsi}({in0}, maskhi), _mm_set1_epi64x({in1})), maskhi); return {pre}or{sufsi}(hi, lo);'''. \ format(intrinsic=intrinsic, **fmtspec) if simd_ext in sse: if typ in ['i8', 'u8']: return with_2n_for_n if typ in ['i16', 'u16', 'i32', 'u32', 'i64', 'u64']: return normal_16_32_64 if simd_ext in avx: if typ in ['i8', 'u8']: return with_2n_for_n if simd_ext == 'avx2' else split if typ in ['i16', 'u16', 'i32', 'u32', 'i64', 'u64']: return normal_16_32_64 if simd_ext == 'avx2' else split if simd_ext in avx512: if typ in ['i8', 'u8']: return with_2n_for_n if simd_ext == 'avx512_skylake' \ else with_32_for_8 if typ in ['i16', 'u16']: return normal_16_32_64 if simd_ext == 'avx512_skylake' \ else with_2n_for_n if typ in ['i32', 'u32', 'i64', 'u64']: return normal_16_32_64 # ----------------------------------------------------------------------------- # Arithmetic shift right def shra(opts, simd_ext, typ): if typ in common.utypes: # For unsigned type, logical shift return 'return nsimd_shr_{simd_ext}_{typ}({in0}, {in1});'. \ format(**fmtspec) intrinsic = 'return {pre}sra{suf}({in0}, _mm_set1_epi64x((i64){in1}));'. \ format(**fmtspec) simd_ext2 = 'sse42' if simd_ext in avx else 'avx2' split = '''nsimd_{simd_ext2}_v{typ} v0 = {extract_lo}; nsimd_{simd_ext2}_v{typ} v1 = {extract_hi}; v0 = nsimd_shra_{simd_ext2}_{typ}(v0, {in1}); v1 = nsimd_shra_{simd_ext2}_{typ}(v1, {in1}); return {merge};'''. 
\ format(simd_ext2=simd_ext2, extract_lo=extract(simd_ext, typ, LO, common.in0), extract_hi=extract(simd_ext, typ, HI, common.in0), merge=setr(simd_ext, typ, 'v0', 'v1'), **fmtspec) trick_for_i8 = \ '''__m128i count = _mm_set1_epi64x((i64){in1}); nsimd_{simd_ext}_vi16 lo, hi; hi = {pre}andnot{sufsi}({pre}set1_epi16(255), {pre}sra_epi16({in0}, count)); lo = {pre}srli_epi16({pre}sra_epi16( {pre}slli_epi16({in0}, 8), count), 8); return {pre}or{sufsi}(hi, lo);'''.format(**fmtspec) emulation = get_emulation_code('shra', ['v', 's'], simd_ext, typ) if simd_ext in sse + ['avx2']: if typ == 'i8': return trick_for_i8 elif typ in ['i16', 'i32']: return intrinsic elif typ == 'i64': return emulation elif simd_ext == 'avx': if typ in ['i8', 'i16', 'i32']: return split elif typ == 'i64': return emulation elif simd_ext == 'avx512_knl': if typ in ['i8', 'i16']: return split elif typ in ['i32', 'i64']: return intrinsic elif simd_ext == 'avx512_skylake': if typ == 'i8': return trick_for_i8 elif typ in ['i16', 'i32', 'i64']: return intrinsic # ----------------------------------------------------------------------------- # set1 or splat function def set1(simd_ext, typ): if typ == 'f16': return '''nsimd_{simd_ext}_vf16 ret; f32 f = nsimd_f16_to_f32({in0}); ret.v0 = {pre}set1_ps(f); ret.v1 = {pre}set1_ps(f); return ret;'''.format(**fmtspec) if simd_ext in sse + avx: if typ == 'i64': return 'return {pre}set1_epi64x({in0});'.format(**fmtspec) if typ == 'u64': return '''union {{ u64 u; i64 i; }} buf; buf.u = {in0}; return {pre}set1_epi64x(buf.i);'''.format(**fmtspec) if typ in ['u8', 'u16', 'u32', 'u64']: return '''union {{ {typ} u; i{typnbits} i; }} buf; buf.u = {in0}; return {pre}set1{suf}(buf.i);'''.format(**fmtspec) return 'return {pre}set1{suf}({in0});'.format(**fmtspec) # ----------------------------------------------------------------------------- # set1l or splat function for logical def set1l(simd_ext, typ): if typ == 'f16': return '''nsimd_{simd_ext}_vlf16 ret; ret.v0 = 
nsimd_set1l_{simd_ext}_f32({in0}); ret.v1 = ret.v0; return ret;'''.format(**fmtspec) if simd_ext in sse + avx: if simd_ext in sse: ones = '_mm_cmpeq_pd(_mm_setzero_pd(), _mm_setzero_pd())' else: ones = '_mm256_cmp_pd(_mm256_setzero_pd(), _mm256_setzero_pd(), ' \ '_CMP_EQ_OQ)' if typ != 'f64': ones = '{pre}castpd{sufsi}({ones})'.format(ones=ones, **fmtspec) return '''if ({in0}) {{ return {ones}; }} else {{ return {pre}setzero{sufsi}(); }}'''.format(ones=ones, **fmtspec) else: return '''if ({in0}) {{ return (__mmask{le})(~(__mmask{le})(0)); }} else {{ return (__mmask{le})(0); }}'''.format(**fmtspec) # ----------------------------------------------------------------------------- # Equality def eq2(simd_ext, typ): if typ == 'f16': return f16_cmp2('eq', simd_ext) if simd_ext in sse: if typ in ['i64', 'u64']: if simd_ext == 'sse42': return how_it_should_be_op2('cmpeq', simd_ext, typ) else: return \ '''__m128i t = _mm_cmpeq_epi32({in0}, {in1}); return _mm_and_si128(t, _mm_shuffle_epi32(t, 177) /* = 2|3|0|1 */);'''. \ format(**fmtspec) else: return how_it_should_be_op2('cmpeq', simd_ext, typ) if simd_ext in avx: if typ in ['f32', 'f64']: return 'return _mm256_cmp{suf}({in0}, {in1}, _CMP_EQ_OQ);'. \ format(**fmtspec) else: if simd_ext == 'avx2': return how_it_should_be_op2('cmpeq', simd_ext, typ) else: return split_cmp2('eq', simd_ext, typ) if simd_ext in avx512: if typ in ['f32', 'f64']: return 'return _mm512_cmp{suf}_mask({in0}, {in1}, _CMP_EQ_OQ);'. \ format(**fmtspec) elif typ in ['i32', 'u32', 'i64', 'u64']: return \ 'return _mm512_cmp{suf}_mask({in0}, {in1}, _MM_CMPINT_EQ);'. \ format(**fmtspec) else: if simd_ext == 'avx512_skylake': return \ 'return _mm512_cmp{suf}_mask({in0}, {in1}, _MM_CMPINT_EQ);'. 
\ format(**fmtspec) else: return split_cmp2('eq', simd_ext, typ) # ----------------------------------------------------------------------------- # not equal def neq2(simd_ext, typ): if typ == 'f16': return f16_cmp2('ne', simd_ext) if simd_ext in sse and typ in ['f32', 'f64']: return how_it_should_be_op2('cmpneq', simd_ext, typ) if simd_ext in avx and typ in ['f32', 'f64']: return 'return _mm256_cmp{suf}({in0}, {in1}, _CMP_NEQ_UQ);'. \ format(**fmtspec) if simd_ext in avx512 and typ in ['f32', 'f64']: return 'return _mm512_cmp{suf}_mask({in0}, {in1}, _CMP_NEQ_UQ);'. \ format(**fmtspec) noteq = '''return nsimd_notl_{simd_ext}_{typ}( nsimd_eq_{simd_ext}_{typ}({in0}, {in1}));'''. \ format(**fmtspec) if simd_ext in avx512: intrinsic = \ 'return _mm512_cmp{suf}_mask({in0}, {in1}, _MM_CMPINT_NE);'. \ format(**fmtspec) if typ in ['i32', 'u32', 'i64', 'u64']: return intrinsic else: return intrinsic if simd_ext == 'avx512_skylake' else noteq return noteq # ----------------------------------------------------------------------------- # Greater than def gt2(simd_ext, typ): if typ == 'f16': return f16_cmp2('gt', simd_ext) if simd_ext in sse: if typ in ['f32', 'f64', 'i8', 'i16', 'i32']: return how_it_should_be_op2('cmpgt', simd_ext, typ) if typ == 'i64': if simd_ext == 'sse42': return how_it_should_be_op2('cmpgt', simd_ext, typ) #return '''return _mm_sub_epi64(_mm_setzero_si128(), _mm_srli_epi64( # _mm_sub_epi64({in1}, {in0}), 63));'''. \ # format(**fmtspec) return '''{typ} buf0[2], buf1[2]; _mm_storeu_si128((__m128i*)buf0, {in0}); _mm_storeu_si128((__m128i*)buf1, {in1}); buf0[0] = -(buf0[0] > buf1[0]); buf0[1] = -(buf0[1] > buf1[1]); return _mm_loadu_si128((__m128i*)buf0);'''. \ format(**fmtspec) return cmp2_with_add('gt', simd_ext, typ) if simd_ext in avx: if typ in ['f32', 'f64']: return 'return _mm256_cmp{suf}({in0}, {in1}, _CMP_GT_OQ);'. 
\ format(**fmtspec) if typ in ['i8', 'i16', 'i32', 'i64']: if simd_ext == 'avx2': return how_it_should_be_op2('cmpgt', simd_ext, typ) else: return split_cmp2('gt', simd_ext, typ) if simd_ext == 'avx2': return cmp2_with_add('gt', simd_ext, typ) else: return split_cmp2('gt', simd_ext, typ) # AVX512 if typ in ['f32', 'f64', 'i32', 'i64']: return \ 'return _mm512_cmp{suf}_mask({in0}, {in1}, {cte});'. \ format(cte='_CMP_GT_OQ' if typ in ['f32', 'f64'] else '_MM_CMPINT_NLE', **fmtspec) if typ in ['u32', 'u64']: return \ 'return _mm512_cmp_epu{typ2}_mask({in0}, {in1}, _MM_CMPINT_NLE);'. \ format(typ2=typ[1:], **fmtspec) if simd_ext == 'avx512_skylake': return \ 'return _mm512_cmp_ep{typ}_mask({in0}, {in1}, _MM_CMPINT_NLE);'. \ format(**fmtspec) else: return split_cmp2('gt', simd_ext, typ) # ----------------------------------------------------------------------------- # lesser than def lt2(simd_ext, typ): return 'return nsimd_gt_{simd_ext}_{typ}({in1}, {in0});'. \ format(**fmtspec) # ----------------------------------------------------------------------------- # greater or equal def geq2(simd_ext, typ): if typ == 'f16': return f16_cmp2('ge', simd_ext) notlt = '''return nsimd_notl_{simd_ext}_{typ}( nsimd_lt_{simd_ext}_{typ}({in0}, {in1}));'''. \ format(**fmtspec) if simd_ext in sse: if typ in ['f32', 'f64']: return how_it_should_be_op2('cmpge', simd_ext, typ) if simd_ext in avx: if typ in ['f32', 'f64']: return 'return _mm256_cmp{suf}({in0}, {in1}, _CMP_GE_OQ);'. \ format(**fmtspec) if simd_ext in avx512: if typ in ['i32', 'i64', 'u32', 'u64']: return \ 'return _mm512_cmp_ep{typ}_mask({in0}, {in1}, _MM_CMPINT_NLT);'. \ format(**fmtspec) if typ in ['f32', 'f64']: return 'return _mm512_cmp{suf}_mask({in0}, {in1}, _CMP_GE_OQ);'. \ format(**fmtspec) if simd_ext == 'avx512_skylake': return \ 'return _mm512_cmp_ep{typ}_mask({in0}, {in1}, _MM_CMPINT_NLT);'. 
\ format(**fmtspec) else: return notlt return notlt # ----------------------------------------------------------------------------- # lesser or equal def leq2(simd_ext, typ): if typ == 'f16': return f16_cmp2('le', simd_ext) notgt = '''return nsimd_notl_{simd_ext}_{typ}( nsimd_gt_{simd_ext}_{typ}({in0}, {in1}));'''. \ format(**fmtspec) if simd_ext in sse and typ in ['f32', 'f64']: return 'return _mm_cmple{suf}({in0}, {in1});'.format(**fmtspec) if simd_ext in avx and typ in ['f32', 'f64']: return 'return _mm256_cmp{suf}({in0}, {in1}, _CMP_LE_OQ);'. \ format(**fmtspec) if simd_ext in avx512: if typ in ['i32', 'i64', 'u32', 'u64']: return \ 'return _mm512_cmp_ep{typ}_mask({in0}, {in1}, _MM_CMPINT_LE);'. \ format(**fmtspec) if typ in ['f32', 'f64']: return 'return _mm512_cmp{suf}_mask({in0}, {in1}, _CMP_LE_OQ);'. \ format(**fmtspec) if simd_ext == 'avx512_skylake': return \ 'return _mm512_cmp_ep{typ}_mask({in0}, {in1}, _MM_CMPINT_LE);'. \ format(**fmtspec) else: return notgt return notgt # ----------------------------------------------------------------------------- # if_else1 function def if_else1(simd_ext, typ): if typ == 'f16': return '''nsimd_{simd_ext}_vf16 ret; ret.v0 = nsimd_if_else1_{simd_ext}_f32( {in0}.v0, {in1}.v0, {in2}.v0); ret.v1 = nsimd_if_else1_{simd_ext}_f32( {in0}.v1, {in1}.v1, {in2}.v1); return ret;'''.format(**fmtspec) manual = '''return nsimd_orb_{simd_ext}_{typ}( nsimd_andb_{simd_ext}_{typ}({in1}, {in0}), nsimd_andnotb_{simd_ext}_{typ}({in2}, {in0}));'''. \ format(**fmtspec) if simd_ext in sse: if simd_ext == 'sse42': return 'return _mm_blendv{fsuf}({in2}, {in1}, {in0});'. \ format(fsuf=suf_ep(typ) if typ in ['f32', 'f64'] else '_epi8', **fmtspec) else: return manual if simd_ext in avx: if typ in ['f32', 'f64']: return 'return _mm256_blendv{suf}({in2}, {in1}, {in0});'. \ format(**fmtspec) else: if simd_ext == 'avx2': return 'return _mm256_blendv_epi8({in2}, {in1}, {in0});'. 
\ format(**fmtspec) else: return manual if simd_ext in avx512: if typ in ['f32', 'f64', 'i32', 'u32', 'i64', 'u64']: return 'return _mm512_mask_blend{suf}({in0}, {in2}, {in1});'. \ format(**fmtspec) else: if simd_ext == 'avx512_skylake': return 'return _mm512_mask_blend{suf}({in0}, {in2}, {in1});'. \ format(**fmtspec) else: return '''int i; {typ} buf0[{le}], buf1[{le}]; _mm512_storeu_si512(buf0, {in1}); _mm512_storeu_si512(buf1, {in2}); for (i = 0; i < {le}; i++) {{ if ((({in0} >> i) & 1) == 0) {{ buf0[i] = buf1[i]; }} }} return _mm512_loadu_si512(buf0);'''.format(**fmtspec) # ----------------------------------------------------------------------------- # min and max functions def minmax(func, simd_ext, typ): if typ in ['f16', 'f32', 'f64']: return how_it_should_be_op2(func, simd_ext, typ) with_if_else = '''return nsimd_if_else1_{simd_ext}_{typ}( nsimd_gt_{simd_ext}_{typ}( {args}), {in0}, {in1});'''. \ format(args = '{in0}, {in1}'.format(**fmtspec) if func == 'max' else '{in1}, {in0}'.format(**fmtspec), **fmtspec) if simd_ext in sse: if typ in ['u8', 'i16']: return 'return _mm_{func}_ep{typ}({in0}, {in1});'. \ format(func=func, **fmtspec) if typ in ['i8', 'u16', 'i32', 'u32']: if simd_ext == 'sse42': return 'return _mm_{func}_ep{typ}({in0}, {in1});'. \ format(func=func, **fmtspec) else: return with_if_else if simd_ext in avx and typ in ['i8', 'u8', 'i16', 'u16', 'i32', 'u32']: if simd_ext == 'avx2': return 'return _mm256_{func}_ep{typ}({in0}, {in1});'. \ format(func=func, **fmtspec) else: return split_op2(func, simd_ext, typ) if simd_ext in avx512: if typ in ['i32', 'u32', 'i64', 'u64']: return 'return _mm512_{func}_ep{typ}({in0}, {in1});'. \ format(func=func, **fmtspec) else: if simd_ext == 'avx512_skylake': return 'return _mm512_{func}_ep{typ}({in0}, {in1});'. 
\ format(func=func, **fmtspec) else: return split_op2(func, simd_ext, typ) return with_if_else # ----------------------------------------------------------------------------- # sqrt def sqrt1(simd_ext, typ): if typ == 'f16': return '''nsimd_{simd_ext}_vf16 ret; ret.v0 = {pre}sqrt_ps({in0}.v0); ret.v1 = {pre}sqrt_ps({in0}.v1); return ret;'''.format(**fmtspec) return 'return {pre}sqrt{suf}({in0});'.format(**fmtspec) # ----------------------------------------------------------------------------- # Load logical def loadl(simd_ext, typ, aligned): if simd_ext in avx512: if typ == 'f16': return '''/* This can surely be improved but it is not our priority. Note that we take advantage of the fact that floating zero is represented as integer zero to simplify code. */ nsimd_{simd_ext}_vlf16 ret; __mmask32 tmp = nsimd_loadlu_{simd_ext}_u16((u16*){in0}); ret.v0 = (__mmask16)(tmp & 0xFFFF); ret.v1 = (__mmask16)((tmp >> 16) & 0xFFFF); return ret;'''.format(**fmtspec) return '''/* This can surely be improved but it is not our priority. */ int i; __mmask{le} ret = 0; for (i = 0; i < {le}; i++) {{ if ({in0}[i] != ({typ})0) {{ ret |= (__mmask{le})((__mmask{le})1 << i); }} }} return ret;'''.format(**fmtspec) return \ '''/* This can surely be improved but it is not our priority. */ return nsimd_notl_{simd_ext}_{typ}(nsimd_eq_{simd_ext}_{typ}( nsimd_load{align}_{simd_ext}_{typ}( {in0}), nsimd_set1_{simd_ext}_{typ}({zero})));'''. \ format(align='a' if aligned else 'u', zero = 'nsimd_f32_to_f16(0.0f)' if typ == 'f16' else '({})0'.format(typ), **fmtspec) # ----------------------------------------------------------------------------- # Store logical def storel(simd_ext, typ, aligned): if simd_ext in avx512: if typ == 'f16': return '''/* This can surely be improved but it is not our priority. Note that we take advantage of the fact that floating zero is represented as integer zero to simplify code. 
*/ int i; u16 one = 0x3C00; /* FP16 IEEE754 representation of 1 */ for (i = 0; i < 16; i++) {{ ((u16*){in0})[i] = (u16)((({in1}.v0 >> i) & 1) ? one : 0); }} for (i = 0; i < 16; i++) {{ ((u16*){in0})[i + 16] = (u16)((({in1}.v1 >> i) & 1) ? one : 0); }}'''.format(**fmtspec) return '''/* This can surely be improved but it is not our priority. */ int i; for (i = 0; i < {le}; i++) {{ {in0}[i] = ({typ})((({in1} >> i) & 1) ? 1 : 0); }}'''.format(**fmtspec) return \ '''/* This can surely be improved but it is not our priority. */ nsimd_store{align}_{simd_ext}_{typ}({in0}, nsimd_if_else1_{simd_ext}_{typ}({in1}, nsimd_set1_{simd_ext}_{typ}({one}), nsimd_set1_{simd_ext}_{typ}({zero})));'''. \ format(align = 'a' if aligned else 'u', one = 'nsimd_f32_to_f16(1.0f)' if typ == 'f16' else '({})1'.format(typ), zero = 'nsimd_f32_to_f16(0.0f)' if typ == 'f16' else '({})0'.format(typ), **fmtspec) # ----------------------------------------------------------------------------- # Absolute value def abs1(simd_ext, typ): def mask(typ): return '0x7F' + ('F' * int(((int(typ[1:]) - 8) // 4))) if typ == 'f16': return \ '''nsimd_{simd_ext}_vf16 ret; nsimd_{simd_ext}_vf32 mask = {pre}castsi{nbits}_ps( nsimd_set1_{simd_ext}_u32({mask})); ret.v0 = nsimd_andb_{simd_ext}_f32({in0}.v0, mask); ret.v1 = nsimd_andb_{simd_ext}_f32({in0}.v1, mask); return ret;'''.format(mask=mask('f32'), **fmtspec) if typ in ['u8', 'u16', 'u32', 'u64']: return 'return {in0};'.format(**fmtspec) if typ in ['f32', 'f64']: return \ '''nsimd_{simd_ext}_v{typ} mask = {pre}castsi{nbits}{suf}( nsimd_set1_{simd_ext}_u{typnbits}({mask})); return nsimd_andb_{simd_ext}_{typ}({in0}, mask);'''. \ format(mask=mask(typ), **fmtspec) bit_twiddling_arith_shift = \ '''nsimd_{simd_ext}_v{typ} mask = {pre}srai{suf}({in0}, {typnbitsm1}); return {pre}xor{sufsi}({pre}add{suf}({in0}, mask), mask);'''. 
\ format(typnbitsm1=int(typ[1:]) - 1, **fmtspec) bit_twiddling_no_arith_shift = \ '''nsimd_{simd_ext}_v{typ} mask = {pre}sub{suf}({pre}setzero{sufsi}(), nsimd_shr_{simd_ext}_{typ}( {in0}, {typnbitsm1})); return {pre}xor{sufsi}({pre}add{suf}({in0}, mask), mask);'''. \ format(typnbitsm1=int(typ[1:]) - 1, **fmtspec) with_blendv = \ '''return _mm256_castpd_si256(_mm256_blendv_pd( _mm256_castsi256_pd({in0}), _mm256_castsi256_pd(_mm256_sub_epi64(_mm256_setzero_si256(), {in0})), _mm256_castsi256_pd({in0})));'''.format(**fmtspec) if simd_ext in sse: if typ in ['i16', 'i32']: if simd_ext == 'sse42': return 'return _mm_abs{suf}({in0});'.format(**fmtspec) else: return bit_twiddling_arith_shift if typ == 'i8': if simd_ext == 'sse42': return 'return _mm_abs{suf}({in0});'.format(**fmtspec) else: return bit_twiddling_no_arith_shift if typ == 'i64': return bit_twiddling_no_arith_shift if simd_ext in avx: if typ in ['i8', 'i16', 'i32']: if simd_ext == 'avx2': return 'return _mm256_abs{suf}({in0});'.format(**fmtspec) else: return split_opn('abs', simd_ext, typ, 1) else: if simd_ext == 'avx2': return with_blendv else: return split_opn('abs', simd_ext, typ, 1) if simd_ext in avx512: if typ in ['i32', 'i64']: return 'return _mm512_abs{suf}({in0});'.format(**fmtspec) else: if simd_ext == 'avx512_skylake': return 'return _mm512_abs{suf}({in0});'.format(**fmtspec) else: return split_opn('abs', simd_ext, typ, 1) # ----------------------------------------------------------------------------- # FMA and FMS def fma_fms(func, simd_ext, typ): op = 'add' if func in ['fma', 'fnma'] else 'sub' neg = 'n' if func in ['fnma', 'fnms'] else '' if typ == 'f16': return \ '''nsimd_{simd_ext}_vf16 ret; ret.v0 = nsimd_{func}_{simd_ext}_f32({in0}.v0, {in1}.v0, {in2}.v0); ret.v1 = nsimd_{func}_{simd_ext}_f32({in0}.v1, {in1}.v1, {in2}.v1); return ret;'''.format(neg=neg, func=func, **fmtspec) if neg == '': emulate = '''return nsimd_{op}_{simd_ext}_{typ}( nsimd_mul_{simd_ext}_{typ}({in0}, {in1}), 
{in2});'''.format(op=op, **fmtspec) else: emulate = '''return nsimd_{op}_{simd_ext}_{typ}( nsimd_mul_{simd_ext}_{typ}( nsimd_neg_{simd_ext}_{typ}({in0}), {in1}), {in2});'''.format(op=op, **fmtspec) # One could use only emulate and no split. But to avoid splitting and # merging SIMD register for each operation: sub, mul and add, we use # emulation only for SIMD extensions that have natively add, sub and mul # intrinsics. split = split_opn(func, simd_ext, typ, 3) if typ in ['f32', 'f64']: if simd_ext in sse + avx: return '''#ifdef NSIMD_FMA return {pre}f{neg}m{op}{suf}({in0}, {in1}, {in2}); # else {emulate} # endif'''.format(op=op, neg=neg, emulate=emulate, **fmtspec) else: return 'return {pre}f{neg}m{op}{suf}({in0}, {in1}, {in2});'. \ format(op=op, neg=neg, **fmtspec) if simd_ext in avx: return emulate if simd_ext == 'avx2' else split if simd_ext in avx512: return emulate if simd_ext == 'avx512_skylake' else split return emulate # ----------------------------------------------------------------------------- # Ceil and floor def round1(opts, func, simd_ext, typ): if typ == 'f16': return '''nsimd_{simd_ext}_vf16 ret; ret.v0 = nsimd_{func}_{simd_ext}_f32({in0}.v0); ret.v1 = nsimd_{func}_{simd_ext}_f32({in0}.v1); return ret;'''.format(func=func, **fmtspec) if typ in ['f32', 'f64']: normal = 'return {pre}{func}{suf}({in0});'.format(func=func, **fmtspec) if simd_ext not in sse: return normal if simd_ext == 'sse42': return normal else: return emulate_op1(opts, func, simd_ext, typ) return 'return {in0};'.format(**fmtspec) # ----------------------------------------------------------------------------- # Trunc def trunc1(opts, simd_ext, typ): if typ == 'f16': return '''nsimd_{simd_ext}_vf16 ret; ret.v0 = nsimd_trunc_{simd_ext}_f32({in0}.v0); ret.v1 = nsimd_trunc_{simd_ext}_f32({in0}.v1); return ret;'''.format(**fmtspec) if typ in ['f32', 'f64']: normal = '''return {pre}round{suf}({in0}, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);'''.format(**fmtspec) if simd_ext == 'sse2': 
return emulate_op1(opts, 'trunc', simd_ext, typ) if simd_ext == 'sse42': return normal if simd_ext in avx: return normal if simd_ext in avx512: return \ '''__mmask{le} cond = nsimd_gt_{simd_ext}_{typ}( {in0}, _mm512_setzero{sufsi}()); return nsimd_if_else1_{simd_ext}_{typ}(cond, nsimd_floor_{simd_ext}_{typ}({in0}), nsimd_ceil_{simd_ext}_{typ}({in0}));'''. \ format(**fmtspec) return 'return {in0};'.format(**fmtspec) # ----------------------------------------------------------------------------- # Round to even def round_to_even1(opts, simd_ext, typ): if typ == 'f16': return '''nsimd_{simd_ext}_vf16 ret; ret.v0 = nsimd_round_to_even_{simd_ext}_f32({in0}.v0); ret.v1 = nsimd_round_to_even_{simd_ext}_f32({in0}.v1); return ret;'''.format(**fmtspec) if typ in ['f32', 'f64']: normal = '''return {pre}round{suf}({in0}, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);'''.format(**fmtspec) if simd_ext == 'sse2': return emulate_op1(opts, 'round_to_even', simd_ext, typ) if simd_ext == 'sse42': return normal if simd_ext in avx: return normal if simd_ext in avx512: return 'return _mm512_roundscale{suf}({in0}, 0);'.format(**fmtspec) return 'return {in0};'.format(**fmtspec) # ----------------------------------------------------------------------------- # All and any functions def all_any(func, simd_ext, typ): if typ == 'f16': return \ '''return nsimd_{func}_{simd_ext}_f32({in0}.v0) {and_or} nsimd_{func}_{simd_ext}_f32({in0}.v1);'''. \ format(func=func, and_or='&&' if func == 'all' else '||', **fmtspec) if simd_ext in sse: if typ in common.iutypes: return 'return (u32)_mm_movemask_epi8({in0}) {test};'. \ format(test='== 0xFFFF' if func == 'all' else '!= 0u', **fmtspec) else: mask = '0xF' if typ == 'f32' else '0x3' return 'return (u32)_mm_movemask{suf}({in0}) {test};'. \ format(test='== ' + mask if func == 'all' else '!= 0u', **fmtspec) if simd_ext in avx: if typ in common.iutypes: if simd_ext == 'avx2': return 'return _mm256_movemask_epi8({in0}) {test};'. 
\ format(test='== -1' if func == 'all' else '!= 0', **fmtspec) else: return \ '''nsimd_sse42_v{typ} lo = {extract_lo}; nsimd_sse42_v{typ} hi = {extract_hi}; return nsimd_{func}_sse42_{typ}(lo) {and_or} nsimd_{func}_sse42_{typ}(hi);'''. \ format(extract_lo=extract(simd_ext, typ, LO, common.in0), extract_hi=extract(simd_ext, typ, HI, common.in0), func=func, and_or='&&' if func == 'all' else '||', **fmtspec) else: mask = '0xFF' if typ == 'f32' else '0xF' return 'return _mm256_movemask{suf}({in0}) {test};'. \ format(test='== ' + mask if func == 'all' else '!= 0', **fmtspec) if simd_ext in avx512: all_test = '== 0x' + ('F' * int((512 // int(typ[1:]) // 4))) return 'return {in0} {test};'. \ format(test=all_test if func == 'all' else '!= 0', **fmtspec) # ----------------------------------------------------------------------------- # Reinterpret (bitwise_cast) def reinterpret1(simd_ext, from_typ, to_typ): if from_typ == to_typ: return 'return {in0};'.format(**fmtspec) if to_typ == 'f16': emulate = '''{from_typ} buf[{le}]; nsimd_storeu_{simd_ext}_{from_typ}(buf, {in0}); return nsimd_loadu_{simd_ext}_f16((f16*)buf);'''. \ format(**fmtspec) native = '''nsimd_{simd_ext}_vf16 ret; ret.v0 = {pre}cvtph_ps({extract_lo}); ret.v1 = {pre}cvtph_ps({extract_hi}); return ret;'''.format( extract_lo=extract(simd_ext, 'u16', LO, common.in0), extract_hi=extract(simd_ext, 'u16', HI, common.in0), **fmtspec) if simd_ext in sse: return \ '''#ifdef NSIMD_FP16 nsimd_{simd_ext}_vf16 ret; ret.v0 = _mm_cvtph_ps({in0}); {in0} = _mm_shuffle_epi32({in0}, 14); /* = (3 << 2) | (2 << 0) */ ret.v1 = _mm_cvtph_ps({in0}); return ret; #else {emulate} #endif'''.format(emulate=emulate, **fmtspec) if simd_ext in avx: return \ '''#ifdef NSIMD_FP16 {} #else {} #endif'''.format(native, emulate) if simd_ext in avx512: return native if from_typ == 'f16': emulate = \ '''u16 buf[{le}]; nsimd_storeu_{simd_ext}_f16((f16*)buf, {in0}); return nsimd_loadu_{simd_ext}_{to_typ}(({to_typ}*)buf);'''. 
\ format(**fmtspec) native = 'return {};'.format(setr(simd_ext, 'u16', '{pre}cvtps_ph({in0}.v0, 4)'.format(**fmtspec), '{pre}cvtps_ph({in0}.v1, 4)'.format(**fmtspec))) if simd_ext in sse: return \ '''#ifdef NSIMD_FP16 __m128i lo = _mm_cvtps_ph({in0}.v0, 4); __m128i hi = _mm_cvtps_ph({in0}.v1, 4); return _mm_castpd_si128(_mm_shuffle_pd( _mm_castsi128_pd(lo), _mm_castsi128_pd(hi), 0)); #else {emulate} #endif'''.format(emulate=emulate, **fmtspec) if simd_ext in avx: return \ '''#ifdef NSIMD_FP16 {} #else {} #endif'''.format(native, emulate) if simd_ext in avx512: return native if from_typ in common.iutypes and to_typ in common.iutypes: return 'return {in0};'.format(**fmtspec) if to_typ in ['f32', 'f64']: return 'return {pre}castsi{nbits}{to_suf}({in0});'. \ format(to_suf=suf_ep(to_typ), **fmtspec) if from_typ in ['f32', 'f64']: return 'return {pre}cast{from_suf}_si{nbits}({in0});'. \ format(from_suf=suf_ep(from_typ)[1:], **fmtspec) # ----------------------------------------------------------------------------- # Reinterpretl, i.e. reinterpret on logicals def reinterpretl1(simd_ext, from_typ, to_typ): if from_typ == to_typ: return 'return {in0};'.format(**fmtspec) if to_typ == 'f16': if simd_ext in sse: return \ '''nsimd_{simd_ext}_vlf16 ret; ret.v0 = _mm_castsi128_ps(_mm_unpacklo_epi16({in0}, {in0})); ret.v1 = _mm_castsi128_ps(_mm_unpackhi_epi16({in0}, {in0})); return ret;'''.format(**fmtspec) if simd_ext == 'avx': return \ '''nsimd_{simd_ext}_vlf16 ret; nsimd_sse42_vlf16 tmp1 = nsimd_reinterpretl_sse42_f16_{from_typ}( _mm256_castsi256_si128({in0})); nsimd_sse42_vlf16 tmp2 = nsimd_reinterpretl_sse42_f16_{from_typ}( _mm256_extractf128_si256({in0}, 1)); ret.v0 = {setr_tmp1}; ret.v1 = {setr_tmp2}; return ret;'''. 
\ format(setr_tmp1=setr('avx', 'f32', 'tmp1.v0', 'tmp1.v1'), setr_tmp2=setr('avx', 'f32', 'tmp2.v0', 'tmp2.v1'), **fmtspec) if simd_ext == 'avx2': return \ '''nsimd_{simd_ext}_vlf16 ret; ret.v0 = _mm256_castsi256_ps(_mm256_cvtepi16_epi32( _mm256_castsi256_si128({in0}))); ret.v1 = _mm256_castsi256_ps(_mm256_cvtepi16_epi32( _mm256_extractf128_si256({in0}, 1))); return ret;'''.format(**fmtspec) if simd_ext in avx512: return '''nsimd_{simd_ext}_vlf16 ret; ret.v0 = (__mmask16)({in0} & 0xFFFF); ret.v1 = (__mmask16)(({in0} >> 16) & 0xFFFF); return ret;'''.format(**fmtspec) if from_typ == 'f16': if simd_ext in sse + avx: return '''f32 in[{le}]; {to_typ} out[{le}]; int i; nsimd_storeu_{simd_ext}_f32(in, {in0}.v0); nsimd_storeu_{simd_ext}_f32(in + {leo2}, {in0}.v1); for (i = 0; i < {le}; i++) {{ out[i] = ({to_typ})(in[i] != 0.0f ? -1 : 0); }} return nsimd_loadu_{simd_ext}_{to_typ}(out);'''. \ format(leo2=int(fmtspec['le']) // 2, **fmtspec) if simd_ext in avx512: return \ 'return (__mmask32){in0}.v0 | ((__mmask32){in0}.v1 << 16);'. 
\ format(**fmtspec) if simd_ext in sse + avx: return reinterpret1(simd_ext, from_typ, to_typ) else: return 'return {in0};'.format(**fmtspec) # ----------------------------------------------------------------------------- # Convert def convert1(simd_ext, from_typ, to_typ): if to_typ == from_typ or \ to_typ in common.iutypes and from_typ in common.iutypes: return 'return {in0};'.format(**fmtspec) if to_typ == 'f16': if simd_ext in sse: getlo = '{in0}'.format(**fmtspec) gethi = '_mm_unpackhi_epi64({in0}, {in0})'.format(**fmtspec) if simd_ext in avx: getlo = '_mm256_castsi256_si128({in0})'.format(**fmtspec) gethi = '_mm256_extractf128_si256({in0}, 1)'.format(**fmtspec) if simd_ext in avx512: getlo = '_mm512_castsi512_si256({in0})'.format(**fmtspec) gethi = '_mm512_extracti64x4_epi64({in0}, 1)'.format(**fmtspec) through_epi32 = \ '''nsimd_{simd_ext}_v{to_typ} ret; ret.v0 = {pre}cvtepi32_ps({pre}cvtep{from_typ}_epi32({getlo})); ret.v1 = {pre}cvtepi32_ps({pre}cvtep{from_typ}_epi32({gethi})); return ret;'''.format(getlo=getlo, gethi=gethi, **fmtspec) emulate = '''{from_typ} in[{le}]; f32 out[{leo2}]; nsimd_{simd_ext}_vf16 ret; int i; nsimd_storeu_{simd_ext}_{from_typ}(in, {in0}); for (i = 0; i < {leo2}; i++) {{ out[i] = (f32)in[i]; }} ret.v0 = nsimd_loadu_{simd_ext}_f32(out); for (i = 0; i < {leo2}; i++) {{ out[i] = (f32)in[i + {leo2}]; }} ret.v1 = nsimd_loadu_{simd_ext}_f32(out); return ret;'''.format(leo2=int(fmtspec['le']) // 2, **fmtspec) if simd_ext in ['sse42', 'avx2']: return through_epi32 if simd_ext in ['sse2', 'avx']: return emulate if simd_ext in avx512: return through_epi32 if from_typ == 'f16': return '''f32 in[{leo2}]; {to_typ} out[{le}]; int i; nsimd_storeu_{simd_ext}_f32(in, {in0}.v0); for (i = 0; i < {leo2}; i++) {{ out[i] = ({to_typ})in[i]; }} nsimd_storeu_{simd_ext}_f32(in, {in0}.v1); for (i = 0; i < {leo2}; i++) {{ out[i + {leo2}] = ({to_typ})in[i]; }} return nsimd_loadu_{simd_ext}_{to_typ}(out);'''. 
\ format(leo2=int(fmtspec['le']) // 2, **fmtspec) emulate = '''{from_typ} in[{le}]; {to_typ} out[{le}]; int i; nsimd_storeu_{simd_ext}_{from_typ}(in, {in0}); for (i = 0; i < {le}; i++) {{ out[i] = ({to_typ})in[i]; }} return nsimd_loadu_{simd_ext}_{to_typ}(out);'''. \ format(**fmtspec) if to_typ == 'f64' or from_typ == 'f64': if simd_ext == 'avx512_skylake': return 'return _mm512_cvt{from_suf}{to_suf}({in0});'. \ format(from_suf=suf_ep(from_typ)[1:], to_suf=suf_ep(to_typ), **fmtspec) else: return emulate if to_typ == 'f32' and from_typ == 'i32': return 'return {pre}cvtepi32_ps({in0});'.format(**fmtspec) if to_typ == 'f32' and from_typ == 'u32': if simd_ext in sse + avx: return emulate if simd_ext in avx512: return 'return _mm512_cvtepu32_ps({in0});'.format(**fmtspec) if to_typ == 'i32' and from_typ == 'f32': return 'return {pre}cvtps_epi32({in0});'.format(**fmtspec) if to_typ == 'u32' and from_typ == 'f32': if simd_ext in sse + avx: return emulate if simd_ext in avx512: return 'return _mm512_cvtps_epu32({in0});'.format(**fmtspec) # ----------------------------------------------------------------------------- # Reciprocal (at least 11 bits of precision) def rec11_rsqrt11(func, simd_ext, typ): if typ == 'f16': return '''nsimd_{simd_ext}_vf16 ret; ret.v0 = nsimd_{func}11_{simd_ext}_f32({in0}.v0); ret.v1 = nsimd_{func}11_{simd_ext}_f32({in0}.v1); return ret;'''. \ format(func='rec' if func == 'rcp' else 'rsqrt', **fmtspec) if typ == 'f32': if simd_ext in sse + avx: return 'return {pre}{func}_ps({in0});'.format(func=func, **fmtspec) if simd_ext in avx512: return 'return _mm512_{func}14_ps({in0});'. \ format(func=func, **fmtspec) if typ == 'f64': if simd_ext in sse + avx: one = '{pre}set1_pd(1.0)'.format(**fmtspec) if func == 'rcp': return 'return {pre}div{suf}({one}, {in0});'.format(one=one, **fmtspec) else: return 'return {pre}div{suf}({one}, {pre}sqrt{suf}({in0}));'. 
\ format(one=one, **fmtspec) format(func=func, **fmtspec) if simd_ext in avx512: return 'return _mm512_{func}14_pd({in0});'. \ format(func=func, **fmtspec) # ----------------------------------------------------------------------------- # Reciprocal (IEEE) def rec1(simd_ext, typ): one = '{pre}set1_ps(1.0f)'.format(**fmtspec) if typ in ['f16', 'f32'] \ else '{pre}set1_pd(1.0)'.format(**fmtspec) if typ == 'f16': return '''nsimd_{simd_ext}_vf16 ret; nsimd_{simd_ext}_vf32 one = {one}; ret.v0 = {pre}div_ps(one, {in0}.v0); ret.v1 = {pre}div_ps(one, {in0}.v1); return ret;'''.format(one=one, **fmtspec) return 'return {pre}div{suf}({one}, {in0});'.format(one=one, **fmtspec) # ----------------------------------------------------------------------------- # Negative def neg1(simd_ext, typ): cte = '0x80000000' if typ in ['f16', 'f32'] else '0x8000000000000000' fsuf = '_ps' if typ in ['f16', 'f32'] else '_pd' utyp = 'u32' if typ in ['f16', 'f32'] else 'u64' vmask = '{pre}castsi{nbits}{fsuf}(nsimd_set1_{simd_ext}_{utyp}({cte}))'. \ format(cte=cte, utyp=utyp, fsuf=fsuf, **fmtspec) if typ == 'f16': return '''nsimd_{simd_ext}_vf16 ret; nsimd_{simd_ext}_vf32 mask = {vmask}; ret.v0 = nsimd_xorb_{simd_ext}_f32(mask, {in0}.v0); ret.v1 = nsimd_xorb_{simd_ext}_f32(mask, {in0}.v1); return ret;'''.format(vmask=vmask, **fmtspec) if typ in ['f32', 'f64']: return 'return nsimd_xorb_{simd_ext}_{typ}({vmask}, {in0});'. \ format(vmask=vmask, **fmtspec) return '''return nsimd_sub_{simd_ext}_{typ}( {pre}setzero_si{nbits}(), {in0});'''. \ format(**fmtspec) # ----------------------------------------------------------------------------- # nbtrue def nbtrue1(simd_ext, typ): if typ == 'f16': return '''return nsimd_nbtrue_{simd_ext}_f32({in0}.v0) + nsimd_nbtrue_{simd_ext}_f32({in0}.v1);'''. \ format(**fmtspec) if typ in ['i8', 'u8']: code = 'return nsimd_popcnt32_((u32){pre}movemask_epi8({in0}));'. 
\ format(**fmtspec) elif typ in ['i16', 'u16']: code = 'return nsimd_popcnt32_((u32){pre}movemask_epi8({in0})) >> 1;'. \ format(**fmtspec) elif typ in ['i32', 'u32', 'i64', 'u64']: code = '''return nsimd_popcnt32_((u32){pre}movemask{fsuf}( {pre}castsi{nbits}{fsuf}({in0})));'''. \ format(fsuf='_ps' if typ in ['i32', 'u32'] else '_pd', **fmtspec) else: code = 'return nsimd_popcnt32_((u32){pre}movemask{suf}({in0}));'. \ format(**fmtspec) if simd_ext in sse: return code if simd_ext in avx: if typ in ['i32', 'u32', 'i64', 'u64', 'f32', 'f64']: return code else: if simd_ext == 'avx2': return code else: return \ '''return nsimd_nbtrue_sse42_{typ}( _mm256_castsi256_si128({in0})) + nsimd_nbtrue_sse42_{typ}( _mm256_extractf128_si256({in0}, 1));'''. \ format(**fmtspec) if simd_ext in avx512: return 'return nsimd_popcnt64_((u64){in0});'.format(**fmtspec) # ----------------------------------------------------------------------------- # reverse def reverse1(simd_ext, typ): # 8-bit int if typ in ['i8', 'u8']: if simd_ext == 'sse2': return '''{in0} = _mm_shufflehi_epi16({in0}, _MM_SHUFFLE(0,1,2,3)); {in0} = _mm_shufflelo_epi16({in0}, _MM_SHUFFLE(0,1,2,3)); {in0} = _mm_castpd_si128(_mm_shuffle_pd( _mm_castsi128_pd({in0}), _mm_castsi128_pd( {in0}), 1)); nsimd_{simd_ext}_v{typ} r0 = _mm_srli_epi16({in0}, 8); nsimd_{simd_ext}_v{typ} r1 = _mm_slli_epi16({in0}, 8); return _mm_or_si128(r0, r1);'''.format(**fmtspec) elif simd_ext == 'sse42': return '''nsimd_{simd_ext}_v{typ} mask = _mm_set_epi8( 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); return _mm_shuffle_epi8({in0}, mask);'''. 
\ format(**fmtspec) elif simd_ext == 'avx': return \ '''nsimd_sse42_v{typ} r0 = _mm_shuffle_epi8( _mm256_extractf128_si256({in0}, 0), _mm_set_epi8( 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)); nsimd_sse42_v{typ} r1 = _mm_shuffle_epi8( _mm256_extractf128_si256({in0}, 1), _mm_set_epi8( 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)); {in0} = _mm256_insertf128_si256({in0}, r0, 1); return _mm256_insertf128_si256({in0}, r1, 0);'''. \ format(**fmtspec) elif simd_ext == 'avx2': return \ '''{in0} = _mm256_shuffle_epi8({in0}, _mm256_set_epi8( 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31)); return _mm256_permute2x128_si256({in0}, {in0}, 1);'''. \ format(**fmtspec) # AVX-512F and above. else: return \ '''nsimd_avx2_v{typ} r0 = _mm512_extracti64x4_epi64({in0}, 0); nsimd_avx2_v{typ} r1 = _mm512_extracti64x4_epi64({in0}, 1); r0 = _mm256_shuffle_epi8(r0, _mm256_set_epi8( 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31)); r1 = _mm256_shuffle_epi8(r1, _mm256_set_epi8( 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31)); r0 = _mm256_permute2x128_si256(r0, r0, 1); r1 = _mm256_permute2x128_si256(r1, r1, 1); {in0} = _mm512_insertf64x4({in0}, r0, 1); return _mm512_insertf64x4({in0}, r1, 0);'''.format(**fmtspec) # 16-bit int elif typ in ['i16', 'u16']: if simd_ext == 'sse2': return '''{in0} = _mm_shufflehi_epi16( {in0}, _MM_SHUFFLE(0,1,2,3) ); {in0} = _mm_shufflelo_epi16( {in0}, _MM_SHUFFLE(0,1,2,3) ); return _mm_castpd_si128(_mm_shuffle_pd( _mm_castsi128_pd({in0}), _mm_castsi128_pd({in0}), 1));'''. 
\ format(**fmtspec) elif simd_ext == 'sse42': return \ '''return _mm_shuffle_epi8({in0}, _mm_set_epi8( 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14));'''.format(**fmtspec) elif simd_ext == 'avx': return \ '''nsimd_sse42_v{typ} r0 = _mm_shuffle_epi8( _mm256_extractf128_si256({in0}, 0), _mm_set_epi8( 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14)); nsimd_sse42_v{typ} r1 = _mm_shuffle_epi8( _mm256_extractf128_si256({in0}, 1), _mm_set_epi8( 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14)); {in0} = _mm256_insertf128_si256( {in0}, r0, 1); return _mm256_insertf128_si256({in0}, r1, 0);'''. \ format(**fmtspec) elif simd_ext == 'avx2': return \ '''{in0} = _mm256_shuffle_epi8({in0}, _mm256_set_epi8( 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14, 17, 16, 19, 18, 21, 20, 23, 22, 25, 24, 27, 26, 29, 28, 31, 30)); return _mm256_permute2x128_si256({in0}, {in0}, 1);'''. \ format(**fmtspec) # AVX-512F elif simd_ext == 'avx512_knl': return \ '''{in0} = _mm512_permutexvar_epi32(_mm512_set_epi32( 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15), {in0}); nsimd_{simd_ext}_v{typ} r0 = _mm512_srli_epi32({in0}, 16); nsimd_{simd_ext}_v{typ} r1 = _mm512_slli_epi32({in0}, 16); return _mm512_or_si512(r0, r1);'''.format(**fmtspec) # AVX-512F+BW (Skylake) + WORKAROUND GCC<=8 else: return \ '''return _mm512_permutexvar_epi16(_mm512_set_epi32( (0<<16) | 1, (2<<16) | 3, (4<<16) | 5, (6<<16) | 7, (8<<16) | 9, (10<<16) | 11, (12<<16) | 13, (14<<16) | 15, (16<<16) | 17, (18<<16) | 19, (20<<16) | 21, (22<<16) | 23, (24<<16) | 25, (26<<16) | 27, (28<<16) | 29, (30<<16) | 31), {in0} );'''.format(**fmtspec) # 32-bit int elif typ in ['i32', 'u32']: if simd_ext in ['sse2', 'sse42']: return 'return _mm_shuffle_epi32({in0}, _MM_SHUFFLE(0,1,2,3));'. 
\ format(**fmtspec) elif simd_ext == 'avx': return '''{in0} = _mm256_castps_si256(_mm256_shuffle_ps( _mm256_castsi256_ps({in0}), _mm256_castsi256_ps({in0}), _MM_SHUFFLE(0,1,2,3))); return _mm256_permute2f128_si256({in0}, {in0}, 1);'''. \ format(**fmtspec) elif simd_ext == 'avx2': return \ '''{in0} = _mm256_shuffle_epi32({in0}, _MM_SHUFFLE(0,1,2,3)); return _mm256_permute2x128_si256({in0}, {in0}, 1);'''. \ format(**fmtspec) else: return \ '''return _mm512_permutexvar_epi32(_mm512_set_epi32( 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15), {in0});'''. \ format(**fmtspec) elif typ in ['i64', 'u64']: if simd_ext in ['sse2', 'sse42']: return '''return _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd( {in0}), _mm_castsi128_pd({in0}), 1));'''. \ format(**fmtspec) elif simd_ext == 'avx': return '''{in0} = _mm256_castpd_si256( _mm256_shuffle_pd( _mm256_castsi256_pd({in0}), _mm256_castsi256_pd({in0}), (1<<2) | 1 ) ); return _mm256_permute2f128_si256({in0}, {in0}, 1);'''. \ format(**fmtspec) elif simd_ext == 'avx2': return '''return _mm256_permute4x64_epi64({in0}, _MM_SHUFFLE(0, 1, 2, 3));'''.format(**fmtspec) else: return '''return _mm512_permutexvar_epi64(_mm512_set_epi64( 0, 1, 2, 3, 4, 5, 6, 7), {in0});'''. \ format(**fmtspec) # 16-bit float elif typ == 'f16': return '''nsimd_{simd_ext}_vf16 ret; ret.v0 = nsimd_reverse_{simd_ext}_f32({in0}.v0); ret.v1 = nsimd_reverse_{simd_ext}_f32({in0}.v1); return ret;'''.format(**fmtspec) # 32-bit float elif typ == 'f32': if simd_ext in ['sse2', 'sse42']: return '''return _mm_shuffle_ps({in0}, {in0}, _MM_SHUFFLE(0, 1, 2, 3));'''.format(**fmtspec) elif simd_ext in ['avx', 'avx2']: return '''{in0} = _mm256_shuffle_ps({in0}, {in0}, _MM_SHUFFLE(0, 1, 2, 3)); return _mm256_permute2f128_ps({in0}, {in0}, 1);'''. 
\ format(**fmtspec) else: return \ '''return _mm512_permutexvar_ps(_mm512_set_epi32( 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15), {in0} );'''.format(**fmtspec) # 64-bit float else: if simd_ext in ['sse2', 'sse42']: return 'return _mm_shuffle_pd({in0}, {in0}, 1);'.format(**fmtspec) elif simd_ext == 'avx': return '''{in0} = _mm256_shuffle_pd({in0}, {in0}, (1<<2) | 1); return _mm256_permute2f128_pd({in0}, {in0}, 1);'''. \ format(**fmtspec) elif simd_ext == 'avx2': return '''return _mm256_permute4x64_pd({in0}, _MM_SHUFFLE(0, 1, 2, 3));'''.format(**fmtspec) else: return '''return _mm512_permute_mm512_set_epi64( 0, 1, 2, 3, 4, 5, 6, 7), {in0});'''. \ format(**fmtspec) # ----------------------------------------------------------------------------- # addv def addv(simd_ext, typ): if simd_ext in sse: if typ == 'f64': return \ '''return _mm_cvtsd_f64(_mm_add_pd({in0}, _mm_shuffle_pd({in0}, {in0}, 0x01)));'''. \ format(**fmtspec) elif typ == 'f32': return \ '''nsimd_{simd_ext}_vf32 tmp = _mm_add_ps({in0}, _mm_shuffle_ps( {in0}, {in0}, 0xb1)); return _mm_cvtss_f32(_mm_add_ps(tmp, _mm_shuffle_ps( tmp, tmp, 0x4e)));''' .format(**fmtspec) elif typ == 'f16': return \ '''nsimd_{simd_ext}_vf32 tmp0 = _mm_add_ps({in0}.v0, _mm_shuffle_ps({in0}.v0, {in0}.v0, 0xb1)); nsimd_{simd_ext}_vf32 tmp1 = _mm_add_ps({in0}.v1, _mm_shuffle_ps({in0}.v1, {in0}.v1, 0xb1)); return nsimd_f32_to_f16(_mm_cvtss_f32(_mm_add_ps( tmp0, _mm_shuffle_ps(tmp0, tmp0, 0x4e))) + _mm_cvtss_f32(_mm_add_ps(tmp1, _mm_shuffle_ps( tmp1, tmp1, 0x4e))));''' .format(**fmtspec) elif simd_ext in avx: if typ == 'f64': return \ '''__m128d tmp = _mm_add_pd(_mm256_extractf128_pd({in0}, 1), _mm256_extractf128_pd({in0}, 0)); return _mm_cvtsd_f64(_mm_add_pd(tmp, _mm_shuffle_pd( tmp, tmp, 0x01)));''' .format(**fmtspec) elif typ == 'f32': return \ '''__m128 tmp0 = _mm_add_ps(_mm256_extractf128_ps({in0}, 1), _mm256_extractf128_ps({in0}, 0)); __m128 tmp1 = _mm_add_ps(tmp0, _mm_shuffle_ps(tmp0, tmp0, 0xb1)); return 
_mm_cvtss_f32(_mm_add_ps(tmp1, _mm_shuffle_ps( tmp1, tmp1, 0x4e)));''' .format(**fmtspec) elif typ == 'f16': return \ '''__m128 tmp00 = _mm_add_ps(_mm256_extractf128_ps({in0}.v0, 1), _mm256_extractf128_ps({in0}.v0, 0)); __m128 tmp01 = _mm_add_ps(tmp00, _mm_shuffle_ps( tmp00, tmp00, 0xb1)); __m128 tmp10 = _mm_add_ps(_mm256_extractf128_ps({in0}.v1, 1), _mm256_extractf128_ps({in0}.v1, 0)); __m128 tmp11 = _mm_add_ps(tmp10, _mm_shuffle_ps( tmp10, tmp10, 0xb1)); return nsimd_f32_to_f16(_mm_cvtss_f32(_mm_add_ps( tmp01, _mm_shuffle_ps(tmp01, tmp01, 0x4e))) + _mm_cvtss_f32(_mm_add_ps(tmp11, _mm_shuffle_ps( tmp11, tmp11, 0x4e)))); ''' .format(**fmtspec) elif simd_ext in avx512: if typ == 'f64': return \ '''__m256d tmp0 = _mm256_add_pd(_mm512_extractf64x4_pd({in0}, 0), _mm512_extractf64x4_pd({in0}, 1)); __m128d tmp1 = _mm_add_pd(_mm256_extractf128_pd(tmp0, 1), _mm256_extractf128_pd(tmp0, 0)); return _mm_cvtsd_f64(_mm_add_pd(tmp1, _mm_shuffle_pd( tmp1, tmp1, 0x01)));''' .format(**fmtspec) elif typ == 'f32': return \ '''__m128 tmp0 = _mm_add_ps(_mm_add_ps(_mm512_extractf32x4_ps( {in0}, 0), _mm512_extractf32x4_ps({in0}, 1)), _mm_add_ps(_mm512_extractf32x4_ps({in0}, 2), _mm512_extractf32x4_ps({in0}, 3))); __m128 tmp1 = _mm_add_ps(tmp0, _mm_shuffle_ps( tmp0, tmp0, 0xb1)); return _mm_cvtss_f32(_mm_add_ps(tmp1, _mm_shuffle_ps( tmp1, tmp1, 0x4e)));''' .format(**fmtspec) elif typ == 'f16': return \ '''f32 res; __m128 tmp0 = _mm_add_ps( _mm_add_ps(_mm512_extractf32x4_ps({in0}.v0, 0), _mm512_extractf32x4_ps({in0}.v0, 1)), _mm_add_ps(_mm512_extractf32x4_ps({in0}.v0, 2), _mm512_extractf32x4_ps({in0}.v0, 3))); __m128 tmp1 = _mm_add_ps(tmp0, _mm_shuffle_ps( tmp0, tmp0, 0xb1)); res = _mm_cvtss_f32(_mm_add_ps(tmp1, _mm_shuffle_ps( tmp1, tmp1, 0x4e))); tmp0 = _mm_add_ps( _mm_add_ps(_mm512_extractf32x4_ps({in0}.v1, 0), _mm512_extractf32x4_ps({in0}.v1, 1)), _mm_add_ps(_mm512_extractf32x4_ps({in0}.v1, 2), _mm512_extractf32x4_ps({in0}.v1, 3))); tmp1 = _mm_add_ps(tmp0, _mm_shuffle_ps(tmp0, tmp0, 
0xb1)); return nsimd_f32_to_f16(res + _mm_cvtss_f32(_mm_add_ps( tmp1, _mm_shuffle_ps(tmp1, tmp1, 0x4e))));''' . \ format(**fmtspec) # ----------------------------------------------------------------------------- # upconvert def upcvt1(simd_ext, from_typ, to_typ): # From f16 is easy if from_typ == 'f16': if to_typ == 'f32': return \ '''nsimd_{simd_ext}_vf32x2 ret; ret.v0 = {in0}.v0; ret.v1 = {in0}.v1; return ret;'''.format(**fmtspec) else: return \ '''nsimd_{simd_ext}_v{to_typ}x2 ret; ret.v0 = nsimd_cvt_{simd_ext}_{to_typ}_f32({in0}.v0); ret.v1 = nsimd_cvt_{simd_ext}_{to_typ}_f32({in0}.v1); return ret;'''.format(**fmtspec) # To f16 is easy if to_typ == 'f16': return \ '''nsimd_{simd_ext}_vf16x2 ret; nsimd_{simd_ext}_v{iu}16x2 buf; buf = nsimd_upcvt_{simd_ext}_{iu}16_{iu}8({in0}); ret.v0 = nsimd_cvt_{simd_ext}_f16_{iu}16(buf.v0); ret.v1 = nsimd_cvt_{simd_ext}_f16_{iu}16(buf.v1); return ret;'''.format(iu=from_typ[0], **fmtspec) # For integer upcast, due to 2's complement representation # epi_epi : signed -> bigger signed # epi_epi : signed -> bigger unsigned # epu_epi : unsigned -> bigger signed # epu_epi : unsigned -> bigger unsigned if from_typ in common.iutypes: suf_epep = 'ep{ui}{typnbits}_epi{typnbits2}'. \ format(ui='u' if from_typ in common.utypes else 'i', typnbits2=str(int(fmtspec['typnbits']) * 2), **fmtspec) else: suf_epep = 'ps_pd' # compute lower half if simd_ext in sse: lower_half = '{in0}'.format(**fmtspec) else: lower_half = extract(simd_ext, from_typ, LO, fmtspec['in0']) # compute upper half if simd_ext in sse: if from_typ in common.iutypes: upper_half = '_mm_shuffle_epi32({in0}, 14 /* 2 | 3 */)'. 
\ format(**fmtspec) else: upper_half = '''{pre}castpd_ps({pre}shuffle_pd( {pre}castps_pd({in0}), {pre}castps_pd({in0}), 1))'''.format(**fmtspec) else: upper_half = extract(simd_ext, from_typ, HI, fmtspec['in0']) # When intrinsics are provided # for conversions integers <-> floating point, there is no intrinsics, so # we use cvt's if from_typ == 'i32' and to_typ == 'f64': with_intrinsic = \ '''nsimd_{simd_ext}_vf64x2 ret; ret.v0 = {pre}cvtepi32_pd({lower_half}); ret.v1 = {pre}cvtepi32_pd({upper_half}); return ret;'''.format(upper_half=upper_half, lower_half=lower_half, **fmtspec) elif (from_typ in common.iutypes and to_typ in common.iutypes) or \ (from_typ == 'f32' and to_typ == 'f64'): with_intrinsic = \ '''nsimd_{simd_ext}_v{to_typ}x2 ret; ret.v0 = {pre}cvt{suf_epep}({lower_half}); ret.v1 = {pre}cvt{suf_epep}({upper_half}); return ret;'''.format(upper_half=upper_half, lower_half=lower_half, suf_epep=suf_epep, **fmtspec) else: from_typ2 = from_typ[0] + str(int(fmtspec['typnbits']) * 2) if from_typ not in common.iutypes: # getting here means that from_typ=f32 and to_typ=f64 with_intrinsic = \ '''nsimd_{simd_ext}_vf64x2 ret; ret.v0 = nsimd_cvt_{simd_ext}_{to_typ}_f64({pre}cvtps_pd( {lower_half})); ret.v1 = nsimd_cvt_{simd_ext}_{to_typ}_f64({pre}cvtps_pd( {upper_half})); return ret;'''. \ format(upper_half=upper_half, lower_half=lower_half, from_typ2=from_typ2, suf_epep=suf_epep, **fmtspec) # When no intrinsic is given for going from integers to floating or # from floating to integer we can go through a cvt if to_typ in common.ftypes: int_float = \ '''nsimd_{simd_ext}_v{to_typ}x2 ret; nsimd_{simd_ext}_v{int_typ}x2 tmp; tmp = nsimd_upcvt_{simd_ext}_{int_typ}_{from_typ}({in0}); ret.v0 = nsimd_cvt_{simd_ext}_{to_typ}_{int_typ}(tmp.v0); ret.v1 = nsimd_cvt_{simd_ext}_{to_typ}_{int_typ}(tmp.v1); return ret;'''. 
\ format(int_typ=from_typ[0] + to_typ[1:], lower_half=lower_half, upper_half=upper_half, **fmtspec) else: int_float = \ '''return nsimd_upcvt_{simd_ext}_{to_typ}_{int_typ}( nsimd_cvt_{simd_ext}_{int_typ}_{from_typ}({in0}));'''. \ format(int_typ=to_typ[0] + from_typ[1:], lower_half=lower_half, upper_half=upper_half, **fmtspec) # When no intrinsic is given we can use the trick of falling back to # the lower SIMD extension split_trick = \ '''nsimd_{simd_ext}_v{to_typ}x2 ret; nsimd_{simd_ext2}_v{to_typ}x2 ret2; ret2 = nsimd_upcvt_{simd_ext2}_{to_typ}_{from_typ}({lo}); ret.v0 = {merge}; ret2 = nsimd_upcvt_{simd_ext2}_{to_typ}_{from_typ}({hi}); ret.v1 = {merge}; return ret;'''. \ format(simd_ext2='sse42' if simd_ext == 'avx' else 'avx2', lo=extract(simd_ext, from_typ, LO, common.in0), hi=extract(simd_ext, from_typ, HI, common.in0), merge=setr(simd_ext, to_typ, 'ret2.v0', 'ret2.v1'), **fmtspec) # return C code if from_typ == 'i32' and to_typ == 'f64': return with_intrinsic if (from_typ in common.ftypes and to_typ in common.iutypes) or \ (from_typ in common.iutypes and to_typ in common.ftypes): return int_float # if simd_ext == 'sse2': if simd_ext in sse: if from_typ in common.itypes and to_typ in common.iutypes: return \ '''nsimd_{simd_ext}_v{to_typ}x2 ret; __m128i mask = _mm_cmpgt{suf}(_mm_setzero_si128(), {in0}); ret.v0 = _mm_unpacklo{suf}({in0}, mask); ret.v1 = _mm_unpackhi{suf}({in0}, mask); return ret;'''.format(**fmtspec) elif from_typ in common.utypes and to_typ in common.iutypes: return \ '''nsimd_{simd_ext}_v{to_typ}x2 ret; ret.v0 = _mm_unpacklo{suf}({in0}, _mm_setzero_si128()); ret.v1 = _mm_unpackhi{suf}({in0}, _mm_setzero_si128()); return ret;'''.format(**fmtspec) else: return with_intrinsic # elif simd_ext == 'sse42': # return with_intrinsic elif simd_ext == 'avx': if from_typ == 'i32' and to_typ == 'f64': return with_intrinsic else: return split_trick elif simd_ext == 'avx2': return with_intrinsic elif simd_ext == 'avx512_knl': if from_typ in ['i16', 'u16', 
'i32', 'u32', 'f32']: return with_intrinsic else: return split_trick else: return with_intrinsic # ----------------------------------------------------------------------------- # downconvert def downcvt1(opts, simd_ext, from_typ, to_typ): # From f16 is easy if from_typ == 'f16': le_to_typ = int(fmtspec['le']) * 2 le_1f32 = le_to_typ // 4 le_2f32 = 2 * le_to_typ // 4 le_3f32 = 3 * le_to_typ // 4 cast = castsi(simd_ext, to_typ) return \ '''{to_typ} dst[{le_to_typ}]; f32 src[{le_to_typ}]; int i; {pre}storeu_ps(src, {in0}.v0); {pre}storeu_ps(src + {le_1f32}, {in0}.v1); {pre}storeu_ps(src + {le_2f32}, {in1}.v0); {pre}storeu_ps(src + {le_3f32}, {in1}.v1); for (i = 0; i < {le_to_typ}; i++) {{ dst[i] = ({to_typ})src[i]; }} return {pre}loadu_si{nbits}({cast}dst);'''. \ format(le_to_typ=le_to_typ, le_1f32=le_1f32, le_2f32=le_2f32, le_3f32=le_3f32, cast=cast, **fmtspec) # To f16 is easy if to_typ == 'f16': if from_typ == 'f32': return \ '''nsimd_{simd_ext}_vf16 ret; ret.v0 = {in0}; ret.v1 = {in1}; return ret;'''.format(**fmtspec) else: return \ '''nsimd_{simd_ext}_vf16 ret; ret.v0 = nsimd_cvt_{simd_ext}_f32_{from_typ}({in0}); ret.v1 = nsimd_cvt_{simd_ext}_f32_{from_typ}({in1}); return ret;'''.format(**fmtspec) # f64 --> f32 have intrinsics if from_typ == 'f64' and to_typ == 'f32': if simd_ext in sse: return '''return _mm_movelh_ps(_mm_cvtpd_ps({in0}), _mm_cvtpd_ps({in1}));'''. 
\ format(**fmtspec) else: return 'return {};'.format(setr(simd_ext, 'f32', '{pre}cvtpd_ps({in0})'.format(**fmtspec), '{pre}cvtpd_ps({in1})'.format(**fmtspec))) # integer conversions intrinsics are only available with AVX-512 if simd_ext in avx512: if (from_typ in ['i32', 'i64'] and to_typ in common.itypes) or \ (simd_ext == 'avx512_skylake' and from_typ == 'i16' and \ to_typ == 'i8'): return 'return {};'.format(setr(simd_ext, to_typ, '{pre}cvtep{from_typ}_ep{to_typ}({in0})'.format(**fmtspec), '{pre}cvtep{from_typ}_ep{to_typ}({in1})'.format(**fmtspec))) elif from_typ == 'i64' and to_typ == 'f32': return 'return nsimd_cvt_{simd_ext}_f32_i32({});'. \ format(setr(simd_ext, from_typ, '{pre}cvtepi64_epi32({in0})'.format(**fmtspec), '{pre}cvtepi64_epi32({in1})'.format(**fmtspec)), **fmtspec) # and then emulation le_to_typ = 2 * int(fmtspec['le']) cast_src = '(__m{nbits}i *)'.format(**fmtspec) \ if from_typ in common.iutypes else '' cast_dst = '(__m{nbits}i *)'.format(**fmtspec) \ if to_typ in common.iutypes else '' return \ '''{to_typ} dst[{le_to_typ}]; {from_typ} src[{le_to_typ}]; int i; {pre}storeu{sufsi}({cast_src}src, {in0}); {pre}storeu{sufsi}({cast_src}(src + {le}), {in1}); for (i = 0; i < {le_to_typ}; i++) {{ dst[i] = ({to_typ})src[i]; }} return {pre}loadu{sufsi_to_typ}({cast_dst}dst);'''. 
\ format(cast_src=cast_src, cast_dst=cast_dst, le_to_typ=le_to_typ, sufsi_to_typ=suf_si(simd_ext, to_typ), **fmtspec) # ----------------------------------------------------------------------------- # adds / subs helper def adds_subs_intrinsic_instructions_i8_i16_u8_u16(which_op, simd_ext, typ): valid_types = ('i8', 'i16', 'u8', 'u16') if typ not in valid_types: raise TypeError( '''def adds_subs_intrinsic_instructions_i8_i16_u8_u16(...): {typ} must belong to the following types set: {valid_types}'''.\ format(typ=typ, valid_types=valid_types) ) if 'sse2' in simd_ext or 'sse42' in simd_ext: return''' return _mm_{which_op}_ep{typ}({in0}, {in1}); '''.format(which_op=which_op, **fmtspec) if 'avx' == simd_ext: return split_opn(which_op, simd_ext, typ, 2) if simd_ext in ('avx2', 'avx512_skylake'): return 'return {pre}{which_op}_ep{typ}({in0}, {in1});'. \ format(which_op=which_op, **fmtspec) if 'avx512_knl' == simd_ext: return split_opn(which_op, simd_ext, typ, 2) def get_avx512_sse2_i32_i64_dependent_code(simd_ext, typ): if 'avx512' in simd_ext or 'sse2' in simd_ext: mask_processing = \ '''/* For avx512/sse2 */ const nsimd_{simd_ext}_vu{typnbits} mask_strong_bit = nsimd_shr_{simd_ext}_u{typnbits}( mask, sizeof(u{typnbits}) * CHAR_BIT - 1); const nsimd_{simd_ext}_vi{typnbits} imask_strong_bit = nsimd_reinterpret_{simd_ext}_i{typnbits}_u{typnbits}( mask_strong_bit); const nsimd_{simd_ext}_vli{typnbits} limask_strong_bit = nsimd_to_logical_{simd_ext}_i{typnbits}(imask_strong_bit);'''. \ format(**fmtspec) if_else = \ '''/* For avx512/sse2 */ return nsimd_if_else1_{simd_ext}_i{typnbits}( limask_strong_bit, ires, i_max_min);'''. 
\ format(**fmtspec) else: mask_processing = '/* Before avx512: is_same(__m128i, ' \ 'vector, vector, ' \ 'vector) */' suf2 = 'ps' if typ in ['i32', 'u32'] else 'pd' if_else = '''return {pre}cast{suf2}_si{nbits}({pre}blendv_{suf2}( {pre}castsi{nbits}_{suf2}(i_max_min), {pre}castsi{nbits}_{suf2}(ires), {pre}castsi{nbits}_{suf2}(mask))); '''.format(suf2=suf2, **fmtspec) return { 'mask_processing': mask_processing, 'if_else': if_else } # ----------------------------------------------------------------------------- # adds def adds(simd_ext, typ): if typ in common.ftypes: return 'return nsimd_add_{simd_ext}_{typ}({in0}, {in1});'. \ format(**fmtspec) if typ in ('i8', 'i16', 'u8', 'u16'): return adds_subs_intrinsic_instructions_i8_i16_u8_u16( 'adds', simd_ext, typ) if typ in common.utypes: return \ '''/* Algo pseudo code: */ /* ures = a + b */ /* if overflow then ures < a && ures < b */ /* --> test against a single value: if(ures < a){{ overflow ; }} */ /* return ures < a ? {type_max} : ures */ const nsimd_{simd_ext}_v{typ} ures = nsimd_add_{simd_ext}_{typ}({in0}, {in1}); const nsimd_{simd_ext}_v{typ} type_max = nsimd_set1_{simd_ext}_{typ}(({typ}){type_max}); return nsimd_if_else1_{simd_ext}_{typ}( nsimd_lt_{simd_ext}_{typ}(ures, {in0}), type_max, ures);'''. \ format(type_max=common.limits[typ]['max'], **fmtspec) avx512_sse2_i32_i64_dependent_code = \ get_avx512_sse2_i32_i64_dependent_code(simd_ext, typ) return \ '''/* Algo pseudo code: */ /* if ( ( same_sign(ux, uy) && same_sign(uy, res) ) || */ /* ! 
same_sign(ux, uy) ): */ /* neither overflow nor underflow happened */ /* else: */ /* if(ux > 0 && uy > 0): res = MAX // overflow */ /* else: res = MIN // underflow */ /* Step 1: reinterpret to unsigned to work with the bits */ nsimd_{simd_ext}_vu{typnbits} ux = nsimd_reinterpret_{simd_ext}_u{typnbits}_i{typnbits}({in0}); const nsimd_{simd_ext}_vu{typnbits} uy = nsimd_reinterpret_{simd_ext}_u{typnbits}_i{typnbits}({in1}); const nsimd_{simd_ext}_vu{typnbits} ures = nsimd_add_{simd_ext}_u{typnbits}(ux, uy); /* Step 2: check signs different: ux, uy, res */ /* xor_ux_uy's most significant bit will be zero if both ux and */ /* uy have same sign */ const nsimd_{simd_ext}_vu{typnbits} xor_ux_uy = nsimd_xorb_{simd_ext}_u{typnbits}(ux, uy); /* xor_uy_res's most significant bit will be zero if both uy and */ /* ures have same sign */ const nsimd_{simd_ext}_vu{typnbits} xor_uy_res = nsimd_xorb_{simd_ext}_u{typnbits}(uy, ures); /* Step 3: Construct the MIN/MAX vector */ /* Pseudo code: */ /* Both positive --> overflow possible */ /* --> get the MAX: */ /* (signed)ux >= 0 && (signed)uy >= 0 */ /* <=> ((unsigned)ux | (unsigned)uy) >> 31 == 0 */ /* --> MAX + ( (ux | uy) >> 31 ) == MAX + 0 == MAX */ /* At least one negative */ /* --> overflow not possible / underflow possible if both negative */ /* --> get the MIN: */ /* unsigned tmp = (unsigned)MAX + */ /* ( ( (ux | uy) >> 31 ) == (unsigned)MAX + 1 ) */ /* --> MIN = (reinterpret signed)tmp */ /* ux | uy */ const nsimd_{simd_ext}_vu{typnbits} ux_uy_orb = nsimd_orb_{simd_ext}_u{typnbits}(ux, uy); /* (ux | uy) >> 31 --> Vector of 0's and 1's */ const nsimd_{simd_ext}_vu{typnbits} u_zeros_ones = nsimd_shr_{simd_ext}_u{typnbits}( ux_uy_orb, sizeof(u{typnbits}) * CHAR_BIT - 1); /* MIN/MAX vector */ /* i{typnbits} tmp = sMAX + 1 --> undefined behavior */ /* u{typnbits} tmp = (u{typnbits})sMAX + 1 */ /* i{typnbits} sMIN = *(i{typnbits}*)(&tmp) */ const nsimd_{simd_ext}_vu{typnbits} u_max = 
nsimd_set1_{simd_ext}_u{typnbits}((u{typnbits}){type_max}); const nsimd_{simd_ext}_vu{typnbits} u_max_min = nsimd_add_{simd_ext}_u{typnbits}(u_max, u_zeros_ones); const nsimd_{simd_ext}_vi{typnbits} i_max_min = nsimd_reinterpret_{simd_ext}_i{typnbits}_u{typnbits}(u_max_min); /* Step 4: Construct the mask vector */ /* mask == ( 8ot_same_sign(ux, uy) || same_sign(uy, res) ) */ /* mask: True (no underflow/overflow) / False (underflow/overflow) */ /* mask = xor_ux_uy | ~ xor_uy_res */ const nsimd_{simd_ext}_vu{typnbits} not_xor_uy_res = nsimd_notb_{simd_ext}_u{typnbits}(xor_uy_res); const nsimd_{simd_ext}_vu{typnbits} mask = nsimd_orb_{simd_ext}_u{typnbits}(xor_ux_uy, not_xor_uy_res); {avx512_sse2_dependent_mask_processing} /* Step 5: Apply the Mask */ const nsimd_{simd_ext}_vi{typnbits} ires = nsimd_reinterpret_{simd_ext}_i{typnbits}_u{typnbits}(ures); {avx512_sse2_dependent_if_else}'''. \ format(type_max = common.limits[typ]['max'], avx512_sse2_dependent_mask_processing = \ avx512_sse2_i32_i64_dependent_code['mask_processing'], avx512_sse2_dependent_if_else = \ avx512_sse2_i32_i64_dependent_code['if_else'], **fmtspec) # ----------------------------------------------------------------------------- # subs def subs(simd_ext, typ): if typ in common.ftypes: return 'return nsimd_sub_{simd_ext}_{typ}({in0}, {in1});'. 
\ format(**fmtspec) if typ in ('i8', 'i16', 'u8', 'u16'): return adds_subs_intrinsic_instructions_i8_i16_u8_u16( 'subs', simd_ext, typ) if typ in common.itypes: return 'return nsimd_adds_{simd_ext}_{typ}({in0}, ' \ 'nsimd_neg_{simd_ext}_{typ}({in1}));'.format(**fmtspec) min_ = common.limits[typ]['min'] return \ '''/* Algo pseudo code: */ /* unsigned only */ /* a > 0; b > 0 ==> a - b --> possibility for underflow only */ /* if b > a --> underflow */ const nsimd_{simd_ext}_v{typ} ures = nsimd_sub_{simd_ext}_{typ}({in0}, {in1}); const nsimd_{simd_ext}_vl{typ} is_underflow = nsimd_gt_{simd_ext}_{typ}({in1}, {in0}); const nsimd_{simd_ext}_v{typ} umin = nsimd_set1_{simd_ext}_{typ}(({typ}){min_}); return nsimd_if_else1_{simd_ext}_{typ}(is_underflow, umin, ures);'''. \ format(min_=min_, **fmtspec) # ----------------------------------------------------------------------------- # to_mask def to_mask1(simd_ext, typ): if typ == 'f16': return '''nsimd_{simd_ext}_vf16 ret; ret.v0 = nsimd_to_mask_{simd_ext}_f32({in0}.v0); ret.v1 = nsimd_to_mask_{simd_ext}_f32({in0}.v1); return ret;'''.format(**fmtspec) if simd_ext in sse + avx: return 'return {in0};'.format(**fmtspec) elif simd_ext == 'avx512_skylake': if typ in common.iutypes: return 'return _mm512_movm_epi{typnbits}({in0});'. \ format(**fmtspec) elif typ in ['f32', 'f64']: return '''return _mm512_castsi512{suf}( _mm512_movm_epi{typnbits}({in0}));'''. \ format(**fmtspec) else: if typ in ['i32', 'u32', 'i64', 'u64']: return '''return _mm512_mask_mov{suf}(_mm512_setzero_si512(), {in0}, _mm512_set1_epi32(-1));'''. \ format(**fmtspec) elif typ in ['f32', 'f64']: return '''return _mm512_mask_mov{suf}(_mm512_castsi512{suf}( _mm512_setzero_si512()), {in0}, _mm512_castsi512{suf}( _mm512_set1_epi32(-1)));'''. 
\ format(**fmtspec) else: return '''{typ} buf[{le}]; int i; for (i = 0; i < {le}; i++) {{ if (({in0} >> i) & 1) {{ buf[i] = ({typ})-1; }} else {{ buf[i] = ({typ})0; }} }} return _mm512_loadu_si512(buf);'''.format(**fmtspec) # ----------------------------------------------------------------------------- # to_logical def to_logical1(simd_ext, typ): if typ in common.iutypes: return '''return nsimd_ne_{simd_ext}_{typ}( {in0}, {pre}setzero{sufsi}());'''.format(**fmtspec) elif typ in ['f32', 'f64']: return '''return nsimd_reinterpretl_{simd_ext}_{typ}_{utyp}( nsimd_ne_{simd_ext}_{utyp}( {pre}cast{suf2}_si{nbits}({in0}), {pre}setzero_si{nbits}()));'''. \ format(suf2=suf_si(simd_ext, typ)[1:], utyp='u{}'.format(fmtspec['typnbits']), **fmtspec) else: return '''nsimd_{simd_ext}_vlf16 ret; ret.v0 = nsimd_to_logical_{simd_ext}_f32({in0}.v0); ret.v1 = nsimd_to_logical_{simd_ext}_f32({in0}.v1); return ret;'''.format(**fmtspec) # ----------------------------------------------------------------------------- # zip functions def zip_half(func, simd_ext, typ): simd_ext2 = 'sse42' if simd_ext in avx else 'avx2' if simd_ext in sse: if typ == 'f16': return '''nsimd_{simd_ext}_v{typ} ret; ret.v0 = _mm_unpacklo_ps({in0}.v{k}, {in1}.v{k}); ret.v1 = _mm_unpackhi_ps({in0}.v{k}, {in1}.v{k}); return ret;'''. \ format(k='0' if func == 'ziplo' else '1', **fmtspec) else: return 'return {pre}unpack{lo}{suf}({in0}, {in1});'. \ format(lo='lo' if func == 'ziplo' else 'hi', **fmtspec) elif simd_ext in avx: # Currently, 256 and 512 bits vectors are splitted into 128 bits # vectors in order to perform the ziplo/hi operation using the # unpacklo/hi sse operations. 
if typ == 'f16': in0vk = '{in0}.v{k}'.format(k='0' if func == 'ziplo' else '1', **fmtspec) in1vk = '{in1}.v{k}'.format(k='0' if func == 'ziplo' else '1', **fmtspec) return \ '''nsimd_{simd_ext}_v{typ} ret; __m128 v_tmp0 = {get_low_in0vk}; __m128 v_tmp1 = {get_low_in1vk}; __m128 v_tmp2 = {get_high_in0vk}; __m128 v_tmp3 = {get_high_in1vk}; __m128 vres_lo0 = _mm_unpacklo_ps(v_tmp0, v_tmp1); __m128 vres_hi0 = _mm_unpackhi_ps(v_tmp0, v_tmp1); ret.v0 = {merge0}; __m128 vres_lo1 = _mm_unpacklo_ps(v_tmp2, v_tmp3); __m128 vres_hi1 = _mm_unpackhi_ps(v_tmp2, v_tmp3); ret.v1 = {merge1}; return ret; '''.format(get_low_in0vk=extract(simd_ext, 'f32', LO, in0vk), get_low_in1vk=extract(simd_ext, 'f32', LO, in1vk), get_high_in0vk=extract(simd_ext, 'f32', HI, in0vk), get_high_in1vk=extract(simd_ext, 'f32', HI, in1vk), merge0=setr(simd_ext, 'f32', 'vres_lo0', 'vres_hi0'), merge1=setr(simd_ext, 'f32', 'vres_lo1', 'vres_hi1'), **fmtspec) else: hl = LO if func == 'ziplo' else HI return \ '''{nat} v_tmp0 = {half_in0}; {nat} v_tmp1 = {half_in1}; {nat} vres_lo = _mm_unpacklo{suf}(v_tmp0, v_tmp1); {nat} vres_hi = _mm_unpackhi{suf}(v_tmp0, v_tmp1); return {merge}; '''.format(nat=get_native_typ(simd_ext2, typ), half_in0=extract(simd_ext, typ, hl, common.in0), half_in1=extract(simd_ext, typ, hl, common.in1), merge=setr(simd_ext, typ, 'vres_lo', 'vres_hi'), **fmtspec) else: if typ == 'f16': return \ '''nsimd_{simd_ext}_v{typ} ret; __m512 v0 = {in0}.v{k}; __m512 v1 = {in1}.v{k}; __m256 v_tmp0, v_tmp1, vres_lo, vres_hi; /* Low part */ v_tmp0 = {low_v0}; v_tmp1 = {low_v1}; vres_lo = nsimd_ziplo_avx2_f32(v_tmp0, v_tmp1); vres_hi = nsimd_ziphi_avx2_f32(v_tmp0, v_tmp1); ret.v0 = {merge}; /* High part */ v_tmp0 = {high_v0}; v_tmp1 = {high_v1}; vres_lo = nsimd_ziplo_avx2_f32(v_tmp0, v_tmp1); vres_hi = nsimd_ziphi_avx2_f32(v_tmp0, v_tmp1); ret.v1 = {merge}; return ret;'''. 
\ format(k='0' if func == 'ziplo' else '1', low_v0=extract(simd_ext, 'f32', LO, 'v0'), low_v1=extract(simd_ext, 'f32', LO, 'v1'), high_v0=extract(simd_ext, 'f32', HI, 'v0'), high_v1=extract(simd_ext, 'f32', HI, 'v1'), merge=setr(simd_ext, 'f32', 'vres_lo', 'vres_hi'), **fmtspec) else: hl = LO if func == 'ziplo' else HI return \ '''{nat} v_tmp0, v_tmp1; v_tmp0 = {half_in0}; v_tmp1 = {half_in1}; {nat} vres_lo = nsimd_ziplo_avx2_{typ}(v_tmp0, v_tmp1); {nat} vres_hi = nsimd_ziphi_avx2_{typ}(v_tmp0, v_tmp1); return {merge};'''. \ format(nat=get_native_typ(simd_ext2, typ), half_in0=extract(simd_ext, typ, hl, common.in0), half_in1=extract(simd_ext, typ, hl, common.in1), merge=setr(simd_ext, typ, 'vres_lo', 'vres_hi'), **fmtspec) def zip(simd_ext, typ): return '''nsimd_{simd_ext}_v{typ}x2 ret; ret.v0 = nsimd_ziplo_{simd_ext}_{typ}({in0}, {in1}); ret.v1 = nsimd_ziphi_{simd_ext}_{typ}({in0}, {in1}); return ret; '''.format(**fmtspec) # ----------------------------------------------------------------------------- # unzip functions def unzip_half(opts, func, simd_ext, typ): loop = '''{typ} tab[{lex2}]; {typ} res[{le}]; int i; nsimd_storeu_{simd_ext}_{typ}(tab, {in0}); nsimd_storeu_{simd_ext}_{typ}(tab + {le}, {in1}); for(i = 0; i < {le}; i++) {{ res[i] = tab[2 * i + {offset}]; }} return nsimd_loadu_{simd_ext}_{typ}(res); '''.format(lex2=2 * int(fmtspec['le']), offset='0' if func == 'unziplo' else '1', **fmtspec) if simd_ext in sse: if typ in ['f32', 'i32', 'u32']: v0 = ('_mm_castsi128_ps({in0})' if typ in ['i32', 'u32'] \ else '{in0}').format(**fmtspec) v1 = ('_mm_castsi128_ps({in1})' if typ in ['i32', 'u32'] \ else '{in1}').format(**fmtspec) ret = ('_mm_castps_si128(v_res)' if typ in ['i32', 'u32'] \ else 'v_res').format(**fmtspec) return '''__m128 v_res; v_res = _mm_shuffle_ps({v0}, {v1}, {mask}); return {ret};'''.format( mask='_MM_SHUFFLE(2, 0, 2, 0)' if func == 'unziplo' \ else '_MM_SHUFFLE(3, 1, 3, 1)', v0=v0, v1=v1, ret=ret, **fmtspec) elif typ == 'f16': return \ 
'''nsimd_{simd_ext}_v{typ} v_res; v_res.v0 = _mm_shuffle_ps({in0}.v0, {in0}.v1, {mask}); v_res.v1 = _mm_shuffle_ps({in1}.v0, {in1}.v1, {mask}); return v_res;'''.format(mask='_MM_SHUFFLE(2, 0, 2, 0)' \ if func == 'unziplo' \ else '_MM_SHUFFLE(3, 1, 3, 1)', **fmtspec) elif typ in ['f64', 'i64', 'u64']: v0 = ('_mm_castsi128_pd({in0})' if typ in ['i64', 'u64'] \ else '{in0}').format(**fmtspec) v1 = ('_mm_castsi128_pd({in1})' if typ in ['i64', 'u64'] \ else '{in1}').format(**fmtspec) ret = ('_mm_castpd_si128(v_res)' if typ in ['i64', 'u64'] \ else 'v_res').format(**fmtspec) return '''__m128d v_res; v_res = _mm_shuffle_pd({v0}, {v1}, {mask}); return {ret}; '''.format(mask='0' if func == 'unziplo' else '3', v0=v0, v1=v1, ret=ret, **fmtspec) elif typ in ['i16', 'u16']: return '''__m128i v_tmp0 = _mm_shufflelo_epi16( {in0}, _MM_SHUFFLE(3, 1, 2, 0)); v_tmp0 = _mm_shufflehi_epi16(v_tmp0, _MM_SHUFFLE(3, 1, 2, 0)); __m128i v_tmp1 = _mm_shufflelo_epi16({in1}, _MM_SHUFFLE(3, 1, 2, 0)); v_tmp1 = _mm_shufflehi_epi16(v_tmp1, _MM_SHUFFLE(3, 1, 2, 0)); __m128 v_res = _mm_shuffle_ps(_mm_castsi128_ps(v_tmp0), _mm_castsi128_ps(v_tmp1), {mask}); return _mm_castps_si128(v_res); '''.format(mask='_MM_SHUFFLE(2, 0, 2, 0)' \ if func == 'unziplo' \ else '_MM_SHUFFLE(3, 1, 3, 1)', **fmtspec) else: return loop elif simd_ext in avx: ret_template = \ '''v_tmp0 = _mm256_permute2f128_{t}({v0}, {v0}, 0x01); v_tmp0 = _mm256_shuffle_{t}({v0}, v_tmp0, {mask}); v_tmp1 = _mm256_permute2f128_{t}({v1}, {v1}, 0x01); v_tmp1 = _mm256_shuffle_{t}({v1}, v_tmp1, {mask}); v_res = _mm256_permute2f128_{t}(v_tmp0, v_tmp1, 0x20); {ret} = {v_res};''' if typ in ['f32', 'i32', 'u32']: v0 = '_mm256_castsi256_ps({in0})' \ if typ in ['i32', 'u32'] else '{in0}' v1 = '_mm256_castsi256_ps({in1})' \ if typ in ['i32', 'u32'] else '{in1}' v_res = '_mm256_castps_si256(v_res)' \ if typ in ['i32', 'u32'] else 'v_res' ret = 'ret' src = ret_template.format(mask='_MM_SHUFFLE(2, 0, 2, 0)' \ if func == 'unziplo' else '_MM_SHUFFLE(3, 1, 3, 
1)', v0=v0, v1=v1, v_res=v_res, ret=ret, t='ps', **fmtspec) return '''nsimd_{simd_ext}_v{typ} ret; __m256 v_res, v_tmp0, v_tmp1; {src} return ret;'''. \ format(src=src.format(**fmtspec), **fmtspec) elif typ == 'f16': src0 = ret_template.format(mask='_MM_SHUFFLE(2, 0, 2, 0)' \ if func == 'unziplo' else '_MM_SHUFFLE(3, 1, 3, 1)', v0='{in0}.v0', v1='{in0}.v1', v_res='v_res', ret='ret.v0', t='ps') src1 = ret_template.format(mask='_MM_SHUFFLE(2, 0, 2, 0)' \ if func == 'unziplo' else '_MM_SHUFFLE(3, 1, 3, 1)', v0='{in1}.v0', v1='{in1}.v1', v_res='v_res', ret='ret.v1', t='ps') return '''nsimd_{simd_ext}_v{typ} ret; __m256 v_res, v_tmp0, v_tmp1; {src0} {src1} return ret;'''.format(src0=src0.format(**fmtspec), src1=src1.format(**fmtspec), **fmtspec) elif typ in ['f64', 'i64', 'u64']: v0 = ('_mm256_castsi256_pd({in0})' \ if typ in ['i64', 'u64'] else '{in0}').format(**fmtspec) v1 = ('_mm256_castsi256_pd({in1})' \ if typ in ['i64', 'u64'] else '{in1}').format(**fmtspec) v_res = ('_mm256_castpd_si256(v_res)' \ if typ in ['i64', 'u64'] else 'v_res'). \ format(**fmtspec) src = ret_template.format(mask='0x00' if func == 'unziplo' \ else '0x03', v0=v0, v1=v1, ret='ret', v_res=v_res, t='pd') return '''nsimd_{simd_ext}_v{typ} ret; __m256d v_res, v_tmp0, v_tmp1; {src} return ret;'''.format(src=src.format(**fmtspec), **fmtspec) elif typ in ['i16', 'u16']: return \ '''__m128i v_tmp0_hi = {hi0}; __m128i v_tmp0_lo = {lo0}; __m128i v_tmp1_hi = {hi1}; __m128i v_tmp1_lo = {lo1}; v_tmp0_lo = nsimd_{func}_sse2_{typ}(v_tmp0_lo, v_tmp0_hi); v_tmp1_lo = nsimd_{func}_sse2_{typ}(v_tmp1_lo, v_tmp1_hi); return {merge};'''. 
\
                   format(hi0=extract(simd_ext, typ, HI, common.in0),
                          lo0=extract(simd_ext, typ, LO, common.in0),
                          hi1=extract(simd_ext, typ, HI, common.in1),
                          lo1=extract(simd_ext, typ, LO, common.in1),
                          merge=setr(simd_ext, typ, 'v_tmp0_lo',
                                     'v_tmp1_lo'),
                          func=func, **fmtspec)
        else:
            # No native shuffle for this type on AVX: fall back to the
            # emulation loop computed earlier by the caller.
            return loop
    else:
        # AVX-512: split into two 256-bit halves and delegate to the AVX2
        # implementation, then merge the two partial results.
        if typ == 'f16':
            return \
            '''nsimd_{simd_ext}_v{typ} ret;
               __m256 v_tmp0, v_tmp1, v_res_lo, v_res_hi;
               v_tmp0 = {loin0v0};
               v_tmp1 = {hiin0v0};
               v_res_lo = nsimd_{func}_avx2_f32(v_tmp0, v_tmp1);
               v_tmp0 = {loin0v1};
               v_tmp1 = {hiin0v1};
               v_res_hi = nsimd_{func}_avx2_f32(v_tmp0, v_tmp1);
               ret.v0 = {merge};
               v_tmp0 = {loin1v0};
               v_tmp1 = {hiin1v0};
               v_res_lo = nsimd_{func}_avx2_f32(v_tmp0, v_tmp1);
               v_tmp0 = {loin1v1};
               v_tmp1 = {hiin1v1};
               v_res_hi = nsimd_{func}_avx2_f32(v_tmp0, v_tmp1);
               ret.v1 = {merge};
               return ret;'''.format(
                   loin0v0=extract(simd_ext, 'f32', LO, common.in0 + '.v0'),
                   hiin0v0=extract(simd_ext, 'f32', HI, common.in0 + '.v0'),
                   loin0v1=extract(simd_ext, 'f32', LO, common.in0 + '.v1'),
                   hiin0v1=extract(simd_ext, 'f32', HI, common.in0 + '.v1'),
                   loin1v0=extract(simd_ext, 'f32', LO, common.in1 + '.v0'),
                   hiin1v0=extract(simd_ext, 'f32', HI, common.in1 + '.v0'),
                   loin1v1=extract(simd_ext, 'f32', LO, common.in1 + '.v1'),
                   hiin1v1=extract(simd_ext, 'f32', HI, common.in1 + '.v1'),
                   merge=setr(simd_ext, 'f32', 'v_res_lo', 'v_res_hi'),
                   func=func, **fmtspec)
        else:
            return '''nsimd_avx2_v{typ} v00 = {extract_lo0};
                      nsimd_avx2_v{typ} v01 = {extract_hi0};
                      nsimd_avx2_v{typ} v10 = {extract_lo1};
                      nsimd_avx2_v{typ} v11 = {extract_hi1};
                      v00 = nsimd_{func}_avx2_{typ}(v00, v01);
                      v01 = nsimd_{func}_avx2_{typ}(v10, v11);
                      return {merge};'''.format(
                      func=func,
                      extract_lo0=extract(simd_ext, typ, LO, common.in0),
                      extract_lo1=extract(simd_ext, typ, LO, common.in1),
                      extract_hi0=extract(simd_ext, typ, HI, common.in0),
                      extract_hi1=extract(simd_ext, typ, HI, common.in1),
                      merge=setr(simd_ext, typ, 'v00', 'v01'), **fmtspec)

# Full unzip: v0 receives the even-indexed lanes of (in0, in1), v1 the
# odd-indexed ones, via the already-generated unziplo/unziphi operators.
def unzip(simd_ext, typ):
    return '''nsimd_{simd_ext}_v{typ}x2 ret;
              ret.v0 = nsimd_unziplo_{simd_ext}_{typ}({in0}, {in1});
              ret.v1 = nsimd_unziphi_{simd_ext}_{typ}({in0}, {in1});
              return ret;'''.format(**fmtspec)

# -----------------------------------------------------------------------------
# mask_for_loop_tail

def mask_for_loop_tail(simd_ext, typ):
    # Emits C returning a logical vector that is true for lanes i with
    # in0 + i < in1 (all-false / all-true fast paths handled first).
    if typ == 'f16':
        fill_n = '''n.v0 = {pre}set1_ps((f32)({in1} - {in0}));
                    n.v1 = n.v0;'''.format(**fmtspec)
    else:
        fill_n = 'n = nsimd_set1_{simd_ext}_{typ}(({typ})({in1} - {in0}));'. \
                 format(**fmtspec)
    return '''if ({in0} >= {in1}) {{
                return nsimd_set1l_{simd_ext}_{typ}(0);
              }}
              if ({in1} - {in0} < {le}) {{
                nsimd_{simd_ext}_v{typ} n;
                {fill_n}
                return nsimd_lt_{simd_ext}_{typ}(
                           nsimd_iota_{simd_ext}_{typ}(), n);
              }} else {{
                return nsimd_set1l_{simd_ext}_{typ}(1);
              }}'''.format(fill_n=fill_n, **fmtspec)

# -----------------------------------------------------------------------------
# iota

def iota(simd_ext, typ):
    # Emits C building a vector whose lane i holds (typ)i; f16 is emulated
    # through a f32 buffer loaded in two halves.
    typ2 = 'f32' if typ == 'f16' else typ
    iota = ', '.join(['({typ2}){i}'.format(typ2=typ2, i=i) \
                      for i in range(int(fmtspec['le']))])
    if typ == 'f16':
        return '''f32 buf[{le}] = {{ {iota} }};
                  nsimd_{simd_ext}_vf16 ret;
                  ret.v0 = {pre}loadu_ps(buf);
                  ret.v1 = {pre}loadu_ps(buf + {le2});
                  return ret;'''. \
                  format(iota=iota, le2=fmtspec['le'] // 2, **fmtspec)
    return '''{typ} buf[{le}] = {{ {iota} }};
              return {pre}loadu{sufsi}({cast}buf);'''. \
              format(iota=iota, cast='(__m{nbits}i*)'.format(**fmtspec) \
                                if typ in common.iutypes else '', **fmtspec)

# -----------------------------------------------------------------------------
# scatter

def scatter(simd_ext, typ):
    # Indexed scatter. f16 and narrow/old-ISA types go through a buffer +
    # scalar loop; 32/64-bit types on AVX-512 use the native intrinsic.
    if typ == 'f16':
        return '''int i;
                  f32 buf[{le}];
                  i16 offset_buf[{le}];
                  {pre}storeu_si{nbits}((__m{nbits}i *)offset_buf, {in1});
                  {pre}storeu_ps(buf, {in2}.v0);
                  {pre}storeu_ps(buf + {leo2}, {in2}.v1);
                  for (i = 0; i < {le}; i++) {{
                    {in0}[offset_buf[i]] = nsimd_f32_to_f16(buf[i]);
                  }}'''.format(leo2=int(fmtspec['le']) // 2, **fmtspec)
    if simd_ext in (sse + avx) or typ in ['i8', 'u8', 'i16', 'u16']:
        cast = castsi(simd_ext, typ)
        return '''int i;
                  {typ} buf[{le}];
                  {ityp} offset_buf[{le}];
                  {pre}storeu_si{nbits}((__m{nbits}i *)offset_buf, {in1});
                  {pre}storeu{sufsi}({cast}buf, {in2});
                  for (i = 0; i < {le}; i++) {{
                    {in0}[offset_buf[i]] = buf[i];
                  }}'''.format(ityp='i' + typ[1:], cast=cast, **fmtspec)
    # getting here means 32 and 64-bits types for avx512
    return '''{pre}i{typnbits}scatter{suf}(
                  (void *){in0}, {in1}, {in2}, {scale});'''. \
                  format(scale=int(typ[1:]) // 8, **fmtspec)

# -----------------------------------------------------------------------------
# linear scatter

def scatter_linear(simd_ext, typ):
    # Scatter with constant stride in1. AVX-512 delegates to two AVX2 calls;
    # other ISAs use a buffer loop (32-bit hosts) or a per-lane extract trick.
    if typ == 'f16':
        return '''int i;
                  f32 buf[{le}];
                  {pre}storeu_ps(buf, {in2}.v0);
                  {pre}storeu_ps(buf + {leo2}, {in2}.v1);
                  for (i = 0; i < {le}; i++) {{
                    {in0}[i * {in1}] = nsimd_f32_to_f16(buf[i]);
                  }}'''.format(leo2=int(fmtspec['le']) // 2, **fmtspec)
    if simd_ext in avx512:
        return '''nsimd_scatter_linear_avx2_{typ}({in0}, {in1}, {lo});
                  nsimd_scatter_linear_avx2_{typ}({in0} + ({leo2} * {in1}),
                                                  {in1}, {hi});'''. \
                  format(leo2=int(fmtspec['le']) // 2,
                         lo=extract(simd_ext, typ, LO, fmtspec['in2']),
                         hi=extract(simd_ext, typ, HI, fmtspec['in2']),
                         **fmtspec)
    emulation = '''int i;
                   {typ} buf[{le}];
                   {pre}storeu{sufsi}({cast}buf, {in2});
                   for (i = 0; i < {le}; i++) {{
                     {in0}[i * {in1}] = buf[i];
                   }}'''.format(cast=castsi(simd_ext, typ), **fmtspec)
    if (simd_ext == 'sse2' and typ in ['i16', 'u16']) or \
       (simd_ext == 'avx' and \
        typ in ['i32', 'u32', 'f32', 'i64', 'u64', 'f64']) or \
       (simd_ext in ['sse42', 'avx2']):
        # On 64-bit hosts emit one extract-lane store per element instead of
        # spilling the whole vector to a buffer.
        trick = '\n'.join([
            '{in0}[{i} * {in1}] = {get_lane};'.format(i=i,
                get_lane=get_lane(simd_ext, typ,
                                  '{in2}'.format(**fmtspec), i),
                **fmtspec) for i in range(int(fmtspec['le']))])
        return '''#if NSIMD_WORD_SIZE == 32
                  {}
                  #else
                  {}
                  #endif'''.format(emulation, trick)
    else:
        return emulation

# -----------------------------------------------------------------------------
# mask_scatter

def mask_scatter(simd_ext, typ):
    # Masked indexed scatter: only lanes whose mask lane is non-zero are
    # written. AVX-512 logicals are bitmasks, hence the (mask >> i) & 1 test.
    if typ == 'f16':
        le2 = fmtspec['le'] // 2
        if simd_ext in sse + avx:
            store_mask = '''{pre}storeu_ps(mask, {in0}.v0);
                            {pre}storeu_ps(mask + {le2}, {in0}.v1);'''. \
                            format(le2=le2, **fmtspec)
        else:
            # AVX-512 f16 logicals are __mmask16's: expand them to a f32
            # buffer of 0.0/1.0 before the scalar loop.
            store_mask = '''_mm512_storeu_ps(mask, _mm512_maskz_mov_ps(
                              {in0}.v0, _mm512_set1_ps(1.0f)));
                            _mm512_storeu_ps(mask + {le2}, _mm512_maskz_mov_ps(
                              {in0}.v1, _mm512_set1_ps(1.0f)));'''. \
                            format(le2=le2, **fmtspec)
        return '''int i;
                  f32 mask[{le}], buf[{le}];
                  i16 offset_buf[{le}];
                  {store_mask}
                  {pre}storeu_si{nbits}((__m{nbits}i *)offset_buf, {in2});
                  {pre}storeu_ps(buf, {in3}.v0);
                  {pre}storeu_ps(buf + {le2}, {in3}.v1);
                  for (i = 0; i < {le}; i++) {{
                    if (nsimd_scalar_reinterpret_u32_f32(mask[i]) != (u32)0) {{
                      {in1}[offset_buf[i]] = nsimd_f32_to_f16(buf[i]);
                    }}
                  }}'''.format(le2=le2, store_mask=store_mask, **fmtspec)
    if simd_ext in (sse + avx) or typ in ['i8', 'u8', 'i16', 'u16']:
        cast = castsi(simd_ext, typ)
        if simd_ext in avx512:
            mask_decl = 'u64 mask;'
            store_mask = 'mask = (u64){in0};'.format(**fmtspec)
            cond = '(mask >> i) & 1'
        else:
            mask_decl = '{typ} mask[{le}];'.format(**fmtspec)
            store_mask = '{pre}storeu{sufsi}({cast}mask, {in0});'. \
                         format(cast=cast, **fmtspec)
            cond = 'nsimd_scalar_reinterpret_{utyp}_{typ}(mask[i]) != '\
                   '({utyp})0'.format(utyp='u' + typ[1:], **fmtspec)
        return '''int i;
                  {typ} buf[{le}];
                  {mask_decl}
                  {ityp} offset_buf[{le}];
                  {store_mask}
                  {pre}storeu_si{nbits}((__m{nbits}i *)offset_buf, {in2});
                  {pre}storeu{sufsi}({cast}buf, {in3});
                  for (i = 0; i < {le}; i++) {{
                    if ({cond}) {{
                      {in1}[offset_buf[i]] = buf[i];
                    }}
                  }}'''.format(ityp='i' + typ[1:], cast=cast, cond=cond,
                               mask_decl=mask_decl, store_mask=store_mask,
                               **fmtspec)
    # getting here means 32 and 64-bits types for avx512
    return '''{pre}mask_i{typnbits}scatter{suf}(
                  (void *){in1}, {in0}, {in2}, {in3}, {scale});'''. \
                  format(scale=int(typ[1:]) // 8, **fmtspec)

# -----------------------------------------------------------------------------
# gather

def gather(simd_ext, typ):
    # Indexed gather; native gather intrinsics exist only for 32/64-bit
    # types on AVX2/AVX-512 (note the argument-order difference between
    # the two intrinsic families).
    if typ == 'f16':
        return '''nsimd_{simd_ext}_vf16 ret;
                  int i;
                  f32 buf[{le}];
                  i16 offset_buf[{le}];
                  {pre}storeu_si{nbits}((__m{nbits}i *)offset_buf, {in1});
                  for (i = 0; i < {le}; i++) {{
                    buf[i] = nsimd_f16_to_f32({in0}[offset_buf[i]]);
                  }}
                  ret.v0 = {pre}loadu_ps(buf);
                  ret.v1 = {pre}loadu_ps(buf + {leo2});
                  return ret;'''.format(leo2=int(fmtspec['le']) // 2,
                                        **fmtspec)
    if simd_ext in (sse + ['avx']) or typ in ['i8', 'u8', 'i16', 'u16']:
        cast = castsi(simd_ext, typ)
        return '''int i;
                  {typ} buf[{le}];
                  {ityp} offset_buf[{le}];
                  {pre}storeu_si{nbits}((__m{nbits}i *)offset_buf, {in1});
                  for (i = 0; i < {le}; i++) {{
                    buf[i] = {in0}[offset_buf[i]];
                  }}
                  return {pre}loadu{sufsi}({cast}buf);'''. \
                  format(ityp='i' + typ[1:], cast=cast, **fmtspec)
    # getting here means 32 and 64-bits types for avx2 and avx512
    if simd_ext == 'avx2':
        if typ in ['i64', 'u64']:
            cast = '(nsimd_longlong *)'
        elif typ in ['i32', 'u32']:
            cast = '(int *)'
        else:
            cast = '({typ} *)'.format(**fmtspec)
        return '''return {pre}i{typnbits}gather{suf}(
                      {cast}{in0}, {in1}, {scale});'''. \
                      format(scale=int(typ[1:]) // 8, cast=cast, **fmtspec)
    elif simd_ext in avx512:
        return 'return {pre}i{typnbits}gather{suf}({in1}, ' \
               '(const void *){in0}, {scale});'. \
               format(scale=int(typ[1:]) // 8, **fmtspec)

# -----------------------------------------------------------------------------
# linear gather

def gather_linear(simd_ext, typ):
    # Gather with constant stride in1; mirrors scatter_linear's strategy.
    le = int(fmtspec['le'])
    cast = castsi(simd_ext, typ)
    if typ == 'f16':
        return '''nsimd_{simd_ext}_vf16 ret;
                  f32 buf[{le}];
                  int i;
                  for (i = 0; i < {le}; i++) {{
                    buf[i] = nsimd_f16_to_f32({in0}[i * {in1}]);
                  }}
                  ret.v0 = {pre}loadu_ps(buf);
                  ret.v1 = {pre}loadu_ps(buf + {leo2});
                  return ret;'''.format(leo2=le // 2, **fmtspec)
    emulation = '''{typ} buf[{le}];
                   int i;
                   for (i = 0; i < {le}; i++) {{
                     buf[i] = {in0}[i * {in1}];
                   }}
                   return {pre}loadu{sufsi}({cast}buf);'''. \
                   format(cast=cast, **fmtspec)
    if simd_ext == 'sse2' and typ not in ['i16', 'u16']:
        return emulation
    if simd_ext in sse + avx:
        trick = \
        '''nsimd_{simd_ext}_v{typ} ret;
           ret = {pre}undefined{sufsi}();
           '''.format(**fmtspec) + ''.join([
               set_lane(simd_ext, typ, 'ret', '{in0}[{i} * {in1}]'. \
                        format(i=i, **fmtspec), i) + '\n' \
                        for i in range(le)]) + \
        '''return ret;'''
        return '''#if NSIMD_WORD_SIZE == 32
                  {}
                  #else
                  {}
                  #endif
                  '''.format(emulation, trick)
    # getting here means AVX-512
    # NOTE(review): lo/hi are overwritten immediately below, so the
    # _mm256_undefined initializers are dead stores in the emitted C.
    return \
    '''nsimd_avx2_v{typ} lo = _mm256_undefined{sufsi2}();
       nsimd_avx2_v{typ} hi = _mm256_undefined{sufsi2}();
       lo = nsimd_gather_linear_avx2_{typ}({in0}, {in1});
       hi = nsimd_gather_linear_avx2_{typ}({in0} + ({leo2} * {in1}), {in1});
       return {merge};'''.format(merge=setr(simd_ext, typ, 'lo', 'hi'),
                                 sufsi2=suf_si('avx2', typ), leo2=le // 2,
                                 **fmtspec)

# -----------------------------------------------------------------------------
# maksed gather

def maskoz_gather(oz, simd_ext, typ):
    # Masked gather; oz == 'z' zeroes inactive lanes, oz == 'o' takes them
    # from in3. Native masked gathers are used for 32/64-bit types on
    # AVX2/AVX-512.
    if typ == 'f16':
        le2 = fmtspec['le'] // 2
        if simd_ext in sse + avx:
            store_mask = '''{pre}storeu_ps(mask, {in0}.v0);
                            {pre}storeu_ps(mask + {le2}, {in0}.v1);'''. \
                            format(le2=le2, **fmtspec)
        else:
            store_mask = '''_mm512_storeu_ps(mask, _mm512_maskz_mov_ps(
                              {in0}.v0, _mm512_set1_ps(1.0f)));
                            _mm512_storeu_ps(mask + {le2}, _mm512_maskz_mov_ps(
                              {in0}.v1, _mm512_set1_ps(1.0f)));'''. \
                            format(le2=le2, **fmtspec)
        if oz == 'z':
            store_oz = '''{pre}storeu_ps(buf, {pre}setzero_ps());
                          {pre}storeu_ps(buf + {le2}, {pre}setzero_ps());'''. \
                          format(le2=le2, **fmtspec)
        else:
            store_oz = '''{pre}storeu_ps(buf, {in3}.v0);
                          {pre}storeu_ps(buf + {le2}, {in3}.v1);'''. \
                          format(le2=le2, **fmtspec)
        return '''nsimd_{simd_ext}_vf16 ret;
                  int i;
                  f32 buf[{le}], mask[{le}];
                  i16 offset_buf[{le}];
                  {store_mask}
                  {store_oz}
                  {pre}storeu_si{nbits}((__m{nbits}i *)offset_buf, {in2});
                  for (i = 0; i < {le}; i++) {{
                    if (nsimd_scalar_reinterpret_u32_f32(mask[i]) != (u32)0) {{
                      buf[i] = nsimd_f16_to_f32({in1}[offset_buf[i]]);
                    }}
                  }}
                  ret.v0 = {pre}loadu_ps(buf);
                  ret.v1 = {pre}loadu_ps(buf + {leo2});
                  return ret;'''.format(leo2=le2, store_mask=store_mask,
                                        store_oz=store_oz, **fmtspec)
    if simd_ext in (sse + ['avx']) or typ in ['i8', 'u8', 'i16', 'u16']:
        cast = castsi(simd_ext, typ)
        if simd_ext in sse + avx:
            mask_decl = '{typ} mask[{le}];'.format(**fmtspec)
            store_mask = '{pre}storeu{sufsi}({cast}mask, {in0});'. \
                         format(cast=cast, **fmtspec)
            if typ in common.iutypes:
                comp = 'mask[i]'
            else:
                comp = 'nsimd_scalar_reinterpret_u{typnbits}_{typ}(mask[i])'. \
                       format(**fmtspec)
        else:
            mask_decl = 'u64 mask;'
            store_mask = 'mask = (u64){in0};'.format(**fmtspec)
            comp = '(mask >> i) & 1'
        if oz == 'z':
            store_oz = '''{pre}storeu{sufsi}({cast}buf,
                                             {pre}setzero{sufsi}());'''. \
                          format(cast=cast, **fmtspec)
        else:
            store_oz = '{pre}storeu{sufsi}({cast}buf, {in3});'. \
                       format(cast=cast, **fmtspec)
        return '''int i;
                  {typ} buf[{le}];
                  {mask_decl}
                  {ityp} offset_buf[{le}];
                  {store_mask}
                  {store_oz}
                  {pre}storeu_si{nbits}((__m{nbits}i *)offset_buf, {in2});
                  for (i = 0; i < {le}; i++) {{
                    if ({comp}) {{
                      buf[i] = {in1}[offset_buf[i]];
                    }}
                  }}
                  return {pre}loadu{sufsi}({cast}buf);'''. \
                  format(ityp='i' + typ[1:], cast=cast, store_mask=store_mask,
                         store_oz=store_oz, comp=comp, mask_decl=mask_decl,
                         **fmtspec)
    # getting here means 32 and 64-bits types for avx2 and avx512
    if oz == 'o':
        src = '{in3}'.format(**fmtspec)
    else:
        src = '{pre}setzero{sufsi}()'.format(**fmtspec)
    if simd_ext == 'avx2':
        if typ in ['i64', 'u64']:
            cast = '(nsimd_longlong *)'
        elif typ in ['i32', 'u32']:
            cast = '(int *)'
        else:
            cast = '({typ} *)'.format(**fmtspec)
        return '''return {pre}mask_i{typnbits}gather{suf}({src},
                             {cast}{in1}, {in2}, {in0}, {scale});'''. \
                  format(scale=int(typ[1:]) // 8, cast=cast, src=src,
                         **fmtspec)
    elif simd_ext in avx512:
        return 'return {pre}mask_i{typnbits}gather{suf}({src}, {in0}, ' \
               '{in2}, (const void *){in1}, {scale});'. \
               format(src=src, scale=int(typ[1:]) // 8, **fmtspec)

# -----------------------------------------------------------------------------
# get_impl function

def get_impl(opts, func, simd_ext, from_typ, to_typ):
    # Entry point of this backend: fills the module-level fmtspec with the
    # substitution values every generator above reads, then dispatches on
    # the operator name. fmtspec MUST be set before any impls lambda runs.
    global fmtspec

    fmtspec = {
      'simd_ext': simd_ext,
      'typ': from_typ,
      'styp': get_native_typ(simd_ext, from_typ),
      'from_typ': from_typ,
      'to_typ': to_typ,
      'pre': pre(simd_ext),
      'suf': suf_ep(from_typ),
      'sufsi': suf_si(simd_ext, from_typ),
      'in0': common.in0,
      'in1': common.in1,
      'in2': common.in2,
      'in3': common.in3,
      'in4': common.in4,
      'in5': common.in5,
      'nbits': nbits(simd_ext),
      'le': int(nbits(simd_ext)) // int(from_typ[1:]),
      'typnbits': from_typ[1:]
    }

    impls = {
        'loada': lambda: load(simd_ext, from_typ, True),
        'masko_loada1': lambda: maskoz_load(simd_ext, from_typ, 'o', True),
        'maskz_loada1': lambda: maskoz_load(simd_ext, from_typ, 'z', True),
        'load2a': lambda: load_deg234(simd_ext, from_typ, True, 2),
        'load3a': lambda: load_deg234(simd_ext, from_typ, True, 3),
        'load4a': lambda: load_deg234(simd_ext, from_typ, True, 4),
        'loadu': lambda: load(simd_ext, from_typ, False),
        'masko_loadu1': lambda: maskoz_load(simd_ext, from_typ, 'o', False),
        'maskz_loadu1': lambda: maskoz_load(simd_ext, from_typ, 'z', False),
        'load2u': lambda: load_deg234(simd_ext, from_typ, False, 2),
        'load3u': lambda: load_deg234(simd_ext, from_typ, False, 3),
        'load4u': lambda: load_deg234(simd_ext, from_typ, False, 4),
        'storea': lambda: store(simd_ext, from_typ, True),
        'mask_storea1': lambda: mask_store(simd_ext, from_typ, True),
        'store2a': lambda: store_deg234(simd_ext, from_typ, True, 2),
        'store3a': lambda: store_deg234(simd_ext, from_typ, True, 3),
        'store4a': lambda: store_deg234(simd_ext, from_typ, True, 4),
        'storeu': lambda: store(simd_ext, from_typ, False),
        'mask_storeu1': lambda: mask_store(simd_ext, from_typ, False),
        'store2u': lambda: store_deg234(simd_ext, from_typ, False, 2),
        'store3u': lambda: store_deg234(simd_ext, from_typ, False, 3),
        'store4u': lambda: store_deg234(simd_ext, from_typ, False, 4),
        'gather': lambda: gather(simd_ext, from_typ),
        'gather_linear': lambda: gather_linear(simd_ext, from_typ),
        'masko_gather': lambda: maskoz_gather('o', simd_ext, from_typ),
        'maskz_gather': lambda: maskoz_gather('z', simd_ext, from_typ),
        'scatter': lambda: scatter(simd_ext, from_typ),
        'scatter_linear': lambda: scatter_linear(simd_ext, from_typ),
        'mask_scatter': lambda: mask_scatter(simd_ext, from_typ),
        'andb': lambda: binop2('andb', simd_ext, from_typ),
        'xorb': lambda: binop2('xorb', simd_ext, from_typ),
        'orb': lambda: binop2('orb', simd_ext, from_typ),
        'andl': lambda: binlop2('andl', simd_ext, from_typ),
        'xorl': lambda: binlop2('xorl', simd_ext, from_typ),
        'orl': lambda: binlop2('orl', simd_ext, from_typ),
        'notb': lambda: not1(simd_ext, from_typ),
        'notl': lambda: lnot1(simd_ext, from_typ),
        'andnotb': lambda: andnot2(simd_ext, from_typ),
        'andnotl': lambda: landnot2(simd_ext, from_typ),
        'add': lambda: addsub('add', simd_ext, from_typ),
        'sub': lambda: addsub('sub', simd_ext, from_typ),
        'adds': lambda: adds(simd_ext, from_typ),
        'subs': lambda: subs(simd_ext, from_typ),
        'div': lambda: div2(opts, simd_ext, from_typ),
        'sqrt': lambda: sqrt1(simd_ext, from_typ),
        'len': lambda: len1(simd_ext, from_typ),
        'mul': lambda: mul2(opts, simd_ext, from_typ),
        'shl': lambda: shl_shr('shl', simd_ext, from_typ),
        'shr': lambda: shl_shr('shr', simd_ext, from_typ),
        'shra': lambda: shra(opts, simd_ext, from_typ),
        'set1': lambda: set1(simd_ext, from_typ),
        'set1l': lambda: set1l(simd_ext, from_typ),
        'eq': lambda: eq2(simd_ext, from_typ),
        'ne': lambda: neq2(simd_ext, from_typ),
        'gt': lambda: gt2(simd_ext, from_typ),
        'lt': lambda: lt2(simd_ext, from_typ),
        'ge': lambda: geq2(simd_ext, from_typ),
        'le': lambda: leq2(simd_ext, from_typ),
        'if_else1': lambda: if_else1(simd_ext, from_typ),
        'min': lambda: minmax('min', simd_ext, from_typ),
        'max': lambda: minmax('max', simd_ext, from_typ),
        'loadla': lambda: loadl(simd_ext, from_typ, True),
        'loadlu': lambda: loadl(simd_ext, from_typ, False),
        'storela': lambda: storel(simd_ext, from_typ, True),
        'storelu': lambda: storel(simd_ext, from_typ, False),
        'abs': lambda: abs1(simd_ext, from_typ),
        'fma': lambda: fma_fms('fma', simd_ext, from_typ),
        'fnma': lambda: fma_fms('fnma', simd_ext, from_typ),
        'fms': lambda: fma_fms('fms', simd_ext, from_typ),
        'fnms': lambda: fma_fms('fnms', simd_ext, from_typ),
        'ceil': lambda: round1(opts, 'ceil', simd_ext, from_typ),
        'floor': lambda: round1(opts, 'floor', simd_ext, from_typ),
        'trunc': lambda: trunc1(opts, simd_ext, from_typ),
        'round_to_even': lambda: round_to_even1(opts, simd_ext, from_typ),
        'all': lambda: all_any('all', simd_ext, from_typ),
        'any': lambda: all_any('any', simd_ext, from_typ),
        'reinterpret': lambda: reinterpret1(simd_ext, from_typ, to_typ),
        'reinterpretl': lambda: reinterpretl1(simd_ext, from_typ, to_typ),
        'cvt': lambda: convert1(simd_ext, from_typ, to_typ),
        'rec11': lambda: rec11_rsqrt11('rcp', simd_ext, from_typ),
        'rec8': lambda: rec11_rsqrt11('rcp', simd_ext, from_typ),
        'rsqrt11': lambda: rec11_rsqrt11('rsqrt', simd_ext, from_typ),
        'rsqrt8': lambda: rec11_rsqrt11('rsqrt', simd_ext, from_typ),
        'rec': lambda: rec1(simd_ext, from_typ),
        'neg': lambda: neg1(simd_ext, from_typ),
        'nbtrue': lambda: nbtrue1(simd_ext, from_typ),
        'reverse': lambda: reverse1(simd_ext, from_typ),
        'addv': lambda: addv(simd_ext, from_typ),
        'upcvt': lambda: upcvt1(simd_ext, from_typ, to_typ),
        'downcvt': lambda: downcvt1(opts, simd_ext, from_typ, to_typ),
        'to_mask': lambda: to_mask1(simd_ext, from_typ),
        'to_logical': lambda: to_logical1(simd_ext, from_typ),
        'ziplo': lambda: zip_half('ziplo', simd_ext, from_typ),
        'ziphi': lambda: zip_half('ziphi', simd_ext, from_typ),
        'unziplo': lambda: unzip_half(opts, 'unziplo', simd_ext, from_typ),
        'unziphi': lambda: unzip_half(opts, 'unziphi', simd_ext, from_typ),
        'zip' : lambda : zip(simd_ext, from_typ),
        'unzip' : lambda : unzip(simd_ext, from_typ),
        'mask_for_loop_tail': lambda : mask_for_loop_tail(simd_ext, from_typ),
        'iota': lambda : iota(simd_ext, from_typ)
    }
    if simd_ext not in get_simd_exts():
        raise ValueError('Unknown SIMD extension "{}"'.format(simd_ext))
    if not from_typ in common.types:
        raise ValueError('Unknown type "{}"'.format(from_typ))
    if not func in impls:
        return common.NOT_IMPLEMENTED
    else:
        return impls[func]()



================================================
FILE: egg/rocm.py
================================================
# Copyright (c) 2020 Agenium Scale
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

import cuda

# -----------------------------------------------------------------------------

def get_impl(operator, totyp, typ):
    # ROCm scalar implementations are identical to the CUDA ones: delegate.
    return cuda.get_impl(operator, totyp, typ)



================================================
FILE: egg/scalar.py
================================================
# Copyright (c) 2020 Agenium Scale
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

import common

# Substitution values shared by all generators below; filled by get_impl().
fmtspec = dict()

# -----------------------------------------------------------------------------

def opnum(func, typ):
    # Numeric operator: cast the result back to typ; f16 goes through f32
    # conversions when native ARM FP16 arithmetic is unavailable.
    normal = 'return ({typ})({func});'. \
             format(func=func.format(**fmtspec), **fmtspec)
    if typ == 'f16':
        return \
        '''#ifdef NSIMD_ARM_FP16
             {normal}
           #else
             return nsimd_f32_to_f16({func});
           #endif'''.format(normal=normal, func=func. \
           format(in0='nsimd_f16_to_f32({in0})',
                  in1='nsimd_f16_to_f32({in1})',
                  in2='nsimd_f16_to_f32({in2})').format(**fmtspec))
    else:
        return normal

# -----------------------------------------------------------------------------

def cmp(func, typ):
    # Comparison operator: no cast on the result (it is a C boolean-ish int);
    # f16 compares the f32-converted operands when native FP16 is unavailable.
    normal = 'return ({func});'. \
             format(func=func.format(**fmtspec), **fmtspec)
    if typ == 'f16':
        return \
        '''#ifdef NSIMD_ARM_FP16
             {normal}
           #else
             return ({func});
           #endif'''.format(normal=normal, func=func. \
           format(in0='nsimd_f16_to_f32({in0})',
                  in1='nsimd_f16_to_f32({in1})',
                  in2='nsimd_f16_to_f32({in2})').format(**fmtspec))
    else:
        return normal

# -----------------------------------------------------------------------------

def opbit(func, typ):
    # Bitwise operator: work on the unsigned reinterpretation of the operands
    # and reinterpret the result back to typ for non-unsigned types.
    in0 = '{in0}'.format(**fmtspec) if typ in common.utypes else \
          'nsimd_scalar_reinterpret_u{typnbits}_{typ}({in0})'.format(**fmtspec)
    in1 = '{in1}'.format(**fmtspec) if typ in common.utypes else \
          'nsimd_scalar_reinterpret_u{typnbits}_{typ}({in1})'.format(**fmtspec)
    if typ in common.utypes:
        return 'return ({typ})({func});'. \
               format(func=func.format(in0=in0, in1=in1), **fmtspec)
    else:
        return '''return nsimd_scalar_reinterpret_{typ}_u{typnbits}(
                      (u{typnbits})({func}));'''.format(
                      func=func.format(in0=in0, in1=in1), **fmtspec)

# -----------------------------------------------------------------------------

def shift(func, typ):
    # Shifts: logical shifts on signed types go through the unsigned
    # reinterpretation to avoid implementation-defined signed >>; shra
    # re-creates the sign-extension mask by hand.
    if func == 'shl':
        return 'return ({typ})({in0} << {in1});'.format(**fmtspec)
    # getting here means shr or shra
    if typ in common.utypes:
        return 'return ({typ})({in0} >> {in1});'.format(**fmtspec)
    # getting here means shr or shra on signed type
    utyp = common.bitfield_type[typ]
    if func == 'shr':
        return '''return nsimd_scalar_reinterpret_{typ}_{utyp}(
                      ({utyp})(nsimd_scalar_reinterpret_{utyp}_{typ}(
                          {in0}) >> {in1}));'''.format(utyp=utyp, **fmtspec)
    # getting here means shra on signed type
    return \
    '''if ({in1} == 0) {{
         return {in0};
       }}
       if ({in0} >= 0) {{
         return nsimd_scalar_reinterpret_{typ}_{utyp}(({utyp})(
                    nsimd_scalar_reinterpret_{utyp}_{typ}({in0}) >> {in1}));
       }} else {{
         {utyp} mask = ({utyp})((({utyp})-1) << ({typnbits} - {in1}));
         return nsimd_scalar_reinterpret_{typ}_{utyp}(({utyp})(mask |
                    ({utyp})(nsimd_scalar_reinterpret_{utyp}_{typ}(
                        {in0}) >> {in1})));
       }}'''.format(utyp=utyp, **fmtspec)

# -----------------------------------------------------------------------------

def libm_opn(func, arity, typ, until_cpp11, c89_code):
    # Emits a call to a libm function, choosing between the C99/POSIX entry
    # point and a C89 fallback at C preprocessing time; f16/f32 variants add
    # the necessary conversions. c89_code == '' means "derive the fallback
    # from the f64 libm function".
    cxx_version = '> 0' if not until_cpp11 else '>= 2011'
    comment = \
    '''/* {func} is not available in C89 but is given by POSIX 2001 */
       /* and C99. But we do not want to pollute the user includes */
       /* and POSIX value if set so we play dirty. */'''. \
       format(func=func)
    args = ', '.join(['{{in{}}}'.format(i).format(**fmtspec) \
                      for i in range(arity)])
    args_f16 = ', '.join(['nsimd_f16_to_f32({{in{}}})'.format(i). \
                          format(**fmtspec) for i in range(arity)])
    args_f64 = ', '.join(['(f64){{in{}}}'.format(i).format(**fmtspec) \
                          for i in range(arity)])
    args_f64_f16 = ', '.join(['(f64)nsimd_f16_to_f32({{in{}}})'.format(i). \
                              format(**fmtspec) for i in range(arity)])
    if typ == 'f16':
        c99_code = 'return nsimd_f32_to_f16({}f({}));'.format(func, args_f16)
        if c89_code == '':
            c89_code = 'return nsimd_f32_to_f16((f32){}({}));'. \
                       format(func, args_f64_f16)
        return \
        '''
           {comment}
           #if defined(NSIMD_IS_MSVC) && _MSC_VER <= 1800 /* VS 2012 */
             {c89_code}
           #else
             #if NSIMD_CXX {cxx_version} || NSIMD_C >= 1999 || \
                 _POSIX_C_SOURCE >= 200112L
               {c99_code}
             #else
               {c89_code}
             #endif
           #endif'''. \
           format(comment=comment, cxx_version=cxx_version,
                  c89_code=c89_code, c99_code=c99_code)
    elif typ == 'f32':
        c99_code = 'return {}f({});'.format(func, args)
        if c89_code == '':
            c89_code = 'return (f32){}({});'.format(func, args_f64)
        return \
        '''
           {comment}
           #if defined(NSIMD_IS_MSVC) && _MSC_VER <= 1800 /* VS 2012 */
             {c89_code}
           #else
             #if NSIMD_CXX {cxx_version} || NSIMD_C >= 1999 || \
                 _POSIX_C_SOURCE >= 200112L
               {c99_code}
             #else
               {c89_code}
             #endif
           #endif'''. \
           format(comment=comment, cxx_version=cxx_version,
                  c89_code=c89_code, c99_code=c99_code)
    else:
        normal = 'return {}({});'.format(func, args)
        if c89_code == '':
            return normal
        return \
        '''
           {comment}
           #if NSIMD_CXX {cxx_version} || NSIMD_C >= 1999 || \
               _POSIX_C_SOURCE >= 200112L
             {normal}
           #else
             {c89_code}
           #endif'''. \
           format(comment=comment, normal=normal, c89_code=c89_code,
                  cxx_version=cxx_version)

# -----------------------------------------------------------------------------

def round_to_even(typ):
    # Banker's rounding built from floor/ceil: compare the two distances and
    # break ties towards the even integer (fl is even iff fl/2 is integral).
    if typ in ['f32', 'f64']:
        return \
        '''{typ} fl = nsimd_scalar_floor_{typ}({in0});
           {typ} ce = nsimd_scalar_ceil_{typ}({in0});
           {typ} df = {in0} - fl; /* exactly representable in IEEE754 */
           {typ} dc = ce - {in0}; /* exactly representable in IEEE754 */
           if (df < dc) {{
             return fl;
           }} else if (df > dc) {{
             return ce;
           }} else {{
             {typ} fld2 = fl * 0.5{f}; /* exactly representable in IEEE754 */
             if (fld2 == nsimd_scalar_floor_{typ}(fld2)) {{
               return fl;
             }} else {{
               return ce;
             }}
           }}'''.format(f='f' if typ == 'f32' else '', **fmtspec)
    elif typ == 'f16':
        return \
        '''f32 in0 = nsimd_f16_to_f32({in0});
           f32 fl = nsimd_scalar_floor_f32(in0);
           f32 ce = nsimd_scalar_ceil_f32(in0);
           f32 df = in0 - fl; /* exactly representable in IEEE754 */
           f32 dc = ce - in0; /* exactly representable in IEEE754 */
           if (df < dc) {{
             return nsimd_f32_to_f16(fl);
           }} else if (df > dc) {{
             return nsimd_f32_to_f16(ce);
           }} else {{
             f32 fld2 = fl * 0.5f; /* exactly representable in IEEE754 */
             if (fld2 == nsimd_scalar_floor_f32(fld2)) {{
               return nsimd_f32_to_f16(fl);
             }} else {{
               return nsimd_f32_to_f16(ce);
             }}
           }}'''.format(**fmtspec)
    else:
        # integers are already "rounded"
        return 'return {in0};'.format(**fmtspec)

# -----------------------------------------------------------------------------

def reinterpret(totyp, typ):
    # Bit-preserving reinterpretation. GCC guarantees the union trick;
    # elsewhere memcpy is used; the emulated f16 type stores its bits in .u.
    if totyp == typ:
        return 'return {in0};'.format(**fmtspec)
    via_union = '''union {{ {typ} from; {totyp} to; }} buf;
                   buf.from = {in0};
                   return buf.to;'''.format(**fmtspec)
    via_memcpy = '''{totyp} ret;
                    memcpy((void *)&ret, (void *)&{in0}, sizeof(ret));
                    return ret;'''.format(**fmtspec)
    if typ == 'f16':
        if totyp == 'u16':
            emulated = 'return {in0}.u;'.format(**fmtspec)
        else:
            emulated = 'return nsimd_scalar_reinterpret_i16_u16({in0}.u);'. \
                       format(**fmtspec)
        return \
        '''#if defined(NSIMD_ARM_FP16) && defined(NSIMD_IS_GCC)
             {via_union}
           #elif (defined(NSIMD_ARM_FP16) && !defined(NSIMD_IS_GCC)) || \
                 defined(NSIMD_CUDA) || defined(NSIMD_ROCM) || \
                 defined(NSIMD_ONEAPI)
             {via_memcpy}
           #else
             {emulated}
           #endif'''.format(via_union=via_union, via_memcpy=via_memcpy,
                            emulated=emulated)
    if totyp == 'f16':
        if typ == 'u16':
            emulated = '''f16 ret;
                          ret.u = {in0};
                          return ret;'''.format(**fmtspec)
        else:
            emulated = '''f16 ret;
                          ret.u = nsimd_scalar_reinterpret_u16_i16({in0});
                          return ret;'''.format(**fmtspec)
        return \
        '''#if defined(NSIMD_ARM_FP16) && defined(NSIMD_IS_GCC)
             {via_union}
           #elif (defined(NSIMD_ARM_FP16) && !defined(NSIMD_IS_GCC)) || \
                 defined(NSIMD_CUDA) || defined(NSIMD_ROCM) || \
                 defined(NSIMD_ONEAPI)
             {via_memcpy}
           #else
             {emulated}
           #endif'''.format(via_union=via_union, via_memcpy=via_memcpy,
                            emulated=emulated)
    return '''#ifdef NSIMD_IS_GCC
                {via_union}
              #else
                {via_memcpy}
              #endif'''.format(via_union=via_union, via_memcpy=via_memcpy)

# -----------------------------------------------------------------------------

def cvt(totyp, typ):
    # Value-preserving conversion; f16 goes through f32 when no native FP16.
    if totyp == typ:
        return 'return {in0};'.format(**fmtspec)
    if typ == 'f16':
        return '''#ifdef NSIMD_ARM_FP16
                    return ({totyp}){in0};
                  #else
                    return ({totyp})nsimd_f16_to_f32({in0});
                  #endif'''.format(**fmtspec)
    if totyp == 'f16':
        return '''#ifdef NSIMD_ARM_FP16
                    return (f16){in0};
                  #else
                    return nsimd_f32_to_f16((f32){in0});
                  #endif'''.format(**fmtspec)
    return 'return ({totyp}){in0};'.format(**fmtspec)

# -----------------------------------------------------------------------------

def adds(typ):
    # Saturated addition: plain add for floats, wraparound check for
    # unsigned, explicit clamping to NSIMD_*_MIN/MAX for signed.
    if typ in common.ftypes:
        return opnum('{in0} + {in1}', typ)
    if typ in common.utypes:
        return '''{typ} tmp = ({typ})({in0} + {in1});
                  if (tmp < {in0} || tmp < {in1}) {{
                    return ({typ})-1;
                  }} else {{
                    return tmp;
                  }}
                  '''.format(**fmtspec)
    # Getting here means typ is signed
    int_max = 'NSIMD_' + typ.upper() + '_MAX'
    int_min = 'NSIMD_' + typ.upper() + '_MIN'
    return '''if (({in0} >= 0 && {in1} <= 0) || ({in0} <= 0 && {in1} >= 0)) {{
                return ({typ})({in0} + {in1});
              }} else {{
                if ({in0} > 0) {{
                  if ({in1} > {int_max} - {in0}) {{
                    return {int_max};
                  }} else {{
                    return ({typ})({in0} + {in1});
                  }}
                }} else {{
                  if ({in1} < {int_min} - {in0}) {{
                    return {int_min};
                  }} else {{
                    return ({typ})({in0} + {in1});
                  }}
                }}
              }}'''.format(int_min=int_min, int_max=int_max, **fmtspec)

# -----------------------------------------------------------------------------

def subs(typ):
    # Saturated subtraction: clamp to 0 for unsigned, reuse saturated add
    # with a negated second operand for signed.
    if typ in common.ftypes:
        return opnum('{in0} - {in1}', typ)
    if typ in common.utypes:
        return '''if ({in0} < {in1}) {{
                    return ({typ})0;
                  }} else {{
                    return ({typ})({in0} - {in1});
                  }}
                  '''.format(**fmtspec)
    # Getting here means typ is signed
    # NOTE(review): in the emitted C, -{in1} overflows when in1 is the
    # minimum value of the signed type (UB) -- TODO confirm whether callers
    # rely on this case.
    return 'return nsimd_scalar_adds_{typ}({in0}, ({typ})(-{in1}));'. \
           format(**fmtspec)

# -----------------------------------------------------------------------------

def get_impl(operator, totyp, typ):
    # Entry point: fills fmtspec (read by every generator above) and
    # dispatches on the operator; operators with special cases are handled
    # first, the rest through the func dictionary below.
    global fmtspec

    fmtspec = {
      'in0': common.in0,
      'in1': common.in1,
      'in2': common.in2,
      'typ': typ,
      'totyp': totyp,
      'typnbits': typ[1:]
    }

    if operator.name == 'trunc':
        if typ in common.iutypes:
            return 'return {in0};'.format(**fmtspec)
        elif typ == 'f16':
            c89_code = \
            '''f32 buf = nsimd_f16_to_f32({in0});
               return nsimd_f32_to_f16(buf >= 0.0f
                                       ? nsimd_scalar_floor_f32(buf)
                                       : nsimd_scalar_ceil_f32(buf));'''. \
               format(**fmtspec)
        else:
            c89_code = \
            '''return {in0} >= 0.0{f} ? nsimd_scalar_floor_{typ}({in0})
                                      : nsimd_scalar_ceil_{typ}({in0});'''. \
               format(f='f' if typ == 'f32' else '', **fmtspec)
        return libm_opn('trunc', 1, typ, True, c89_code)
    if operator.name == 'abs':
        if typ == 'f16':
            return '''f32 tmp = nsimd_f16_to_f32({in0});
                      return nsimd_f32_to_f16(tmp >= 0.0f ? tmp : -tmp);'''. \
                      format(**fmtspec)
        elif typ in common.utypes:
            return 'return {in0};'.format(**fmtspec)
        else:
            return 'return ({typ})({in0} >= ({typ})0 ? {in0} : -{in0});'. \
                   format(**fmtspec)
    if operator.name in ['min', 'max']:
        op = '<' if operator.name == 'min' else '>'
        if typ == 'f16':
            return '''f32 in0 = nsimd_f16_to_f32({in0});
                      f32 in1 = nsimd_f16_to_f32({in1});
                      return nsimd_f32_to_f16(in0 {op} in1 ? in0 : in1);'''. \
                      format(op=op, **fmtspec)
        else:
            return 'return {in0} {op} {in1} ? {in0} : {in1};'. \
                   format(op=op, **fmtspec)
    if operator.name == 'to_logical':
        if typ in common.iutypes:
            return 'return {in0} != ({typ})0;'.format(**fmtspec)
        else:
            return '''return nsimd_scalar_reinterpret_u{typnbits}_{typ}(
                          {in0}) != (u{typnbits})0;'''.format(**fmtspec)
    if operator.name == 'to_mask':
        if typ in common.utypes:
            return 'return ({typ})({in0} ? -1 : 0);'.format(**fmtspec)
        else:
            return '''return nsimd_scalar_reinterpret_{typ}_u{typnbits}((
                          u{typnbits})({in0} ? -1 : 0));'''. \
                          format(**fmtspec)
    if operator.name == 'round_to_even':
        return round_to_even(typ)
    if operator.name in ['floor', 'ceil', 'sqrt']:
        if typ in common.iutypes and operator.name != 'sqrt':
            return 'return {in0};'.format(**fmtspec)
        return libm_opn(operator.name, 1, typ, False, '')
    if operator.name == 'fma':
        if typ in common.iutypes:
            return 'return ({typ})({in0} * {in1} + {in2});'.format(**fmtspec)
        else:
            if typ == 'f16':
                c89_code = 'return nsimd_f32_to_f16(nsimd_f16_to_f32({in0}) ' \
                           '* nsimd_f16_to_f32({in1}) ' \
                           '+ nsimd_f16_to_f32({in2}));'.format(**fmtspec)
            else:
                c89_code = 'return {in0} * {in1} + {in2};'.format(**fmtspec)
            return libm_opn(operator.name, 3, typ, False, c89_code)
    if operator.name in ['fnma', 'fms', 'fnms']:
        # fnma/fnms negate the product, fms/fnms subtract the addend; both
        # are expressed through the already-generated scalar fma.
        neg = '-' if operator.name in ['fnms', 'fnma'] else ''
        op = '-' if operator.name in ['fms', 'fnms'] else '+'
        if typ in common.iutypes:
            return 'return ({typ})(({neg}{in0}) * {in1} {op} {in2});'. \
                   format(neg=neg, op=op, **fmtspec)
        else:
            typ2 = 'f32' if typ == 'f16' else typ
            return opnum(
                'nsimd_scalar_fma_{typ2}({neg}{{in0}}, {{in1}}, {op}{{in2}})'. \
                format(typ2=typ2, neg=neg, op=op, **fmtspec), typ)
    f = 'f' if typ in ['f16', 'f32'] else ''
    typ2 = 'f32' if typ == 'f16' else typ
    if operator.src:
        # Operators backed by Sleef sources call the Sleef scalar kernels.
        if typ == 'f16':
            return \
            '''return nsimd_f32_to_f16(
                          nsimd_sleef_{op_name}_scalar_f32({vas}));'''. \
                          format(op_name=operator.name,
                                 vas=', '.join(['nsimd_f16_to_f32({})'. \
                                 format(common.get_arg(i)) \
                                 for i in range(len(operator.params[1:]))]),
                                 **fmtspec)
        else:
            return 'return nsimd_sleef_{op_name}_scalar_{typ}({vas});'. \
                   format(op_name=operator.name,
                          vas=common.get_args(len(operator.params[1:])),
                          **fmtspec)
    func = {
        'orb': lambda: opbit('{in0} | {in1}', typ),
        'andb': lambda: opbit('{in0} & {in1}', typ),
        'andnotb': lambda: opbit('{in0} & (~{in1})', typ),
        'notb': lambda: opbit('~{in0}', typ),
        'xorb': lambda: opbit('{in0} ^ {in1}', typ),
        'add': lambda: opnum('{in0} + {in1}', typ),
        'sub': lambda: opnum('{in0} - {in1}', typ),
        'mul': lambda: opnum('{in0} * {in1}', typ),
        'div': lambda: opnum('{in0} / {in1}', typ),
        'neg': lambda: opnum('-{in0}', typ),
        'lt': lambda: cmp('{in0} < {in1}', typ),
        'gt': lambda: cmp('{in0} > {in1}', typ),
        'le': lambda: cmp('{in0} <= {in1}', typ),
        'ge': lambda: cmp('{in0} >= {in1}', typ),
        'ne': lambda: cmp('{in0} != {in1}', typ),
        'eq': lambda: cmp('{in0} == {in1}', typ),
        'andl': lambda: 'return {in0} && {in1};'.format(**fmtspec),
        'orl': lambda: 'return {in0} || {in1};'.format(**fmtspec),
        'xorl': lambda: 'return {in0} ^ {in1};'.format(**fmtspec),
        'andnotl': lambda: 'return {in0} && (!{in1});'.format(**fmtspec),
        'notl': lambda: 'return !{in0};'.format(**fmtspec),
        'shl': lambda: shift('shl', typ),
        'shr': lambda: shift('shr', typ),
        'shra': lambda: shift('shra', typ),
        'reinterpret': lambda: reinterpret(totyp, typ),
        'cvt': lambda: cvt(totyp, typ),
        'adds': lambda: adds(typ),
        'subs': lambda: subs(typ),
        'rec': lambda: opnum('1.0{f} / {{in0}}'.format(f=f), typ),
        'rec8': lambda: opnum('1.0{f} / {{in0}}'.format(f=f), typ),
        'rec11': lambda: opnum('1.0{f} / {{in0}}'.format(f=f), typ),
        'rsqrt': lambda:
opnum('1.0{f} / nsimd_scalar_sqrt_{typ2}({{in0}})'. \ format(f=f, typ2=typ2), typ), 'rsqrt8': lambda: opnum('1.0{f} / nsimd_scalar_sqrt_{typ2}({{in0}})'. \ format(f=f, typ2=typ2), typ), 'rsqrt11': lambda: opnum('1.0{f} / nsimd_scalar_sqrt_{typ2}({{in0}})'. \ format(f=f, typ2=typ2), typ) } return func[operator.name]() ================================================ FILE: egg/x86_load_store_deg234.py ================================================ # Copyright (c) 2019 Agenium Scale # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal # in the Software without restriction, including without limitation the rights # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell # copies of the Software, and to permit persons to whom the Software is # furnished to do so, subject to the following conditions: # # The above copyright notice and this permission notice shall be included in all # copies or substantial portions of the Software. # # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. 
import platform_x86 as x86
import common

# SIMD extension families handled by this generator.
sse = ['sse2', 'sse42']
avx = ['avx', 'avx2']
avx512 = ['avx512_knl', 'avx512_skylake']

###############################################################################
# Helper

def perm64(var1, var2, ind1, ind2):
    # Emit C code selecting one 64-bit half from each of two __m128i values
    # (via a pd shuffle), i.e. a 64-bit-lane permute of var1/var2.
    return '''_mm_castpd_si128(_mm_shuffle_pd(
                  _mm_castsi128_pd({}), _mm_castsi128_pd(
                      {}), _MM_SHUFFLE2({}, {})))'''. \
                      format(var1, var2, ind1, ind2)

###############################################################################

def get_load_v0v1(simd_ext, typ, align, fmtspec):
    # Emit C code loading two consecutive full vectors from a0 into v0/v1
    # (aligned or unaligned depending on `align`). Integer types need a
    # cast of a0 to the native vector pointer type.
    load = '{pre}load{a}{sufsi}'.format(a='' if align else 'u', **fmtspec)
    if typ in ['f32', 'f64']:
        return '''{styp} v0 = {load}(a0);
                  {styp} v1 = {load}(a0 + {le});'''. \
                  format(load=load, **fmtspec)
    else:
        return '''{styp} v0 = {load}(({styp}*)a0);
                  {styp} v1 = {load}(({styp}*)a0 + 1);'''. \
                  format(load=load, **fmtspec)

###############################################################################

def load2_sse(simd_ext, typ, align, fmtspec2):
    # Emit the SSE body of load2: load two interleaved vectors and split
    # them into ret.v0/ret.v1. SSE4.2 can use pshufb (_mm_shuffle_epi8);
    # plain SSE2 falls back to cascaded unpacks.
    fmtspec = fmtspec2.copy()
    fmtspec['load_v0v1'] = get_load_v0v1('sse', typ, align, fmtspec)
    if typ in ['i8', 'u8']:
        if simd_ext == 'sse42':
            return \
            '''nsimd_sse42_v{typ}x2 ret;
               {load_v0v1}
               __m128i mask = _mm_set_epi8(15, 13, 11, 9, 7, 5, 3, 1,
                                           14, 12, 10, 8, 6, 4, 2, 0);
               __m128i A0 = _mm_shuffle_epi8(v0, mask);
               __m128i B0 = _mm_shuffle_epi8(v1, mask);
               ret.v0 = {perm0};
               ret.v1 = {perm1};
               return ret;'''. \
               format(perm0=perm64('A0', 'B0', '0', '0'),
                      perm1=perm64('A0', 'B0', '1', '1'), **fmtspec)
        else:
            return \
            '''nsimd_sse2_v{typ}x2 ret;
               {load_v0v1}
               __m128i A1 = _mm_unpacklo_epi8(v0, v1);
               __m128i B2 = _mm_unpackhi_epi8(v0, v1);
               __m128i A3 = _mm_unpacklo_epi8(A1, B2);
               __m128i B4 = _mm_unpackhi_epi8(A1, B2);
               __m128i A5 = _mm_unpacklo_epi8(A3, B4);
               __m128i B6 = _mm_unpackhi_epi8(A3, B4);
               ret.v0 = _mm_unpacklo_epi8(A5, B6);
               ret.v1 = _mm_unpackhi_epi8(A5, B6);
               return ret;'''.format(**fmtspec)
    if typ in ['i16', 'u16']:
        if simd_ext == 'sse42':
            return \
            '''nsimd_sse42_v{typ}x2 ret;
               {load_v0v1}
               __m128i mask = _mm_set_epi8(15, 14, 11, 10, 7, 6, 3, 2,
                                           13, 12, 9, 8, 5, 4, 1, 0);
               __m128i A0 = _mm_shuffle_epi8(v0, mask);
               __m128i B0 = _mm_shuffle_epi8(v1, mask);
               ret.v0 = {perm0};
               ret.v1 = {perm1};
               return ret;'''. \
               format(perm0=perm64('A0', 'B0', '0', '0'),
                      perm1=perm64('A0', 'B0', '1', '1'), **fmtspec)
        else:
            return \
            '''nsimd_sse2_v{typ}x2 ret;
               {load_v0v1}
               __m128i v2 = _mm_unpacklo_epi16(v0, v1);
               __m128i v3 = _mm_unpackhi_epi16(v0, v1);
               __m128i v5 = _mm_unpacklo_epi16(v2, v3);
               __m128i v6 = _mm_unpackhi_epi16(v2, v3);
               ret.v0 = _mm_unpacklo_epi16(v5, v6);
               ret.v1 = _mm_unpackhi_epi16(v5, v6);
               return ret;'''.format(**fmtspec)
    if typ in ['i32', 'u32', 'f32']:
        return '''nsimd_{simd_ext}_v{typ}x2 ret;
                  {load_v0v1}
                  {styp} A0 = _mm_unpacklo{suf}(v0, v1);
                  {styp} B0 = _mm_unpackhi{suf}(v0, v1);
                  ret.v0 = _mm_unpacklo{suf}(A0, B0);
                  ret.v1 = _mm_unpackhi{suf}(A0, B0);
                  return ret;'''.format(**fmtspec)
    if typ in ['i64', 'u64', 'f64']:
        return '''nsimd_{simd_ext}_v{typ}x2 ret;
                  {load_v0v1}
                  ret.v0 = _mm_unpacklo{suf}(v0, v1);
                  ret.v1 = _mm_unpackhi{suf}(v0, v1);
                  return ret;'''.format(**fmtspec)

###############################################################################

def load2_avx(simd_ext, typ, align, fmtspec2):
    # Emit the AVX/AVX2 body of load2. AVX (without AVX2 integer ops)
    # works on extracted 128-bit halves; AVX2 uses full-width shuffles.
    fmtspec = fmtspec2.copy()
    fmtspec['exlo_v0'] = x86.extract('avx', typ, x86.LO, 'v0')
    fmtspec['exhi_v0'] = x86.extract('avx', typ, x86.HI, 'v0')
    fmtspec['exlo_v1'] = x86.extract('avx',
                                     typ, x86.LO, 'v1')
    fmtspec['exhi_v1'] = x86.extract('avx', typ, x86.HI, 'v1')
    fmtspec['load_v0v1'] = get_load_v0v1('avx', typ, align, fmtspec)
    # 'a' selects the aligned vs unaligned variant name of recursive calls.
    fmtspec['a'] = 'a' if align else 'u'
    if typ in ['i8', 'u8']:
        if simd_ext == 'avx2':
            return \
            '''nsimd_avx2_v{typ}x2 ret;
               {load_v0v1}
               __m256i mask = _mm256_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14,
                                  1, 3, 5, 7, 9, 11, 13, 15,
                                  0, 2, 4, 6, 8, 10, 12, 14,
                                  1, 3, 5, 7, 9, 11, 13, 15);
               __m256i A1 = _mm256_shuffle_epi8(v0, mask);
               __m256i B1 = _mm256_shuffle_epi8(v1, mask);
               __m256i A2 = _mm256_permute4x64_epi64(A1, _MM_SHUFFLE(3,1,2,0));
               __m256i B2 = _mm256_permute4x64_epi64(B1, _MM_SHUFFLE(3,1,2,0));
               ret.v0 = _mm256_permute2f128_si256(A2, B2, 2 << 4);
               ret.v1 = _mm256_permute2f128_si256(A2, B2, (3 << 4) | 1);
               return ret;'''.format(**fmtspec)
        else:
            # AVX without AVX2: do the byte shuffles per 128-bit half.
            return \
            '''nsimd_avx_v{typ}x2 ret;
               {load_v0v1}
               __m128i v0a = {exlo_v0};
               __m128i v0b = {exhi_v0};
               __m128i v1a = {exlo_v1};
               __m128i v1b = {exhi_v1};
               __m128i mask = _mm_set_epi8(15, 13, 11, 9, 7, 5, 3, 1,
                                           14, 12, 10, 8, 6, 4, 2, 0);
               __m128i A0a = _mm_shuffle_epi8(v0a, mask);
               __m128i B0a = _mm_shuffle_epi8(v1a, mask);
               __m128i A1a = {perm_a0};
               __m128i B1a = {perm_a1};
               __m128i A0b = _mm_shuffle_epi8(v0b, mask);
               __m128i B0b = _mm_shuffle_epi8(v1b, mask);
               __m128i A1b = {perm_b0};
               __m128i B1b = {perm_b1};
               ret.v0 = {merge_A1};
               ret.v1 = {merge_B1};
               return ret;'''. \
               format(merge_A1=x86.setr('avx', typ, 'A1a', 'A1b'),
                      merge_B1=x86.setr('avx', typ, 'B1a', 'B1b'),
                      perm_a0=perm64('A0a', 'B0a', '0', '0'),
                      perm_a1=perm64('A0a', 'B0a', '1', '1'),
                      perm_b0=perm64('A0b', 'B0b', '0', '0'),
                      perm_b1=perm64('A0b', 'B0b', '1', '1'), **fmtspec)
    if typ in ['i16', 'u16']:
        if simd_ext == 'avx2':
            return \
            '''nsimd_avx2_v{typ}x2 ret;
               {load_v0v1}
               __m256i A1 = _mm256_unpacklo_epi16(v0, v1);
               __m256i B1 = _mm256_unpackhi_epi16(v0, v1);
               __m256i A2 = _mm256_unpacklo_epi16(A1, B1);
               __m256i B2 = _mm256_unpackhi_epi16(A1, B1);
               ret.v0 = _mm256_unpacklo_epi16(A2, B2);
               ret.v1 = _mm256_unpackhi_epi16(A2, B2);
               return ret;'''.format(**fmtspec)
        else:
            return \
            '''nsimd_avx_v{typ}x2 ret;
               {load_v0v1}
               __m128i Aa = {exlo_v0};
               __m128i Ba = {exhi_v0};
               __m128i Ab = {exlo_v1};
               __m128i Bb = {exhi_v1};
               __m128i mask = _mm_set_epi8(15, 14, 11, 10, 7, 6, 3, 2,
                                           13, 12, 9, 8, 5, 4, 1, 0);
               __m128i XY0 = _mm_shuffle_epi8(Aa, mask);
               __m128i XY1 = _mm_shuffle_epi8(Ba, mask);
               __m128i Xa = {perm0};
               __m128i Ya = {perm1};
               XY0 = _mm_shuffle_epi8(Ab, mask);
               XY1 = _mm_shuffle_epi8(Bb, mask);
               __m128i Xb = {perm0};
               __m128i Yb = {perm1};
               ret.v0 = {mergeX};
               ret.v1 = {mergeY};
               return ret;'''. \
               format(perm0=perm64('XY0', 'XY1', '0', '0'),
                      perm1=perm64('XY0', 'XY1', '1', '1'),
                      mergeX=x86.setr('avx', typ, 'Xa', 'Xb'),
                      mergeY=x86.setr('avx', typ, 'Ya', 'Yb'), **fmtspec)
    if typ == 'f32':
        return '''nsimd_{simd_ext}_vf32x2 ret;
                  {load_v0v1}
                  __m256 A1 = _mm256_unpacklo_ps(v0, v1);
                  __m256 B1 = _mm256_unpackhi_ps(v0, v1);
                  ret.v0 = _mm256_unpacklo_ps(A1, B1);
                  ret.v1 = _mm256_unpackhi_ps(A1, B1);
                  return ret;'''.format(**fmtspec)
    if typ in ['i32', 'u32']:
        if simd_ext == 'avx2':
            return \
            '''nsimd_avx2_v{typ}x2 ret;
               {load_v0v1}
               __m256i A1 = _mm256_unpacklo_epi32(v0, v1);
               __m256i B1 = _mm256_unpackhi_epi32(v0, v1);
               ret.v0 = _mm256_unpacklo_epi32(A1, B1);
               ret.v1 = _mm256_unpackhi_epi32(A1, B1);
               return ret;'''.format(**fmtspec)
        else:
            # AVX: reuse the f32 implementation through bit casts.
            return \
            '''nsimd_avx_v{typ}x2 ret;
               nsimd_avx_vf32x2 retf32 = nsimd_load2{a}_avx_f32((f32 *){in0});
               ret.v0 = _mm256_castps_si256(retf32.v0);
               ret.v1 = _mm256_castps_si256(retf32.v1);
               return ret;'''.format(**fmtspec)
    if typ == 'f64':
        return '''nsimd_{simd_ext}_vf64x2 ret;
                  {load_v0v1}
                  ret.v0 = _mm256_unpacklo_pd(v0, v1);
                  ret.v1 = _mm256_unpackhi_pd(v0, v1);
                  return ret;'''.format(**fmtspec)
    if typ in ['i64', 'u64']:
        if simd_ext == 'avx2':
            return \
            '''nsimd_avx2_v{typ}x2 ret;
               {load_v0v1}
               ret.v0 = _mm256_unpacklo_epi64(v0, v1);
               ret.v1 = _mm256_unpackhi_epi64(v0, v1);
               return ret;'''.format(**fmtspec)
        else:
            # AVX: reuse the f64 implementation through bit casts.
            return \
            '''nsimd_avx_v{typ}x2 ret;
               nsimd_avx_vf64x2 retf64 = nsimd_load2{a}_avx_f64((f64 *){in0});
               ret.v0 = _mm256_castpd_si256(retf64.v0);
               ret.v1 = _mm256_castpd_si256(retf64.v1);
               return ret;'''.format(**fmtspec)

###############################################################################

def load2_avx512(simd_ext, typ, align, fmtspec2):
    # Emit the AVX-512 body of load2. 8/16-bit types are processed on
    # extracted 256-bit halves; 32/64-bit types use full-width permutes.
    fmtspec = fmtspec2.copy()
    fmtspec['exlo_v0'] = x86.extract(simd_ext, typ, x86.LO, 'v0')
    fmtspec['exhi_v0'] = x86.extract(simd_ext, typ, x86.HI, 'v0')
    fmtspec['exlo_v1'] = x86.extract(simd_ext, typ, x86.LO, 'v1')
    fmtspec['exhi_v1'] = x86.extract(simd_ext, typ, x86.HI, 'v1')
    fmtspec['load_v0v1'] =
                             get_load_v0v1(simd_ext, typ, align, fmtspec)
    if typ in ['i8', 'u8']:
        return \
        '''nsimd_{simd_ext}_v{typ}x2 ret;
           {load_v0v1}
           __m256i A0 = {exlo_v0};
           __m256i B0 = {exhi_v0};
           __m256i C0 = {exlo_v1};
           __m256i D0 = {exhi_v1};
           __m256i mask = _mm256_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14,
                              1, 3, 5, 7, 9, 11, 13, 15,
                              0, 2, 4, 6, 8, 10, 12, 14,
                              1, 3, 5, 7, 9, 11, 13, 15);
           __m256i A1 = _mm256_shuffle_epi8(A0, mask);
           __m256i B1 = _mm256_shuffle_epi8(B0, mask);
           __m256i C1 = _mm256_shuffle_epi8(C0, mask);
           __m256i D1 = _mm256_shuffle_epi8(D0, mask);
           __m256i A2 = _mm256_permute4x64_epi64(A1, _MM_SHUFFLE(3,1,2,0));
           __m256i B2 = _mm256_permute4x64_epi64(B1, _MM_SHUFFLE(3,1,2,0));
           __m256i C2 = _mm256_permute4x64_epi64(C1, _MM_SHUFFLE(3,1,2,0));
           __m256i D2 = _mm256_permute4x64_epi64(D1, _MM_SHUFFLE(3,1,2,0));
           __m256i A3 = _mm256_permute2f128_si256(A2, B2, 2 << 4);
           __m256i B3 = _mm256_permute2f128_si256(A2, B2, (3 << 4) | 1);
           __m256i C3 = _mm256_permute2f128_si256(C2, D2, 2 << 4);
           __m256i D3 = _mm256_permute2f128_si256(C2, D2, (3 << 4) | 1);
           ret.v0 = {mergeAC};
           ret.v1 = {mergeBD};
           return ret;'''.format(mergeAC=x86.setr(simd_ext, typ, 'A3', 'C3'),
                                 mergeBD=x86.setr(simd_ext, typ, 'B3', 'D3'),
                                 **fmtspec)
    if typ in ['i16', 'u16']:
        # Same cascade of unpacks as the AVX2 path, done once per
        # 256-bit half and then merged back into a 512-bit vector.
        return \
        '''nsimd_{simd_ext}_v{typ}x2 ret;
           {load_v0v1}
           __m256i A0a = {exlo_v0};
           __m256i B0a = {exhi_v0};
           __m256i A0b = {exlo_v1};
           __m256i B0b = {exhi_v1};
           __m256i A1 = _mm256_unpacklo_epi16(A0a, B0a);
           __m256i B1 = _mm256_unpackhi_epi16(A0a, B0a);
           __m256i A2 = _mm256_unpacklo_epi16(A1, B1);
           __m256i B2 = _mm256_unpackhi_epi16(A1, B1);
           __m256i A3a = _mm256_unpacklo_epi16(A2, B2);
           __m256i B3a = _mm256_unpackhi_epi16(A2, B2);
           A1 = _mm256_unpacklo_epi16(A0b, B0b);
           B1 = _mm256_unpackhi_epi16(A0b, B0b);
           A2 = _mm256_unpacklo_epi16(A1, B1);
           B2 = _mm256_unpackhi_epi16(A1, B1);
           __m256i A3b = _mm256_unpacklo_epi16(A2, B2);
           __m256i B3b = _mm256_unpackhi_epi16(A2, B2);
           ret.v0 = {mergeA};
           ret.v1 = {mergeB};
           return ret;'''.format(mergeA=x86.setr(simd_ext, typ, 'A3a', 'A3b'),
                                 mergeB=x86.setr(simd_ext, typ, 'B3a', 'B3b'),
                                 **fmtspec)
    if typ in ['f32', 'i32', 'u32']:
        # Full-width two-source permute: even indices go to v0, odd to v1.
        return \
        '''nsimd_{simd_ext}_v{typ}x2 ret;
           {load_v0v1}
           __m512i mask1 = _mm512_setr_epi32(0, 2, 4, 6, 8, 10, 12, 14,
                               16, 18, 20, 22, 24, 26, 28, 30);
           __m512i mask2 = _mm512_setr_epi32(1, 3, 5, 7, 9, 11, 13, 15,
                               17, 19, 21, 23, 25, 27, 29, 31);
           ret.v0 = _mm512_permutex2var{suf}(v0, mask1, v1);
           ret.v1 = _mm512_permutex2var{suf}(v0, mask2, v1);
           return ret;'''.format(**fmtspec)
    if typ in ['f64', 'i64', 'u64']:
        return \
        '''nsimd_{simd_ext}_v{typ}x2 ret;
           {load_v0v1}
           ret.v0 = _mm512_unpacklo{suf}(v0, v1);
           ret.v1 = _mm512_unpackhi{suf}(v0, v1);
           return ret;'''.format(**fmtspec)

###############################################################################

def store2(simd_ext, typ, align, fmtspec2):
    # Emit the body of store2: interleave the two input vectors ({in1},
    # {in2}) and store them to {in0}. `normal` is the plain
    # unpacklo/unpackhi form which is correct whenever the unpack
    # granularity matches the whole register (SSE, and wide types).
    fmtspec = fmtspec2.copy()
    fmtspec['store'] = '{pre}store{a}{sufsi}'.format(a='' if align else 'u',
                                                     **fmtspec)
    if typ in ['f32', 'f64']:
        dest1 = '{in0}'.format(**fmtspec)
        dest2 = '{in0} + {le}'.format(**fmtspec)
    else:
        dest1 = '(__m{nbits}i *){in0}'.format(**fmtspec)
        dest2 = '(__m{nbits}i *){in0} + 1'.format(**fmtspec)
    normal = '''{store}({dest1}, {pre}unpacklo{suf}({in1}, {in2}));
                {store}({dest2}, {pre}unpackhi{suf}({in1}, {in2}));'''.
\
             format(dest1=dest1, dest2=dest2, **fmtspec)
    if simd_ext in sse:
        return normal
    fmtspec['exlo_in1'] = x86.extract(simd_ext, typ, x86.LO, common.in1)
    fmtspec['exhi_in1'] = x86.extract(simd_ext, typ, x86.HI, common.in1)
    fmtspec['exlo_in2'] = x86.extract(simd_ext, typ, x86.LO, common.in2)
    fmtspec['exhi_in2'] = x86.extract(simd_ext, typ, x86.HI, common.in2)
    fmtspec['normal'] = normal
    fmtspec['dest1'] = dest1
    fmtspec['dest2'] = dest2
    if simd_ext == 'avx2':
        if typ in ['i8', 'u8']:
            return \
            '''__m256i A1 = _mm256_permute2f128_si256({in1}, {in2}, 2 << 4);
               __m256i B1 = _mm256_permute2f128_si256(
                                {in1}, {in2}, (3 << 4) | 1);
               __m256i A2 = _mm256_permute4x64_epi64(A1, _MM_SHUFFLE(3,1,2,0));
               __m256i B2 = _mm256_permute4x64_epi64(B1, _MM_SHUFFLE(3,1,2,0));
               __m256i mask = _mm256_setr_epi8(0, 8, 1, 9, 2, 10, 3, 11,
                                  4, 12, 5, 13, 6, 14, 7, 15,
                                  0, 8, 1, 9, 2, 10, 3, 11,
                                  4, 12, 5, 13, 6, 14, 7, 15);
               {store}({dest1}, _mm256_shuffle_epi8(A2, mask));
               {store}({dest2}, _mm256_shuffle_epi8(B2, mask));'''. \
               format(**fmtspec)
        if typ in ['i16', 'u16']:
            return normal
    if simd_ext == 'avx':
        # AVX without AVX2 integer ops: interleave per 128-bit half with
        # SSE intrinsics, then merge halves back into 256-bit stores.
        if typ in ['i8', 'u8']:
            return \
            '''__m128i v0a = {exlo_in1};
               __m128i v0b = {exhi_in1};
               __m128i v1a = {exlo_in2};
               __m128i v1b = {exhi_in2};
               __m128i A1a = _mm_unpacklo_epi8(v0a, v1a);
               __m128i B1a = _mm_unpackhi_epi8(v0a, v1a);
               __m128i A1b = _mm_unpacklo_epi8(v0b, v1b);
               __m128i B1b = _mm_unpackhi_epi8(v0b, v1b);
               __m256i A1 = {mergeA1};
               __m256i B1 = {mergeB1};
               {store}({dest1}, A1);
               {store}({dest2}, B1);'''. \
               format(mergeA1=x86.setr('avx', typ, 'A1a', 'A1b'),
                      mergeB1=x86.setr('avx', typ, 'B1a', 'B1b'), **fmtspec)
        if typ in ['i16', 'u16']:
            return \
            '''__m128i Xa = {exlo_in1};
               __m128i Xb = {exhi_in1};
               __m128i Ya = {exlo_in2};
               __m128i Yb = {exhi_in2};
               __m128i A0 = _mm_unpacklo_epi16(Xa, Ya);
               __m128i B0 = _mm_unpackhi_epi16(Xa, Ya);
               __m128i A1 = _mm_unpacklo_epi16(Xb, Yb);
               __m128i B1 = _mm_unpackhi_epi16(Xb, Yb);
               __m256i A = {merge0};
               __m256i B = {merge1};
               {store}({dest1}, A);
               {store}({dest2}, B);'''. \
               format(merge0=x86.setr('avx', typ, 'A0', 'B0'),
                      merge1=x86.setr('avx', typ, 'A1', 'B1'), **fmtspec)
    if (simd_ext in avx and typ in ['f32', 'f64']) or \
       simd_ext == 'avx2' and typ in ['i32', 'u32', 'i64', 'u64']:
        return normal
    if simd_ext == 'avx' and typ in ['i32', 'u32', 'i64', 'u64']:
        # AVX has no 256-bit integer unpacks: go through the float domain.
        ftyp = '__m256' if typ in ['i32', 'u32'] else '__m256d'
        fsuf = 'ps' if typ in ['i32', 'u32'] else 'pd'
        return '''{ftyp} v0 = _mm256_castsi256_{fsuf}({in1});
                  {ftyp} v1 = _mm256_castsi256_{fsuf}({in2});
                  {store}({dest1}, _mm256_cast{fsuf}_si256(
                              _mm256_unpacklo_{fsuf}(v0, v1)));
                  {store}({dest2}, _mm256_cast{fsuf}_si256(
                              _mm256_unpackhi_{fsuf}(v0, v1)));'''. \
                  format(ftyp=ftyp, fsuf=fsuf, **fmtspec)
    if simd_ext in avx512:
        if typ in ['i8', 'u8']:
            return \
            '''__m256i A1 = {exlo_in1};
               __m256i B1 = {exhi_in1};
               __m256i C1 = {exlo_in2};
               __m256i D1 = {exhi_in2};
               __m256i A2 = _mm256_permute2f128_si256(A1, C1, 2 << 4);
               __m256i B2 = _mm256_permute2f128_si256(A1, C1, (3 << 4) | 1);
               __m256i C2 = _mm256_permute2f128_si256(B1, D1, 2 << 4);
               __m256i D2 = _mm256_permute2f128_si256(B1, D1, (3 << 4) | 1);
               __m256i A3 = _mm256_permute4x64_epi64(A2, _MM_SHUFFLE(3,1,2,0));
               __m256i B3 = _mm256_permute4x64_epi64(B2, _MM_SHUFFLE(3,1,2,0));
               __m256i C3 = _mm256_permute4x64_epi64(C2, _MM_SHUFFLE(3,1,2,0));
               __m256i D3 = _mm256_permute4x64_epi64(D2, _MM_SHUFFLE(3,1,2,0));
               __m256i mask = _mm256_setr_epi8(0, 8, 1, 9, 2, 10, 3, 11,
                                  4, 12, 5, 13, 6, 14, 7, 15,
                                  0, 8, 1, 9, 2, 10, 3, 11,
                                  4, 12, 5, 13, 6, 14, 7, 15);
               __m256i A4 = _mm256_shuffle_epi8(A3, mask);
               __m256i B4 = _mm256_shuffle_epi8(B3, mask);
               __m256i C4 = _mm256_shuffle_epi8(C3, mask);
               __m256i D4 = _mm256_shuffle_epi8(D3, mask);
               {store}({dest1}, {mergeAB});
               {store}({dest2}, {mergeCD});'''. \
               format(mergeAB=x86.setr(simd_ext, typ, 'A4', 'B4'),
                      mergeCD=x86.setr(simd_ext, typ, 'C4', 'D4'), **fmtspec)
        if typ in ['i16', 'u16']:
            return \
            '''__m256i A0a = {exlo_in1};
               __m256i A0b = {exhi_in1};
               __m256i B0a = {exlo_in2};
               __m256i B0b = {exhi_in2};
               __m256i A1a = _mm256_unpacklo_epi16(A0a, B0a);
               __m256i B1a = _mm256_unpackhi_epi16(A0a, B0a);
               __m256i A1b = _mm256_unpacklo_epi16(A0b, B0b);
               __m256i B1b = _mm256_unpackhi_epi16(A0b, B0b);
               {store}({dest1}, {mergea});
               {store}({dest2}, {mergeb});'''. \
               format(mergea=x86.setr(simd_ext, typ, 'A1a', 'B1a'),
                      mergeb=x86.setr(simd_ext, typ, 'A1b', 'B1b'), **fmtspec)
        if typ in ['i32', 'f32', 'u32']:
            # Full-width two-source permute doing the interleave directly.
            return \
            '''__m512i mask1 = _mm512_setr_epi32(0, 16, 1, 17, 2, 18, 3, 19,
                                   4, 20, 5, 21, 6, 22, 7, 23);
               __m512i mask2 = _mm512_setr_epi32(8, 24, 9, 25, 10, 26, 11, 27,
                                   12, 28, 13, 29, 14, 30, 15, 31);
               {store}({dest1}, _mm512_permutex2var{suf}({in1}, mask1, {in2}));
               {store}({dest2}, _mm512_permutex2var{suf}(
                                    {in1}, mask2, {in2}));'''.format(**fmtspec)
        if typ in ['i64', 'u64', 'f64']:
            return \
            '''{store}({dest1}, _mm512_unpacklo{suf}({in1}, {in2}));
               {store}({dest2}, _mm512_unpackhi{suf}({in1}, {in2}));'''. \
               format(**fmtspec)

###############################################################################

def get_load_v0v1v2v3(simd_ext, typ, align, fmtspec):
    # Emit C code loading four consecutive full vectors from a0 into
    # v0..v3 (aligned or unaligned depending on `align`).
    load = '{pre}load{a}{sufsi}'.format(a='' if align else 'u', **fmtspec)
    if typ in ['f32', 'f64']:
        return '''{styp} v0 = {load}(a0);
                  {styp} v1 = {load}(a0 + {le});
                  {styp} v2 = {load}(a0 + (2 * {le}));
                  {styp} v3 = {load}(a0 + (3 * {le}));'''. \
                  format(load=load, **fmtspec)
    else:
        return '''{styp} v0 = {load}(({styp}*)a0);
                  {styp} v1 = {load}(({styp}*)a0 + 1);
                  {styp} v2 = {load}(({styp}*)a0 + 2);
                  {styp} v3 = {load}(({styp}*)a0 + 3);'''.
\
                  format(load=load, **fmtspec)

###############################################################################

def load4_sse(simd_ext, typ, align, fmtspec2):
    # Emit the SSE body of load4: load four interleaved vectors and split
    # them into ret.v0..ret.v3. SSE4.2 uses pshufb; SSE2 uses unpacks.
    fmtspec = fmtspec2.copy()
    fmtspec['load_v0v1v2v3'] = get_load_v0v1v2v3('sse', typ, align, fmtspec)
    if typ in ['i8', 'u8']:
        if simd_ext == 'sse42':
            return \
            '''nsimd_sse42_v{typ}x4 ret;
               {load_v0v1v2v3}
               __m128i mask = _mm_set_epi8(15, 11, 7, 3, 14, 10, 6, 2,
                                           13, 9, 5, 1, 12, 8, 4, 0);
               __m128d A1 = _mm_castsi128_pd(_mm_shuffle_epi8(v0, mask));
               __m128d B1 = _mm_castsi128_pd(_mm_shuffle_epi8(v1, mask));
               __m128d C1 = _mm_castsi128_pd(_mm_shuffle_epi8(v2, mask));
               __m128d D1 = _mm_castsi128_pd(_mm_shuffle_epi8(v3, mask));
               __m128 A2 = _mm_castpd_ps(_mm_shuffle_pd(A1, B1,
                               _MM_SHUFFLE2(0, 0)));
               __m128 A3 = _mm_castpd_ps(_mm_shuffle_pd(C1, D1,
                               _MM_SHUFFLE2(0, 0)));
               __m128 C2 = _mm_castpd_ps(_mm_shuffle_pd(A1, B1,
                               _MM_SHUFFLE2(1, 1)));
               __m128 C3 = _mm_castpd_ps(_mm_shuffle_pd(C1, D1,
                               _MM_SHUFFLE2(1, 1)));
               ret.v0 = _mm_castps_si128(_mm_shuffle_ps(
                            A2, A3, _MM_SHUFFLE(2, 0, 2, 0)));
               ret.v1 = _mm_castps_si128(_mm_shuffle_ps(
                            A2, A3, _MM_SHUFFLE(3, 1, 3, 1)));
               ret.v2 = _mm_castps_si128(_mm_shuffle_ps(
                            C2, C3, _MM_SHUFFLE(2, 0, 2, 0)));
               ret.v3 = _mm_castps_si128(_mm_shuffle_ps(
                            C2, C3, _MM_SHUFFLE(3, 1, 3, 1)));
               return ret;'''.format(**fmtspec)
        else:
            return \
            '''nsimd_sse2_v{typ}x4 ret;
               {load_v0v1v2v3}
               __m128i A1 = _mm_unpacklo_epi8(v0, v2);
               __m128i B1 = _mm_unpackhi_epi8(v0, v2);
               __m128i C1 = _mm_unpacklo_epi8(v1, v3);
               __m128i D1 = _mm_unpackhi_epi8(v1, v3);
               __m128i A2 = _mm_unpacklo_epi8(A1, C1);
               __m128i B2 = _mm_unpackhi_epi8(A1, C1);
               __m128i C2 = _mm_unpacklo_epi8(B1, D1);
               __m128i D2 = _mm_unpackhi_epi8(B1, D1);
               __m128i A3 = _mm_unpacklo_epi8(A2, C2);
               __m128i B3 = _mm_unpackhi_epi8(A2, C2);
               __m128i C3 = _mm_unpacklo_epi8(B2, D2);
               __m128i D3 = _mm_unpackhi_epi8(B2, D2);
               ret.v0 = _mm_unpacklo_epi8(A3, C3);
               ret.v1 = _mm_unpackhi_epi8(A3, C3);
               ret.v2 = _mm_unpacklo_epi8(B3, D3);
               ret.v3 = _mm_unpackhi_epi8(B3, D3);
               return ret;'''.format(**fmtspec)
    if typ in ['i16', 'u16']:
        return \
        '''nsimd_{simd_ext}_v{typ}x4 ret;
           {load_v0v1v2v3}
           __m128i E = _mm_unpacklo_epi16(v0,v1);
           __m128i F = _mm_unpackhi_epi16(v0,v1);
           __m128i G = _mm_unpacklo_epi16(v2,v3);
           __m128i H = _mm_unpackhi_epi16(v2,v3);
           __m128i I = _mm_unpacklo_epi16(E,F);
           __m128i J = _mm_unpackhi_epi16(E,F);
           __m128i K = _mm_unpacklo_epi16(G,H);
           __m128i L = _mm_unpackhi_epi16(G,H);
           ret.v0 = _mm_unpacklo_epi64(I,K);
           ret.v1 = _mm_unpackhi_epi64(I,K);
           ret.v2 = _mm_unpacklo_epi64(J,L);
           ret.v3 = _mm_unpackhi_epi64(J,L);
           return ret;'''.format(**fmtspec)
    if typ in ['f32', 'i32', 'u32']:
        return \
        '''nsimd_{simd_ext}_v{typ}x4 ret;
           {load_v0v1v2v3}
           {styp} A1 = _mm_unpacklo{suf}(v0, v2);
           {styp} B1 = _mm_unpackhi{suf}(v0, v2);
           {styp} C1 = _mm_unpacklo{suf}(v1, v3);
           {styp} D1 = _mm_unpackhi{suf}(v1, v3);
           ret.v0 = _mm_unpacklo{suf}(A1, C1);
           ret.v1 = _mm_unpackhi{suf}(A1, C1);
           ret.v2 = _mm_unpacklo{suf}(B1, D1);
           ret.v3 = _mm_unpackhi{suf}(B1, D1);
           return ret;'''.format(**fmtspec)
    if typ in ['f64', 'i64', 'u64']:
        return \
        '''nsimd_{simd_ext}_v{typ}x4 ret;
           {load_v0v1v2v3}
           ret.v0 = _mm_unpacklo{suf}(v0, v2);
           ret.v1 = _mm_unpackhi{suf}(v0, v2);
           ret.v2 = _mm_unpacklo{suf}(v1, v3);
           ret.v3 = _mm_unpackhi{suf}(v1, v3);
           return ret;'''.format(**fmtspec)

###############################################################################

def load4_avx(simd_ext, typ, align, fmtspec2):
    # Emit the AVX/AVX2 body of load4. Small integer types are handled on
    # extracted 128-bit halves when AVX2 full-width shuffles are missing.
    fmtspec = fmtspec2.copy()
    fmtspec['load_v0v1v2v3'] = get_load_v0v1v2v3('avx', typ, align, fmtspec)
    fmtspec['exlo_v0'] = x86.extract('avx', typ, x86.LO, 'v0')
    fmtspec['exhi_v0'] = x86.extract('avx', typ, x86.HI, 'v0')
    fmtspec['exlo_v1'] = x86.extract('avx', typ, x86.LO, 'v1')
    fmtspec['exhi_v1'] = x86.extract('avx', typ, x86.HI, 'v1')
    fmtspec['exlo_v2'] = x86.extract('avx', typ, x86.LO, 'v2')
    fmtspec['exhi_v2'] = x86.extract('avx', typ, x86.HI, 'v2')
    fmtspec['exlo_v3'] = x86.extract('avx', typ, x86.LO, 'v3')
    fmtspec['exhi_v3'] = x86.extract('avx', typ, x86.HI, 'v3')
    fmtspec['a'] = 'a' if
                          align else 'u'
    if typ in ['i8', 'u8']:
        if simd_ext == 'avx2':
            return \
            '''nsimd_avx2_v{typ}x4 ret;
               {load_v0v1v2v3}
               __m256i mask = _mm256_setr_epi8(0, 4, 8, 12, 1, 5, 9, 13,
                                  2, 6, 10, 14, 3, 7, 11, 15,
                                  0, 4, 8, 12, 1, 5, 9, 13,
                                  2, 6, 10, 14, 3, 7, 11, 15);
               __m256i mask2 = _mm256_setr_epi32(0, 4, 1, 5, 2, 6, 3, 7);
               __m256i A1 = _mm256_shuffle_epi8(v0, mask);
               __m256i B1 = _mm256_shuffle_epi8(v1, mask);
               __m256i C1 = _mm256_shuffle_epi8(v2, mask);
               __m256i D1 = _mm256_shuffle_epi8(v3, mask);
               __m256i A2 = _mm256_permutevar8x32_epi32(A1, mask2);
               __m256i B2 = _mm256_permutevar8x32_epi32(B1, mask2);
               __m256i C2 = _mm256_permutevar8x32_epi32(C1, mask2);
               __m256i D2 = _mm256_permutevar8x32_epi32(D1, mask2);
               __m256i A3 = _mm256_permute2x128_si256(A2, C2, 2 << 4);
               __m256i C3 = _mm256_permute2x128_si256(B2, D2, 2 << 4);
               __m256i B3 = _mm256_permute2x128_si256(A2, C2, (3 << 4) | 1);
               __m256i D3 = _mm256_permute2x128_si256(B2, D2, (3 << 4) | 1);
               ret.v0 = _mm256_unpacklo_epi64(A3, C3);
               ret.v1 = _mm256_unpackhi_epi64(A3, C3);
               ret.v2 = _mm256_unpacklo_epi64(B3, D3);
               ret.v3 = _mm256_unpackhi_epi64(B3, D3);
               return ret;'''.format(**fmtspec)
        else:
            # AVX without AVX2: same pshufb + pd/ps shuffle sequence as
            # the SSE4.2 load4, run on each 128-bit half then merged.
            return \
            '''nsimd_avx_v{typ}x4 ret;
               {load_v0v1v2v3}
               __m128i Aa = {exlo_v0};
               __m128i Ba = {exhi_v0};
               __m128i Ca = {exlo_v1};
               __m128i Da = {exhi_v1};
               __m128i Ab = {exlo_v2};
               __m128i Bb = {exhi_v2};
               __m128i Cb = {exlo_v3};
               __m128i Db = {exhi_v3};
               __m128i mask = _mm_set_epi8(15, 11, 7, 3, 14, 10, 6, 2,
                                           13, 9, 5, 1, 12, 8, 4, 0);
               __m128d A1 = _mm_castsi128_pd(_mm_shuffle_epi8(Aa, mask));
               __m128d B1 = _mm_castsi128_pd(_mm_shuffle_epi8(Ba, mask));
               __m128d C1 = _mm_castsi128_pd(_mm_shuffle_epi8(Ca, mask));
               __m128d D1 = _mm_castsi128_pd(_mm_shuffle_epi8(Da, mask));
               __m128 A2 = _mm_castpd_ps(_mm_shuffle_pd(A1, B1,
                               _MM_SHUFFLE2(0, 0)));
               __m128 A3 = _mm_castpd_ps(_mm_shuffle_pd(C1, D1,
                               _MM_SHUFFLE2(0, 0)));
               __m128 C2 = _mm_castpd_ps(_mm_shuffle_pd(A1, B1,
                               _MM_SHUFFLE2(1, 1)));
               __m128 C3 = _mm_castpd_ps(_mm_shuffle_pd(C1, D1,
                               _MM_SHUFFLE2(1, 1)));
               __m128i Wa = _mm_castps_si128(_mm_shuffle_ps(A2, A3,
                                _MM_SHUFFLE(2, 0, 2, 0)));
               __m128i Xa = _mm_castps_si128(_mm_shuffle_ps(A2, A3,
                                _MM_SHUFFLE(3, 1, 3, 1)));
               __m128i Ya = _mm_castps_si128(_mm_shuffle_ps(C2, C3,
                                _MM_SHUFFLE(2, 0, 2, 0)));
               __m128i Za = _mm_castps_si128(_mm_shuffle_ps(C2, C3,
                                _MM_SHUFFLE(3, 1, 3, 1)));
               A1 = _mm_castsi128_pd(_mm_shuffle_epi8(Ab, mask));
               B1 = _mm_castsi128_pd(_mm_shuffle_epi8(Bb, mask));
               C1 = _mm_castsi128_pd(_mm_shuffle_epi8(Cb, mask));
               D1 = _mm_castsi128_pd(_mm_shuffle_epi8(Db, mask));
               A2 = _mm_castpd_ps(_mm_shuffle_pd(A1, B1, _MM_SHUFFLE2(0, 0)));
               A3 = _mm_castpd_ps(_mm_shuffle_pd(C1, D1, _MM_SHUFFLE2(0, 0)));
               C2 = _mm_castpd_ps(_mm_shuffle_pd(A1, B1, _MM_SHUFFLE2(1, 1)));
               C3 = _mm_castpd_ps(_mm_shuffle_pd(C1, D1, _MM_SHUFFLE2(1, 1)));
               __m128i Wb = _mm_castps_si128(_mm_shuffle_ps(A2, A3,
                                _MM_SHUFFLE(2, 0, 2, 0)));
               __m128i Xb = _mm_castps_si128(_mm_shuffle_ps(A2, A3,
                                _MM_SHUFFLE(3, 1, 3, 1)));
               __m128i Yb = _mm_castps_si128(_mm_shuffle_ps(C2, C3,
                                _MM_SHUFFLE(2, 0, 2, 0)));
               __m128i Zb = _mm_castps_si128(_mm_shuffle_ps(C2, C3,
                                _MM_SHUFFLE(3, 1, 3, 1)));
               ret.v0 = {mergeW};
               ret.v1 = {mergeX};
               ret.v2 = {mergeY};
               ret.v3 = {mergeZ};
               return ret;'''.format(mergeW=x86.setr('avx', typ, 'Wa', 'Wb'),
                                     mergeX=x86.setr('avx', typ, 'Xa', 'Xb'),
                                     mergeY=x86.setr('avx', typ, 'Ya', 'Yb'),
                                     mergeZ=x86.setr('avx', typ, 'Za', 'Zb'),
                                     **fmtspec)
    if typ in ['i16', 'u16']:
        if simd_ext == 'avx2':
            return \
            '''nsimd_avx2_v{typ}x4 ret;
               {load_v0v1v2v3}
               __m256i A1 = _mm256_unpacklo_epi16(v0, v2);
               __m256i B1 = _mm256_unpackhi_epi16(v0, v2);
               __m256i C1 = _mm256_unpacklo_epi16(v1, v3);
               __m256i D1 = _mm256_unpackhi_epi16(v1, v3);
               __m256i A2 = _mm256_unpacklo_epi16(A1, C1);
               __m256i B2 = _mm256_unpackhi_epi16(A1, C1);
               __m256i C2 = _mm256_unpacklo_epi16(B1, D1);
               __m256i D2 = _mm256_unpackhi_epi16(B1, D1);
               ret.v0 = _mm256_unpacklo_epi16(A2, C2);
               ret.v1 = _mm256_unpackhi_epi16(A2, C2);
               ret.v2 = _mm256_unpacklo_epi16(B2, D2);
               ret.v3 = _mm256_unpackhi_epi16(B2, D2);
               return ret;'''.format(**fmtspec)
        else:
            return \
            '''nsimd_avx_v{typ}x4 ret;
               {load_v0v1v2v3}
               __m128i Aa = {exlo_v0};
               __m128i Ba = {exhi_v0};
               __m128i Ca = {exlo_v1};
               __m128i Da = {exhi_v1};
               __m128i Ab = {exlo_v2};
               __m128i Bb = {exhi_v2};
               __m128i Cb = {exlo_v3};
               __m128i Db = {exhi_v3};
               __m128i mask = _mm_set_epi8(15, 14, 7, 6, 13, 12, 5, 4,
                                           11, 10, 3, 2, 9, 8, 1, 0);
               __m128d A1 = _mm_castsi128_pd(_mm_shuffle_epi8(Aa, mask));
               __m128d B1 = _mm_castsi128_pd(_mm_shuffle_epi8(Ba, mask));
               __m128d C1 = _mm_castsi128_pd(_mm_shuffle_epi8(Ca, mask));
               __m128d D1 = _mm_castsi128_pd(_mm_shuffle_epi8(Da, mask));
               __m128 A2 = _mm_castpd_ps(_mm_shuffle_pd(A1, B1,
                               _MM_SHUFFLE2(0, 0)));
               __m128 A3 = _mm_castpd_ps(_mm_shuffle_pd(C1, D1,
                               _MM_SHUFFLE2(0, 0)));
               __m128 C2 = _mm_castpd_ps(_mm_shuffle_pd(A1, B1,
                               _MM_SHUFFLE2(1, 1)));
               __m128 C3 = _mm_castpd_ps(_mm_shuffle_pd(C1, D1,
                               _MM_SHUFFLE2(1, 1)));
               __m128i Wa = _mm_castps_si128(_mm_shuffle_ps(A2, A3,
                                _MM_SHUFFLE(2, 0, 2, 0)));
               __m128i Xa = _mm_castps_si128(_mm_shuffle_ps(A2, A3,
                                _MM_SHUFFLE(3, 1, 3, 1)));
               __m128i Ya = _mm_castps_si128(_mm_shuffle_ps(C2, C3,
                                _MM_SHUFFLE(2, 0, 2, 0)));
               __m128i Za = _mm_castps_si128(_mm_shuffle_ps(C2, C3,
                                _MM_SHUFFLE(3, 1, 3, 1)));
               A1 = _mm_castsi128_pd(_mm_shuffle_epi8(Ab, mask));
               B1 = _mm_castsi128_pd(_mm_shuffle_epi8(Bb, mask));
               C1 = _mm_castsi128_pd(_mm_shuffle_epi8(Cb, mask));
               D1 = _mm_castsi128_pd(_mm_shuffle_epi8(Db, mask));
               A2 = _mm_castpd_ps(_mm_shuffle_pd(A1, B1, _MM_SHUFFLE2(0, 0)));
               A3 = _mm_castpd_ps(_mm_shuffle_pd(C1, D1, _MM_SHUFFLE2(0, 0)));
               C2 = _mm_castpd_ps(_mm_shuffle_pd(A1, B1, _MM_SHUFFLE2(1, 1)));
               C3 = _mm_castpd_ps(_mm_shuffle_pd(C1, D1, _MM_SHUFFLE2(1, 1)));
               __m128i Wb = _mm_castps_si128(_mm_shuffle_ps(A2, A3,
                                _MM_SHUFFLE(2, 0, 2, 0)));
               __m128i Xb = _mm_castps_si128(_mm_shuffle_ps(A2, A3,
                                _MM_SHUFFLE(3, 1, 3, 1)));
               __m128i Yb = _mm_castps_si128(_mm_shuffle_ps(C2, C3,
                                _MM_SHUFFLE(2, 0, 2, 0)));
               __m128i Zb = _mm_castps_si128(_mm_shuffle_ps(C2, C3,
                                _MM_SHUFFLE(3, 1, 3, 1)));
               ret.v0 = {mergeW};
               ret.v1 = {mergeX};
               ret.v2 = {mergeY};
               ret.v3 = {mergeZ};
               return ret;'''.format(mergeW=x86.setr('avx', typ, 'Wa', 'Wb'),
                                     mergeX=x86.setr('avx', typ, 'Xa', 'Xb'),
                                     mergeY=x86.setr('avx', typ, 'Ya', 'Yb'),
                                     mergeZ=x86.setr('avx', typ, 'Za', 'Zb'),
                                     **fmtspec)
    if typ == 'f32':
        return '''nsimd_{simd_ext}_vf32x4 ret;
                  {load_v0v1v2v3}
                  __m256 A1 = _mm256_unpacklo_ps(v0, v2);
                  __m256 B1 = _mm256_unpackhi_ps(v0, v2);
                  __m256 C1 = _mm256_unpacklo_ps(v1, v3);
                  __m256 D1 = _mm256_unpackhi_ps(v1, v3);
                  ret.v0 = _mm256_unpacklo_ps(A1, C1);
                  ret.v1 = _mm256_unpackhi_ps(A1, C1);
                  ret.v2 = _mm256_unpacklo_ps(B1, D1);
                  ret.v3 = _mm256_unpackhi_ps(B1, D1);
                  return ret;'''.format(**fmtspec)
    if typ in ['i32', 'u32']:
        if simd_ext == 'avx2':
            return \
            '''nsimd_avx2_v{typ}x4 ret;
               {load_v0v1v2v3}
               __m256i A1 = _mm256_unpacklo_epi32(v0, v2);
               __m256i B1 = _mm256_unpackhi_epi32(v0, v2);
               __m256i C1 = _mm256_unpacklo_epi32(v1, v3);
               __m256i D1 = _mm256_unpackhi_epi32(v1, v3);
               ret.v0 = _mm256_unpacklo_epi32(A1, C1);
               ret.v1 = _mm256_unpackhi_epi32(A1, C1);
               ret.v2 = _mm256_unpacklo_epi32(B1, D1);
               ret.v3 = _mm256_unpackhi_epi32(B1, D1);
               return ret;'''.format(**fmtspec)
        else:
            # AVX: reuse the f32 implementation through bit casts.
            return \
            '''nsimd_avx_v{typ}x4 ret;
               nsimd_avx_vf32x4 retf32 = nsimd_load4{a}_avx_f32((f32 *){in0});
               ret.v0 = _mm256_castps_si256(retf32.v0);
               ret.v1 = _mm256_castps_si256(retf32.v1);
               ret.v2 = _mm256_castps_si256(retf32.v2);
               ret.v3 = _mm256_castps_si256(retf32.v3);
               return ret;'''.format(**fmtspec)
    if typ == 'f64':
        return \
        '''nsimd_{simd_ext}_vf64x4 ret;
           {load_v0v1v2v3}
           __m256d A1 = _mm256_permute2f128_pd(v0, v2, 2 << 4);
           __m256d B1 = _mm256_permute2f128_pd(v0, v2, (3 << 4) | 1);
           __m256d C1 = _mm256_permute2f128_pd(v1, v3, 2 << 4);
           __m256d D1 = _mm256_permute2f128_pd(v1, v3, (3 << 4) | 1);
           ret.v0 = _mm256_unpacklo_pd(A1, C1);
           ret.v1 = _mm256_unpackhi_pd(A1, C1);
           ret.v2 = _mm256_unpacklo_pd(B1, D1);
           ret.v3 = _mm256_unpackhi_pd(B1, D1);
           return ret;'''.format(**fmtspec)
    if typ in ['i64', 'u64']:
        if simd_ext == 'avx2':
            return \
            '''nsimd_avx2_v{typ}x4 ret;
{load_v0v1v2v3}
__m256i A1 = _mm256_permute2f128_si256(v0, v2, 2 << 4);
__m256i B1 = _mm256_permute2f128_si256(v0, v2, (3 << 4) | 1);
__m256i C1 = _mm256_permute2f128_si256(v1, v3, 2 << 4);
__m256i D1 = _mm256_permute2f128_si256(v1, v3, (3 << 4) | 1);
ret.v0 = _mm256_unpacklo_epi64(A1, C1);
ret.v1 = _mm256_unpackhi_epi64(A1, C1);
ret.v2 = _mm256_unpacklo_epi64(B1, D1);
ret.v3 = _mm256_unpackhi_epi64(B1, D1);
return ret;'''.format(**fmtspec)
        else:
            # Plain AVX has no 256-bit integer unpacks: reuse the f64
            # deinterleave and bitcast the four resulting vectors back to
            # the integer type.
            return \
            '''nsimd_avx_vf64x4 retf64 = nsimd_load4{a}_avx_f64((f64 *){in0});
nsimd_avx_v{typ}x4 ret;
ret.v0 = _mm256_castpd_si256(retf64.v0);
ret.v1 = _mm256_castpd_si256(retf64.v1);
ret.v2 = _mm256_castpd_si256(retf64.v2);
ret.v3 = _mm256_castpd_si256(retf64.v3);
return ret;'''.format(**fmtspec)

###############################################################################

def load4_avx512(simd_ext, typ, align, fmtspec2):
    # Emit the C body of load4 (load of 4 interleaved vectors, i.e. an
    # AoS -> SoA deinterleave by 4) for the AVX-512 extensions.  'align'
    # selects aligned vs unaligned loads; the returned string is C code.
    fmtspec = fmtspec2.copy()
    fmtspec['load_v0v1v2v3'] = get_load_v0v1v2v3(simd_ext, typ, align, fmtspec)
    # Low/high 256-bit halves of the four loaded 512-bit vectors: the 8-bit
    # and 16-bit algorithms below work on __m256i halves (AVX2 intrinsics).
    fmtspec['exlo_v0'] = x86.extract(simd_ext, typ, x86.LO, 'v0')
    fmtspec['exhi_v0'] = x86.extract(simd_ext, typ, x86.HI, 'v0')
    fmtspec['exlo_v1'] = x86.extract(simd_ext, typ, x86.LO, 'v1')
    fmtspec['exhi_v1'] = x86.extract(simd_ext, typ, x86.HI, 'v1')
    fmtspec['exlo_v2'] = x86.extract(simd_ext, typ, x86.LO, 'v2')
    fmtspec['exhi_v2'] = x86.extract(simd_ext, typ, x86.HI, 'v2')
    fmtspec['exlo_v3'] = x86.extract(simd_ext, typ, x86.LO, 'v3')
    fmtspec['exhi_v3'] = x86.extract(simd_ext, typ, x86.HI, 'v3')
    fmtspec['a'] = 'a' if align else 'u'
    if typ in ['i8', 'u8']:
        # 8-bit lanes: per-half pshufb gathers every 4th byte, then 32-bit
        # cross-lane permutes and 64-bit unpacks regroup them; halves are
        # merged back into 512-bit vectors with x86.setr.
        return \
        '''nsimd_{simd_ext}_v{typ}x4 ret;
{load_v0v1v2v3}
__m256i A0a = {exlo_v0};
__m256i B0a = {exhi_v0};
__m256i C0a = {exlo_v1};
__m256i D0a = {exhi_v1};
__m256i A0b = {exlo_v2};
__m256i B0b = {exhi_v2};
__m256i C0b = {exlo_v3};
__m256i D0b = {exhi_v3};
__m256i mask = _mm256_setr_epi8(0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15, 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15);
__m256i mask2 = _mm256_setr_epi32(0, 4, 1, 5, 2, 6, 3, 7);
__m256i A1 = _mm256_shuffle_epi8(A0a, mask);
__m256i B1 = _mm256_shuffle_epi8(B0a, mask);
__m256i C1 = _mm256_shuffle_epi8(C0a, mask);
__m256i D1 = _mm256_shuffle_epi8(D0a, mask);
__m256i A2 = _mm256_permutevar8x32_epi32(A1, mask2);
__m256i B2 = _mm256_permutevar8x32_epi32(B1, mask2);
__m256i C2 = _mm256_permutevar8x32_epi32(C1, mask2);
__m256i D2 = _mm256_permutevar8x32_epi32(D1, mask2);
__m256i A3 = _mm256_permute2x128_si256(A2, C2, 2 << 4);
__m256i C3 = _mm256_permute2x128_si256(B2, D2, 2 << 4);
__m256i B3 = _mm256_permute2x128_si256(A2, C2, (3 << 4) | 1);
__m256i D3 = _mm256_permute2x128_si256(B2, D2, (3 << 4) | 1);
__m256i A4a = _mm256_unpacklo_epi64(A3, C3);
__m256i B4a = _mm256_unpackhi_epi64(A3, C3);
__m256i C4a = _mm256_unpacklo_epi64(B3, D3);
__m256i D4a = _mm256_unpackhi_epi64(B3, D3);
A1 = _mm256_shuffle_epi8(A0b, mask);
B1 = _mm256_shuffle_epi8(B0b, mask);
C1 = _mm256_shuffle_epi8(C0b, mask);
D1 = _mm256_shuffle_epi8(D0b, mask);
A2 = _mm256_permutevar8x32_epi32(A1, mask2);
B2 = _mm256_permutevar8x32_epi32(B1, mask2);
C2 = _mm256_permutevar8x32_epi32(C1, mask2);
D2 = _mm256_permutevar8x32_epi32(D1, mask2);
A3 = _mm256_permute2x128_si256(A2, C2, 2 << 4);
C3 = _mm256_permute2x128_si256(B2, D2, 2 << 4);
B3 = _mm256_permute2x128_si256(A2, C2, (3 << 4) | 1);
D3 = _mm256_permute2x128_si256(B2, D2, (3 << 4) | 1);
__m256i A4b = _mm256_unpacklo_epi64(A3, C3);
__m256i B4b = _mm256_unpackhi_epi64(A3, C3);
__m256i C4b = _mm256_unpacklo_epi64(B3, D3);
__m256i D4b = _mm256_unpackhi_epi64(B3, D3);
ret.v0 = {mergeA};
ret.v1 = {mergeB};
ret.v2 = {mergeC};
ret.v3 = {mergeD};
return ret;'''.format(mergeA=x86.setr(simd_ext, typ, 'A4a', 'A4b'),
                      mergeB=x86.setr(simd_ext, typ, 'B4a', 'B4b'),
                      mergeC=x86.setr(simd_ext, typ, 'C4a', 'C4b'),
                      mergeD=x86.setr(simd_ext, typ, 'D4a', 'D4b'), **fmtspec)
    if typ in ['i16', 'u16']:
        # 16-bit lanes: three rounds of 16-bit unpacks per 256-bit half
        # converge to the deinterleaved layout; halves merged with x86.setr.
        return \
        '''nsimd_{simd_ext}_v{typ}x4 ret;
{load_v0v1v2v3}
__m256i A0a = {exlo_v0};
__m256i B0a = {exhi_v0};
__m256i C0a = {exlo_v1};
__m256i D0a = {exhi_v1};
__m256i A0b = {exlo_v2};
__m256i B0b = {exhi_v2};
__m256i C0b = {exlo_v3};
__m256i D0b = {exhi_v3};
__m256i A1 = _mm256_unpacklo_epi16(A0a, C0a);
__m256i B1 = _mm256_unpackhi_epi16(A0a, C0a);
__m256i C1 = _mm256_unpacklo_epi16(B0a, D0a);
__m256i D1 = _mm256_unpackhi_epi16(B0a, D0a);
__m256i A2 = _mm256_unpacklo_epi16(A1, C1);
__m256i B2 = _mm256_unpackhi_epi16(A1, C1);
__m256i C2 = _mm256_unpacklo_epi16(B1, D1);
__m256i D2 = _mm256_unpackhi_epi16(B1, D1);
__m256i A3a = _mm256_unpacklo_epi16(A2, C2);
__m256i B3a = _mm256_unpackhi_epi16(A2, C2);
__m256i C3a = _mm256_unpacklo_epi16(B2, D2);
__m256i D3a = _mm256_unpackhi_epi16(B2, D2);
A1 = _mm256_unpacklo_epi16(A0b, C0b);
B1 = _mm256_unpackhi_epi16(A0b, C0b);
C1 = _mm256_unpacklo_epi16(B0b, D0b);
D1 = _mm256_unpackhi_epi16(B0b, D0b);
A2 = _mm256_unpacklo_epi16(A1, C1);
B2 = _mm256_unpackhi_epi16(A1, C1);
C2 = _mm256_unpacklo_epi16(B1, D1);
D2 = _mm256_unpackhi_epi16(B1, D1);
__m256i A3b = _mm256_unpacklo_epi16(A2, C2);
__m256i B3b = _mm256_unpackhi_epi16(A2, C2);
__m256i C3b = _mm256_unpacklo_epi16(B2, D2);
__m256i D3b = _mm256_unpackhi_epi16(B2, D2);
ret.v0 = {mergeA};
ret.v1 = {mergeB};
ret.v2 = {mergeC};
ret.v3 = {mergeD};
return ret;'''.format(mergeA=x86.setr(simd_ext, typ, 'A3a', 'A3b'),
                      mergeB=x86.setr(simd_ext, typ, 'B3a', 'B3b'),
                      mergeC=x86.setr(simd_ext, typ, 'C3a', 'C3b'),
                      mergeD=x86.setr(simd_ext, typ, 'D3a', 'D3b'), **fmtspec)
    if typ in ['f32', 'i32', 'u32']:
        # 32-bit lanes: full-width two-source permutes (vpermt2d/vpermt2ps)
        # do the whole deinterleave in two rounds.
        return \
        '''nsimd_{simd_ext}_v{typ}x4 ret;
{load_v0v1v2v3}
__m512i WXm = _mm512_setr_epi32(0, 4, 8, 12, 16, 20, 24, 28, 1, 5, 9, 13, 17, 21, 25, 29);
__m512i YZm = _mm512_setr_epi32(2, 6, 10, 14, 18, 22, 26, 30, 3, 7, 11, 15, 19, 23, 27, 31);
__m512i Wm = _mm512_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23);
__m512i Xm = _mm512_setr_epi32(8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31);
{styp} WXa = _mm512_permutex2var{suf}(v0, WXm, v1);
{styp} WXb = _mm512_permutex2var{suf}(v2, WXm, v3);
{styp} YZa = _mm512_permutex2var{suf}(v0, YZm, v1);
{styp} YZb = _mm512_permutex2var{suf}(v2, YZm, v3);
ret.v0 = _mm512_permutex2var{suf}(WXa, Wm, WXb);
ret.v1 = _mm512_permutex2var{suf}(WXa, Xm, WXb);
ret.v2 = _mm512_permutex2var{suf}(YZa, Wm, YZb);
ret.v3 = _mm512_permutex2var{suf}(YZa, Xm, YZb);
return ret;'''.format(**fmtspec)
    if typ in ['f64', 'i64', 'u64']:
        # 64-bit lanes: one round of unpacks then two-source permutes.
        return \
        '''nsimd_{simd_ext}_v{typ}x4 ret;
{load_v0v1v2v3}
{styp} A1 = _mm512_unpacklo{suf}(v0, v1);
{styp} B1 = _mm512_unpacklo{suf}(v2, v3);
{styp} C1 = _mm512_unpackhi{suf}(v0, v1);
{styp} D1 = _mm512_unpackhi{suf}(v2, v3);
__m512i A_mask = _mm512_set_epi64(13, 9, 12, 8, 5, 1, 4, 0);
__m512i B_mask = _mm512_set_epi64(15, 11, 14, 10, 7, 3, 6, 2);
ret.v0 = _mm512_permutex2var{suf}(A1, A_mask, B1);
ret.v1 = _mm512_permutex2var{suf}(C1, A_mask, D1);
ret.v2 = _mm512_permutex2var{suf}(A1, B_mask, B1);
ret.v3 = _mm512_permutex2var{suf}(C1, B_mask, D1);
return ret;'''.format(**fmtspec)

###############################################################################

def store4(simd_ext, typ, align, fmtspec2, v0, v1, v2, v3):
    # Emit the final four consecutive vector stores of a store4: v0..v3 are
    # the C variable names holding the already-interleaved vectors.  Float
    # types store through the typed pointer directly; integer types cast
    # {in0} to the native SIMD pointer type.
    fmtspec = fmtspec2.copy()
    fmtspec['a'] = '' if align else 'u'
    store = '{pre}store{a}{sufsi}'.format(**fmtspec)
    fmtspec['store'] = store
    fmtspec['v0'] = v0
    fmtspec['v1'] = v1
    fmtspec['v2'] = v2
    fmtspec['v3'] = v3
    if typ in ['f32', 'f64']:
        return \
        '''{store}({in0}, {v0});
{store}({in0} + {le}, {v1});
{store}({in0} + (2 * {le}), {v2});
{store}({in0} + (3 * {le}), {v3});'''.format(**fmtspec)
    else:
        return \
        '''{store}(({styp} *){in0}, {v0});
{store}(({styp} *){in0} + 1, {v1});
{store}(({styp} *){in0} + 2, {v2});
{store}(({styp} *){in0} + 3, {v3});'''.format(**fmtspec)

###############################################################################

def store4_sse(typ, align, fmtspec2):
    # Emit the C body of store4 (SoA -> AoS interleave by 4 then store) for
    # the SSE extensions; only unpack instructions are needed so the same
    # code serves sse2 and sse42 (hence no simd_ext parameter).
    fmtspec = fmtspec2.copy()
    if typ in ['i8', 'u8']:
        return \
        '''__m128i A5 = _mm_unpacklo_epi8({in1}, {in3});
__m128i B5 = _mm_unpackhi_epi8({in1}, {in3});
__m128i C5 = _mm_unpacklo_epi8({in2}, {in4});
__m128i D5 = _mm_unpackhi_epi8({in2}, {in4});
__m128i A6 = _mm_unpacklo_epi8(A5, C5);
__m128i B6 = _mm_unpackhi_epi8(A5, C5);
__m128i C6 = _mm_unpacklo_epi8(B5, D5);
__m128i D6 = _mm_unpackhi_epi8(B5, D5);
{store}'''.format(store=store4('sse', typ, align, fmtspec, 'A6', 'B6', 'C6', 'D6'), **fmtspec)
    if typ in ['i16', 'u16']:
        return \
        '''__m128i Q = _mm_unpacklo_epi16({in1}, {in2});
__m128i R = _mm_unpackhi_epi16({in1}, {in2});
__m128i S = _mm_unpacklo_epi16({in3}, {in4});
__m128i T = _mm_unpackhi_epi16({in3}, {in4});
__m128i U = _mm_unpacklo_epi32(Q, S);
__m128i V = _mm_unpackhi_epi32(Q, S);
__m128i W = _mm_unpacklo_epi32(R, T);
__m128i X = _mm_unpackhi_epi32(R, T);
{store}'''.format(store=store4('sse', typ, align, fmtspec, 'U', 'V', 'W', 'X'), **fmtspec)
    if typ in ['f32', 'i32', 'u32']:
        return \
        '''{styp} A3 = _mm_unpacklo{suf}({in1}, {in3});
{styp} B3 = _mm_unpackhi{suf}({in1}, {in3});
{styp} C3 = _mm_unpacklo{suf}({in2}, {in4});
{styp} D3 = _mm_unpackhi{suf}({in2}, {in4});
{styp} A4 = _mm_unpacklo{suf}(A3, C3);
{styp} B4 = _mm_unpackhi{suf}(A3, C3);
{styp} C4 = _mm_unpacklo{suf}(B3, D3);
{styp} D4 = _mm_unpackhi{suf}(B3, D3);
{store}'''.format(store=store4('sse', typ, align, fmtspec, 'A4', 'B4', 'C4', 'D4'), **fmtspec)
    if typ in ['f64', 'u64', 'i64']:
        # 64-bit lanes: a single unpack round interleaves 2-lane vectors.
        return \
        '''{styp} A0 = _mm_unpacklo{suf}({in1}, {in2});
{styp} B0 = _mm_unpacklo{suf}({in3}, {in4});
{styp} C0 = _mm_unpackhi{suf}({in1}, {in2});
{styp} D0 = _mm_unpackhi{suf}({in3}, {in4});
{store}'''.format(store=store4('sse', typ, align, fmtspec, 'A0', 'B0', 'C0', 'D0'), **fmtspec)

###############################################################################

def store4_avx(simd_ext, typ, align, fmtspec2):
    # Emit the C body of store4 for AVX/AVX2.  Plain AVX lacks 256-bit
    # integer shuffles, so integer types fall back to 128-bit halves
    # (exlo/exhi below) or to bitcasts through the float implementations.
    fmtspec = fmtspec2.copy()
    fmtspec['exlo_in1'] = x86.extract('avx', typ, x86.LO, common.in1)
    fmtspec['exhi_in1'] = x86.extract('avx', typ, x86.HI, common.in1)
    fmtspec['exlo_in2'] = x86.extract('avx', typ, x86.LO, common.in2)
    fmtspec['exhi_in2'] = x86.extract('avx', typ,
                                     x86.HI, common.in2)
    fmtspec['exlo_in3'] = x86.extract('avx', typ, x86.LO, common.in3)
    fmtspec['exhi_in3'] = x86.extract('avx', typ, x86.HI, common.in3)
    fmtspec['exlo_in4'] = x86.extract('avx', typ, x86.LO, common.in4)
    fmtspec['exhi_in4'] = x86.extract('avx', typ, x86.HI, common.in4)
    fmtspec['a'] = 'a' if align else 'u'
    if typ in ['i8', 'u8']:
        if simd_ext == 'avx2':
            # AVX2: 256-bit byte unpacks work per 128-bit lane, so a
            # permute4x64 fixes the lane order between the two unpack rounds.
            return \
            '''__m256i A1 = _mm256_unpacklo_epi8({in1}, {in3});
__m256i B1 = _mm256_unpackhi_epi8({in1}, {in3});
__m256i C1 = _mm256_unpacklo_epi8({in2}, {in4});
__m256i D1 = _mm256_unpackhi_epi8({in2}, {in4});
__m256i A2 = _mm256_permute4x64_epi64(A1, _MM_SHUFFLE(3,1,2,0));
__m256i B2 = _mm256_permute4x64_epi64(B1, _MM_SHUFFLE(3,1,2,0));
__m256i C2 = _mm256_permute4x64_epi64(C1, _MM_SHUFFLE(3,1,2,0));
__m256i D2 = _mm256_permute4x64_epi64(D1, _MM_SHUFFLE(3,1,2,0));
__m256i A = _mm256_unpacklo_epi8(A2, C2);
__m256i B = _mm256_unpacklo_epi8(B2, D2);
__m256i C = _mm256_unpackhi_epi8(A2, C2);
__m256i D = _mm256_unpackhi_epi8(B2, D2);
{store}'''.format(store=store4('avx', typ, align, fmtspec, 'A', 'B', 'C', 'D'), **fmtspec)
        else:
            # Plain AVX: interleave the 128-bit halves with SSE2 unpacks,
            # then rebuild 256-bit vectors with x86.setr.
            return \
            '''__m128i Wa = {exlo_in1};
__m128i Wb = {exhi_in1};
__m128i Xa = {exlo_in2};
__m128i Xb = {exhi_in2};
__m128i Ya = {exlo_in3};
__m128i Yb = {exhi_in3};
__m128i Za = {exlo_in4};
__m128i Zb = {exhi_in4};
__m128i AA = _mm_unpacklo_epi8(Wa, Ya);
__m128i BB = _mm_unpackhi_epi8(Wa, Ya);
__m128i CC = _mm_unpacklo_epi8(Xa, Za);
__m128i DD = _mm_unpackhi_epi8(Xa, Za);
__m128i A0 = _mm_unpacklo_epi8(AA, CC);
__m128i B0 = _mm_unpackhi_epi8(AA, CC);
__m128i C0 = _mm_unpacklo_epi8(BB, DD);
__m128i D0 = _mm_unpackhi_epi8(BB, DD);
AA = _mm_unpacklo_epi8(Wb, Yb);
BB = _mm_unpackhi_epi8(Wb, Yb);
CC = _mm_unpacklo_epi8(Xb, Zb);
DD = _mm_unpackhi_epi8(Xb, Zb);
__m128i A1 = _mm_unpacklo_epi8(AA, CC);
__m128i B1 = _mm_unpackhi_epi8(AA, CC);
__m128i C1 = _mm_unpacklo_epi8(BB, DD);
__m128i D1 = _mm_unpackhi_epi8(BB, DD);
__m256i A = {mergeAB0};
__m256i B = {mergeCD0};
__m256i C = {mergeAB1};
__m256i D = {mergeCD1};
{store}'''.format(mergeAB0=x86.setr('avx', typ, 'A0', 'B0'),
                  mergeCD0=x86.setr('avx', typ, 'C0', 'D0'),
                  mergeAB1=x86.setr('avx', typ, 'A1', 'B1'),
                  mergeCD1=x86.setr('avx', typ, 'C1', 'D1'),
                  store=store4('avx', typ, align, fmtspec, 'A', 'B', 'C', 'D'),
                  **fmtspec)
    if typ in ['i16', 'u16']:
        if simd_ext == 'avx2':
            return \
            '''__m256i A3 = _mm256_unpacklo_epi16({in1}, {in3});
__m256i B3 = _mm256_unpackhi_epi16({in1}, {in3});
__m256i C3 = _mm256_unpacklo_epi16({in2}, {in4});
__m256i D3 = _mm256_unpackhi_epi16({in2}, {in4});
__m256i A = _mm256_unpacklo_epi16(A3, C3);
__m256i B = _mm256_unpackhi_epi16(A3, C3);
__m256i C = _mm256_unpacklo_epi16(B3, D3);
__m256i D = _mm256_unpackhi_epi16(B3, D3);
{store}'''.format(store=store4('avx', typ, align, fmtspec, 'A', 'B', 'C', 'D'), **fmtspec)
        else:
            # Plain AVX: same halves strategy as the i8/u8 fallback above.
            return \
            '''__m128i Wa = {exlo_in1};
__m128i Wb = {exhi_in1};
__m128i Xa = {exlo_in2};
__m128i Xb = {exhi_in2};
__m128i Ya = {exlo_in3};
__m128i Yb = {exhi_in3};
__m128i Za = {exlo_in4};
__m128i Zb = {exhi_in4};
__m128i AA = _mm_unpacklo_epi16(Wa, Ya);
__m128i BB = _mm_unpackhi_epi16(Wa, Ya);
__m128i CC = _mm_unpacklo_epi16(Xa, Za);
__m128i DD = _mm_unpackhi_epi16(Xa, Za);
__m128i A0 = _mm_unpacklo_epi16(AA, CC);
__m128i B0 = _mm_unpackhi_epi16(AA, CC);
__m128i C0 = _mm_unpacklo_epi16(BB, DD);
__m128i D0 = _mm_unpackhi_epi16(BB, DD);
AA = _mm_unpacklo_epi16(Wb, Yb);
BB = _mm_unpackhi_epi16(Wb, Yb);
CC = _mm_unpacklo_epi16(Xb, Zb);
DD = _mm_unpackhi_epi16(Xb, Zb);
__m128i A1 = _mm_unpacklo_epi16(AA, CC);
__m128i B1 = _mm_unpackhi_epi16(AA, CC);
__m128i C1 = _mm_unpacklo_epi16(BB, DD);
__m128i D1 = _mm_unpackhi_epi16(BB, DD);
__m256i A = {mergeAB0};
__m256i B = {mergeCD0};
__m256i C = {mergeAB1};
__m256i D = {mergeCD1};
{store}'''.format(mergeAB0=x86.setr('avx', typ, 'A0', 'B0'),
                  mergeCD0=x86.setr('avx', typ, 'C0', 'D0'),
                  mergeAB1=x86.setr('avx', typ, 'A1', 'B1'),
                  mergeCD1=x86.setr('avx', typ, 'C1', 'D1'),
                  store=store4('avx', typ, align, fmtspec, 'A', 'B', 'C', 'D'),
                  **fmtspec)
    if typ == 'f32':
        return \
        '''__m256 A3 = _mm256_unpacklo_ps({in1}, {in3});
__m256 B3 = _mm256_unpackhi_ps({in1}, {in3});
__m256 C3 = _mm256_unpacklo_ps({in2}, {in4});
__m256 D3 = _mm256_unpackhi_ps({in2}, {in4});
__m256 A = _mm256_unpacklo_ps(A3, C3);
__m256 B = _mm256_unpackhi_ps(A3, C3);
__m256 C = _mm256_unpacklo_ps(B3, D3);
__m256 D = _mm256_unpackhi_ps(B3, D3);
{store}'''.format(store=store4('avx', typ, align, fmtspec, 'A', 'B', 'C', 'D'), **fmtspec)
    if typ in ['i32', 'u32']:
        if simd_ext == 'avx2':
            return \
            '''__m256i A3 = _mm256_unpacklo_epi32({in1}, {in3});
__m256i B3 = _mm256_unpackhi_epi32({in1}, {in3});
__m256i C3 = _mm256_unpacklo_epi32({in2}, {in4});
__m256i D3 = _mm256_unpackhi_epi32({in2}, {in4});
__m256i A = _mm256_unpacklo_epi32(A3, C3);
__m256i B = _mm256_unpackhi_epi32(A3, C3);
__m256i C = _mm256_unpacklo_epi32(B3, D3);
__m256i D = _mm256_unpackhi_epi32(B3, D3);
{store}'''.format(store=store4('avx', typ, align, fmtspec, 'A', 'B', 'C', 'D'), **fmtspec)
        else:
            # Plain AVX: bitcast to f32 and reuse the float store4.
            return \
            '''nsimd_store4{a}_avx_f32((f32 *){in0},
_mm256_castsi256_ps({in1}),
_mm256_castsi256_ps({in2}),
_mm256_castsi256_ps({in3}),
_mm256_castsi256_ps({in4}));'''. \
            format(**fmtspec)
    if typ == 'f64':
        return \
        '''__m256d A3 = _mm256_permute2f128_pd({in1}, {in3}, 2 << 4);
__m256d B3 = _mm256_permute2f128_pd({in2}, {in4}, 2 << 4);
__m256d C3 = _mm256_permute2f128_pd({in1}, {in3}, (3 << 4) | 1);
__m256d D3 = _mm256_permute2f128_pd({in2}, {in4}, (3 << 4) | 1);
__m256d A = _mm256_unpacklo_pd(A3, B3);
__m256d B = _mm256_unpackhi_pd(A3, B3);
__m256d C = _mm256_unpacklo_pd(C3, D3);
__m256d D = _mm256_unpackhi_pd(C3, D3);
{store}'''.format(store=store4('avx', typ, align, fmtspec, 'A', 'B', 'C', 'D'), **fmtspec)
    if typ in ['i64', 'u64']:
        if simd_ext == 'avx2':
            return \
            '''__m256i A3 = _mm256_permute2f128_si256({in1}, {in3}, 2 << 4);
__m256i B3 = _mm256_permute2f128_si256({in2}, {in4}, 2 << 4);
__m256i C3 = _mm256_permute2f128_si256( {in1}, {in3}, (3 << 4) | 1);
__m256i D3 = _mm256_permute2f128_si256( {in2}, {in4}, (3 << 4) | 1);
__m256i A = _mm256_unpacklo_epi64(A3, B3);
__m256i B = _mm256_unpackhi_epi64(A3, B3);
__m256i C = _mm256_unpacklo_epi64(C3, D3);
__m256i D = _mm256_unpackhi_epi64(C3, D3);
{store}'''.format(store=store4('avx', typ, align, fmtspec, 'A', 'B', 'C', 'D'), **fmtspec)
        else:
            # Plain AVX: bitcast to f64 and reuse the float store4.
            return \
            '''nsimd_store4{a}_avx_f64((f64 *){in0},
_mm256_castsi256_pd({in1}),
_mm256_castsi256_pd({in2}),
_mm256_castsi256_pd({in3}),
_mm256_castsi256_pd({in4}));'''. \
            format(**fmtspec)

###############################################################################

def store4_avx512(simd_ext, typ, align, fmtspec2):
    # Emit the C body of store4 for AVX-512.  8/16-bit types are handled on
    # 256-bit halves (AVX2 intrinsics) and merged back with x86.setr.
    fmtspec = fmtspec2.copy()
    fmtspec['exlo_in1'] = x86.extract(simd_ext, typ, x86.LO, common.in1)
    fmtspec['exhi_in1'] = x86.extract(simd_ext, typ, x86.HI, common.in1)
    fmtspec['exlo_in2'] = x86.extract(simd_ext, typ, x86.LO, common.in2)
    fmtspec['exhi_in2'] = x86.extract(simd_ext, typ, x86.HI, common.in2)
    fmtspec['exlo_in3'] = x86.extract(simd_ext, typ, x86.LO, common.in3)
    fmtspec['exhi_in3'] = x86.extract(simd_ext, typ, x86.HI, common.in3)
    fmtspec['exlo_in4'] = x86.extract(simd_ext, typ, x86.LO, common.in4)
    fmtspec['exhi_in4'] = x86.extract(simd_ext, typ, x86.HI, common.in4)
    fmtspec['a'] = 'a' if align else 'u'
    if typ in ['i8', 'u8']:
        return \
        '''__m256i A0a = {exlo_in1};
__m256i A0b = {exhi_in1};
__m256i B0a = {exlo_in2};
__m256i B0b = {exhi_in2};
__m256i C0a = {exlo_in3};
__m256i C0b = {exhi_in3};
__m256i D0a = {exlo_in4};
__m256i D0b = {exhi_in4};
__m256i A1 = _mm256_unpacklo_epi8(A0a, C0a);
__m256i B1 = _mm256_unpackhi_epi8(A0a, C0a);
__m256i C1 = _mm256_unpacklo_epi8(B0a, D0a);
__m256i D1 = _mm256_unpackhi_epi8(B0a, D0a);
__m256i A2 = _mm256_permute4x64_epi64(A1, _MM_SHUFFLE(3,1,2,0));
__m256i B2 = _mm256_permute4x64_epi64(B1, _MM_SHUFFLE(3,1,2,0));
__m256i C2 = _mm256_permute4x64_epi64(C1, _MM_SHUFFLE(3,1,2,0));
__m256i D2 = _mm256_permute4x64_epi64(D1, _MM_SHUFFLE(3,1,2,0));
__m256i A3a = _mm256_unpacklo_epi8(A2, C2);
__m256i B3a = _mm256_unpacklo_epi8(B2, D2);
__m256i C3a = _mm256_unpackhi_epi8(A2, C2);
__m256i D3a = _mm256_unpackhi_epi8(B2, D2);
A1 = _mm256_unpacklo_epi8(A0b, C0b);
B1 = _mm256_unpackhi_epi8(A0b, C0b);
C1 = _mm256_unpacklo_epi8(B0b, D0b);
D1 = _mm256_unpackhi_epi8(B0b, D0b);
A2 = _mm256_permute4x64_epi64(A1, _MM_SHUFFLE(3,1,2,0));
B2 = _mm256_permute4x64_epi64(B1, _MM_SHUFFLE(3,1,2,0));
C2 = _mm256_permute4x64_epi64(C1, _MM_SHUFFLE(3,1,2,0));
D2 = _mm256_permute4x64_epi64(D1,
_MM_SHUFFLE(3,1,2,0));
__m256i A3b = _mm256_unpacklo_epi8(A2, C2);
__m256i B3b = _mm256_unpacklo_epi8(B2, D2);
__m256i C3b = _mm256_unpackhi_epi8(A2, C2);
__m256i D3b = _mm256_unpackhi_epi8(B2, D2);
__m512i A = {mergeABa};
__m512i B = {mergeCDa};
__m512i C = {mergeABb};
__m512i D = {mergeCDb};
{store}'''.format(mergeABa=x86.setr(simd_ext, typ, 'A3a', 'B3a'),
                  mergeCDa=x86.setr(simd_ext, typ, 'C3a', 'D3a'),
                  mergeABb=x86.setr(simd_ext, typ, 'A3b', 'B3b'),
                  mergeCDb=x86.setr(simd_ext, typ, 'C3b', 'D3b'),
                  store=store4(simd_ext, typ, align, fmtspec, 'A', 'B', 'C', 'D'),
                  **fmtspec)
    if typ in ['i16', 'u16']:
        # 16-bit lanes: two unpack rounds per 256-bit half, then merge.
        return \
        '''__m256i A0a = {exlo_in1};
__m256i A0b = {exhi_in1};
__m256i B0a = {exlo_in2};
__m256i B0b = {exhi_in2};
__m256i C0a = {exlo_in3};
__m256i C0b = {exhi_in3};
__m256i D0a = {exlo_in4};
__m256i D0b = {exhi_in4};
__m256i A3 = _mm256_unpacklo_epi16(A0a, C0a);
__m256i B3 = _mm256_unpackhi_epi16(A0a, C0a);
__m256i C3 = _mm256_unpacklo_epi16(B0a, D0a);
__m256i D3 = _mm256_unpackhi_epi16(B0a, D0a);
__m256i A4a = _mm256_unpacklo_epi16(A3, C3);
__m256i B4a = _mm256_unpackhi_epi16(A3, C3);
__m256i C4a = _mm256_unpacklo_epi16(B3, D3);
__m256i D4a = _mm256_unpackhi_epi16(B3, D3);
A3 = _mm256_unpacklo_epi16(A0b, C0b);
B3 = _mm256_unpackhi_epi16(A0b, C0b);
C3 = _mm256_unpacklo_epi16(B0b, D0b);
D3 = _mm256_unpackhi_epi16(B0b, D0b);
__m256i A4b = _mm256_unpacklo_epi16(A3, C3);
__m256i B4b = _mm256_unpackhi_epi16(A3, C3);
__m256i C4b = _mm256_unpacklo_epi16(B3, D3);
__m256i D4b = _mm256_unpackhi_epi16(B3, D3);
__m512i A = {mergeABa};
__m512i B = {mergeCDa};
__m512i C = {mergeABb};
__m512i D = {mergeCDb};
{store}'''.format(mergeABa=x86.setr(simd_ext, typ, 'A4a', 'B4a'),
                  mergeCDa=x86.setr(simd_ext, typ, 'C4a', 'D4a'),
                  mergeABb=x86.setr(simd_ext, typ, 'A4b', 'B4b'),
                  mergeCDb=x86.setr(simd_ext, typ, 'C4b', 'D4b'),
                  store=store4(simd_ext, typ, align, fmtspec, 'A', 'B', 'C', 'D'),
                  **fmtspec)
    if typ in ['f32', 'i32', 'u32']:
        # 32-bit lanes: two rounds of full-width two-source permutes.
        return \
        '''__m512i m1 = _mm512_setr_epi32(0, 1, 2, 3, 16, 17, 18, 19, 4, 5, 6, 7, 20, 21, 22, 23);
__m512i m2 = _mm512_setr_epi32(8, 9, 10, 11, 24, 25, 26, 27, 12, 13, 14, 15, 28, 29, 30, 31);
__m512i m3 = _mm512_setr_epi32(0, 4, 16, 20, 1, 5, 17, 21, 2, 6, 18, 22, 3, 7, 19, 23);
__m512i m4 = _mm512_setr_epi32(8, 12, 24, 28, 9, 13, 25, 29, 10, 14, 26, 30, 11, 15, 27, 31);
{styp} WXa = _mm512_permutex2var{suf}({in1}, m1, {in2});
{styp} WXb = _mm512_permutex2var{suf}({in1}, m2, {in2});
{styp} YZa = _mm512_permutex2var{suf}({in3}, m1, {in4});
{styp} YZb = _mm512_permutex2var{suf}({in3}, m2, {in4});
{styp} A = _mm512_permutex2var{suf}(WXa, m3, YZa);
{styp} B = _mm512_permutex2var{suf}(WXa, m4, YZa);
{styp} C = _mm512_permutex2var{suf}(WXb, m3, YZb);
{styp} D = _mm512_permutex2var{suf}(WXb, m4, YZb);
{store}'''.format(store=store4(simd_ext, typ, align, fmtspec, 'A', 'B', 'C', 'D'), **fmtspec)
    if typ in ['f64', 'i64', 'u64']:
        return \
        '''__m512i A_mask = _mm512_setr_epi64(0, 1, 2, 3, 8, 9, 10, 11);
__m512i B_mask = _mm512_setr_epi64(4, 5, 6, 7, 12, 13, 14, 15);
__m512i C_mask = _mm512_setr_epi64(0, 4, 8, 12, 1, 5, 9, 13);
__m512i D_mask = _mm512_setr_epi64(2, 6, 10, 14, 3, 7, 11, 15);
{styp} A1 = _mm512_permutex2var{suf}({in1}, A_mask, {in2});
{styp} B1 = _mm512_permutex2var{suf}({in1}, B_mask, {in2});
{styp} C1 = _mm512_permutex2var{suf}({in3}, A_mask, {in4});
{styp} D1 = _mm512_permutex2var{suf}({in3}, B_mask, {in4});
{styp} A = _mm512_permutex2var{suf}(A1, C_mask, C1);
{styp} B = _mm512_permutex2var{suf}(A1, D_mask, C1);
{styp} C = _mm512_permutex2var{suf}(B1, C_mask, D1);
{styp} D = _mm512_permutex2var{suf}(B1, D_mask, D1);
{store}'''.format(store=store4(simd_ext, typ, align, fmtspec, 'A', 'B', 'C', 'D'), **fmtspec)

###############################################################################

def get_load_v0v1v2(simd_ext, typ, align, fmtspec):
    # Emit the three consecutive vector loads used by the load3 (load of 3
    # interleaved vectors) implementations; v0/v1/v2 are left interleaved.
    # NOTE(review): the source pointer is hard-coded as "a0" here (vs {in0}
    # elsewhere) — presumably both always name the first argument; verify
    # against the generated prototypes.
    load = '{pre}load{a}{sufsi}'.format(a='' if align else 'u', **fmtspec)
    if typ in ['f32', 'f64']:
        return '''{styp} v0 = {load}(a0);
{styp} v1 = {load}(a0 + {le});
{styp} v2 = {load}(a0 + (2 * {le}));'''. \
        format(load=load, **fmtspec)
    else:
        return '''{styp} v0 = {load}(({styp}*)a0);
{styp} v1 = {load}(({styp}*)a0 + 1);
{styp} v2 = {load}(({styp}*)a0 + 2);'''. \
        format(load=load, **fmtspec)

###############################################################################

def load3_sse(simd_ext, typ, align, fmtspec2):
    # Emit the C body of load3 (AoS -> SoA deinterleave by 3) for SSE.
    fmtspec = fmtspec2.copy()
    fmtspec['load_v0v1v2'] = get_load_v0v1v2('sse', typ, align, fmtspec)
    fmtspec['a'] = 'a' if align else 'u'
    if typ in ['i8', 'u8']:
        if simd_ext == 'sse42':
            # SSE4.2 (implies SSSE3): gather every 3rd byte from each of the
            # three input vectors with pshufb and OR the partial results.
            return \
            '''nsimd_sse42_v{typ}x3 ret;
{load_v0v1v2}
__m128i A1_mask = _mm_set_epi8(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 15, 12, 9, 6, 3, 0);
__m128i A2_mask = _mm_set_epi8(-1, -1, -1, -1, -1, 14, 11, 8, 5, 2, -1, -1, -1, -1, -1, -1);
__m128i A3_mask = _mm_set_epi8(13, 10, 7, 4, 1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1);
__m128i A4 = _mm_shuffle_epi8(v0, A1_mask);
__m128i A5 = _mm_shuffle_epi8(v1, A2_mask);
__m128i A6 = _mm_shuffle_epi8(v2, A3_mask);
A4 = _mm_or_si128(A4, A5);
ret.v0 = _mm_or_si128(A4, A6);
__m128i B1_mask = _mm_set_epi8(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 13, 10, 7, 4, 1);
__m128i B2_mask = _mm_set_epi8(-1, -1, -1, -1, -1, 15, 12, 9, 6, 3, 0, -1, -1, -1, -1, -1);
__m128i B3_mask = _mm_set_epi8(14, 11, 8, 5, 2, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1);
__m128i B4 = _mm_shuffle_epi8(v0, B1_mask);
__m128i B5 = _mm_shuffle_epi8(v1, B2_mask);
__m128i B6 = _mm_shuffle_epi8(v2, B3_mask);
B4 = _mm_or_si128(B4, B5);
ret.v1 = _mm_or_si128(B4, B6);
__m128i C1_mask = _mm_set_epi8(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 14, 11, 8, 5, 2);
__m128i C2_mask = _mm_set_epi8(-1, -1, -1, -1, -1, -1, 13, 10, 7, 4, 1, -1, -1, -1, -1, -1);
__m128i C3_mask = _mm_set_epi8(15, 12, 9, 6, 3, 0, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1);
__m128i C4 = _mm_shuffle_epi8(v0, C1_mask);
__m128i C5 = _mm_shuffle_epi8(v1, C2_mask);
__m128i C6 = _mm_shuffle_epi8(v2, C3_mask);
C4 = _mm_or_si128(C4, C5);
ret.v2 = _mm_or_si128(C4, C6);
return ret;'''.format(**fmtspec)
        else:
            # SSE2 (no pshufb): iterate 4 rounds of 64-bit rotations + byte
            # unpacks that converge to the deinterleaved layout.
            return \
            '''nsimd_sse2_v{typ}x3 ret;
{load_v0v1v2}
__m128i A0 = v0;
__m128i B0 = v1;
__m128i C0 = v2;
int k;
for (k = 0; k < 4; ++k) {{
  __m128d B0_pd = _mm_castsi128_pd(B0);
  __m128d C0_pd = _mm_castsi128_pd(C0);
  __m128d B1_pd = _mm_shuffle_pd(B0_pd, B0_pd, 1);
  __m128d C2_pd = _mm_shuffle_pd(C0_pd, C0_pd, 1);
  __m128i B1 = _mm_castpd_si128(B1_pd);
  __m128i C2 = _mm_castpd_si128(C2_pd);
  __m128i B3 = _mm_unpackhi_epi8(A0, C2);
  __m128i A4 = _mm_unpacklo_epi8(A0, B1);
  __m128i C5 = _mm_unpackhi_epi8(B1, C0);
  A0 = A4;
  B0 = B3;
  C0 = C5;
}}
ret.v0 = A0;
ret.v1 = B0;
ret.v2 = C0;
return ret;'''.format(**fmtspec)
    if typ in ['i16', 'u16']:
        # Same convergence trick as the SSE2 i8/u8 path, 3 rounds for words.
        return \
        '''nsimd_{simd_ext}_v{typ}x3 ret;
{load_v0v1v2}
int k;
for (k = 0; k < 3; ++k) {{
  __m128d B1_pd = _mm_castsi128_pd(v1);
  __m128d C1_pd = _mm_castsi128_pd(v2);
  __m128d B2_pd = _mm_shuffle_pd(B1_pd, B1_pd, 1);
  __m128d C3_pd = _mm_shuffle_pd(C1_pd, C1_pd, 1);
  __m128i B2 = _mm_castpd_si128(B2_pd);
  __m128i C3 = _mm_castpd_si128(C3_pd);
  __m128i B4 = _mm_unpackhi_epi16(v0, C3);
  __m128i A5 = _mm_unpacklo_epi16(v0, B2);
  __m128i C7 = _mm_unpackhi_epi16(B2, v2);
  v0 = A5;
  v1 = B4;
  v2 = C7;
}}
ret.v0 = v0;
ret.v1 = v1;
ret.v2 = v2;
return ret;'''.format(**fmtspec)
    if typ == 'f32':
        return \
        '''nsimd_{simd_ext}_v{typ}x3 ret;
{load_v0v1v2}
__m128 A1 = _mm_shuffle_ps(v2, v1, _MM_SHUFFLE(3,2,1,0));
__m128 B2 = _mm_shuffle_ps(v0, v2, _MM_SHUFFLE(3,2,1,0));
__m128 C3 = _mm_shuffle_ps(v1, v0, _MM_SHUFFLE(3,2,1,0));
ret.v0 = _mm_shuffle_ps(v0, A1, _MM_SHUFFLE(1,2,3,0));
__m128 B5 = _mm_shuffle_ps(B2, v1, _MM_SHUFFLE(0,3,2,1));
ret.v2 = _mm_shuffle_ps(C3, v2, _MM_SHUFFLE(3,0,1,2));
ret.v1 = _mm_shuffle_ps(B5, B5, _MM_SHUFFLE(1,2,3,0));
return ret;'''.format(**fmtspec)
    if typ in ['i32', 'u32']:
        # Bitcast through the f32 implementation.
        return \
        '''nsimd_{simd_ext}_v{typ}x3 ret;
nsimd_{simd_ext}_vf32x3 retf32 = nsimd_load3{a}_{simd_ext}_f32((f32 *){in0});
ret.v0 = _mm_castps_si128(retf32.v0);
ret.v1 = _mm_castps_si128(retf32.v1);
ret.v2 = _mm_castps_si128(retf32.v2);
return ret;'''.format(**fmtspec)
    if typ == 'f64':
        return \
        '''nsimd_{simd_ext}_vf64x3 ret;
{load_v0v1v2}
ret.v0 = _mm_shuffle_pd(v0, v1, 2);
ret.v1 = _mm_shuffle_pd(v0, v2, 1);
ret.v2 = _mm_shuffle_pd(v1, v2, 2);
return ret;'''.format(**fmtspec)
    if typ in ['i64', 'u64']:
        # Bitcast through the f64 implementation.
        return \
        '''nsimd_{simd_ext}_v{typ}x3 ret;
nsimd_{simd_ext}_vf64x3 retf64 = nsimd_load3{a}_{simd_ext}_f64((f64 *){in0});
ret.v0 = _mm_castpd_si128(retf64.v0);
ret.v1 = _mm_castpd_si128(retf64.v1);
ret.v2 = _mm_castpd_si128(retf64.v2);
return ret;'''.format(**fmtspec)

###############################################################################

def store3(simd_ext, typ, align, fmtspec2, v0, v1, v2):
    # Emit the final three consecutive vector stores of a store3; v0..v2 are
    # the C variable names holding the already-interleaved vectors.
    fmtspec = fmtspec2.copy()
    fmtspec['a'] = '' if align else 'u'
    store = '{pre}store{a}{sufsi}'.format(**fmtspec)
    fmtspec['store'] = store
    fmtspec['v0'] = v0
    fmtspec['v1'] = v1
    fmtspec['v2'] = v2
    if typ in ['f32', 'f64']:
        return \
        '''{store}({in0}, {v0});
{store}({in0} + {le}, {v1});
{store}({in0} + (2 * {le}), {v2});'''.format(**fmtspec)
    else:
        return \
        '''{store}(({styp} *){in0}, {v0});
{store}(({styp} *){in0} + 1, {v1});
{store}(({styp} *){in0} + 2, {v2});'''.format(**fmtspec)

###############################################################################

def store3_sse(simd_ext, typ, align, fmtspec2):
    # Emit the C body of store3 (SoA -> AoS interleave by 3 then store)
    # for the SSE extensions.
    fmtspec = fmtspec2.copy()
    fmtspec['a'] = 'a' if align else 'u'
    if typ in ['i8', 'u8']:
        if simd_ext == 'sse42':
            # SSE4.2: scatter each source's bytes to their interleaved slots
            # with pshufb and OR the three partial vectors together.
            return \
            '''__m128i A1_mask = _mm_set_epi8( 5, -1, -1, 4, -1, -1, 3, -1, -1, 2, -1, -1, 1, -1, -1, 0);
__m128i A2_mask = _mm_set_epi8(-1, -1, 4, -1, -1, 3, -1, -1, 2, -1, -1, 1, -1, -1, 0, -1);
__m128i A3_mask = _mm_set_epi8(-1, 4, -1, -1, 3, -1, -1, 2, -1, -1, 1, -1, -1, 0, -1, -1);
__m128i A4 = _mm_shuffle_epi8({in1}, A1_mask);
__m128i A5 = _mm_shuffle_epi8({in2}, A2_mask);
__m128i A6 = _mm_shuffle_epi8({in3}, A3_mask);
A4 = _mm_or_si128(A4, A5);
A4 = _mm_or_si128(A4, A6);
__m128i B1_mask = _mm_set_epi8(-1, 10, -1, -1, 9, -1, -1, 8, -1, -1, 7, -1, -1, 6, -1, -1);
__m128i B2_mask =
_mm_set_epi8(10, -1, -1, 9, -1, -1, 8, -1, -1, 7, -1, -1, 6, -1, -1, 5); __m128i B3_mask = _mm_set_epi8(-1, -1, 9, -1, -1, 8, -1, -1, 7, -1, -1, 6, -1, -1, 5, -1); __m128i B4 = _mm_shuffle_epi8({in1}, B1_mask); __m128i B5 = _mm_shuffle_epi8({in2}, B2_mask); __m128i B6 = _mm_shuffle_epi8({in3}, B3_mask); B4 = _mm_or_si128(B4, B5); B4 = _mm_or_si128(B4, B6); __m128i C1_mask = _mm_set_epi8(-1, -1, 15, -1, -1, 14, -1, -1, 13, -1, -1, 12, -1, -1, 11, -1); __m128i C2_mask = _mm_set_epi8(-1, 15, -1, -1, 14, -1, -1, 13, -1, -1, 12, -1, -1, 11, -1, -1); __m128i C3_mask = _mm_set_epi8(15, -1, -1, 14, -1, -1, 13, -1, -1, 12, -1, -1, 11, -1, -1, 10); __m128i C4 = _mm_shuffle_epi8({in1}, C1_mask); __m128i C5 = _mm_shuffle_epi8({in2}, C2_mask); __m128i C6 = _mm_shuffle_epi8({in3}, C3_mask); C4 = _mm_or_si128(C4, C5); C4 = _mm_or_si128(C4, C6); {store4}'''.format(store4=store3('sse', typ, align, fmtspec, 'A4', 'B4', 'C4'), **fmtspec) else: return \ '''__m128i A0 = {in1}; __m128i B0 = {in2}; __m128i C0 = {in3}; int k; for (k = 0; k < 4; ++k) {{ __m128i A1 = _mm_unpacklo_epi8(A0, B0); __m128i A2 = _mm_unpackhi_epi8(A0, B0); __m128i A3 = _mm_unpacklo_epi8(A1, A2); __m128i A4 = _mm_unpackhi_epi8(A1, A2); __m128i A5 = _mm_unpacklo_epi8(A3, A4); __m128i A6 = _mm_unpackhi_epi8(A3, A4); __m128i A7 = _mm_unpacklo_epi8(A5, A6); __m128i B8 = _mm_unpackhi_epi8(A5, A6); __m128i C9 = _mm_castpd_si128(_mm_shuffle_pd( _mm_castsi128_pd(C0), _mm_castsi128_pd(C0), 1)); __m128i C10 = _mm_unpacklo_epi8(C0, C9); __m128i C11 = _mm_castpd_si128(_mm_shuffle_pd( _mm_castsi128_pd(C10), _mm_castsi128_pd(C10), 1)); __m128i C12 = _mm_unpacklo_epi8(C10, C11); __m128i C13 = _mm_castpd_si128(_mm_shuffle_pd( _mm_castsi128_pd(C12), _mm_castsi128_pd(C12), 1)); __m128i C14 = _mm_unpacklo_epi8(C12, C13); __m128i B15 = _mm_castpd_si128(_mm_shuffle_pd( _mm_castsi128_pd(C14), _mm_castsi128_pd(B8), 0)); __m128i C16 = _mm_castpd_si128(_mm_shuffle_pd( _mm_castsi128_pd(B8), _mm_castsi128_pd(C14), 3)); A0 = A7; B0 = B15; C0 
= C16; }} {store0}'''.format(store0=store3('sse', typ, align, fmtspec, 'A0', 'B0', 'C0'), **fmtspec) if typ in ['i16', 'u16']: if simd_ext == 'avx2': return \ '''__m128i A0 = {in1}; __m128i B0 = {in2}; __m128i C0 = {in3}; __m128i A1_mask = _mm_set_epi8(-1, -1, 5, 4, -1, -1, -1, -1, 3, 2, -1, -1, -1, -1, 1, 0); __m128i A2_mask = _mm_set_epi8( 5, 4, -1, -1, -1, -1, 3, 2, -1, -1, -1, -1, 1, 0, -1, -1); __m128i A3_mask = _mm_set_epi8(-1, -1, -1, -1, 3, 2, -1, -1, -1, -1, 1, 0, -1, -1, -1, -1); __m128i A4 = _mm_shuffle_epi8(A0, A1_mask); __m128i A5 = _mm_shuffle_epi8(B0, A2_mask); __m128i A6 = _mm_shuffle_epi8(C0, A3_mask); A4 = _mm_or_si128(A4, A5); A4 = _mm_or_si128(A4, A6); __m128i B1_mask = _mm_set_epi8(11, 10, -1, -1, -1, -1, 9, 8, -1, -1, -1, -1, 7, 6, -1, -1); __m128i B2_mask = _mm_set_epi8(-1, -1, -1, -1, 9, 8, -1, -1, -1, -1, 7, 6, -1, -1, -1, -1); __m128i B3_mask = _mm_set_epi8(-1, -1, 9, 8, -1, -1, -1, -1, 7, 6, -1, -1, -1, -1, 5, 4); __m128i B4 = _mm_shuffle_epi8(A0, B1_mask); __m128i B5 = _mm_shuffle_epi8(B0, B2_mask); __m128i B6 = _mm_shuffle_epi8(C0, B3_mask); B4 = _mm_or_si128(B4, B5); B4 = _mm_or_si128(B4, B6); __m128i C1_mask = _mm_set_epi8(-1, -1, -1, -1, 15, 14, -1, -1, -1, -1, 13, 12, -1, -1, -1, -1); __m128i C2_mask = _mm_set_epi8(-1, -1, 15, 14, -1, -1, -1, -1, 13, 12, -1, -1, -1, -1, 11, 10); __m128i C3_mask = _mm_set_epi8(15, 14, -1, -1, -1, -1, 13, 12, -1, -1, -1, -1, 11, 10, -1, -1); __m128i C4 = _mm_shuffle_epi8(A0, C1_mask); __m128i C5 = _mm_shuffle_epi8(B0, C2_mask); __m128i C6 = _mm_shuffle_epi8(C0, C3_mask); C4 = _mm_or_si128(C4, C5); C4 = _mm_or_si128(C4, C6); {store4};'''.format(store4=store3('sse', typ, align, fmtspec, 'A4', 'B4', 'C4'), **fmtspec) else: return \ '''__m128i A0 = {in1}; __m128i B0 = {in2}; __m128i C0 = {in3}; int k; for (k = 0; k < 3; ++k) {{ __m128i A1 = _mm_shufflelo_epi16(A0, _MM_SHUFFLE(3, 1, 2, 0)); __m128i A2 = _mm_shufflehi_epi16(A1, _MM_SHUFFLE(3, 1, 2, 0)); __m128i B3 = _mm_shufflelo_epi16(B0, _MM_SHUFFLE(3, 1, 
2, 0)); __m128i B4 = _mm_shufflehi_epi16(B3, _MM_SHUFFLE(3, 1, 2, 0)); __m128i C5 = _mm_shufflelo_epi16(C0, _MM_SHUFFLE(3, 1, 2, 0)); __m128i C6 = _mm_shufflehi_epi16(C5, _MM_SHUFFLE(3, 1, 2, 0)); __m128 A2_ps = _mm_castsi128_ps(A2); __m128 B4_ps = _mm_castsi128_ps(B4); __m128 C6_ps = _mm_castsi128_ps(C6); __m128 A0_ps = _mm_shuffle_ps(A2_ps, B4_ps, _MM_SHUFFLE(2, 0, 2, 0)); __m128 B0_ps = _mm_shuffle_ps(C6_ps, A2_ps, _MM_SHUFFLE(3, 1, 2, 0)); __m128 C0_ps = _mm_shuffle_ps(B4_ps, C6_ps, _MM_SHUFFLE(3, 1, 3, 1)); A0 = _mm_castps_si128(A0_ps); B0 = _mm_castps_si128(B0_ps); C0 = _mm_castps_si128(C0_ps); }} {store0}'''.format(store0=store3('sse', typ, align, fmtspec, 'A0', 'B0', 'C0'), **fmtspec) if typ == 'f32': return \ '''__m128 A1 = _mm_shuffle_ps({in1}, {in2}, _MM_SHUFFLE(2,0,2,0)); __m128 B2 = _mm_shuffle_ps({in3}, {in1}, _MM_SHUFFLE(3,1,2,0)); __m128 C3 = _mm_shuffle_ps({in2}, {in3}, _MM_SHUFFLE(3,1,3,1)); __m128 A4 = _mm_shuffle_ps(A1, B2, _MM_SHUFFLE(2,0,2,0)); __m128 B5 = _mm_shuffle_ps(C3, A1, _MM_SHUFFLE(3,1,2,0)); __m128 C6 = _mm_shuffle_ps(B2, C3, _MM_SHUFFLE(3,1,3,1)); {store};'''. \ format(store=store3('sse', typ, align, fmtspec, 'A4', 'B5', 'C6'), **fmtspec) if typ in ['i32', 'u32']: return \ '''nsimd_store3{a}_{simd_ext}_f32((f32 *){in0}, _mm_castsi128_ps({in1}), _mm_castsi128_ps({in2}), _mm_castsi128_ps({in3}));'''. \ format(**fmtspec) if typ == 'f64': return \ '''__m128d A0 = _mm_unpacklo_pd({in1}, {in2}); __m128d B0 = _mm_shuffle_pd({in3}, {in1}, 2); __m128d C0 = _mm_unpackhi_pd({in2}, {in3}); {store}'''. \ format(store=store3('sse', typ, align, fmtspec, 'A0', 'B0', 'C0'), **fmtspec) if typ in ['i64', 'u64']: return \ '''nsimd_store3{a}_{simd_ext}_f64((f64 *){in0}, _mm_castsi128_pd({in1}), _mm_castsi128_pd({in2}), _mm_castsi128_pd({in3}));'''. 
        format(**fmtspec)

###############################################################################

def load3_avx(simd_ext, typ, align, fmtspec2):
    # Emit the C body of the AVX/AVX2 3-way deinterleaving load
    # (nsimd_load3{a}): memory at {in0} holds [x0 y0 z0 x1 y1 z1 ...] and the
    # generated code returns ret.v0 = all x's, ret.v1 = all y's,
    # ret.v2 = all z's.  `simd_ext` is 'avx' or 'avx2'; `align` selects the
    # aligned ('a') vs unaligned ('u') load intrinsics.
    fmtspec = fmtspec2.copy()
    # C snippet that loads the three raw (still interleaved) 256-bit
    # registers v0, v1, v2 from memory.
    fmtspec['load_v0v1v2'] = get_load_v0v1v2('avx', typ, align, fmtspec)
    # 128-bit half extraction snippets: plain AVX has no 256-bit integer byte
    # shuffle, so the 8/16-bit fallback paths work on the six 128-bit halves
    # with SSSE3 shuffles and re-merge the halves at the end (x86.setr).
    fmtspec['exlo_v0'] = x86.extract('avx', typ, x86.LO, 'v0')
    fmtspec['exhi_v0'] = x86.extract('avx', typ, x86.HI, 'v0')
    fmtspec['exlo_v1'] = x86.extract('avx', typ, x86.LO, 'v1')
    fmtspec['exhi_v1'] = x86.extract('avx', typ, x86.HI, 'v1')
    fmtspec['exlo_v2'] = x86.extract('avx', typ, x86.LO, 'v2')
    fmtspec['exhi_v2'] = x86.extract('avx', typ, x86.HI, 'v2')
    fmtspec['a'] = 'a' if align else 'u'
    if typ in ['i8', 'u8']:
        if simd_ext == 'avx2':
            # AVX2: a -1 index in a shuffle mask zeroes the destination byte,
            # so the three partial per-source shuffles can simply be OR-ed
            # together; _mm256_shuffle_epi8 only shuffles within each 128-bit
            # lane, hence the _mm256_permute2f128_si256 lane fix-ups.
            return \
            '''nsimd_avx2_v{typ}x3 ret;
               {load_v0v1v2}
               __m256i ARmask = _mm256_setr_epi8( 0, 3, 6, 9, 12, 15, -1, -1,
                                 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
                                 -1, -1, -1, 2, 5, 8, 11, 14, -1, -1, -1, -1,
                                 -1);
               __m256i BRmask = _mm256_setr_epi8(-1, -1, -1, -1, -1, -1, -1,
                                 -1, -1, -1, -1, 1, 4, 7, 10, 13, 0, 3, 6, 9,
                                 12, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1,
                                 -1);
               __m256i CRmask = _mm256_setr_epi8(-1, -1, -1, -1, -1, -1, 2, 5,
                                 8, 11, 14, -1, -1, -1, -1, -1, -1, -1, -1,
                                 -1, -1, -1, -1, -1, -1, -1, -1, 1, 4, 7, 10,
                                 13);
               __m256i AR = _mm256_shuffle_epi8(v0, ARmask);
               __m256i BR = _mm256_shuffle_epi8(v1, BRmask);
               __m256i CR = _mm256_shuffle_epi8(v2, CRmask);
               __m256i DR = _mm256_permute2f128_si256(AR, CR, (2 << 4) | 1);
               __m256i R0 = _mm256_or_si256(AR, BR);
               __m256i R1 = _mm256_or_si256(BR, CR);
               __m256i R2 = _mm256_permute2f128_si256(R0, R1, 3 << 4);
               ret.v0 = _mm256_or_si256(DR, R2);
               __m256i AGmask = _mm256_setr_epi8( 1, 4, 7, 10, 13, -1, -1, -1,
                                 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
                                 -1, -1, 0, 3, 6, 9, 12, 15, -1, -1, -1, -1,
                                 -1);
               __m256i BGmask = _mm256_setr_epi8(-1, -1, -1, -1, -1, -1, -1,
                                 -1, -1, -1, -1, 2, 5, 8, 11, 14, 1, 4, 7, 10,
                                 13, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
                                 -1);
               __m256i CGmask = _mm256_setr_epi8(-1, -1, -1, -1, -1, 0, 3, 6,
                                 9, 12, 15, -1, -1, -1, -1, -1, -1, -1, -1,
                                 -1, -1, -1, -1, -1, -1, -1, -1, 2, 5, 8, 11,
                                 14);
               __m256i AG = _mm256_shuffle_epi8(v0, AGmask);
               __m256i BG = _mm256_shuffle_epi8(v1, BGmask);
               __m256i CG = _mm256_shuffle_epi8(v2, CGmask);
               __m256i DG = _mm256_permute2f128_si256(AG, CG, (2 << 4) | 1);
               __m256i G0 = _mm256_or_si256(AG, BG);
               __m256i G1 = _mm256_or_si256(BG, CG);
               __m256i G2 = _mm256_permute2f128_si256(G0, G1, 3 << 4);
               ret.v1 = _mm256_or_si256(DG, G2);
               __m256i ABmask = _mm256_setr_epi8( 2, 5, 8, 11, 14, -1, -1, -1,
                                 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
                                 -1, -1, 1, 4, 7, 10, 13, -1, -1, -1, -1, -1,
                                 -1);
               __m256i BBmask = _mm256_setr_epi8(-1, -1, -1, -1, -1, -1, -1,
                                 -1, -1, -1, 0, 3, 6, 9, 12, 15, 2, 5, 8, 11,
                                 14, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
                                 -1);
               __m256i CBmask = _mm256_setr_epi8(-1, -1, -1, -1, -1, 1, 4, 7,
                                 10, 13, -1, -1, -1, -1, -1, -1, -1, -1, -1,
                                 -1, -1, -1, -1, -1, -1, -1, 0, 3, 6, 9, 12,
                                 15);
               __m256i AB = _mm256_shuffle_epi8(v0, ABmask);
               __m256i BB = _mm256_shuffle_epi8(v1, BBmask);
               __m256i CB = _mm256_shuffle_epi8(v2, CBmask);
               __m256i DB = _mm256_permute2f128_si256(AB, CB, (2 << 4) | 1);
               __m256i B0 = _mm256_or_si256(AB, BB);
               __m256i B1 = _mm256_or_si256(BB, CB);
               __m256i B2 = _mm256_permute2f128_si256(B0, B1, 3 << 4);
               ret.v2 = _mm256_or_si256(DB, B2);
               return ret;'''.format(**fmtspec)
        else:
            # Plain AVX fallback: deinterleave each group of three 16-byte
            # chunks (Aa/Ba/Ca = low half, Ab/Bb/Cb = high half) with SSSE3
            # byte shuffles, then merge the two 128-bit results ({mergeR} etc.
            # expand to x86.setr, i.e. _mm256_set_m128i-style code).
            return \
            '''nsimd_avx_v{typ}x3 ret;
               {load_v0v1v2}
               __m128i Aa = {exlo_v0};
               __m128i Ba = {exhi_v0};
               __m128i Ca = {exlo_v1};
               __m128i Ab = {exhi_v1};
               __m128i Bb = {exlo_v2};
               __m128i Cb = {exhi_v2};
               __m128i ARm = _mm_set_epi8(-1, -1, -1, -1, -1, -1, -1, -1, -1,
                                          -1, 15, 12, 9, 6, 3, 0);
               __m128i BRm = _mm_set_epi8(-1, -1, -1, -1, -1, 14, 11, 8, 5, 2,
                                          -1, -1, -1, -1, -1, -1);
               __m128i CRm = _mm_set_epi8(13, 10, 7, 4, 1, -1, -1, -1, -1, -1,
                                          -1, -1, -1, -1, -1, -1);
               __m128i AR = _mm_shuffle_epi8(Aa, ARm);
               __m128i BR = _mm_shuffle_epi8(Ba, BRm);
               __m128i CR = _mm_shuffle_epi8(Ca, CRm);
               __m128i R0 = _mm_or_si128(AR, BR);
               R0 = _mm_or_si128(R0, CR);
               AR = _mm_shuffle_epi8(Ab, ARm);
               BR = _mm_shuffle_epi8(Bb, BRm);
               CR = _mm_shuffle_epi8(Cb, CRm);
               __m128i R1 = _mm_or_si128(AR, BR);
               R1 = _mm_or_si128(R1, CR);
               __m128i AGm = _mm_set_epi8(-1, -1, -1, -1, -1, -1, -1, -1, -1,
                                          -1, -1, 13, 10, 7, 4, 1);
               __m128i BGm = _mm_set_epi8(-1, -1, -1, -1, -1, 15, 12, 9, 6, 3,
                                          0, -1, -1, -1, -1, -1);
               __m128i CGm = _mm_set_epi8(14, 11, 8, 5, 2, -1, -1, -1, -1, -1,
                                          -1, -1, -1, -1, -1, -1);
               __m128i AG = _mm_shuffle_epi8(Aa, AGm);
               __m128i BG = _mm_shuffle_epi8(Ba, BGm);
               __m128i CG = _mm_shuffle_epi8(Ca, CGm);
               __m128i G0 = _mm_or_si128(AG, BG);
               G0 = _mm_or_si128(G0, CG);
               AG = _mm_shuffle_epi8(Ab, AGm);
               BG = _mm_shuffle_epi8(Bb, BGm);
               CG = _mm_shuffle_epi8(Cb, CGm);
               __m128i G1 = _mm_or_si128(AG, BG);
               G1 = _mm_or_si128(G1, CG);
               __m128i ABm = _mm_set_epi8(-1, -1, -1, -1, -1, -1, -1, -1, -1,
                                          -1, -1, 14, 11, 8, 5, 2);
               __m128i BBm = _mm_set_epi8(-1, -1, -1, -1, -1, -1, 13, 10, 7,
                                          4, 1, -1, -1, -1, -1, -1);
               __m128i CBm = _mm_set_epi8(15, 12, 9, 6, 3, 0, -1, -1, -1, -1,
                                          -1, -1, -1, -1, -1, -1);
               __m128i AB = _mm_shuffle_epi8(Aa, ABm);
               __m128i BB = _mm_shuffle_epi8(Ba, BBm);
               __m128i CB = _mm_shuffle_epi8(Ca, CBm);
               __m128i B0 = _mm_or_si128(AB, BB);
               B0 = _mm_or_si128(B0, CB);
               AB = _mm_shuffle_epi8(Ab, ABm);
               BB = _mm_shuffle_epi8(Bb, BBm);
               CB = _mm_shuffle_epi8(Cb, CBm);
               __m128i B1 = _mm_or_si128(AB, BB);
               B1 = _mm_or_si128(B1, CB);
               ret.v0 = {mergeR};
               ret.v1 = {mergeG};
               ret.v2 = {mergeB};
               return ret;'''.format(mergeR=x86.setr('avx', typ, 'R0', 'R1'),
                                     mergeG=x86.setr('avx', typ, 'G0', 'G1'),
                                     mergeB=x86.setr('avx', typ, 'B0', 'B1'),
                                     **fmtspec)
    if typ in ['i16', 'u16']:
        if simd_ext == 'avx2':
            # Same OR-of-shuffles scheme as the 8-bit AVX2 path, with byte
            # pairs (16-bit element) indices in the masks.
            return \
            '''nsimd_avx2_v{typ}x3 ret;
               {load_v0v1v2}
               __m256i ARmask = _mm256_setr_epi8( 0, 1, 6, 7, 12, 13, -1, -1,
                                 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
                                 -1, -1, -1, 2, 3, 8, 9, 14, 15, -1, -1, -1,
                                 -1);
               __m256i BRmask = _mm256_setr_epi8(-1, -1, -1, -1, -1, -1, -1,
                                 -1, -1, -1, -1, -1, 4, 5, 10, 11, 0, 1, 6, 7,
                                 12, 13, -1, -1, -1, -1, -1, -1, -1, -1, -1,
                                 -1);
               __m256i CRmask = _mm256_setr_epi8(-1, -1, -1, -1, -1, -1, 2, 3,
                                 8, 9, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1,
                                 -1, -1, -1, -1, -1, -1, -1, -1, 4, 5, 10,
                                 11);
               __m256i AR = _mm256_shuffle_epi8(v0, ARmask);
               __m256i BR = _mm256_shuffle_epi8(v1, BRmask);
               __m256i CR = _mm256_shuffle_epi8(v2, CRmask);
               __m256i DR = _mm256_permute2f128_si256(AR, CR, (2 << 4) | 1);
               __m256i R0 = _mm256_or_si256(AR, BR);
               __m256i R1 = _mm256_or_si256(BR, CR);
               __m256i R2 = _mm256_permute2f128_si256(R0, R1, 3 << 4);
               ret.v0 = _mm256_or_si256(DR, R2);
               __m256i AGmask = _mm256_setr_epi8( 2, 3, 8, 9, 14, 15, -1, -1,
                                 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
                                 -1, -1, -1, 4, 5, 10, 11, -1, -1, -1, -1, -1,
                                 -1);
               __m256i BGmask = _mm256_setr_epi8(-1, -1, -1, -1, -1, -1, -1,
                                 -1, -1, -1, 0, 1, 6, 7, 12, 13, 2, 3, 8, 9,
                                 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1,
                                 -1);
               __m256i CGmask = _mm256_setr_epi8(-1, -1, -1, -1, -1, -1, 4, 5,
                                 10, 11, -1, -1, -1, -1, -1, -1, -1, -1, -1,
                                 -1, -1, -1, -1, -1, -1, -1, 0, 1, 6, 7, 12,
                                 13);
               __m256i AG = _mm256_shuffle_epi8(v0, AGmask);
               __m256i BG = _mm256_shuffle_epi8(v1, BGmask);
               __m256i CG = _mm256_shuffle_epi8(v2, CGmask);
               __m256i DG = _mm256_permute2f128_si256(AG, CG, (2 << 4) | 1);
               __m256i G0 = _mm256_or_si256(AG, BG);
               __m256i G1 = _mm256_or_si256(BG, CG);
               __m256i G2 = _mm256_permute2f128_si256(G0, G1, 3 << 4);
               ret.v1 = _mm256_or_si256(DG, G2);
               __m256i ABmask = _mm256_setr_epi8( 4, 5, 10, 11, -1, -1, -1,
                                 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
                                 -1, -1, 0, 1, 6, 7, 12, 13, -1, -1, -1, -1,
                                 -1, -1);
               __m256i BBmask = _mm256_setr_epi8(-1, -1, -1, -1, -1, -1, -1,
                                 -1, -1, -1, 2, 3, 8, 9, 14, 15, 4, 5, 10, 11,
                                 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
                                 -1);
               __m256i CBmask = _mm256_setr_epi8(-1, -1, -1, -1, 0, 1, 6, 7,
                                 12, 13, -1, -1, -1, -1, -1, -1, -1, -1, -1,
                                 -1, -1, -1, -1, -1, -1, -1, 2, 3, 8, 9, 14,
                                 15);
               __m256i AB = _mm256_shuffle_epi8(v0, ABmask);
               __m256i BB = _mm256_shuffle_epi8(v1, BBmask);
               __m256i CB = _mm256_shuffle_epi8(v2, CBmask);
               __m256i DB = _mm256_permute2f128_si256(AB, CB, (2 << 4) | 1);
               __m256i B0 = _mm256_or_si256(AB, BB);
               __m256i B1 = _mm256_or_si256(BB, CB);
               __m256i B2 = _mm256_permute2f128_si256(B0, B1, 3 << 4);
               ret.v2 = _mm256_or_si256(DB, B2);
               return ret;'''.format(**fmtspec)
        else:
            # Plain AVX fallback: 128-bit halves + SSSE3 shuffles, as in the
            # 8-bit case but with 16-bit (byte-pair) mask indices.
            return \
            '''nsimd_avx_v{typ}x3 ret;
               {load_v0v1v2}
               __m128i Aa = {exlo_v0};
               __m128i Ba = {exhi_v0};
               __m128i Ca = {exlo_v1};
               __m128i Ab = {exhi_v1};
               __m128i Bb = {exlo_v2};
               __m128i Cb = {exhi_v2};
               __m128i ARm = _mm_set_epi8(-1, -1, -1, -1, -1, -1, -1, -1, -1,
                                          -1, 13, 12, 7, 6, 1, 0);
               __m128i BRm = _mm_set_epi8(-1, -1, -1, -1, 15, 14, 9, 8, 3, 2,
                                          -1, -1, -1, -1, -1, -1);
               __m128i CRm = _mm_set_epi8(11, 10, 5, 4, -1, -1, -1, -1, -1,
                                          -1, -1, -1, -1, -1, -1, -1);
               __m128i AR = _mm_shuffle_epi8(Aa, ARm);
               __m128i BR = _mm_shuffle_epi8(Ba, BRm);
               __m128i CR = _mm_shuffle_epi8(Ca, CRm);
               __m128i R0 = _mm_or_si128(AR, BR);
               R0 = _mm_or_si128(R0, CR);
               AR = _mm_shuffle_epi8(Ab, ARm);
               BR = _mm_shuffle_epi8(Bb, BRm);
               CR = _mm_shuffle_epi8(Cb, CRm);
               __m128i R1 = _mm_or_si128(AR, BR);
               R1 = _mm_or_si128(R1, CR);
               __m128i AGm = _mm_set_epi8(-1, -1, -1, -1, -1, -1, -1, -1, -1,
                                          -1, 15, 14, 9, 8, 3, 2);
               __m128i BGm = _mm_set_epi8(-1, -1, -1, -1, -1, -1, 11, 10, 5,
                                          4, -1, -1, -1, -1, -1, -1);
               __m128i CGm = _mm_set_epi8(13, 12, 7, 6, 1, 0, -1, -1, -1, -1,
                                          -1, -1, -1, -1, -1, -1);
               __m128i AG = _mm_shuffle_epi8(Aa, AGm);
               __m128i BG = _mm_shuffle_epi8(Ba, BGm);
               __m128i CG = _mm_shuffle_epi8(Ca, CGm);
               __m128i G0 = _mm_or_si128(AG, BG);
               G0 = _mm_or_si128(G0, CG);
               AG = _mm_shuffle_epi8(Ab, AGm);
               BG = _mm_shuffle_epi8(Bb, BGm);
               CG = _mm_shuffle_epi8(Cb, CGm);
               __m128i G1 = _mm_or_si128(AG, BG);
               G1 = _mm_or_si128(G1, CG);
               __m128i ABm = _mm_set_epi8(-1, -1, -1, -1, -1, -1, -1, -1, -1,
                                          -1, -1, -1, 11, 10, 5, 4);
               __m128i BBm = _mm_set_epi8(-1, -1, -1, -1, -1, -1, 13, 12, 7,
                                          6, 1, 0, -1, -1, -1, -1);
               __m128i CBm = _mm_set_epi8(15, 14, 9, 8, 3, 2, -1, -1, -1, -1,
                                          -1, -1, -1, -1, -1, -1);
               __m128i AB = _mm_shuffle_epi8(Aa, ABm);
               __m128i BB = _mm_shuffle_epi8(Ba, BBm);
               __m128i CB = _mm_shuffle_epi8(Ca, CBm);
               __m128i B0 = _mm_or_si128(AB, BB);
               B0 = _mm_or_si128(B0, CB);
               AB = _mm_shuffle_epi8(Ab, ABm);
               BB = _mm_shuffle_epi8(Bb, BBm);
               CB = _mm_shuffle_epi8(Cb, CBm);
               __m128i B1 = _mm_or_si128(AB, BB);
               B1 = _mm_or_si128(B1, CB);
               ret.v0 = {mergeR};
               ret.v1 = {mergeG};
               ret.v2 = {mergeB};
               return ret;'''.format(mergeR=x86.setr('avx', typ, 'R0', 'R1'),
                                     mergeG=x86.setr('avx', typ, 'G0', 'G1'),
                                     mergeB=x86.setr('avx', typ, 'B0', 'B1'),
                                     **fmtspec)
    # 32-bit elements on AVX2: cross-lane dword permutes
    # (_mm256_permutevar8x32) followed by blends.  {styp}/{suf} come from
    # fmtspec so the same template serves f32 and i32/u32.
    avx2_template = \
    '''nsimd_avx2_v{typ}x3 ret;
       {load_v0v1v2}
       __m256i RAm = _mm256_setr_epi32( 0, 3, 6, -1, -1, -1, -1, -1);
       __m256i RBm = _mm256_setr_epi32(-1, -1, -1, 1, 4, 7, -1, -1);
       __m256i RCm = _mm256_setr_epi32(-1, -1, -1, -1, -1, -1, 2, 5);
       __m256i GAm = _mm256_setr_epi32( 1, 4, 7, -1, -1, -1, -1, -1);
       __m256i GBm = _mm256_setr_epi32(-1, -1, -1, 2, 5, -1, -1, -1);
       __m256i GCm = _mm256_setr_epi32(-1, -1, -1, -1, -1, 0, 3, 6);
       __m256i BAm = _mm256_setr_epi32( 2, 5, -1, -1, -1, -1, -1, -1);
       __m256i BBm = _mm256_setr_epi32(-1, -1, 0, 3, 6, -1, -1, -1);
       __m256i BCm = _mm256_setr_epi32(-1, -1, -1, -1, -1, 1, 4, 7);
       {styp} RA = _mm256_permutevar8x32{suf}(v0, RAm);
       {styp} RB = _mm256_permutevar8x32{suf}(v1, RBm);
       {styp} RC = _mm256_permutevar8x32{suf}(v2, RCm);
       {styp} R = _mm256_blend{suf}(RA, RB, 8 + 16 + 32);
       ret.v0 = _mm256_blend{suf}(R, RC, 64 + 128);
       {styp} GA = _mm256_permutevar8x32{suf}(v0, GAm);
       {styp} GB = _mm256_permutevar8x32{suf}(v1, GBm);
       {styp} GC = _mm256_permutevar8x32{suf}(v2, GCm);
       {styp} G = _mm256_blend{suf}(GA, GB, 8 + 16);
       ret.v1 = _mm256_blend{suf}(G, GC, 32 + 64 + 128);
       {styp} BA = _mm256_permutevar8x32{suf}(v0, BAm);
       {styp} BB = _mm256_permutevar8x32{suf}(v1, BBm);
       {styp} BC = _mm256_permutevar8x32{suf}(v2, BCm);
       {styp} B = _mm256_blend{suf}(BA, BB, 4 + 8 + 16);
       ret.v2 = _mm256_blend{suf}(B, BC, 32 + 64 + 128);
       return ret;'''.format(**fmtspec)
    if typ == 'f32':
        if simd_ext == 'avx2':
            return avx2_template
        else:
            # Plain AVX: _mm256_permutevar_ps only permutes within each
            # 128-bit lane, so the elements that must cross lanes are patched
            # with extra _mm256_permute2f128_ps + blend steps.
            return \
            '''nsimd_avx_v{typ}x3 ret;
               {load_v0v1v2}
               __m256i RAm = _mm256_setr_epi32( 0, 3, -1, -1, -1, -1, 2, -1);
               __m256i RBm = _mm256_setr_epi32(-1, -1, -1, 1, 0, 3, -1, -1);
               __m256i RCm = _mm256_setr_epi32( 0, 0, 2, 0, 1, 1, 1, 1);
               __m256i GAm = _mm256_setr_epi32( 1, -1, -1, -1, -1, 0, 3, -1);
               __m256i GBm = _mm256_setr_epi32(-1, -1, -1, 2, 5, -1, -1, -1);
               __m256i GCm = _mm256_setr_epi32(-1, 0, 3, -1, -1, -1, -1, 6);
               __m256i BAm = _mm256_setr_epi32( 2, -1, -1, -1, -1, 1, -1, -1);
               __m256i BBm = _mm256_setr_epi32(-1, -1, 0, 3, 6, -1, -1, -1);
               __m256i BCm = _mm256_setr_epi32(-1, 1, -1, -1, -1, -1, 4, 7);
               __m256 RA = _mm256_permutevar_ps(v0, RAm);
               __m256 RAi = _mm256_permute2f128_ps(RA, RA, (2 << 4) | 1);
               RA = _mm256_blend_ps(RAi, RA, 1 + 2);
               __m256 RB = _mm256_permutevar_ps(v1, RBm);
               __m256 RC = _mm256_permutevar_ps(v2, RCm);
               __m256 RCi = _mm256_permute2f128_ps(RC, RC, 2 << 4);
               RC = _mm256_blend_ps(RC, RCi, 64);
               __m256 R = _mm256_blend_ps(RA, RB, 8 + 16 + 32);
               ret.v0 = _mm256_blend_ps(R, RC, 64 + 128);
               __m256 GA = _mm256_permutevar_ps(v0, GAm);
               __m256 GAi = _mm256_permute2f128_ps(GA, GA, (2 << 4) | 1);
               GA = _mm256_blend_ps(GA, GAi, 2 + 4);
               __m256 GB = _mm256_permutevar_ps(v1, GBm);
               __m256 GC = _mm256_permutevar_ps(v2, GCm);
               __m256 GCi = _mm256_permute2f128_ps(GC, GC, 2 << 4);
               GC = _mm256_blend_ps(GC, GCi, 32 + 64);
               __m256 G = _mm256_blend_ps(GA, GB, 8 + 16);
               ret.v1 = _mm256_blend_ps(G, GC, 32 + 64 + 128);
               __m256 BA = _mm256_permutevar_ps(v0, BAm);
               __m256 BAi = _mm256_permute2f128_ps(BA, BA, (2 << 4) | 1);
               BA = _mm256_blend_ps(BA, BAi, 2);
               __m256 BB = _mm256_permutevar_ps(v1, BBm);
               __m256 BC = _mm256_permutevar_ps(v2, BCm);
               __m256 BCi = _mm256_permute2f128_ps(BC, BC, 2 << 4);
               BC = _mm256_blend_ps(BC, BCi, 32);
               __m256 B = _mm256_blend_ps(BA, BB, 4 + 8 + 16);
               ret.v2 = _mm256_blend_ps(B, BC, 32 + 64 + 128);
               return ret;'''.format(**fmtspec)
    # NOTE(review): 'f32' in the list below is unreachable -- the dedicated
    # "if typ == 'f32'" branch above always returns first.  Only i32/u32 can
    # get here.
    if typ in ['i32', 'u32', 'f32']:
        if simd_ext == 'avx2':
            return avx2_template
        else:
            # i32/u32 on plain AVX: reuse the f32 implementation through
            # bitcasts (same bit pattern, AVX1 only has float shuffles).
            return \
            '''nsimd_avx_v{typ}x3 ret;
               nsimd_avx_vf32x3 retf32 = nsimd_load3{a}_avx_f32((f32 *){in0});
               ret.v0 = _mm256_castps_si256(retf32.v0);
               ret.v1 = _mm256_castps_si256(retf32.v1);
               ret.v2 =
               _mm256_castps_si256(retf32.v2);
               return ret;'''.format(**fmtspec)
    # 64-bit elements on AVX2: qword permutes (_mm256_permute4x64) plus
    # 128-bit lane permutes; serves f64 and i64/u64 via {styp}/{suf}/{sufsi}.
    avx2_template = \
    '''nsimd_avx2_v{typ}x3 ret;
       {load_v0v1v2}
       {styp} A1 = _mm256_permute4x64{suf}(v0, _MM_SHUFFLE(2, 1, 3, 0));
       {styp} C2 = _mm256_permute4x64{suf}(v2, _MM_SHUFFLE(3, 0, 2, 1));
       {styp} B3 = _mm256_permute2f128{sufsi}(A1, v1, (2 << 4) | 1);
       {styp} B4 = _mm256_permute2f128{sufsi}(v1, C2, (2 << 4) | 1);
       {styp} B5 = _mm256_permute4x64{suf}(B3, _MM_SHUFFLE(3, 1, 2, 0));
       {styp} B6 = _mm256_permute4x64{suf}(B4, _MM_SHUFFLE(3, 1, 2, 0));
       ret.v0 = _mm256_permute2f128{sufsi}(A1, B6, 2 << 4);
       ret.v1 = _mm256_permute2f128{sufsi}(B5, B6, 3 << 4);
       ret.v2 = _mm256_permute2f128{sufsi}(B5, C2, (3 << 4 ) | 1);
       return ret;'''.format(**fmtspec)
    if typ == 'f64':
        if simd_ext == 'avx2':
            return avx2_template
        else:
            # Plain AVX f64: only 128-bit permutes and in-lane pd permutes
            # are available.
            return \
            '''nsimd_avx_v{typ}x3 ret;
               {load_v0v1v2}
               __m256d R1 = _mm256_permute2f128_pd(v0, v2, (2 << 4) | 1);
               __m256d R2 = _mm256_permute2f128_pd(v0, v1, 3 << 4);
               ret.v0 = _mm256_blend_pd(R1, R2, 1 + 4);
               __m256d G1 = _mm256_permute2f128_pd(v0, v1, 3 << 4);
               __m256d G2 = _mm256_permute2f128_pd(v1, v2, 3 << 4);
               __m256d G = _mm256_blend_pd(G1, G2, 1 + 4);
               ret.v1 = _mm256_permute_pd(G, 1 + 4);
               __m256d B1 = _mm256_permute2f128_pd(v0, v2, (2 << 4) | 1);
               __m256d B2 = _mm256_permute2f128_pd(v1, v2, 3 << 4);
               ret.v2 = _mm256_blend_pd(B1, B2, 2 + 8);
               return ret;'''.format(**fmtspec)
    if typ in ['i64', 'u64']:
        if simd_ext == 'avx2':
            return avx2_template
        else:
            # i64/u64 on plain AVX: reuse the f64 implementation through
            # bitcasts.
            return \
            '''nsimd_avx_v{typ}x3 ret;
               nsimd_avx_vf64x3 retf64 = nsimd_load3{a}_avx_f64((f64 *){in0});
               ret.v0 = _mm256_castpd_si256(retf64.v0);
               ret.v1 = _mm256_castpd_si256(retf64.v1);
               ret.v2 = _mm256_castpd_si256(retf64.v2);
               return ret;'''.format(**fmtspec)

###############################################################################

def store3_avx(simd_ext, typ, align, fmtspec2):
    # Emit the C body of the AVX/AVX2 3-way interleaving store
    # (nsimd_store3{a}): the inverse of load3_avx.  The three component
    # registers {in1}/{in2}/{in3} (x's, y's, z's) are re-interleaved into
    # three consecutive registers A, B, C and written to {in0} by the
    # {store}/store3 helper.
    fmtspec = fmtspec2.copy()
    # 128-bit half extraction snippets for the plain-AVX 8/16-bit fallbacks.
    fmtspec['exlo_in1'] = x86.extract('avx', typ, x86.LO, common.in1)
    fmtspec['exhi_in1'] = x86.extract('avx', typ, x86.HI, common.in1)
    fmtspec['exlo_in2'] = x86.extract('avx', typ, x86.LO, common.in2)
    fmtspec['exhi_in2'] = x86.extract('avx', typ, x86.HI, common.in2)
    fmtspec['exlo_in3'] = x86.extract('avx', typ, x86.LO, common.in3)
    fmtspec['exhi_in3'] = x86.extract('avx', typ, x86.HI, common.in3)
    fmtspec['a'] = 'a' if align else 'u'
    if typ in ['i8', 'u8']:
        if simd_ext == 'avx2':
            # AVX2: scatter each component into its interleaved positions
            # with in-lane byte shuffles (-1 zeroes), OR the partials, then
            # reassemble A and C from the mixed AC/CA registers with
            # _mm256_permute2f128_si256.
            return \
            '''__m256i RACm = _mm256_setr_epi8( 0, -1, -1, 1, -1, -1, 2, -1,
                               -1, 3, -1, -1, 4, -1, -1, 5, -1, 27, -1, -1,
                               28, -1, -1, 29, -1, -1, 30, -1, -1, 31, -1,
                               -1);
               __m256i RBBm = _mm256_setr_epi8(-1, 11, -1, -1, 12, -1, -1, 13,
                               -1, -1, 14, -1, -1, 15, -1, -1, 16, -1, -1, 17,
                               -1, -1, 18, -1, -1, 19, -1, -1, 20, -1, -1,
                               21);
               __m256i RCAm = _mm256_setr_epi8(-1, -1, 22, -1, -1, 23, -1, -1,
                               24, -1, -1, 25, -1, -1, 26, -1, -1, -1, 6, -1,
                               -1, 7, -1, -1, 8, -1, -1, 9, -1, -1, 10, -1);
               __m256i GACm = _mm256_setr_epi8(-1, 0, -1, -1, 1, -1, -1, 2,
                               -1, -1, 3, -1, -1, 4, -1, -1, -1, -1, 27, -1,
                               -1, 28, -1, -1, 29, -1, -1, 30, -1, -1, 31,
                               -1);
               __m256i GBBm = _mm256_setr_epi8(-1, -1, 11, -1, -1, 12, -1, -1,
                               13, -1, -1, 14, -1, -1, 15, -1, -1, 16, -1, -1,
                               17, -1, -1, 18, -1, -1, 19, -1, -1, 20, -1,
                               -1);
               __m256i GCAm = _mm256_setr_epi8(21, -1, -1, 22, -1, -1, 23, -1,
                               -1, 24, -1, -1, 25, -1, -1, 26, 5, -1, -1, 6,
                               -1, -1, 7, -1, -1, 8, -1, -1, 9, -1, -1, 10);
               __m256i BACm = _mm256_setr_epi8(-1, -1, 0, -1, -1, 1, -1, -1,
                               2, -1, -1, 3, -1, -1, 4, -1, 26, -1, -1, 27,
                               -1, -1, 28, -1, -1, 29, -1, -1, 30, -1, -1,
                               31);
               __m256i BBBm = _mm256_setr_epi8(10, -1, -1, 11, -1, -1, 12, -1,
                               -1, 13, -1, -1, 14, -1, -1, 15, -1, -1, 16, -1,
                               -1, 17, -1, -1, 18, -1, -1, 19, -1, -1, 20,
                               -1);
               __m256i BCAm = _mm256_setr_epi8(-1, 21, -1, -1, 22, -1, -1, 23,
                               -1, -1, 24, -1, -1, 25, -1, -1, -1, 5, -1, -1,
                               6, -1, -1, 7, -1, -1, 8, -1, -1, 9, -1, -1);
               __m256i RAC = _mm256_shuffle_epi8({in1}, RACm);
               __m256i GAC = _mm256_shuffle_epi8({in2}, GACm);
               __m256i BAC = _mm256_shuffle_epi8({in3}, BACm);
               __m256i RBB = _mm256_shuffle_epi8({in1}, RBBm);
               __m256i GBB = _mm256_shuffle_epi8({in2}, GBBm);
               __m256i BBB = _mm256_shuffle_epi8({in3}, BBBm);
               __m256i RCA = _mm256_shuffle_epi8({in1}, RCAm);
               __m256i GCA = _mm256_shuffle_epi8({in2}, GCAm);
               __m256i BCA = _mm256_shuffle_epi8({in3}, BCAm);
               __m256i AC = _mm256_or_si256(RAC, GAC);
               AC = _mm256_or_si256(AC, BAC);
               __m256i B = _mm256_or_si256(RBB, GBB);
               B = _mm256_or_si256(B, BBB);
               __m256i CA = _mm256_or_si256(RCA, GCA);
               CA = _mm256_or_si256(CA, BCA);
               __m256i A = _mm256_permute2f128_si256(AC, CA, 2 << 4);
               __m256i C = _mm256_permute2f128_si256(AC, CA, (1 << 4) | 3);
               {store}'''.format(store=store3('avx', typ, align, fmtspec,
                                              'A', 'B', 'C'), **fmtspec)
        else:
            # Plain AVX fallback: interleave each 128-bit half with SSSE3
            # shuffles, then merge the six 128-bit outputs into A, B, C.
            return \
            '''__m128i Ra = {exlo_in1};
               __m128i Rb = {exhi_in1};
               __m128i Ga = {exlo_in2};
               __m128i Gb = {exhi_in2};
               __m128i Ba = {exlo_in3};
               __m128i Bb = {exhi_in3};
               __m128i RAm = _mm_set_epi8( 5, -1, -1, 4, -1, -1, 3, -1, -1, 2,
                                          -1, -1, 1, -1, -1, 0);
               __m128i GAm = _mm_set_epi8(-1, -1, 4, -1, -1, 3, -1, -1, 2, -1,
                                          -1, 1, -1, -1, 0, -1);
               __m128i BAm = _mm_set_epi8(-1, 4, -1, -1, 3, -1, -1, 2, -1, -1,
                                          1, -1, -1, 0, -1, -1);
               __m128i RA = _mm_shuffle_epi8(Ra, RAm);
               __m128i GA = _mm_shuffle_epi8(Ga, GAm);
               __m128i BA = _mm_shuffle_epi8(Ba, BAm);
               __m128i A0 = _mm_or_si128(RA, GA);
               A0 = _mm_or_si128(A0, BA);
               RA = _mm_shuffle_epi8(Rb, RAm);
               GA = _mm_shuffle_epi8(Gb, GAm);
               BA = _mm_shuffle_epi8(Bb, BAm);
               __m128i A1 = _mm_or_si128(RA, GA);
               A1 = _mm_or_si128(A1, BA);
               __m128i RBm = _mm_set_epi8(-1, 10, -1, -1, 9, -1, -1, 8, -1,
                                          -1, 7, -1, -1, 6, -1, -1);
               __m128i GBm = _mm_set_epi8(10, -1, -1, 9, -1, -1, 8, -1, -1, 7,
                                          -1, -1, 6, -1, -1, 5);
               __m128i BBm = _mm_set_epi8(-1, -1, 9, -1, -1, 8, -1, -1, 7, -1,
                                          -1, 6, -1, -1, 5, -1);
               __m128i RB = _mm_shuffle_epi8(Ra, RBm);
               __m128i GB = _mm_shuffle_epi8(Ga, GBm);
               __m128i BB = _mm_shuffle_epi8(Ba, BBm);
               __m128i B0 = _mm_or_si128(RB, GB);
               B0 = _mm_or_si128(B0, BB);
               RB = _mm_shuffle_epi8(Rb, RBm);
               GB = _mm_shuffle_epi8(Gb, GBm);
               BB = _mm_shuffle_epi8(Bb, BBm);
               __m128i B1 = _mm_or_si128(RB, GB);
               B1 = _mm_or_si128(B1, BB);
               __m128i RCm = _mm_set_epi8(-1, -1, 15, -1, -1, 14, -1, -1, 13,
                                          -1, -1, 12, -1, -1, 11, -1);
               __m128i GCm = _mm_set_epi8(-1, 15, -1, -1, 14, -1, -1, 13, -1,
                                          -1, 12, -1, -1, 11, -1, -1);
               __m128i BCm = _mm_set_epi8(15, -1, -1, 14, -1, -1, 13, -1, -1,
                                          12, -1, -1, 11, -1, -1, 10);
               __m128i RC = _mm_shuffle_epi8(Ra, RCm);
               __m128i GC = _mm_shuffle_epi8(Ga, GCm);
               __m128i BC = _mm_shuffle_epi8(Ba, BCm);
               __m128i C0 = _mm_or_si128(RC, GC);
               C0 = _mm_or_si128(C0, BC);
               RC = _mm_shuffle_epi8(Rb, RCm);
               GC = _mm_shuffle_epi8(Gb, GCm);
               BC = _mm_shuffle_epi8(Bb, BCm);
               __m128i C1 = _mm_or_si128(RC, GC);
               C1 = _mm_or_si128(C1, BC);
               __m256i A = {mergeA0B0};
               __m256i B = {mergeC0A1};
               __m256i C = {mergeB1C1};
               {store}'''.format(mergeA0B0=x86.setr('avx', typ, 'A0', 'B0'),
                                 mergeC0A1=x86.setr('avx', typ, 'C0', 'A1'),
                                 mergeB1C1=x86.setr('avx', typ, 'B1', 'C1'),
                                 store=store3('avx', typ, align, fmtspec,
                                              'A', 'B', 'C'), **fmtspec)
    if typ in ['i16', 'u16']:
        if simd_ext == 'avx2':
            # Same scheme as the 8-bit AVX2 store, with byte-pair (16-bit)
            # mask indices.
            return \
            '''__m256i RACm = _mm256_setr_epi8( 0, 1, -1, -1, -1, -1, 2, 3,
                               -1, -1, -1, -1, 4, 5, -1, -1, -1, -1, -1, -1,
                               12, 13, -1, -1, -1, -1, 14, 15, -1, -1, -1,
                               -1);
               __m256i RBBm = _mm256_setr_epi8(-1, -1, -1, -1, 12, 13, -1, -1,
                               -1, -1, 14, 15, -1, -1, -1, -1, 0, 1, -1, -1,
                               -1, -1, 2, 3, -1, -1, -1, -1, 4, 5, -1, -1);
               __m256i RCAm = _mm256_setr_epi8(-1, -1, 6, 7, -1, -1, -1, -1,
                               8, 9, -1, -1, -1, -1, 10, 11, -1, -1, 6, 7, -1,
                               -1, -1, -1, 8, 9, -1, -1, -1, -1, 10, 11);
               __m256i GACm = _mm256_setr_epi8(-1, -1, 0, 1, -1, -1, -1, -1,
                               2, 3, -1, -1, -1, -1, 4, 5, 10, 11, -1, -1, -1,
                               -1, 12, 13, -1, -1, -1, -1, 14, 15, -1, -1);
               __m256i GBBm = _mm256_setr_epi8(10, 11, -1, -1, -1, -1, 12, 13,
                               -1, -1, -1, -1, 14, 15, -1, -1, -1, -1, 0, 1,
                               -1, -1, -1, -1, 2, 3, -1, -1, -1, -1, 4, 5);
               __m256i GCAm = _mm256_setr_epi8(-1, -1, -1, -1, 6, 7, -1, -1,
                               -1, -1, 8, 9, -1, -1, -1, -1, -1, -1, -1, -1,
                               6, 7, -1, -1, -1, -1, 8, 9, -1, -1, -1, -1);
               __m256i BACm = _mm256_setr_epi8(-1, -1, -1, -1, 0, 1, -1, -1,
                               -1, -1, 2, 3, -1, -1, -1, -1, -1, -1, 10, 11,
                               -1, -1, -1, -1, 12, 13, -1, -1, -1, -1, 14,
                               15);
               __m256i BBBm = _mm256_setr_epi8(-1, -1, 10, 11, -1, -1, -1, -1,
                               12, 13, -1, -1, -1, -1, 14, 15, -1, -1, -1, -1,
                               0, 1, -1, -1, -1, -1, 2, 3, -1, -1, -1, -1);
               __m256i BCAm = _mm256_setr_epi8( 4, 5, -1, -1, -1, -1, 6, 7,
                               -1, -1, -1, -1, 8, 9, -1, -1, 4, 5, -1, -1, -1,
                               -1, 6, 7, -1, -1, -1, -1, 8, 9, -1, -1);
               __m256i RAC = _mm256_shuffle_epi8({in1}, RACm);
               __m256i GAC = _mm256_shuffle_epi8({in2}, GACm);
               __m256i BAC = _mm256_shuffle_epi8({in3}, BACm);
               __m256i RBB = _mm256_shuffle_epi8({in1}, RBBm);
               __m256i GBB = _mm256_shuffle_epi8({in2}, GBBm);
               __m256i BBB = _mm256_shuffle_epi8({in3}, BBBm);
               __m256i RCA = _mm256_shuffle_epi8({in1}, RCAm);
               __m256i GCA = _mm256_shuffle_epi8({in2}, GCAm);
               __m256i BCA = _mm256_shuffle_epi8({in3}, BCAm);
               __m256i AC = _mm256_or_si256(RAC, GAC);
               AC = _mm256_or_si256(AC, BAC);
               __m256i B = _mm256_or_si256(RBB, GBB);
               B = _mm256_or_si256(B, BBB);
               __m256i CA = _mm256_or_si256(RCA, GCA);
               CA = _mm256_or_si256(CA, BCA);
               __m256i A = _mm256_permute2f128_si256(AC, CA, 2 << 4);
               __m256i C = _mm256_permute2f128_si256(AC, CA, (1 << 4) | 3);
               {store}'''.format(store=store3('avx', typ, align, fmtspec,
                                              'A', 'B', 'C'), **fmtspec)
        else:
            # Plain AVX fallback, 128-bit halves + SSSE3 shuffles.
            return \
            '''__m128i Ra = {exlo_in1};
               __m128i Rb = {exhi_in1};
               __m128i Ga = {exlo_in2};
               __m128i Gb = {exhi_in2};
               __m128i Ba = {exlo_in3};
               __m128i Bb = {exhi_in3};
               __m128i RAm = _mm_set_epi8(-1, -1, 5, 4, -1, -1, -1, -1, 3, 2,
                                          -1, -1, -1, -1, 1, 0);
               __m128i GAm = _mm_set_epi8( 5, 4, -1, -1, -1, -1, 3, 2, -1, -1,
                                          -1, -1, 1, 0, -1, -1);
               __m128i BAm = _mm_set_epi8(-1, -1, -1, -1, 3, 2, -1, -1, -1,
                                          -1, 1, 0, -1, -1, -1, -1);
               __m128i RA = _mm_shuffle_epi8(Ra, RAm);
               __m128i GA = _mm_shuffle_epi8(Ga, GAm);
               __m128i BA = _mm_shuffle_epi8(Ba, BAm);
               __m128i A0 = _mm_or_si128(RA, GA);
               A0 = _mm_or_si128(A0, BA);
               RA = _mm_shuffle_epi8(Rb, RAm);
               GA = _mm_shuffle_epi8(Gb, GAm);
               BA = _mm_shuffle_epi8(Bb, BAm);
               __m128i A1 = _mm_or_si128(RA, GA);
               A1 = _mm_or_si128(A1, BA);
               __m128i RBm = _mm_set_epi8(11, 10, -1, -1, -1, -1, 9, 8, -1,
                                          -1, -1, -1, 7, 6, -1, -1);
               __m128i GBm = _mm_set_epi8(-1, -1, -1, -1, 9, 8, -1, -1, -1,
                                          -1, 7, 6, -1, -1, -1, -1);
               __m128i BBm = _mm_set_epi8(-1, -1, 9, 8, -1, -1, -1, -1, 7, 6,
                                          -1, -1, -1, -1, 5, 4);
               __m128i RB = _mm_shuffle_epi8(Ra, RBm);
               __m128i GB = _mm_shuffle_epi8(Ga, GBm);
               __m128i BB = _mm_shuffle_epi8(Ba, BBm);
               __m128i B0 = _mm_or_si128(RB, GB);
               B0 = _mm_or_si128(B0, BB);
               RB = _mm_shuffle_epi8(Rb, RBm);
               GB = _mm_shuffle_epi8(Gb, GBm);
               BB = _mm_shuffle_epi8(Bb, BBm);
               __m128i B1 = _mm_or_si128(RB, GB);
               B1 = _mm_or_si128(B1, BB);
               __m128i RCm = _mm_set_epi8(-1, -1, -1, -1, 15, 14, -1, -1, -1,
                                          -1, 13, 12, -1, -1, -1, -1);
               __m128i GCm = _mm_set_epi8(-1, -1, 15, 14, -1, -1, -1, -1, 13,
                                          12, -1, -1, -1, -1, 11, 10);
               __m128i BCm = _mm_set_epi8(15, 14, -1, -1, -1, -1, 13, 12, -1,
                                          -1, -1, -1, 11, 10, -1, -1);
               __m128i RC = _mm_shuffle_epi8(Ra, RCm);
               __m128i GC = _mm_shuffle_epi8(Ga, GCm);
               __m128i BC = _mm_shuffle_epi8(Ba, BCm);
               __m128i C0 = _mm_or_si128(RC, GC);
               C0 = _mm_or_si128(C0, BC);
               RC = _mm_shuffle_epi8(Rb, RCm);
               GC = _mm_shuffle_epi8(Gb, GCm);
               BC = _mm_shuffle_epi8(Bb, BCm);
               __m128i C1 = _mm_or_si128(RC, GC);
               C1 = _mm_or_si128(C1, BC);
               __m256i A = {mergeA0B0};
               __m256i B = {mergeC0A1};
               __m256i C = {mergeB1C1};
               {store}'''.format(mergeA0B0=x86.setr('avx', typ, 'A0', 'B0'),
                                 mergeC0A1=x86.setr('avx', typ, 'C0', 'A1'),
                                 mergeB1C1=x86.setr('avx', typ, 'B1', 'C1'),
                                 store=store3('avx', typ, align, fmtspec,
                                              'A', 'B', 'C'), **fmtspec)
    # 32-bit elements on AVX2: cross-lane dword permutes + blends; serves
    # f32 and i32/u32 via {styp}/{suf}.
    avx2_template = \
    '''__m256i RAm = _mm256_setr_epi32( 0, -1, -1, 1, -1, -1, 2, -1);
       __m256i RBm = _mm256_setr_epi32(-1, 3, -1, -1, 4, -1, -1, 5);
       __m256i RCm = _mm256_setr_epi32(-1, -1, 6, -1, -1, 7, -1, -1);
       __m256i GAm = _mm256_setr_epi32(-1, 0, -1, -1, 1, -1, -1, 2);
       __m256i GBm = _mm256_setr_epi32(-1, -1, 3, -1, -1, 4, -1, -1);
       __m256i GCm = _mm256_setr_epi32( 5, -1, -1, 6, -1, -1, 7, -1);
       __m256i BAm = _mm256_setr_epi32(-1, -1, 0, -1, -1, 1, -1, -1);
       __m256i BBm = _mm256_setr_epi32( 2, -1, -1, 3, -1, -1, 4, -1);
       __m256i BCm = _mm256_setr_epi32(-1, 5, -1, -1, 6, -1, -1, 7);
       {styp} RA = _mm256_permutevar8x32{suf}({in1}, RAm);
       {styp} RB = _mm256_permutevar8x32{suf}({in1}, RBm);
       {styp} RC = _mm256_permutevar8x32{suf}({in1}, RCm);
       {styp} GA = _mm256_permutevar8x32{suf}({in2}, GAm);
       {styp} GB = _mm256_permutevar8x32{suf}({in2}, GBm);
       {styp} GC = _mm256_permutevar8x32{suf}({in2}, GCm);
       {styp} BA = _mm256_permutevar8x32{suf}({in3}, BAm);
       {styp} BB = _mm256_permutevar8x32{suf}({in3}, BBm);
       {styp} BC = _mm256_permutevar8x32{suf}({in3}, BCm);
       {styp} A = _mm256_blend{suf}(RA, GA, 2 + 16 + 128);
       A = _mm256_blend{suf}(A, BA, 4 + 32);
       {styp} B = _mm256_blend{suf}(RB, GB, 4 + 32);
       B = _mm256_blend{suf}(B, BB, 1 + 8 + 64);
       {styp} C = _mm256_blend{suf}(RC, GC, 1 + 8 + 64);
       C = _mm256_blend{suf}(C, BC, 2 + 16 + 128);
       {store}'''.format(store=store3('avx', typ, align, fmtspec,
                                      'A', 'B', 'C'), **fmtspec)
    if typ == 'f32':
        if simd_ext == 'avx2':
            return avx2_template
        else:
            # Plain AVX: in-lane permutevar_ps + blends build A1/B/C1; a
            # final pair of _mm256_permute2f128_ps fixes the lane crossings.
            # NOTE(review): the avx2_template=avx2_template kwarg below is
            # unused by this template (no {avx2_template} placeholder) --
            # harmless but dead.
            return \
            '''__m256i RAm = _mm256_setr_epi32( 0, -1, -1, 1, -1, -1, 2, -1);
               __m256i RBm = _mm256_setr_epi32(-1, 3, -1, -1, 4, -1, -1, 5);
               __m256i RCm = _mm256_setr_epi32(-1, -1, 6, -1, -1, 7, -1, -1);
               __m256i GAm = _mm256_setr_epi32(-1, 0, -1, -1, 1, -1, -1, 2);
               __m256i GBm = _mm256_setr_epi32(-1, -1, 3, -1, -1, 4, -1, -1);
               __m256i GCm = _mm256_setr_epi32( 5, -1, -1, 6, -1, -1, 7, -1);
               __m256i BAm = _mm256_setr_epi32(-1, -1, 0, -1, -1, 1, -1, -1);
               __m256i BBm = _mm256_setr_epi32( 2, -1, -1, 3, -1, -1, 4, -1);
               __m256i BCm = _mm256_setr_epi32(-1, 5, -1, -1, 6, -1, -1, 7);
               __m256 RA = _mm256_permutevar_ps({in1}, RAm);
               __m256 RB = _mm256_permutevar_ps({in1}, RBm);
               __m256 RC = _mm256_permutevar_ps({in1}, RCm);
               __m256 GA = _mm256_permutevar_ps({in2}, GAm);
               __m256 GB = _mm256_permutevar_ps({in2}, GBm);
               __m256 GC = _mm256_permutevar_ps({in2}, GCm);
               __m256 BA = _mm256_permutevar_ps({in3}, BAm);
               __m256 BB = _mm256_permutevar_ps({in3}, BBm);
               __m256 BC = _mm256_permutevar_ps({in3}, BCm);
               __m256 A1 = _mm256_blend_ps(RA, GA, 2 + 16 + 128);
               A1 = _mm256_blend_ps(A1, BA, 4 + 32);
               __m256 B = _mm256_blend_ps(RB, GB, 4 + 32);
               B = _mm256_blend_ps(B, BB, 1 + 8 + 64);
               __m256 C1 = _mm256_blend_ps(RC, GC, 1 + 8 + 64);
               C1 = _mm256_blend_ps(C1, BC, 2 + 16 + 128);
               __m256 A = _mm256_permute2f128_ps(A1, C1, 2 << 4);
               __m256 C = _mm256_permute2f128_ps(A1, C1, (3 << 4) | 1);
               {store}'''.format(avx2_template=avx2_template,
                                 store=store3('avx', typ, align, fmtspec,
                                              'A', 'B', 'C'), **fmtspec)
    if typ in ['i32', 'u32']:
        if simd_ext == 'avx2':
            return avx2_template
        else:
            # i32/u32 on plain AVX: reuse the f32 store through bitcasts.
            return \
            '''nsimd_store3{a}_avx_f32((f32 *){in0},
                             _mm256_castsi256_ps({in1}),
                             _mm256_castsi256_ps({in2}),
                             _mm256_castsi256_ps({in3}));'''. \
            format(**fmtspec)
    if typ == 'f64':
        # f64: same code for AVX and AVX2 (only pd blends/permutes needed).
        return \
        '''__m256d invv1 = _mm256_permute_pd({in2}, 1 + 4);
           __m256d A1C0 = _mm256_blend_pd({in1}, {in3}, 1 + 4);
           __m256d A0B1 = _mm256_blend_pd({in1}, invv1, 2 + 8);
           __m256d B0C1 = _mm256_blend_pd(invv1, {in3}, 2 + 8);
           __m256d A = _mm256_permute2f128_pd(A0B1, A1C0, 2 << 4);
           __m256d B = _mm256_blend_pd(B0C1, A0B1, 4 + 8);
           __m256d C = _mm256_permute2f128_pd(A1C0, B0C1, (3 << 4) | 1);
           {store}'''.format(store=store3('avx', typ, align, fmtspec,
                                          'A', 'B', 'C'), **fmtspec)
    if typ in ['i64', 'u64']:
        # i64/u64: reuse the f64 store through bitcasts.
        return \
        '''nsimd_store3{a}_{simd_ext}_f64((f64 *){in0},
                         _mm256_castsi256_pd({in1}),
                         _mm256_castsi256_pd({in2}),
                         _mm256_castsi256_pd({in3}));'''.
\ format(**fmtspec) ############################################################################### def load3_avx512(simd_ext, typ, align, fmtspec2): fmtspec = fmtspec2.copy() fmtspec['load_v0v1v2'] = get_load_v0v1v2(simd_ext, typ, align, fmtspec) fmtspec['exlo_v0'] = x86.extract(simd_ext, typ, x86.LO, 'v0') fmtspec['exhi_v0'] = x86.extract(simd_ext, typ, x86.HI, 'v0') fmtspec['exlo_v1'] = x86.extract(simd_ext, typ, x86.LO, 'v1') fmtspec['exhi_v1'] = x86.extract(simd_ext, typ, x86.HI, 'v1') fmtspec['exlo_v2'] = x86.extract(simd_ext, typ, x86.LO, 'v2') fmtspec['exhi_v2'] = x86.extract(simd_ext, typ, x86.HI, 'v2') fmtspec['a'] = 'a' if align else 'u' if typ in ['i8', 'u8']: return \ '''nsimd_{simd_ext}_v{typ}x3 ret; {load_v0v1v2} __m256i A0in = {exlo_v0}; __m256i B0in = {exhi_v0}; __m256i C0in = {exlo_v1}; __m256i A1in = {exhi_v1}; __m256i B1in = {exlo_v2}; __m256i C1in = {exhi_v2}; __m256i ARmask = _mm256_setr_epi8( 0, 3, 6, 9, 12, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 2, 5, 8, 11, 14, -1, -1, -1, -1, -1); __m256i BRmask = _mm256_setr_epi8(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 1, 4, 7, 10, 13, 0, 3, 6, 9, 12, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1); __m256i CRmask = _mm256_setr_epi8(-1, -1, -1, -1, -1, -1, 2, 5, 8, 11, 14, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 1, 4, 7, 10, 13); __m256i AR = _mm256_shuffle_epi8(A0in, ARmask); __m256i BR = _mm256_shuffle_epi8(B0in, BRmask); __m256i CR = _mm256_shuffle_epi8(C0in, CRmask); __m256i DR = _mm256_permute2f128_si256(AR, CR, (2 << 4) | 1); __m256i R0 = _mm256_or_si256(AR, BR); __m256i R1 = _mm256_or_si256(BR, CR); __m256i R2 = _mm256_permute2f128_si256(R0, R1, 3 << 4); __m256i R3 = _mm256_or_si256(DR, R2); AR = _mm256_shuffle_epi8(A1in, ARmask); BR = _mm256_shuffle_epi8(B1in, BRmask); CR = _mm256_shuffle_epi8(C1in, CRmask); DR = _mm256_permute2f128_si256(AR, CR, (2 << 4) | 1); R0 = _mm256_or_si256(AR, BR); R1 = _mm256_or_si256(BR, CR); R2 = 
_mm256_permute2f128_si256(R0, R1, 3 << 4); __m256i R3b = _mm256_or_si256(DR, R2); __m256i AGmask = _mm256_setr_epi8( 1, 4, 7, 10, 13, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 3, 6, 9, 12, 15, -1, -1, -1, -1, -1); __m256i BGmask = _mm256_setr_epi8(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 2, 5, 8, 11, 14, 1, 4, 7, 10, 13, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1); __m256i CGmask = _mm256_setr_epi8(-1, -1, -1, -1, -1, 0, 3, 6, 9, 12, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 2, 5, 8, 11, 14); __m256i AG = _mm256_shuffle_epi8(A0in, AGmask); __m256i BG = _mm256_shuffle_epi8(B0in, BGmask); __m256i CG = _mm256_shuffle_epi8(C0in, CGmask); __m256i DG = _mm256_permute2f128_si256(AG, CG, (2 << 4) | 1); __m256i G0 = _mm256_or_si256(AG, BG); __m256i G1 = _mm256_or_si256(BG, CG); __m256i G2 = _mm256_permute2f128_si256(G0, G1, 3 << 4); __m256i G3 = _mm256_or_si256(DG, G2); AG = _mm256_shuffle_epi8(A1in, AGmask); BG = _mm256_shuffle_epi8(B1in, BGmask); CG = _mm256_shuffle_epi8(C1in, CGmask); DG = _mm256_permute2f128_si256(AG, CG, (2 << 4) | 1); G0 = _mm256_or_si256(AG, BG); G1 = _mm256_or_si256(BG, CG); G2 = _mm256_permute2f128_si256(G0, G1, 3 << 4); __m256i G3b = _mm256_or_si256(DG, G2); __m256i ABmask = _mm256_setr_epi8( 2, 5, 8, 11, 14, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 1, 4, 7, 10, 13, -1, -1, -1, -1, -1, -1); __m256i BBmask = _mm256_setr_epi8(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 3, 6, 9, 12, 15, 2, 5, 8, 11, 14, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1); __m256i CBmask = _mm256_setr_epi8(-1, -1, -1, -1, -1, 1, 4, 7, 10, 13, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 3, 6, 9, 12, 15); __m256i AB = _mm256_shuffle_epi8(A0in, ABmask); __m256i BB = _mm256_shuffle_epi8(B0in, BBmask); __m256i CB = _mm256_shuffle_epi8(C0in, CBmask); __m256i DB = _mm256_permute2f128_si256(AB, CB, (2 << 4) | 1); __m256i B0 = _mm256_or_si256(AB, BB); __m256i B1 = _mm256_or_si256(BB, 
CB); __m256i B2 = _mm256_permute2f128_si256(B0, B1, 3 << 4); __m256i B3 = _mm256_or_si256(DB, B2); AB = _mm256_shuffle_epi8(A1in, ABmask); BB = _mm256_shuffle_epi8(B1in, BBmask); CB = _mm256_shuffle_epi8(C1in, CBmask); DB = _mm256_permute2f128_si256(AB, CB, (2 << 4) | 1); B0 = _mm256_or_si256(AB, BB); B1 = _mm256_or_si256(BB, CB); B2 = _mm256_permute2f128_si256(B0, B1, 3 << 4); __m256i B3b = _mm256_or_si256(DB, B2); ret.v0 = {mergeR}; ret.v1 = {mergeG}; ret.v2 = {mergeB}; return ret;'''. \ format(mergeR=x86.setr(simd_ext, typ, 'R3', 'R3b'), mergeG=x86.setr(simd_ext, typ, 'G3', 'G3b'), mergeB=x86.setr(simd_ext, typ, 'B3', 'B3b'), **fmtspec) if typ in ['i16', 'u16']: return \ '''nsimd_{simd_ext}_v{typ}x3 ret; {load_v0v1v2} __m256i A0a = {exlo_v0}; __m256i B0a = {exhi_v0}; __m256i C0a = {exlo_v1}; __m256i A0b = {exhi_v1}; __m256i B0b = {exlo_v2}; __m256i C0b = {exhi_v2}; __m256i ARmask = _mm256_setr_epi8( 0, 1, 6, 7, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 2, 3, 8, 9, 14, 15, -1, -1, -1, -1); __m256i BRmask = _mm256_setr_epi8(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 4, 5, 10, 11, 0, 1, 6, 7, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1); __m256i CRmask = _mm256_setr_epi8(-1, -1, -1, -1, -1, -1, 2, 3, 8, 9, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 4, 5, 10, 11); __m256i AR = _mm256_shuffle_epi8(A0a, ARmask); __m256i BR = _mm256_shuffle_epi8(B0a, BRmask); __m256i CR = _mm256_shuffle_epi8(C0a, CRmask); __m256i DR = _mm256_permute2f128_si256(AR, CR, (2 << 4) | 1); __m256i R0 = _mm256_or_si256(AR, BR); __m256i R1 = _mm256_or_si256(BR, CR); __m256i R2 = _mm256_permute2f128_si256(R0, R1, 3 << 4); __m256i R3a = _mm256_or_si256(DR, R2); AR = _mm256_shuffle_epi8(A0b, ARmask); BR = _mm256_shuffle_epi8(B0b, BRmask); CR = _mm256_shuffle_epi8(C0b, CRmask); DR = _mm256_permute2f128_si256(AR, CR, (2 << 4) | 1); R0 = _mm256_or_si256(AR, BR); R1 = _mm256_or_si256(BR, CR); R2 = _mm256_permute2f128_si256(R0, R1, 3 << 
4); __m256i R3b = _mm256_or_si256(DR, R2); __m256i AGmask = _mm256_setr_epi8( 2, 3, 8, 9, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 4, 5, 10, 11, -1, -1, -1, -1, -1, -1); __m256i BGmask = _mm256_setr_epi8(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1); __m256i CGmask = _mm256_setr_epi8(-1, -1, -1, -1, -1, -1, 4, 5, 10, 11, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 1, 6, 7, 12, 13); __m256i AG = _mm256_shuffle_epi8(A0a, AGmask); __m256i BG = _mm256_shuffle_epi8(B0a, BGmask); __m256i CG = _mm256_shuffle_epi8(C0a, CGmask); __m256i DG = _mm256_permute2f128_si256(AG, CG, (2 << 4) | 1); __m256i G0 = _mm256_or_si256(AG, BG); __m256i G1 = _mm256_or_si256(BG, CG); __m256i G2 = _mm256_permute2f128_si256(G0, G1, 3 << 4); __m256i G3a = _mm256_or_si256(DG, G2); AG = _mm256_shuffle_epi8(A0b, AGmask); BG = _mm256_shuffle_epi8(B0b, BGmask); CG = _mm256_shuffle_epi8(C0b, CGmask); DG = _mm256_permute2f128_si256(AG, CG, (2 << 4) | 1); G0 = _mm256_or_si256(AG, BG); G1 = _mm256_or_si256(BG, CG); G2 = _mm256_permute2f128_si256(G0, G1, 3 << 4); __m256i G3b = _mm256_or_si256(DG, G2); __m256i ABmask = _mm256_setr_epi8( 4, 5, 10, 11, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 1, 6, 7, 12, 13, -1, -1, -1, -1, -1, -1); __m256i BBmask = _mm256_setr_epi8(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 2, 3, 8, 9, 14, 15, 4, 5, 10, 11, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1); __m256i CBmask = _mm256_setr_epi8(-1, -1, -1, -1, 0, 1, 6, 7, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 2, 3, 8, 9, 14, 15); __m256i AB = _mm256_shuffle_epi8(A0a, ABmask); __m256i BB = _mm256_shuffle_epi8(B0a, BBmask); __m256i CB = _mm256_shuffle_epi8(C0a, CBmask); __m256i DB = _mm256_permute2f128_si256(AB, CB, (2 << 4) | 1); __m256i B0 = _mm256_or_si256(AB, BB); __m256i B1 = _mm256_or_si256(BB, CB); __m256i B2 = _mm256_permute2f128_si256(B0, 
B1, 3 << 4); __m256i B3a = _mm256_or_si256(DB, B2); AB = _mm256_shuffle_epi8(A0b, ABmask); BB = _mm256_shuffle_epi8(B0b, BBmask); CB = _mm256_shuffle_epi8(C0b, CBmask); DB = _mm256_permute2f128_si256(AB, CB, (2 << 4) | 1); B0 = _mm256_or_si256(AB, BB); B1 = _mm256_or_si256(BB, CB); B2 = _mm256_permute2f128_si256(B0, B1, 3 << 4); __m256i B3b = _mm256_or_si256(DB, B2); ret.v0 = {mergeR}; ret.v1 = {mergeG}; ret.v2 = {mergeB}; return ret;'''. \ format(mergeR=x86.setr(simd_ext, typ, 'R3a', 'R3b'), mergeG=x86.setr(simd_ext, typ, 'G3a', 'G3b'), mergeB=x86.setr(simd_ext, typ, 'B3a', 'B3b'), **fmtspec) if typ in ['f32', 'i32', 'u32']: return \ '''nsimd_{simd_ext}_v{typ}x3 ret; {load_v0v1v2} __m512i RABm = _mm512_setr_epi32( 0, 3, 6, 9, 12, 15, 18, 21, 24, 27, 30, 0, 0, 0 , 0, 0); __m512i RABCm = _mm512_setr_epi32( 0, 1, 2, 3, 4, 5, 6 , 7, 8, 9, 10, 17, 20, 23, 26, 29); __m512i GABm = _mm512_setr_epi32( 1, 4, 7, 10, 13, 16, 19, 22, 25, 28, 31, 0, 0, 0 , 0, 0); __m512i GABCm = _mm512_setr_epi32( 0, 1, 2, 3, 4, 5, 6 , 7, 8, 9, 10, 18, 21, 24, 27, 30); __m512i BABm = _mm512_setr_epi32( 2, 5, 8, 11, 14, 17, 20, 23, 26, 29, 0, 0, 0, 0 , 0, 0); __m512i BABCm = _mm512_setr_epi32( 0, 1, 2, 3, 4, 5, 6 , 7, 8, 9, 16, 19, 22, 25, 28, 31); {styp} R = _mm512_permutex2var{suf}(v0, RABm, v1); ret.v0 = _mm512_permutex2var{suf}(R, RABCm, v2); {styp} G = _mm512_permutex2var{suf}(v0, GABm, v1); ret.v1 = _mm512_permutex2var{suf}(G, GABCm, v2); {styp} B = _mm512_permutex2var{suf}(v0, BABm, v1); ret.v2 = _mm512_permutex2var{suf}(B, BABCm, v2); return ret;'''.format(**fmtspec) if typ in ['f64', 'i64', 'u64']: return \ '''nsimd_{simd_ext}_v{typ}x3 ret; {load_v0v1v2} __m512i R_mask0 = _mm512_set_epi64( 0, 0, 15, 12, 9, 6, 3, 0); __m512i R_mask1 = _mm512_set_epi64(13, 10, 5, 4, 3, 2, 1, 0); {styp} A1 = _mm512_permutex2var{suf}(v0, R_mask0, v1); ret.v0 = _mm512_permutex2var{suf}(A1, R_mask1, v2); __m512i G_mask0 = _mm512_set_epi64( 0, 0, 0, 13, 10, 7, 4, 1); __m512i G_mask1 = _mm512_set_epi64(14, 11, 
8, 4, 3, 2, 1, 0); {styp} B1 = _mm512_permutex2var{suf}(v0, G_mask0, v1); ret.v1 = _mm512_permutex2var{suf}(B1, G_mask1, v2); __m512i B_mask0 = _mm512_set_epi64( 0, 0, 0, 14, 11, 8, 5, 2); __m512i B_mask1 = _mm512_set_epi64(15, 12, 9, 4, 3, 2, 1, 0); {styp} C1 = _mm512_permutex2var{suf}(v0, B_mask0, v1); ret.v2 = _mm512_permutex2var{suf}(C1, B_mask1, v2); return ret;'''.format(**fmtspec) ############################################################################### def store3_avx512(simd_ext, typ, align, fmtspec2): fmtspec = fmtspec2.copy() fmtspec['exlo_in1'] = x86.extract(simd_ext, typ, x86.LO, common.in1) fmtspec['exhi_in1'] = x86.extract(simd_ext, typ, x86.HI, common.in1) fmtspec['exlo_in2'] = x86.extract(simd_ext, typ, x86.LO, common.in2) fmtspec['exhi_in2'] = x86.extract(simd_ext, typ, x86.HI, common.in2) fmtspec['exlo_in3'] = x86.extract(simd_ext, typ, x86.LO, common.in3) fmtspec['exhi_in3'] = x86.extract(simd_ext, typ, x86.HI, common.in3) fmtspec['a'] = 'a' if align else 'u' if typ in ['i8', 'u8']: return \ '''__m256i R0 = {exlo_in1}; __m256i R1 = {exhi_in1}; __m256i G0 = {exlo_in2}; __m256i G1 = {exhi_in2}; __m256i B0 = {exlo_in3}; __m256i B1 = {exhi_in3}; __m256i RACm = _mm256_setr_epi8( 0, -1, -1, 1, -1, -1, 2, -1, -1, 3, -1, -1, 4, -1, -1, 5, -1, 27, -1, -1, 28, -1, -1, 29, -1, -1, 30, -1, -1, 31, -1, -1); __m256i RBBm = _mm256_setr_epi8(-1, 11, -1, -1, 12, -1, -1, 13, -1, -1, 14, -1, -1, 15, -1, -1, 16, -1, -1, 17, -1, -1, 18, -1, -1, 19, -1, -1, 20, -1, -1, 21); __m256i RCAm = _mm256_setr_epi8(-1, -1, 22, -1, -1, 23, -1, -1, 24, -1, -1, 25, -1, -1, 26, -1, -1, -1, 6, -1, -1, 7, -1, -1, 8, -1, -1, 9, -1, -1, 10, -1); __m256i GACm = _mm256_setr_epi8(-1, 0, -1, -1, 1, -1, -1, 2, -1, -1, 3, -1, -1, 4, -1, -1, -1, -1, 27, -1, -1, 28, -1, -1, 29, -1, -1, 30, -1, -1, 31, -1); __m256i GBBm = _mm256_setr_epi8(-1, -1, 11, -1, -1, 12, -1, -1, 13, -1, -1, 14, -1, -1, 15, -1, -1, 16, -1, -1, 17, -1, -1, 18, -1, -1, 19, -1, -1, 20, -1, -1); __m256i GCAm = 
_mm256_setr_epi8(21, -1, -1, 22, -1, -1, 23, -1, -1, 24, -1, -1, 25, -1, -1, 26, 05, -1, -1, 6, -1, -1, 7, -1, -1, 8, -1, -1, 9, -1, -1, 10); __m256i BACm = _mm256_setr_epi8(-1, -1, 0, -1, -1, 1, -1, -1, 2, -1, -1, 3, -1, -1, 4, -1, 26, -1, -1, 27, -1, -1, 28, -1, -1, 29, -1, -1, 30, -1, -1, 31); __m256i BBBm = _mm256_setr_epi8(10, -1, -1, 11, -1, -1, 12, -1, -1, 13, -1, -1, 14, -1, -1, 15, -1, -1, 16, -1, -1, 17, -1, -1, 18, -1, -1, 19, -1, -1, 20, -1); __m256i BCAm = _mm256_setr_epi8(-1, 21, -1, -1, 22, -1, -1, 23, -1, -1, 24, -1, -1, 25, -1, -1, -1, 5, -1, -1, 6, -1, -1, 7, -1, -1, 8, -1, -1, 9, -1, -1); __m256i RAC = _mm256_shuffle_epi8(R0, RACm); __m256i GAC = _mm256_shuffle_epi8(G0, GACm); __m256i BAC = _mm256_shuffle_epi8(B0, BACm); __m256i AC0 = _mm256_or_si256(RAC, GAC); AC0 = _mm256_or_si256(AC0, BAC); __m256i RBB = _mm256_shuffle_epi8(R0, RBBm); __m256i GBB = _mm256_shuffle_epi8(G0, GBBm); __m256i BBB = _mm256_shuffle_epi8(B0, BBBm); __m256i BB0 = _mm256_or_si256(RBB, GBB); BB0 = _mm256_or_si256(BB0, BBB); __m256i RCA = _mm256_shuffle_epi8(R0, RCAm); __m256i GCA = _mm256_shuffle_epi8(G0, GCAm); __m256i BCA = _mm256_shuffle_epi8(B0, BCAm); __m256i CA0 = _mm256_or_si256(RCA, GCA); CA0 = _mm256_or_si256(CA0, BCA); __m256i AA0 = _mm256_permute2f128_si256(AC0, CA0, 2 << 4); __m256i CC0 = _mm256_permute2f128_si256(AC0, CA0, (1 << 4) | 3); RAC = _mm256_shuffle_epi8(R1, RACm); GAC = _mm256_shuffle_epi8(G1, GACm); BAC = _mm256_shuffle_epi8(B1, BACm); __m256i AC1 = _mm256_or_si256(RAC, GAC); AC1 = _mm256_or_si256(AC1, BAC); RBB = _mm256_shuffle_epi8(R1, RBBm); GBB = _mm256_shuffle_epi8(G1, GBBm); BBB = _mm256_shuffle_epi8(B1, BBBm); __m256i BB1 = _mm256_or_si256(RBB, GBB); BB1 = _mm256_or_si256(BB1, BBB); RCA = _mm256_shuffle_epi8(R1, RCAm); GCA = _mm256_shuffle_epi8(G1, GCAm); BCA = _mm256_shuffle_epi8(B1, BCAm); __m256i CA1 = _mm256_or_si256(RCA, GCA); CA1 = _mm256_or_si256(CA1, BCA); __m256i AA1 = _mm256_permute2f128_si256(AC1, CA1, 2 << 4); __m256i CC1 = 
_mm256_permute2f128_si256(AC1, CA1, (1 << 4) | 3); __m512i A = {mergeA0B0}; __m512i B = {mergeC0A1}; __m512i C = {mergeB1C1}; {store}'''. \ format(mergeA0B0=x86.setr(simd_ext, typ, 'AA0', 'BB0'), mergeC0A1=x86.setr(simd_ext, typ, 'CC0', 'AA1'), mergeB1C1=x86.setr(simd_ext, typ, 'BB1', 'CC1'), store=store3(simd_ext, typ, align, fmtspec, 'A', 'B', 'C'), **fmtspec) if typ in ['i16', 'u16']: return \ '''__m256i R0a = {exlo_in1}; __m256i R0b = {exhi_in1}; __m256i G0a = {exlo_in2}; __m256i G0b = {exhi_in2}; __m256i B0a = {exlo_in3}; __m256i B0b = {exhi_in3}; __m256i RACm = _mm256_setr_epi8( 0, 1, -1, -1, -1, -1, 2, 3, -1, -1, -1, -1, 4, 5, -1, -1, -1, -1, -1, -1, 12, 13, -1, -1, -1, -1, 14, 15, -1, -1, -1, -1); __m256i RBBm = _mm256_setr_epi8(-1, -1, -1, -1, 12, 13, -1, -1, -1, -1, 14, 15, -1, -1, -1, -1, 0, 1, -1, -1, -1, -1, 2, 3, -1, -1, -1, -1, 4, 5, -1, -1); __m256i RCAm = _mm256_setr_epi8(-1, -1, 6, 7, -1, -1, -1, -1, 8, 9, -1, -1, -1, -1, 10, 11, -1, -1, 6, 7, -1, -1, -1, -1, 8, 9, -1, -1, -1, -1, 10, 11); __m256i GACm = _mm256_setr_epi8(-1, -1, 0, 1, -1, -1, -1, -1, 2, 3, -1, -1, -1, -1, 4, 5, 10, 11, -1, -1, -1, -1, 12, 13, -1, -1, -1, -1, 14, 15, -1, -1); __m256i GBBm = _mm256_setr_epi8(10, 11, -1, -1, -1, -1, 12, 13, -1, -1, -1, -1, 14, 15, -1, -1, -1, -1, 0, 1, -1, -1, -1, -1, 2, 3, -1, -1, -1, -1, 4, 5); __m256i GCAm = _mm256_setr_epi8(-1, -1, -1, -1, 6, 7, -1, -1, -1, -1, 8, 9, -1, -1, -1, -1, -1, -1, -1, -1, 6, 7, -1, -1, -1, -1, 8, 9, -1, -1, -1, -1); __m256i BACm = _mm256_setr_epi8(-1, -1, -1, -1, 0, 1, -1, -1, -1, -1, 2, 3, -1, -1, -1, -1, -1, -1, 10, 11, -1, -1, -1, -1, 12, 13, -1, -1, -1, -1, 14, 15); __m256i BBBm = _mm256_setr_epi8(-1, -1, 10, 11, -1, -1, -1, -1, 12, 13, -1, -1, -1, -1, 14, 15, -1, -1, -1, -1, 0, 1, -1, -1, -1, -1, 2, 3, -1, -1, -1, -1); __m256i BCAm = _mm256_setr_epi8( 4, 5, -1, -1, -1, -1, 6, 7, -1, -1, -1, -1, 8, 9, -1, -1, 4, 5, -1, -1, -1, -1, 6, 7, -1, -1, -1, -1, 8, 9, -1, -1); __m256i RAC = _mm256_shuffle_epi8(R0a, RACm); 
__m256i GAC = _mm256_shuffle_epi8(G0a, GACm); __m256i BAC = _mm256_shuffle_epi8(B0a, BACm); __m256i RBB = _mm256_shuffle_epi8(R0a, RBBm); __m256i GBB = _mm256_shuffle_epi8(G0a, GBBm); __m256i BBB = _mm256_shuffle_epi8(B0a, BBBm); __m256i RCA = _mm256_shuffle_epi8(R0a, RCAm); __m256i GCA = _mm256_shuffle_epi8(G0a, GCAm); __m256i BCA = _mm256_shuffle_epi8(B0a, BCAm); __m256i AC = _mm256_or_si256(RAC, GAC); AC = _mm256_or_si256(AC, BAC); __m256i BBa = _mm256_or_si256(RBB, GBB); BBa = _mm256_or_si256(BBa, BBB); __m256i CA = _mm256_or_si256(RCA, GCA); CA = _mm256_or_si256(CA, BCA); __m256i AAa = _mm256_permute2f128_si256(AC, CA, 2 << 4); __m256i CCa = _mm256_permute2f128_si256(AC, CA, (1 << 4) | 3); RAC = _mm256_shuffle_epi8(R0b, RACm); GAC = _mm256_shuffle_epi8(G0b, GACm); BAC = _mm256_shuffle_epi8(B0b, BACm); RBB = _mm256_shuffle_epi8(R0b, RBBm); GBB = _mm256_shuffle_epi8(G0b, GBBm); BBB = _mm256_shuffle_epi8(B0b, BBBm); RCA = _mm256_shuffle_epi8(R0b, RCAm); GCA = _mm256_shuffle_epi8(G0b, GCAm); BCA = _mm256_shuffle_epi8(B0b, BCAm); AC = _mm256_or_si256(RAC, GAC); AC = _mm256_or_si256(AC, BAC); __m256i BBb = _mm256_or_si256(RBB, GBB); BBb = _mm256_or_si256(BBb, BBB); CA = _mm256_or_si256(RCA, GCA); CA = _mm256_or_si256(CA, BCA); __m256i AAb = _mm256_permute2f128_si256(AC, CA, 2 << 4); __m256i CCb = _mm256_permute2f128_si256(AC, CA, (1 << 4) | 3); __m512i A = {mergeAaBa}; __m512i B = {mergeCaAb}; __m512i C = {mergeBbCb}; {store}'''. 
\ format(mergeAaBa=x86.setr(simd_ext, typ, 'AAa', 'BBa'), mergeCaAb=x86.setr(simd_ext, typ, 'CCa', 'AAb'), mergeBbCb=x86.setr(simd_ext, typ, 'BBb', 'CCb'), store=store3(simd_ext, typ, align, fmtspec, 'A', 'B', 'C'), **fmtspec) if typ in ['f32', 'i32', 'u32']: return \ '''__m512i ARGm = _mm512_setr_epi32( 0, 16, 0, 1, 17, 0, 2, 18, 0, 3, 19, 0, 4, 20, 0, 5); __m512i ARGBm = _mm512_setr_epi32( 0, 1, 16, 3, 4, 17, 6, 7, 18, 9, 10, 19, 12, 13, 20, 15); __m512i BRGm = _mm512_setr_epi32(21, 0, 6, 22, 0, 7, 23, 0, 8, 24, 0, 9, 25, 0, 10, 26); __m512i BRGBm = _mm512_setr_epi32( 0, 21, 2, 3, 22, 5, 6, 23, 8, 9, 24, 11, 12, 25, 14, 15); __m512i CRGm = _mm512_setr_epi32( 0, 11, 27, 0, 12, 28, 0, 13, 29, 0, 14, 30, 0, 15, 31, 0); __m512i CRGBm = _mm512_setr_epi32(26, 1, 2, 27, 4, 5, 28, 7, 8, 29, 10, 11, 30, 13, 14, 31); {styp} A = _mm512_permutex2var{suf}({in1}, ARGm, {in2}); A = _mm512_permutex2var{suf}(A, ARGBm, {in3}); {styp} B = _mm512_permutex2var{suf}({in1}, BRGm, {in2}); B = _mm512_permutex2var{suf}(B, BRGBm, {in3}); {styp} C = _mm512_permutex2var{suf}({in1}, CRGm, {in2}); C = _mm512_permutex2var{suf}(C, CRGBm, {in3}); {store}'''. 
\ format(store=store3(simd_ext, typ, align, fmtspec, 'A', 'B', 'C'), **fmtspec) if typ in ['f64', 'i64', 'u64']: return \ '''__m512i A_mask0 = _mm512_set_epi64(10, 2, 0, 9, 1, 0, 8, 0); __m512i A_mask1 = _mm512_set_epi64( 7, 6, 9, 4, 3, 8, 1, 0); {styp} A1 = _mm512_permutex2var{suf}({in1}, A_mask0, {in2}); {styp} A2 = _mm512_permutex2var{suf}(A1, A_mask1, {in3}); __m512i B_mask0 = _mm512_set_epi64( 5, 0, 12, 4, 0, 11, 3, 0); __m512i B_mask1 = _mm512_set_epi64( 7, 12, 5, 4, 11, 2, 1, 10); {styp} B1 = _mm512_permutex2var{suf}({in1}, B_mask0, {in2}); {styp} B2 = _mm512_permutex2var{suf}(B1, B_mask1, {in3}); __m512i C_mask0 = _mm512_set_epi64( 0, 15, 7, 0, 14, 6, 0, 13); __m512i C_mask1 = _mm512_set_epi64(15, 6, 5, 14, 3, 2, 13, 0); {styp} C1 = _mm512_permutex2var{suf}({in1}, C_mask0, {in2}); {styp} C2 = _mm512_permutex2var{suf}(C1, C_mask1, {in3}); {store}'''. \ format(store=store3(simd_ext, typ, align, fmtspec, 'A2', 'B2', 'C2'), **fmtspec) ================================================ FILE: examples/module_fixed_point.cpp ================================================ // Copyright (c) 2019 Agenium Scale // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to // deal in the Software without restriction, including without limitation the // rights to use, copy, modify, merge, publish, distribute, sublicense, and/or // sell copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in // all copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING // FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS // IN THE SOFTWARE. #include #include #include #include float rand_float() { return 4.0f * ((float) rand() / (float) RAND_MAX) - 2.0f; } int main() { // We use fixed point numbers with 8 bits of integer part and 8 bits of // decimal part. It will use 32 bits integers for internal storage. typedef nsimd::fixed_point::fp_t<8, 8> fp_t; typedef nsimd::fixed_point::pack fp_pack_t; const size_t v_size = nsimd::fixed_point::len(fp_t()); fp_t *input0 = (fp_t*)malloc(v_size * sizeof(fp_t)); fp_t *input1 = (fp_t *)malloc(v_size * sizeof(fp_t)); fp_t *res = (fp_t *)malloc(v_size * sizeof(fp_t)); // Input and output initializations for(size_t i = 0; i < nsimd::fixed_point::len(fp_t()); i++) { input0[i] = fp_t(rand_float()); input1[i] = fp_t(rand_float()); } fp_pack_t v0 = nsimd::fixed_point::loadu(input0); fp_pack_t v1 = nsimd::fixed_point::loadu(input1); fp_pack_t vres = nsimd::fixed_point::add(v0, v1); nsimd::fixed_point::storeu(res, vres); for(size_t i = 0; i < nsimd::fixed_point::len(fp_t()); i++) { std::cout << float(input0[i]) << " | " << float(input1[i]) << " | " << float(res[i]) << "\n"; } std::cout << std::endl; return EXIT_SUCCESS; } ================================================ FILE: examples/tutorial.cpp ================================================ #include #include #include #include template void uppercase_scalar(T *dst, const T *src, int n) { for (int i = 0; i < n; i++) { if (src[i] >= 'a' && src[i] <= 'z') { dst[i] = src[i] + ('A' - 'a'); } else { dst[i] = src[i]; } } } template void uppercase_simd(T *dst, const T *src, int n) { using namespace nsimd; typedef pack p_t; typedef packl pl_t; int l = len(); int i; for (i = 0; i + l <= n; i += l) { p_t text = loadu(src + i); pl_t mask = text >= 'a' && text <= 'z'; 
p_t then_pack = text + ('A' - 'a'); p_t TEXT = if_else(mask, then_pack, text); storeu(dst + i, TEXT); } pl_t mask = mask_for_loop_tail(i, n); p_t text = maskz_loadu(mask, src + i); p_t TEXT = if_else(text >= 'a' && text <= 'z', text + ('A' - 'a'), text); mask_storeu(mask, dst + i, TEXT); } int main(int argc, char **argv) { std::string input; for (int i = 1; i < argc; i++) { input += std::string(argv[i]); if (i < argc - 1) { input += std::string(" "); } } std::cout << "Orignal text : " << input << std::endl; std::vector dst_scalar(input.size() + 1); uppercase_scalar(&dst_scalar[0], (i8 *)input.c_str(), (int)input.size()); std::cout << "Scalar uppercase text: " << &dst_scalar[0] << std::endl; std::vector dst_simd(input.size() + 1); uppercase_simd(&dst_simd[0], (i8 *)input.c_str(), (int)input.size()); std::cout << "NSIMD uppercase text : " << &dst_simd[0] << std::endl; return 0; } ================================================ FILE: include/nsimd/c_adv_api.h ================================================ /* Copyright (c) 2021 Agenium Scale Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
IN THE SOFTWARE. */

#ifndef NSIMD_C_ADV_API_H
#define NSIMD_C_ADV_API_H

/* NOTE(review): the header name of this #include was stripped during
   extraction (presumably <nsimd/nsimd.h>) -- confirm against upstream. */
#include

/* The C advanced API requires C11: the generated header included below
   relies on _Generic-based dispatch. */
#if NSIMD_C >= 2011

/* Fallback symbol selected by the generated _Generic expressions when a
   type is not supported; calling it is a no-op. */
NSIMD_INLINE void nsimd_c11_type_unsupported(void) {}

/* ------------------------------------------------------------------------- */

/* NOTE(review): header name stripped here as well (presumably the generated
   advanced C API functions header) -- confirm against upstream. */
#include

/* ------------------------------------------------------------------------- */

/* We add by hand parametrized loads/stores. */

/* loads */

#define nsimd_load_aligned(type, ptr) nsimd_loada(type, ptr)
#define nsimd_load_unaligned(type, ptr) nsimd_loadu(type, ptr)

/* Dispatch on `alignment`, which must be the bare token `aligned` or
   `unaligned`; it is pasted to form one of the two macros above. */
#define nsimd_load(alignment, type, ptr) \
  NSIMD_PP_CAT_2(nsimd_load_, alignment)(type, ptr)

/* stores */

#define nsimd_store_aligned(ptr, vec) nsimd_storea(ptr, vec)
#define nsimd_store_unaligned(ptr, vec) nsimd_storeu(ptr, vec)
#define nsimd_store(alignment, ptr, vec) \
  NSIMD_PP_CAT_2(nsimd_store_, alignment)(ptr, vec)

/* ------------------------------------------------------------------------- */

/* Generic types: nsimd_pack(f32) expands to nsimd_pack_f32, etc. */

#define nsimd_pack(type) NSIMD_PP_CAT_2(nsimd_pack_, type)
#define nsimd_packl(type) NSIMD_PP_CAT_2(nsimd_packl_, type)
#define nsimd_packx2(type) NSIMD_PP_CAT_2(nsimd_packx2_, type)
#define nsimd_packx3(type) NSIMD_PP_CAT_2(nsimd_packx3_, type)
#define nsimd_packx4(type) NSIMD_PP_CAT_2(nsimd_packx4_, type)

/* NOTE(review): the five *_a macros below accept a simd_ext argument but
   never use it, and pass only two arguments to the 3-ary NSIMD_PP_CAT_3;
   a third argument involving simd_ext was probably lost during extraction
   -- confirm against upstream. */
#define nsimd_pack_a(type, simd_ext) NSIMD_PP_CAT_3(nsimd_pack_, type)
#define nsimd_packl_a(type, simd_ext) NSIMD_PP_CAT_3(nsimd_packl_, type)
#define nsimd_packx2_a(type, simd_ext) NSIMD_PP_CAT_3(nsimd_packx2_, type)
#define nsimd_packx3_a(type, simd_ext) NSIMD_PP_CAT_3(nsimd_packx3_, type)
#define nsimd_packx4_a(type, simd_ext) NSIMD_PP_CAT_3(nsimd_packx4_, type)

#endif /* NSIMD_C >= 2011 */

/* Fixed closing comment: it previously read NSIMD_C_ADV_API_HPP, which does
   not match the include guard NSIMD_C_ADV_API_H opened above. */
#endif /* NSIMD_C_ADV_API_H */

================================================
FILE: include/nsimd/cxx_adv_api.hpp
================================================ /* Copyright (c) 2021 Agenium Scale Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
*/ #ifndef NSIMD_CXX_ADV_API_HPP #define NSIMD_CXX_ADV_API_HPP #include #include // ---------------------------------------------------------------------------- namespace nsimd { // ---------------------------------------------------------------------------- // "mimic" static_assert in C++98 template struct nsimd_static_assert; template <> struct nsimd_static_assert {}; // ---------------------------------------------------------------------------- // Definition of pack template NSIMD_STRUCT pack; template NSIMD_STRUCT pack { typedef typename simd_traits::simd_vector simd_vector; typedef T value_type; typedef SimdExt simd_ext; static const int unroll = 1; static const int soa_num_packs = 1; simd_vector car; // Default ctor pack() {} // Ctor that splats template pack(S const &s) { car = set1(T(s), T(), SimdExt()); } // Ctor taking a SIMD vector pack(simd_vector v) { car = v; } // Underlying native SIMD vector getter simd_vector native_register() const { return car; } // Arithmetic and assignment operators pack &operator+=(pack const &other); pack &operator-=(pack const &other); pack &operator*=(pack const &other); pack &operator/=(pack const &other); pack &operator|=(pack const &other); pack &operator&=(pack const &other); pack &operator^=(pack const &other); pack &operator<<=(int); pack &operator>>=(int); // For std::cout'ing a pack friend std::ostream &operator<<(std::ostream &os, pack const &a0) { T buf[max_len_t::value]; storeu(buf, a0.car, T(), SimdExt()); os << "{ "; int n = len(a0); for (int i = 0; i < n; i++) { os << to_biggest(buf[i]); if (i < n - 1) { os << ", "; } } os << " }"; return os; } }; template NSIMD_STRUCT pack { typedef typename simd_traits::simd_vector simd_vector; typedef T value_type; typedef SimdExt simd_ext; static const int unroll = N; static const int soa_num_packs = 1; simd_vector car; pack cdr; // Default ctor pack() {} // Ctor that splats template pack(S const &s) : cdr(s) { car = set1(T(s), T(), SimdExt()); } // Arithmetic and 
assignment operators pack &operator+=(pack const &other); pack &operator-=(pack const &other); pack &operator*=(pack const &other); pack &operator/=(pack const &other); pack &operator|=(pack const &other); pack &operator&=(pack const &other); pack &operator^=(pack const &other); pack &operator<<=(int); pack &operator>>=(int); // For std::cout'ing a pack friend std::ostream &operator<<(std::ostream &os, pack const &a0) { os << pack(a0.car) << ", " << a0.cdr; return os; } }; #if NSIMD_CXX >= 2020 template struct is_pack_t : public std::false_type {}; template struct is_pack_t > : public std::true_type {}; template concept is_pack_c = is_pack_t::value; #define NSIMD_CONCEPT_PACK nsimd::is_pack_c #else #define NSIMD_CONCEPT_PACK typename #endif // ---------------------------------------------------------------------------- // Definition of logical template NSIMD_STRUCT packl; template NSIMD_STRUCT packl { typedef typename simd_traits::simd_vectorl simd_vectorl; simd_vectorl car; typedef T value_type; typedef SimdExt simd_ext; static const int unroll = 1; // Default ctor packl() {} // Ctor taking a SIMD vector packl(simd_vectorl v) { car = v; } // Ctor that splats template packl(S const &s) { car = set1l(int(s), T(), SimdExt()); } // Underlying native SIMD vector getter simd_vectorl native_register() const { return car; } // For std::cout'ing a packl friend std::ostream &operator<<(std::ostream &os, packl const &a0) { T buf[max_len_t::value]; storelu(buf, a0.car, T(), SimdExt()); os << "{ "; int n = len(a0); for (int i = 0; i < n; i++) { os << buf[i]; if (i < n - 1) { os << ", "; } } os << " }"; return os; } }; template NSIMD_STRUCT packl { typename simd_traits::simd_vectorl car; typedef T value_type; typedef SimdExt simd_ext; static const int unroll = N; packl cdr; // Default ctor packl() {} // Ctor that splats template packl(S const &s) : cdr(s) { car = set1l(int(s), T(), SimdExt()); } // For std::cout'ing a packl friend std::ostream &operator<<(std::ostream &os, 
packl const &a0) { os << packl(a0.car) << ", " << a0.cdr; return os; } }; #if NSIMD_CXX >= 2020 template struct is_packl_t : public std::false_type {}; template struct is_packl_t > : public std::true_type {}; template concept is_packl_c = is_packl_t::value; #define NSIMD_CONCEPT_PACKL nsimd::is_packl_c #else #define NSIMD_CONCEPT_PACKL typename #endif // ---------------------------------------------------------------------------- // Definition of SOA of degree 1 template NSIMD_STRUCT packx1; template NSIMD_STRUCT packx1 { typedef typename simd_traits::simd_vector simd_vector; typedef T value_type; typedef SimdExt simd_ext; static const int unroll = 1; static const int soa_num_packs = 1; pack v0; void set_car(simd_vector v0_) { v0.car = v0_; } }; template NSIMD_STRUCT packx1 { typedef typename simd_traits::simd_vector simd_vector; typedef T value_type; typedef SimdExt simd_ext; static const int unroll = N; static const int soa_num_packs = 1; pack v0; void set_car(simd_vector v0_) { v0.car = v0_; } void set_cdr(pack const &v0_) { v0.cdr = v0_; } }; #if NSIMD_CXX >= 2020 template struct is_packx1_t : public std::false_type {}; template struct is_packx1_t > : public std::true_type {}; template concept is_packx1_c = is_packx1_t::value; #define NSIMD_CONCEPT_PACKX1 nsimd::is_packx1_c #else #define NSIMD_CONCEPT_PACKX1 typename #endif // ---------------------------------------------------------------------------- // Definition of SOA of degree 2 template NSIMD_STRUCT packx2; template NSIMD_STRUCT packx2 { typedef typename simd_traits::simd_vector simd_vector; typedef T value_type; typedef SimdExt simd_ext; static const int unroll = 1; static const int soa_num_packs = 2; pack v0; pack v1; void set_car(simd_vector v0_, simd_vector v1_) { v0.car = v0_; v1.car = v1_; } }; template NSIMD_STRUCT packx2 { typedef typename simd_traits::simd_vector simd_vector; typedef T value_type; typedef SimdExt simd_ext; static const int unroll = N; static const int soa_num_packs = 2; pack v0; 
pack v1; void set_car(simd_vector v0_, simd_vector v1_) { v0.car = v0_; v1.car = v1_; } void set_cdr(pack const &v0_, pack const &v1_) { v0.cdr = v0_; v1.cdr = v1_; } }; #if NSIMD_CXX >= 2020 template struct is_packx2_t : public std::false_type {}; template struct is_packx2_t > : public std::true_type {}; template concept is_packx2_c = is_packx2_t::value; #define NSIMD_CONCEPT_PACKX2 nsimd::is_packx2_c #else #define NSIMD_CONCEPT_PACKX2 typename #endif // ---------------------------------------------------------------------------- // Definition of SOA of degree 3 template NSIMD_STRUCT packx3; template NSIMD_STRUCT packx3 { typedef typename simd_traits::simd_vector simd_vector; typedef T value_type; typedef SimdExt simd_ext; static const int unroll = 1; static const int soa_num_packs = 3; pack v0; pack v1; pack v2; void set_car(simd_vector v0_, simd_vector v1_, simd_vector v2_) { v0.car = v0_; v1.car = v1_; v2.car = v2_; } }; template NSIMD_STRUCT packx3 { typedef typename simd_traits::simd_vector simd_vector; typedef T value_type; typedef SimdExt simd_ext; static const int unroll = N; static const int soa_num_packs = 3; pack v0; pack v1; pack v2; void set_car(simd_vector v0_, simd_vector v1_, simd_vector v2_) { v0.car = v0_; v1.car = v1_; v2.car = v2_; } void set_cdr(pack const &v0_, pack const &v1_, pack const &v2_) { v0.cdr = v0_; v1.cdr = v1_; v2.cdr = v2_; } }; #if NSIMD_CXX >= 2020 template struct is_packx3_t : public std::false_type {}; template struct is_packx3_t > : public std::true_type {}; template concept is_packx3_c = is_packx3_t::value; #define NSIMD_CONCEPT_PACKX3 nsimd::is_packx3_c #else #define NSIMD_CONCEPT_PACKX3 typename #endif // ---------------------------------------------------------------------------- // Definition of SOA of degree 4 template NSIMD_STRUCT packx4; template NSIMD_STRUCT packx4 { typedef typename simd_traits::simd_vector simd_vector; typedef T value_type; typedef SimdExt simd_ext; static const int unroll = 1; static const int 
soa_num_packs = 4; pack v0; pack v1; pack v2; pack v3; void set_car(simd_vector v0_, simd_vector v1_, simd_vector v2_, simd_vector v3_) { v0.car = v0_; v1.car = v1_; v2.car = v2_; v3.car = v3_; } }; template NSIMD_STRUCT packx4 { typedef typename simd_traits::simd_vector simd_vector; typedef T value_type; typedef SimdExt simd_ext; static const int unroll = N; static const int soa_num_packs = 4; pack v0; pack v1; pack v2; pack v3; void set_car(simd_vector v0_, simd_vector v1_, simd_vector v2_, simd_vector v3_) { v0.car = v0_; v1.car = v1_; v2.car = v2_; v3.car = v3_; } void set_cdr( pack const &v0_, pack const &v1_, pack const &v2_, pack const &v3_) { v0.cdr = v0_; v1.cdr = v1_; v2.cdr = v2_; v3.cdr = v3_; } }; #if NSIMD_CXX >= 2020 template struct is_packx4_t : public std::false_type {}; template struct is_packx4_t > : public std::true_type {}; template concept is_packx4_c = is_packx4_t::value; #define NSIMD_CONCEPT_PACKX4 nsimd::is_packx4_c #else #define NSIMD_CONCEPT_PACKX4 typename #endif // ---------------------------------------------------------------------------- // A C++20 concept #if NSIMD_CXX >=2020 template concept any_pack_c = is_pack_c || is_packl_c || is_packx1_c || is_packx2_c || is_packx3_c || is_packx4_c; #define NSIMD_CONCEPT_ANY_PACK nsimd::any_pack_c #else #define NSIMD_CONCEPT_ANY_PACK typename #endif // ---------------------------------------------------------------------------- // The len function cannot be auto-generated template int len(pack const &) { return N * len(T(), SimdExt()); } template int len(packl const &) { return N * len(T(), SimdExt()); } template int len(packx1 const &) { return N * len(T(), SimdExt()); } template int len(packx2 const &) { return 2 * N * len(T(), SimdExt()); } template int len(packx3 const &) { return 3 * N * len(T(), SimdExt()); } template int len(packx4 const &) { return 4 * N * len(T(), SimdExt()); } template int len() { return len(Pack()); } // 
---------------------------------------------------------------------------- // The addv function cannot be auto-generated template T addv(pack const &a0) { return addv(a0.car, T(), SimdExt()); } template T addv(pack const &a0) { return addv(a0.car, T(), SimdExt()) + addv(a0.cdr); } // ---------------------------------------------------------------------------- // The all function cannot be auto-generated template int all(packl const &a0) { return all(a0.car, T(), SimdExt()); } template int all(packl const &a0) { return all(a0.car, T(), SimdExt()) && all(a0.cdr); } // ---------------------------------------------------------------------------- // The any function cannot be auto-generated template int any(packl const &a0) { return any(a0.car, T(), SimdExt()); } template int any(packl const &a0) { return any(a0.car, T(), SimdExt()) || any(a0.cdr); } // ---------------------------------------------------------------------------- // The nbtrue function cannot be auto-generated template int nbtrue(packl const &a0) { return nbtrue(a0.car, T(), SimdExt()); } template int nbtrue(packl const &a0) { return nbtrue(a0.car, T(), SimdExt()) + nbtrue(a0.cdr); } // ---------------------------------------------------------------------------- // Include functions that act on packs } // namespace nsimd #include namespace nsimd { // ---------------------------------------------------------------------------- // Arithmetic and assignment operators // add template pack &pack:: operator+=(pack const &other) { this->car = add(this->car, other.car, T()); return *this; } template pack &pack:: operator+=(pack const &other) { this->car = add(this->car, other.car, T()); this->cdr += other.cdr; return *this; } // sub template pack &pack:: operator-=(pack const &other) { this->car = sub(this->car, other.car, T()); return *this; } template pack &pack:: operator-=(pack const &other) { this->car = sub(this->car, other.car, T()); this->cdr -= other.cdr; return *this; } // mul template pack &pack:: 
operator*=(pack const &other) { this->car = mul(this->car, other.car, T()); return *this; } template pack &pack:: operator*=(pack const &other) { this->car = mul(this->car, other.car, T()); this->cdr *= other.cdr; return *this; } // div template pack &pack:: operator/=(pack const &other) { this->car = div(this->car, other.car, T()); return *this; } template pack &pack:: operator/=(pack const &other) { this->car = div(this->car, other.car, T()); this->cdr /= other.cdr; return *this; } // orb template pack &pack:: operator|=(pack const &other) { this->car = orb(this->car, other.car, T()); return *this; } template pack &pack:: operator|=(pack const &other) { this->car = orb(this->car, other.car, T()); this->cdr |= other.cdr; return *this; } // andb template pack &pack:: operator&=(pack const &other) { this->car = andb(this->car, other.car, T()); return *this; } template pack &pack:: operator&=(pack const &other) { this->car = andb(this->car, other.car, T()); this->cdr &= other.cdr; return *this; } // xorb template pack &pack:: operator^=(pack const &other) { this->car = xorb(this->car, other.car, T()); return *this; } template pack &pack:: operator^=(pack const &other) { this->car = xorb(this->car, other.car, T()); this->cdr ^= other.cdr; return *this; } // left shift template pack &pack::operator<<=(int s) { this->car = shl(this->car, s, T()); return *this; } template pack &pack::operator<<=(int s) { this->car = shl(this->car, s, T()); this->cdr <<= s; return *this; } // right shift template pack &pack::operator>>=(int s) { this->car = shr(this->car, s, T()); return *this; } template pack &pack::operator>>=(int s) { this->car = shr(this->car, s, T()); this->cdr >>= s; return *this; } // ---------------------------------------------------------------------------- // The if_else function cannot be auto-generated template NSIMD_REQUIRES(sizeof_v == sizeof_v) pack if_else(packl const &a0, pack const &a1, pack const &a2) { pack ret; ret.car = if_else(a0.car, a1.car, 
a2.car, L(), T(), SimdExt()); return ret; } template NSIMD_REQUIRES(sizeof_v == sizeof_v) pack if_else(packl const &a0, pack const &a1, pack const &a2) { pack ret; ret.car = if_else(a0.car, a1.car, a2.car, L(), T(), SimdExt()); ret.cdr = if_else(a0.cdr, a1.cdr, a2.cdr); return ret; } // ---------------------------------------------------------------------------- // Mask loads and stores cannot be auto-generated template NSIMD_REQUIRES(sizeof_v == sizeof_v) void mask_storea(packl const &a0, T *a1, pack const &a2) { mask_storea1(reinterpretl >(a0), a1, a2); } template NSIMD_REQUIRES(sizeof_v == sizeof_v) void mask_storeu(packl const &a0, T *a1, pack const &a2) { mask_storeu1(reinterpretl >(a0), a1, a2); } template NSIMD_REQUIRES(sizeof_v == sizeof_v) pack maskz_loada(packl const &a0, const T *a1) { return maskz_loada1(reinterpretl >(a0), a1); } template NSIMD_REQUIRES(sizeof_v == sizeof_v) pack maskz_loadu(packl const &a0, const T *a1) { return maskz_loadu1(reinterpretl >(a0), a1); } template NSIMD_REQUIRES(sizeof_v == sizeof_v) pack masko_loada(packl const &a0, const T *a1, pack const &a2) { return masko_loada1(reinterpretl >(a0), a1, a2); } template NSIMD_REQUIRES(sizeof_v == sizeof_v) pack masko_loadu(packl const &a0, const T *a1, pack const &a2) { return masko_loadu1(reinterpretl >(a0), a1, a2); } // ---------------------------------------------------------------------------- // Loads/Stores templated on the alignment cannot be auto-generated namespace detail { template struct loadz_return_t { typedef nsimd::pack type; }; template struct load_helper {}; template struct load_helper { typedef typename SimdVector::value_type T; typedef typename SimdVector::simd_ext simd_ext; static const int N = SimdVector::unroll; static SimdVector load(const T *a0) { return loada(a0); } static SimdVector loadl(const T *a0) { return loadla(a0); } static SimdVector load2(const T *a0) { return load2a(a0); } static SimdVector load3(const T *a0) { return load3a(a0); } static SimdVector 
load4(const T *a0) { return load4a(a0); } static SimdVector maskz_load(packl const &a0, const T *a1) { return maskz_loada(a0, a1); } static pack masko_load(packl const &a0, const T *a1, pack const &a2) { return masko_loada(a0, a1, a2); } }; template struct load_helper { typedef typename SimdVector::value_type T; typedef typename SimdVector::simd_ext simd_ext; static const int N = SimdVector::unroll; static SimdVector load(const T *a0) { return loadu(a0); } static SimdVector loadl(const T *a0) { return loadlu(a0); } static SimdVector load2(const T *a0) { return load2u(a0); } static SimdVector load3(const T *a0) { return load3u(a0); } static SimdVector load4(const T *a0) { return load4u(a0); } static SimdVector maskz_load(packl const &a0, const T *a1) { return maskz_loadu(a0, a1); } static pack masko_load(packl const &a0, const T *a1, pack const &a2) { return masko_loadu(a0, a1, a2); } }; template struct store_helper {}; #define NSIMD_T typename P::value_type template <> struct store_helper { template static void store(NSIMD_T *a0, P const &a1) { storea(a0, a1); } template #if NSIMD_CXX >= 2020 requires std::is_same_v #endif static void mask_store(PL const &a0, NSIMD_T *a1, P const &a2) { mask_storea(a0, a1, a2); } template static void storel(NSIMD_T *a0, P const &a1) { storela(a0, a1); } template static void store2(NSIMD_T *a0, P const &a1, P const &a2) { store2a(a0, a1, a2); } template static void store3(NSIMD_T *a0, P const &a1, P const &a2, P const &a3) { store3a(a0, a1, a2, a3); } template static void store4(NSIMD_T *a0, P const &a1, P const &a2, P const &a3, P const &a4) { store4a(a0, a1, a2, a3, a4); } }; template <> struct store_helper { template static void store(NSIMD_T *a0, P const &a1) { storeu(a0, a1); } template #if NSIMD_CXX >= 2020 requires std::is_same_v #endif static void mask_store(PL const &a0, NSIMD_T *a1, P const &a2) { mask_storeu(a0, a1, a2); } template static void storel(NSIMD_T *a0, P const &a1) { storelu(a0, a1); } template static void 
store2(NSIMD_T *a0, P const &a1, P const &a2) { store2u(a0, a1, a2); } template static void store3(NSIMD_T *a0, P const &a1, P const &a2, P const &a3) { store3u(a0, a1, a2, a3); } template static void store4(NSIMD_T *a0, P const &a1, P const &a2, P const &a3, P const &a4) { store4u(a0, a1, a2, a3, a4); } }; #undef NSIMD_T } // namespace detail template SimdVector load(const typename SimdVector::value_type *ptr) { return detail::load_helper::load(ptr); } template pack maskz_load(Packl const &pl, const typename Packl::value_type *ptr) { return detail::load_helper, Alignment>::maskz_load(pl, ptr); } template Pack masko_load(Packl const &pl, const typename Pack::value_type *ptr, Pack const &p) { return detail::load_helper::masko_load(pl, ptr, p); } template SimdVector loadl(const typename SimdVector::value_type *ptr) { return detail::load_helper::loadl(ptr); } template SimdVector load2(const typename SimdVector::value_type *ptr) { return detail::load_helper::load2(ptr); } template SimdVector load3(const typename SimdVector::value_type *ptr) { return detail::load_helper::load3(ptr); } template SimdVector load4(const typename SimdVector::value_type *ptr) { return detail::load_helper::load4(ptr); } template void store(typename Pack::value_type *ptr, Pack const &p) { detail::store_helper::store(ptr, p); } template void mask_store(Packl const &pl, typename Pack::value_type *ptr, Pack const &p) { detail::store_helper::mask_store(pl, ptr, p); } template void storel(typename Packl::value_type *ptr, Packl const &pl) { return detail::store_helper::storel(ptr, pl); } template void store2(typename Pack::value_type *ptr, Pack const &p1, Pack const &p2) { return detail::store_helper::store2(ptr, p1, p2); } template void store3(typename Pack::value_type *ptr, Pack const &p1, Pack const &p2, Pack const &p3) { return detail::store_helper::store3(ptr, p1, p2, p3); } template void store4(typename Pack::value_type *ptr, Pack const &p1, Pack const &p2, Pack const &p3, Pack const &p4) { 
return detail::store_helper::store4(ptr, p1, p2, p3, p4); } // ---------------------------------------------------------------------------- template T native_register(T a) { return a; } template typename pack::simd_vector native_register(pack const &a) { return a.car; } // ---------------------------------------------------------------------------- // get_pack template class packx, int Ix> struct get_pack_helper {}; // ---------------------------------------------------------------------------- // get_pack_helper - packx1 template struct get_pack_helper {}; template struct get_pack_helper { const nsimd::pack & operator()(const packx1 &packx_) const { return packx_.v0; } }; // ---------------------------------------------------------------------------- // get_pack_helper - packx2 template struct get_pack_helper {}; template struct get_pack_helper { const nsimd::pack & operator()(const packx2 &packx_) const { return packx_.v0; } }; template struct get_pack_helper { const nsimd::pack & operator()(const packx2 &packx_) const { return packx_.v1; } }; // ---------------------------------------------------------------------------- // get_pack_helper - packx3 template struct get_pack_helper {}; template struct get_pack_helper { const nsimd::pack & operator()(const packx3 &packx_) const { return packx_.v0; } }; template struct get_pack_helper { const nsimd::pack & operator()(const packx3 &packx_) const { return packx_.v1; } }; template struct get_pack_helper { const nsimd::pack & operator()(const packx3 &packx_) const { return packx_.v2; } }; // ---------------------------------------------------------------------------- // get_pack_helper - packx4 template struct get_pack_helper {}; template struct get_pack_helper { const nsimd::pack & operator()(const packx4 &packx_) const { return packx_.v0; } }; template struct get_pack_helper { const nsimd::pack & operator()(const packx4 &packx_) const { return packx_.v1; } }; template struct get_pack_helper { const nsimd::pack & 
operator()(const packx4 &packx_) const { return packx_.v2; } }; template struct get_pack_helper { const nsimd::pack & operator()(const packx4 &packx_) const { return packx_.v3; } }; // ---------------------------------------------------------------------------- // get_pack // get_pack for packx[Y] with Y = 1 template pack get_pack(const pack &pack_) { nsimd_static_assert<0 == Ix>(); return pack_; } // ---------------------------------------------------------------------------- // get_pack // get_pack for packx[Y] with Y in {2, 3, 4} template class packx> pack get_pack(const packx &packx_) { return get_pack_helper()(packx_); } // ---------------------------------------------------------------------------- // to_pack_trait template struct to_pack_trait {}; template class _packx> struct to_pack_trait<_packx > { typedef pack::soa_num_packs * N, SimdExt> value_type; }; // ---------------------------------------------------------------------------- // to_pack // to_pack for packx[Y] with Y = 1 template pack to_pack(const pack &pack_) { return pack_; } template pack to_pack(const pack &pack_) { return pack_; } // ---------------------------------------------------------------------------- // to_pack // to_pack for packx[Y] with Y in {2, 3, 4} template pack to_pack(const packx1 &packx_) { nsimd::pack pack_; pack_.car = packx_.v0.car; return pack_; } template pack to_pack(const packx2 &packx_) { nsimd::pack pack_; pack_.car = packx_.v0.car; pack_.cdr.car = packx_.v1.car; return pack_; } template pack to_pack(const packx3 &packx_) { nsimd::pack pack_; pack_.car = packx_.v0.car; pack_.cdr.car = packx_.v1.car; pack_.cdr.cdr.car = packx_.v2.car; return pack_; } template pack to_pack(const packx4 &packx_) { nsimd::pack pack_; pack_.car = packx_.v0.car; pack_.cdr.car = packx_.v1.car; pack_.cdr.cdr.car = packx_.v2.car; pack_.cdr.cdr.cdr.car = packx_.v3.car; return pack_; } // ---------------------------------------------------------------------------- // to_pack for packx[Y] 1), 
SimdExt> with Y in {2, 3, 4} // Advance template class packx> struct to_pack_recurs_helper { static pack to_pack(const packx &from_packx, const pack &from_pack) { pack to_pack_; to_pack_.car = from_pack.car; to_pack_.cdr = to_pack_recurs_helper::to_pack(from_packx, from_pack.cdr); return to_pack_; } }; // Base case // Base case condition: to_pack_unroll_ix == 1 template class packx> struct to_pack_recurs_helper { static pack to_pack(const packx &from_packx, const pack &from_pack) { (void)from_packx; pack to_pack_; to_pack_.car = from_pack.car; // simd_vector return to_pack_; } }; // Switch: from_packx[i] --> from_packx[i+1] // Switch condition: from_pack_unroll_ix == 1 && to_pack_unroll_ix > 1 template class packx> struct to_pack_recurs_helper { static pack to_pack(const packx &from_packx, const pack &from_pack) { pack to_pack_; to_pack_.car = from_pack.car; // simd_vector // get next pack to_pack_.cdr = to_pack_recurs_helper< T, from_pack_init_N, from_pack_init_N, to_pack_unroll_ix - 1, which_from_pack_ix + 1, SimdExt, packx>::to_pack(from_packx, get_pack(from_packx)); return to_pack_; } }; template class packx> typename to_pack_trait >::value_type to_pack(const packx &from_packx) { static const int to_pack_unroll_ix = packx::soa_num_packs * N; pack to_pack_; to_pack_.car = from_packx.v0.car; // simd_vector to_pack_.cdr = to_pack_recurs_helper< T, N /* from_pack_init_N*/, N - 1 /* from_pack_unroll_ix */, to_pack_unroll_ix - 1 /* to_pack_unroll_ix */, 0 /* which_from_pack_ix */, SimdExt, packx>::to_pack(from_packx, from_packx.v0.cdr); return to_pack_; } // ---------------------------------------------------------------------------- // to_pack_interleave template pack to_pack_interleave(const pack &pack_) { return pack_; } template pack to_pack_interleave(const pack &pack_) { return pack_; } // ---------------------------------------------------------------------------- template pack to_pack_interleave(const packx1 &packx1_) { pack pack_1; pack_1.car = 
packx1_.v0.car; pack_1.cdr = packx1_.v0.cdr; return pack_1; } template pack to_pack_interleave(const packx1 &packx1_N) { pack pack_1; pack_1.car = packx1_N.v0.car; pack_1.cdr = packx1_N.v0.cdr; return pack_1; } // ---------------------------------------------------------------------------- template pack to_pack_interleave(const packx2 &packx2_) { nsimd::pack pack_2; pack_2.car = packx2_.v0.car; pack_2.cdr.car = packx2_.v1.car; return pack_2; } template pack to_pack_interleave(const packx2 &packx2_N) { pack pack_2xN; pack_2xN.car = packx2_N.v0.car; pack_2xN.cdr.car = packx2_N.v1.car; packx2 packx2_n_1; packx2_n_1.v0 = packx2_N.v0.cdr; packx2_n_1.v1 = packx2_N.v1.cdr; pack_2xN.cdr.cdr = to_pack_interleave(packx2_n_1); return pack_2xN; } // ---------------------------------------------------------------------------- template pack to_pack_interleave(const packx3 &packx3_) { nsimd::pack pack_3; pack_3.car = packx3_.v0.car; pack_3.cdr.car = packx3_.v1.car; pack_3.cdr.cdr.car = packx3_.v2.car; return pack_3; } template pack to_pack_interleave(const packx3 &packx3_n) { pack pack_3xn; pack_3xn.car = packx3_n.v0.car; pack_3xn.cdr.car = packx3_n.v1.car; pack_3xn.cdr.cdr.car = packx3_n.v2.car; packx3 packx3_n_1; packx3_n_1.v0 = packx3_n.v0.cdr; packx3_n_1.v1 = packx3_n.v1.cdr; packx3_n_1.v2 = packx3_n.v2.cdr; pack_3xn.cdr.cdr.cdr = to_pack_interleave(packx3_n_1); return pack_3xn; } // ---------------------------------------------------------------------------- template pack to_pack_interleave(const packx4 &packx4_) { nsimd::pack pack_4; pack_4.car = packx4_.v0.car; pack_4.cdr.car = packx4_.v1.car; pack_4.cdr.cdr.car = packx4_.v2.car; pack_4.cdr.cdr.cdr.car = packx4_.v3.car; return pack_4; } template pack to_pack_interleave(const packx4 &packx4_n) { pack pack_4xn; pack_4xn.car = packx4_n.v0.car; pack_4xn.cdr.car = packx4_n.v1.car; pack_4xn.cdr.cdr.car = packx4_n.v2.car; pack_4xn.cdr.cdr.cdr.car = packx4_n.v3.car; packx4 packx4_n_1; packx4_n_1.v0 = packx4_n.v0.cdr; packx4_n_1.v1 
= packx4_n.v1.cdr; packx4_n_1.v2 = packx4_n.v2.cdr; packx4_n_1.v3 = packx4_n.v3.cdr; pack_4xn.cdr.cdr.cdr.cdr = to_pack_interleave(packx4_n_1); return pack_4xn; } } // namespace nsimd #endif ================================================ FILE: include/nsimd/cxx_adv_api_aliases.hpp ================================================ /* Copyright (c) 2021 Agenium Scale Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
*/ #ifndef NSIMD_CXX_ADV_API_ALIASES_HPP #define NSIMD_CXX_ADV_API_ALIASES_HPP #include namespace nsimd { /* ------------------------------------------------------------------------- */ template pack fabs(pack const &a0) { return abs(a0); } /* ------------------------------------------------------------------------- */ template pack fmin(pack const &a0, pack const &a1) { return min(a0, a1); } /* ------------------------------------------------------------------------- */ template pack fmax(pack const &a0, pack const &a1) { return max(a0, a1); } /* ------------------------------------------------------------------------- */ } // namespace nsimd #endif ================================================ FILE: include/nsimd/modules/fixed_point.hpp ================================================ /* Copyright (c) 2019 Agenium Scale Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
*/ #ifndef NSIMD_MODULES_FIXED_POINT_HPP #define NSIMD_MODULES_FIXED_POINT_HPP #include #include "nsimd/modules/fixed_point/fixed.hpp" #include "nsimd/modules/fixed_point/simd.hpp" #include "nsimd/modules/fixed_point/simd_math.hpp" namespace nsimd { namespace fixed_point { // ----------------------------------------------------------------------------- // ------------------------ Types definitions and len -------------------------- // ----------------------------------------------------------------------------- template NSIMD_STRUCT pack; template int len(const T &) { return fpsimd_n(T()); } template int len(const nsimd::fixed_point::pack &) { return fpsimd_n(fpsimd_t()); } template NSIMD_STRUCT pack { static const u8 lf = T::lf; static const u8 rt = T::rt; typedef fp_t value_type; fpsimd_t val; friend std::ostream &operator<<(std::ostream &os, pack &a0) { T *buf = new T[nsimd::fixed_point::len(a0)]; nsimd::fixed_point::simd_storeu( buf , a0.val ); os << "{ "; int n = nsimd::fixed_point::len(a0); for (int i = 0; i < n; i++) { os << buf[i]; if (i < n - 1) { os << ", "; } } os << " }"; delete[] buf; return os; } }; template NSIMD_STRUCT packl { static const u8 lf = T::lf; static const u8 rt = T::rt; typedef typename fp_t::logical_type value_type; fpsimdl_t val; }; // ----------------------------------------------------------------------------- // ------------------- Basic arithmetic operators ------------------------------ // ----------------------------------------------------------------------------- template NSIMD_INLINE pack add(const pack &a0, const pack &a1) { pack res; res.val = simd_add(a0.val, a1.val); return res; } template NSIMD_INLINE pack operator+(const pack &a0, const pack &a1) { return add( a0 , a1 ); } template NSIMD_INLINE pack sub(const pack &a0, const pack &a1) { pack res; res.val = simd_sub(a0.val, a1.val); return res; } template NSIMD_INLINE pack operator-(const pack &a0, const pack &a1) { return sub( a0 , a1 ); } template NSIMD_INLINE pack 
mul(const pack &a0, const pack &a1) { pack res; res.val = simd_mul(a0.val, a1.val); return res; } template NSIMD_INLINE pack operator*(const pack &a0, const pack &a1) { return mul( a0 , a1 ); } template NSIMD_INLINE pack div(const pack &a0, const pack &a1) { pack res; res.val = simd_div(a0.val, a1.val); return res; } template NSIMD_INLINE pack operator/(const pack &a0, const pack &a1) { return div( a0 , a1 ); } template NSIMD_INLINE pack fma(const pack &a0, const pack &a1, const pack &a2) { pack res; res.val = simd_fma(a0.val, a1.val, a2.val); return res; } template NSIMD_INLINE pack min(const pack &a0, const pack &a1) { pack res; res.val = simd_min(a0.val, a1.val); return res; } template NSIMD_INLINE pack max(const pack &a0, const pack &a1) { pack res; res.val = simd_max(a0.val, a1.val); return res; } // ----------------------------------------------------------------------------- // ------------------- Comparison operators ------------------------------------ // ----------------------------------------------------------------------------- template NSIMD_INLINE packl eq(const pack &a0, const pack &a1) { packl res; res.val = simd_eq(a0.val, a1.val); return res; } template NSIMD_INLINE pack operator==(const pack &a0, const pack &a1) { return eq( a0 , a1 ); } template NSIMD_INLINE packl ne(const pack &a0, const pack &a1) { packl res; res.val = simd_ne(a0.val, a1.val); return res; } template NSIMD_INLINE pack operator!=(const pack &a0, const pack &a1) { return ne( a0 , a1 ); } template NSIMD_INLINE packl le(const pack &a0, const pack &a1) { packl res; res.val = simd_le(a0.val, a1.val); return res; } template NSIMD_INLINE pack operator<=(const pack &a0, const pack &a1) { return le( a0 , a1 ); } template NSIMD_INLINE packl lt(const pack &a0, const pack &a1) { packl res; res.val = simd_lt(a0.val, a1.val); return res; } template NSIMD_INLINE pack operator<(const pack &a0, const pack &a1) { return lt( a0 , a1 ); } template NSIMD_INLINE packl ge(const pack &a0, const pack 
&a1) { packl res; res.val = simd_ge(a0.val, a1.val); return res; } template NSIMD_INLINE pack operator>=(const pack &a0, const pack &a1) { return ge( a0 , a1 ); } template NSIMD_INLINE packl gt(const pack &a0, const pack &a1) { packl res; res.val = simd_gt(a0.val, a1.val); return res; } template NSIMD_INLINE pack operator>(const pack &a0, const pack &a1) { return gt( a0 , a1 ); } template NSIMD_INLINE pack if_else1(const packl &a0, const pack &a1, const pack &a2) { pack res; res.val = simd_if_else1(a0.val, a1.val, a2.val); return res; } // ----------------------------------------------------------------------------- // ------------------- Bitwise operators -------------------------------------- // ----------------------------------------------------------------------------- template NSIMD_INLINE pack andb(const pack &a0, const pack &a1) { pack res; res.val = simd_andb(a0.val, a1.val); return res; } template NSIMD_INLINE packl andl(const packl &a0, const packl &a1) { packl res; res.val = simd_andl(a0.val, a1.val); return res; } template NSIMD_INLINE pack andnotb(const pack &a0, const pack &a1) { pack res; res.val = simd_andnotb(a0.val, a1.val); return res; } template NSIMD_INLINE packl andnotl(const packl &a0, const packl &a1) { packl res; res.val = simd_andnotl(a0.val, a1.val); return res; } template NSIMD_INLINE pack notb(pack a0) { pack res; res.val = simd_notb(a0.val); return res; } template NSIMD_INLINE packl notl(packl a0) { packl res; res.val = simd_notl(a0.val); return res; } template NSIMD_INLINE pack orb(const pack &a0, const pack &a1) { pack res; res.val = simd_orb(a0.val, a1.val); return res; } template NSIMD_INLINE packl orl(const packl &a0, const packl &a1) { packl res; res.val = simd_orl(a0.val, a1.val); return res; } template NSIMD_INLINE pack xorb(const pack &a0, const pack &a1) { pack res; res.val = simd_xorb(a0.val, a1.val); return res; } template NSIMD_INLINE packl xorl(const packl &a0, const packl &a1) { packl res; res.val = simd_xorl(a0.val, 
a1.val); return res; } // ----------------------------------------------------------------------------- // ------------------- Math functions ------------------------------------------ // ----------------------------------------------------------------------------- template NSIMD_INLINE pack abs(pack a0) { pack res; res.val = simd_abs(a0.val); return res; } template NSIMD_INLINE pack rec(pack a0) { pack res; res.val = simd_rec(a0.val); return res; } // ----------------------------------------------------------------------------- // -------------------- Load functions ----------------------------------------- // ----------------------------------------------------------------------------- template NSIMD_INLINE T set1(typename T::value_type a0) { T res; res.val = simd_set1(a0); return res; } template NSIMD_INLINE T loadu(typename T::value_type *p) { T res; res.val = simd_loadu(p); return res; } template NSIMD_INLINE T loada(typename T::value_type *p) { T res; res.val = simd_loada(p); return res; } template NSIMD_INLINE T loadlu(typename T::value_type *p) { T res; res.val = simd_loadlu(p); return res; } template NSIMD_INLINE T loadla(typename T::value_type *p) { T res; res.val = simd_loadla(p); return res; } // ----------------------------------------------------------------------------- // -------------------- Store functions ---------------------------------------- // ----------------------------------------------------------------------------- template NSIMD_INLINE void storeu(typename T::value_type *p, T v) { simd_storeu(p, v.val); } template NSIMD_INLINE void storea(typename T::value_type *p, T v) { simd_storea(p, v.val); } template NSIMD_INLINE void storelu(typename T::value_type *p, T v) { simd_storelu(p, v.val); } template NSIMD_INLINE void storela(typename T::value_type *p, T v) { simd_storela(p, v.val); } } // namespace fixed_point } // namespace nsimd #endif ================================================ FILE: include/nsimd/modules/memory_management.hpp 
================================================ /* Copyright (c) 2021 Agenium Scale Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
*/ #ifndef NSIMD_MODULES_MEMORY_MANAGEMENT_HPP #define NSIMD_MODULES_MEMORY_MANAGEMENT_HPP #include #include #include #include namespace nsimd { // ---------------------------------------------------------------------------- // CUDA #if defined(NSIMD_CUDA) template T *device_malloc(size_t sz) { void *ret; if (cudaMalloc(&ret, sz * sizeof(T)) != cudaSuccess) { return NULL; } return (T *)ret; } template T *device_calloc(size_t sz) { void *ret; if (cudaMalloc(&ret, sz * sizeof(T)) != cudaSuccess) { return NULL; } if (cudaMemset((void *)ret, 0, sz * sizeof(T)) != cudaSuccess) { cudaFree(ret); return NULL; } return (T *)ret; } template void device_free(T *ptr) { cudaFree((void *)ptr); } template void copy_to_device(T *device_ptr, T *host_ptr, size_t sz) { cudaMemcpy((void *)device_ptr, (void *)host_ptr, sz * sizeof(T), cudaMemcpyHostToDevice); } template void copy_to_host(T *host_ptr, T *device_ptr, size_t sz) { cudaMemcpy((void *)host_ptr, (void *)device_ptr, sz * sizeof(T), cudaMemcpyDeviceToHost); } #define nsimd_fill_dev_mem_func(func_name, expr) \ template \ __global__ void kernel_##func_name##_(T *ptr, int n) { \ int i = threadIdx.x + blockIdx.x * blockDim.x; \ if (i < n) { \ ptr[i] = (T)(expr); \ } \ } \ \ template void func_name(T *ptr, size_t sz) { \ kernel_##func_name##_<<<(unsigned int)((sz + 127) / 128), 128>>>( \ ptr, int(sz)); \ } // ---------------------------------------------------------------------------- // ROCm #elif defined(NSIMD_ROCM) template T *device_malloc(size_t sz) { void *ret; if (hipMalloc(&ret, sz * sizeof(T)) != hipSuccess) { return NULL; } return (T *)ret; } template T *device_calloc(size_t sz) { void *ret; if (hipMalloc(&ret, sz * sizeof(T)) != hipSuccess) { return NULL; } if (hipMemset((void *)ret, 0, sz * sizeof(T)) != hipSuccess) { hipFree(ret); return NULL; } return (T *)ret; } template void device_free(T *ptr) { hipFree((void *)ptr); } template void copy_to_device(T *device_ptr, T *host_ptr, size_t sz) { hipMemcpy((void 
*)device_ptr, (void *)host_ptr, sz * sizeof(T), hipMemcpyHostToDevice); } template void copy_to_host(T *host_ptr, T *device_ptr, size_t sz) { hipMemcpy((void *)host_ptr, (void *)device_ptr, sz * sizeof(T), hipMemcpyDeviceToHost); } #define nsimd_fill_dev_mem_func(func_name, expr) \ template \ __global__ void kernel_##func_name##_(T *ptr, size_t n) { \ size_t i = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x; \ if (i < n) { \ ptr[i] = (T)(expr); \ } \ } \ \ template void func_name(T *ptr, size_t sz) { \ hipLaunchKernelGGL((kernel_##func_name##_), \ (size_t)((sz + 127) / 128), 128, 0, NULL, ptr, \ (size_t)sz); \ } // ---------------------------------------------------------------------------- // oneAPI #elif defined(NSIMD_ONEAPI) template T *device_malloc(const size_t sz) { return sycl::malloc_device(sz, nsimd::oneapi::default_queue()); } template T *device_calloc(const size_t sz) { sycl::queue q = nsimd::oneapi::default_queue(); T *const ret = sycl::malloc_device(sz, q); if (ret == NULL) { return NULL; } q.memset((void *)ret, 0, sz * sizeof(T)).wait_and_throw(); return ret; } template void device_free(T *const ptr) { sycl::queue q = nsimd::oneapi::default_queue(); sycl::free(ptr, q); } template void copy_to_device(T *const device_ptr, const T *const host_ptr, const size_t sz) { sycl::queue q = nsimd::oneapi::default_queue(); q.memcpy((void *)device_ptr, (const void *)host_ptr, sz * sizeof(T)) .wait_and_throw(); } template void copy_to_host(T *const host_ptr, const T *const device_ptr, size_t sz) { sycl::queue q = nsimd::oneapi::default_queue(); q.memcpy((void *)host_ptr, (const void *)device_ptr, sz * sizeof(T)) .wait_and_throw(); } #define nsimd_fill_dev_mem_func(func_name, expr) \ template \ void kernel_##func_name##_(T *const ptr, const size_t sz, \ sycl::nd_item<1> item) { \ const size_t i = item.get_global_id().get(0); \ if (i < sz) { \ ptr[i] = nsimd::to(expr); \ } \ } \ \ template void func_name(T *const ptr, const size_t sz) { \ const size_t 
total_num_threads = \ nsimd::compute_total_num_threads(sz, THREADS_PER_BLOCK); \ sycl::queue q = nsimd::oneapi::default_queue(); \ q.parallel_for(sycl::nd_range<1>(sycl::range<1>(total_num_threads), \ sycl::range<1>(THREADS_PER_BLOCK)), \ [=](sycl::nd_item<1> item) { \ kernel_##func_name##_(ptr, sz, item); \ }) \ .wait_and_throw(); \ } // ---------------------------------------------------------------------------- // CPU #else template T *device_malloc(size_t sz) { return (T *)malloc(sz * sizeof(T)); } template T *device_calloc(size_t sz) { return (T *)calloc(sz * sizeof(T), 1); } template void device_free(T *ptr) { free((void *)ptr); } template void copy_to_device(T *device_ptr, T *host_ptr, size_t sz) { memcpy((void *)device_ptr, (void *)host_ptr, sz * sizeof(T)); } template void copy_to_host(T *host_ptr, T *device_ptr, size_t sz) { memcpy((void *)host_ptr, (void *)device_ptr, sz * sizeof(T)); } #define nsimd_fill_dev_mem_func(func_name, expr) \ template void func_name(T *ptr, size_t sz) { \ for (size_t i = 0; i < sz; i++) { \ ptr[i] = nsimd::to(expr); \ } \ } #endif // ---------------------------------------------------------------------------- // Pair of pointers template struct paired_pointers_t { T *device_ptr, *host_ptr; size_t sz; }; template paired_pointers_t pair_malloc(size_t sz) { paired_pointers_t ret; ret.sz = 0; #if defined(NSIMD_CUDA) || defined(NSIMD_ROCM) || defined(NSIMD_ONEAPI) ret.device_ptr = device_malloc(sz); if (ret.device_ptr == NULL) { ret.host_ptr = NULL; return ret; } ret.host_ptr = (T *)malloc(sz); if (ret.host_ptr == NULL) { device_free(ret.device_ptr); ret.device_ptr = NULL; return ret; } #else ret.device_ptr = device_malloc(sz); ret.host_ptr = ret.device_ptr; #endif ret.sz = sz; return ret; } template paired_pointers_t pair_malloc_or_exit(size_t sz) { paired_pointers_t ret = pair_malloc(sz); if (ret.device_ptr == NULL) { std::cerr << __FILE__ << ":" << __LINE__ << ": error cannot malloc " << sz << " bytes" << std::endl; 
exit(EXIT_FAILURE); } return ret; } template paired_pointers_t pair_calloc(size_t sz) { paired_pointers_t ret; ret.sz = 0; #if defined(NSIMD_CUDA) || defined(NSIMD_ROCM) || defined(NSIMD_ONEAPI) ret.device_ptr = device_calloc(sz); if (ret.device_ptr == NULL) { ret.host_ptr = NULL; return ret; } ret.host_ptr = calloc(sz, 1); if (ret.host_ptr == NULL) { device_free(ret.device_ptr); ret.device_ptr = NULL; return ret; } #else ret.device_ptr = device_calloc(sz); ret.host_ptr = ret.device_ptr; #endif ret.sz = sz; return ret; } template paired_pointers_t pair_calloc_or_exit(size_t sz) { paired_pointers_t ret = pair_calloc(sz); if (ret.device_ptr == NULL) { std::cerr << __FILE__ << ":" << __LINE__ << ": error cannot calloc " << sz << " bytes" << std::endl; exit(EXIT_FAILURE); } return ret; } template void pair_free(paired_pointers_t p) { #if defined(NSIMD_CUDA) || defined(NSIMD_ROCM) || defined(NSIMD_ONEAPI) device_free(p.device_free); free((void *)p.host_ptr); #else free((void *)p.host_ptr); #endif } template void copy_to_device(paired_pointers_t p) { #if defined(NSIMD_CUDA) || defined(NSIMD_ROCM) || defined(NSIMD_ONEAPI) copy_to_device(p.device_ptr, p.host_ptr, p.sz); #else (void)p; #endif } template void copy_to_host(paired_pointers_t p) { #if defined(NSIMD_CUDA) || defined(NSIMD_ROCM) || defined(NSIMD_ONEAPI) copy_to_host(p.host_ptr, p.device_ptr, p.sz); #else (void)p; #endif } } // namespace nsimd #endif ================================================ FILE: include/nsimd/modules/spmd.hpp ================================================ /* Copyright (c) 2020 Agenium Scale Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject 
to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ #ifndef NSIMD_MODULES_SPMD_HPP #define NSIMD_MODULES_SPMD_HPP #include #include #include #include namespace spmd { #if NSIMD_CXX < 2011 || NSIMD_C < 1999 #define NSIMD_VARIADIC_MACROS_IS_EXTENSION #endif #ifdef NSIMD_VARIADIC_MACROS_IS_EXTENSION #if defined(NSIMD_IS_GCC) /* Not emitting the warning -Wvariadic-macros is not possible with GCC <= 12. It is a bug. A workaround is to tell GCC to consider this header file as a system header file so that all warnings are not emitted. This is not satisfying but necessary for the moment. */ #pragma GCC system_header #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wvariadic-macros" #elif defined(NSIMD_IS_CLANG) #pragma clang diagnostic push #pragma clang diagnostic ignored "-Wvariadic-macros" #endif #endif // ---------------------------------------------------------------------------- // GPUs: CUDA, ROCm or oneAPI #if defined(NSIMD_CUDA) || defined(NSIMD_ROCM) || defined(NSIMD_ONEAPI) #if defined(NSIMD_CUDA) // 1d kernel definition #define spmd_kernel_1d(name, ...) \ template __global__ void name(__VA_ARGS__, int n) { \ int spmd_i_ = threadIdx.x + blockIdx.x * blockDim.x; \ if (spmd_i_ < n) { // templated kernel definition #define spmd_tmpl_kernel_1d(name, template_argument, ...) 
\ template \ __global__ void name(__VA_ARGS__, int n) { \ int spmd_i_ = threadIdx.x + blockIdx.x * blockDim.x; \ if (spmd_i_ < n) { #elif defined(NSIMD_ROCM) // 1d kernel definition #define spmd_kernel_1d(name, ...) \ template \ __global__ void name(__VA_ARGS__, size_t n) { \ size_t spmd_i_ = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x; \ if (spmd_i_ < n) { // templated kernel definition #define spmd_tmpl_kernel_1d(name, template_argument, ...) \ template \ __global__ void name(__VA_ARGS__, size_t n) { \ size_t spmd_i_ = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x; \ if (spmd_i_ < n) { #else // 1d kernel definition #define spmd_kernel_1d(name, ...) \ template \ inline void name(__VA_ARGS__, const size_t n, sycl::nd_item<1> item) { \ size_t spmd_i_ = item.get_global_id().get(0); \ if (spmd_i_ < n) { // templated kernel definition #define spmd_tmpl_kernel_1d(name, template_argument, ...) \ template \ inline void name(__VA_ARGS__, const size_t n, sycl::nd_item<1> item) { \ size_t spmd_i_ = item.get_global_id().get(0); \ if (spmd_i_ < n) { #endif #define spmd_kernel_end \ } \ } #if defined(NSIMD_CUDA) || defined(NSIMD_ROCM) // device function #define spmd_dev_func(type_name, ...) \ template __device__ type_name(__VA_ARGS__) { // templated device function #define spmd_tmpl_dev_func(type_name, template_argument, ...) \ template \ __device__ type_name(__VA_ARGS__) { #else // device function #define spmd_dev_func(type_name, ...) \ template type_name(__VA_ARGS__) { // templated device function #define spmd_tmpl_dev_func(type_name, template_argument, ...) \ template \ type_name(__VA_ARGS__) { #endif #define spmd_dev_func_end } // call spmd_dev_function #define spmd_call_dev_func(name, ...) name(__VA_ARGS__) // call templated spmd_dev_function #define spmd_call_tmpl_dev_func(name, template_argument, ...) \ name(__VA_ARGS__) #if defined(NSIMD_CUDA) // launch 1d kernel CUDA #define spmd_launch_kernel_1d(name, spmd_scalar_bits_, threads_per_block, n, \ ...) 
\ name \ <<<(unsigned int)nsimd_kernel_param(n, threads_per_block), \ (unsigned int)(threads_per_block)>>>(__VA_ARGS__, (int)n) #elif defined(NSIMD_ROCM) // launch 1d kernel ROCm #define spmd_launch_kernel_1d(name, spmd_scalar_bits_, threads_per_block, n, \ ...) \ hipLaunchKernelGGL((name), \ (size_t)nsimd_kernel_param(n, threads_per_block), \ (size_t)(threads_per_block), 0, NULL, __VA_ARGS__, \ (size_t)n) #else // launch 1d kernel oneAPI #define spmd_launch_kernel_1d(name, spmd_scalar_bits_, threads_per_block, n, \ ...) \ size_t total_num_threads = \ (size_t)nsimd_kernel_param(n, threads_per_block); \ sycl::queue q = nsimd::oneapi::default_queue(); \ q.parallel_for(sycl::nd_range<1>(sycl::range<1>(total_num_threads), \ sycl::range<1>(threads_per_block)), \ [=](sycl::nd_item<1> item) { \ name(__VA_ARGS__, (size_t)n, item); \ }) \ .wait_and_throw(); #endif // supported types (generic) template struct type_t {}; // supported types (scalar) template <> struct type_t<8> { typedef i8 itype; typedef u8 utype; typedef bool btype; }; template <> struct type_t<16> { typedef i16 itype; typedef u16 utype; typedef f16 ftype; typedef bool btype; }; template <> struct type_t<32> { typedef i32 itype; typedef u32 utype; typedef f32 ftype; typedef bool btype; }; template <> struct type_t<64> { typedef i64 itype; typedef u64 utype; typedef f64 ftype; typedef bool btype; }; // supported types (generic) #define k_int typename spmd::type_t::itype #define k_uint typename spmd::type_t::utype #define k_float typename spmd::type_t::ftype #define k_bool typename spmd::type_t::btype // loads and stores (generic) #define k_store(base_addr, value) \ do { \ base_addr[spmd_i_] = value; \ } while (0) #define k_unmasked_store(base_addr, value) k_store(base_addr, value) #define k_load(base_addr) base_addr[spmd_i_] #define k_unmasked_load(base_addr) k_load(base_addr) // f32 <--> f16 conversions #if defined(NSIMD_CUDA) || defined(NSIMD_ROCM) #define k_f32_to_f16(a) __float2half(a) #define 
k_f16_to_f32(a) __half2float(a) #else #define k_f32_to_f16(a) f16(a) #define k_f16_to_f32(a) static_cast(a) #endif // assignment statement #define k_set(var, value) \ do { \ var = value; \ } while (0) #define k_unmasked_set(var, value) k_set(var, value) // while statement (k_while) #define k_while(cond) while (cond) { #define k_endwhile } // break statement (k_break) #define k_break break // continue statement (k_continue) #define k_continue continue // endwhile statement (k_endwhile) #define k_endwhile } // if statement (k_if) #define k_if(cond) if (cond) { // elseif statement (k_elseif) #define k_elseif(cond) \ } \ else if (cond) { // else statement (k_else) #define k_else \ } \ else { // endif statement (k_endif) #define k_endif } // ---------------------------------------------------------------------------- // SIMD and SCALAR: dispatch between the two is done on a type #else // helpers template nsimd::pack to_pack(T a) { return nsimd::pack(a); } template nsimd::pack to_pack(nsimd::pack const &a) { return a; } template nsimd::packl to_packl(bool a) { return nsimd::packl(int(a)); } template nsimd::packl to_packl(Pack const &a) { return nsimd::reinterpretl >(a); } template struct base_type { typedef T type; }; template struct base_type > { typedef T type; }; template struct base_type > { typedef T type; }; // type indicating SIMD or scalar kernel struct KernelScalar {}; struct KernelSIMD {}; // common to all function: mainly to avoid warnings #define spmd_func_begin_ \ (void)spmd_i_; \ (void)spmd_mask_; \ k_bool spmd_off_lanes_return_(false); \ (void)spmd_off_lanes_return_; \ k_bool spmd_off_lanes_break_(false); \ (void)spmd_off_lanes_break_; \ k_bool spmd_off_lanes_continue_(false); \ (void)spmd_off_lanes_continue_; // 1d kernel definition #define spmd_kernel_1d(name, ...) \ template \ void name(nsimd_nat spmd_i_, spmd_MaskType_ spmd_mask_, __VA_ARGS__) { \ spmd_func_begin_ // templated kernel definition #define spmd_tmpl_kernel_1d(name, template_argument, ...) 
\ template \ void name(nsimd_nat spmd_i_, spmd_MaskType_ spmd_mask_, __VA_ARGS__) { \ spmd_func_begin_ #define spmd_kernel_end } // device function #define spmd_dev_func(type_name, ...) \ template \ type_name(nsimd_nat spmd_i_, spmd_MaskType_ spmd_mask_, __VA_ARGS__) { \ spmd_func_begin_ // templated device function #define spmd_tmpl_dev_func(type_name, template_argument, ...) \ template \ type_name(nsimd_nat spmd_i_, spmd_MaskType_ spmd_mask_, __VA_ARGS__) { \ spmd_func_begin_ #define spmd_dev_func_end } // call spmd_dev_function #define spmd_call_dev_func(name, ...) \ name(spmd_i_, spmd_mask_, \ __VA_ARGS__) // call templated spmd_dev_function #define spmd_call_tmpl_dev_func(name, template_argument, ...) \ name( \ spmd_i_, spmd_mask_, __VA_ARGS__) // launch 1d kernel #define spmd_launch_kernel_1d(name, spmd_scalar_bits_, spmd_unroll_, spmd_n_, \ ...) \ { \ spmd::type_t::btype \ spmd_mask_(true); \ nsimd_nat spmd_i_; \ nsimd_nat len = \ nsimd::len(spmd::type_t::itype()); \ for (spmd_i_ = 0; spmd_i_ + len <= spmd_n_; spmd_i_ += len) { \ name( \ spmd_i_, spmd_mask_, __VA_ARGS__); \ } \ for (; spmd_i_ < spmd_n_; spmd_i_++) { \ name( \ spmd_i_, true, __VA_ARGS__); \ } \ } // launch 1d templated kernel #define spmd_launch_tmpl_kernel_1d( \ name, template_argument, spmd_scalar_bits_, spmd_unroll_, spmd_n_, ...) 
\ { \ typename spmd::type_t::btype spmd_mask_(true); \ nsimd_nat spmd_i_; \ nsimd_nat len = \ nsimd::len(typename spmd::type_t::itype()); \ for (spmd_i_ = 0; spmd_i_ + len <= spmd_n_; spmd_i_ += len) { \ name(spmd_i_, spmd_mask_, __VA_ARGS__); \ } \ for (; spmd_i_ < spmd_n_; spmd_i_++) { \ name(spmd_i_, true, __VA_ARGS__); \ } \ } // supported types (generic) template struct type_t {}; // supported types (scalar) template struct type_t { typedef i8 itype; typedef u8 utype; typedef bool btype; }; template struct type_t { typedef i16 itype; typedef u16 utype; typedef f16 ftype; typedef bool btype; }; template struct type_t { typedef i32 itype; typedef u32 utype; typedef f32 ftype; typedef bool btype; }; template struct type_t { typedef i64 itype; typedef u64 utype; typedef f64 ftype; typedef bool btype; }; // supported types (SIMD) template struct type_t { typedef nsimd::pack itype; typedef nsimd::pack utype; typedef nsimd::packl btype; }; template struct type_t { typedef nsimd::pack itype; typedef nsimd::pack utype; typedef nsimd::pack ftype; typedef nsimd::packl btype; }; template struct type_t { typedef nsimd::pack itype; typedef nsimd::pack utype; typedef nsimd::pack ftype; typedef nsimd::packl btype; }; template struct type_t { typedef nsimd::pack itype; typedef nsimd::pack utype; typedef nsimd::pack ftype; typedef nsimd::packl btype; }; // supported types (generic) #define k_int \ typename spmd::type_t::itype #define k_uint \ typename spmd::type_t::utype #define k_float \ typename spmd::type_t::ftype #define k_bool \ typename spmd::type_t::btype // loads and stores (generic) template struct store_helper {}; template struct load_helper {}; #define k_store(base_addr, value) \ spmd::store_helper::impl(spmd_mask_, &base_addr[spmd_i_], \ value) #define k_unmasked_store(base_addr, value) \ spmd::store_helper::unmasked_impl(&base_addr[spmd_i_], \ value) #define k_load(base_addr) \ spmd::load_helper::impl(spmd_mask_, &base_addr[spmd_i_]) #define 
k_unmasked_load(base_addr) \ spmd::load_helper::template unmasked_impl( \ &base_addr[spmd_i_]) // loads and stores (scalar) template <> struct store_helper { template static void impl(bool mask, T *addr, S value) { if (mask) { *addr = nsimd::to(value); } } template static void unmasked_impl(T *addr, S value) { *addr = nsimd::to(value); } }; template <> struct load_helper { template static T impl(bool mask, T *addr) { if (mask) { return *addr; } else { return nsimd::to(0); } } template static T unmasked_impl(T *addr) { return *addr; } }; template <> struct store_helper { template static void impl(nsimd::packl const &mask, S *addr, nsimd::pack const &value) { nsimd::mask_storeu(mask, addr, value); } template static void impl(nsimd::packl const &mask, S *addr, U value) { nsimd::mask_storeu(mask, addr, nsimd::pack(nsimd::to(value))); } template static void unmasked_impl(T *addr, nsimd::pack const &value) { nsimd::storeu(addr, value); } template static void unmasked_impl(T *addr, S value) { nsimd::storeu(addr, nsimd::pack(nsimd::to(value))); } }; template <> struct load_helper { template static nsimd::pack impl(nsimd::packl const &mask, S *addr) { return nsimd::maskz_loadu(mask, addr); } template static nsimd::pack unmasked_impl(T *addr) { return nsimd::loadu >(addr); } }; // f32 <--> f16 conversions #define k_f32_to_f16(a) nsimd_f32_to_f16(a) #define k_f16_to_f32(a) nsimd_f16_to_f32(a) // Clear lanes template nsimd::packl clear_lanes(nsimd::packl const &mask, nsimd::packl const &lanes) { return nsimd::andnotl(mask, lanes); } inline bool clear_lanes(bool mask, bool lanes) { return lanes ? 
false : mask; } // assignment statement template void k_set_(bool mask, T &var, S value) { if (mask) { var = nsimd::to(value); } } template void k_set_(nsimd::packl const &mask, nsimd::pack &var, U value) { var = nsimd::if_else(mask, nsimd::pack(S(value)), var); } template void k_set_(nsimd::packl const &mask, nsimd::pack &var, nsimd::pack const &value) { var = nsimd::if_else(mask, value, var); } template void k_set_(nsimd::packl const &mask, nsimd::packl &var, U value) { var = nsimd::reinterpretl >( mask && nsimd::pack(int(value))); } template void k_set_(nsimd::packl const &mask, nsimd::packl &var, nsimd::packl const &value) { var = nsimd::reinterpretl >(mask && value); } #define k_set(var, value) spmd::k_set_(spmd_mask_, var, value) #define k_unmasked_set(var, value) \ do { \ var = value; \ } while (0) template bool any(nsimd::packl const a) { return nsimd::any(a); } template typename type_t::btype to_k_bool_(Packl const &a) { return nsimd::reinterpretl< typename type_t::btype>(a); } template inline bool to_k_bool_(bool a) { return a; } #define k_to_bool(a) \ spmd::to_k_bool_(a) inline bool any(bool a) { return a; } // while statement (k_while) #define k_while(cond) \ { \ k_bool spmd_middle_mask_ = spmd_mask_; \ k_bool spmd_off_lanes_break_(false); \ (void)spmd_off_lanes_break_; \ k_bool spmd_off_lanes_continue_(false); \ (void)spmd_off_lanes_continue_; \ { \ while (spmd::any(cond)) { \ k_bool spmd_cond_ = \ spmd::to_k_bool_( \ cond); \ { \ k_bool spmd_mask_ = spmd_cond_ && spmd_middle_mask_; \ spmd_mask_ = spmd::clear_lanes(spmd_mask_, spmd_off_lanes_break_); \ spmd_mask_ = spmd::clear_lanes(spmd_mask_, spmd_off_lanes_return_); // break statement (k_break) #define k_break \ spmd_off_lanes_break_ = spmd_off_lanes_break_ || spmd_mask_; \ spmd_mask_ = false; // continue statement (k_continue) #define k_continue \ spmd_off_lanes_continue_ = spmd_off_lanes_continue_ || spmd_mask_; \ spmd_mask_ = false; // endwhile statement (k_endwhile) #define k_endwhile \ } \ } \ 
} \ } \ spmd_mask_ = spmd::clear_lanes(spmd_mask_, spmd_off_lanes_return_); // return statement (k_return) #define k_return \ spmd_off_lanes_return_ = spmd_off_lanes_return_ || spmd_mask_; \ spmd_mask_ = false; // if statement (k_if) #define k_if(cond) \ { \ k_bool spmd_cond_ = \ spmd::to_k_bool_(cond); \ k_bool spmd_middle_mask_ = spmd_mask_; \ { \ k_bool spmd_mask_ = spmd_cond_ && spmd_middle_mask_; // elseif statement (k_elseif) #define k_elseif(cond) \ } \ spmd_mask_ = spmd::clear_lanes(spmd_mask_, spmd_off_lanes_return_); \ spmd_mask_ = spmd::clear_lanes(spmd_mask_, spmd_off_lanes_break_); \ spmd_mask_ = spmd::clear_lanes(spmd_mask_, spmd_off_lanes_continue_); \ spmd_middle_mask_ = spmd::clear_lanes(spmd_middle_mask_, spmd_cond_); \ spmd_cond_ = \ spmd::to_k_bool_(cond); \ { \ k_bool spmd_mask_ = spmd_cond_ && spmd_middle_mask_; // else statement (k_else) #define k_else \ } \ spmd_mask_ = spmd::clear_lanes(spmd_mask_, spmd_off_lanes_return_); \ spmd_mask_ = spmd::clear_lanes(spmd_mask_, spmd_off_lanes_break_); \ spmd_mask_ = spmd::clear_lanes(spmd_mask_, spmd_off_lanes_continue_); \ spmd_middle_mask_ = spmd::clear_lanes(spmd_middle_mask_, spmd_cond_); \ { \ k_bool spmd_mask_ = spmd_middle_mask_; // endif statement (k_endif) #define k_endif \ } \ } \ spmd_mask_ = spmd::clear_lanes(spmd_mask_, spmd_off_lanes_return_); \ spmd_mask_ = spmd::clear_lanes(spmd_mask_, spmd_off_lanes_break_); \ spmd_mask_ = spmd::clear_lanes(spmd_mask_, spmd_off_lanes_continue_); // ---------------------------------------------------------------------------- #endif #ifdef NSIMD_VARIADIC_MACROS_IS_EXTENSION #if defined(NSIMD_IS_GCC) #pragma GCC diagnostic pop #elif defined(NSIMD_IS_CLANG) #pragma clang diagnostic pop #endif #endif } // namespace spmd #include #endif ================================================ FILE: include/nsimd/modules/tet1d.hpp ================================================ /* Copyright (c) 2021 Agenium Scale Permission is hereby granted, free of charge, to any 
person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ #ifndef NSIMD_MODULES_TET1D_HPP #define NSIMD_MODULES_TET1D_HPP #include #include #include #include namespace tet1d { // ---------------------------------------------------------------------------- // general definitions struct none_t {}; template struct node {}; const nsimd::nat end = nsimd::nat(-1); // ---------------------------------------------------------------------------- // Error management #if defined(NSIMD_CUDA) #define nsimd_cuda_assert(ans) tet1d::gpuCheck((ans), __FILE__, __LINE__) inline void gpuCheck(cudaError_t code, const char *file, int line) { if (code != cudaSuccess) { fprintf(stderr, "NSIMD Internal error:\n\ttet1d Error: %s %s %d\n", cudaGetErrorString(code), file, line); exit(code); } } #endif // ---------------------------------------------------------------------------- // supported kernels #if defined(NSIMD_CUDA) // CUDA component wise kernel template __global__ void gpu_kernel_component_wise(T *dst, Expr const expr, nsimd::nat n) { int i = threadIdx.x + blockIdx.x * blockDim.x; if (i < 
n) { dst[i] = expr.gpu_get(i); } } // CUDA component wise kernel with masked output template __global__ void gpu_kernel_component_wise_mask(T *dst, Mask const mask, Expr const expr, nsimd::nat n) { int i = threadIdx.x + blockIdx.x * blockDim.x; if (i < n && mask.gpu_get(i)) { dst[i] = expr.gpu_get(i); } } #elif defined(NSIMD_ROCM) // ROCM component wise kernel template __global__ void gpu_kernel_component_wise(T *dst, Expr const expr, nsimd::nat n) { int i = int(hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x); if (i < n) { dst[i] = expr.gpu_get(i); } } // ROCM component wise kernel with masked output template __global__ void gpu_kernel_component_wise_mask(T *dst, Mask const mask, Expr const expr, nsimd::nat n) { int i = int(hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x); if (i < n && mask.gpu_get(i)) { dst[i] = expr.gpu_get(i); } } #elif defined(NSIMD_ONEAPI) // oneAPI component wise kernel template void oneapi_kernel_component_wise(T *dst, Expr const expr, nsimd::nat n, sycl::nd_item<1> item) { const int i = static_cast(item.get_global_id().get(0)); if (i < n) { dst[i] = expr.gpu_get(i); } } // oneAPI component wise kernel with masked output template void oneapi_kernel_component_wise_mask(T *dst, Mask const mask, Expr const expr, nsimd::nat n, sycl::nd_item<1> item) { nsimd::nat i = static_cast(item.get_global_id().get(0)); if (i < n && mask.gpu_get(i)) { dst[i] = expr.gpu_get(i); } } #else // CPU component wise kernel template void cpu_kernel_component_wise(T *dst, Expr const &expr, nsimd::nat n) { nsimd::nat i; int len = nsimd::len(Pack()); for (i = 0; i + len < n; i += len) { nsimd::storeu(&dst[i], expr.template simd_get(i)); } for (; i < n; i++) { dst[i] = expr.scalar_get(i); } } // CPU component wise kernel with masked output template void cpu_kernel_component_wise_mask(T *dst, Mask const &mask, Expr const &expr, nsimd::nat n) { nsimd::nat i; int len = nsimd::len(Pack()); for (i = 0; i + len < n; i += len) { nsimd::storeu(&dst[i], 
nsimd::if_else(mask.template simd_get(i), expr.template simd_get(i), nsimd::loadu(&dst[i]))); } for (; i < n; i++) { if (mask.scalar_get(i)) { dst[i] = expr.scalar_get(i); } } } #endif // ---------------------------------------------------------------------------- // helper for computing sizes of 1D vectors nsimd::nat compute_size(nsimd::nat sz1, nsimd::nat sz2) { assert(sz1 >= 0 || sz2 >= 0); assert((sz1 < 0 && sz2 >= 0) || (sz1 >= 0 && sz2 < 0) || (sz1 == sz2)); if (sz1 < 0) { return sz2; } else { return sz1; } } nsimd::nat compute_size(nsimd::nat sz1, nsimd::nat sz2, nsimd::nat sz3) { return compute_size(compute_size(sz1, sz2), sz3); } // ---------------------------------------------------------------------------- // meta for building a pack from another ignoring the base type template struct to_pack_t { static const int unroll = Pack::unroll; typedef typename Pack::simd_ext simd_ext; typedef nsimd::pack type; }; template struct to_pack_t, Pack> { static const int unroll = Pack::unroll; typedef typename Pack::simd_ext simd_ext; typedef nsimd::pack type; }; template struct to_packl_t { static const int unroll = Pack::unroll; typedef typename Pack::simd_ext simd_ext; typedef nsimd::packl type; }; template struct to_packl_t, Pack> { static const int unroll = Pack::unroll; typedef typename Pack::simd_ext simd_ext; typedef nsimd::packl type; }; // ---------------------------------------------------------------------------- // scalar node struct scalar_t {}; template struct node { typedef T in_type; typedef T out_type; T value; #if defined(NSIMD_CUDA) || defined(NSIMD_ROCM) __device__ T gpu_get(nsimd::nat) const { return value; } #elif defined(NSIMD_ONEAPI) T gpu_get(nsimd::nat) const { return value; } #else T scalar_get(nsimd::nat) const { return value; } template typename to_pack_t::type simd_get(nsimd::nat) const { typedef typename to_pack_t::type pack; return pack(value); } #endif nsimd::nat size() const { return -1; } }; // 
---------------------------------------------------------------------------- // build a node from a scalar and a node template struct to_node_t { typedef node type; static type impl(T n) { type ret; ret.value = n; return ret; } }; template struct to_node_t > { typedef node type; static type impl(type node) { return node; } }; template typename to_node_t::type to_node(T n) { return to_node_t::impl(n); } // ---------------------------------------------------------------------------- // convert literal to one NSIMD base type template struct literal_to { template static T impl(S a) { return T(a); } }; template <> struct literal_to { template static f16 impl(S a) { return nsimd_f32_to_f16(f32(a)); } }; // ---------------------------------------------------------------------------- // input node struct in_t {}; #define TET1D_IN(T) tet1d::node template struct node { const T *data; nsimd::nat sz; typedef T in_type; typedef T out_type; #if defined(NSIMD_CUDA) || defined(NSIMD_ROCM) __device__ T gpu_get(nsimd::nat i) const { return data[i]; } #elif defined(NSIMD_ONEAPI) T gpu_get(nsimd::nat i) const { return data[i]; } #else T scalar_get(nsimd::nat i) const { return data[i]; } template typename to_pack_t::type simd_get(nsimd::nat i) const { typedef typename to_pack_t::type pack; return nsimd::loadu(&data[i]); } #endif nsimd::nat size() const { return sz; } template node operator()(I0 i0_, I1 i1_) const { node ret; nsimd::nat i0 = nsimd::nat(i0_); nsimd::nat i1 = nsimd::nat(i1_); i0 = i0 >= 0 ? i0 : sz + i0; i1 = i1 >= 0 ? i1 : sz + i1; assert(0 <= i0 && i0 < i1 && i1 < sz); ret.data = &data[i0]; ret.sz = i1 - i0 + 1; return ret; } }; // return an input node from a pointer template inline node in(const T *data, I sz) { node ret; ret.data = data; ret.sz = nsimd::nat(sz); return ret; } // ---------------------------------------------------------------------------- // output with condition node: I(I > 50) = ... 
struct mask_out_t {}; template struct node { typedef typename Pack::value_type T; T *data; nsimd::nat threads_per_block; void *stream; Mask mask; template node operator=(node const &expr) { #if defined(NSIMD_CUDA) || defined(NSIMD_ROCM) || defined(NSIMD_ONEAPI) nsimd::nat expr_size = compute_size(mask.size(), expr.size()); nsimd::nat nt = threads_per_block < 0 ? 128 : threads_per_block; nsimd::nat param = nsimd_kernel_param(expr_size, nt); assert(nt > 0 && nt <= UINT_MAX); assert(param > 0 && param <= UINT_MAX); #if defined(NSIMD_CUDA) cudaStream_t s = (stream == NULL ? NULL : *(cudaStream_t *)stream); // clang-format off gpu_kernel_component_wise_mask<<<(unsigned int)(param), (unsigned int)(nt), 0, s>>> (data, mask, expr, expr_size); // clang-format on #elif defined(NSIMD_ROCM) hipStream_t s = stream == NULL ? NULL : *(hipStream_t *)stream; hipLaunchKernelGGL(gpu_kernel_component_wise_mask, (unsigned int)(param), (unsigned int)(nt), 0, s, data, mask, expr, expr_size); #else sycl::queue q = nsimd::oneapi::default_queue(); q.parallel_for(sycl::nd_range<1>(sycl::range<1>((size_t)param), sycl::range<1>((size_t)nt)), [=, *this](sycl::nd_item<1> item) { oneapi_kernel_component_wise_mask(data, mask, expr, expr_size, item); }) .wait_and_throw(); #endif #else cpu_kernel_component_wise_mask( data, mask, expr, compute_size(mask.size(), expr.size())); #endif return *this; } template node operator=(S a) { return operator=(to_node(literal_to::impl(a))); } }; // ---------------------------------------------------------------------------- // output node struct out_t {}; #define TET1D_OUT(T) \ tet1d::node > #define TET1D_OUT_EX(T, N, SimdExt) \ tet1d::node > template struct node { typedef typename Pack::value_type T; T *data; nsimd::nat threads_per_block; void *stream; template node operator()(Mask mask) const { node ret; ret.data = data; ret.mask = mask; ret.threads_per_block = threads_per_block; ret.stream = stream; return ret; } template node operator=(node const &expr) { #if 
defined(NSIMD_CUDA) || defined(NSIMD_ROCM) || defined(NSIMD_ONEAPI) nsimd::nat nt = threads_per_block < 0 ? 128 : threads_per_block; nsimd::nat param = nsimd_kernel_param(expr.size(), nt); assert(nt > 0 && nt <= UINT_MAX); assert(param > 0 && param <= UINT_MAX); #if defined(NSIMD_CUDA) cudaStream_t s = stream == NULL ? NULL : *(cudaStream_t *)stream; // clang-format off gpu_kernel_component_wise<<<(unsigned int)(param), (unsigned int)(nt), 0, s>>>(data, expr, expr.size()); // clang-format on #elif defined(NSIMD_ROCM) hipStream_t s = stream == NULL ? NULL : *(hipStream_t *)stream; hipLaunchKernelGGL( (gpu_kernel_component_wise >), (unsigned int)(param), (unsigned int)(nt), 0, s, data, expr, expr.size()); #else sycl::queue q = nsimd::oneapi::default_queue(); q.parallel_for( sycl::nd_range<1>(sycl::range<1>((size_t)param), sycl::range<1>((size_t)nt)), [=, *this](sycl::nd_item<1> item) { oneapi_kernel_component_wise(data, expr, expr.size(), item); }) .wait_and_throw(); #endif #else cpu_kernel_component_wise(data, expr, expr.size()); #endif return *this; } }; // return an output node from a pointer template node > out(T *data) { node > ret; ret.data = data; ret.threads_per_block = 128; ret.stream = NULL; return ret; } template node out(T *data, int threads_per_block, void *stream) { node ret; ret.data = data; ret.threads_per_block = threads_per_block; ret.stream = stream; return ret; } // ---------------------------------------------------------------------------- } // namespace tet1d #include #endif ================================================ FILE: include/nsimd/nsimd-all.h ================================================ /* Copyright (c) 2021 Agenium Scale Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies 
of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ #ifndef NSIMD_ALL_H #define NSIMD_ALL_H #include #include #endif ================================================ FILE: include/nsimd/nsimd-all.hpp ================================================ /* Copyright (c) 2021 Agenium Scale Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
*/ #ifndef NSIMD_ALL_HPP #define NSIMD_ALL_HPP #include #include #include #include #endif ================================================ FILE: include/nsimd/nsimd.h ================================================ /* Copyright (c) 2021 Agenium Scale Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
*/ #ifndef NSIMD_H #define NSIMD_H /* clang-format off */ /* ------------------------------------------------------------------------- */ /* Compiler detection (order matters https://stackoverflow.com/a/28166605) */ /* Detect host compiler */ #if defined(_MSC_VER) #define NSIMD_IS_MSVC #elif defined(__ibmxl_version__) #define NSIMD_IS_XLC #elif defined(__FCC_version__) #define NSIMD_IS_FCC #elif defined(__INTEL_COMPILER) #define NSIMD_IS_ICC #elif defined(__clang__) #define NSIMD_IS_CLANG #elif defined(__GNUC__) || defined(__GNUG__) #define NSIMD_IS_GCC #endif /* Detect device compiler, if any */ #if defined(__HIPCC__) #define NSIMD_IS_HIPCC #elif defined(__INTEL_CLANG_COMPILER) || defined(__INTEL_LLVM_COMPILER) #define NSIMD_IS_DPCPP #elif defined(__NVCC__) #define NSIMD_IS_NVCC #endif /* ------------------------------------------------------------------------- */ /* C standard detection */ #ifdef NSIMD_IS_MSVC #define NSIMD_C 1999 #else #ifdef __STDC_VERSION__ #if __STDC_VERSION__ == 199901L #define NSIMD_C 1999 #elif __STDC_VERSION__ >= 201112L #define NSIMD_C 2011 #else #define NSIMD_C 1989 #endif #else #define NSIMD_C 1989 #endif #endif /* ------------------------------------------------------------------------- */ /* C++ standard detection */ #ifdef NSIMD_IS_MSVC #ifdef _MSVC_LANG #define NSIMD__cplusplus _MSVC_LANG #else #define NSIMD__cplusplus __cplusplus #endif #else #ifdef __cplusplus #define NSIMD__cplusplus __cplusplus #else #define NSIMD__cplusplus 0 #endif #endif #if NSIMD__cplusplus > 0 && NSIMD__cplusplus < 201103L #define NSIMD_CXX 1998 #elif NSIMD__cplusplus >= 201103L && NSIMD__cplusplus < 201402L #define NSIMD_CXX 2011 #elif NSIMD__cplusplus >= 201402L && NSIMD__cplusplus < 201703L #define NSIMD_CXX 2014 #elif NSIMD__cplusplus == 201703L #define NSIMD_CXX 2017 #elif NSIMD__cplusplus >= 201704L #define NSIMD_CXX 2020 #else #define NSIMD_CXX 0 #endif #if NSIMD_CXX >= 2020 #include #include #endif /* 
------------------------------------------------------------------------- */ /* Use of long long for GCC even in C89 and C++98. Note that for some reason */ /* the use of the __extension__ keyword does not prevent warning so we deal */ /* with them now. We keep the __extension__ keyword in case. */ #if NSIMD_CXX < 2011 && NSIMD_C < 1999 #define NSIMD_LONGLONG_IS_EXTENSION #endif #ifdef NSIMD_LONGLONG_IS_EXTENSION #if defined(NSIMD_IS_GCC) #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wlong-long" #elif defined(NSIMD_IS_CLANG) #pragma clang diagnostic push #pragma clang diagnostic ignored "-Wlong-long" #endif #endif typedef long long nsimd_longlong; typedef unsigned long long nsimd_ulonglong; #if NSIMD_CXX > 0 namespace nsimd { typedef long long longlong; typedef unsigned long long ulonglong; } // namespace nsimd #endif #ifdef __UINT64_TYPE__ typedef __UINT64_TYPE__ nsimd_uint64_type; #endif #ifdef __INT64_TYPE__ typedef __INT64_TYPE__ nsimd_int64_type; #endif #ifdef NSIMD_LONGLONG_IS_EXTENSION #if defined(NSIMD_IS_GCC) #pragma GCC diagnostic pop #elif defined(NSIMD_IS_CLANG) #pragma clang diagnostic pop #endif #endif /* ------------------------------------------------------------------------- */ /* Register size detection */ #if defined(__x86_64) || defined(__x86_64__) || defined(__amd64__) || \ defined(__amd64) || defined(_M_AMD64) || defined(__aarch64__) || \ defined(_M_ARM64) || defined(__PPC64__) #define NSIMD_WORD_SIZE 64 #else #define NSIMD_WORD_SIZE 32 #endif /* ------------------------------------------------------------------------- */ /* Architecture detection */ #if defined(i386) || defined(__i386__) || defined(__i486__) || \ defined(__i586__) || defined(__i686__) || defined(__i386) || \ defined(_M_IX86) || defined(_X86_) || defined(__THW_INTEL__) || \ defined(__I86__) || defined(__INTEL__) || defined(__x86_64) || \ defined(__x86_64__) || defined(__amd64__) || defined(__amd64) || \ defined(_M_X64) #define NSIMD_X86 #elif defined(__arm__) || 
defined(__arm64) || defined(__thumb__) || \ defined(__TARGET_ARCH_ARM) || defined(__TARGET_ARCH_THUMB) || \ defined(_M_ARM) || defined(_M_ARM64) || defined(__arch64__) #define NSIMD_ARM #elif defined(__ppc__) || defined(__powerpc__) || defined(__PPC__) #define NSIMD_POWERPC #else #define NSIMD_GENERIC #endif /* ------------------------------------------------------------------------- */ /* Microsoft DLL specifics */ #ifdef NSIMD_IS_MSVC #define NSIMD_DLLEXPORT __declspec(dllexport) #define NSIMD_DLLIMPORT __declspec(dllimport) #else #define NSIMD_DLLEXPORT #define NSIMD_DLLIMPORT extern #endif /* ------------------------------------------------------------------------- */ /* DLL specifics when inside/outside the library */ #ifdef NSIMD_INSIDE #define NSIMD_DLLSPEC NSIMD_DLLEXPORT #else #define NSIMD_DLLSPEC NSIMD_DLLIMPORT #endif /* ------------------------------------------------------------------------- */ /* Vector calling convention: https://devblogs.microsoft.com/cppblog /introducing-vector-calling-convention/ */ #if defined(NSIMD_IS_MSVC) && NSIMD_WORD_SIZE == 32 #define NSIMD_VECTORCALL __vectorcall #else #define NSIMD_VECTORCALL #endif /* ------------------------------------------------------------------------- */ /* inline in nsimd is ONLY useful for linkage */ #if NSIMD_CXX > 0 || NSIMD_C > 1989 #if NSIMD_C > 0 && defined(NSIMD_IS_MSVC) #define NSIMD_INLINE static __inline #else #define NSIMD_INLINE static inline #endif #else #if defined(NSIMD_IS_GCC) || defined(NSIMD_IS_CLANG) #define NSIMD_INLINE __extension__ static __inline #else #define NSIMD_INLINE #endif #endif /* ------------------------------------------------------------------------- */ /* Pre-processor */ #define NSIMD_PP_CAT_2_e(a, b) a##b #define NSIMD_PP_CAT_2(a, b) NSIMD_PP_CAT_2_e(a, b) #define NSIMD_PP_CAT_3_e(a, b, c) a##b##c #define NSIMD_PP_CAT_3(a, b, c) NSIMD_PP_CAT_3_e(a, b, c) #define NSIMD_PP_CAT_4_e(a, b, c, d) a##b##c##d #define NSIMD_PP_CAT_4(a, b, c, d) NSIMD_PP_CAT_4_e(a, b, 
c, d) #define NSIMD_PP_CAT_5_e(a, b, c, d, e) a##b##c##d##e #define NSIMD_PP_CAT_5(a, b, c, d, e) NSIMD_PP_CAT_5_e(a, b, c, d, e) #define NSIMD_PP_CAT_6_e(a, b, c, d, e, f) a##b##c##d##e##f #define NSIMD_PP_CAT_6(a, b, c, d, e, f) NSIMD_PP_CAT_6_e(a, b, c, d, e, f) #define NSIMD_PP_EXPAND_e(a) a #define NSIMD_PP_EXPAND(a) NSIMD_PP_EXPAND_e(a) /* ------------------------------------------------------------------------- */ /* Detect architecture/SIMD */ #if defined(CPU) && !defined(NSIMD_CPU) #define NSIMD_CPU #endif /* Intel */ #if defined(SSE2) && !defined(NSIMD_SSE2) #define NSIMD_SSE2 #endif #if defined(SSE42) && !defined(NSIMD_SSE42) #define NSIMD_SSE42 #endif #if defined(AVX) && !defined(NSIMD_AVX) #define NSIMD_AVX #endif #if defined(AVX2) && !defined(NSIMD_AVX2) #define NSIMD_AVX2 #endif #if defined(AVX512_KNL) && !defined(NSIMD_AVX512_KNL) #define NSIMD_AVX512_KNL #endif #if defined(AVX512_SKYLAKE) && !defined(NSIMD_AVX512_SKYLAKE) #define NSIMD_AVX512_SKYLAKE #endif #if defined(FP16) && !defined(NSIMD_FP16) #define NSIMD_FP16 #endif #if defined(FMA) && !defined(NSIMD_FMA) #define NSIMD_FMA #endif /* ARM */ #if defined(NEON128) && !defined(NSIMD_NEON128) #define NSIMD_NEON128 #endif #if defined(AARCH64) && !defined(NSIMD_AARCH64) #define NSIMD_AARCH64 #endif #if defined(SVE) && !defined(NSIMD_SVE) #define NSIMD_SVE #define NSIMD_SVE_FAMILY #endif #if defined(SVE128) && !defined(NSIMD_SVE128) #define NSIMD_SVE128 #define NSIMD_SVE_FAMILY #endif #if defined(SVE256) && !defined(NSIMD_SVE256) #define NSIMD_SVE256 #define NSIMD_SVE_FAMILY #endif #if defined(SVE512) && !defined(NSIMD_SVE512) #define NSIMD_SVE512 #define NSIMD_SVE_FAMILY #endif #if defined(SVE1024) && !defined(NSIMD_SVE1024) #define NSIMD_SVE1024 #define NSIMD_SVE_FAMILY #endif #if defined(SVE2048) && !defined(NSIMD_SVE2048) #define NSIMD_SVE2048 #define NSIMD_SVE_FAMILY #endif /* PPC */ #if (defined(VMX) || defined(ALTIVEC)) && !defined(NSIMD_VMX) #define NSIMD_VMX #endif #if defined(VSX) && 
!defined(NSIMD_VSX) #define NSIMD_VSX #endif /* CUDA */ #if defined(CUDA) && !defined(NSIMD_CUDA) #define NSIMD_CUDA #endif /* ROCm */ #if defined(ROCM) && !defined(NSIMD_ROCM) #define NSIMD_ROCM #endif /* oneAPI */ #if defined(ONEAPI) && !defined(NSIMD_ONEAPI) #define NSIMD_ONEAPI /* undef ONEAPI is needed because ONEAPI is used as a namespace in DPC++: sycl::ONEAPI */ #ifdef ONEAPI #undef ONEAPI #endif #endif /* ------------------------------------------------------------------------- */ /* Set NSIMD_SIMD and NSIMD_PLATFORM macro, include the correct header. */ #if defined(NSIMD_SSE2) #define NSIMD_PLATFORM x86 #define NSIMD_SIMD sse2 #include #if defined(NSIMD_FMA) || defined(NSIMD_FP16) #include #endif /* For some reason MSVC <= 2015 has intrinsics defined in another header */ #ifdef NSIMD_IS_MSVC #include #endif #if NSIMD_CXX > 0 namespace nsimd { struct cpu {}; struct sse2 {}; #if NSIMD_CXX >= 2020 template concept simd_ext_c = std::is_same_v || std::is_same_v; #define NSIMD_LIST_SIMD_EXT cpu, sse2 #endif } // namespace nsimd #endif #elif defined(NSIMD_SSE42) #define NSIMD_PLATFORM x86 #define NSIMD_SIMD sse42 #include #if defined(NSIMD_FMA) || defined(NSIMD_FP16) #include #endif /* For some reason MSVC <= 2015 has intrinsics defined in another header */ #ifdef NSIMD_IS_MSVC #include #endif #if NSIMD_CXX > 0 namespace nsimd { struct cpu {}; struct sse2 {}; struct sse42 {}; #if nsIMD_CXX >= 2020 template concept simd_ext_c = std::is_same_v || std::is_same_v || std::is_same_v; #define NSIMD_LIST_SIMD_EXT cpu, sse2, sse42 #endif } // namespace nsimd #endif #elif defined(NSIMD_AVX) #define NSIMD_PLATFORM x86 #define NSIMD_SIMD avx #include /* For some reason MSVC <= 2015 has intrinsics defined in another header */ #ifdef NSIMD_IS_MSVC #include #endif #if NSIMD_CXX > 0 namespace nsimd { struct cpu {}; struct sse2 {}; struct sse42 {}; struct avx {}; #if NSIMD_CXX >= 2020 template concept simd_ext_c = std::is_same_v || std::is_same_v || std::is_same_v || 
std::is_same_v; #define NSIMD_LIST_SIMD_EXT cpu, sse2, sse42, avx #endif } // namespace nsimd #endif #elif defined(NSIMD_AVX2) #define NSIMD_PLATFORM x86 #define NSIMD_SIMD avx2 #include /* For some reason MSVC <= 2015 has intrinsics defined in another header */ #ifdef NSIMD_IS_MSVC #include #endif #if NSIMD_CXX > 0 namespace nsimd { struct cpu {}; struct sse2 {}; struct sse42 {}; struct avx {}; struct avx2 {}; #if NSIMD_CXX >= 2020 template concept simd_ext_c = std::is_same_v || std::is_same_v || std::is_same_v || std::is_same_v || std::is_same_v; #define NSIMD_LIST_SIMD_EXT cpu, sse2, sse42, avx, avx2 #endif } // namespace nsimd #endif #elif defined(NSIMD_AVX512_KNL) #define NSIMD_PLATFORM x86 #define NSIMD_SIMD avx512_knl #include #if NSIMD_CXX > 0 namespace nsimd { struct cpu {}; struct sse2 {}; struct sse42 {}; struct avx {}; struct avx2 {}; struct avx512_knl {}; #if NSIMD_CXX >= 2020 template concept simd_ext_c = std::is_same_v || std::is_same_v || std::is_same_v || std::is_same_v || std::is_same_v || std::is_same_v; #define NSIMD_LIST_SIMD_EXT cpu, sse2, sse42, avx, avx2, avx512_knl #endif } // namespace nsimd #endif #elif defined(NSIMD_AVX512_SKYLAKE) #define NSIMD_PLATFORM x86 #define NSIMD_SIMD avx512_skylake #include #if NSIMD_CXX > 0 namespace nsimd { struct cpu {}; struct sse2 {}; struct sse42 {}; struct avx {}; struct avx2 {}; struct avx512_skylake {}; #if NSIMD_CXX >= 2020 template concept simd_ext_c = std::is_same_v || std::is_same_v || std::is_same_v || std::is_same_v || std::is_same_v || std::is_same_v; #define NSIMD_LIST_SIMD_EXT cpu, sse2, sse42, avx, avx2, avx512_skylake #endif } // namespace nsimd #endif #elif defined(NSIMD_NEON128) #define NSIMD_PLATFORM arm #define NSIMD_SIMD neon128 #include #if NSIMD_CXX > 0 namespace nsimd { struct cpu {}; struct neon128 {}; #if NSIMD_CXX >= 2020 template concept simd_ext_c = std::is_same_v || std::is_same_v; #define NSIMD_LIST_SIMD_EXT cpu, neon128 #endif } // namespace nsimd #endif #elif 
defined(NSIMD_AARCH64) #define NSIMD_PLATFORM arm #define NSIMD_SIMD aarch64 #include #if NSIMD_CXX > 0 namespace nsimd { struct cpu {}; struct aarch64 {}; #if NSIMD_CXX >= 2020 template concept simd_ext_c = std::is_same_v || std::is_same_v; #define NSIMD_LIST_SIMD_EXT cpu, aarch64 #endif } // namespace nsimd #endif #elif defined(NSIMD_SVE) #define NSIMD_PLATFORM arm #define NSIMD_SIMD sve #include #include #if NSIMD_CXX > 0 namespace nsimd { struct cpu {}; struct aarch64 {}; struct sve {}; #if NSIMD_CXX >= 2020 template concept simd_ext_c = std::is_same_v || std::is_same_v || std::is_same_v; #define NSIMD_LIST_SIMD_EXT cpu, aarch64, sve #endif } // namespace nsimd #endif #elif defined(NSIMD_SVE128) #define NSIMD_PLATFORM arm #define NSIMD_SIMD sve128 #include #include #if NSIMD_CXX > 0 namespace nsimd { struct cpu {}; struct aarch64 {}; struct sve128 {}; #if NSIMD_CXX >= 2020 template concept simd_ext_c = std::is_same_v || std::is_same_v || std::is_same_v; #define NSIMD_LIST_SIMD_EXT cpu, aarch64, sve128 #endif } // namespace nsimd #endif #elif defined(NSIMD_SVE256) #define NSIMD_PLATFORM arm #define NSIMD_SIMD sve256 #include #include #if NSIMD_CXX > 0 namespace nsimd { struct cpu {}; struct aarch64 {}; struct sve256 {}; #if NSIMD_CXX >= 2020 template concept simd_ext_c = std::is_same_v || std::is_same_v || std::is_same_v; #define NSIMD_LIST_SIMD_EXT cpu, aarch64, sve256 #endif } // namespace nsimd #endif #elif defined(NSIMD_SVE512) #define NSIMD_PLATFORM arm #define NSIMD_SIMD sve512 #include #include #if NSIMD_CXX > 0 namespace nsimd { struct cpu {}; struct aarch64 {}; struct sve512 {}; #if NSIMD_CXX >= 2020 template concept simd_ext_c = std::is_same_v || std::is_same_v || std::is_same_v; #define NSIMD_LIST_SIMD_EXT cpu, aarch64, sve512 #endif } // namespace nsimd #endif #elif defined(NSIMD_SVE1024) #define NSIMD_PLATFORM arm #define NSIMD_SIMD sve1024 #include #include #if NSIMD_CXX > 0 namespace nsimd { struct cpu {}; struct aarch64 {}; struct sve1024 {}; #if 
NSIMD_CXX >= 2020 template concept simd_ext_c = std::is_same_v || std::is_same_v || std::is_same_v; #define NSIMD_LIST_SIMD_EXT cpu, aarch64, sve1024 #endif } // namespace nsimd #endif #elif defined(NSIMD_SVE2048) #define NSIMD_PLATFORM arm #define NSIMD_SIMD sve2048 #include #include #if NSIMD_CXX > 0 namespace nsimd { struct cpu {}; struct aarch64 {}; struct sve2048 {}; #if NSIMD_CXX >= 2020 template concept simd_ext_c = std::is_same_v || std::is_same_v || std::is_same_v; #define NSIMD_LIST_SIMD_EXT cpu, aarch64, sve2048 #endif } // namespace nsimd #endif #elif defined(NSIMD_VMX) #define NSIMD_PLATFORM ppc #define NSIMD_SIMD vmx #ifdef NSIMD_IS_CLANG /* New version of clang are spamming useless warning comming from their */ /* altivec.h file */ #pragma clang diagnostic ignored "-Wc11-extensions" #pragma clang diagnostic ignored "-Wc++11-long-long" #endif #include #ifdef bool #undef bool #endif #ifdef pixel #undef pixel #endif #ifdef vector #undef vector #endif #if NSIMD_CXX > 0 namespace nsimd { struct cpu {}; struct vmx {}; #if NSIMD_CXX >= 2020 template concept simd_ext_c = std::is_same_v || std::is_same_v; #define NSIMD_LIST_SIMD_EXT cpu, vmx #endif } // namespace nsimd #endif #elif defined(NSIMD_VSX) #define NSIMD_PLATFORM ppc #define NSIMD_SIMD vsx #ifdef NSIMD_IS_CLANG /* New version of clang are spamming useless warning comming from their */ /* altivec.h file */ #pragma clang diagnostic ignored "-Wc11-extensions" #pragma clang diagnostic ignored "-Wc++11-long-long" #endif #include #ifdef bool #undef bool #endif #ifdef pixel #undef pixel #endif #ifdef vector #undef vector #endif #if NSIMD_CXX > 0 namespace nsimd { struct cpu {}; struct vmx {}; struct vsx {}; #if NSIMD_CXX >= 2020 template concept simd_ext_c = std::is_same_v || std::is_same_v || std::is_same_v; #define NSIMD_LIST_SIMD_EXT cpu, vsx #endif } // namespace nsimd #endif #else #ifdef NSIMD_CUDA #if defined(NSIMD_IS_GCC) #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wunused-function" 
#elif defined(NSIMD_IS_CLANG) #pragma clang diagnostic push #pragma clang diagnostic ignored "-Wunused-function" #endif #include #if defined(NSIMD_IS_GCC) #pragma GCC diagnostic pop #elif defined(NSIMD_IS_CLANG) #pragma clang diagnostic pop #endif #endif #ifdef NSIMD_ROCM #include #include #endif #if defined(NSIMD_ONEAPI) && NSIMD_CXX > 0 #include extern "C" { NSIMD_DLLSPEC void *nsimd_oneapi_default_queue(); } // extern "C" namespace nsimd { namespace oneapi { NSIMD_INLINE sycl::queue &default_queue() { return *(sycl::queue *)nsimd_oneapi_default_queue(); } } // namespace oneapi } // namespace nsimd #endif #define NSIMD_SIMD cpu #define NSIMD_PLATFORM cpu #ifdef NSIMD_IS_MSVC #include #endif #if NSIMD_CXX > 0 namespace nsimd { struct cpu {}; #if NSIMD_CXX >= 2020 template concept simd_ext_c = std::is_same_v; #define NSIMD_LIST_SIMD_EXT cpu #endif } // namespace nsimd #endif #endif #if NSIMD_CXX >= 2020 #define NSIMD_CONCEPT_SIMD_EXT nsimd::simd_ext_c #else #define NSIMD_CONCEPT_SIMD_EXT typename #endif /* ------------------------------------------------------------------------- */ /* For ARM SVE we need a special struct */ #ifdef NSIMD_SVE #define NSIMD_STRUCT __sizeless_struct #else #define NSIMD_STRUCT struct #endif /* ------------------------------------------------------------------------- */ /* Shorter typedefs for integers and their limits */ #if NSIMD_CXX > 0 #include #else #include #endif #if defined(NSIMD_ONEAPI) typedef sycl::cl_char i8; typedef sycl::cl_uchar u8; typedef sycl::cl_short i16; typedef sycl::cl_ushort u16; typedef sycl::cl_int i32; typedef sycl::cl_uint u32; typedef sycl::cl_long i64; typedef sycl::cl_ulong u64; #elif defined(NSIMD_IS_MSVC) typedef unsigned __int8 u8; typedef signed __int8 i8; typedef unsigned __int16 u16; typedef signed __int16 i16; typedef unsigned __int32 u32; typedef signed __int32 i32; typedef unsigned __int64 u64; typedef signed __int64 i64; #else typedef unsigned char u8; typedef signed char i8; typedef unsigned 
short u16; typedef signed short i16; #ifdef __UINT32_TYPE__ typedef __UINT32_TYPE__ u32; #else #if defined(NSIMD_NEON128) && __ARM_ARCH <= 6 typedef unsigned long u32; #else typedef unsigned int u32; #endif #endif #ifdef __INT32_TYPE__ typedef __INT32_TYPE__ i32; #else #if defined(NSIMD_NEON128) && __ARM_ARCH <= 6 typedef signed long i32; #else typedef signed int i32; #endif #endif #if defined(NSIMD_VMX) || defined(NSIMD_VSX) typedef nsimd_ulonglong u64; typedef nsimd_longlong i64; #elif NSIMD_WORD_SIZE == 64 #ifdef __UINT64_TYPE__ typedef nsimd_uint64_type u64; #else typedef unsigned long u64; #endif #ifdef __INT64_TYPE__ typedef nsimd_int64_type i64; #else typedef signed long i64; #endif #else #if defined(NSIMD_IS_GCC) || defined(NSIMD_IS_CLANG) typedef nsimd_ulonglong u64; typedef nsimd_longlong i64; #else typedef unsigned long long u64; typedef signed long long i64; #endif #endif #endif #define NSIMD_U8_MIN ((u8)0) #define NSIMD_U8_MAX UCHAR_MAX #define NSIMD_I8_MIN SCHAR_MIN #define NSIMD_I8_MAX SCHAR_MAX #define NSIMD_U16_MIN ((u16)0) #define NSIMD_U16_MAX USHRT_MAX #define NSIMD_I16_MIN SHRT_MIN #define NSIMD_I16_MAX SHRT_MAX #define NSIMD_U32_MIN ((u32)0) #define NSIMD_U32_MAX UINT_MAX #define NSIMD_I32_MIN INT_MIN #define NSIMD_I32_MAX INT_MAX #ifdef NSIMD_IS_MSVC #define NSIMD_U64_MIN ((u64)0) #define NSIMD_U64_MAX ULLONG_MAX #define NSIMD_I64_MIN LLONG_MIN #define NSIMD_I64_MAX LLONG_MAX #else #if NSIMD_WORD_SIZE == 64 #define NSIMD_U64_MIN ((u64)0) #define NSIMD_U64_MAX ULONG_MAX #define NSIMD_I64_MIN LONG_MIN #define NSIMD_I64_MAX LONG_MAX #else #define NSIMD_U64_MIN ((u64)0) #define NSIMD_U64_MAX (~((u64)0)) #define NSIMD_I64_MIN ((i64)1 << 63) #define NSIMD_I64_MAX (~((i64)1 << 63)) #endif #endif /* ------------------------------------------------------------------------- */ /* Shorter typedefs for floatting point types */ #if ((defined(NSIMD_NEON128) || defined(NSIMD_AARCH64)) && \ defined(NSIMD_FP16)) || defined(NSIMD_SVE_FAMILY) #define 
NSIMD_ARM_FP16 #endif #ifdef NSIMD_ARM_FP16 typedef __fp16 f16; #define NSIMD_NATIVE_FP16 #elif defined(NSIMD_CUDA) || defined(NSIMD_ROCM) typedef __half f16; #define NSIMD_NATIVE_FP16 #elif defined(NSIMD_ONEAPI) typedef sycl::half f16; #define NSIMD_NATIVE_FP16 #else typedef struct { u16 u; } f16; #endif #if defined(NSIMD_ONEAPI) typedef sycl::cl_float f32; typedef sycl::cl_double f64; #else typedef float f32; typedef double f64; #endif /* ------------------------------------------------------------------------- */ /* Native register size (for now only 32 and 64 bits) types */ #if NSIMD_WORD_SIZE == 64 typedef i64 nsimd_nat; #else typedef i32 nsimd_nat; #endif #if NSIMD_CXX > 0 namespace nsimd { typedef nsimd_nat nat; } // namespace nsimd #endif /* ------------------------------------------------------------------------- */ /* C++ traits for base types */ #if NSIMD_CXX > 0 namespace nsimd { // Some C++20 concepts first #if NSIMD_CXX >= 2020 template concept simd_value_type_c = std::is_same_v || std::is_same_v || std::is_same_v || std::is_same_v || std::is_same_v || std::is_same_v || std::is_same_v || std::is_same_v || std::is_same_v || std::is_same_v || std::is_same_v; #define NSIMD_CONCEPT_VALUE_TYPE nsimd::simd_value_type_c template concept simd_value_type_or_bool_c = simd_value_type_c || std::is_same_v; #define NSIMD_CONCEPT_VALUE_TYPE_OR_BOOL nsimd::simd_value_type_or_bool_c // We need our own sizeof because of f16 which can be 4 bytes (i.e. a // float) on systems where there is no support for native f16. 
template struct sizeof_t { static const size_t value = sizeof(T); }; template <> struct sizeof_t { static const size_t value = 2; }; template const size_t sizeof_v = sizeof_t::value; #define NSIMD_REQUIRES(cond) requires(cond) #else #define NSIMD_CONCEPT_VALUE_TYPE typename #define NSIMD_CONCEPT_VALUE_TYPE_OR_BOOL typename #define NSIMD_REQUIRES(cond) #endif template struct traits {}; // 8-bits template <> struct traits { typedef i8 itype; typedef u8 utype; }; template <> struct traits { typedef i8 itype; typedef u8 utype; }; // 16-bits template <> struct traits { typedef i16 itype; typedef u16 utype; typedef f16 ftype; }; template <> struct traits { typedef i16 itype; typedef u16 utype; typedef f16 ftype; }; template <> struct traits { typedef i16 itype; typedef u16 utype; typedef f16 ftype; }; // 32-bits template <> struct traits { typedef i32 itype; typedef u32 utype; typedef f32 ftype; }; template <> struct traits { typedef i32 itype; typedef u32 utype; typedef f32 ftype; }; template <> struct traits { typedef i32 itype; typedef u32 utype; typedef f32 ftype; }; // 64-bits template <> struct traits { typedef i64 itype; typedef u64 utype; typedef f64 ftype; }; template <> struct traits { typedef i64 itype; typedef u64 utype; typedef f64 ftype; }; template <> struct traits { typedef i64 itype; typedef u64 utype; typedef f64 ftype; }; } // namespace nsimd #endif /* ------------------------------------------------------------------------- */ /* Set if denormalized float are set to 0 */ #ifdef NSIMD_NEON128 #define NSIMD_DNZ_FLUSH_TO_ZERO #endif /* clang-format on */ /* ------------------------------------------------------------------------- */ /* POPCNT: GCC and Clang have intrinsics */ NSIMD_INLINE int nsimd_popcnt32_(u32 a) { #if defined(NSIMD_IS_GCC) || defined(NSIMD_IS_CLANG) return __builtin_popcount(a); #elif defined(NSIMD_IS_MSVC) return (int)__popcnt(a); #else int i, ret = 0; for (i = 0; i < 32; i++) { ret += (int)((a >> i) & 1); } return ret; #endif } 
NSIMD_INLINE int nsimd_popcnt64_(u64 a) { #if defined(NSIMD_IS_GCC) || defined(NSIMD_IS_CLANG) #if __SIZEOF_LONG__ == 4 return __builtin_popcountl((u32)(a & 0xFFFFFFFF)) + __builtin_popcountl((u32)(a >> 32)); #else return __builtin_popcountl(a); #endif #elif defined(NSIMD_IS_MSVC) #if NSIMD_WORD_SIZE == 64 return (int)__popcnt64(a); #else return (int)__popcnt((u32)(a & 0xFFFFFFFF)) + (int)__popcnt((u32)(a >> 32)); #endif #else int i, ret = 0; for (i = 0; i < 64; i++) { ret += (int)((a >> i) & 1); } return ret; #endif } /* ------------------------------------------------------------------------- */ /* Macro to automatically include function depending on detected platform/SIMD */ #define NSIMD_AUTO_INCLUDE(path) /* ------------------------------------------------------------------------- */ /* Standard includes */ /* clang-format off */ #if NSIMD_CXX > 0 #include #include #else #include #include #endif /* clang-format on */ /* ------------------------------------------------------------------------- */ /* Now includes detected SIMD types */ #if NSIMD_CXX > 0 namespace nsimd { template struct simd_traits {}; } // namespace nsimd // Those are for writing shorter code #define NSIMD_NSV(T, SIMD_EXT) \ typename nsimd::simd_traits::simd_vector #define NSIMD_NSVX2(T, SIMD_EXT) \ typename nsimd::simd_traits::simd_vectorx2 #define NSIMD_NSVX3(T, SIMD_EXT) \ typename nsimd::simd_traits::simd_vectorx3 #define NSIMD_NSVX4(T, SIMD_EXT) \ typename nsimd::simd_traits::simd_vectorx4 #define NSIMD_NSVL(L, SIMD_EXT) \ typename nsimd::simd_traits::simd_vectorl #endif #include NSIMD_AUTO_INCLUDE(types.h) /* ------------------------------------------------------------------------- */ /* Macro/typedefs for SIMD infos */ #define vec(T) NSIMD_PP_CAT_4(nsimd_, NSIMD_SIMD, _v, T) #define vecl(T) NSIMD_PP_CAT_4(nsimd_, NSIMD_SIMD, _vl, T) #define vecx2(T) NSIMD_PP_CAT_5(nsimd_, NSIMD_SIMD, _v, T, x2) #define vecx3(T) NSIMD_PP_CAT_5(nsimd_, NSIMD_SIMD, _v, T, x3) #define vecx4(T) 
NSIMD_PP_CAT_5(nsimd_, NSIMD_SIMD, _v, T, x4) typedef vec(i8) vi8; typedef vec(u8) vu8; typedef vec(i16) vi16; typedef vec(u16) vu16; typedef vec(i32) vi32; typedef vec(u32) vu32; typedef vec(i64) vi64; typedef vec(u64) vu64; typedef vec(f16) vf16; typedef vec(f32) vf32; typedef vec(f64) vf64; typedef vecx2(i8) vi8x2; typedef vecx2(u8) vu8x2; typedef vecx2(i16) vi16x2; typedef vecx2(u16) vu16x2; typedef vecx2(i32) vi32x2; typedef vecx2(u32) vu32x2; typedef vecx2(i64) vi64x2; typedef vecx2(u64) vu64x2; typedef vecx2(f16) vf16x2; typedef vecx2(f32) vf32x2; typedef vecx2(f64) vf64x2; typedef vecx3(i8) vi8x3; typedef vecx3(u8) vu8x3; typedef vecx3(i16) vi16x3; typedef vecx3(u16) vu16x3; typedef vecx3(i32) vi32x3; typedef vecx3(u32) vu32x3; typedef vecx3(i64) vi64x3; typedef vecx3(u64) vu64x3; typedef vecx3(f16) vf16x3; typedef vecx3(f32) vf32x3; typedef vecx3(f64) vf64x3; typedef vecx4(i8) vi8x4; typedef vecx4(u8) vu8x4; typedef vecx4(i16) vi16x4; typedef vecx4(u16) vu16x4; typedef vecx4(i32) vi32x4; typedef vecx4(u32) vu32x4; typedef vecx4(i64) vi64x4; typedef vecx4(u64) vu64x4; typedef vecx4(f16) vf16x4; typedef vecx4(f32) vf32x4; typedef vecx4(f64) vf64x4; typedef vecl(i8) vli8; typedef vecl(u8) vlu8; typedef vecl(i16) vli16; typedef vecl(u16) vlu16; typedef vecl(i32) vli32; typedef vecl(u32) vlu32; typedef vecl(i64) vli64; typedef vecl(u64) vlu64; typedef vecl(f16) vlf16; typedef vecl(f32) vlf32; typedef vecl(f64) vlf64; #define vec_a(T, simd_ext) NSIMD_PP_CAT_4(nsimd_, simd_ext, _v, T) #define vecl_a(T, simd_ext) NSIMD_PP_CAT_4(nsimd_, simd_ext, _vl, T) #if NSIMD_CXX > 0 namespace nsimd { /* Alignment tags */ struct aligned {}; struct unaligned {}; #if NSIMD_CXX >= 2020 template concept alignment_c = std::is_same_v || std::is_same_v; #define NSIMD_CONCEPT_ALIGNMENT nsimd::alignment_c #else #define NSIMD_CONCEPT_ALIGNMENT typename #endif #if NSIMD_CXX >= 2011 template using simd_vector = typename simd_traits::simd_vector; template using simd_vectorl = typename 
simd_traits::simd_vectorl; #endif } // namespace nsimd #endif /* clang-format off */ #if defined(NSIMD_X86) #define NSIMD_MAX_ALIGNMENT 64 #elif defined(NSIMD_ARM) #define NSIMD_MAX_ALIGNMENT 256 #elif defined(NSIMD_POWERPC) #define NSIMD_MAX_ALIGNMENT 64 #else #define NSIMD_MAX_ALIGNMENT 16 #endif /* TODO: provide C++14 alignment constpexxr */ /* clang-format on */ #define NSIMD_NB_REGISTERS NSIMD_PP_CAT_3(NSIMD_, NSIMD_SIMD, _NB_REGISTERS) #define NSIMD_MAX_LEN_BIT 2048 #define NSIMD_MAX_LEN_i8 (NSIMD_MAX_LEN_BIT / 8) #define NSIMD_MAX_LEN_u8 (NSIMD_MAX_LEN_BIT / 8) #define NSIMD_MAX_LEN_i16 (NSIMD_MAX_LEN_BIT / 16) #define NSIMD_MAX_LEN_u16 (NSIMD_MAX_LEN_BIT / 16) #define NSIMD_MAX_LEN_f16 (NSIMD_MAX_LEN_BIT / 16) #define NSIMD_MAX_LEN_i32 (NSIMD_MAX_LEN_BIT / 32) #define NSIMD_MAX_LEN_u32 (NSIMD_MAX_LEN_BIT / 32) #define NSIMD_MAX_LEN_f32 (NSIMD_MAX_LEN_BIT / 32) #define NSIMD_MAX_LEN_i64 (NSIMD_MAX_LEN_BIT / 64) #define NSIMD_MAX_LEN_u64 (NSIMD_MAX_LEN_BIT / 64) #define NSIMD_MAX_LEN_f64 (NSIMD_MAX_LEN_BIT / 64) #define NSIMD_MAX_LEN_e(typ) NSIMD_MAX_LEN_##typ #define NSIMD_MAX_LEN(typ) NSIMD_MAX_LEN_e(typ) #if NSIMD_CXX > 0 namespace nsimd { template struct max_len_t {}; template <> struct max_len_t { static const int value = NSIMD_MAX_LEN_BIT / 8; }; template <> struct max_len_t { static const int value = NSIMD_MAX_LEN_BIT / 8; }; template <> struct max_len_t { static const int value = NSIMD_MAX_LEN_BIT / 16; }; template <> struct max_len_t { static const int value = NSIMD_MAX_LEN_BIT / 16; }; template <> struct max_len_t { static const int value = NSIMD_MAX_LEN_BIT / 16; }; template <> struct max_len_t { static const int value = NSIMD_MAX_LEN_BIT / 32; }; template <> struct max_len_t { static const int value = NSIMD_MAX_LEN_BIT / 32; }; template <> struct max_len_t { static const int value = NSIMD_MAX_LEN_BIT / 32; }; template <> struct max_len_t { static const int value = NSIMD_MAX_LEN_BIT / 64; }; template <> struct max_len_t { static const int value = 
NSIMD_MAX_LEN_BIT / 64; }; template <> struct max_len_t { static const int value = NSIMD_MAX_LEN_BIT / 64; }; #if NSIMD_CXX >= 2014 template constexpr int max_len = max_len_t::value; #endif } // namespace nsimd #endif /* ------------------------------------------------------------------------- */ /* Memory functions */ /* clang-format off */ #if NSIMD_CXX > 0 #include #include #include #endif /* clang-format on */ /* ------------------------------------------------------------------------- */ #if NSIMD_CXX > 0 extern "C" { #endif NSIMD_DLLSPEC void *nsimd_aligned_alloc(nsimd_nat); NSIMD_DLLSPEC void nsimd_aligned_free(void *); #if NSIMD_CXX > 0 } // extern "C" #endif /* ------------------------------------------------------------------------- */ /* C++ templated functions */ #if NSIMD_CXX > 0 namespace nsimd { NSIMD_INLINE void *aligned_alloc(nsimd_nat n) { return nsimd_aligned_alloc(n); } NSIMD_INLINE void aligned_free(void *ptr) { nsimd_aligned_free(ptr); } template T *aligned_alloc_for(nsimd_nat n) { return (T *)aligned_alloc(n * (nsimd_nat)sizeof(T)); } template void aligned_free_for(void *ptr) { return aligned_free((T *)ptr); } } // namespace nsimd #endif /* ------------------------------------------------------------------------- */ /* C++ <11 allocator */ #if NSIMD_CXX > 0 && NSIMD_CXX < 2011 namespace nsimd { template class allocator { public: typedef T value_type; typedef value_type *pointer; typedef const value_type *const_pointer; typedef value_type &reference; typedef const value_type &const_reference; typedef std::size_t size_type; typedef std::ptrdiff_t difference_type; public: template struct rebind { typedef allocator other; }; public: allocator() {} ~allocator() {} allocator(allocator const &) {} template inline explicit allocator(allocator const &) {} pointer address(reference r) { return &r; } const_pointer address(const_reference r) { return &r; } pointer allocate(size_type n) { return reinterpret_cast(aligned_alloc_for((nsimd_nat)n)); } pointer 
allocate(size_type n, const void *) { return allocate(n); } void deallocate(pointer p, size_type) { aligned_free_for(p); } size_type max_size() const { return size_type(-1) / sizeof(T); } void construct(pointer p, const T &t) { new (p) T(t); } void destroy(pointer p) { p->~T(); } bool operator==(allocator const &) { return true; } bool operator!=(allocator const &a) { return !operator==(a); } }; } // namespace nsimd #endif /* ------------------------------------------------------------------------- */ /* C++ >=11 allocator */ #if NSIMD_CXX >= 2011 namespace nsimd { template struct allocator { using value_type = T; allocator() = default; template allocator(allocator const &) {} T *allocate(std::size_t n) { if (n > std::size_t(-1) / sizeof(T)) { throw std::bad_alloc(); } T *ptr = aligned_alloc_for((nsimd_nat)n); if (ptr != NULL) { return ptr; } throw std::bad_alloc(); } void deallocate(T *ptr, std::size_t) { nsimd::aligned_free(ptr); } }; template bool operator==(allocator const &, allocator const &) { return true; } template bool operator!=(allocator const &, allocator const &) { return false; } } // namespace nsimd #endif /* ------------------------------------------------------------------------- */ /* scoped allocator */ #if NSIMD_CXX > 0 namespace nsimd { template struct scoped_aligned_mem_for { std::vector > data; template #if NSIMD_CXX >= 2020 requires std::integral #endif scoped_aligned_mem_for(I n) { data.resize(size_t(n)); } const T *get() const { return &data[0]; } T *get() { return &data[0]; } }; } // namespace nsimd #endif /* ------------------------------------------------------------------------- */ /* Conversion functions f16 <---> f32 for C but only when compiling with a */ /* host compiler. Otherwise we must have C++ linkage as fp16 types are */ /* defined as C++ classes . 
*/ #if NSIMD_CXX > 0 && !defined(NSIMD_CUDA) && !defined(NSIMD_ROCM) #define NSIMD_C_LINKAGE_FOR_F16 #endif #ifdef NSIMD_C_LINKAGE_FOR_F16 extern "C" { #endif NSIMD_DLLSPEC u16 nsimd_f32_to_u16(f32); NSIMD_DLLSPEC f32 nsimd_u16_to_f32(u16); #ifdef NSIMD_ARM_FP16 NSIMD_INLINE f16 nsimd_f32_to_f16(f32 a) { return (f16)a; } NSIMD_INLINE f32 nsimd_f16_to_f32(f16 a) { return (f32)a; } #elif (defined(NSIMD_CUDA) && __CUDACC_VER_MAJOR__ >= 10) || \ defined(NSIMD_ROCM) inline f16 nsimd_f32_to_f16(f32 a) { return __float2half(a); } inline f32 nsimd_f16_to_f32(f16 a) { return __half2float(a); } #elif defined(NSIMD_CUDA) && __CUDACC_VER_MAJOR__ < 10 inline f16 nsimd_f32_to_f16(f32 a) { u16 ret = nsimd_f32_to_u16(a); return *(__half *)&ret; } inline f32 nsimd_f16_to_f32(f16 a) { return nsimd_u16_to_f32(*(u16 *)&a); } #elif defined(NSIMD_ONEAPI) inline f16 nsimd_f32_to_f16(f32 a) { return static_cast(a); } inline f32 nsimd_f16_to_f32(f16 a) { return static_cast(a); } #else NSIMD_DLLSPEC f16 nsimd_f32_to_f16(f32); NSIMD_DLLSPEC f32 nsimd_f16_to_f32(f16); #endif #ifdef NSIMD_C_LINKAGE_FOR_F16 } // extern "C" #endif /* ------------------------------------------------------------------------- */ /* Conversion functions f16 <---> f32 for C++ */ #if NSIMD_CXX > 0 namespace nsimd { NSIMD_DLLSPEC u16 f32_to_u16(f32); NSIMD_DLLSPEC f32 u16_to_f32(u16); #ifdef NSIMD_ARM_FP16 NSIMD_INLINE f16 f32_to_f16(f32 a) { return (f16)a; } NSIMD_INLINE f32 f16_to_f32(f16 a) { return (f32)a; } #else NSIMD_DLLSPEC f16 f32_to_f16(f32); NSIMD_DLLSPEC f32 f16_to_f32(f16); #endif } // namespace nsimd #endif /* ------------------------------------------------------------------------- */ /* Helper to print scalar values, converts to bigger type */ NSIMD_INLINE u64 nsimd_to_biggest_u8(u8 a) { return (u64)a; } NSIMD_INLINE u64 nsimd_to_biggest_u16(u16 a) { return (u64)a; } NSIMD_INLINE u64 nsimd_to_biggest_u32(u32 a) { return (u64)a; } NSIMD_INLINE u64 nsimd_to_biggest_u64(u64 a) { return a; } NSIMD_INLINE 
i64 nsimd_to_biggest_i8(i8 a) { return (i64)a; } NSIMD_INLINE i64 nsimd_to_biggest_i16(i16 a) { return (i64)a; } NSIMD_INLINE i64 nsimd_to_biggest_i32(i32 a) { return (i64)a; } NSIMD_INLINE i64 nsimd_to_biggest_i64(i64 a) { return a; } NSIMD_INLINE f64 nsimd_to_biggest_f16(f16 a) { return (f64)nsimd_f16_to_f32(a); } NSIMD_INLINE f64 nsimd_to_biggest_f32(f32 a) { return (f64)a; } NSIMD_INLINE f64 nsimd_to_biggest_f64(f64 a) { return a; } #if NSIMD_CXX > 0 namespace nsimd { NSIMD_INLINE u64 to_biggest(u8 a) { return nsimd_to_biggest_u8(a); } NSIMD_INLINE u64 to_biggest(u16 a) { return nsimd_to_biggest_u16(a); } NSIMD_INLINE u64 to_biggest(u32 a) { return nsimd_to_biggest_u32(a); } NSIMD_INLINE u64 to_biggest(u64 a) { return nsimd_to_biggest_u64(a); } NSIMD_INLINE i64 to_biggest(i8 a) { return nsimd_to_biggest_i8(a); } NSIMD_INLINE i64 to_biggest(i16 a) { return nsimd_to_biggest_i16(a); } NSIMD_INLINE i64 to_biggest(i32 a) { return nsimd_to_biggest_i32(a); } NSIMD_INLINE i64 to_biggest(i64 a) { return nsimd_to_biggest_i64(a); } NSIMD_INLINE f64 to_biggest(f16 a) { return nsimd_to_biggest_f16(a); } NSIMD_INLINE f64 to_biggest(f32 a) { return nsimd_to_biggest_f32(a); } NSIMD_INLINE f64 to_biggest(f64 a) { return nsimd_to_biggest_f64(a); } } // namespace nsimd #endif /* ------------------------------------------------------------------------- */ /* General conversion for C++ */ #if NSIMD_CXX > 0 namespace nsimd { template struct to_helper { static T to(T, S value) { return (T)value; } }; template <> struct to_helper { static f16 to(f16, f16 value) { return value; } }; template struct to_helper { static f16 to(f16, S value) { return nsimd_f32_to_f16((f32)value); } }; template struct to_helper { static T to(T, f16 value) { return (T)nsimd_f16_to_f32(value); } }; template T to(S value) { return to_helper::to(T(), value); } } // namespace nsimd #endif /* ------------------------------------------------------------------------- */ /* SIMD-related functions */ /* clang-format 
off */ #if defined(NSIMD_IS_MSVC) /* We do not want MSVC to warn us about unary minus on an unsigned type. It is well defined in standards: unsigned arithmetic is done modulo 2^n. */ #pragma warning(push) #pragma warning(disable : 4146) #elif defined(NSIMD_IS_CLANG) && NSIMD_CXX < 2011 /* When compiling with Clang with C++98 or C++03, some Intel intrinsics are implemented as macros which contain long long but long long are not standard. To get rid of a lot of warnings we push the corresponding warning here. */ #pragma clang diagnostic push #pragma clang diagnostic ignored "-Wc++11-long-long" #elif defined(NSIMD_IS_GCC) && defined(NSIMD_SVE_FAMILY) /* Using SVE intrinsics svundef_XXX() is supposed to silence the -Wuninitialized warnings but it does not with GCC 10.0 up to GCC 10.2 so we silence the warning manually for now. */ #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wuninitialized" #elif defined(NSIMD_IS_GCC) && NSIMD_CXX > 0 && \ (defined(NSIMD_VMX) || defined(NSIMD_VSX)) /* When compiling POWERPC intrinsics inside C++ code with GCC we get tons of -Wunused-but-set-parameter. This is a GCC bug. For now we silence the warnings here.
*/ #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wunused-but-set-parameter" #pragma GCC diagnostic ignored "-Wunused-but-set-variable" #endif #include #if defined(NSIMD_IS_MSVC) #pragma warning(pop) #elif defined(NSIMD_IS_CLANG) && NSIMD_CXX < 2011 #pragma clang diagnostic pop #elif defined(NSIMD_IS_GCC) && defined(NSIMD_SVE_FAMILY) #pragma GCC diagnostic pop #elif defined(NSIMD_IS_GCC) && NSIMD_CXX > 0 && \ (defined(NSIMD_VMX) || defined(NSIMD_VSX)) #pragma GCC diagnostic pop #endif /* clang-format on */ /* ------------------------------------------------------------------------- */ /* If_else cannot be auto-generated */ #define vif_else(a0, a1, a2, typel, type) \ NSIMD_PP_CAT_4(nsimd_if_else1_, NSIMD_SIMD, _, type) \ (NSIMD_PP_CAT_6(nsimd_vreinterpretl_, NSIMD_SIMD, _, type, _, typel)(a0), \ a1, a2) #define vif_else_e(a0, a1, a2, typel, type, simd_ext) \ NSIMD_PP_CAT_4(nsimd_if_else1_, simd_ext, _, type) \ (NSIMD_PP_CAT_6(nsimd_vreinterpretl_, simd_ext, _, type, _, typel)(a0), a1, \ a2) #if NSIMD_CXX > 0 namespace nsimd { template NSIMD_REQUIRES(sizeof_v == sizeof_v) NSIMD_NSV(T, NSIMD_SIMD) if_else(NSIMD_NSVL(L, NSIMD_SIMD) a0, NSIMD_NSV(T, NSIMD_SIMD) a1, NSIMD_NSV(T, NSIMD_SIMD) a2, L, T) { return if_else1(reinterpretl(a0, L(), T(), NSIMD_SIMD()), a1, a2, T(), NSIMD_SIMD()); } template NSIMD_REQUIRES(sizeof_v == sizeof_v) NSIMD_NSV(T, SimdExt) if_else(NSIMD_NSVL(L, SimdExt) a0, NSIMD_NSV(T, SimdExt) a1, NSIMD_NSV(T, SimdExt) a2, L, T, SimdExt) { return if_else1(reinterpretl(a0, L(), T(), SimdExt()), a1, a2, T(), SimdExt()); } } // namespace nsimd #endif /* ------------------------------------------------------------------------- */ /* Loads/stores can be parametrized/templated by the alignment */ #define NSIMD_ALIGNED a #define NSIMD_UNALIGNED u #define vload(a0, type, alignment) \ NSIMD_PP_CAT_6(nsimd_load, alignment, _, NSIMD_SIMD, _, type)(a0) #define vload_e(a0, type, simd_ext, alignment) \ NSIMD_PP_CAT_6(nsimd_load, alignment, _, simd_ext, 
_, type)(a0) #define vload2(a0, type, alignment) \ NSIMD_PP_CAT_6(nsimd_load2, alignment, _, NSIMD_SIMD, _, type)(a0) #define vload2_e(a0, type, simd_ext, alignment) \ NSIMD_PP_CAT_6(nsimd_load2, alignment, _, simd_ext, _, type)(a0) #define vload3(a0, type, alignment) \ NSIMD_PP_CAT_6(nsimd_load3, alignment, _, NSIMD_SIMD, _, type)(a0) #define vload3_e(a0, type, simd_ext, alignment) \ NSIMD_PP_CAT_6(nsimd_load3, alignment, _, simd_ext, _, type)(a0) #define vload4(a0, type, alignment) \ NSIMD_PP_CAT_6(nsimd_load4, alignment, _, NSIMD_SIMD, _, type)(a0) #define vload4_e(a0, type, simd_ext, alignment) \ NSIMD_PP_CAT_6(nsimd_load4, alignment, _, simd_ext, _, type)(a0) #define vloadl(a0, type, alignment) \ NSIMD_PP_CAT_6(nsimd_loadl, alignment_, NSIMD_SIMD, _, type)(a0) #define vloadl_e(a0, type, simd_ext, alignment) \ NSIMD_PP_CAT_6(nsimd_loadl, alignment_, simd_ext, _, type)(a0) #define vstore(a0, a1, type, alignment) \ NSIMD_PP_CAT_6(nsimd_store, alignment, _, NSIMD_SIMD, _, type)(a0, a1) #define vstore_e(a0, a1, type, simd_ext, alignment) \ NSIMD_PP_CAT_6(nsimd_store, alignment, _, simd_ext, _, type)(a0, a1) #define vstore2(a0, a1, a2, type, alignment) \ NSIMD_PP_CAT_4(nsimd_store2, alignment, _, NSIMD_SIMD, _, type)(a0, a1, a2) #define vstore2_e(a0, a1, a2, type, simd_ext, alignment) \ NSIMD_PP_CAT_4(nsimd_store2, alignment, _, simd_ext, _, type)(a0, a1, a2) #define vstore3(a0, a1, a2, a3, type, alignment) \ NSIMD_PP_CAT_4(nsimd_store3, alignment, _, NSIMD_SIMD, _, type) \ (a0, a1, a2, a3) #define vstore3_e(a0, a1, a2, a3, type, simd_ext, alignment) \ NSIMD_PP_CAT_4(nsimd_store3, alignment, _, simd_ext, _, type)(a0, a1, a2, a3) #define vstore4(a0, a1, a2, a3, a4, type, alignment) \ NSIMD_PP_CAT_4(nsimd_store3, alignment, _, NSIMD_SIMD, _, type) \ (a0, a1, a2, a3, a4) #define vstore4_e(a0, a1, a2, a3, a4, type, simd_ext, alignment) \ NSIMD_PP_CAT_4(nsimd_store3, alignment, _, simd_ext, _, type) \ (a0, a1, a2, a3, a4) #define vstorel(a0, a1, type, alignment) \ 
NSIMD_PP_CAT_6(nsimd_storel, alignment, _, NSIMD_SIMD, _, type)(a0, a1) #define vstorel_e(a0, a1, type, simd_ext, alignment) \ NSIMD_PP_CAT_6(nsimd_storel, alignment, _, simd_ext, _, type)(a0, a1) #if NSIMD_CXX > 0 namespace nsimd { template NSIMD_NSV(T, NSIMD_SIMD) load(const T *ptr, T, aligned) { return loada(ptr, T(), NSIMD_SIMD()); } template NSIMD_NSV(T, NSIMD_SIMD) load(const T *ptr, T, unaligned) { return loadu(ptr, T(), NSIMD_SIMD()); } template NSIMD_NSV(T, SimdExt) load(const T *ptr, T, SimdExt, aligned) { return loada(ptr, T(), SimdExt()); } template NSIMD_NSV(T, SimdExt) load(const T *ptr, T, SimdExt, unaligned) { return loadu(ptr, T(), SimdExt()); } template NSIMD_NSVX2(T, NSIMD_SIMD) load2(const T *ptr, T, aligned) { return load2a(ptr, T(), NSIMD_SIMD()); } template NSIMD_NSVX2(T, NSIMD_SIMD) load2(const T *ptr, T, unaligned) { return load2u(ptr, T(), NSIMD_SIMD()); } template NSIMD_NSVX2(T, SimdExt) load2(const T *ptr, T, SimdExt, aligned) { return load2a(ptr, T(), SimdExt()); } template NSIMD_NSVX2(T, SimdExt) load2(const T *ptr, T, SimdExt, unaligned) { return load2u(ptr, T(), SimdExt()); } template NSIMD_NSVX3(T, NSIMD_SIMD) load3(const T *ptr, T, aligned) { return load3a(ptr, T(), NSIMD_SIMD()); } template NSIMD_NSVX3(T, NSIMD_SIMD) load3(const T *ptr, T, unaligned) { return load3u(ptr, T(), NSIMD_SIMD()); } template NSIMD_NSVX3(T, SimdExt) load3(const T *ptr, T, SimdExt, aligned) { return load3a(ptr, T(), SimdExt()); } template NSIMD_NSVX3(T, SimdExt) load3(const T *ptr, T, SimdExt, unaligned) { return load3u(ptr, T(), SimdExt()); } template NSIMD_NSVX4(T, NSIMD_SIMD) load4(const T *ptr, T, aligned) { return load4a(ptr, T(), NSIMD_SIMD()); } template NSIMD_NSVX4(T, NSIMD_SIMD) load4(const T *ptr, T, unaligned) { return load4u(ptr, T(), NSIMD_SIMD()); } template NSIMD_NSVX4(T, SimdExt) load4(const T *ptr, T, SimdExt, aligned) { return load4a(ptr, T(), SimdExt()); } template NSIMD_NSVX4(T, SimdExt) load4(const T *ptr, T, SimdExt, unaligned) { 
return load4u(ptr, T(), SimdExt()); } template NSIMD_NSVL(T, NSIMD_SIMD) loadlu(const T *ptr, T, aligned) { return loadla(ptr, T(), NSIMD_SIMD()); } template NSIMD_NSVL(T, NSIMD_SIMD) loadlu(const T *ptr, T, unaligned) { return loadlu(ptr, T(), NSIMD_SIMD()); } template NSIMD_NSVL(T, NSIMD_SIMD) loadlu(const T *ptr, T, SimdExt, aligned) { return loadla(ptr, T(), SimdExt()); } template NSIMD_NSVL(T, NSIMD_SIMD) loadlu(const T *ptr, T, SimdExt, unaligned) { return loadlu(ptr, T(), SimdExt()); } template void store(T *ptr, NSIMD_NSV(T, NSIMD_SIMD) a1, T, aligned) { storea(ptr, a1, T(), NSIMD_SIMD()); } template void store(T *ptr, NSIMD_NSV(T, NSIMD_SIMD) a1, T, unaligned) { storeu(ptr, a1, T(), NSIMD_SIMD()); } template void store(T *ptr, NSIMD_NSV(T, SimdExt) a1, T, SimdExt, aligned) { storea(ptr, a1, T(), SimdExt()); } template void store(T *ptr, NSIMD_NSV(T, SimdExt) a1, T, SimdExt, unaligned) { storeu(ptr, a1, T(), SimdExt()); } template void store2(T *ptr, NSIMD_NSV(T, NSIMD_SIMD) a1, NSIMD_NSV(T, NSIMD_SIMD) a2, T, aligned) { store2a(ptr, a1, a2, T(), NSIMD_SIMD()); } template void store2(T *ptr, NSIMD_NSV(T, NSIMD_SIMD) a1, NSIMD_NSV(T, NSIMD_SIMD) a2, T, unaligned) { store2u(ptr, a1, a2, T(), NSIMD_SIMD()); } template void store2(T *ptr, NSIMD_NSV(T, SimdExt) a1, NSIMD_NSV(T, SimdExt) a2, T, SimdExt, aligned) { store2a(ptr, a1, a2, T(), SimdExt()); } template void store2(T *ptr, NSIMD_NSV(T, SimdExt) a1, NSIMD_NSV(T, SimdExt) a2, T, SimdExt, unaligned) { store2u(ptr, a1, a2, T(), SimdExt()); } template void store3(T *ptr, NSIMD_NSV(T, NSIMD_SIMD) a1, NSIMD_NSV(T, NSIMD_SIMD) a2, NSIMD_NSV(T, NSIMD_SIMD) a3, T, aligned) { store3a(ptr, a1, a2, a3, T(), NSIMD_SIMD()); } template void store3(T *ptr, NSIMD_NSV(T, NSIMD_SIMD) a1, NSIMD_NSV(T, NSIMD_SIMD) a2, NSIMD_NSV(T, NSIMD_SIMD) a3, T, unaligned) { store3u(ptr, a1, a2, a3, T(), NSIMD_SIMD()); } template void store3(T *ptr, NSIMD_NSV(T, SimdExt) a1, NSIMD_NSV(T, SimdExt) a2, NSIMD_NSV(T, SimdExt) a3, T, SimdExt, 
aligned) { store3a(ptr, a1, a2, a3, T(), SimdExt()); } template void store3(T *ptr, NSIMD_NSV(T, SimdExt) a1, NSIMD_NSV(T, SimdExt) a2, NSIMD_NSV(T, SimdExt) a3, T, SimdExt, unaligned) { store3u(ptr, a1, a2, a3, T(), SimdExt()); } template void store4(T *ptr, NSIMD_NSV(T, NSIMD_SIMD) a1, NSIMD_NSV(T, NSIMD_SIMD) a2, NSIMD_NSV(T, NSIMD_SIMD) a3, NSIMD_NSV(T, NSIMD_SIMD) a4, T, aligned) { store4a(ptr, a1, a2, a3, a4, T(), NSIMD_SIMD()); } template void store4(T *ptr, NSIMD_NSV(T, NSIMD_SIMD) a1, NSIMD_NSV(T, NSIMD_SIMD) a2, NSIMD_NSV(T, NSIMD_SIMD) a3, NSIMD_NSV(T, NSIMD_SIMD) a4, T, unaligned) { store4u(ptr, a1, a2, a3, a4, T(), NSIMD_SIMD()); } template void store4(T *ptr, NSIMD_NSV(T, SimdExt) a1, NSIMD_NSV(T, SimdExt) a2, NSIMD_NSV(T, SimdExt) a3, NSIMD_NSV(T, SimdExt) a4, T, SimdExt, aligned) { store4a(ptr, a1, a2, a3, a4, T(), SimdExt()); } template void store4(T *ptr, NSIMD_NSV(T, SimdExt) a1, NSIMD_NSV(T, SimdExt) a2, NSIMD_NSV(T, SimdExt) a3, NSIMD_NSV(T, SimdExt) a4, T, SimdExt, unaligned) { store4u(ptr, a1, a2, a3, a4, T(), SimdExt()); } template void storel(T *ptr, NSIMD_NSV(T, NSIMD_SIMD) a1, T, aligned) { storela(ptr, a1, T(), NSIMD_SIMD()); } template void storel(T *ptr, NSIMD_NSV(T, NSIMD_SIMD) a1, T, unaligned) { storelu(ptr, a1, T(), NSIMD_SIMD()); } template void storel(T *ptr, NSIMD_NSV(T, SimdExt) a1, T, SimdExt, aligned) { storela(ptr, a1, T(), SimdExt()); } template void storel(T *ptr, NSIMD_NSV(T, SimdExt) a1, T, SimdExt, unaligned) { storelu(ptr, a1, T(), SimdExt()); } } // namespace nsimd #endif /* ------------------------------------------------------------------------- */ /* Scalar utilisties */ #include /* ------------------------------------------------------------------------- */ /* Some undefs */ #if NSIMD_CXX > 0 #undef NSIMD_NSV #undef NSIMD_NSVX2 #undef NSIMD_NSVX3 #undef NSIMD_NSVX4 #undef NSIMD_NSVL #endif /* ------------------------------------------------------------------------- */ /* isnan, isnormal and isinf functions */ 
/* Scalar classification helpers. All of them assume IEEE-754 bit layouts:
   f16 = 1 sign + 5 exponent + 10 mantissa bits, f32 = 1 + 8 + 23,
   f64 = 1 + 11 + 52. They return 1 (true) or 0 (false).
   NaN  <=> exponent all ones and mantissa nonzero.
   Inf  <=> exponent all ones and mantissa zero.
   The isnormal variants report 0 exactly for denormals (zero exponent,
   nonzero mantissa); note that, as written, zero, infinities and NaNs all
   report 1, consistently across the three precisions. */

NSIMD_INLINE int nsimd_isnan_f16(f16 a) {
  /* We assume IEEE representation for f16's */
  u16 b = nsimd_scalar_reinterpret_u16_f16(a);
  /* (b << 6) keeps only the 10 mantissa bits (in a u32 promotion) */
  if ((((((u32)b) >> 10) & 0x1F) == 0x1F) && ((((u32)b) << 6) != 0u)) {
    return 1;
  } else {
    return 0;
  }
}

NSIMD_INLINE int nsimd_isnan_f32(f32 a) {
  /* We assume IEEE representation for f32's */
  u32 b = nsimd_scalar_reinterpret_u32_f32(a);
  if ((((b >> 23) & 0xFF) == 0xFF) && ((b << 9) != 0u)) {
    return 1;
  } else {
    return 0;
  }
}

NSIMD_INLINE int nsimd_isnan_f64(f64 a) {
  /* We assume IEEE representation for f64's */
  u64 b = nsimd_scalar_reinterpret_u64_f64(a);
  if ((((b >> 52) & 0x7FF) == 0x7FF) && ((b << 12) != 0u)) {
    return 1;
  } else {
    return 0;
  }
}

NSIMD_INLINE int nsimd_isinf_f16(f16 a) {
  /* We assume IEEE representation for f16's */
  u16 b = nsimd_scalar_reinterpret_u16_f16(a);
  if ((((((u32)b) >> 10) & 0x1F) == 0x1F) && ((((u32)b) << 6) == 0u)) {
    return 1;
  } else {
    return 0;
  }
}

NSIMD_INLINE int nsimd_isinf_f32(f32 a) {
  /* We assume IEEE representation for f32's */
  u32 b = nsimd_scalar_reinterpret_u32_f32(a);
  if ((((b >> 23) & 0xFF) == 0xFF) && ((b << 9) == 0u)) {
    return 1;
  } else {
    return 0;
  }
}

NSIMD_INLINE int nsimd_isinf_f64(f64 a) {
  /* We assume IEEE representation for f64's */
  u64 b = nsimd_scalar_reinterpret_u64_f64(a);
  if ((((b >> 52) & 0x7FF) == 0x7FF) && ((b << 12) == 0u)) {
    return 1;
  } else {
    return 0;
  }
}

NSIMD_INLINE int nsimd_isnormal_f16(f16 a) {
  /* We assume IEEE representation for f16's.
     NOTE(fix): the negation below was missing, so this function returned 1
     *only* for denormals, i.e. exactly the opposite of
     nsimd_isnormal_f32/nsimd_isnormal_f64. It now follows the same
     "not a denormal" logic as the f32/f64 variants. */
  u16 b = nsimd_scalar_reinterpret_u16_f16(a);
  if (!((((((u32)b) >> 10) & 0x1F) == 0u) && ((((u32)b) << 6) != 0u))) {
    return 1;
  } else {
    return 0;
  }
}

NSIMD_INLINE int nsimd_isnormal_f32(f32 a) {
  /* We assume IEEE representation for f32's */
  u32 b = nsimd_scalar_reinterpret_u32_f32(a);
  if (!((((b >> 23) & 0xFF) == 0u) && ((b << 9) != 0u))) {
    return 1;
  } else {
    return 0;
  }
}

NSIMD_INLINE int nsimd_isnormal_f64(f64 a) {
  /* We assume IEEE representation for f64's */
  u64 b = nsimd_scalar_reinterpret_u64_f64(a);
  if (!((((b >> 52) & 0x7FF) == 0u)
&& ((b << 12) != 0u))) { return 1; } else { return 0; } } #if NSIMD_CXX > 0 namespace nsimd { NSIMD_INLINE int isnan(f16 a) { return nsimd_isnan_f16(a); } NSIMD_INLINE int isnan(f32 a) { return nsimd_isnan_f32(a); } NSIMD_INLINE int isnan(f64 a) { return nsimd_isnan_f64(a); } NSIMD_INLINE int isinf(f16 a) { return nsimd_isinf_f16(a); } NSIMD_INLINE int isinf(f32 a) { return nsimd_isinf_f32(a); } NSIMD_INLINE int isinf(f64 a) { return nsimd_isinf_f64(a); } NSIMD_INLINE int isnormal(f16 a) { return nsimd_isnormal_f16(a); } NSIMD_INLINE int isnormal(f32 a) { return nsimd_isnormal_f32(a); } NSIMD_INLINE int isnormal(f64 a) { return nsimd_isnormal_f64(a); } } // namespace nsimd #endif /* ------------------------------------------------------------------------- */ /* Difference in log UFP, returns an nat, see documentation for more infos */ #if NSIMD_CXX > 0 extern "C" { #endif NSIMD_DLLSPEC int nsimd_ufp_f16(f16, f16); NSIMD_DLLSPEC int nsimd_ufp_f32(f32, f32); NSIMD_DLLSPEC int nsimd_ufp_f64(f64, f64); #if NSIMD_CXX > 0 } // extern "C" #endif #if NSIMD_CXX > 0 namespace nsimd { NSIMD_INLINE int ufp(f16 a, f16 b) { return nsimd_ufp_f16(a, b); } NSIMD_INLINE int ufp(f32 a, f32 b) { return nsimd_ufp_f32(a, b); } NSIMD_INLINE int ufp(f64 a, f64 b) { return nsimd_ufp_f64(a, b); } } // namespace nsimd #endif /* ------------------------------------------------------------------------- */ /* Get last kernel parameter */ #if NSIMD_CXX > 0 extern "C" { #endif NSIMD_DLLSPEC nsimd_nat nsimd_kernel_param(nsimd_nat, nsimd_nat); #if NSIMD_CXX > 0 } // extern "C" #endif /* ------------------------------------------------------------------------- */ #endif ================================================ FILE: scripts/FindNSIMD.cmake ================================================ # MIT License # # Copyright (c) 2020 Agenium Scale # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to 
deal # in the Software without restriction, including without limitation the rights # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell # copies of the Software, and to permit persons to whom the Software is # furnished to do so, subject to the following conditions: # # The above copyright notice and this permission notice shall be included in all # copies or substantial portions of the Software. # # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. # #.rst: # FindNSIMD # --------- # # Find the NSIMD library, Agenium Scale's vectorization library. # # Result variables # ^^^^^^^^^^^^^^^^ # # This module will set the following variables in your project: # # ``NSIMD_INCLUDE_DIRS`` # where to find nsimd.h, etc. # ``NSIMD_LIBRARY_DIRS`` # where to find the library to link against to use NSIMD. # ``NSIMD_LIBRARIES`` # the library to link against to use NSIMD. # ``NSIMD_FOUND`` # If false, do not try to use NSIMD. 
if (NOT NSIMD_FOUND AND NOT DEFINED NSIMD_LIBRARIES)
  list(LENGTH NSIMD_FIND_COMPONENTS l)
  if ("${l}" STREQUAL "0")
    # No component requested: accept any SIMD variant of the library.
    find_library(NSIMD_LIBRARIES
                 NAMES nsimd_cpu nsimd_sse2 nsimd_sse42 nsimd_avx nsimd_avx2
                       nsimd_avx512_knl nsimd_avx512_skylake nsimd_neon128
                       nsimd_aarch64 nsimd_sve nsimd_sve128 nsimd_sve256
                       nsimd_sve512 nsimd_sve1024 nsimd_sve2048 nsimd_cuda
                       nsimd_rocm)
  elseif("${l}" STREQUAL "1")
    # Exactly one component: look only for that SIMD variant.
    list(GET NSIMD_FIND_COMPONENTS 0 simd_ext)
    find_library(NSIMD_LIBRARIES NAMES nsimd_${simd_ext})
  else()
    # Several components is a usage error. Report it even in QUIET mode:
    # previously the error was silenced by NSIMD_FIND_QUIETLY, which left
    # NSIMD_LIBRARIES undefined and let the success test below wrongly set
    # NSIMD_FOUND to TRUE with an empty library.
    message(FATAL_ERROR "cannot handle several components")
  endif()
endif()

if (NOT NSIMD_FOUND AND NOT DEFINED NSIMD_INCLUDE_DIRS)
  find_path(NSIMD_INCLUDE_DIRS NAMES nsimd/nsimd.h)
endif()

# Truthiness covers both the "<var>-NOTFOUND" failure value and a variable
# that was never set, which the former STREQUAL comparison did not.
if (NSIMD_INCLUDE_DIRS AND NSIMD_LIBRARIES)
  get_filename_component(NSIMD_LIBRARY_DIRS ${NSIMD_LIBRARIES} DIRECTORY)
  if (NOT NSIMD_FIND_QUIETLY)
    message(STATUS "[include dir = ${NSIMD_INCLUDE_DIRS}]"
                   " [library = ${NSIMD_LIBRARIES}]")
  endif()
  set(NSIMD_FOUND TRUE)
else()
  # Build a diagnostic describing which of the two lookups failed.
  if (NOT DEFINED NSIMD_INCLUDE_DIRS)
    set(msg "[cannot determine include dir]")
  else()
    set(msg "[include dir = ${NSIMD_INCLUDE_DIRS}]")
  endif()
  if (NOT DEFINED NSIMD_LIBRARIES)
    set(msg "${msg} [cannot determine library dir]")
  else()
    set(msg "${msg} [library = ${NSIMD_LIBRARIES}]")
  endif()
  # Per CMake find-module conventions, QUIET suppresses status output but
  # must never suppress the fatal error mandated by REQUIRED.
  if (NSIMD_FIND_REQUIRED)
    message(FATAL_ERROR "${msg}")
  elseif (NOT NSIMD_FIND_QUIETLY)
    message(STATUS "${msg}")
  endif()
  set(NSIMD_FOUND FALSE)
endif()

================================================
FILE: scripts/aarch64-linux-gnu-clang++.sh
================================================
#!/bin/bash
clang++ --target=aarch64-linux-gnu "$@"

================================================
FILE: scripts/aarch64-linux-gnu-clang.sh
================================================
#!/bin/bash
clang --target=aarch64-linux-gnu "$@"

================================================
FILE: scripts/build-tests.bat
================================================ @echo off REM Copyright (c) 2020 Agenium Scale REM REM Permission is hereby granted, free of charge, to any person obtaining a copy REM of this software and associated documentation files (the "Software"), to deal REM in the Software without restriction, including without limitation the rights REM to use, copy, modify, merge, publish, distribute, sublicense, and/or sell REM copies of the Software, and to permit persons to whom the Software is REM furnished to do so, subject to the following conditions: REM REM The above copyright notice and this permission notice shall be included in all REM copies or substantial portions of the Software. REM REM THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR REM IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, REM FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE REM AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER REM LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, REM OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE REM SOFTWARE. REM ########################################################################### setlocal EnableDelayedExpansion pushd "%~dp0" REM ########################################################################### set BUILD_BAT="%CD%\build.bat" set HATCH_PY="%CD%\..\egg\hatch.py" set NSCONFIG="%CD%\..\nstools\bin\nsconfig.exe" set BUILD_ROOT="%CD%\.." 
REM ###########################################################################
REM Run build.bat (forwards all arguments; it validates them and builds the
REM library once per SIMD extension per compiler)

call %BUILD_BAT% %*
if errorlevel 1 goto end_nok

REM ###########################################################################
REM Generate NSIMD tests

python %HATCH_PY% -tf
if errorlevel 1 goto end_nok

REM ###########################################################################
REM Build tests (checks on command line arguments have been done by build.bat)

set SIMD_EXTS_ARG=%2
set SIMD_EXTS=%SIMD_EXTS_ARG:/=,%
if "%3" == "" (
  REM Default must match build.bat's default (msvc), otherwise the default
  REM invocation builds the library in build-<simd>-msvc but then configures
  REM the tests with -suite=cl in a different build-<simd>-cl directory.
  set COMPILER_ARG=msvc
) else (
  set COMPILER_ARG=%4
)
set COMPILERS=%COMPILER_ARG:/=,%

for %%g in (%COMPILERS%) do (
  for %%h in (%SIMD_EXTS%) do (
    set BUILD_DIR=%BUILD_ROOT%\build-%%h-%%g
    if exist !BUILD_DIR! rd /Q /S !BUILD_DIR!
    md !BUILD_DIR!
    pushd !BUILD_DIR!
    %NSCONFIG% .. -Dsimd=%%h -suite=%%g
    REM targets.txt (optional) contains regexes restricting which test
    REM targets to build; otherwise build the whole "tests" target.
    if exist %BUILD_ROOT%\targets.txt (
      set "TS= "
      for /F %%k in ('type %BUILD_ROOT%\targets.txt') do (
        ninja -t targets all | findstr /R "^tests" | findstr /R "%%k" ^
          >_targets.txt
        for /F %%l in ('type _targets.txt') do (
          set TMP1=%%l
          REM Strip the trailing colon ninja prints after each target name.
          set T=!TMP1::=!
          set TS=!TS! !T!
        )
      )
    ) else (
      set TS=tests
    )
    echo *** !TS!
    ninja !TS!
popd ) ) REM ########################################################################### :end_ok popd endlocal exit /B 0 :end_nok popd endlocal exit /B 1 ================================================ FILE: scripts/build-tests.sh ================================================ #!/bin/bash # Copyright (c) 2021 Agenium Scale # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal # in the Software without restriction, including without limitation the rights # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell # copies of the Software, and to permit persons to whom the Software is # furnished to do so, subject to the following conditions: # # The above copyright notice and this permission notice shall be included in all # copies or substantial portions of the Software. # # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. ############################################################################### cd `dirname $0` #set -x set -e ############################################################################### # Init BUILD_SH="${PWD}/build.sh" HATCH_PY="${PWD}/../egg/hatch.py" BUILD_ROOT="${PWD}/.." 
############################################################################### # Generate NSIMD tests python3 --version 1>/dev/null 2>/dev/null && \ python3 "${HATCH_PY}" -tf || \ python "${HATCH_PY}" -tf ############################################################################### # Run build.sh bash "${BUILD_SH}" "$@" || exit 1 ############################################################################### # Parse command line arguments (check has been done by build.sh) SIMD_EXTS=`echo "${2}" | sed -e 's,/, ,g'` if [ "${3}" == "" ]; then COMPILER_ARG="gcc" else COMPILER_ARG="${4}" fi COMPILERS=`echo ${COMPILER_ARG} | sed 's,/, ,g'` ############################################################################### # Build tests for compiler in ${COMPILERS}; do for simd_ext in ${SIMD_EXTS}; do BUILD_DIR="${BUILD_ROOT}/build-${simd_ext}-${compiler}" if [ -e "${BUILD_ROOT}/targets.txt" ]; then GLOBS=`cat ${BUILD_ROOT}/targets.txt | tr '\n' '|' | sed 's/|$//g'` TARGETS=`(cd ${BUILD_DIR} && ninja -t targets all | grep -E '^tests.') \ | sed 's/:.*//g' | grep -E "(${GLOBS})" | tr '\n' ' '` else TARGETS="tests" fi (cd "${BUILD_DIR}" && ninja ${TARGETS}) done done ================================================ FILE: scripts/build.bat ================================================ @echo off REM Copyright (c) 2020 Agenium Scale REM REM Permission is hereby granted, free of charge, to any person obtaining a copy REM of this software and associated documentation files (the "Software"), to deal REM in the Software without restriction, including without limitation the rights REM to use, copy, modify, merge, publish, distribute, sublicense, and/or sell REM copies of the Software, and to permit persons to whom the Software is REM furnished to do so, subject to the following conditions: REM REM The above copyright notice and this permission notice shall be included in all REM copies or substantial portions of the Software. 
REM REM THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR REM IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, REM FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE REM AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER REM LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, REM OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE REM SOFTWARE. REM ########################################################################### setlocal EnableDelayedExpansion pushd "%~dp0" REM ########################################################################### REM Init set SETUP_BAT="%CD%\setup.bat" set NSCONFIG="%CD%\..\nstools\bin\nsconfig.exe" set HATCH_PY="%CD%\..\egg\hatch.py" set BUILD_ROOT="%CD%\.." REM ########################################################################### REM Run setup call %SETUP_BAT% if errorlevel 1 goto end_nok REM ########################################################################### REM Generate NSIMD python %HATCH_PY% -lf if errorlevel 1 goto end_nok REM ########################################################################### REM Check/parse command line arguments if "%1" == "" ( echo %0: usage: %0 for simd_ext1/.../simd_ext2 [with compiler1/.../compiler2] goto end_nok ) if not "%1" == "for" ( echo ERROR: expected 'for' as first argument goto end_nok ) if "%2" == "" ( echo "ERROR: no SIMD extension given" goto end_nok ) set SIMD_EXTS_ARG=%2 set SIMD_EXTS=%SIMD_EXTS_ARG:/=,% if "%3" == "" ( set COMPILER_ARG=msvc ) else ( if "%3" == "with" ( if "%4" == "" ( echo "ERROR: no compiler given after with" goto end_nok ) set COMPILER_ARG=%4 ) else ( echo ERROR: expected 'with' as fourth argument goto end_nok ) ) set COMPILERS=%COMPILER_ARG:/=,% REM ########################################################################### REM Build NSIMD : one build directory per SIMD extension per compiler for %%g 
in (%COMPILERS%) do ( for %%h in (%SIMD_EXTS%) do ( set BUILD_DIR=%BUILD_ROOT%\build-%%h-%%g if exist !BUILD_DIR! rd /Q /S !BUILD_DIR! md !BUILD_DIR! pushd !BUILD_DIR! %NSCONFIG% .. -Dsimd=%%h -suite=%%g ninja popd ) ) REM ########################################################################### :end_ok popd endlocal exit /B 0 :end_nok popd endlocal exit /B 1 ================================================ FILE: scripts/build.sh ================================================ #!/bin/bash # Copyright (c) 2021 Agenium Scale # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal # in the Software without restriction, including without limitation the rights # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell # copies of the Software, and to permit persons to whom the Software is # furnished to do so, subject to the following conditions: # # The above copyright notice and this permission notice shall be included in all # copies or substantial portions of the Software. # # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. ############################################################################### cd `dirname $0` set -x set -e ############################################################################### # Init SETUP_SH="${PWD}/setup.sh" NSCONFIG="${PWD}/../nstools/nsconfig/nsconfig" HATCH_PY="${PWD}/../egg/hatch.py" BUILD_ROOT="${PWD}/.." 
############################################################################### # Run setup bash "${SETUP_SH}" ############################################################################### # Generate NSIMD python3 --version 1>/dev/null 2>/dev/null && \ python3 "${HATCH_PY}" -lf || \ python "${HATCH_PY}" -lf ############################################################################### # Check/parse command line arguments if [ "${1}" == "" ]; then echo "$0: usage: $0 for simd_ext1,...,simd_ext2 [with compiler]" exit 0 fi if [ "${1}" != "for" ]; then echo "ERROR: expected 'for' as first argument" exit 1 fi if [ "${2}" == "" ]; then echo "ERROR: no SIMD extension given after 'for'" exit 1 fi SIMD_EXTS=`echo "${2}" | sed -e 's,/, ,g'` if [ "${3}" == "" ]; then COMPILER_ARG="gcc" elif [ "${3}" == "with" ]; then if [ "${4}" == "" ]; then echo "ERROR: no compiler given after 'with'" exit 1 fi COMPILER_ARG="${4}" else echo "ERROR: expected 'with' as fourth argument" exit 1 fi COMPILERS=`echo ${COMPILER_ARG} | sed 's,/, ,g'` ############################################################################### # Build NSIMD : one build directory per SIMD extension per compiler for compiler in ${COMPILERS}; do for simd_ext in ${SIMD_EXTS}; do BUILD_DIR="${BUILD_ROOT}/build-${simd_ext}-${compiler}" rm -rf "${BUILD_DIR}" mkdir -p "${BUILD_DIR}" (cd "${BUILD_DIR}" && \ "${NSCONFIG}" .. 
-Dsimd=${simd_ext} -suite=${compiler}) (cd "${BUILD_DIR}" && ninja) done done ================================================ FILE: scripts/ci-clang.txt ================================================ camelot.numscale.com (sse2-sse42-clang) - bash scripts/build-tests.sh for sse2/sse42 with clang - cd build-sse2-clang - ../nstools/bin/nstest -j80 - cd ../build-sse42-clang - ../nstools/bin/nstest -j80 gaunes.numscale.com (avx-avx2-clang) - bash scripts/build-tests.sh for avx/avx2 with clang - cd build-avx-clang - ../nstools/bin/nstest -j80 - cd ../build-avx2-clang - ../nstools/bin/nstest -j80 caradigan.numscale.com (aarch64-clang-1) - bash scripts/setup.sh - python3 egg/hatch.py -ltf - mkdir build-aarch64-clang - cd build-aarch64-clang - ../nstools/bin/nsconfig .. -Dsimd=aarch64 -comp=clang - ninja tests.c99 tests.cpp98 tests.cpp11 - ../nstools/bin/nstest -j80 carahes.numscale.com (aarch64-clang-2) - bash scripts/setup.sh - python3 egg/hatch.py -ltf - mkdir build-aarch64-clang - cd build-aarch64-clang - ../nstools/bin/nsconfig .. -Dsimd=aarch64 -comp=clang - ninja tests.c99 tests.cpp98 tests.cpp11 - ../nstools/bin/nstest -j80 camlann.numscale.com (aarch64-clang-3) - bash scripts/setup.sh - python3 egg/hatch.py -ltf - mkdir build-aarch64-clang - cd build-aarch64-clang - ../nstools/bin/nsconfig .. -Dsimd=aarch64 -comp=clang - ninja tests.c99 tests.cpp98 tests.cpp11 - ../nstools/bin/nstest -j80 ================================================ FILE: scripts/ci-scale.txt ================================================ camelot.hpc.scale {/home/gquintin} - mkdir cmake-build-sse2 - cd cmake-build-sse2 - cmake .. -Dsimd=sse2 - make -j10 - cd .. - mkdir cmake-build-sse42 - cd cmake-build-sse42 - cmake .. -Dsimd=sse42 - make -j10 - cd .. 
- bash scripts/build-tests.sh for sse2/sse42 with gcc - cd build-sse2-gcc - ../nstools/bin/nstest -j80 - cd ../build-sse42-gcc - ../nstools/bin/nstest -j80 glastonbury.hpc.scale {/home/gquintin} - source /etc/profile.d/modules.sh - module load cmake/3.1.0 - mkdir cmake-build-avx512_skylake - cd cmake-build-avx512_skylake - cmake .. -Dsimd=avx512_skylake - make -j10 - cd .. - bash scripts/build-tests.sh for avx512_skylake with gcc - cd build-avx512_skylake-gcc - ../nstools/bin/nstest -j40 carduel.hpc.scale {/home/gquintin} - source /etc/profile.d/profile.sh - module load cmake/3.1.0 - mkdir cmake-build-avx512_knl - cd cmake-build-avx512_knl - cmake .. -Dsimd=avx512_knl - make -j10 - cd .. - bash scripts/build-tests.sh for avx512_knl with gcc - cd build-avx512_knl-gcc - ../nstools/bin/nstest -j80 gaunes.hpc.scale {/home/gquintin} - mkdir cmake-build-avx - cd cmake-build-avx - cmake .. -Dsimd=avx - make -j10 - cd .. - mkdir cmake-build-avx2 - cd cmake-build-avx2 - cmake .. -Dsimd=avx2 - make -j10 - cd .. - bash scripts/build-tests.sh for avx/avx2 with gcc - cd build-avx-gcc - ../nstools/bin/nstest -j80 - cd ../build-avx2-gcc - ../nstools/bin/nstest -j80 - cd .. - mkdir cmake-build-armel - cd cmake-build-armel - cmake .. -Dsimd=neon128 -DCMAKE_CXX_COMPILER=arm-linux-gnueabi-gcc - make -j10 - cd .. - mkdir build-neon128-gcc - cd build-neon128-gcc - ../nstools/bin/nsconfig .. -Dsimd=neon128 -comp=cc,gcc,arm-linux-gnueabi-gcc,5,armel -comp=c++,gcc,arm-linux-gnueabi-g++,5,armel - ninja tests - ../nstools/bin/nstest -j80 --prefix="qemu-arm" logres.hpc.scale {/home/gquintin} - mkdir cmake-build-cpu - cd cmake-build-cpu - cmake .. -Dsimd=cpu - make -j10 - cd .. - bash scripts/build-tests.sh for cpu with gcc - cd build-cpu-gcc - ../nstools/bin/nstest -j80 - export PATH=${PATH}:/usr/local/cuda/bin - export LD_LIBRARY_PATH=/usr/local/cuda/lib64 - mkdir ../build-cuda-nvcc - cd ../build-cuda-nvcc - ../nstools/bin/nsconfig .. 
-Dsimd=cuda -Dcuda_arch_flags=-msm_75 -suite=cuda - ninja tests - ../nstools/bin/nstest -j20 bowden.hpc.scale {/home/gquintin} - bash scripts/build-tests.sh for rocm with rocm - cd build-rocm-rocm - ../nstools/bin/nstest -j80 - cd .. - mkdir build-cpp20 - source /etc/profile.d/profile.sh - module load gcc/10.2.0 - cd build-cpp20 - ../nstools/bin/nsconfig .. -Dsimd=sse42 -suite=gcc - ninja tests.cpp20 - ../nstools/bin/nstest -j80 - cd .. - bash tests/FindNSIMD.cmake.sh caradigan.hpc.scale {/home/gquintin} - mkdir cmake-build-aarch64 - cd cmake-build-aarch64 - cmake .. -Dsimd=aarch64 - make -j10 - cd .. - bash scripts/build-tests.sh for aarch64 with gcc - cd build-aarch64-gcc - ../nstools/bin/nstest -j80 - cd .. - mkdir cmake-build-neon128 - cd cmake-build-neon128 - cmake .. -Dsimd=neon128 -DCMAKE_CXX_COMPILER=arm-linux-gnueabihf-gcc -DNSIMD_ARM32_IS_ARMEL=OFF - make -j10 - cd .. - mkdir build-neon128-gcc - cd build-neon128-gcc - ../nstools/bin/nsconfig .. -Dsimd=neon128 -comp=cc,gcc,arm-linux-gnueabihf-gcc,5,armhf -comp=c++,gcc,arm-linux-gnueabihf-g++,5,armhf - ninja tests - ../nstools/bin/nstest -j80 carahes.hpc.scale {/home/gquintin} - source /etc/profile.d/profile.sh - module load gcc/10.2.0 - mkdir cmake-build-sve128 - cd cmake-build-sve128 - cmake .. -Dsimd=sve128 - make -j10 - cd .. - bash scripts/build-tests.sh for sve128 with gcc - cd build-sve128-gcc - module load qemu/4.2.0 - ../nstools/bin/nstest -j80 --prefix="qemu-aarch64 -cpu max,sve-max-vq=1" WIN.gorre2 {/home/gquintin} ["C:\Program Files (x86)\Microsoft Visual Studio\2019\Professional\VC\Auxiliary\Build\vcvars64.bat"] - setlocal - call "C:\Program Files (x86)\Microsoft Visual Studio 14.0\VC\vcvarsall.bat" x86 - set PATH=%PATH%;C:\Program Files (x86)\CMake\bin - md cmake-build32-sse2 - cd cmake-build32-sse2 - cmake .. -Dsimd=sse2 -DCMAKE_CXX_COMPILER=cl -G "NMake Makefiles" - nmake - cd .. - md cmake-build32-sse42 - cd cmake-build32-sse42 - cmake .. 
-Dsimd=sse42 -DCMAKE_CXX_COMPILER=cl -G "NMake Makefiles" - nmake - cd .. - md cmake-build32-avx - cd cmake-build32-avx - cmake .. -Dsimd=avx -DCMAKE_CXX_COMPILER=cl -G "NMake Makefiles" - nmake - cd .. - md cmake-build32-avx2 - cd cmake-build32-avx2 - cmake .. -Dsimd=avx2 -DCMAKE_CXX_COMPILER=cl -G "NMake Makefiles" - nmake - cd .. - call scripts\build for sse2/sse42/avx/avx2 with msvc - endlocal - setlocal - call "C:\Program Files (x86)\Microsoft Visual Studio\2019\Professional\VC\Auxiliary\Build\vcvars64.bat" - md cmake-build64-sse2 - cd cmake-build64-sse2 - cmake .. -Dsimd=sse2 -DCMAKE_CXX_COMPILER=cl -G "NMake Makefiles" - nmake - cd .. - md cmake-build64-sse42 - cd cmake-build64-sse42 - cmake .. -Dsimd=sse42 -DCMAKE_CXX_COMPILER=cl -G "NMake Makefiles" - nmake - cd .. - md cmake-build64-avx - cd cmake-build64-avx - cmake .. -Dsimd=avx -DCMAKE_CXX_COMPILER=cl -G "NMake Makefiles" - nmake - cd .. - md cmake-build64-avx2 - cd cmake-build64-avx2 - cmake .. -Dsimd=avx2 -DCMAKE_CXX_COMPILER=cl -G "NMake Makefiles" - nmake - cd .. - call scripts\build-tests for avx2 with msvc - cd build-avx2-msvc - ..\nstools\bin\nstest -j60 - endlocal couillere {/Users/gquintin} - export PATH=${PATH}:/opt/homebrew/bin - python3 egg/hatch.py -ltf - bash scripts/setup.sh - mkdir build-aarch64-xcode - cd build-aarch64-xcode - ../nstools/bin/nsconfig .. -Dsimd=aarch64 -suite=llvm -Dmpfr="-I/opt/homebrew/include -L/opt/homebrew/lib -lmpfr" - ninja - ninja tests - ../nstools/bin/nstest -j16 ================================================ FILE: scripts/ci-test.txt ================================================ couillere {/Users/gquintin} - export PATH=${PATH}:/opt/homebrew/bin - python3 egg/hatch.py -ltf - bash scripts/setup.sh - mkdir build-aarch64-xcode - cd build-aarch64-xcode - ../nstools/bin/nsconfig .. 
-Dsimd=aarch64 -suite=llvm -Dmpfr="-I/opt/homebrew/include -L/opt/homebrew/lib -lmpfr" - ninja - ninja tests - ../nstools/bin/nstest -j16 ================================================ FILE: scripts/ci.sh ================================================ #!/bin/sh # Copyright (c) 2021 Agenium Scale # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal # in the Software without restriction, including without limitation the rights # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell # copies of the Software, and to permit persons to whom the Software is # furnished to do so, subject to the following conditions: # # The above copyright notice and this permission notice shall be included in all # copies or substantial portions of the Software. # # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. 
############################################################################### # Argument parsing if [ "$2" == "" ]; then echo "ERROR: usage: $0 JOBS_FILE NSTOOLS_CHECKOUT_LAST_COMMIT" exit 1 fi JOBS_FILE="`realpath $1`" NSIMD_NSTOOLS_CHECKOUT_LATER="$2" cd `dirname $0` #set -x set -e ############################################################################### # Init SSH="ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null \ -o LogLevel=error" SCP="scp -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null \ -o LogLevel=error" GIT_URL=`git remote get-url origin` GIT_BRANCH=`git rev-parse --abbrev-ref HEAD` TMP_DIR="${PWD}/../_ci" ONE_LINER_C="${PWD}/../scripts/one-liner.c" SSHJOB_C="${PWD}/../nstools/sshjob/sshjob.c" # Empty tmp directory if [ -f "${JOBS_FILE}" ]; then rm -rf "${TMP_DIR}" mkdir -p "${TMP_DIR}" fi ############################################################################### # Build jobs scripts if [ -f "${JOBS_FILE}" ]; then CURRENT_JOB="" DESC="" REMOTE_HOST="Linux" while read -r line; do # Empty lines if [ "`echo ${line} | sed 's/[ \t]*//g'`" == "" ]; then continue fi # Comments if [ "`echo ${line} | cut -c 1`" == "#" ]; then continue fi if [ "`echo ${line} | cut -c 1`" == "-" ]; then echo "`echo ${line} | cut -c 2- | sed 's/^ *//g'`" >>"${CURRENT_JOB}" echo >>"${CURRENT_JOB}" else ADDR=`echo ${line} | sed -e 's/<.*//g' -e 's/ *//g'` DESC=`echo ${line} | sed -e 's/.*.*//g'` REMOTE_DIR=`echo ${line} | sed -e 's/.*{//g' -e 's/}.*//g'` EXTRA=`echo ${line} | sed -e 's/.*\[//g' -e 's/].*//g'` REMOTE_HOST=`echo ${ADDR} | head -c 4` echo ${REMOTE_DIR} >"${TMP_DIR}/${ADDR}--${DESC}.work.dir" if [ "${REMOTE_HOST}" == "WIN." 
]; then CURRENT_JOB="${TMP_DIR}/${ADDR}--${DESC}.bat" # <-- this must be before ADDR="`echo ${ADDR} | tail -c +5`" # <-- this REMOTE_HOST="Windows" cat >"${CURRENT_JOB}" <<-EOF @echo off setlocal pushd "%~dp0" set NSTOOLS_CHECKOUT_LAST_COMMIT="${NSTOOLS_CHECKOUT_LAST_COMMIT}" if exist ci-nsimd-${DESC} rd /Q /S ci-nsimd-${DESC} git clone ${GIT_URL} ci-nsimd-${DESC} git -C ci-nsimd-${DESC} checkout ${GIT_BRANCH} pushd ci-nsimd-${DESC} REM ---------------------------------------------------------------- REM User commands from here EOF # On Windows we need a native compiler. On Linux we have cc in the # PATH but on Windows we have nothing. We need a MSVC but there is # no easy way to find one. So we parse what is between [...] which # contains the path to the vcvarsall.bat script to load the compiler cat >"${TMP_DIR}/${ADDR}--${DESC}-native-cl" <<-EOF @echo off setlocal call ${EXTRA} cl %* exit /B %ERRORLEVEL% EOF else CURRENT_JOB="${TMP_DIR}/${ADDR}--${DESC}.sh" REMOTE_HOST="Linux" cat >"${CURRENT_JOB}" <<-EOF #!/bin/sh cd \`dirname \$0\` set -e export NSTOOLS_CHECKOUT_LAST_COMMIT="${NSTOOLS_CHECKOUT_LAST_COMMIT}" rm -rf ci-nsimd-${DESC} git clone ${GIT_URL} ci-nsimd-${DESC} git -C ci-nsimd-${DESC} checkout ${GIT_BRANCH} cd ci-nsimd-${DESC} # ------------------------------------------------------------------ # User commands from here EOF fi fi done <"${JOBS_FILE}" fi ############################################################################### # Launch jobs if [ -f "${JOBS_FILE}" ]; then echo "-- NSIMD CI" echo "-- " echo "-- Initialization:" for job in `find ${TMP_DIR} -iregex '.*\.\(bat\|sh\)'`; do ADDR=`basename ${job} | \ sed -e 's/\.sh$//g' -e 's/\.bat$//g' -e 's/--.*//g'` DESC=`basename ${job} | \ sed -e 's/\.sh$//g' -e 's/\.bat$//g' -e 's/.*--//g'` REMOTE_DIR="`cat ${TMP_DIR}/${ADDR}--${DESC}.work.dir`" W_REMOTE_DIR="`echo ${REMOTE_DIR} | tr / \\\\\\`" REMOTE_HOST=`echo ${ADDR} | head -c 4` if [ "${REMOTE_HOST}" == "WIN." 
]; then REMOTE_HOST="Windows" ADDR="`echo ${ADDR} | tail -c +5`" else REMOTE_HOST="Linux" fi echo "-- Found new job: ${DESC}" echo "-- Remote machine will be: ${ADDR}" if [ "${REMOTE_HOST}" == "Windows" ]; then echo "-- Working directory will be: ${W_REMOTE_DIR}" ${SSH} ${ADDR} if not exist ${W_REMOTE_DIR} md ${W_REMOTE_DIR} else echo "-- Working directory will be: ${REMOTE_DIR}" ${SSH} ${ADDR} mkdir -p ${REMOTE_DIR} fi echo "-- Launching commands" if [ "${REMOTE_HOST}" == "Windows" ]; then ${SCP} ${job} ${ADDR}:${W_REMOTE_DIR} ${SCP} ${ONE_LINER_C} ${ADDR}:${W_REMOTE_DIR} ${SCP} ${SSHJOB_C} ${ADDR}:${W_REMOTE_DIR} ${SCP} ${TMP_DIR}/${ADDR}--${DESC}-native-cl \ ${ADDR}:${W_REMOTE_DIR}\\native-cl.bat ${SSH} ${ADDR} "cd ${W_REMOTE_DIR} & \ native-cl /Ox /W3 /D_CRT_SECURE_NO_WARNINGS one-liner.c" ${SSH} ${ADDR} "cd ${W_REMOTE_DIR} & \ native-cl /Ox /W3 /D_CRT_SECURE_NO_WARNINGS sshjob.c" ${SSH} ${ADDR} "cd ${W_REMOTE_DIR} & \ sshjob run \"`basename ${job}` 2>&1 | \ one-liner ci-nsimd-${DESC}-output.txt \ ci-nsimd-${DESC}-one-liner.txt\"" \ | sed 's/\r//g' >${TMP_DIR}/ci-nsimd-${DESC}-pid.txt else ${SCP} ${job} ${ADDR}:${REMOTE_DIR} ${SCP} ${ONE_LINER_C} ${ADDR}:${REMOTE_DIR} ${SCP} ${SSHJOB_C} ${ADDR}:${REMOTE_DIR} ${SSH} ${ADDR} "cd ${REMOTE_DIR} && cc -O2 one-liner.c -o one-liner" ${SSH} ${ADDR} "cd ${REMOTE_DIR} && cc -O2 sshjob.c -o sshjob" ${SSH} ${ADDR} "cd ${REMOTE_DIR} && \ ./sshjob run \"bash `basename ${job}` 2>&1 | \ ./one-liner ci-nsimd-${DESC}-output.txt \ ci-nsimd-${DESC}-one-liner.txt\"" \ >${TMP_DIR}/ci-nsimd-${DESC}-pid.txt fi done sleep 2 fi ############################################################################### # Build associative arrays REMOTE_HOST_A="" ADDR_A="" DESC_A="" ONE_LINER_A="" KILL_COMMAND_A="" LOG_A="" N=0 for job in `find ${TMP_DIR} -iregex '.*\.\(bat\|sh\)'`; do ADDR=`basename ${job} | \ sed -e 's/\.sh$//g' -e 's/\.bat$//g' -e 's/--.*//g'` DESC=`basename ${job} | \ sed -e 's/\.sh$//g' -e 's/\.bat$//g' -e 's/.*--//g'` 
REMOTE_DIR="`cat ${TMP_DIR}/${ADDR}--${DESC}.work.dir`" W_REMOTE_DIR="`echo ${REMOTE_DIR} | tr / \\\\\\`" LOG="${REMOTE_DIR}/ci-nsimd-${DESC}-output.txt" REMOTE_HOST="`echo ${ADDR} | head -c 4`" PID="`sed -e 's/\r//g' ${TMP_DIR}/ci-nsimd-${DESC}-pid.txt`" if [ "${REMOTE_HOST}" == "WIN." ]; then REMOTE_HOST="Windows" ADDR="`echo ${ADDR} | tail -c +5`" ONE_LINER="${W_REMOTE_DIR}\\ci-nsimd-${DESC}-one-liner.txt" KILL_COMMAND="${W_REMOTE_DIR}\\sshjob kill ${PID}" else REMOTE_HOST="Linux" ONE_LINER="${REMOTE_DIR}/ci-nsimd-${DESC}-one-liner.txt" KILL_COMMAND="${REMOTE_DIR}/sshjob kill ${PID}" fi ADDR_A="${ADDR_A}${ADDR}:" DESC_A="${DESC_A}${DESC}:" ONE_LINER_A="${ONE_LINER_A}${ONE_LINER}:" KILL_COMMAND_A="${KILL_COMMAND_A}${KILL_COMMAND}:" LOG_A="${LOG_A}${LOG}:" REMOTE_HOST_A="${REMOTE_HOST_A}${REMOTE_HOST}:" N=`expr ${N} + 1` done get_a() { echo ${1} | cut -f${2} -d':' } ############################################################################### # Monitor jobs (main event loop) if [ -d "${JOBS_FILE}" ]; then TMP_DIR="${JOBS_FILE}" fi trap "stty echo icanon; exit 0" SIGINT stty -echo -icanon clear key="" selected=1 echo2() { printf "%-${COLUMNS}s" " " printf "\r" echo "${1}" } while true; do if [ "${selected}" -gt "${N}" ]; then selected=${N} fi if [ "${selected}" -lt "1" ]; then selected=1 fi # Display part tput cup 0 0 key="" echo2 echo2 "[q] quit [D] download outputs and quit [T] kill all jobs" echo2 "[j] select next [k] select previous [t] kill selected job" echo2 " [d] see selected job log" echo2 for i in `seq 1 ${N}`; do ( ADDR=`get_a ${ADDR_A} ${i}` ONE_LINER=`get_a ${ONE_LINER_A} ${i}` REMOTE_HOST=`get_a ${REMOTE_HOST_A} ${i}` if [ "${REMOTE_HOST}" == "Windows" ]; then STATUS=`${SSH} ${ADDR} "if exist ${ONE_LINER} type ${ONE_LINER}" \ || true` else STATUS=`${SSH} ${ADDR} "[ -f ${ONE_LINER} ] && cat ${ONE_LINER}" \ || true` fi echo ${STATUS} >${TMP_DIR}/one-liner-${i}.txt ) "${ROOT}/run-${s}.sh" echo >>"${ROOT}/run-${s}.sh" echo 'cd `dirname $0`' 
>>"${ROOT}/run-${s}.sh" echo "mkdir -p ${s}" >>"${ROOT}/run-${s}.sh" echo "cd ${s}" >>"${ROOT}/run-${s}.sh" echo >>"${ROOT}/run-${s}.sh" done continue fi # Standard line (part of a script) if [ "${SIMD_EXTS}" != "" ]; then for s in ${SIMD_EXTS}; do echo ${line} | sed -e "s,SIMD_EXT,${s},g" \ -e "s,SRC_DIR,${PWD}/..,g" \ -e "s,NSCONFIG,${NSCONFIG},g" \ -e "s,NSTEST,${NSTEST},g" \ -e "s,NPROC,${NPROC},g" \ -e "s,TARGET,${TARGET},g" \ >>"${ROOT}/run-${s}.sh" done fi done <"${INPUT}" # ----------------------------------------------------------------------------- # Compile all tests for i in ${ROOT}/*.sh; do ( bash ${i} || true ) | tee ${i}.log done ================================================ FILE: scripts/one-liner.c ================================================ /* Copyright (c) 2020 Agenium Scale Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
*/
/* ------------------------------------------------------------------------- */

/* This program needs to be as portable as possible as it is intended for
   Windows hosts with an unknown version of Visual Studio. It is compiled
   before running the tests of NSIMD. Its purpose is to read stdin and put all
   into an accumulator file and from time to time (approximatively every
   second) put a line of text into another file. */

/* NOTE(review): the header names after these #include directives were lost by
   the extraction tool; judging from the symbols used below (fprintf/fopen,
   malloc/realloc/free, strerror, errno, time) they are presumably stdio.h,
   stdlib.h, string.h, errno.h and time.h -- restore before compiling. */
#include
#include
#include
#include
#include

/* Evaluate cmd once; if its result equals error_code, print a diagnostic
   built from errno (cleared beforehand so a stale value is never reported),
   set the surrounding function's `ret` to -1 and jump to the given cleanup
   label.  Relies on `argv` and `ret` being in scope at the expansion site. */
#define DO(cmd, error_code, goto_label_on_error)                             \
  do {                                                                       \
    errno = 0;                                                               \
    if ((cmd) == error_code) {                                               \
      fprintf(stderr, "%s: error: " #cmd ": %s\n", argv[0], strerror(errno)); \
      ret = -1;                                                              \
      goto goto_label_on_error;                                              \
    }                                                                        \
  } while (0)

/* usage: one-liner acc.txt one-liner.txt
   Copies stdin line by line into acc.txt (argv[1]); roughly once per second
   rewrites one-liner.txt (argv[2]) with the latest line, and once more at the
   very end with the text "Finished".  Returns 0 on success, -1 on any I/O or
   allocation error (cleanup is done through the goto labels at the bottom). */
int main(int argc, char **argv) {
  FILE *acc, *one = NULL;
  char *buf;
  int ret = 0;
  size_t n = 1024; /* current capacity of buf */
  time_t tick;     /* time of the last one-liner.txt rewrite */

  if (argc != 3) {
    fprintf(stderr, "%s: ERROR: usage: one-liner acc.txt one-liner.txt",
            argv[0]);
    return -1;
  }

  DO(acc = fopen(argv[1], "wb"), NULL, end);
  DO(buf = malloc(n), NULL, free_acc);
  tick = time(NULL);

  for (;;) {
    time_t t;
    size_t i = 0;
    int end_of_file = 0;

    /* Read one line (or the final unterminated chunk) into buf.  The buffer
       is doubled as soon as i reaches n - 2, so there is always room for the
       trailing "\n" + NUL written on '\n'/EOF. */
    for (;;) {
      int code = fgetc(stdin);
      if (code == EOF || code == '\n') {
        buf[i] = '\n';
        buf[i + 1] = 0;
        end_of_file = (code == EOF);
        break;
      }
      buf[i] = (char)code;
      if (i >= n - 2) {
        n = n * 2;
        /* NOTE(review): on realloc failure the old block is leaked because
           buf is overwritten with NULL before free_buf runs; harmless here
           since the process exits immediately afterwards. */
        DO(buf = realloc(buf, n), NULL, free_buf);
      }
      i++;
    }

    /* Append the line to the accumulator and flush so an external watcher
       always sees up-to-date contents. */
    DO(fputs(buf, acc), EOF, free_buf);
    DO(fflush(acc), EOF, free_buf);

    /* At most once per second, rewrite (truncate + write) the one-line
       status file with the latest line. */
    t = time(NULL);
    if (t - tick >= 1) {
      DO(one = fopen(argv[2], "wb"), NULL, free_buf);
      DO(fputs(buf, one), EOF, free_one);
      DO(fflush(one), EOF, free_one);
      /* NOTE(review): if this fclose fails, DO jumps to free_one while `one`
         is still non-NULL, so the cleanup code closes it a second time --
         a latent double-fclose on an already-failed stream. */
      DO(fclose(one), EOF, free_one);
      one = NULL; /* prevents the cleanup code from closing it again */
      tick = t;
    }

    if (end_of_file) {
      break;
    }
  }

  /* Final status update: falls through into free_one which performs the
     actual fclose of `one`. */
  DO(one = fopen(argv[2], "wb"), NULL, free_buf);
  DO(fputs("Finished", one), EOF, free_one);
  DO(fflush(one), EOF, free_one);

free_one:
  if (one != NULL && fclose(one) == EOF) {
    fprintf(stderr, "%s: NOTE: error on closing '%s': %s\n", argv[0], argv[2],
            strerror(errno));
  }
free_buf:
  free(buf);
free_acc:
  if (fclose(acc) == EOF) {
    fprintf(stderr, "%s: NOTE: error on closing '%s': %s\n", argv[0], argv[1],
            strerror(errno));
  }
end:
  return ret;
}


================================================
FILE: scripts/powerpc64le-linux-gnu-clang++.sh
================================================
#!/bin/bash
# Cross-compilation wrapper: invoke clang++ targeting 64-bit little-endian
# PowerPC, adding the target-specific libstdc++ include directory, and forward
# all remaining arguments untouched.
clang++ --target=powerpc64le-linux-gnu \
        -I/usr/powerpc64le-linux-gnu/include/c++/8/powerpc64le-linux-gnu "$@"


================================================
FILE: scripts/powerpc64le-linux-gnu-clang.sh
================================================
#!/bin/bash
# Same as the clang++ wrapper above but for the C front end.
clang --target=powerpc64le-linux-gnu \
      -I/usr/powerpc64le-linux-gnu/include/c++/8/powerpc64le-linux-gnu "$@"


================================================
FILE: scripts/setup.bat
================================================
@echo off
REM Copyright (c) 2020 Agenium Scale
REM
REM Permission is hereby granted, free of charge, to any person obtaining a copy
REM of this software and associated documentation files (the "Software"), to deal
REM in the Software without restriction, including without limitation the rights
REM to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
REM copies of the Software, and to permit persons to whom the Software is
REM furnished to do so, subject to the following conditions:
REM
REM The above copyright notice and this permission notice shall be included in all
REM copies or substantial portions of the Software.
REM
REM THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
REM IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
REM FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
REM AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
REM LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
REM OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
REM SOFTWARE.
REM ########################################################################### setlocal EnableDelayedExpansion pushd "%~dp0" REM ########################################################################### REM Init set NSTOOLS_DIR="%CD%\..\nstools" REM ########################################################################### REM Pull nstools if exist "%NSTOOLS_DIR%\README.md" ( pushd %NSTOOLS_DIR% git pull || cd . popd ) else ( if exist "..\.git" ( git remote get-url origin >_tmp-nsimd-url.txt set /P NSIMD_URL=<_tmp-nsimd-url.txt set NSTOOLS_URL=!NSIMD_URL:nsimd=nstools! del /F /Q _tmp-nsimd-url.txt pushd ".." git clone !NSTOOLS_URL! nstools popd ) else ( pushd ".." git clone "https://github.com/agenium-scale/nstools.git" nstools popd ) ) if "%NSTOOLS_CHECKOUT_LAST_COMMIT%" == "" ( git -C %NSTOOLS_DIR% checkout v3.0 ) else ( git -C %NSTOOLS_DIR% checkout master ) REM ########################################################################### REM Create bin directory if not exist %NSTOOLS_DIR%\bin ( md %NSTOOLS_DIR%\bin ) REM ########################################################################### REM Build nsconfig (if not already built) pushd %NSTOOLS_DIR%\nsconfig nmake /F Makefile.win nsconfig.exe nmake /F Makefile.win nstest.exe copy /Y "nsconfig.exe" %NSTOOLS_DIR%\bin copy /Y "nstest.exe" %NSTOOLS_DIR%\bin popd popd endlocal exit /B 0 ================================================ FILE: scripts/setup.sh ================================================ #!/bin/bash # Copyright (c) 2021 Agenium Scale # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal # in the Software without restriction, including without limitation the rights # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell # copies of the Software, and to permit persons to whom the Software is # furnished to do so, subject to the following conditions: # # The above 
copyright notice and this permission notice shall be included in all # copies or substantial portions of the Software. # # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. ############################################################################### cd `dirname $0` set -x set -e ############################################################################### # Init NSTOOLS_DIR="${PWD}/../nstools" ############################################################################### # Build nsconfig (if not already built) [ -d "${NSTOOLS_DIR}" ] || \ ( cd "${PWD}/.." && \ ( [ -d .git ] \ && ( git clone `git remote get-url origin | sed s/nsimd/nstools/g` ) \ || ( git clone "https://github.com/agenium-scale/nstools.git" ) ) ) if [ "${NSTOOLS_CHECKOUT_LAST_COMMIT}" == "" ]; then git -C "${NSTOOLS_DIR}" checkout v3.0 else git -C "${NSTOOLS_DIR}" checkout master git -C "${NSTOOLS_DIR}" pull fi ( cd "${NSTOOLS_DIR}/nsconfig" && \ make -B -j8 -f Makefile.nix nsconfig && \ make -B -j8 -f Makefile.nix nstest ) ================================================ FILE: src/dd.h ================================================ // Copyright Naoki Shibata and contributors 2010 - 2020. // Distributed under the Boost Software License, Version 1.0. 
// (See accompanying file LICENSE.txt or copy at
// http://www.boost.org/LICENSE_1_0.txt)

// Double-double ("dd") arithmetic over SIMD vectors, taken from SLEEF.  A
// value is the unevaluated sum x + y of two vdouble lanes with |y| much
// smaller than |x|, giving roughly twice the precision of a plain double.
// The "add" helpers below are error-free transformations: Fast2Sum variants
// (ddadd*) assume |x| >= |y|, while the "add2" variants (2Sum) make no such
// assumption.  Multiplication uses FMA when ENABLE_FMA_DP is set and
// Dekker-style mantissa splitting otherwise.

// On SVE targets the two-vector struct and its accessors are provided
// elsewhere; define them here for every other target.
#if !(defined(ENABLE_SVE) || defined(ENABLE_SVENOFMA))
typedef struct {
  vdouble x, y;
} vdouble2;

static vdouble vd2getx_vd_vd2(vdouble2 v) { return v.x; }
static vdouble vd2gety_vd_vd2(vdouble2 v) { return v.y; }
static vdouble2 vd2setxy_vd2_vd_vd(vdouble x, vdouble y) { vdouble2 v; v.x = x; v.y = y; return v; }
static vdouble2 vd2setx_vd2_vd2_vd(vdouble2 v, vdouble d) { v.x = d; return v; }
static vdouble2 vd2sety_vd2_vd2_vd(vdouble2 v, vdouble d) { v.y = d; return v; }
#endif

// Clear the low 27 mantissa bits so that the product of two such "upper"
// halves is exact in double precision (Dekker/Veltkamp splitting).
static INLINE CONST VECTOR_CC vdouble vupper_vd_vd(vdouble d) {
  return vreinterpret_vd_vm(vand_vm_vm_vm(vreinterpret_vm_vd(d), vcast_vm_i_i(0xffffffff, 0xf8000000)));
}

// Constructors and lane-wise select for vdouble2.
static INLINE CONST VECTOR_CC vdouble2 vcast_vd2_vd_vd(vdouble h, vdouble l) {
  return vd2setxy_vd2_vd_vd(h, l);
}

static INLINE CONST VECTOR_CC vdouble2 vcast_vd2_d_d(double h, double l) {
  return vd2setxy_vd2_vd_vd(vcast_vd_d(h), vcast_vd_d(l));
}

static INLINE CONST VECTOR_CC vdouble2 vsel_vd2_vo_vd2_vd2(vopmask m, vdouble2 x, vdouble2 y) {
  return vd2setxy_vd2_vd_vd(vsel_vd_vo_vd_vd(m, vd2getx_vd_vd2(x), vd2getx_vd_vd2(y)),
                            vsel_vd_vo_vd_vd(m, vd2gety_vd_vd2(x), vd2gety_vd_vd2(y)));
}

static INLINE CONST VECTOR_CC vdouble2 vsel_vd2_vo_d_d_d_d(vopmask o, double x1, double y1, double x0, double y0) {
  return vd2setxy_vd2_vd_vd(vsel_vd_vo_d_d(o, x1, x0), vsel_vd_vo_d_d(o, y1, y0));
}

// Left-to-right sums/differences of 3..7 vectors; evaluation order matters
// for the compensated-arithmetic callers, so do not reassociate.
static INLINE CONST VECTOR_CC vdouble vadd_vd_3vd(vdouble v0, vdouble v1, vdouble v2) {
  return vadd_vd_vd_vd(vadd_vd_vd_vd(v0, v1), v2);
}

static INLINE CONST VECTOR_CC vdouble vadd_vd_4vd(vdouble v0, vdouble v1, vdouble v2, vdouble v3) {
  return vadd_vd_3vd(vadd_vd_vd_vd(v0, v1), v2, v3);
}

static INLINE CONST VECTOR_CC vdouble vadd_vd_5vd(vdouble v0, vdouble v1, vdouble v2, vdouble v3, vdouble v4) {
  return vadd_vd_4vd(vadd_vd_vd_vd(v0, v1), v2, v3, v4);
}

static INLINE CONST VECTOR_CC vdouble vadd_vd_6vd(vdouble v0, vdouble v1, vdouble v2, vdouble v3, vdouble v4, vdouble v5) {
  return vadd_vd_5vd(vadd_vd_vd_vd(v0, v1), v2, v3, v4, v5);
}

static INLINE CONST VECTOR_CC vdouble vadd_vd_7vd(vdouble v0, vdouble v1, vdouble v2, vdouble v3, vdouble v4, vdouble v5, vdouble v6) {
  return vadd_vd_6vd(vadd_vd_vd_vd(v0, v1), v2, v3, v4, v5, v6);
}

static INLINE CONST VECTOR_CC vdouble vsub_vd_3vd(vdouble v0, vdouble v1, vdouble v2) {
  return vsub_vd_vd_vd(vsub_vd_vd_vd(v0, v1), v2);
}

static INLINE CONST VECTOR_CC vdouble vsub_vd_4vd(vdouble v0, vdouble v1, vdouble v2, vdouble v3) {
  return vsub_vd_3vd(vsub_vd_vd_vd(v0, v1), v2, v3);
}

static INLINE CONST VECTOR_CC vdouble vsub_vd_5vd(vdouble v0, vdouble v1, vdouble v2, vdouble v3, vdouble v4) {
  return vsub_vd_4vd(vsub_vd_vd_vd(v0, v1), v2, v3, v4);
}

static INLINE CONST VECTOR_CC vdouble vsub_vd_6vd(vdouble v0, vdouble v1, vdouble v2, vdouble v3, vdouble v4, vdouble v5) {
  return vsub_vd_5vd(vsub_vd_vd_vd(v0, v1), v2, v3, v4, v5);
}

//

// Negate both components.
static INLINE CONST VECTOR_CC vdouble2 ddneg_vd2_vd2(vdouble2 x) {
  return vcast_vd2_vd_vd(vneg_vd_vd(vd2getx_vd_vd2(x)), vneg_vd_vd(vd2gety_vd_vd2(x)));
}

// Absolute value: |hi|, and the low part's sign is flipped iff hi was
// negative (sign bit extracted via bit masking).
static INLINE CONST VECTOR_CC vdouble2 ddabs_vd2_vd2(vdouble2 x) {
  return vcast_vd2_vd_vd(vabs_vd_vd(vd2getx_vd_vd2(x)),
                         vreinterpret_vd_vm(vxor_vm_vm_vm(vreinterpret_vm_vd(vd2gety_vd_vd2(x)),
                                                          vand_vm_vm_vm(vreinterpret_vm_vd(vd2getx_vd_vd2(x)),
                                                                        vreinterpret_vm_vd(vcast_vd_d(-0.0))))));
}

// Renormalize so that hi carries as much of the value as possible
// (Fast2Sum of the two components).
static INLINE CONST VECTOR_CC vdouble2 ddnormalize_vd2_vd2(vdouble2 t) {
  vdouble s = vadd_vd_vd_vd(vd2getx_vd_vd2(t), vd2gety_vd_vd2(t));
  return vd2setxy_vd2_vd_vd(s, vadd_vd_vd_vd(vsub_vd_vd_vd(vd2getx_vd_vd2(t), s), vd2gety_vd_vd2(t)));
}

// Scale both components; exact when s is a power of two.
static INLINE CONST VECTOR_CC vdouble2 ddscale_vd2_vd2_vd(vdouble2 d, vdouble s) {
  return vd2setxy_vd2_vd_vd(vmul_vd_vd_vd(vd2getx_vd_vd2(d), s), vmul_vd_vd_vd(vd2gety_vd_vd2(d), s));
}

// Fast2Sum: requires |x| >= |y|; the low part is the exact rounding error.
static INLINE CONST VECTOR_CC vdouble2 ddadd_vd2_vd_vd(vdouble x, vdouble y) {
  vdouble s = vadd_vd_vd_vd(x, y);
  return vd2setxy_vd2_vd_vd(s, vadd_vd_vd_vd(vsub_vd_vd_vd(x, s), y));
}

// 2Sum (Knuth): no magnitude ordering required.
static INLINE CONST VECTOR_CC vdouble2 ddadd2_vd2_vd_vd(vdouble x, vdouble y) {
  vdouble s = vadd_vd_vd_vd(x, y);
  vdouble v = vsub_vd_vd_vd(s, x);
  return vd2setxy_vd2_vd_vd(s, vadd_vd_vd_vd(vsub_vd_vd_vd(x, vsub_vd_vd_vd(s, v)), vsub_vd_vd_vd(y, v)));
}

// dd + double, Fast2Sum flavor (assumes |x| >= |y|).
static INLINE CONST VECTOR_CC vdouble2 ddadd_vd2_vd2_vd(vdouble2 x, vdouble y) {
  vdouble s = vadd_vd_vd_vd(vd2getx_vd_vd2(x), y);
  return vd2setxy_vd2_vd_vd(s, vadd_vd_3vd(vsub_vd_vd_vd(vd2getx_vd_vd2(x), s), y, vd2gety_vd_vd2(x)));
}

// dd - double, Fast2Sum flavor.
static INLINE CONST VECTOR_CC vdouble2 ddsub_vd2_vd2_vd(vdouble2 x, vdouble y) {
  vdouble s = vsub_vd_vd_vd(vd2getx_vd_vd2(x), y);
  return vd2setxy_vd2_vd_vd(s, vadd_vd_vd_vd(vsub_vd_vd_vd(vsub_vd_vd_vd(vd2getx_vd_vd2(x), s), y), vd2gety_vd_vd2(x)));
}

// dd + double, 2Sum flavor (no ordering assumption).
static INLINE CONST VECTOR_CC vdouble2 ddadd2_vd2_vd2_vd(vdouble2 x, vdouble y) {
  vdouble s = vadd_vd_vd_vd(vd2getx_vd_vd2(x), y);
  vdouble v = vsub_vd_vd_vd(s, vd2getx_vd_vd2(x));
  vdouble w = vadd_vd_vd_vd(vsub_vd_vd_vd(vd2getx_vd_vd2(x), vsub_vd_vd_vd(s, v)), vsub_vd_vd_vd(y, v));
  return vd2setxy_vd2_vd_vd(s, vadd_vd_vd_vd(w, vd2gety_vd_vd2(x)));
}

// double + dd, Fast2Sum flavor.
static INLINE CONST VECTOR_CC vdouble2 ddadd_vd2_vd_vd2(vdouble x, vdouble2 y) {
  vdouble s = vadd_vd_vd_vd(x, vd2getx_vd_vd2(y));
  return vd2setxy_vd2_vd_vd(s, vadd_vd_3vd(vsub_vd_vd_vd(x, s), vd2getx_vd_vd2(y), vd2gety_vd_vd2(y)));
}

// double + dd, 2Sum flavor.
static INLINE CONST VECTOR_CC vdouble2 ddadd2_vd2_vd_vd2(vdouble x, vdouble2 y) {
  vdouble s = vadd_vd_vd_vd(x, vd2getx_vd_vd2(y));
  vdouble v = vsub_vd_vd_vd(s, x);
  return vd2setxy_vd2_vd_vd(s, vadd_vd_vd_vd(vadd_vd_vd_vd(vsub_vd_vd_vd(x, vsub_vd_vd_vd(s, v)),
                                                           vsub_vd_vd_vd(vd2getx_vd_vd2(y), v)),
                                             vd2gety_vd_vd2(y)));
}

static INLINE CONST VECTOR_CC vdouble2 ddadd_vd2_vd2_vd2(vdouble2 x, vdouble2 y) {
  // |x| >= |y|
  vdouble s = vadd_vd_vd_vd(vd2getx_vd_vd2(x), vd2getx_vd_vd2(y));
  return vd2setxy_vd2_vd_vd(s, vadd_vd_4vd(vsub_vd_vd_vd(vd2getx_vd_vd2(x), s), vd2getx_vd_vd2(y),
                                           vd2gety_vd_vd2(x), vd2gety_vd_vd2(y)));
}

// dd + dd, 2Sum flavor (no ordering assumption).
static INLINE CONST VECTOR_CC vdouble2 ddadd2_vd2_vd2_vd2(vdouble2 x, vdouble2 y) {
  vdouble s = vadd_vd_vd_vd(vd2getx_vd_vd2(x), vd2getx_vd_vd2(y));
  vdouble v = vsub_vd_vd_vd(s, vd2getx_vd_vd2(x));
  vdouble t = vadd_vd_vd_vd(vsub_vd_vd_vd(vd2getx_vd_vd2(x), vsub_vd_vd_vd(s, v)),
                            vsub_vd_vd_vd(vd2getx_vd_vd2(y), v));
  return vd2setxy_vd2_vd_vd(s, vadd_vd_vd_vd(t, vadd_vd_vd_vd(vd2gety_vd_vd2(x), vd2gety_vd_vd2(y))));
}

static INLINE CONST VECTOR_CC vdouble2 ddsub_vd2_vd_vd(vdouble x, vdouble y) {
  // |x| >= |y|
  vdouble s = vsub_vd_vd_vd(x, y);
  return vd2setxy_vd2_vd_vd(s, vsub_vd_vd_vd(vsub_vd_vd_vd(x, s), y));
}

static INLINE CONST VECTOR_CC vdouble2 ddsub_vd2_vd2_vd2(vdouble2 x, vdouble2 y) {
  // |x| >= |y|
  vdouble s = vsub_vd_vd_vd(vd2getx_vd_vd2(x), vd2getx_vd_vd2(y));
  vdouble t = vsub_vd_vd_vd(vd2getx_vd_vd2(x), s);
  t = vsub_vd_vd_vd(t, vd2getx_vd_vd2(y));
  t = vadd_vd_vd_vd(t, vd2gety_vd_vd2(x));
  return vd2setxy_vd2_vd_vd(s, vsub_vd_vd_vd(t, vd2gety_vd_vd2(y)));
}

#ifdef ENABLE_FMA_DP
// FMA implementations: vfmapn computes a*b - c and vfmanp computes c - a*b
// exactly-rounded, so the multiplication error terms come out directly.

// dd division via one Newton step on a reciprocal estimate.
static INLINE CONST VECTOR_CC vdouble2 dddiv_vd2_vd2_vd2(vdouble2 n, vdouble2 d) {
  vdouble t = vrec_vd_vd(vd2getx_vd_vd2(d));
  vdouble s = vmul_vd_vd_vd(vd2getx_vd_vd2(n), t);
  vdouble u = vfmapn_vd_vd_vd_vd(t, vd2getx_vd_vd2(n), s);
  vdouble v = vfmanp_vd_vd_vd_vd(vd2gety_vd_vd2(d), t,
                                 vfmanp_vd_vd_vd_vd(vd2getx_vd_vd2(d), t, vcast_vd_d(1)));
  return vd2setxy_vd2_vd_vd(s, vfma_vd_vd_vd_vd(s, v, vfma_vd_vd_vd_vd(vd2gety_vd_vd2(n), t, u)));
}

// Exact product of two doubles (2Prod with FMA).
static INLINE CONST VECTOR_CC vdouble2 ddmul_vd2_vd_vd(vdouble x, vdouble y) {
  vdouble s = vmul_vd_vd_vd(x, y);
  return vd2setxy_vd2_vd_vd(s, vfmapn_vd_vd_vd_vd(x, y, s));
}

static INLINE CONST VECTOR_CC vdouble2 ddsqu_vd2_vd2(vdouble2 x) {
  vdouble s = vmul_vd_vd_vd(vd2getx_vd_vd2(x), vd2getx_vd_vd2(x));
  return vd2setxy_vd2_vd_vd(s, vfma_vd_vd_vd_vd(vadd_vd_vd_vd(vd2getx_vd_vd2(x), vd2getx_vd_vd2(x)),
                                                vd2gety_vd_vd2(x),
                                                vfmapn_vd_vd_vd_vd(vd2getx_vd_vd2(x), vd2getx_vd_vd2(x), s)));
}

static INLINE CONST VECTOR_CC vdouble2 ddmul_vd2_vd2_vd2(vdouble2 x, vdouble2 y) {
  vdouble s = vmul_vd_vd_vd(vd2getx_vd_vd2(x), vd2getx_vd_vd2(y));
  return vd2setxy_vd2_vd_vd(s, vfma_vd_vd_vd_vd(vd2getx_vd_vd2(x), vd2gety_vd_vd2(y),
                                                vfma_vd_vd_vd_vd(vd2gety_vd_vd2(x), vd2getx_vd_vd2(y),
                                                                 vfmapn_vd_vd_vd_vd(vd2getx_vd_vd2(x), vd2getx_vd_vd2(y), s))));
}

// Product collapsed back to a single vdouble (drops the error term).
static INLINE CONST VECTOR_CC vdouble ddmul_vd_vd2_vd2(vdouble2 x, vdouble2 y) {
  return vfma_vd_vd_vd_vd(vd2getx_vd_vd2(x), vd2getx_vd_vd2(y),
                          vfma_vd_vd_vd_vd(vd2gety_vd_vd2(x), vd2getx_vd_vd2(y),
                                           vmul_vd_vd_vd(vd2getx_vd_vd2(x), vd2gety_vd_vd2(y))));
}

static INLINE CONST VECTOR_CC vdouble ddsqu_vd_vd2(vdouble2 x) {
  return vfma_vd_vd_vd_vd(vd2getx_vd_vd2(x), vd2getx_vd_vd2(x),
                          vadd_vd_vd_vd(vmul_vd_vd_vd(vd2getx_vd_vd2(x), vd2gety_vd_vd2(x)),
                                        vmul_vd_vd_vd(vd2getx_vd_vd2(x), vd2gety_vd_vd2(x))));
}

static INLINE CONST VECTOR_CC vdouble2 ddmul_vd2_vd2_vd(vdouble2 x, vdouble y) {
  vdouble s = vmul_vd_vd_vd(vd2getx_vd_vd2(x), y);
  return vd2setxy_vd2_vd_vd(s, vfma_vd_vd_vd_vd(vd2gety_vd_vd2(x), y,
                                                vfmapn_vd_vd_vd_vd(vd2getx_vd_vd2(x), y, s)));
}

// Reciprocal refined with one FMA-based Newton correction.
static INLINE CONST VECTOR_CC vdouble2 ddrec_vd2_vd(vdouble d) {
  vdouble s = vrec_vd_vd(d);
  return vd2setxy_vd2_vd_vd(s, vmul_vd_vd_vd(s, vfmanp_vd_vd_vd_vd(d, s, vcast_vd_d(1))));
}

static INLINE CONST VECTOR_CC vdouble2 ddrec_vd2_vd2(vdouble2 d) {
  vdouble s = vrec_vd_vd(vd2getx_vd_vd2(d));
  return vd2setxy_vd2_vd_vd(s, vmul_vd_vd_vd(s, vfmanp_vd_vd_vd_vd(vd2gety_vd_vd2(d), s,
                                                                   vfmanp_vd_vd_vd_vd(vd2getx_vd_vd2(d), s, vcast_vd_d(1)))));
}
#else
// Non-FMA implementations: each operand is split via vupper_vd_vd into high
// and low halves whose partial products are exact (Dekker multiplication).

static INLINE CONST VECTOR_CC vdouble2 dddiv_vd2_vd2_vd2(vdouble2 n, vdouble2 d) {
  vdouble t = vrec_vd_vd(vd2getx_vd_vd2(d));
  vdouble dh = vupper_vd_vd(vd2getx_vd_vd2(d)), dl = vsub_vd_vd_vd(vd2getx_vd_vd2(d), dh);
  vdouble th = vupper_vd_vd(t), tl = vsub_vd_vd_vd(t, th);
  vdouble nhh = vupper_vd_vd(vd2getx_vd_vd2(n)), nhl = vsub_vd_vd_vd(vd2getx_vd_vd2(n), nhh);

  vdouble s = vmul_vd_vd_vd(vd2getx_vd_vd2(n), t);
  vdouble u = vadd_vd_5vd(vsub_vd_vd_vd(vmul_vd_vd_vd(nhh, th), s), vmul_vd_vd_vd(nhh, tl),
                          vmul_vd_vd_vd(nhl, th), vmul_vd_vd_vd(nhl, tl),
                          vmul_vd_vd_vd(s, vsub_vd_5vd(vcast_vd_d(1), vmul_vd_vd_vd(dh, th),
                                                       vmul_vd_vd_vd(dh, tl), vmul_vd_vd_vd(dl, th),
                                                       vmul_vd_vd_vd(dl, tl))));
  return vd2setxy_vd2_vd_vd(s, vmla_vd_vd_vd_vd(t, vsub_vd_vd_vd(vd2gety_vd_vd2(n),
                                                                 vmul_vd_vd_vd(s, vd2gety_vd_vd2(d))), u));
}

static INLINE CONST VECTOR_CC vdouble2 ddmul_vd2_vd_vd(vdouble x, vdouble y) {
  vdouble xh = vupper_vd_vd(x), xl = vsub_vd_vd_vd(x, xh);
  vdouble yh = vupper_vd_vd(y), yl = vsub_vd_vd_vd(y, yh);
  vdouble s = vmul_vd_vd_vd(x, y);
  return vd2setxy_vd2_vd_vd(s, vadd_vd_5vd(vmul_vd_vd_vd(xh, yh), vneg_vd_vd(s), vmul_vd_vd_vd(xl, yh),
                                           vmul_vd_vd_vd(xh, yl), vmul_vd_vd_vd(xl, yl)));
}

static INLINE CONST VECTOR_CC vdouble2 ddmul_vd2_vd2_vd(vdouble2 x, vdouble y) {
  vdouble xh = vupper_vd_vd(vd2getx_vd_vd2(x)), xl = vsub_vd_vd_vd(vd2getx_vd_vd2(x), xh);
  vdouble yh = vupper_vd_vd(y), yl = vsub_vd_vd_vd(y, yh);
  vdouble s = vmul_vd_vd_vd(vd2getx_vd_vd2(x), y);
  return vd2setxy_vd2_vd_vd(s, vadd_vd_6vd(vmul_vd_vd_vd(xh, yh), vneg_vd_vd(s), vmul_vd_vd_vd(xl, yh),
                                           vmul_vd_vd_vd(xh, yl), vmul_vd_vd_vd(xl, yl),
                                           vmul_vd_vd_vd(vd2gety_vd_vd2(x), y)));
}

static INLINE CONST VECTOR_CC vdouble2 ddmul_vd2_vd2_vd2(vdouble2 x, vdouble2 y) {
  vdouble xh = vupper_vd_vd(vd2getx_vd_vd2(x)), xl = vsub_vd_vd_vd(vd2getx_vd_vd2(x), xh);
  vdouble yh = vupper_vd_vd(vd2getx_vd_vd2(y)), yl = vsub_vd_vd_vd(vd2getx_vd_vd2(y), yh);
  vdouble s = vmul_vd_vd_vd(vd2getx_vd_vd2(x), vd2getx_vd_vd2(y));
  return vd2setxy_vd2_vd_vd(s, vadd_vd_7vd(vmul_vd_vd_vd(xh, yh), vneg_vd_vd(s), vmul_vd_vd_vd(xl, yh),
                                           vmul_vd_vd_vd(xh, yl), vmul_vd_vd_vd(xl, yl),
                                           vmul_vd_vd_vd(vd2getx_vd_vd2(x), vd2gety_vd_vd2(y)),
                                           vmul_vd_vd_vd(vd2gety_vd_vd2(x), vd2getx_vd_vd2(y))));
}

// Product collapsed to a single vdouble; terms summed smallest-first.
static INLINE CONST VECTOR_CC vdouble ddmul_vd_vd2_vd2(vdouble2 x, vdouble2 y) {
  vdouble xh = vupper_vd_vd(vd2getx_vd_vd2(x)), xl = vsub_vd_vd_vd(vd2getx_vd_vd2(x), xh);
  vdouble yh = vupper_vd_vd(vd2getx_vd_vd2(y)), yl = vsub_vd_vd_vd(vd2getx_vd_vd2(y), yh);
  return vadd_vd_6vd(vmul_vd_vd_vd(vd2gety_vd_vd2(x), yh), vmul_vd_vd_vd(xh, vd2gety_vd_vd2(y)),
                     vmul_vd_vd_vd(xl, yl), vmul_vd_vd_vd(xh, yl), vmul_vd_vd_vd(xl, yh),
                     vmul_vd_vd_vd(xh, yh));
}

static INLINE CONST VECTOR_CC vdouble2 ddsqu_vd2_vd2(vdouble2 x) {
  vdouble xh = vupper_vd_vd(vd2getx_vd_vd2(x)), xl = vsub_vd_vd_vd(vd2getx_vd_vd2(x), xh);
  vdouble s = vmul_vd_vd_vd(vd2getx_vd_vd2(x), vd2getx_vd_vd2(x));
  return vd2setxy_vd2_vd_vd(s, vadd_vd_5vd(vmul_vd_vd_vd(xh, xh), vneg_vd_vd(s),
                                           vmul_vd_vd_vd(vadd_vd_vd_vd(xh, xh), xl), vmul_vd_vd_vd(xl, xl),
                                           vmul_vd_vd_vd(vd2getx_vd_vd2(x),
                                                         vadd_vd_vd_vd(vd2gety_vd_vd2(x), vd2gety_vd_vd2(x)))));
}

static INLINE CONST VECTOR_CC vdouble ddsqu_vd_vd2(vdouble2 x) {
  vdouble xh = vupper_vd_vd(vd2getx_vd_vd2(x)), xl = vsub_vd_vd_vd(vd2getx_vd_vd2(x), xh);
  return vadd_vd_5vd(vmul_vd_vd_vd(xh, vd2gety_vd_vd2(x)), vmul_vd_vd_vd(xh, vd2gety_vd_vd2(x)),
                     vmul_vd_vd_vd(xl, xl),
                     vadd_vd_vd_vd(vmul_vd_vd_vd(xh, xl), vmul_vd_vd_vd(xh, xl)),
                     vmul_vd_vd_vd(xh, xh));
}

static INLINE CONST VECTOR_CC vdouble2 ddrec_vd2_vd(vdouble d) {
  vdouble t = vrec_vd_vd(d);
  vdouble dh = vupper_vd_vd(d), dl = vsub_vd_vd_vd(d, dh);
  vdouble th = vupper_vd_vd(t), tl = vsub_vd_vd_vd(t, th);
  return vd2setxy_vd2_vd_vd(t, vmul_vd_vd_vd(t, vsub_vd_5vd(vcast_vd_d(1), vmul_vd_vd_vd(dh, th),
                                                            vmul_vd_vd_vd(dh, tl), vmul_vd_vd_vd(dl, th),
                                                            vmul_vd_vd_vd(dl, tl))));
}

static INLINE CONST VECTOR_CC vdouble2 ddrec_vd2_vd2(vdouble2 d) {
  vdouble t = vrec_vd_vd(vd2getx_vd_vd2(d));
  vdouble dh = vupper_vd_vd(vd2getx_vd_vd2(d)), dl = vsub_vd_vd_vd(vd2getx_vd_vd2(d), dh);
  vdouble th = vupper_vd_vd(t), tl = vsub_vd_vd_vd(t, th);
  return vd2setxy_vd2_vd_vd(t, vmul_vd_vd_vd(t, vsub_vd_6vd(vcast_vd_d(1), vmul_vd_vd_vd(dh, th),
                                                            vmul_vd_vd_vd(dh, tl), vmul_vd_vd_vd(dl, th),
                                                            vmul_vd_vd_vd(dl, tl),
                                                            vmul_vd_vd_vd(vd2gety_vd_vd2(d), t))));
}
#endif

// sqrt(d) refined from a hardware sqrt estimate of hi + lo:
// sqrt(d) ~= (d + t*t) / t * 0.5 with t = sqrt(hi + lo).
static INLINE CONST VECTOR_CC vdouble2 ddsqrt_vd2_vd2(vdouble2 d) {
  vdouble t = vsqrt_vd_vd(vadd_vd_vd_vd(vd2getx_vd_vd2(d), vd2gety_vd_vd2(d)));
  return ddscale_vd2_vd2_vd(ddmul_vd2_vd2_vd2(ddadd2_vd2_vd2_vd2(d, ddmul_vd2_vd_vd(t, t)),
                                              ddrec_vd2_vd(t)), vcast_vd_d(0.5));
}

static INLINE CONST VECTOR_CC vdouble2 ddsqrt_vd2_vd(vdouble d) {
  vdouble t = vsqrt_vd_vd(d);
  return ddscale_vd2_vd2_vd(ddmul_vd2_vd2_vd2(ddadd2_vd2_vd_vd2(d, ddmul_vd2_vd_vd(t, t)),
                                              ddrec_vd2_vd(t)), vcast_vd_d(0.5));
}


================================================
FILE: src/df.h
================================================
// Copyright Naoki Shibata and contributors 2010 - 2020.
// Distributed under the Boost Software License, Version 1.0.
// (See accompanying file LICENSE.txt or copy at
// http://www.boost.org/LICENSE_1_0.txt)

// Float-float ("df") arithmetic: single-precision counterpart of dd.h, with
// a value represented as the unevaluated sum x + y of two vfloat lanes.

// On SVE targets the two-vector struct and its accessors are provided
// elsewhere; define them here for every other target.
#if !(defined(ENABLE_SVE) || defined(ENABLE_SVENOFMA))
typedef struct {
  vfloat x, y;
} vfloat2;

static vfloat vf2getx_vf_vf2(vfloat2 v) { return v.x; }
static vfloat vf2gety_vf_vf2(vfloat2 v) { return v.y; }
static vfloat2 vf2setxy_vf2_vf_vf(vfloat x, vfloat y) { vfloat2 v; v.x = x; v.y = y; return v; }
static vfloat2 vf2setx_vf2_vf2_vf(vfloat2 v, vfloat d) { v.x = d; return v; }
static vfloat2 vf2sety_vf2_vf2_vf(vfloat2 v, vfloat d) { v.y = d; return v; }
#endif

// Clear the low 12 mantissa bits so products of two "upper" halves are exact
// in single precision (Dekker/Veltkamp splitting, float edition).
static INLINE CONST VECTOR_CC vfloat vupper_vf_vf(vfloat d) {
  return vreinterpret_vf_vi2(vand_vi2_vi2_vi2(vreinterpret_vi2_vf(d), vcast_vi2_i(0xfffff000)));
}

static INLINE CONST VECTOR_CC vfloat2 vcast_vf2_vf_vf(vfloat h, vfloat l) {
  return vf2setxy_vf2_vf_vf(h, l);
}

static INLINE CONST VECTOR_CC vfloat2 vcast_vf2_f_f(float h, float l) {
  return vf2setxy_vf2_vf_vf(vcast_vf_f(h), vcast_vf_f(l));
}

// Split a double constant into float hi/lo parts: hi = round(d),
// lo = the part of d lost by that rounding.
static INLINE CONST VECTOR_CC vfloat2 vcast_vf2_d(double d) {
  return vf2setxy_vf2_vf_vf(vcast_vf_f(d), vcast_vf_f(d - (float)d));
}

static INLINE CONST VECTOR_CC vfloat2 vsel_vf2_vo_vf2_vf2(vopmask m, vfloat2 x, vfloat2 y) {
  return vf2setxy_vf2_vf_vf(vsel_vf_vo_vf_vf(m, vf2getx_vf_vf2(x), vf2getx_vf_vf2(y)),
                            vsel_vf_vo_vf_vf(m, vf2gety_vf_vf2(x), vf2gety_vf_vf2(y)));
}

static INLINE CONST VECTOR_CC vfloat2 vsel_vf2_vo_f_f_f_f(vopmask o, float x1, float y1, float
x0, float y0) {
  return vf2setxy_vf2_vf_vf(vsel_vf_vo_f_f(o, x1, x0), vsel_vf_vo_f_f(o, y1, y0));
}

// Two- and three-way selects between double constants pre-split into
// float-float pairs.
static INLINE CONST VECTOR_CC vfloat2 vsel_vf2_vo_vo_d_d_d(vopmask o0, vopmask o1, double d0, double d1, double d2) {
  return vsel_vf2_vo_vf2_vf2(o0, vcast_vf2_d(d0),
                             vsel_vf2_vo_vf2_vf2(o1, vcast_vf2_d(d1), vcast_vf2_d(d2)));
}

static INLINE CONST VECTOR_CC vfloat2 vsel_vf2_vo_vo_vo_d_d_d_d(vopmask o0, vopmask o1, vopmask o2, double d0, double d1, double d2, double d3) {
  return vsel_vf2_vo_vf2_vf2(o0, vcast_vf2_d(d0),
                             vsel_vf2_vo_vf2_vf2(o1, vcast_vf2_d(d1),
                                                 vsel_vf2_vo_vf2_vf2(o2, vcast_vf2_d(d2), vcast_vf2_d(d3))));
}

// Absolute value: the sign of hi is stripped from both components by xoring
// with hi's extracted sign bit.
static INLINE CONST VECTOR_CC vfloat2 vabs_vf2_vf2(vfloat2 x) {
  return vcast_vf2_vf_vf(vreinterpret_vf_vm(vxor_vm_vm_vm(vand_vm_vm_vm(vreinterpret_vm_vf(vcast_vf_f(-0.0)),
                                                                        vreinterpret_vm_vf(vf2getx_vf_vf2(x))),
                                                          vreinterpret_vm_vf(vf2getx_vf_vf2(x)))),
                         vreinterpret_vf_vm(vxor_vm_vm_vm(vand_vm_vm_vm(vreinterpret_vm_vf(vcast_vf_f(-0.0)),
                                                                        vreinterpret_vm_vf(vf2getx_vf_vf2(x))),
                                                          vreinterpret_vm_vf(vf2gety_vf_vf2(x)))));
}

// Left-to-right sums/differences of 3..7 vectors; evaluation order matters
// for the compensated-arithmetic callers, so do not reassociate.
static INLINE CONST VECTOR_CC vfloat vadd_vf_3vf(vfloat v0, vfloat v1, vfloat v2) {
  return vadd_vf_vf_vf(vadd_vf_vf_vf(v0, v1), v2);
}

static INLINE CONST VECTOR_CC vfloat vadd_vf_4vf(vfloat v0, vfloat v1, vfloat v2, vfloat v3) {
  return vadd_vf_3vf(vadd_vf_vf_vf(v0, v1), v2, v3);
}

static INLINE CONST VECTOR_CC vfloat vadd_vf_5vf(vfloat v0, vfloat v1, vfloat v2, vfloat v3, vfloat v4) {
  return vadd_vf_4vf(vadd_vf_vf_vf(v0, v1), v2, v3, v4);
}

static INLINE CONST VECTOR_CC vfloat vadd_vf_6vf(vfloat v0, vfloat v1, vfloat v2, vfloat v3, vfloat v4, vfloat v5) {
  return vadd_vf_5vf(vadd_vf_vf_vf(v0, v1), v2, v3, v4, v5);
}

static INLINE CONST VECTOR_CC vfloat vadd_vf_7vf(vfloat v0, vfloat v1, vfloat v2, vfloat v3, vfloat v4, vfloat v5, vfloat v6) {
  return vadd_vf_6vf(vadd_vf_vf_vf(v0, v1), v2, v3, v4, v5, v6);
}

static INLINE CONST VECTOR_CC vfloat vsub_vf_3vf(vfloat v0, vfloat v1, vfloat v2) {
  return vsub_vf_vf_vf(vsub_vf_vf_vf(v0, v1), v2);
}

static INLINE CONST VECTOR_CC vfloat vsub_vf_4vf(vfloat v0, vfloat v1, vfloat v2, vfloat v3) {
  return vsub_vf_3vf(vsub_vf_vf_vf(v0, v1), v2, v3);
}

static INLINE CONST VECTOR_CC vfloat vsub_vf_5vf(vfloat v0, vfloat v1, vfloat v2, vfloat v3, vfloat v4) {
  return vsub_vf_4vf(vsub_vf_vf_vf(v0, v1), v2, v3, v4);
}

//

// Negate both components.
static INLINE CONST VECTOR_CC vfloat2 dfneg_vf2_vf2(vfloat2 x) {
  return vcast_vf2_vf_vf(vneg_vf_vf(vf2getx_vf_vf2(x)), vneg_vf_vf(vf2gety_vf_vf2(x)));
}

// Absolute value: |hi|, and lo's sign flipped iff hi was negative.
static INLINE CONST VECTOR_CC vfloat2 dfabs_vf2_vf2(vfloat2 x) {
  return vcast_vf2_vf_vf(vabs_vf_vf(vf2getx_vf_vf2(x)),
                         vreinterpret_vf_vm(vxor_vm_vm_vm(vreinterpret_vm_vf(vf2gety_vf_vf2(x)),
                                                          vand_vm_vm_vm(vreinterpret_vm_vf(vf2getx_vf_vf2(x)),
                                                                        vreinterpret_vm_vf(vcast_vf_f(-0.0f))))));
}

// Renormalize so hi carries as much of the value as possible (Fast2Sum).
static INLINE CONST VECTOR_CC vfloat2 dfnormalize_vf2_vf2(vfloat2 t) {
  vfloat s = vadd_vf_vf_vf(vf2getx_vf_vf2(t), vf2gety_vf_vf2(t));
  return vf2setxy_vf2_vf_vf(s, vadd_vf_vf_vf(vsub_vf_vf_vf(vf2getx_vf_vf2(t), s), vf2gety_vf_vf2(t)));
}

// Scale both components; exact when s is a power of two.
static INLINE CONST VECTOR_CC vfloat2 dfscale_vf2_vf2_vf(vfloat2 d, vfloat s) {
  return vf2setxy_vf2_vf_vf(vmul_vf_vf_vf(vf2getx_vf_vf2(d), s), vmul_vf_vf_vf(vf2gety_vf_vf2(d), s));
}

// Fast2Sum: requires |x| >= |y|; lo is the exact rounding error.
static INLINE CONST VECTOR_CC vfloat2 dfadd_vf2_vf_vf(vfloat x, vfloat y) {
  vfloat s = vadd_vf_vf_vf(x, y);
  return vf2setxy_vf2_vf_vf(s, vadd_vf_vf_vf(vsub_vf_vf_vf(x, s), y));
}

// 2Sum (Knuth): no magnitude ordering required.
static INLINE CONST VECTOR_CC vfloat2 dfadd2_vf2_vf_vf(vfloat x, vfloat y) {
  vfloat s = vadd_vf_vf_vf(x, y);
  vfloat v = vsub_vf_vf_vf(s, x);
  return vf2setxy_vf2_vf_vf(s, vadd_vf_vf_vf(vsub_vf_vf_vf(x, vsub_vf_vf_vf(s, v)), vsub_vf_vf_vf(y, v)));
}

// float + ff, 2Sum flavor.
static INLINE CONST VECTOR_CC vfloat2 dfadd2_vf2_vf_vf2(vfloat x, vfloat2 y) {
  vfloat s = vadd_vf_vf_vf(x, vf2getx_vf_vf2(y));
  vfloat v = vsub_vf_vf_vf(s, x);
  return vf2setxy_vf2_vf_vf(s, vadd_vf_vf_vf(vadd_vf_vf_vf(vsub_vf_vf_vf(x, vsub_vf_vf_vf(s, v)),
                                                           vsub_vf_vf_vf(vf2getx_vf_vf2(y), v)),
                                             vf2gety_vf_vf2(y)));
}

// ff + float, Fast2Sum flavor (assumes |x| >= |y|).
static INLINE CONST VECTOR_CC vfloat2 dfadd_vf2_vf2_vf(vfloat2 x, vfloat y) {
  vfloat s = vadd_vf_vf_vf(vf2getx_vf_vf2(x), y);
  return vf2setxy_vf2_vf_vf(s, vadd_vf_3vf(vsub_vf_vf_vf(vf2getx_vf_vf2(x), s), y, vf2gety_vf_vf2(x)));
}

// ff - float, Fast2Sum flavor.
static INLINE CONST VECTOR_CC vfloat2 dfsub_vf2_vf2_vf(vfloat2 x, vfloat y) {
  vfloat s = vsub_vf_vf_vf(vf2getx_vf_vf2(x), y);
  return vf2setxy_vf2_vf_vf(s, vadd_vf_vf_vf(vsub_vf_vf_vf(vsub_vf_vf_vf(vf2getx_vf_vf2(x), s), y), vf2gety_vf_vf2(x)));
}

// ff + float, 2Sum flavor.
static INLINE CONST VECTOR_CC vfloat2 dfadd2_vf2_vf2_vf(vfloat2 x, vfloat y) {
  vfloat s = vadd_vf_vf_vf(vf2getx_vf_vf2(x), y);
  vfloat v = vsub_vf_vf_vf(s, vf2getx_vf_vf2(x));
  vfloat t = vadd_vf_vf_vf(vsub_vf_vf_vf(vf2getx_vf_vf2(x), vsub_vf_vf_vf(s, v)), vsub_vf_vf_vf(y, v));
  return vf2setxy_vf2_vf_vf(s, vadd_vf_vf_vf(t, vf2gety_vf_vf2(x)));
}

// float + ff, Fast2Sum flavor.
static INLINE CONST VECTOR_CC vfloat2 dfadd_vf2_vf_vf2(vfloat x, vfloat2 y) {
  vfloat s = vadd_vf_vf_vf(x, vf2getx_vf_vf2(y));
  return vf2setxy_vf2_vf_vf(s, vadd_vf_3vf(vsub_vf_vf_vf(x, s), vf2getx_vf_vf2(y), vf2gety_vf_vf2(y)));
}

static INLINE CONST VECTOR_CC vfloat2 dfadd_vf2_vf2_vf2(vfloat2 x, vfloat2 y) {
  // |x| >= |y|
  vfloat s = vadd_vf_vf_vf(vf2getx_vf_vf2(x), vf2getx_vf_vf2(y));
  return vf2setxy_vf2_vf_vf(s, vadd_vf_4vf(vsub_vf_vf_vf(vf2getx_vf_vf2(x), s), vf2getx_vf_vf2(y),
                                           vf2gety_vf_vf2(x), vf2gety_vf_vf2(y)));
}

// ff + ff, 2Sum flavor (no ordering assumption).
static INLINE CONST VECTOR_CC vfloat2 dfadd2_vf2_vf2_vf2(vfloat2 x, vfloat2 y) {
  vfloat s = vadd_vf_vf_vf(vf2getx_vf_vf2(x), vf2getx_vf_vf2(y));
  vfloat v = vsub_vf_vf_vf(s, vf2getx_vf_vf2(x));
  vfloat t = vadd_vf_vf_vf(vsub_vf_vf_vf(vf2getx_vf_vf2(x), vsub_vf_vf_vf(s, v)),
                           vsub_vf_vf_vf(vf2getx_vf_vf2(y), v));
  return vf2setxy_vf2_vf_vf(s, vadd_vf_vf_vf(t, vadd_vf_vf_vf(vf2gety_vf_vf2(x), vf2gety_vf_vf2(y))));
}

static INLINE CONST VECTOR_CC vfloat2 dfsub_vf2_vf_vf(vfloat x, vfloat y) {
  // |x| >= |y|
  vfloat s = vsub_vf_vf_vf(x, y);
  return vf2setxy_vf2_vf_vf(s, vsub_vf_vf_vf(vsub_vf_vf_vf(x, s), y));
}

static INLINE CONST VECTOR_CC vfloat2 dfsub_vf2_vf2_vf2(vfloat2 x, vfloat2 y) {
  // |x| >= |y|
  vfloat s = vsub_vf_vf_vf(vf2getx_vf_vf2(x), vf2getx_vf_vf2(y));
  vfloat t = vsub_vf_vf_vf(vf2getx_vf_vf2(x), s);
  t = vsub_vf_vf_vf(t, vf2getx_vf_vf2(y));
  t = vadd_vf_vf_vf(t, vf2gety_vf_vf2(x));
  return vf2setxy_vf2_vf_vf(s, vsub_vf_vf_vf(t, vf2gety_vf_vf2(y)));
}

#ifdef ENABLE_FMA_SP
// FMA implementations: vfmapn computes a*b - c and vfmanp computes c - a*b
// exactly-rounded, so the multiplication error terms come out directly.

// ff division via one Newton step on a reciprocal estimate.
static INLINE CONST VECTOR_CC vfloat2 dfdiv_vf2_vf2_vf2(vfloat2 n, vfloat2 d) {
  vfloat t = vrec_vf_vf(vf2getx_vf_vf2(d));
  vfloat s = vmul_vf_vf_vf(vf2getx_vf_vf2(n), t);
  vfloat u = vfmapn_vf_vf_vf_vf(t, vf2getx_vf_vf2(n), s);
  vfloat v = vfmanp_vf_vf_vf_vf(vf2gety_vf_vf2(d), t,
                                vfmanp_vf_vf_vf_vf(vf2getx_vf_vf2(d), t, vcast_vf_f(1)));
  return vf2setxy_vf2_vf_vf(s, vfma_vf_vf_vf_vf(s, v, vfma_vf_vf_vf_vf(vf2gety_vf_vf2(n), t, u)));
}

// Exact product of two floats (2Prod with FMA).
static INLINE CONST VECTOR_CC vfloat2 dfmul_vf2_vf_vf(vfloat x, vfloat y) {
  vfloat s = vmul_vf_vf_vf(x, y);
  return vf2setxy_vf2_vf_vf(s, vfmapn_vf_vf_vf_vf(x, y, s));
}

static INLINE CONST VECTOR_CC vfloat2 dfsqu_vf2_vf2(vfloat2 x) {
  vfloat s = vmul_vf_vf_vf(vf2getx_vf_vf2(x), vf2getx_vf_vf2(x));
  return vf2setxy_vf2_vf_vf(s, vfma_vf_vf_vf_vf(vadd_vf_vf_vf(vf2getx_vf_vf2(x), vf2getx_vf_vf2(x)),
                                                vf2gety_vf_vf2(x),
                                                vfmapn_vf_vf_vf_vf(vf2getx_vf_vf2(x), vf2getx_vf_vf2(x), s)));
}

// Square collapsed to a single vfloat (drops the error term).
static INLINE CONST VECTOR_CC vfloat dfsqu_vf_vf2(vfloat2 x) {
  return vfma_vf_vf_vf_vf(vf2getx_vf_vf2(x), vf2getx_vf_vf2(x),
                          vadd_vf_vf_vf(vmul_vf_vf_vf(vf2getx_vf_vf2(x), vf2gety_vf_vf2(x)),
                                        vmul_vf_vf_vf(vf2getx_vf_vf2(x), vf2gety_vf_vf2(x))));
}

static INLINE CONST VECTOR_CC vfloat2 dfmul_vf2_vf2_vf2(vfloat2 x, vfloat2 y) {
  vfloat s = vmul_vf_vf_vf(vf2getx_vf_vf2(x), vf2getx_vf_vf2(y));
  return vf2setxy_vf2_vf_vf(s, vfma_vf_vf_vf_vf(vf2getx_vf_vf2(x), vf2gety_vf_vf2(y),
                                                vfma_vf_vf_vf_vf(vf2gety_vf_vf2(x), vf2getx_vf_vf2(y),
                                                                 vfmapn_vf_vf_vf_vf(vf2getx_vf_vf2(x), vf2getx_vf_vf2(y), s))));
}

// Product collapsed to a single vfloat.
static INLINE CONST VECTOR_CC vfloat dfmul_vf_vf2_vf2(vfloat2 x, vfloat2 y) {
  return vfma_vf_vf_vf_vf(vf2getx_vf_vf2(x), vf2getx_vf_vf2(y),
                          vfma_vf_vf_vf_vf(vf2gety_vf_vf2(x), vf2getx_vf_vf2(y),
                                           vmul_vf_vf_vf(vf2getx_vf_vf2(x), vf2gety_vf_vf2(y))));
}

static INLINE CONST VECTOR_CC vfloat2 dfmul_vf2_vf2_vf(vfloat2 x, vfloat y) {
  vfloat s = vmul_vf_vf_vf(vf2getx_vf_vf2(x), y);
  return vf2setxy_vf2_vf_vf(s, vfma_vf_vf_vf_vf(vf2gety_vf_vf2(x), y,
                                                vfmapn_vf_vf_vf_vf(vf2getx_vf_vf2(x), y, s)));
}

// Reciprocal refined with one FMA-based Newton correction.
static INLINE CONST VECTOR_CC vfloat2 dfrec_vf2_vf(vfloat d) {
  vfloat s = vrec_vf_vf(d);
  return vf2setxy_vf2_vf_vf(s, vmul_vf_vf_vf(s, vfmanp_vf_vf_vf_vf(d, s, vcast_vf_f(1))));
}

static INLINE CONST VECTOR_CC vfloat2 dfrec_vf2_vf2(vfloat2 d) {
  vfloat s = vrec_vf_vf(vf2getx_vf_vf2(d));
  return vf2setxy_vf2_vf_vf(s, vmul_vf_vf_vf(s, vfmanp_vf_vf_vf_vf(vf2gety_vf_vf2(d), s,
                                                                   vfmanp_vf_vf_vf_vf(vf2getx_vf_vf2(d), s, vcast_vf_f(1)))));
}
#else
// Non-FMA implementations: operands are split via vupper_vf_vf into high and
// low halves whose partial products are exact (Dekker multiplication), and
// the partial products are accumulated with vmla chains.

static INLINE CONST VECTOR_CC vfloat2 dfdiv_vf2_vf2_vf2(vfloat2 n, vfloat2 d) {
  vfloat t = vrec_vf_vf(vf2getx_vf_vf2(d));
  vfloat dh = vupper_vf_vf(vf2getx_vf_vf2(d)), dl = vsub_vf_vf_vf(vf2getx_vf_vf2(d), dh);
  vfloat th = vupper_vf_vf(t), tl = vsub_vf_vf_vf(t, th);
  vfloat nhh = vupper_vf_vf(vf2getx_vf_vf2(n)), nhl = vsub_vf_vf_vf(vf2getx_vf_vf2(n), nhh);

  vfloat s = vmul_vf_vf_vf(vf2getx_vf_vf2(n), t);

  vfloat u, w;
  // w accumulates d.x * t - 1 exactly from the split partial products.
  w = vcast_vf_f(-1);
  w = vmla_vf_vf_vf_vf(dh, th, w);
  w = vmla_vf_vf_vf_vf(dh, tl, w);
  w = vmla_vf_vf_vf_vf(dl, th, w);
  w = vmla_vf_vf_vf_vf(dl, tl, w);
  w = vneg_vf_vf(w);

  // u accumulates n.x * t - s plus the reciprocal correction term.
  u = vmla_vf_vf_vf_vf(nhh, th, vneg_vf_vf(s));
  u = vmla_vf_vf_vf_vf(nhh, tl, u);
  u = vmla_vf_vf_vf_vf(nhl, th, u);
  u = vmla_vf_vf_vf_vf(nhl, tl, u);
  u = vmla_vf_vf_vf_vf(s, w, u);

  return vf2setxy_vf2_vf_vf(s, vmla_vf_vf_vf_vf(t, vsub_vf_vf_vf(vf2gety_vf_vf2(n),
                                                                 vmul_vf_vf_vf(s, vf2gety_vf_vf2(d))), u));
}

static INLINE CONST VECTOR_CC vfloat2 dfmul_vf2_vf_vf(vfloat x, vfloat y) {
  vfloat xh = vupper_vf_vf(x), xl = vsub_vf_vf_vf(x, xh);
  vfloat yh = vupper_vf_vf(y), yl = vsub_vf_vf_vf(y, yh);
  vfloat s = vmul_vf_vf_vf(x, y), t;

  t = vmla_vf_vf_vf_vf(xh, yh, vneg_vf_vf(s));
  t = vmla_vf_vf_vf_vf(xl, yh, t);
  t = vmla_vf_vf_vf_vf(xh, yl, t);
  t = vmla_vf_vf_vf_vf(xl, yl, t);

  return vf2setxy_vf2_vf_vf(s, t);
}

static INLINE CONST VECTOR_CC vfloat2 dfmul_vf2_vf2_vf(vfloat2 x, vfloat y) {
  vfloat xh = vupper_vf_vf(vf2getx_vf_vf2(x)), xl = vsub_vf_vf_vf(vf2getx_vf_vf2(x), xh);
  vfloat yh = vupper_vf_vf(y), yl = vsub_vf_vf_vf(y, yh);
  vfloat s = vmul_vf_vf_vf(vf2getx_vf_vf2(x), y), t;

  t = vmla_vf_vf_vf_vf(xh, yh, vneg_vf_vf(s));
  t = vmla_vf_vf_vf_vf(xl, yh, t);
  t = vmla_vf_vf_vf_vf(xh, yl, t);
  t = vmla_vf_vf_vf_vf(xl, yl, t);
  t = vmla_vf_vf_vf_vf(vf2gety_vf_vf2(x), y, t);

  return vf2setxy_vf2_vf_vf(s, t);
}

static INLINE CONST VECTOR_CC vfloat2 dfmul_vf2_vf2_vf2(vfloat2 x, vfloat2 y) {
  vfloat xh = vupper_vf_vf(vf2getx_vf_vf2(x)), xl = vsub_vf_vf_vf(vf2getx_vf_vf2(x), xh);
  vfloat yh = vupper_vf_vf(vf2getx_vf_vf2(y)), yl = vsub_vf_vf_vf(vf2getx_vf_vf2(y), yh);
  vfloat s = vmul_vf_vf_vf(vf2getx_vf_vf2(x), vf2getx_vf_vf2(y)), t;

  t = vmla_vf_vf_vf_vf(xh, yh, vneg_vf_vf(s));
  t = vmla_vf_vf_vf_vf(xl, yh, t);
  t = vmla_vf_vf_vf_vf(xh, yl, t);
  t = vmla_vf_vf_vf_vf(xl, yl, t);
  t = vmla_vf_vf_vf_vf(vf2getx_vf_vf2(x), vf2gety_vf_vf2(y), t);
  t = vmla_vf_vf_vf_vf(vf2gety_vf_vf2(x), vf2getx_vf_vf2(y), t);

  return vf2setxy_vf2_vf_vf(s, t);
}

// Product collapsed to a single vfloat; terms summed smallest-first.
static INLINE CONST VECTOR_CC vfloat dfmul_vf_vf2_vf2(vfloat2 x, vfloat2 y) {
  vfloat xh = vupper_vf_vf(vf2getx_vf_vf2(x)), xl = vsub_vf_vf_vf(vf2getx_vf_vf2(x), xh);
  vfloat yh = vupper_vf_vf(vf2getx_vf_vf2(y)), yl = vsub_vf_vf_vf(vf2getx_vf_vf2(y), yh);
  return vadd_vf_6vf(vmul_vf_vf_vf(vf2gety_vf_vf2(x), yh), vmul_vf_vf_vf(xh, vf2gety_vf_vf2(y)),
                     vmul_vf_vf_vf(xl, yl), vmul_vf_vf_vf(xh, yl), vmul_vf_vf_vf(xl, yh),
                     vmul_vf_vf_vf(xh, yh));
}

static INLINE CONST VECTOR_CC vfloat2 dfsqu_vf2_vf2(vfloat2 x) {
  vfloat xh = vupper_vf_vf(vf2getx_vf_vf2(x)), xl = vsub_vf_vf_vf(vf2getx_vf_vf2(x), xh);
  vfloat s = vmul_vf_vf_vf(vf2getx_vf_vf2(x), vf2getx_vf_vf2(x)), t;

  t = vmla_vf_vf_vf_vf(xh, xh, vneg_vf_vf(s));
  t = vmla_vf_vf_vf_vf(vadd_vf_vf_vf(xh, xh), xl, t);
  t = vmla_vf_vf_vf_vf(xl, xl, t);
  t = vmla_vf_vf_vf_vf(vf2getx_vf_vf2(x), vadd_vf_vf_vf(vf2gety_vf_vf2(x), vf2gety_vf_vf2(x)), t);

  return vf2setxy_vf2_vf_vf(s, t);
}

static INLINE CONST VECTOR_CC vfloat dfsqu_vf_vf2(vfloat2 x) {
  vfloat xh = vupper_vf_vf(vf2getx_vf_vf2(x)), xl = vsub_vf_vf_vf(vf2getx_vf_vf2(x), xh);
  return vadd_vf_5vf(vmul_vf_vf_vf(xh, vf2gety_vf_vf2(x)), vmul_vf_vf_vf(xh, vf2gety_vf_vf2(x)),
                     vmul_vf_vf_vf(xl, xl),
                     vadd_vf_vf_vf(vmul_vf_vf_vf(xh, xl), vmul_vf_vf_vf(xh, xl)),
                     vmul_vf_vf_vf(xh, xh));
}

static INLINE CONST VECTOR_CC vfloat2 dfrec_vf2_vf(vfloat d) {
  vfloat t = vrec_vf_vf(d);
  vfloat dh = vupper_vf_vf(d), dl = vsub_vf_vf_vf(d, dh);
  vfloat th = vupper_vf_vf(t), tl = vsub_vf_vf_vf(t, th);

  // u accumulates d * t - 1 exactly from the split partial products.
  vfloat u = vcast_vf_f(-1);
  u = vmla_vf_vf_vf_vf(dh, th, u);
  u = vmla_vf_vf_vf_vf(dh, tl, u);
  u = vmla_vf_vf_vf_vf(dl, th, u);
  u = vmla_vf_vf_vf_vf(dl, tl, u);

  return vf2setxy_vf2_vf_vf(t, vmul_vf_vf_vf(vneg_vf_vf(t), u));
}

static INLINE CONST VECTOR_CC vfloat2 dfrec_vf2_vf2(vfloat2 d) {
  vfloat t = vrec_vf_vf(vf2getx_vf_vf2(d));
  vfloat dh = vupper_vf_vf(vf2getx_vf_vf2(d)), dl = vsub_vf_vf_vf(vf2getx_vf_vf2(d), dh);
  vfloat th = vupper_vf_vf(t), tl = vsub_vf_vf_vf(t, th);

  vfloat u = vcast_vf_f(-1);
  u = vmla_vf_vf_vf_vf(dh, th, u);
  u = vmla_vf_vf_vf_vf(dh, tl, u);
  u = vmla_vf_vf_vf_vf(dl, th, u);
  u = vmla_vf_vf_vf_vf(dl, tl, u);
  u = vmla_vf_vf_vf_vf(vf2gety_vf_vf2(d), t, u);

  return vf2setxy_vf2_vf_vf(t, vmul_vf_vf_vf(vneg_vf_vf(t), u));
}
#endif

// sqrt(d) refined from a hardware estimate; the ENABLE_RECSQRT_SP path uses
// a reciprocal-square-root estimate instead of a plain sqrt.
static INLINE CONST VECTOR_CC vfloat2 dfsqrt_vf2_vf2(vfloat2 d) {
#ifdef ENABLE_RECSQRT_SP
  vfloat x = vrecsqrt_vf_vf(vadd_vf_vf_vf(vf2getx_vf_vf2(d), vf2gety_vf_vf2(d)));
  vfloat2 r = dfmul_vf2_vf2_vf(d, x);
  return dfscale_vf2_vf2_vf(dfmul_vf2_vf2_vf2(r, dfadd2_vf2_vf2_vf(dfmul_vf2_vf2_vf(r, x),
                                                                   vcast_vf_f(-3.0))),
                            vcast_vf_f(-0.5));
#else
  vfloat t = vsqrt_vf_vf(vadd_vf_vf_vf(vf2getx_vf_vf2(d), vf2gety_vf_vf2(d)));
  return dfscale_vf2_vf2_vf(dfmul_vf2_vf2_vf2(dfadd2_vf2_vf2_vf2(d, dfmul_vf2_vf_vf(t, t)),
                                              dfrec_vf2_vf(t)),
                            vcast_vf_f(0.5));
#endif
}

static
INLINE CONST VECTOR_CC vfloat2 dfsqrt_vf2_vf(vfloat d) {
  /* Double-float sqrt of a single vfloat: Heron step
   * 0.5 * (d + t^2) / t around the hardware estimate t. */
  vfloat t = vsqrt_vf_vf(d);
  return dfscale_vf2_vf2_vf(
      dfmul_vf2_vf2_vf2(dfadd2_vf2_vf_vf2(d, dfmul_vf2_vf_vf(t, t)),
                        dfrec_vf2_vf(t)),
      vcast_vf_f(0.5f));
}

================================================ FILE: src/estrin.h ================================================

// Copyright Naoki Shibata and contributors 2010 - 2020.
// Distributed under the Boost Software License, Version 1.0.
// (See accompanying file LICENSE.txt or copy at
// http://www.boost.org/LICENSE_1_0.txt)

// These are macros for evaluating polynomials using Estrin's method
//
// POLYn evaluates a degree-(n-1) polynomial with coefficients c0..c(n-1)
// (highest degree first in the argument list).  Callers must pass the
// precomputed powers x2 = x*x, x4 = x2*x2, x8 = x4*x4, x16 = x8*x8; the
// tree shape lets independent MLA sub-expressions run in parallel.

#define POLY2(x, c1, c0) MLA(x, C2V(c1), C2V(c0))
#define POLY3(x, x2, c2, c1, c0) MLA(x2, C2V(c2), MLA(x, C2V(c1), C2V(c0)))
#define POLY4(x, x2, c3, c2, c1, c0) MLA(x2, MLA(x, C2V(c3), C2V(c2)), MLA(x, C2V(c1), C2V(c0)))
#define POLY5(x, x2, x4, c4, c3, c2, c1, c0) MLA(x4, C2V(c4), POLY4(x, x2, c3, c2, c1, c0))
#define POLY6(x, x2, x4, c5, c4, c3, c2, c1, c0) MLA(x4, POLY2(x, c5, c4), POLY4(x, x2, c3, c2, c1, c0))
#define POLY7(x, x2, x4, c6, c5, c4, c3, c2, c1, c0) MLA(x4, POLY3(x, x2, c6, c5, c4), POLY4(x, x2, c3, c2, c1, c0))
#define POLY8(x, x2, x4, c7, c6, c5, c4, c3, c2, c1, c0) MLA(x4, POLY4(x, x2, c7, c6, c5, c4), POLY4(x, x2, c3, c2, c1, c0))
#define POLY9(x, x2, x4, x8, c8, c7, c6, c5, c4, c3, c2, c1, c0)\
  MLA(x8, C2V(c8), POLY8(x, x2, x4, c7, c6, c5, c4, c3, c2, c1, c0))
#define POLY10(x, x2, x4, x8, c9, c8, c7, c6, c5, c4, c3, c2, c1, c0)\
  MLA(x8, POLY2(x, c9, c8), POLY8(x, x2, x4, c7, c6, c5, c4, c3, c2, c1, c0))
#define POLY11(x, x2, x4, x8, ca, c9, c8, c7, c6, c5, c4, c3, c2, c1, c0)\
  MLA(x8, POLY3(x, x2, ca, c9, c8), POLY8(x, x2, x4, c7, c6, c5, c4, c3, c2, c1, c0))
#define POLY12(x, x2, x4, x8, cb, ca, c9, c8, c7, c6, c5, c4, c3, c2, c1, c0)\
  MLA(x8, POLY4(x, x2, cb, ca, c9, c8), POLY8(x, x2, x4, c7, c6, c5, c4, c3, c2, c1, c0))
#define POLY13(x, x2, x4, x8, cc, cb, ca, c9, c8, c7, c6, c5, c4, c3, c2, c1, c0)\
  MLA(x8, POLY5(x, x2, x4, cc, cb, ca, c9, c8), POLY8(x, x2, x4, c7, c6, c5, c4, c3, c2, c1, c0))
#define POLY14(x, x2, x4, x8, cd, cc, cb, ca, c9, c8, c7, c6, c5, c4, c3, c2, c1, c0)\
  MLA(x8, POLY6(x, x2, x4, cd, cc, cb, ca, c9, c8), POLY8(x, x2, x4, c7, c6, c5, c4, c3, c2, c1, c0))
#define POLY15(x, x2, x4, x8, ce, cd, cc, cb, ca, c9, c8, c7, c6, c5, c4, c3, c2, c1, c0)\
  MLA(x8, POLY7(x, x2, x4, ce, cd, cc, cb, ca, c9, c8), POLY8(x, x2, x4, c7, c6, c5, c4, c3, c2, c1, c0))
#define POLY16(x, x2, x4, x8, cf, ce, cd, cc, cb, ca, c9, c8, c7, c6, c5, c4, c3, c2, c1, c0)\
  MLA(x8, POLY8(x, x2, x4, cf, ce, cd, cc, cb, ca, c9, c8), POLY8(x, x2, x4, c7, c6, c5, c4, c3, c2, c1, c0))
#define POLY17(x, x2, x4, x8, x16, d0, cf, ce, cd, cc, cb, ca, c9, c8, c7, c6, c5, c4, c3, c2, c1, c0)\
  MLA(x16, C2V(d0), POLY16(x, x2, x4, x8, cf, ce, cd, cc, cb, ca, c9, c8, c7, c6, c5, c4, c3, c2, c1, c0))
#define POLY18(x, x2, x4, x8, x16, d1, d0, cf, ce, cd, cc, cb, ca, c9, c8, c7, c6, c5, c4, c3, c2, c1, c0)\
  MLA(x16, POLY2(x, d1, d0), POLY16(x, x2, x4, x8, cf, ce, cd, cc, cb, ca, c9, c8, c7, c6, c5, c4, c3, c2, c1, c0))
#define POLY19(x, x2, x4, x8, x16, d2, d1, d0, cf, ce, cd, cc, cb, ca, c9, c8, c7, c6, c5, c4, c3, c2, c1, c0)\
  MLA(x16, POLY3(x, x2, d2, d1, d0), POLY16(x, x2, x4, x8, cf, ce, cd, cc, cb, ca, c9, c8, c7, c6, c5, c4, c3, c2, c1, c0))

================================================ FILE: src/fp16.cpp ================================================

/* Copyright (c) 2021 Agenium Scale

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in
all copies or substantial
portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ /* We follow IEEE754-2008 for FP16 (= binary16) storage. However IEEE754 compliance is not guaranteed by C/C++ standards and therefore we propose two modes: - IEEE754 mode with NaNs, INFs, ... (this is the default) - non IEEE754 mode compatible with only C89 (no NaNs, INFs...) FP16 format ----------- +---+--------+--------------+ | S | E EEEE | MM MMMM MMMM | +---+--------+--------------+ 15 14 10 9 0 FP16 interpretation ------------------- S = sign bit E = exponent bits (offset is 15), emin = -14, emax = 15 M = mantissa bits E == 0 and M != 0 => subnormal => (-1)^S x 2^(-14) x (0 + 2^(-10) x T) 32 > E > 0 => normal => (-1)^S x 2^(E - 15) x (1 + 2^(-10) x T) FP32 format ----------- +---+-----------+------------------------------+ | S | EEEE EEEE | MMM MMMM MMMM MMMM MMMM MMMM | +---+-----------+------------------------------+ 31 30 23 22 0 FP32 interpretation ------------------- S = sign bit E = exponent bits (offset is 127), emin = -126, emax = 127 M = mantissa bits E == 0 and M != 0 => subnormal => (-1)^S x 2^(-126) x (0 + 2^(-23) x T) 256 > E > 0 => normal => (-1)^S x 2^(E - 127) x (1 + 2^(-23) x T) In both cases we treat subnormal numbers as zeros. Moreover the implementation below was written so that it can easily be SIMD'ed. 
*/

#define NSIMD_INSIDE
#include
#ifdef NSIMD_NO_IEEE754
#include
#endif
#include
/* NOTE(review): the three #include directives above lost their header names
 * during extraction -- restore them from the repository before building. */

#ifdef NSIMD_C_LINKAGE_FOR_F16
extern "C" {
#endif

// ----------------------------------------------------------------------------
// Convert a FP16 as an u16 to a float

NSIMD_DLLEXPORT float nsimd_u16_to_f32(u16 a) {
#ifdef NSIMD_NO_IEEE754
  /* C89-portable path: decode the fields arithmetically, no bit punning. */
  float sign;
  int exponent, mantissa;
  sign = (a >> 15) == 1 ? -1.0f : 1.0f;
  exponent = (a >> 10) & 0x1F;
  /* NOTE(review): the (float) cast is immediately truncated back to int
   * (mantissa is declared int); harmless since the value fits, but the
   * cast looks unintentional -- confirm. */
  mantissa = (float)(a & 0x3FF);
  if (exponent == 0) {
    /* Subnormal: value = sign * mantissa * 2^-24. */
    return std::ldexp(sign * mantissa, -24);
  } else {
    /* Normal: 0x400 restores the implicit leading mantissa bit; the
     * -25 = -15 (bias) - 10 (mantissa width) is folded into the exponent. */
    return std::ldexp(sign * (0x400 | mantissa), exponent - 25);
  }
#else
  /* IEEE754 path: widen the three FP16 fields into FP32 positions. */
  u32 sign, mantissa, exponent;
  sign = a & 0x8000;
  exponent = (a >> 10) & 0x1F;
  mantissa = (a & 0x3FF);
  if (exponent == 31) {
    /* We have a NaN or an INF. */
    exponent = 255;
    /* Force the first bit of the mantissa to 1 to be compatible with the way
     * Intel convert f16 to f32 */
    if (mantissa != 0) {
      //mantissa |= 0x200;
    }
  } else if (exponent == 0 && mantissa == 0) {
    /* Nothing to do */
  } else if (exponent == 0) {
    /* FP16 subnormal: normalize so it becomes a FP32 normal. */
    u32 mask = mantissa;
    /* Find the most significant bit of the mantissa (could use a better
     * algorithm) */
    int i = -1;
    do {
      ++i;
      mask <<= 1;
    } while ((mask & 0x400) == 0);
    /* Update the mantissa and the exponent */
    mantissa = (mask & 0x3ff);
    exponent += (u32)(112 - i);
  } else {
    /* the exponent must be recomputed -15 + 127 */
    exponent += 112;
  }
  /* We then rebuild the float */
  return nsimd_scalar_reinterpret_f32_u32(
      (sign << 16) | (((u32)exponent) << 23) | (mantissa << 13));
#endif
}

// ----------------------------------------------------------------------------
// Convert a FP16 to a float

#ifndef NSIMD_NATIVE_FP16
NSIMD_DLLEXPORT f32 nsimd_f16_to_f32(f16 a) { return nsimd_u16_to_f32(a.u); }
#endif

// ----------------------------------------------------------------------------
// Convert a float to a FP16 as an u16

NSIMD_DLLEXPORT u16 nsimd_f32_to_u16(f32 a) {
#ifdef NSIMD_NO_IEEE754
  /* C89-portable path built on frexp; truncates (no round-to-nearest). */
  double frac;
  int exponent;
  u32 sign, mantissa;
  /* Get mantissa (= fractional part) and exponent. */
  frac = std::frexp(a, &exponent);
  /* Get sign and make sure frac is positive. */
  if (frac < 0) {
    sign = 1u;
    frac = -frac;
  } else {
    sign = 0u;
  }
  /* Add 1 to the exponent to have the IEEE exponent: The mantissa here
     lives in [0.5, 1) whereas for IEEE it must live in [1, 2). */
  exponent++;
  if (exponent < -14) {
    /* We have a too small number, returns zero */
    return (u16)(sign << 15);
  } else if (exponent > 15) {
    /* We have a too big number, return INF */
    return (u16)((sign << 15) | 0x7C00);
  } else {
    /* We have a normal number. Get the mantissa: frac lives in [0.5, 1) and
       is of the form 0.1XXXXXXX, therefore to get the mantissa frac must be
       multiplied by 2^11 = 2048. Then it will be of the form
       1XX XXXX XXXX.XXXXX, so we have to get rid of the leading bit. */
    mantissa = (u32)(frac * 2048.0) & 0x3FF;
    return (u16)((sign << 15) | ((u32)(exponent + 15) << 10) | mantissa);
  }
#else
  /* IEEE754 path: bit-level conversion with round-to-nearest-even. */
  u32 sign, mantissa;
  int exponent;
  u32 in_u = nsimd_scalar_reinterpret_u32_f32(a);
  sign = in_u & 0x80000000;
  exponent = (int)((in_u >> 23) & 0xFF);
  mantissa = (in_u & 0x7FFFFF);
  if (exponent == 255 && mantissa != 0) {
    /* Nan */
    return (u16)(0xffff);
  }
  /* 0x477ff000 is the first f32 that rounds above the largest finite f16. */
  const f32 biggest_f16 = nsimd_scalar_reinterpret_f32_u32(0x477ff000);
  if (a >= biggest_f16 || a <= -biggest_f16) {
    /* Number is too big to be representable in half => return infinity */
    return (u16)(sign >> 16 | 0x1f << 10);
  }
  const f32 smallest_f16 = nsimd_scalar_reinterpret_f32_u32(0x33000000);
  if (a <= smallest_f16 && a >= -smallest_f16) {
    /* Number is too small to be representable in half => return ±0 */
    return (u16)(sign >> 16);
  }
  /* For FP32 exponent bias is 127, compute the real exponent. */
  exponent -= 127;
  /* Following algorithm taken from:
   * https://fgiesen.wordpress.com/2012/03/28/half-to-float-done-quic/ */
  const f32 denormal_f16 = nsimd_scalar_reinterpret_f32_u32(0x38800000);
  if (a < denormal_f16 && a > -denormal_f16) {
    /* Denormalized half: add a "magic" float so the FPU performs the
     * shift-and-round, then strip the magic back off. */
    const u32 magic_u = ((127 - 15) + (23 - 10) + 1) << 23;
    const f32 magic_f = nsimd_scalar_reinterpret_f32_u32(magic_u);
    /* NOTE(review): this declaration shadows the outer in_u on purpose:
     * the sign bit is cleared locally and re-applied on return. */
    u32 in_u = nsimd_scalar_reinterpret_u32_f32(a);
    in_u &= ~0x80000000u;
    f32 in_f = nsimd_scalar_reinterpret_f32_u32(in_u);
    in_f += magic_f;
    in_u = nsimd_scalar_reinterpret_u32_f32(in_f);
    in_u -= magic_u;
    return (u16)((sign >> 16) | in_u);
  }
  /* Normal half: rebias the exponent and round to nearest even
   * (0xfff + lowest kept mantissa bit). */
  in_u &= ~0x80000000U;
  u32 mant_odd = (in_u >> 13) & 1;
  in_u += ((u32)(15 - 127) << 23) + 0xfffU;
  in_u += mant_odd;
  return (u16)((sign >> 16) | (in_u >> 13));
#endif
}

// ----------------------------------------------------------------------------
// Convert a float to a FP16

#ifndef NSIMD_NATIVE_FP16
NSIMD_DLLEXPORT f16 nsimd_f32_to_f16(f32 a) {
  f16 ret;
  ret.u = nsimd_f32_to_u16(a);
  return ret;
}
#endif

// ----------------------------------------------------------------------------

#ifdef NSIMD_C_LINKAGE_FOR_F16
} // extern "C"
#endif

// ----------------------------------------------------------------------------
// C++ versions in namespace nsimd

namespace nsimd {

NSIMD_DLLEXPORT u16 f32_to_u16(f32 a) { return nsimd_f32_to_u16(a); }
NSIMD_DLLEXPORT f32 u16_to_f32(u16 a) { return nsimd_u16_to_f32(a); }

#ifndef NSIMD_NATIVE_FP16
NSIMD_DLLEXPORT f16 f32_to_f16(f32 a) { return nsimd_f32_to_f16(a); }
NSIMD_DLLEXPORT f32 f16_to_f32(f16 a) { return nsimd_f16_to_f32(a); }
#endif

} // namespace nsimd

================================================ FILE: src/gpu.cpp ================================================

/* Copyright (c) 2021 Agenium Scale

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to
deal in the Software without restriction,
including without limitation the rights to use, copy, modify, merge, publish,
distribute, sublicense, and/or sell copies of the Software, and to permit
persons to whom the Software is furnished to do so, subject to the following
conditions:

The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
IN THE SOFTWARE.

*/

#define NSIMD_INSIDE
#include
/* NOTE(review): the #include above lost its header name during extraction. */

#if defined(NSIMD_ONEAPI) && NSIMD_CXX > 0

// ----------------------------------------------------------------------------
// oneAPI

// NSIMD error handler
namespace nsimd {
namespace oneapi {
/* Async-exception handler installed on the default SYCL queue: prints the
 * exception and aborts.
 * NOTE(review): the template parameter list (presumably
 * "template <typename Exception = ...>", which supplies the Exception type
 * used in the catch below and the <> at the use site) was stripped during
 * extraction -- restore it from the repository. */
template struct sycl_async_error_handler {
  void operator()(const sycl::exception_list &elist) {
    for (const auto &exc : elist) {
      try {
        std::rethrow_exception(exc);
      } catch (const Exception &exc) {
        fprintf(stderr, "NSIMD Internal error:\n\tError: %s %s %d\n",
                exc.what(), __FILE__, __LINE__);
        exit(EXIT_FAILURE);
      }
    }
  }
};
} // namespace oneapi
} // namespace nsimd

extern "C" {

// Singleton to get default oneAPI queue
NSIMD_DLLSPEC void *nsimd_oneapi_default_queue() {
  /* Function-local static: constructed once, lives for the whole process. */
  static sycl::queue ret(sycl::default_selector{},
                         nsimd::oneapi::sycl_async_error_handler<>{});
  return (void *)&ret;
}

/* oneAPI: total number of work-items, i.e. nb_items rounded up to a
 * multiple of block_size. */
NSIMD_DLLSPEC nsimd_nat nsimd_kernel_param(nsimd_nat nb_items,
                                           nsimd_nat block_size) {
  return block_size * ((nb_items + block_size - 1) / block_size);
}

} // extern "C"

#elif defined(NSIMD_CUDA) || defined(NSIMD_ROCM)

// ----------------------------------------------------------------------------
// CUDA/ROCm

/* CUDA/ROCm: number of blocks needed to cover nb_items (round up). */
NSIMD_DLLSPEC nsimd_nat nsimd_kernel_param(nsimd_nat nb_items,
                                           nsimd_nat block_size) {
  return (nb_items + block_size - 1) / block_size;
}

#else

// ----------------------------------------------------------------------------
// CPU/SIMD

/* CPU/SIMD: number of full blocks (truncating division -- a possible
 * remainder is presumably handled by a scalar tail loop elsewhere). */
NSIMD_DLLSPEC nsimd_nat nsimd_kernel_param(nsimd_nat nb_items,
                                           nsimd_nat block_size) {
  return nb_items / block_size;
}

// ----------------------------------------------------------------------------

#endif

================================================ FILE: src/helperadvsimd.h ================================================

/*********************************************************************/
/* Copyright ARM Ltd. 2010 - 2019. */
/* Distributed under the Boost Software License, Version 1.0. */
/* (See accompanying file LICENSE.txt or copy at */
/* http://www.boost.org/LICENSE_1_0.txt) */
/*********************************************************************/

#ifndef __ARM_NEON
#error Please specify advsimd flags.
#endif

#if !defined(SLEEF_GENHEADER)
#include
#include
/* NOTE(review): the two #include directives above lost their header names
 * during extraction (presumably <arm_neon.h> and <stdint.h> -- confirm). */
#include "misc.h"
#endif // #if !defined(SLEEF_GENHEADER)

/* Configuration for this ISA.  The //@ lines are echo markers consumed by
 * SLEEF's header generator -- keep them in sync with the real #defines. */
#define ENABLE_DP
//@#define ENABLE_DP
#define LOG2VECTLENDP 1
//@#define LOG2VECTLENDP 1
#define VECTLENDP (1 << LOG2VECTLENDP)
//@#define VECTLENDP (1 << LOG2VECTLENDP)

#define ENABLE_SP
//@#define ENABLE_SP
#define LOG2VECTLENSP 2
//@#define LOG2VECTLENSP 2
#define VECTLENSP (1 << LOG2VECTLENSP)
//@#define VECTLENSP (1 << LOG2VECTLENSP)

#if CONFIG == 1
#define ENABLE_FMA_DP
//@#define ENABLE_FMA_DP
#define ENABLE_FMA_SP
//@#define ENABLE_FMA_SP
#endif

#define FULL_FP_ROUNDING
//@#define FULL_FP_ROUNDING
#define ACCURATE_SQRT
//@#define ACCURATE_SQRT

#define ISANAME "AArch64 AdvSIMD"

// Mask definition
typedef uint32x4_t vmask;
typedef uint32x4_t vopmask;

// Single precision definitions
typedef float32x4_t vfloat;
typedef int32x4_t vint2;

// Double precision definitions
typedef float64x2_t vdouble;
typedef int32x2_t vint;

typedef struct {
  vmask x, y;
} vmask2;

#define DFTPRIORITY 10

/* Availability probe: AdvSIMD is baseline on AArch64, always available. */
static INLINE int vavailability_i(int name) {
return 3; }

/* No-op: AArch64 hardware prefetchers make an explicit hint unnecessary. */
static INLINE void vprefetch_v_p(const void *ptr) {}

/* Nonzero iff every 32-bit lane of the mask is all-ones: AND the two
 * halves, then pairwise-min, then read lane 0. */
static INLINE VECTOR_CC int vtestallones_i_vo32(vopmask g) {
  uint32x2_t x0 = vand_u32(vget_low_u32(g), vget_high_u32(g));
  uint32x2_t x1 = vpmin_u32(x0, x0);
  return vget_lane_u32(x1, 0);
}

/* Same reduction as vo32; masks are 32-bit lanes either way here. */
static INLINE VECTOR_CC int vtestallones_i_vo64(vopmask g) {
  uint32x2_t x0 = vand_u32(vget_low_u32(g), vget_high_u32(g));
  uint32x2_t x1 = vpmin_u32(x0, x0);
  return vget_lane_u32(x1, 0);
}

// Vector load / store
// (aligned and unaligned variants map to the same AdvSIMD instruction)
static INLINE VECTOR_CC vdouble vload_vd_p(const double *ptr) { return vld1q_f64(ptr); }
static INLINE VECTOR_CC vdouble vloadu_vd_p(const double *ptr) { return vld1q_f64(ptr); }
static INLINE VECTOR_CC void vstore_v_p_vd(double *ptr, vdouble v) { vst1q_f64(ptr, v); }
static INLINE VECTOR_CC void vstoreu_v_p_vd(double *ptr, vdouble v) { vst1q_f64(ptr, v); }

static INLINE VECTOR_CC vfloat vload_vf_p(const float *ptr) { return vld1q_f32(ptr); }
static INLINE VECTOR_CC vfloat vloadu_vf_p(const float *ptr) { return vld1q_f32(ptr); }
static INLINE VECTOR_CC void vstore_v_p_vf(float *ptr, vfloat v) { vst1q_f32(ptr, v); }
static INLINE VECTOR_CC void vstoreu_v_p_vf(float *ptr, vfloat v) { vst1q_f32(ptr, v); }

static INLINE VECTOR_CC vint2 vloadu_vi2_p(int32_t *p) { return vld1q_s32(p); }
static INLINE VECTOR_CC void vstoreu_v_p_vi2(int32_t *p, vint2 v) { vst1q_s32(p, v); }
static INLINE VECTOR_CC vint vloadu_vi_p(int32_t *p) { return vld1_s32(p); }
static INLINE VECTOR_CC void vstoreu_v_p_vi(int32_t *p, vint v) { vst1_s32(p, v); }

/* Gather: no hardware gather on AdvSIMD, emulated with scalar lane loads. */
static INLINE VECTOR_CC vdouble vgather_vd_p_vi(const double *ptr, vint vi) {
  return ((vdouble){ptr[vget_lane_s32(vi, 0)], ptr[vget_lane_s32(vi, 1)]});
}

static INLINE VECTOR_CC vfloat vgather_vf_p_vi2(const float *ptr, vint2 vi2) {
  return ((vfloat){ptr[vgetq_lane_s32(vi2, 0)], ptr[vgetq_lane_s32(vi2, 1)],
                   ptr[vgetq_lane_s32(vi2, 2)], ptr[vgetq_lane_s32(vi2, 3)]});
}

// Basic logical operations for mask
static INLINE VECTOR_CC vmask vand_vm_vm_vm(vmask x, vmask y) { return vandq_u32(x, y); }
/* andnot(x, y) = y & ~x -- note the swapped operands of vbicq. */
static INLINE VECTOR_CC vmask vandnot_vm_vm_vm(vmask x, vmask y) { return vbicq_u32(y, x); }
static INLINE VECTOR_CC vmask vor_vm_vm_vm(vmask x, vmask y) { return vorrq_u32(x, y); }
static INLINE VECTOR_CC vmask vxor_vm_vm_vm(vmask x, vmask y) { return veorq_u32(x, y); }

// Mask <--> single precision reinterpret
// (bit-pattern casts only; no value conversion)
static INLINE VECTOR_CC vmask vreinterpret_vm_vf(vfloat vf) { return vreinterpretq_u32_f32(vf); }
static INLINE VECTOR_CC vfloat vreinterpret_vf_vm(vmask vm) { return vreinterpretq_f32_u32(vm); }
static INLINE VECTOR_CC vint2 vcast_vi2_vm(vmask vm) { return vreinterpretq_s32_u32(vm); }
static INLINE VECTOR_CC vmask vcast_vm_vi2(vint2 vi) { return vreinterpretq_u32_s32(vi); }

// Mask <--> double precision reinterpret
static INLINE VECTOR_CC vmask vreinterpret_vm_vd(vdouble vd) { return vreinterpretq_u32_f64(vd); }
static INLINE VECTOR_CC vdouble vreinterpret_vd_vm(vmask vm) { return vreinterpretq_f64_u32(vm); }
static INLINE VECTOR_CC vfloat vreinterpret_vf_vi2(vint2 vm) { return vreinterpretq_f32_s32(vm); }
static INLINE VECTOR_CC vint2 vreinterpret_vi2_vf(vfloat vf) { return vreinterpretq_s32_f32(vf); }
static INLINE VECTOR_CC vint2 vreinterpret_vi2_vd(vdouble vd) { return vreinterpretq_s32_f64(vd); }

/****************************************/
/* Single precision FP operations */
/****************************************/
// Broadcast
static INLINE VECTOR_CC vfloat vcast_vf_f(float f) { return vdupq_n_f32(f); }

// Add, Sub, Mul
static INLINE VECTOR_CC vfloat vadd_vf_vf_vf(vfloat x, vfloat y) { return vaddq_f32(x, y); }
static INLINE VECTOR_CC vfloat vsub_vf_vf_vf(vfloat x, vfloat y) { return vsubq_f32(x, y); }
static INLINE VECTOR_CC vfloat vmul_vf_vf_vf(vfloat x, vfloat y) { return vmulq_f32(x, y); }

// |x|, -x
static INLINE VECTOR_CC vfloat vabs_vf_vf(vfloat f) { return vabsq_f32(f); }
static INLINE VECTOR_CC vfloat vneg_vf_vf(vfloat f) { return vnegq_f32(f); }

#if CONFIG == 1
// Multiply accumulate: z = z + x * y
static INLINE
VECTOR_CC vfloat vmla_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return vfmaq_f32(z, x, y); } // Multiply subtract: z = z - x * y static INLINE VECTOR_CC vfloat vmlanp_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return vfmsq_f32(z, x, y); } // Multiply subtract: z = x * y - z static INLINE VECTOR_CC vfloat vmlapn_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return vneg_vf_vf(vfmsq_f32(z, x, y)); } #else static INLINE VECTOR_CC vfloat vmla_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return vadd_vf_vf_vf(vmul_vf_vf_vf(x, y), z); } static INLINE VECTOR_CC vfloat vmlanp_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return vsub_vf_vf_vf(z, vmul_vf_vf_vf(x, y)); } static INLINE VECTOR_CC vfloat vmlapn_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return vsub_vf_vf_vf(vmul_vf_vf_vf(x, y), z); } #endif static INLINE VECTOR_CC vfloat vfma_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { // z + x * y return vfmaq_f32(z, x, y); } static INLINE VECTOR_CC vfloat vfmanp_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { // z - x * y return vfmsq_f32(z, x, y); } static INLINE VECTOR_CC vfloat vfmapn_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { // x * y - z return vneg_vf_vf(vfmanp_vf_vf_vf_vf(x, y, z)); } // Reciprocal 1/x, Division, Square root static INLINE VECTOR_CC vfloat vdiv_vf_vf_vf(vfloat n, vfloat d) { #ifndef ENABLE_ALTDIV return vdivq_f32(n, d); #else // Finite numbers (including denormal) only, gives mostly correctly rounded result float32x4_t t, u, x, y; uint32x4_t i0, i1; i0 = vandq_u32(vreinterpretq_u32_f32(n), vdupq_n_u32(0x7c000000)); i1 = vandq_u32(vreinterpretq_u32_f32(d), vdupq_n_u32(0x7c000000)); i0 = vsubq_u32(vdupq_n_u32(0x7d000000), vshrq_n_u32(vaddq_u32(i0, i1), 1)); t = vreinterpretq_f32_u32(i0); y = vmulq_f32(d, t); x = vmulq_f32(n, t); t = vrecpeq_f32(y); t = vmulq_f32(t, vrecpsq_f32(y, t)); t = vmulq_f32(t, vrecpsq_f32(y, t)); u = vmulq_f32(x, t); u = vfmaq_f32(u, vfmsq_f32(x, y, u), t); return u; #endif } static INLINE VECTOR_CC vfloat vrec_vf_vf(vfloat d) { #ifndef 
ENABLE_ALTDIV return vdiv_vf_vf_vf(vcast_vf_f(1.0f), d); #else return vbslq_f32(vceqq_f32(vabs_vf_vf(d), vcast_vf_f(SLEEF_INFINITYf)), vcast_vf_f(0), vdiv_vf_vf_vf(vcast_vf_f(1.0f), d)); #endif } static INLINE VECTOR_CC vfloat vsqrt_vf_vf(vfloat d) { #ifndef ENABLE_ALTSQRT return vsqrtq_f32(d); #else // Gives correctly rounded result for all input range vfloat w, x, y, z; y = vrsqrteq_f32(d); x = vmul_vf_vf_vf(d, y); w = vmul_vf_vf_vf(vcast_vf_f(0.5), y); y = vfmanp_vf_vf_vf_vf(x, w, vcast_vf_f(0.5)); x = vfma_vf_vf_vf_vf(x, y, x); w = vfma_vf_vf_vf_vf(w, y, w); y = vfmanp_vf_vf_vf_vf(x, w, vcast_vf_f(1.5)); w = vadd_vf_vf_vf(w, w); w = vmul_vf_vf_vf(w, y); x = vmul_vf_vf_vf(w, d); y = vfmapn_vf_vf_vf_vf(w, d, x); z = vfmanp_vf_vf_vf_vf(w, x, vcast_vf_f(1)); z = vfmanp_vf_vf_vf_vf(w, y, z); w = vmul_vf_vf_vf(vcast_vf_f(0.5), x); w = vfma_vf_vf_vf_vf(w, z, y); w = vadd_vf_vf_vf(w, x); return vbslq_f32(vorrq_u32(vceqq_f32(d, vcast_vf_f(0)), vceqq_f32(d, vcast_vf_f(SLEEF_INFINITYf))), d, w); #endif } // max, min static INLINE VECTOR_CC vfloat vmax_vf_vf_vf(vfloat x, vfloat y) { return vmaxq_f32(x, y); } static INLINE VECTOR_CC vfloat vmin_vf_vf_vf(vfloat x, vfloat y) { return vminq_f32(x, y); } // Comparisons static INLINE VECTOR_CC vmask veq_vm_vf_vf(vfloat x, vfloat y) { return vceqq_f32(x, y); } static INLINE VECTOR_CC vmask vneq_vm_vf_vf(vfloat x, vfloat y) { return vmvnq_u32(vceqq_f32(x, y)); } static INLINE VECTOR_CC vmask vlt_vm_vf_vf(vfloat x, vfloat y) { return vcltq_f32(x, y); } static INLINE VECTOR_CC vmask vle_vm_vf_vf(vfloat x, vfloat y) { return vcleq_f32(x, y); } static INLINE VECTOR_CC vmask vgt_vm_vf_vf(vfloat x, vfloat y) { return vcgtq_f32(x, y); } static INLINE VECTOR_CC vmask vge_vm_vf_vf(vfloat x, vfloat y) { return vcgeq_f32(x, y); } // Conditional select static INLINE VECTOR_CC vfloat vsel_vf_vm_vf_vf(vmask mask, vfloat x, vfloat y) { return vbslq_f32(mask, x, y); } // int <--> float conversions static INLINE VECTOR_CC vint2 
vtruncate_vi2_vf(vfloat vf) { return vcvtq_s32_f32(vf); } static INLINE VECTOR_CC vfloat vcast_vf_vi2(vint2 vi) { return vcvtq_f32_s32(vi); } static INLINE VECTOR_CC vint2 vcast_vi2_i(int i) { return vdupq_n_s32(i); } static INLINE VECTOR_CC vint2 vrint_vi2_vf(vfloat d) { return vcvtq_s32_f32(vrndnq_f32(d)); } /***************************************/ /* Single precision integer operations */ /***************************************/ // Add, Sub, Neg (-x) static INLINE VECTOR_CC vint2 vadd_vi2_vi2_vi2(vint2 x, vint2 y) { return vaddq_s32(x, y); } static INLINE VECTOR_CC vint2 vsub_vi2_vi2_vi2(vint2 x, vint2 y) { return vsubq_s32(x, y); } static INLINE VECTOR_CC vint2 vneg_vi2_vi2(vint2 e) { return vnegq_s32(e); } // Logical operations static INLINE VECTOR_CC vint2 vand_vi2_vi2_vi2(vint2 x, vint2 y) { return vandq_s32(x, y); } static INLINE VECTOR_CC vint2 vandnot_vi2_vi2_vi2(vint2 x, vint2 y) { return vbicq_s32(y, x); } static INLINE VECTOR_CC vint2 vor_vi2_vi2_vi2(vint2 x, vint2 y) { return vorrq_s32(x, y); } static INLINE VECTOR_CC vint2 vxor_vi2_vi2_vi2(vint2 x, vint2 y) { return veorq_s32(x, y); } // Shifts #define vsll_vi2_vi2_i(x, c) vshlq_n_s32(x, c) //@#define vsll_vi2_vi2_i(x, c) vshlq_n_s32(x, c) #define vsrl_vi2_vi2_i(x, c) \ vreinterpretq_s32_u32(vshrq_n_u32(vreinterpretq_u32_s32(x), c)) //@#define vsrl_vi2_vi2_i(x, c) vreinterpretq_s32_u32(vshrq_n_u32(vreinterpretq_u32_s32(x), c)) #define vsra_vi2_vi2_i(x, c) vshrq_n_s32(x, c) //@#define vsra_vi2_vi2_i(x, c) vshrq_n_s32(x, c) #define vsra_vi_vi_i(x, c) vshr_n_s32(x, c) //@#define vsra_vi_vi_i(x, c) vshr_n_s32(x, c) #define vsll_vi_vi_i(x, c) vshl_n_s32(x, c) //@#define vsll_vi_vi_i(x, c) vshl_n_s32(x, c) #define vsrl_vi_vi_i(x, c) \ vreinterpret_s32_u32(vshr_n_u32(vreinterpret_u32_s32(x), c)) //@#define vsrl_vi_vi_i(x, c) vreinterpret_s32_u32(vshr_n_u32(vreinterpret_u32_s32(x), c)) // Comparison returning masks static INLINE VECTOR_CC vmask veq_vm_vi2_vi2(vint2 x, vint2 y) { return vceqq_s32(x, y); } 
// NOTE(review): both "gt" helpers below are implemented with vcgeq_s32
// (>=), not vcgtq_s32 (>), although their names say greater-than.  This
// matches the vendored sources, but differs from the opmask variant
// vgt_vo_vi2_vi2 further down (strict vcgtq_s32) and from the x86
// helpers (_mm_cmpgt_epi32) -- confirm upstream intent before relying on
// strictness at x == y.
static INLINE VECTOR_CC vmask vgt_vm_vi2_vi2(vint2 x, vint2 y) { return vcgeq_s32(x, y); }

// Comparison returning integers
static INLINE VECTOR_CC vint2 vgt_vi2_vi2_vi2(vint2 x, vint2 y) { return vreinterpretq_s32_u32(vcgeq_s32(x, y)); }
static INLINE VECTOR_CC vint2 veq_vi2_vi2_vi2(vint2 x, vint2 y) { return vreinterpretq_s32_u32(vceqq_s32(x, y)); }

// Conditional select
static INLINE VECTOR_CC vint2 vsel_vi2_vm_vi2_vi2(vmask m, vint2 x, vint2 y) { return vbslq_s32(m, x, y); }

/* -------------------------------------------------------------------------- */
/* -------------------------------------------------------------------------- */
/* -------------------------------------------------------------------------- */
/* -------------------------------------------------------------------------- */

/****************************************/
/* Double precision FP operations */
/****************************************/

// Broadcast
static INLINE VECTOR_CC vdouble vcast_vd_d(double f) { return vdupq_n_f64(f); }

// Add, Sub, Mul
static INLINE VECTOR_CC vdouble vadd_vd_vd_vd(vdouble x, vdouble y) { return vaddq_f64(x, y); }
static INLINE VECTOR_CC vdouble vsub_vd_vd_vd(vdouble x, vdouble y) { return vsubq_f64(x, y); }
static INLINE VECTOR_CC vdouble vmul_vd_vd_vd(vdouble x, vdouble y) { return vmulq_f64(x, y); }

// |x|, -x
static INLINE VECTOR_CC vdouble vabs_vd_vd(vdouble f) { return vabsq_f64(f); }
static INLINE VECTOR_CC vdouble vneg_vd_vd(vdouble f) { return vnegq_f64(f); }

// max, min
static INLINE VECTOR_CC vdouble vmax_vd_vd_vd(vdouble x, vdouble y) { return vmaxq_f64(x, y); }
static INLINE VECTOR_CC vdouble vmin_vd_vd_vd(vdouble x, vdouble y) { return vminq_f64(x, y); }

#if CONFIG == 1
// Multiply accumulate: z = z + x * y
static INLINE VECTOR_CC vdouble vmla_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return vfmaq_f64(z, x, y); }
// z - x * y
static INLINE VECTOR_CC vdouble vmlanp_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return vfmsq_f64(z, x, y); }
//[z = x * y - z]
static INLINE VECTOR_CC vdouble vmlapn_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return vneg_vd_vd(vfmsq_f64(z, x, y)); }
#else
// Non-fused fallbacks (separate mul + add/sub rounding).
static INLINE VECTOR_CC vdouble vmla_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return vadd_vd_vd_vd(vmul_vd_vd_vd(x, y), z); }
static INLINE VECTOR_CC vdouble vmlapn_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return vsub_vd_vd_vd(vmul_vd_vd_vd(x, y), z); }
#endif

static INLINE VECTOR_CC vdouble vfma_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { // z + x * y
  return vfmaq_f64(z, x, y);
}

static INLINE VECTOR_CC vdouble vfmanp_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { // z - x * y
  return vfmsq_f64(z, x, y);
}

static INLINE VECTOR_CC vdouble vfmapn_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { // x * y - z
  return vneg_vd_vd(vfmanp_vd_vd_vd_vd(x, y, z));
}

// Reciprocal 1/x, Division, Square root
static INLINE VECTOR_CC vdouble vdiv_vd_vd_vd(vdouble n, vdouble d) {
#ifndef ENABLE_ALTDIV
  return vdivq_f64(n, d);
#else
  // Finite numbers (including denormal) only, gives mostly correctly rounded result
  // Both operands are pre-scaled by a shared power-of-two-ish factor built
  // from their exponent fields (the 0x7fc0.../0x7fd0... constants) so the
  // reciprocal iteration below stays in range, then refined with FMA.
  float64x2_t t, u, x, y;
  uint64x2_t i0, i1;
  i0 = vandq_u64(vreinterpretq_u64_f64(n), vdupq_n_u64(0x7fc0000000000000L));
  i1 = vandq_u64(vreinterpretq_u64_f64(d), vdupq_n_u64(0x7fc0000000000000L));
  i0 = vsubq_u64(vdupq_n_u64(0x7fd0000000000000L), vshrq_n_u64(vaddq_u64(i0, i1), 1));
  t = vreinterpretq_f64_u64(i0);
  y = vmulq_f64(d, t);
  x = vmulq_f64(n, t);
  t = vrecpeq_f64(y);
  t = vmulq_f64(t, vrecpsq_f64(y, t));
  t = vmulq_f64(t, vrecpsq_f64(y, t));
  t = vmulq_f64(t, vrecpsq_f64(y, t));
  u = vmulq_f64(x, t);
  u = vfmaq_f64(u, vfmsq_f64(x, y, u), t);
  return u;
#endif
}

static INLINE VECTOR_CC vdouble vrec_vd_vd(vdouble d) {
#ifndef ENABLE_ALTDIV
  return vdiv_vd_vd_vd(vcast_vd_d(1.0f), d);
#else
  // ALTDIV divide is finite-only: force 1/inf to 0 explicitly.
  return vbslq_f64(vceqq_f64(vabs_vd_vd(d), vcast_vd_d(SLEEF_INFINITY)),
                   vcast_vd_d(0), vdiv_vd_vd_vd(vcast_vd_d(1.0f), d));
#endif
}

static INLINE VECTOR_CC vdouble vsqrt_vd_vd(vdouble d) {
#ifndef ENABLE_ALTSQRT
  return vsqrtq_f64(d);
#else
  // Gives correctly rounded result for all input range
  // Same scheme as the float version, with one extra refinement step for
  // double precision; 0 and +inf bypass the iteration via the final select.
  vdouble w, x, y, z;

  y = vrsqrteq_f64(d);
  x = vmul_vd_vd_vd(d, y);
  w = vmul_vd_vd_vd(vcast_vd_d(0.5), y);
  y = vfmanp_vd_vd_vd_vd(x, w, vcast_vd_d(0.5));
  x = vfma_vd_vd_vd_vd(x, y, x);
  w = vfma_vd_vd_vd_vd(w, y, w);

  y = vfmanp_vd_vd_vd_vd(x, w, vcast_vd_d(0.5));
  x = vfma_vd_vd_vd_vd(x, y, x);
  w = vfma_vd_vd_vd_vd(w, y, w);

  y = vfmanp_vd_vd_vd_vd(x, w, vcast_vd_d(1.5));
  w = vadd_vd_vd_vd(w, w);
  w = vmul_vd_vd_vd(w, y);
  x = vmul_vd_vd_vd(w, d);
  y = vfmapn_vd_vd_vd_vd(w, d, x);
  z = vfmanp_vd_vd_vd_vd(w, x, vcast_vd_d(1));
  z = vfmanp_vd_vd_vd_vd(w, y, z);
  w = vmul_vd_vd_vd(vcast_vd_d(0.5), x);
  w = vfma_vd_vd_vd_vd(w, z, y);
  w = vadd_vd_vd_vd(w, x);

  return vbslq_f64(vorrq_u64(vceqq_f64(d, vcast_vd_d(0)),
                             vceqq_f64(d, vcast_vd_d(SLEEF_INFINITY))), d, w);
#endif
}

/* Comparisons */
// Opmasks are 32-bit-lane masks, so the 64-bit compare results are
// reinterpreted to u32 (each double lane yields two identical u32 lanes).
static INLINE VECTOR_CC vopmask veq_vo_vd_vd(vdouble x, vdouble y) { return vreinterpretq_u32_u64(vceqq_f64(x, y)); }
static INLINE VECTOR_CC vopmask vneq_vo_vd_vd(vdouble x, vdouble y) { return vmvnq_u32(vreinterpretq_u32_u64(vceqq_f64(x, y))); }
static INLINE VECTOR_CC vopmask vlt_vo_vd_vd(vdouble x, vdouble y) { return vreinterpretq_u32_u64(vcltq_f64(x, y)); }
static INLINE VECTOR_CC vopmask vgt_vo_vd_vd(vdouble x, vdouble y) { return vreinterpretq_u32_u64(vcgtq_f64(x, y)); }
static INLINE VECTOR_CC vopmask vle_vo_vd_vd(vdouble x, vdouble y) { return vreinterpretq_u32_u64(vcleq_f64(x, y)); }
static INLINE VECTOR_CC vopmask vge_vo_vd_vd(vdouble x, vdouble y) { return vreinterpretq_u32_u64(vcgeq_f64(x, y)); }

// Conditional select
static INLINE VECTOR_CC vdouble vsel_vd_vo_vd_vd(vopmask mask, vdouble x, vdouble y) { return vbslq_f64(vreinterpretq_u64_u32(mask), x, y); }

#if 1
static INLINE CONST VECTOR_CC vdouble vsel_vd_vo_d_d(vopmask o, double v1, double v0) {
  return vsel_vd_vo_vd_vd(o, vcast_vd_d(v1), vcast_vd_d(v0));
}

static INLINE VECTOR_CC vdouble vsel_vd_vo_vo_d_d_d(vopmask o0, vopmask o1, double d0, double d1, double d2) {
  return vsel_vd_vo_vd_vd(o0, vcast_vd_d(d0), vsel_vd_vo_d_d(o1, d1, d2));
}

static INLINE VECTOR_CC vdouble vsel_vd_vo_vo_vo_d_d_d_d(vopmask o0, vopmask o1, vopmask o2, double d0, double d1, double d2, double d3) {
  return vsel_vd_vo_vd_vd(o0, vcast_vd_d(d0), vsel_vd_vo_vd_vd(o1, vcast_vd_d(d1), vsel_vd_vo_d_d(o2, d2, d3)));
}
#else
// This implementation is slower on the current CPU models (as of May 2017.)
// I(Naoki Shibata) expect that on future CPU models with hardware similar to Super Shuffle Engine, this implementation will be faster.
static INLINE CONST VECTOR_CC vdouble vsel_vd_vo_d_d(vopmask o, double d0, double d1) {
  uint8x16_t idx = vbslq_u8(vreinterpretq_u8_u32(o), (uint8x16_t) { 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7 }, (uint8x16_t) { 8, 9, 10, 11, 12, 13, 14, 15, 8, 9, 10, 11, 12, 13, 14, 15 });
  uint8x16_t tab = (uint8x16_t) (float64x2_t) { d0, d1 };
  return (vdouble) vqtbl1q_u8(tab, idx);
}

static INLINE VECTOR_CC vdouble vsel_vd_vo_vo_vo_d_d_d_d(vopmask o0, vopmask o1, vopmask o2, double d0, double d1, double d2, double d3) {
  uint8x16_t idx = vbslq_u8(vreinterpretq_u8_u32(o0), (uint8x16_t) { 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7 },
                            vbslq_u8(vreinterpretq_u8_u32(o1), (uint8x16_t) { 8, 9, 10, 11, 12, 13, 14, 15, 8, 9, 10, 11, 12, 13, 14, 15 },
                                     vbslq_u8(vreinterpretq_u8_u32(o2), (uint8x16_t) { 16, 17, 18, 19, 20, 21, 22, 23, 16, 17, 18, 19, 20, 21, 22, 23 },
                                              (uint8x16_t) { 24, 25, 26, 27, 28, 29, 30, 31, 24, 25, 26, 27, 28, 29, 30, 31 })));
  uint8x16x2_t tab = { { (uint8x16_t) (float64x2_t) { d0, d1 }, (uint8x16_t) (float64x2_t) { d2, d3 } } };
  return (vdouble) vqtbl2q_u8(tab, idx);
}

static INLINE VECTOR_CC vdouble vsel_vd_vo_vo_d_d_d(vopmask o0, vopmask o1, double d0, double d1, double d2) {
  return vsel_vd_vo_vo_vo_d_d_d_d(o0, o1, o1, d0, d1, d2, d2);
}
#endif

// Round to nearest (ties to even).
static INLINE VECTOR_CC vdouble vrint_vd_vd(vdouble d) { return vrndnq_f64(d); }
static INLINE VECTOR_CC vfloat vrint_vf_vf(vfloat d) { return vrndnq_f32(d); }
/****************************************/
/* int <--> float conversions */
/****************************************/
// NOTE(review): vtruncate narrows with vmovn_s64 (modular wrap of the
// low 32 bits) while vrint below narrows with vqmovn_s64 (saturating) --
// confirm the asymmetry is intentional before relying on out-of-range
// behavior.
static INLINE VECTOR_CC vint vtruncate_vi_vd(vdouble vf) { return vmovn_s64(vcvtq_s64_f64(vf)); }
static INLINE VECTOR_CC vdouble vcast_vd_vi(vint vi) { return vcvtq_f64_s64(vmovl_s32(vi)); }
static INLINE VECTOR_CC vint vcast_vi_i(int i) { return vdup_n_s32(i); }
static INLINE VECTOR_CC vint vrint_vi_vd(vdouble d) { return vqmovn_s64(vcvtq_s64_f64(vrndnq_f64(d))); }

/***************************************/
/* Integer operations */
/***************************************/

// Add, Sub, Neg (-x)
static INLINE VECTOR_CC vint vadd_vi_vi_vi(vint x, vint y) { return vadd_s32(x, y); }
static INLINE VECTOR_CC vint vsub_vi_vi_vi(vint x, vint y) { return vsub_s32(x, y); }
static INLINE VECTOR_CC vint vneg_vi_vi(vint e) { return vneg_s32(e); }

// Logical operations ("andnot(x, y)" is (~x) & y, hence swapped vbic args)
static INLINE VECTOR_CC vint vand_vi_vi_vi(vint x, vint y) { return vand_s32(x, y); }
static INLINE VECTOR_CC vint vandnot_vi_vi_vi(vint x, vint y) { return vbic_s32(y, x); }
static INLINE VECTOR_CC vint vor_vi_vi_vi(vint x, vint y) { return vorr_s32(x, y); }
static INLINE VECTOR_CC vint vxor_vi_vi_vi(vint x, vint y) { return veor_s32(x, y); }

// Comparison returning masks.  vint is a 64-bit half-vector; the result
// is widened to a full opmask with a zero upper half.
static INLINE VECTOR_CC vopmask veq_vo_vi_vi(vint x, vint y) { return vcombine_u32(vceq_s32(x, y), vdup_n_u32(0)); }

// Conditional select (only the low half of the full-width mask is used).
static INLINE VECTOR_CC vint vsel_vi_vm_vi_vi(vmask m, vint x, vint y) { return vbsl_s32(vget_low_u32(m), x, y); }

/***************************************/
/* Predicates */
/***************************************/
static INLINE VECTOR_CC vopmask visinf_vo_vd(vdouble d) {
  const float64x2_t inf = vdupq_n_f64(SLEEF_INFINITY);
  const float64x2_t neg_inf = vdupq_n_f64(-SLEEF_INFINITY);
  uint64x2_t cmp = vorrq_u64(vceqq_f64(d, inf), vceqq_f64(d, neg_inf));
  return vreinterpretq_u32_u64(cmp);
}

// NaN is the only value for which d != d.
static INLINE VECTOR_CC vopmask visnan_vo_vd(vdouble d) { return vmvnq_u32(vreinterpretq_u32_u64(vceqq_f64(d, d))); }
static INLINE VECTOR_CC vopmask vispinf_vo_vd(vdouble d) { return vreinterpretq_u32_u64(vceqq_f64(d, vdupq_n_f64(SLEEF_INFINITY))); }
static INLINE VECTOR_CC vopmask visminf_vo_vd(vdouble d) { return vreinterpretq_u32_u64(vceqq_f64(d, vdupq_n_f64(-SLEEF_INFINITY))); }

// Single-precision opmask selects.
static INLINE VECTOR_CC vfloat vsel_vf_vo_vf_vf(vopmask mask, vfloat x, vfloat y) { return vbslq_f32(mask, x, y); }

static INLINE CONST VECTOR_CC vfloat vsel_vf_vo_f_f(vopmask o, float v1, float v0) {
  return vsel_vf_vo_vf_vf(o, vcast_vf_f(v1), vcast_vf_f(v0));
}

static INLINE VECTOR_CC vfloat vsel_vf_vo_vo_f_f_f(vopmask o0, vopmask o1, float d0, float d1, float d2) {
  return vsel_vf_vo_vf_vf(o0, vcast_vf_f(d0), vsel_vf_vo_f_f(o1, d1, d2));
}

static INLINE VECTOR_CC vfloat vsel_vf_vo_vo_vo_f_f_f_f(vopmask o0, vopmask o1, vopmask o2, float d0, float d1, float d2, float d3) {
  return vsel_vf_vo_vf_vf(o0, vcast_vf_f(d0), vsel_vf_vo_vf_vf(o1, vcast_vf_f(d1), vsel_vf_vo_f_f(o2, d2, d3)));
}

// Single-precision comparisons returning opmasks.
static INLINE VECTOR_CC vopmask veq_vo_vf_vf(vfloat x, vfloat y) { return vceqq_f32(x, y); }
static INLINE VECTOR_CC vopmask vneq_vo_vf_vf(vfloat x, vfloat y) { return vmvnq_u32(vceqq_f32(x, y)); }
static INLINE VECTOR_CC vopmask vlt_vo_vf_vf(vfloat x, vfloat y) { return vcltq_f32(x, y); }
static INLINE VECTOR_CC vopmask vle_vo_vf_vf(vfloat x, vfloat y) { return vcleq_f32(x, y); }
static INLINE VECTOR_CC vopmask vgt_vo_vf_vf(vfloat x, vfloat y) { return vcgtq_f32(x, y); }
static INLINE VECTOR_CC vopmask vge_vo_vf_vf(vfloat x, vfloat y) { return vcgeq_f32(x, y); }

static INLINE VECTOR_CC vopmask veq_vo_vi2_vi2(vint2 x, vint2 y) { return vceqq_s32(x, y); }
static INLINE VECTOR_CC vopmask vgt_vo_vi2_vi2(vint2 x, vint2 y) { return vcgtq_s32(x, y); }
static INLINE VECTOR_CC vopmask vgt_vo_vi_vi(vint x, vint y) { return vcombine_u32(vcgt_s32(x, y), vdup_n_u32(0)); }
static INLINE VECTOR_CC vopmask visinf_vo_vf(vfloat d) { return veq_vo_vf_vf(vabs_vf_vf(d), vcast_vf_f(SLEEF_INFINITYf)); }
static INLINE VECTOR_CC vopmask vispinf_vo_vf(vfloat d) { return veq_vo_vf_vf(d, vcast_vf_f(SLEEF_INFINITYf)); }
static INLINE VECTOR_CC vopmask visminf_vo_vf(vfloat d) { return veq_vo_vf_vf(d, vcast_vf_f(-SLEEF_INFINITYf)); }
static INLINE VECTOR_CC vopmask visnan_vo_vf(vfloat d) { return vneq_vo_vf_vf(d, d); }

// 64-bit-lane opmask <-> 32-bit-lane opmask (lane duplication/compaction).
static INLINE VECTOR_CC vopmask vcast_vo32_vo64(vopmask m) { return vuzpq_u32(m, m).val[0]; }
static INLINE VECTOR_CC vopmask vcast_vo64_vo32(vopmask m) { return vzipq_u32(m, m).val[0]; }

static INLINE VECTOR_CC vopmask vand_vo_vo_vo(vopmask x, vopmask y) { return vandq_u32(x, y); }
static INLINE VECTOR_CC vopmask vandnot_vo_vo_vo(vopmask x, vopmask y) { return vbicq_u32(y, x); }
static INLINE VECTOR_CC vopmask vor_vo_vo_vo(vopmask x, vopmask y) { return vorrq_u32(x, y); }
static INLINE VECTOR_CC vopmask vxor_vo_vo_vo(vopmask x, vopmask y) { return veorq_u32(x, y); }

static INLINE VECTOR_CC vint2 vsel_vi2_vo_vi2_vi2(vopmask m, vint2 x, vint2 y) { return vbslq_s32(m, x, y); }
static INLINE VECTOR_CC vint2 vand_vi2_vo_vi2(vopmask x, vint2 y) { return vandq_s32(vreinterpretq_s32_u32(x), y); }
static INLINE VECTOR_CC vint2 vandnot_vi2_vo_vi2(vopmask x, vint2 y) { return vbicq_s32(y, vreinterpretq_s32_u32(x)); }
static INLINE VECTOR_CC vint vandnot_vi_vo_vi(vopmask x, vint y) { return vbic_s32(y, vget_low_s32(vreinterpretq_s32_u32(x))); }

// opmask/vmask mixing: on this target both are uint32x4, so the vo32 and
// vo64 variants share one implementation.
static INLINE VECTOR_CC vmask vand_vm_vo32_vm(vopmask x, vmask y) { return vandq_u32(x, y); }
static INLINE VECTOR_CC vmask vand_vm_vo64_vm(vopmask x, vmask y) { return vandq_u32(x, y); }
static INLINE VECTOR_CC vmask vandnot_vm_vo32_vm(vopmask x, vmask y) { return vbicq_u32(y, x); }
static INLINE VECTOR_CC vmask vandnot_vm_vo64_vm(vopmask x, vmask y) { return vbicq_u32(y, x); }
static INLINE VECTOR_CC vmask vor_vm_vo32_vm(vopmask x, vmask y) { return vorrq_u32(x, y); }
static INLINE VECTOR_CC vmask vor_vm_vo64_vm(vopmask x, vmask y) { return vorrq_u32(x, y); }
static INLINE VECTOR_CC vmask vxor_vm_vo32_vm(vopmask x, vmask y) { return veorq_u32(x, y); }

// Round toward zero.
static INLINE VECTOR_CC vfloat vtruncate_vf_vf(vfloat vd) { return vrndq_f32(vd); }

// Broadcast the 64-bit pattern (i0 << 32) | (uint32_t)i1 into every 64-bit lane.
static INLINE VECTOR_CC vmask vcast_vm_i_i(int i0, int i1) {
  return vreinterpretq_u32_u64(vdupq_n_u64((0xffffffff & (uint64_t)i1) | (((uint64_t)i0) << 32)));
}

static INLINE VECTOR_CC vopmask veq64_vo_vm_vm(vmask x, vmask y) {
  return vreinterpretq_u32_u64(vceqq_s64(vreinterpretq_s64_u32(x), vreinterpretq_s64_u32(y)));
}

static INLINE VECTOR_CC vmask vadd64_vm_vm_vm(vmask x, vmask y) {
  return vreinterpretq_u32_s64(vaddq_s64(vreinterpretq_s64_u32(x), vreinterpretq_s64_u32(y)));
}

static INLINE VECTOR_CC vint vsel_vi_vo_vi_vi(vopmask m, vint x, vint y) { return vbsl_s32(vget_low_u32(m), x, y); }

// Logical operations
static INLINE VECTOR_CC vint vand_vi_vo_vi(vopmask x, vint y) { return vand_s32(vreinterpret_s32_u32(vget_low_u32(x)), y); }

// Widen a 2-lane vint into the upper 32-bit half of each 64-bit lane of a vint2.
static INLINE VECTOR_CC vint2 vcastu_vi2_vi(vint vi) {
  return vreinterpretq_s32_u32(vrev64q_u32(vreinterpretq_u32_u64(vmovl_u32(vreinterpret_u32_s32(vi)))));
}

// Inverse: extract the upper 32-bit halves back into a 2-lane vint.
static INLINE VECTOR_CC vint vcastu_vi_vi2(vint2 vi2) {
  return vreinterpret_s32_u32(vmovn_u64(vreinterpretq_u64_u32(vrev64q_u32(vreinterpretq_u32_s32(vi2)))));
}

static INLINE VECTOR_CC vdouble vreinterpret_vd_vi2(vint2 vi) { return vreinterpretq_f64_s32(vi); }
static INLINE VECTOR_CC vdouble vtruncate_vd_vd(vdouble vd) { return vrndq_f64(vd); }

//

// Sign-flip patterns for alternating-sign (posneg/negpos) operations.
#define PNMASK ((vdouble) { +0.0, -0.0 })
#define NPMASK ((vdouble) { -0.0, +0.0 })
#define PNMASKf ((vfloat) { +0.0f, -0.0f, +0.0f, -0.0f })
#define NPMASKf ((vfloat) { -0.0f, +0.0f, -0.0f, +0.0f })

static INLINE VECTOR_CC vdouble vposneg_vd_vd(vdouble d) { return vreinterpret_vd_vm(vxor_vm_vm_vm(vreinterpret_vm_vd(d), vreinterpret_vm_vd(PNMASK))); }
static INLINE VECTOR_CC vdouble vnegpos_vd_vd(vdouble d) { return vreinterpret_vd_vm(vxor_vm_vm_vm(vreinterpret_vm_vd(d), vreinterpret_vm_vd(NPMASK))); }
static INLINE VECTOR_CC vfloat vposneg_vf_vf(vfloat d) { return (vfloat)vxor_vm_vm_vm((vmask)d, (vmask)PNMASKf); }
static INLINE VECTOR_CC vfloat vnegpos_vf_vf(vfloat d) { return (vfloat)vxor_vm_vm_vm((vmask)d, (vmask)NPMASKf); }

// Alternating subtract/add (even lanes subtract, odd lanes add).
static INLINE VECTOR_CC vdouble vsubadd_vd_vd_vd(vdouble x, vdouble y) { return vadd_vd_vd_vd(x, vnegpos_vd_vd(y)); }
static INLINE VECTOR_CC vfloat vsubadd_vf_vf_vf(vfloat d0, vfloat d1) { return vadd_vf_vf_vf(d0, vnegpos_vf_vf(d1)); }
static INLINE VECTOR_CC vdouble vmlsubadd_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return vsubadd_vd_vd_vd(vmul_vd_vd_vd(x, y), z); }
static INLINE VECTOR_CC vfloat vmlsubadd_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return vsubadd_vf_vf_vf(vmul_vf_vf_vf(x, y), z); }

// Swap the two 64-bit lanes.
static INLINE VECTOR_CC vdouble vrev21_vd_vd(vdouble d0) { return (float64x2_t)vcombine_u64(vget_high_u64((uint64x2_t)d0), vget_low_u64((uint64x2_t)d0)); }
static INLINE VECTOR_CC vdouble vreva2_vd_vd(vdouble vd) { return vd; }

static INLINE VECTOR_CC void vstream_v_p_vd(double *ptr, vdouble v) { vstore_v_p_vd(ptr, v); }
// NOTE(review): the double-precision scatter ignores `step` (a 2-wide
// vector covers a single pair) -- parameter kept for API symmetry.
static INLINE VECTOR_CC void vscatter2_v_p_i_i_vd(double *ptr, int offset, int step, vdouble v) { vstore_v_p_vd((double *)(&ptr[2*offset]), v); }
static INLINE VECTOR_CC void vsscatter2_v_p_i_i_vd(double *ptr, int offset, int step, vdouble v) { vstore_v_p_vd((double *)(&ptr[2*offset]), v); }

// Swap adjacent 32-bit pairs / halves.
static INLINE VECTOR_CC vfloat vrev21_vf_vf(vfloat d0) { return vrev64q_f32(d0); }
static INLINE VECTOR_CC vfloat vreva2_vf_vf(vfloat d0) { return vcombine_f32(vget_high_f32(d0), vget_low_f32(d0)); }
static INLINE VECTOR_CC vint2 vrev21_vi2_vi2(vint2 i) { return vreinterpret_vi2_vf(vrev21_vf_vf(vreinterpret_vf_vi2(i))); }

static INLINE VECTOR_CC void vstream_v_p_vf(float *ptr, vfloat v) { vstore_v_p_vf(ptr, v); }

// Store the vector as two float pairs at strided pair offsets.
static INLINE VECTOR_CC void vscatter2_v_p_i_i_vf(float *ptr, int offset, int step, vfloat v) {
  vst1_f32((float *)(ptr+(offset + step * 0)*2), vget_low_f32(v));
  vst1_f32((float *)(ptr+(offset + step * 1)*2), vget_high_f32(v));
}

static INLINE VECTOR_CC void vsscatter2_v_p_i_i_vf(float *ptr, int offset, int step, vfloat v) {
  vst1_f32((float *)(ptr+(offset + step * 0)*2), vget_low_f32(v));
  vst1_f32((float *)(ptr+(offset + step * 1)*2), vget_high_f32(v));
}

//

// Transpose the 64-bit lanes of the (x, y) pair; note interleave and
// uninterleave are the same permutation here (it is self-inverse).
static INLINE vmask2 vinterleave_vm2_vm2(vmask2 v) {
  return (vmask2) { vreinterpretq_u32_u64(vtrn1q_u64(vreinterpretq_u64_u32(v.x), vreinterpretq_u64_u32(v.y))),
      vreinterpretq_u32_u64(vtrn2q_u64(vreinterpretq_u64_u32(v.x), vreinterpretq_u64_u32(v.y))) };
}

static INLINE vmask2 vuninterleave_vm2_vm2(vmask2 v) {
  return (vmask2) { vreinterpretq_u32_u64(vtrn1q_u64(vreinterpretq_u64_u32(v.x), vreinterpretq_u64_u32(v.y))),
      vreinterpretq_u32_u64(vtrn2q_u64(vreinterpretq_u64_u32(v.x), vreinterpretq_u64_u32(v.y))) };
}

static INLINE vint vuninterleave_vi_vi(vint v) { return v; }
static INLINE vdouble vinterleave_vd_vd(vdouble vd) { return vd; }
static INLINE vdouble vuninterleave_vd_vd(vdouble vd) { return vd; }
static INLINE vmask vinterleave_vm_vm(vmask vm) { return vm; }
static INLINE vmask vuninterleave_vm_vm(vmask vm) { return vm; }

// Unaligned load of a vmask2 (16 bytes per double lane).
static vmask2 vloadu_vm2_p(void *p) {
  vmask2 vm2;
  memcpy(&vm2, p, VECTLENDP * 16);
  return vm2;
}

#if !defined(SLEEF_GENHEADER)
typedef Sleef_quad2 vargquad;

static INLINE vmask2 vcast_vm2_aq(vargquad aq) {
  return vinterleave_vm2_vm2(vloadu_vm2_p(&aq));
}

static INLINE vargquad vcast_aq_vm2(vmask2 vm2) {
  vm2 = vuninterleave_vm2_vm2(vm2);
  vargquad aq;
  memcpy(&aq, &vm2, VECTLENDP * 16);
  return aq;
}
#endif // #if !defined(SLEEF_GENHEADER)

// Nonzero iff every bit of g is zero (OR-reduce, then invert).
static INLINE int vtestallzeros_i_vo64(vopmask g) {
  uint32x2_t x0 = vorr_u32(vget_low_u32(g), vget_high_u32(g));
  uint32x2_t x1 = vpmax_u32(x0, x0);
  return ~vget_lane_u32(x1, 0);
}

static INLINE vmask vsel_vm_vo64_vm_vm(vopmask m, vmask x, vmask y) { return vbslq_u32(m, x, y); }

static INLINE vmask vsub64_vm_vm_vm(vmask x, vmask y) {
  return vreinterpretq_u32_s64(vsubq_s64(vreinterpretq_s64_u32(x), vreinterpretq_s64_u32(y)));
}

static INLINE vmask vneg64_vm_vm(vmask x) { return vreinterpretq_u32_s64(vnegq_s64(vreinterpretq_s64_u32(x))); }
static INLINE vopmask vgt64_vo_vm_vm(vmask x, vmask y) { return vreinterpretq_u32_u64(vcgtq_s64(vreinterpretq_s64_u32(x), vreinterpretq_s64_u32(y))); } #define vsll64_vm_vm_i(x, c) vreinterpretq_u32_u64(vshlq_n_u64(vreinterpretq_u64_u32(x), c)) //@#define vsll64_vm_vm_i(x, c) vreinterpretq_u32_u64(vshlq_n_u64(vreinterpretq_u64_u32(x), c)) #define vsrl64_vm_vm_i(x, c) vreinterpretq_u32_u64(vshrq_n_u64(vreinterpretq_u64_u32(x), c)) //@#define vsrl64_vm_vm_i(x, c) vreinterpretq_u32_u64(vshrq_n_u64(vreinterpretq_u64_u32(x), c)) static INLINE vmask vcast_vm_vi(vint vi) { vmask m = vreinterpretq_u32_u64(vmovl_u32(vreinterpret_u32_s32(vi))); return vor_vm_vm_vm(vcast_vm_vi2(vcastu_vi2_vi(vreinterpret_s32_u32(vget_low_u32(vgt_vo_vi_vi(vcast_vi_i(0), vi))))), m); } static INLINE vint vcast_vi_vm(vmask vm) { return vreinterpret_s32_u32(vmovn_u64(vreinterpretq_u64_u32(vm))); } ================================================ FILE: src/helperavx.h ================================================ // Copyright Naoki Shibata and contributors 2010 - 2020. // Distributed under the Boost Software License, Version 1.0. // (See accompanying file LICENSE.txt or copy at // http://www.boost.org/LICENSE_1_0.txt) #if CONFIG == 1 #if !defined(__AVX__) && !defined(SLEEF_GENHEADER) #error Please specify -mavx. #endif #elif CONFIG == 4 #if (!defined(__AVX__) || !defined(__FMA4__)) && !defined(SLEEF_GENHEADER) #error Please specify -mavx and -mfma4. 
#endif #else #error CONFIG macro invalid or not defined #endif #define ENABLE_DP //@#define ENABLE_DP #define LOG2VECTLENDP 2 //@#define LOG2VECTLENDP 2 #define VECTLENDP (1 << LOG2VECTLENDP) //@#define VECTLENDP (1 << LOG2VECTLENDP) #define ENABLE_SP //@#define ENABLE_SP #define LOG2VECTLENSP (LOG2VECTLENDP+1) //@#define LOG2VECTLENSP (LOG2VECTLENDP+1) #define VECTLENSP (1 << LOG2VECTLENSP) //@#define VECTLENSP (1 << LOG2VECTLENSP) #define FULL_FP_ROUNDING //@#define FULL_FP_ROUNDING #define ACCURATE_SQRT //@#define ACCURATE_SQRT #if !defined(SLEEF_GENHEADER) #if defined(_MSC_VER) #include #else #include #endif #include #include "misc.h" #endif // #if !defined(SLEEF_GENHEADER) typedef __m256i vmask; typedef __m256i vopmask; typedef __m256d vdouble; typedef __m128i vint; typedef __m256 vfloat; typedef struct { __m128i x, y; } vint2; typedef struct { vmask x, y; } vmask2; // #if !defined(SLEEF_GENHEADER) #ifndef __SLEEF_H__ static inline void Sleef_x86CpuID(int32_t out[4], uint32_t eax, uint32_t ecx) { /* We don't care for cpuid detection */ out[0] = 0xFFFFFFFF; out[1] = 0xFFFFFFFF; out[2] = 0xFFFFFFFF; out[3] = 0xFFFFFFFF; } #endif static INLINE int cpuSupportsAVX() { int32_t reg[4]; Sleef_x86CpuID(reg, 1, 0); return (reg[2] & (1 << 28)) != 0; } static INLINE int cpuSupportsFMA4() { int32_t reg[4]; Sleef_x86CpuID(reg, 0x80000001, 0); return (reg[2] & (1 << 16)) != 0; } #if CONFIG == 4 && defined(__AVX__) && defined(__FMA4__) static INLINE int vavailability_i(int name) { //int d = __builtin_cpu_supports("avx") && __builtin_cpu_supports("fma4"); int d = cpuSupportsAVX() && cpuSupportsFMA4(); return d ? 3 : 0; } //typedef vint2 vint2_fma4; #define ENABLE_FMA_DP #define ENABLE_FMA_SP #define ISANAME "AVX + AMD FMA4" #define DFTPRIORITY 21 #else static INLINE int vavailability_i(int name) { int d = cpuSupportsAVX(); return d ? 
3 : 0; } //typedef vint2 vint2_avx; #define ISANAME "AVX" #define DFTPRIORITY 20 #endif #endif // #if !defined(SLEEF_GENHEADER) static INLINE void vprefetch_v_p(const void *ptr) { _mm_prefetch(ptr, _MM_HINT_T0); } static INLINE int vtestallones_i_vo32(vopmask g) { return _mm_test_all_ones(_mm_and_si128(_mm256_extractf128_si256(g, 0), _mm256_extractf128_si256(g, 1))); } static INLINE int vtestallones_i_vo64(vopmask g) { return _mm_test_all_ones(_mm_and_si128(_mm256_extractf128_si256(g, 0), _mm256_extractf128_si256(g, 1))); } // static INLINE vdouble vcast_vd_d(double d) { return _mm256_set1_pd(d); } static INLINE vmask vreinterpret_vm_vd(vdouble vd) { return _mm256_castpd_si256(vd); } static INLINE vdouble vreinterpret_vd_vm(vmask vm) { return _mm256_castsi256_pd(vm); } static INLINE vint2 vreinterpret_vi2_vd(vdouble vd) { vint2 r; r.x = _mm256_castsi256_si128(vreinterpret_vm_vd(vd)); r.y = _mm256_extractf128_si256(vreinterpret_vm_vd(vd), 1); return r; } static INLINE vdouble vreinterpret_vd_vi2(vint2 vi) { vmask m = _mm256_castsi128_si256(vi.x); m = _mm256_insertf128_si256(m, vi.y, 1); return vreinterpret_vd_vm(m); } // static vint2 vloadu_vi2_p(int32_t *p) { vint2 r; r.x = _mm_loadu_si128((__m128i *) p ); r.y = _mm_loadu_si128((__m128i *)(p + 4)); return r; } static void vstoreu_v_p_vi2(int32_t *p, vint2 v) { _mm_storeu_si128((__m128i *) p , v.x); _mm_storeu_si128((__m128i *)(p + 4), v.y); } static vint vloadu_vi_p(int32_t *p) { return _mm_loadu_si128((__m128i *)p); } static void vstoreu_v_p_vi(int32_t *p, vint v) { _mm_storeu_si128((__m128i *)p, v); } // static INLINE vmask vand_vm_vm_vm(vmask x, vmask y) { return vreinterpret_vm_vd(_mm256_and_pd(vreinterpret_vd_vm(x), vreinterpret_vd_vm(y))); } static INLINE vmask vandnot_vm_vm_vm(vmask x, vmask y) { return vreinterpret_vm_vd(_mm256_andnot_pd(vreinterpret_vd_vm(x), vreinterpret_vd_vm(y))); } static INLINE vmask vor_vm_vm_vm(vmask x, vmask y) { return vreinterpret_vm_vd(_mm256_or_pd(vreinterpret_vd_vm(x), 
vreinterpret_vd_vm(y))); } static INLINE vmask vxor_vm_vm_vm(vmask x, vmask y) { return vreinterpret_vm_vd(_mm256_xor_pd(vreinterpret_vd_vm(x), vreinterpret_vd_vm(y))); } static INLINE vopmask vand_vo_vo_vo(vopmask x, vopmask y) { return vreinterpret_vm_vd(_mm256_and_pd(vreinterpret_vd_vm(x), vreinterpret_vd_vm(y))); } static INLINE vopmask vandnot_vo_vo_vo(vopmask x, vopmask y) { return vreinterpret_vm_vd(_mm256_andnot_pd(vreinterpret_vd_vm(x), vreinterpret_vd_vm(y))); } static INLINE vopmask vor_vo_vo_vo(vopmask x, vopmask y) { return vreinterpret_vm_vd(_mm256_or_pd(vreinterpret_vd_vm(x), vreinterpret_vd_vm(y))); } static INLINE vopmask vxor_vo_vo_vo(vopmask x, vopmask y) { return vreinterpret_vm_vd(_mm256_xor_pd(vreinterpret_vd_vm(x), vreinterpret_vd_vm(y))); } static INLINE vmask vand_vm_vo64_vm(vopmask x, vmask y) { return vreinterpret_vm_vd(_mm256_and_pd(vreinterpret_vd_vm(x), vreinterpret_vd_vm(y))); } static INLINE vmask vandnot_vm_vo64_vm(vopmask x, vmask y) { return vreinterpret_vm_vd(_mm256_andnot_pd(vreinterpret_vd_vm(x), vreinterpret_vd_vm(y))); } static INLINE vmask vor_vm_vo64_vm(vopmask x, vmask y) { return vreinterpret_vm_vd(_mm256_or_pd(vreinterpret_vd_vm(x), vreinterpret_vd_vm(y))); } static INLINE vmask vxor_vm_vo64_vm(vopmask x, vmask y) { return vreinterpret_vm_vd(_mm256_xor_pd(vreinterpret_vd_vm(x), vreinterpret_vd_vm(y))); } static INLINE vmask vand_vm_vo32_vm(vopmask x, vmask y) { return vreinterpret_vm_vd(_mm256_and_pd(vreinterpret_vd_vm(x), vreinterpret_vd_vm(y))); } static INLINE vmask vandnot_vm_vo32_vm(vopmask x, vmask y) { return vreinterpret_vm_vd(_mm256_andnot_pd(vreinterpret_vd_vm(x), vreinterpret_vd_vm(y))); } static INLINE vmask vor_vm_vo32_vm(vopmask x, vmask y) { return vreinterpret_vm_vd(_mm256_or_pd(vreinterpret_vd_vm(x), vreinterpret_vd_vm(y))); } static INLINE vmask vxor_vm_vo32_vm(vopmask x, vmask y) { return vreinterpret_vm_vd(_mm256_xor_pd(vreinterpret_vd_vm(x), vreinterpret_vd_vm(y))); } static INLINE vopmask 
vcast_vo32_vo64(vopmask o) { return _mm256_castsi128_si256(_mm256_cvtpd_epi32(_mm256_and_pd(vreinterpret_vd_vm(o), _mm256_set1_pd(-1.0)))); } static INLINE vopmask vcast_vo64_vo32(vopmask o) { return vreinterpret_vm_vd(_mm256_cmp_pd(_mm256_cvtepi32_pd(_mm256_castsi256_si128(o)), _mm256_set1_pd(-1.0), _CMP_EQ_OQ)); } // static INLINE vint vrint_vi_vd(vdouble vd) { return _mm256_cvtpd_epi32(vd); } static INLINE vint vtruncate_vi_vd(vdouble vd) { return _mm256_cvttpd_epi32(vd); } static INLINE vdouble vrint_vd_vd(vdouble vd) { return _mm256_round_pd(vd, _MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC); } static INLINE vdouble vtruncate_vd_vd(vdouble vd) { return _mm256_round_pd(vd, _MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC); } static INLINE vfloat vrint_vf_vf(vfloat vd) { return _mm256_round_ps(vd, _MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC); } static INLINE vfloat vtruncate_vf_vf(vfloat vf) { return _mm256_round_ps(vf, _MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC); } static INLINE vdouble vcast_vd_vi(vint vi) { return _mm256_cvtepi32_pd(vi); } static INLINE vint vcast_vi_i(int i) { return _mm_set1_epi32(i); } static INLINE vint2 vcastu_vi2_vi(vint vi) { vint2 r; r.x = _mm_and_si128(_mm_shuffle_epi32(vi, 0x40), _mm_set_epi32(-1, 0, -1, 0)); r.y = _mm_and_si128(_mm_shuffle_epi32(vi, 0xc8), _mm_set_epi32(-1, 0, -1, 0)); return r; } static INLINE vint vcastu_vi_vi2(vint2 vi) { return _mm_or_si128(_mm_and_si128(_mm_shuffle_epi32(vi.x, 0x0d), _mm_set_epi32( 0, 0, -1, -1)), _mm_and_si128(_mm_shuffle_epi32(vi.y, 0xd0), _mm_set_epi32(-1, -1, 0, 0))); } static INLINE vmask vcast_vm_i_i(int i0, int i1) { return _mm256_set_epi32(i0, i1, i0, i1, i0, i1, i0, i1); } static INLINE vopmask veq64_vo_vm_vm(vmask x, vmask y) { return vreinterpret_vm_vd(_mm256_cmp_pd(vreinterpret_vd_vm(vxor_vm_vm_vm(vxor_vm_vm_vm(x, y), vreinterpret_vm_vd(_mm256_set1_pd(1.0)))), _mm256_set1_pd(1.0), _CMP_EQ_OQ)); } // static INLINE vdouble vadd_vd_vd_vd(vdouble x, vdouble y) { return _mm256_add_pd(x, y); } static 
INLINE vdouble vsub_vd_vd_vd(vdouble x, vdouble y) { return _mm256_sub_pd(x, y); } static INLINE vdouble vmul_vd_vd_vd(vdouble x, vdouble y) { return _mm256_mul_pd(x, y); } static INLINE vdouble vdiv_vd_vd_vd(vdouble x, vdouble y) { return _mm256_div_pd(x, y); } static INLINE vdouble vrec_vd_vd(vdouble x) { return _mm256_div_pd(_mm256_set1_pd(1), x); } static INLINE vdouble vsqrt_vd_vd(vdouble x) { return _mm256_sqrt_pd(x); } static INLINE vdouble vabs_vd_vd(vdouble d) { return _mm256_andnot_pd(_mm256_set1_pd(-0.0), d); } static INLINE vdouble vneg_vd_vd(vdouble d) { return _mm256_xor_pd(_mm256_set1_pd(-0.0), d); } static INLINE vdouble vmax_vd_vd_vd(vdouble x, vdouble y) { return _mm256_max_pd(x, y); } static INLINE vdouble vmin_vd_vd_vd(vdouble x, vdouble y) { return _mm256_min_pd(x, y); } #if CONFIG == 1 static INLINE vdouble vmla_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return vadd_vd_vd_vd(vmul_vd_vd_vd(x, y), z); } static INLINE vdouble vmlapn_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return vsub_vd_vd_vd(vmul_vd_vd_vd(x, y), z); } static INLINE vdouble vmlanp_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return vsub_vd_vd_vd(z, vmul_vd_vd_vd(x, y)); } #else static INLINE vdouble vmla_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return _mm256_macc_pd(x, y, z); } static INLINE vdouble vmlapn_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return _mm256_msub_pd(x, y, z); } static INLINE vdouble vmlanp_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return _mm256_nmacc_pd(x, y, z); } static INLINE vdouble vfma_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return _mm256_macc_pd(x, y, z); } static INLINE vdouble vfmapp_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return _mm256_macc_pd(x, y, z); } static INLINE vdouble vfmapn_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return _mm256_msub_pd(x, y, z); } static INLINE vdouble vfmanp_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return _mm256_nmacc_pd(x, y, z); } static INLINE vdouble 
// SLEEF AVX helper (helperavx.h, FMA4/CONFIG!=1 path ends here).
// Below: tail of vfmann_vd_vd_vd_vd (-(x*y)-z via _mm256_nmsub_pd), then the
// vdouble comparison wrappers (ordered _CMP_*_OQ predicates; NEQ uses the
// unordered _CMP_NEQ_UQ so NaN compares as "not equal"), followed by the
// 128-bit vint add/sub/neg, bitwise, shift and compare wrappers. vopmask is
// 256-bit, so vint ops taking a mask first narrow it with
// _mm256_castsi256_si128. NOTE(review): extraction joined original lines;
// text following a "//" on a line is commented out as presented here.
vfmann_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return _mm256_nmsub_pd(x, y, z); } #endif static INLINE vopmask veq_vo_vd_vd(vdouble x, vdouble y) { return vreinterpret_vm_vd(_mm256_cmp_pd(x, y, _CMP_EQ_OQ)); } static INLINE vopmask vneq_vo_vd_vd(vdouble x, vdouble y) { return vreinterpret_vm_vd(_mm256_cmp_pd(x, y, _CMP_NEQ_UQ)); } static INLINE vopmask vlt_vo_vd_vd(vdouble x, vdouble y) { return vreinterpret_vm_vd(_mm256_cmp_pd(x, y, _CMP_LT_OQ)); } static INLINE vopmask vle_vo_vd_vd(vdouble x, vdouble y) { return vreinterpret_vm_vd(_mm256_cmp_pd(x, y, _CMP_LE_OQ)); } static INLINE vopmask vgt_vo_vd_vd(vdouble x, vdouble y) { return vreinterpret_vm_vd(_mm256_cmp_pd(x, y, _CMP_GT_OQ)); } static INLINE vopmask vge_vo_vd_vd(vdouble x, vdouble y) { return vreinterpret_vm_vd(_mm256_cmp_pd(x, y, _CMP_GE_OQ)); } // static INLINE vint vadd_vi_vi_vi(vint x, vint y) { return _mm_add_epi32(x, y); } static INLINE vint vsub_vi_vi_vi(vint x, vint y) { return _mm_sub_epi32(x, y); } static INLINE vint vneg_vi_vi(vint e) { return vsub_vi_vi_vi(vcast_vi_i(0), e); } static INLINE vint vand_vi_vi_vi(vint x, vint y) { return _mm_and_si128(x, y); } static INLINE vint vandnot_vi_vi_vi(vint x, vint y) { return _mm_andnot_si128(x, y); } static INLINE vint vor_vi_vi_vi(vint x, vint y) { return _mm_or_si128(x, y); } static INLINE vint vxor_vi_vi_vi(vint x, vint y) { return _mm_xor_si128(x, y); } static INLINE vint vandnot_vi_vo_vi(vopmask m, vint y) { return _mm_andnot_si128(_mm256_castsi256_si128(m), y); } static INLINE vint vand_vi_vo_vi(vopmask m, vint y) { return _mm_and_si128(_mm256_castsi256_si128(m), y); } static INLINE vint vsll_vi_vi_i(vint x, int c) { return _mm_slli_epi32(x, c); } static INLINE vint vsrl_vi_vi_i(vint x, int c) { return _mm_srli_epi32(x, c); } static INLINE vint vsra_vi_vi_i(vint x, int c) { return _mm_srai_epi32(x, c); } static INLINE vint veq_vi_vi_vi(vint x, vint y) { return _mm_cmpeq_epi32(x, y); } static INLINE vint vgt_vi_vi_vi(vint x, vint y) { return
// Continuation: vint->vopmask comparisons (128-bit result widened into the
// low half of a 256-bit mask via _mm256_castsi128_si256; upper half is
// undefined, which callers of 32-bit opmasks tolerate), blendv-based selects
// for vint/vdouble, the nested scalar-select helpers, IEEE classification
// (visnan uses the d != d self-compare), and (un)aligned double load/store.
_mm_cmpgt_epi32(x, y); } static INLINE vopmask veq_vo_vi_vi(vint x, vint y) { return _mm256_castsi128_si256(_mm_cmpeq_epi32(x, y)); } static INLINE vopmask vgt_vo_vi_vi(vint x, vint y) { return _mm256_castsi128_si256(_mm_cmpgt_epi32(x, y)); } static INLINE vint vsel_vi_vo_vi_vi(vopmask o, vint x, vint y) { return _mm_blendv_epi8(y, x, _mm256_castsi256_si128(o)); } static INLINE vdouble vsel_vd_vo_vd_vd(vopmask o, vdouble x, vdouble y) { return _mm256_blendv_pd(y, x, _mm256_castsi256_pd(o)); } static INLINE CONST vdouble vsel_vd_vo_d_d(vopmask o, double v1, double v0) { return vsel_vd_vo_vd_vd(o, vcast_vd_d(v1), vcast_vd_d(v0)); } static INLINE vdouble vsel_vd_vo_vo_d_d_d(vopmask o0, vopmask o1, double d0, double d1, double d2) { return vsel_vd_vo_vd_vd(o0, vcast_vd_d(d0), vsel_vd_vo_d_d(o1, d1, d2)); } static INLINE vdouble vsel_vd_vo_vo_vo_d_d_d_d(vopmask o0, vopmask o1, vopmask o2, double d0, double d1, double d2, double d3) { return vsel_vd_vo_vd_vd(o0, vcast_vd_d(d0), vsel_vd_vo_vd_vd(o1, vcast_vd_d(d1), vsel_vd_vo_d_d(o2, d2, d3))); } static INLINE vopmask visinf_vo_vd(vdouble d) { return vreinterpret_vm_vd(_mm256_cmp_pd(vabs_vd_vd(d), _mm256_set1_pd(SLEEF_INFINITY), _CMP_EQ_OQ)); } static INLINE vopmask vispinf_vo_vd(vdouble d) { return vreinterpret_vm_vd(_mm256_cmp_pd(d, _mm256_set1_pd(SLEEF_INFINITY), _CMP_EQ_OQ)); } static INLINE vopmask visminf_vo_vd(vdouble d) { return vreinterpret_vm_vd(_mm256_cmp_pd(d, _mm256_set1_pd(-SLEEF_INFINITY), _CMP_EQ_OQ)); } static INLINE vopmask visnan_vo_vd(vdouble d) { return vreinterpret_vm_vd(_mm256_cmp_pd(d, d, _CMP_NEQ_UQ)); } static INLINE vdouble vload_vd_p(const double *ptr) { return _mm256_load_pd(ptr); } static INLINE vdouble vloadu_vd_p(const double *ptr) { return _mm256_loadu_pd(ptr); } static INLINE void vstore_v_p_vd(double *ptr, vdouble v) { _mm256_store_pd(ptr, v); } static INLINE void vstoreu_v_p_vd(double *ptr, vdouble v) { _mm256_storeu_pd(ptr, v); } static INLINE vdouble vgather_vd_p_vi(const double *ptr,
// Continuation of vgather_vd_p_vi: AVX1 has no hardware gather, so the index
// vector is spilled to a stack array and the doubles are gathered with
// scalar loads into _mm256_set_pd (note reversed operand order: set_pd takes
// the highest lane first). Then: an MSVC-only debugging accessor, the
// vint2 <-> vmask conversions (vint2 is a pair of 128-bit halves on AVX1),
// float<->int conversions/splats, bit-level reinterprets, and the basic
// single-precision arithmetic wrappers.
vint vi) { int a[VECTLENDP]; vstoreu_v_p_vi(a, vi); return _mm256_set_pd(ptr[a[3]], ptr[a[2]], ptr[a[1]], ptr[a[0]]); } #if defined(_MSC_VER) // This function is needed when debugging on MSVC. static INLINE double vcast_d_vd(vdouble v) { double a[VECTLENDP]; vstoreu_v_p_vd(a, v); return a[0]; } #endif // static INLINE vint2 vcast_vi2_vm(vmask vm) { vint2 r; r.x = _mm256_castsi256_si128(vm); r.y = _mm256_extractf128_si256(vm, 1); return r; } static INLINE vmask vcast_vm_vi2(vint2 vi) { vmask m = _mm256_castsi128_si256(vi.x); m = _mm256_insertf128_si256(m, vi.y, 1); return m; } static INLINE vint2 vrint_vi2_vf(vfloat vf) { return vcast_vi2_vm(_mm256_cvtps_epi32(vf)); } static INLINE vint2 vtruncate_vi2_vf(vfloat vf) { return vcast_vi2_vm(_mm256_cvttps_epi32(vf)); } static INLINE vfloat vcast_vf_vi2(vint2 vi) { return _mm256_cvtepi32_ps(vcast_vm_vi2(vi)); } static INLINE vfloat vcast_vf_f(float f) { return _mm256_set1_ps(f); } static INLINE vint2 vcast_vi2_i(int i) { vint2 r; r.x = r.y = _mm_set1_epi32(i); return r; } static INLINE vmask vreinterpret_vm_vf(vfloat vf) { return _mm256_castps_si256(vf); } static INLINE vfloat vreinterpret_vf_vm(vmask vm) { return _mm256_castsi256_ps(vm); } static INLINE vfloat vreinterpret_vf_vi2(vint2 vi) { return vreinterpret_vf_vm(vcast_vm_vi2(vi)); } static INLINE vint2 vreinterpret_vi2_vf(vfloat vf) { return vcast_vi2_vm(vreinterpret_vm_vf(vf)); } static INLINE vfloat vadd_vf_vf_vf(vfloat x, vfloat y) { return _mm256_add_ps(x, y); } static INLINE vfloat vsub_vf_vf_vf(vfloat x, vfloat y) { return _mm256_sub_ps(x, y); } static INLINE vfloat vmul_vf_vf_vf(vfloat x, vfloat y) { return _mm256_mul_ps(x, y); } static INLINE vfloat vdiv_vf_vf_vf(vfloat x, vfloat y) { return _mm256_div_ps(x, y); } static INLINE vfloat vrec_vf_vf(vfloat x) { return vdiv_vf_vf_vf(vcast_vf_f(1.0f), x); } static INLINE vfloat vsqrt_vf_vf(vfloat x) { return _mm256_sqrt_ps(x); } static INLINE vfloat vabs_vf_vf(vfloat f) { return
// Continuation: vabs/vneg implemented by masking/flipping the sign bit with
// -0.0f, min/max wrappers, then the float multiply-add family. CONFIG==1
// composes mla from separate mul+add (no FMA required); the #else branch
// uses AMD FMA4 intrinsics (_mm256_macc_ps / _mm256_nmacc_ps /
// _mm256_msub_ps / _mm256_nmsub_ps) for fused variants. The ordered float
// comparison wrappers begin at the end of this fragment.
vreinterpret_vf_vm(vandnot_vm_vm_vm(vreinterpret_vm_vf(vcast_vf_f(-0.0f)), vreinterpret_vm_vf(f))); } static INLINE vfloat vneg_vf_vf(vfloat d) { return vreinterpret_vf_vm(vxor_vm_vm_vm(vreinterpret_vm_vf(vcast_vf_f(-0.0f)), vreinterpret_vm_vf(d))); } static INLINE vfloat vmax_vf_vf_vf(vfloat x, vfloat y) { return _mm256_max_ps(x, y); } static INLINE vfloat vmin_vf_vf_vf(vfloat x, vfloat y) { return _mm256_min_ps(x, y); } #if CONFIG == 1 static INLINE vfloat vmla_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return vadd_vf_vf_vf(vmul_vf_vf_vf(x, y), z); } static INLINE vfloat vmlanp_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return vsub_vf_vf_vf(z, vmul_vf_vf_vf(x, y)); } static INLINE vfloat vmlapn_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return vsub_vf_vf_vf(vmul_vf_vf_vf(x, y), z); } #else static INLINE vfloat vmla_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return _mm256_macc_ps(x, y, z); } static INLINE vfloat vmlanp_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return _mm256_nmacc_ps(x, y, z); } static INLINE vfloat vmlapn_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return _mm256_msub_ps(x, y, z); } static INLINE vfloat vfma_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return _mm256_macc_ps(x, y, z); } static INLINE vfloat vfmapp_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return _mm256_macc_ps(x, y, z); } static INLINE vfloat vfmapn_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return _mm256_msub_ps(x, y, z); } static INLINE vfloat vfmanp_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return _mm256_nmacc_ps(x, y, z); } static INLINE vfloat vfmann_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return _mm256_nmsub_ps(x, y, z); } #endif static INLINE vopmask veq_vo_vf_vf(vfloat x, vfloat y) { return vreinterpret_vm_vf(_mm256_cmp_ps(x, y, _CMP_EQ_OQ)); } static INLINE vopmask vneq_vo_vf_vf(vfloat x, vfloat y) { return vreinterpret_vm_vf(_mm256_cmp_ps(x, y, _CMP_NEQ_UQ)); } static INLINE vopmask vlt_vo_vf_vf(vfloat x, vfloat y) { return vreinterpret_vm_vf(_mm256_cmp_ps(x,
// Continuation: remaining float comparisons, then the vint2 integer ops.
// On AVX1 there are no 256-bit integer instructions, so every vint2
// operation is performed as two 128-bit SSE ops on the .x/.y halves of the
// struct; mask-typed variants first convert the 256-bit opmask with
// vcast_vi2_vm.
y, _CMP_LT_OQ)); } static INLINE vopmask vle_vo_vf_vf(vfloat x, vfloat y) { return vreinterpret_vm_vf(_mm256_cmp_ps(x, y, _CMP_LE_OQ)); } static INLINE vopmask vgt_vo_vf_vf(vfloat x, vfloat y) { return vreinterpret_vm_vf(_mm256_cmp_ps(x, y, _CMP_GT_OQ)); } static INLINE vopmask vge_vo_vf_vf(vfloat x, vfloat y) { return vreinterpret_vm_vf(_mm256_cmp_ps(x, y, _CMP_GE_OQ)); } static INLINE vint2 vadd_vi2_vi2_vi2(vint2 x, vint2 y) { vint2 vi = { _mm_add_epi32(x.x, y.x), _mm_add_epi32(x.y, y.y) }; return vi; } static INLINE vint2 vsub_vi2_vi2_vi2(vint2 x, vint2 y) { vint2 vi = { _mm_sub_epi32(x.x, y.x), _mm_sub_epi32(x.y, y.y) }; return vi; } static INLINE vint2 vneg_vi2_vi2(vint2 e) { vint2 vi = { _mm_sub_epi32(_mm_set1_epi32(0), e.x), _mm_sub_epi32(_mm_set1_epi32(0), e.y) }; return vi; } static INLINE vint2 vand_vi2_vi2_vi2(vint2 x, vint2 y) { vint2 vi = { _mm_and_si128(x.x, y.x), _mm_and_si128(x.y, y.y) }; return vi; } static INLINE vint2 vandnot_vi2_vi2_vi2(vint2 x, vint2 y) { vint2 vi = { _mm_andnot_si128(x.x, y.x), _mm_andnot_si128(x.y, y.y) }; return vi; } static INLINE vint2 vor_vi2_vi2_vi2(vint2 x, vint2 y) { vint2 vi = { _mm_or_si128(x.x, y.x), _mm_or_si128(x.y, y.y) }; return vi; } static INLINE vint2 vxor_vi2_vi2_vi2(vint2 x, vint2 y) { vint2 vi = { _mm_xor_si128(x.x, y.x), _mm_xor_si128(x.y, y.y) }; return vi; } static INLINE vint2 vand_vi2_vo_vi2(vopmask x, vint2 y) { return vand_vi2_vi2_vi2(vcast_vi2_vm(x), y); } static INLINE vint2 vandnot_vi2_vo_vi2(vopmask x, vint2 y) { return vandnot_vi2_vi2_vi2(vcast_vi2_vm(x), y); } static INLINE vint2 vsll_vi2_vi2_i(vint2 x, int c) { vint2 vi = { _mm_slli_epi32(x.x, c), _mm_slli_epi32(x.y, c) }; return vi; } static INLINE vint2 vsrl_vi2_vi2_i(vint2 x, int c) { vint2 vi = { _mm_srli_epi32(x.x, c), _mm_srli_epi32(x.y, c) }; return vi; } static INLINE vint2 vsra_vi2_vi2_i(vint2 x, int c) { vint2 vi = { _mm_srai_epi32(x.x, c), _mm_srai_epi32(x.y, c) }; return vi; } static INLINE vopmask veq_vo_vi2_vi2(vint2 x, vint2 y)
// Continuation: vint2 comparisons (both opmask- and vint2-returning forms),
// blendv-based vint2/vfloat selects, nested scalar float selects, 64-bit
// lane addition done per 128-bit half, and float IEEE classification built
// on the comparison wrappers (visnan via d != d).
{ vint2 r; r.x = _mm_cmpeq_epi32(x.x, y.x); r.y = _mm_cmpeq_epi32(x.y, y.y); return vcast_vm_vi2(r); } static INLINE vopmask vgt_vo_vi2_vi2(vint2 x, vint2 y) { vint2 r; r.x = _mm_cmpgt_epi32(x.x, y.x); r.y = _mm_cmpgt_epi32(x.y, y.y); return vcast_vm_vi2(r); } static INLINE vint2 veq_vi2_vi2_vi2(vint2 x, vint2 y) { vint2 r; r.x = _mm_cmpeq_epi32(x.x, y.x); r.y = _mm_cmpeq_epi32(x.y, y.y); return r; } static INLINE vint2 vgt_vi2_vi2_vi2(vint2 x, vint2 y) { vint2 r; r.x = _mm_cmpgt_epi32(x.x, y.x); r.y = _mm_cmpgt_epi32(x.y, y.y); return r; } static INLINE vint2 vsel_vi2_vo_vi2_vi2(vopmask m, vint2 x, vint2 y) { vint2 n = vcast_vi2_vm(m); vint2 r = { _mm_blendv_epi8(y.x, x.x, n.x), _mm_blendv_epi8(y.y, x.y, n.y) }; return r; } static INLINE vmask vadd64_vm_vm_vm(vmask x, vmask y) { vint2 ix = vcast_vi2_vm(x), iy = vcast_vi2_vm(y), iz; iz.x = _mm_add_epi64(ix.x, iy.x); iz.y = _mm_add_epi64(ix.y, iy.y); return vcast_vm_vi2(iz); } static INLINE vfloat vsel_vf_vo_vf_vf(vopmask o, vfloat x, vfloat y) { return _mm256_blendv_ps(y, x, _mm256_castsi256_ps(o)); } static INLINE CONST vfloat vsel_vf_vo_f_f(vopmask o, float v1, float v0) { return vsel_vf_vo_vf_vf(o, vcast_vf_f(v1), vcast_vf_f(v0)); } static INLINE vfloat vsel_vf_vo_vo_f_f_f(vopmask o0, vopmask o1, float d0, float d1, float d2) { return vsel_vf_vo_vf_vf(o0, vcast_vf_f(d0), vsel_vf_vo_f_f(o1, d1, d2)); } static INLINE vfloat vsel_vf_vo_vo_vo_f_f_f_f(vopmask o0, vopmask o1, vopmask o2, float d0, float d1, float d2, float d3) { return vsel_vf_vo_vf_vf(o0, vcast_vf_f(d0), vsel_vf_vo_vf_vf(o1, vcast_vf_f(d1), vsel_vf_vo_f_f(o2, d2, d3))); } static INLINE vopmask visinf_vo_vf(vfloat d) { return veq_vo_vf_vf(vabs_vf_vf(d), vcast_vf_f(SLEEF_INFINITYf)); } static INLINE vopmask vispinf_vo_vf(vfloat d) { return veq_vo_vf_vf(d, vcast_vf_f(SLEEF_INFINITYf)); } static INLINE vopmask visminf_vo_vf(vfloat d) { return veq_vo_vf_vf(d, vcast_vf_f(-SLEEF_INFINITYf)); } static INLINE vopmask visnan_vo_vf(vfloat d) { return
// Continuation: end of visnan_vo_vf, then (un)aligned float load/store,
// scalar-indexed float gather (no hardware gather on AVX1), an MSVC debug
// accessor, and the alternating-sign PN/NP mask constants. The masks use the
// GCC/Clang vector compound-literal extension ((vdouble){...}); vposneg /
// vnegpos XOR them in to flip the sign of alternate lanes, and vsubadd maps
// directly to _mm256_addsub_pd/ps (subtract even lanes, add odd lanes).
vneq_vo_vf_vf(d, d); } // static INLINE vfloat vload_vf_p(const float *ptr) { return _mm256_load_ps(ptr); } static INLINE vfloat vloadu_vf_p(const float *ptr) { return _mm256_loadu_ps(ptr); } static INLINE void vstore_v_p_vf(float *ptr, vfloat v) { _mm256_store_ps(ptr, v); } static INLINE void vstoreu_v_p_vf(float *ptr, vfloat v) { _mm256_storeu_ps(ptr, v); } static INLINE vfloat vgather_vf_p_vi2(const float *ptr, vint2 vi2) { int a[VECTLENSP]; vstoreu_v_p_vi2(a, vi2); return _mm256_set_ps(ptr[a[7]], ptr[a[6]], ptr[a[5]], ptr[a[4]], ptr[a[3]], ptr[a[2]], ptr[a[1]], ptr[a[0]]); } #ifdef _MSC_VER // This function is needed when debugging on MSVC. static INLINE float vcast_f_vf(vfloat v) { float a[VECTLENSP]; vstoreu_v_p_vf(a, v); return a[0]; } #endif // #define PNMASK ((vdouble) { +0.0, -0.0, +0.0, -0.0 }) #define NPMASK ((vdouble) { -0.0, +0.0, -0.0, +0.0 }) #define PNMASKf ((vfloat) { +0.0f, -0.0f, +0.0f, -0.0f, +0.0f, -0.0f, +0.0f, -0.0f }) #define NPMASKf ((vfloat) { -0.0f, +0.0f, -0.0f, +0.0f, -0.0f, +0.0f, -0.0f, +0.0f }) static INLINE vdouble vposneg_vd_vd(vdouble d) { return vreinterpret_vd_vm(vxor_vm_vm_vm(vreinterpret_vm_vd(d), vreinterpret_vm_vd(PNMASK))); } static INLINE vdouble vnegpos_vd_vd(vdouble d) { return vreinterpret_vd_vm(vxor_vm_vm_vm(vreinterpret_vm_vd(d), vreinterpret_vm_vd(NPMASK))); } static INLINE vfloat vposneg_vf_vf(vfloat d) { return vreinterpret_vf_vm(vxor_vm_vm_vm(vreinterpret_vm_vf(d), vreinterpret_vm_vf(PNMASKf))); } static INLINE vfloat vnegpos_vf_vf(vfloat d) { return vreinterpret_vf_vm(vxor_vm_vm_vm(vreinterpret_vm_vf(d), vreinterpret_vm_vf(NPMASKf))); } static INLINE vdouble vsubadd_vd_vd_vd(vdouble x, vdouble y) { return _mm256_addsub_pd(x, y); } static INLINE vfloat vsubadd_vf_vf_vf(vfloat x, vfloat y) { return _mm256_addsub_ps(x, y); } #if CONFIG == 1 static INLINE vdouble vmlsubadd_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return vsubadd_vd_vd_vd(vmul_vd_vd_vd(x, y), z); } static INLINE vfloat
// Continuation: vmlsubadd (mul then subadd; the FMA4 branch fuses it as
// mla(x, y, negpos(z))), pairwise/lane reversal shuffles (vrev21 swaps
// within 128-bit lane pairs, vreva2 additionally swaps the two 128-bit
// halves with permute2f128), non-temporal streaming stores, and strided
// 2-element scatters built from 128-bit extract + store/stream.
vmlsubadd_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return vsubadd_vf_vf_vf(vmul_vf_vf_vf(x, y), z); } #else static INLINE vdouble vmlsubadd_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return vmla_vd_vd_vd_vd(x, y, vnegpos_vd_vd(z)); } static INLINE vfloat vmlsubadd_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return vmla_vf_vf_vf_vf(x, y, vnegpos_vf_vf(z)); } #endif static INLINE vdouble vrev21_vd_vd(vdouble d0) { return _mm256_shuffle_pd(d0, d0, (0 << 3) | (1 << 2) | (0 << 1) | (1 << 0)); } static INLINE vdouble vreva2_vd_vd(vdouble d0) { d0 = _mm256_permute2f128_pd(d0, d0, 1); return _mm256_shuffle_pd(d0, d0, (1 << 3) | (0 << 2) | (1 << 1) | (0 << 0)); } static INLINE void vstream_v_p_vd(double *ptr, vdouble v) { _mm256_stream_pd(ptr, v); } static INLINE void vscatter2_v_p_i_i_vd(double *ptr, int offset, int step, vdouble v) { _mm_store_pd(&ptr[(offset + step * 0)*2], _mm256_extractf128_pd(v, 0)); _mm_store_pd(&ptr[(offset + step * 1)*2], _mm256_extractf128_pd(v, 1)); } static INLINE void vsscatter2_v_p_i_i_vd(double *ptr, int offset, int step, vdouble v) { _mm_stream_pd(&ptr[(offset + step * 0)*2], _mm256_extractf128_pd(v, 0)); _mm_stream_pd(&ptr[(offset + step * 1)*2], _mm256_extractf128_pd(v, 1)); } // static INLINE vfloat vrev21_vf_vf(vfloat d0) { return _mm256_shuffle_ps(d0, d0, (2 << 6) | (3 << 4) | (0 << 2) | (1 << 0)); } static INLINE vfloat vreva2_vf_vf(vfloat d0) { d0 = _mm256_permute2f128_ps(d0, d0, 1); return _mm256_shuffle_ps(d0, d0, (1 << 6) | (0 << 4) | (3 << 2) | (2 << 0)); } static INLINE vint2 vrev21_vi2_vi2(vint2 i) { return vreinterpret_vi2_vf(vrev21_vf_vf(vreinterpret_vf_vi2(i))); } static INLINE void vstream_v_p_vf(float *ptr, vfloat v) { _mm256_stream_ps(ptr, v); } static INLINE void vscatter2_v_p_i_i_vf(float *ptr, int offset, int step, vfloat v) { _mm_storel_pd((double *)(ptr+(offset + step * 0)*2), _mm_castsi128_pd(_mm_castps_si128(_mm256_extractf128_ps(v, 0)))); _mm_storeh_pd((double *)(ptr+(offset + step * 1)*2),
// Continuation of vscatter2_v_p_i_i_vf: each float pair is moved as one
// 64-bit store (storel/storeh on the reinterpreted 128-bit half). Then the
// quad-precision support glue: (un)interleave for vmask2 — unpacklo/unpackhi
// of the two vdouble-viewed halves, a permutation that is its own inverse,
// which is why interleave and uninterleave have identical bodies — the
// lane-swap helpers for vint/vdouble/vmask (tmp[1]<->tmp[2] swap through a
// stack buffer, also self-inverse), and vloadu_vm2_p via memcpy.
_mm_castsi128_pd(_mm_castps_si128(_mm256_extractf128_ps(v, 0)))); _mm_storel_pd((double *)(ptr+(offset + step * 2)*2), _mm_castsi128_pd(_mm_castps_si128(_mm256_extractf128_ps(v, 1)))); _mm_storeh_pd((double *)(ptr+(offset + step * 3)*2), _mm_castsi128_pd(_mm_castps_si128(_mm256_extractf128_ps(v, 1)))); } static INLINE void vsscatter2_v_p_i_i_vf(float *ptr, int offset, int step, vfloat v) { vscatter2_v_p_i_i_vf(ptr, offset, step, v); } // static INLINE vmask2 vinterleave_vm2_vm2(vmask2 v) { return (vmask2) { vreinterpret_vm_vd(_mm256_unpacklo_pd(vreinterpret_vd_vm(v.x), vreinterpret_vd_vm(v.y))), vreinterpret_vm_vd(_mm256_unpackhi_pd(vreinterpret_vd_vm(v.x), vreinterpret_vd_vm(v.y))) }; } static INLINE vmask2 vuninterleave_vm2_vm2(vmask2 v) { return (vmask2) { vreinterpret_vm_vd(_mm256_unpacklo_pd(vreinterpret_vd_vm(v.x), vreinterpret_vd_vm(v.y))), vreinterpret_vm_vd(_mm256_unpackhi_pd(vreinterpret_vd_vm(v.x), vreinterpret_vd_vm(v.y))) }; } static INLINE vint vuninterleave_vi_vi(vint v) { return _mm_shuffle_epi32(v, (0 << 0) | (2 << 2) | (1 << 4) | (3 << 6)); } static INLINE vdouble vinterleave_vd_vd(vdouble vd) { double tmp[4]; vstoreu_v_p_vd(tmp, vd); double t = tmp[1]; tmp[1] = tmp[2]; tmp[2] = t; return vloadu_vd_p(tmp); } static INLINE vdouble vuninterleave_vd_vd(vdouble vd) { double tmp[4]; vstoreu_v_p_vd(tmp, vd); double t = tmp[1]; tmp[1] = tmp[2]; tmp[2] = t; return vloadu_vd_p(tmp); } static INLINE vmask vinterleave_vm_vm(vmask vm) { double tmp[4]; vstoreu_v_p_vd(tmp, vreinterpret_vd_vm(vm)); double t = tmp[1]; tmp[1] = tmp[2]; tmp[2] = t; return vreinterpret_vm_vd(vloadu_vd_p(tmp)); } static INLINE vmask vuninterleave_vm_vm(vmask vm) { double tmp[4]; vstoreu_v_p_vd(tmp, vreinterpret_vd_vm(vm)); double t = tmp[1]; tmp[1] = tmp[2]; tmp[2] = t; return vreinterpret_vm_vd(vloadu_vd_p(tmp)); } static vmask2 vloadu_vm2_p(void *p) { vmask2 vm2; memcpy(&vm2, p, VECTLENDP * 16); return vm2; } #if !defined(SLEEF_GENHEADER) typedef Sleef_quad4 vargquad; static INLINE
// Continuation: Sleef_quad4 <-> vmask2 conversion (memcpy + (un)interleave),
// then the 64-bit integer layer for the quad path: all-zero test via
// movemask over the ORed 128-bit halves, 64-bit blend/sub/neg/compare built
// from per-half SSE 4.2 ops (_mm_cmpgt_epi64, _mm_sub_epi64), and the
// vsll64/vsrl64 shift macros (also emitted as //@ lines for SLEEF's header
// generator).
vmask2 vcast_vm2_aq(vargquad aq) { return vinterleave_vm2_vm2(vloadu_vm2_p(&aq)); } static INLINE vargquad vcast_aq_vm2(vmask2 vm2) { vm2 = vuninterleave_vm2_vm2(vm2); vargquad aq; memcpy(&aq, &vm2, VECTLENDP * 16); return aq; } #endif // #if !defined(SLEEF_GENHEADER) static INLINE int vtestallzeros_i_vo64(vopmask g) { return _mm_movemask_epi8(_mm_or_si128(_mm256_extractf128_si256(g, 0), _mm256_extractf128_si256(g, 1))) == 0; } static INLINE vmask vsel_vm_vo64_vm_vm(vopmask o, vmask x, vmask y) { return vreinterpret_vm_vd(_mm256_blendv_pd(vreinterpret_vd_vm(y), vreinterpret_vd_vm(x), vreinterpret_vd_vm(o))); } static INLINE vmask vsub64_vm_vm_vm(vmask x, vmask y) { __m128i xh = _mm256_extractf128_si256(x, 1), xl = _mm256_extractf128_si256(x, 0); __m128i yh = _mm256_extractf128_si256(y, 1), yl = _mm256_extractf128_si256(y, 0); vmask r = _mm256_castsi128_si256(_mm_sub_epi64(xl, yl)); return _mm256_insertf128_si256(r, _mm_sub_epi64(xh, yh), 1); } static INLINE vmask vneg64_vm_vm(vmask x) { return vsub64_vm_vm_vm(vcast_vm_i_i(0, 0), x); } static INLINE vopmask vgt64_vo_vm_vm(vmask x, vmask y) { __m128i xh = _mm256_extractf128_si256(x, 1), xl = _mm256_extractf128_si256(x, 0); __m128i yh = _mm256_extractf128_si256(y, 1), yl = _mm256_extractf128_si256(y, 0); vmask r = _mm256_castsi128_si256(_mm_cmpgt_epi64(xl, yl)); return _mm256_insertf128_si256(r, _mm_cmpgt_epi64(xh, yh), 1); } #define vsll64_vm_vm_i(x, c) \ _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_slli_epi64(_mm256_extractf128_si256(x, 0), c)), \ _mm_slli_epi64(_mm256_extractf128_si256(x, 1), c), 1) #define vsrl64_vm_vm_i(x, c) \ _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_srli_epi64(_mm256_extractf128_si256(x, 0), c)), \ _mm_srli_epi64(_mm256_extractf128_si256(x, 1), c), 1) //@#define vsll64_vm_vm_i(x, c) _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_slli_epi64(_mm256_extractf128_si256(x, 0), c)), _mm_slli_epi64(_mm256_extractf128_si256(x, 1), c), 1) //@#define vsrl64_vm_vm_i(x, c)
_mm256_insertf128_si256(_mm256_castsi128_si256(_mm_srli_epi64(_mm256_extractf128_si256(x, 0), c)), _mm_srli_epi64(_mm256_extractf128_si256(x, 1), c), 1) static INLINE vmask vcast_vm_vi(vint vi) { vint vi0 = _mm_and_si128(_mm_shuffle_epi32(vi, (1 << 4) | (1 << 6)), _mm_set_epi32(0, -1, 0, -1)); vint vi1 = _mm_and_si128(_mm_shuffle_epi32(vi, (2 << 0) | (2 << 2) | (3 << 4) | (3 << 6)), _mm_set_epi32(0, -1, 0, -1)); vmask m = _mm256_insertf128_si256(_mm256_castsi128_si256(vi0), vi1, 1); return vor_vm_vm_vm(vcast_vm_vi2(vcastu_vi2_vi(vand_vi_vo_vi(vgt_vo_vi_vi(vcast_vi_i(0), vi), vcast_vi_i(-1)))), m); } static INLINE vint vcast_vi_vm(vmask vm) { return _mm_or_si128(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(_mm256_castsi256_si128(vm)), _mm_set1_ps(0), 0x08)), _mm_castps_si128(_mm_shuffle_ps(_mm_set1_ps(0), _mm_castsi128_ps(_mm256_extractf128_si256(vm, 1)), 0x80))); } ================================================ FILE: src/helperavx2.h ================================================ // Copyright Naoki Shibata and contributors 2010 - 2020. // Distributed under the Boost Software License, Version 1.0. // (See accompanying file LICENSE.txt or copy at // http://www.boost.org/LICENSE_1_0.txt) #if CONFIG == 1 #if !defined(__AVX2__) && !defined(SLEEF_GENHEADER) #error Please specify -mavx2. 
// helperavx2.h configuration: 4 doubles / 8 floats per vector (LOG2VECTLENDP
// = 2), with native FMA, full FP rounding and accurate sqrt advertised to
// SLEEF. On AVX2, vmask/vopmask/vint2 are all plain __m256i and vint is
// __m128i (unlike the AVX1 helper's split vint2 struct). Then the cpuid
// probes: Sleef_x86CpuID here is a stub returning all-ones (nsimd does not
// do runtime cpuid detection), so cpuSupportsAVX2 (leaf 7 EBX bit 5) and
// cpuSupportsFMA (leaf 1 ECX bit 12) always report support.
#endif #else #error CONFIG macro invalid or not defined #endif #define ENABLE_DP //@#define ENABLE_DP #define LOG2VECTLENDP 2 //@#define LOG2VECTLENDP 2 #define VECTLENDP (1 << LOG2VECTLENDP) //@#define VECTLENDP (1 << LOG2VECTLENDP) #define ENABLE_FMA_DP //@#define ENABLE_FMA_DP #define ENABLE_SP //@#define ENABLE_SP #define LOG2VECTLENSP (LOG2VECTLENDP+1) //@#define LOG2VECTLENSP (LOG2VECTLENDP+1) #define VECTLENSP (1 << LOG2VECTLENSP) //@#define VECTLENSP (1 << LOG2VECTLENSP) #define ENABLE_FMA_SP //@#define ENABLE_FMA_SP #define FULL_FP_ROUNDING //@#define FULL_FP_ROUNDING #define ACCURATE_SQRT //@#define ACCURATE_SQRT #if !defined(SLEEF_GENHEADER) #if defined(_MSC_VER) #include #else #include #endif #include #include "misc.h" #endif // #if !defined(SLEEF_GENHEADER) typedef __m256i vmask; typedef __m256i vopmask; typedef __m256d vdouble; typedef __m128i vint; typedef __m256 vfloat; typedef __m256i vint2; typedef struct { vmask x, y; } vmask2; // #if !defined(SLEEF_GENHEADER) #ifndef __SLEEF_H__ static inline void Sleef_x86CpuID(int32_t out[4], uint32_t eax, uint32_t ecx) { /* We don't care for cpuid detection */ out[0] = 0xFFFFFFFF; out[1] = 0xFFFFFFFF; out[2] = 0xFFFFFFFF; out[3] = 0xFFFFFFFF; } #endif static INLINE int cpuSupportsAVX2() { int32_t reg[4]; Sleef_x86CpuID(reg, 7, 0); return (reg[1] & (1 << 5)) != 0; } static INLINE int cpuSupportsFMA() { int32_t reg[4]; Sleef_x86CpuID(reg, 1, 0); return (reg[2] & (1 << 12)) != 0; } #if CONFIG == 1 && defined(__AVX2__) static INLINE int vavailability_i(int name) { int d = cpuSupportsAVX2() && cpuSupportsFMA(); return d ?
// Continuation: vavailability_i returns 3 when AVX2+FMA are reported, then
// ISA identification macros, prefetch, all-ones mask tests (AND the two
// 128-bit halves, then _mm_test_all_ones), double splat/reinterpret
// wrappers, unaligned integer load/store helpers, and the vmask bitwise ops
// routed through the _pd domain (and/andnot/or/xor on reinterpreted
// doubles).
3 : 0; } #define ISANAME "AVX2" #define DFTPRIORITY 25 #endif #endif // #if !defined(SLEEF_GENHEADER) static INLINE void vprefetch_v_p(const void *ptr) { _mm_prefetch(ptr, _MM_HINT_T0); } static INLINE int vtestallones_i_vo32(vopmask g) { return _mm_test_all_ones(_mm_and_si128(_mm256_extractf128_si256(g, 0), _mm256_extractf128_si256(g, 1))); } static INLINE int vtestallones_i_vo64(vopmask g) { return _mm_test_all_ones(_mm_and_si128(_mm256_extractf128_si256(g, 0), _mm256_extractf128_si256(g, 1))); } // static INLINE vdouble vcast_vd_d(double d) { return _mm256_set1_pd(d); } static INLINE vmask vreinterpret_vm_vd(vdouble vd) { return _mm256_castpd_si256(vd); } static INLINE vdouble vreinterpret_vd_vm(vmask vm) { return _mm256_castsi256_pd(vm); } static INLINE vint2 vreinterpret_vi2_vd(vdouble vd) { return _mm256_castpd_si256(vd); } static INLINE vdouble vreinterpret_vd_vi2(vint2 vi) { return _mm256_castsi256_pd(vi); } // static vint2 vloadu_vi2_p(int32_t *p) { return _mm256_loadu_si256((__m256i const *)p); } static void vstoreu_v_p_vi2(int32_t *p, vint2 v) { _mm256_storeu_si256((__m256i *)p, v); } static vint vloadu_vi_p(int32_t *p) { return _mm_loadu_si128((__m128i *)p); } static void vstoreu_v_p_vi(int32_t *p, vint v) { _mm_storeu_si128((__m128i *)p, v); } // static INLINE vmask vand_vm_vm_vm(vmask x, vmask y) { return vreinterpret_vm_vd(_mm256_and_pd(vreinterpret_vd_vm(x), vreinterpret_vd_vm(y))); } static INLINE vmask vandnot_vm_vm_vm(vmask x, vmask y) { return vreinterpret_vm_vd(_mm256_andnot_pd(vreinterpret_vd_vm(x), vreinterpret_vd_vm(y))); } static INLINE vmask vor_vm_vm_vm(vmask x, vmask y) { return vreinterpret_vm_vd(_mm256_or_pd(vreinterpret_vd_vm(x), vreinterpret_vd_vm(y))); } static INLINE vmask vxor_vm_vm_vm(vmask x, vmask y) { return vreinterpret_vm_vd(_mm256_xor_pd(vreinterpret_vd_vm(x), vreinterpret_vd_vm(y))); } static INLINE vopmask vand_vo_vo_vo(vopmask x, vopmask y) { return vreinterpret_vm_vd(_mm256_and_pd(vreinterpret_vd_vm(x),
// Continuation: the remaining opmask and mask/opmask bitwise combinations
// (all identical _pd-domain and/andnot/or/xor bodies, differing only in the
// declared operand kinds), then the opmask width conversions:
// vcast_vo32_vo64 compacts 4 x 64-bit lanes into the low 4 x 32-bit slots
// and vcast_vo64_vo32 duplicates each 32-bit lane into a 64-bit pair, both
// via _mm256_permutevar8x32_epi32 (AVX2 cross-lane permute).
vreinterpret_vd_vm(y))); } static INLINE vopmask vandnot_vo_vo_vo(vopmask x, vopmask y) { return vreinterpret_vm_vd(_mm256_andnot_pd(vreinterpret_vd_vm(x), vreinterpret_vd_vm(y))); } static INLINE vopmask vor_vo_vo_vo(vopmask x, vopmask y) { return vreinterpret_vm_vd(_mm256_or_pd(vreinterpret_vd_vm(x), vreinterpret_vd_vm(y))); } static INLINE vopmask vxor_vo_vo_vo(vopmask x, vopmask y) { return vreinterpret_vm_vd(_mm256_xor_pd(vreinterpret_vd_vm(x), vreinterpret_vd_vm(y))); } static INLINE vmask vand_vm_vo64_vm(vopmask x, vmask y) { return vreinterpret_vm_vd(_mm256_and_pd(vreinterpret_vd_vm(x), vreinterpret_vd_vm(y))); } static INLINE vmask vandnot_vm_vo64_vm(vopmask x, vmask y) { return vreinterpret_vm_vd(_mm256_andnot_pd(vreinterpret_vd_vm(x), vreinterpret_vd_vm(y))); } static INLINE vmask vor_vm_vo64_vm(vopmask x, vmask y) { return vreinterpret_vm_vd(_mm256_or_pd(vreinterpret_vd_vm(x), vreinterpret_vd_vm(y))); } static INLINE vmask vxor_vm_vo64_vm(vopmask x, vmask y) { return vreinterpret_vm_vd(_mm256_xor_pd(vreinterpret_vd_vm(x), vreinterpret_vd_vm(y))); } static INLINE vmask vand_vm_vo32_vm(vopmask x, vmask y) { return vreinterpret_vm_vd(_mm256_and_pd(vreinterpret_vd_vm(x), vreinterpret_vd_vm(y))); } static INLINE vmask vandnot_vm_vo32_vm(vopmask x, vmask y) { return vreinterpret_vm_vd(_mm256_andnot_pd(vreinterpret_vd_vm(x), vreinterpret_vd_vm(y))); } static INLINE vmask vor_vm_vo32_vm(vopmask x, vmask y) { return vreinterpret_vm_vd(_mm256_or_pd(vreinterpret_vd_vm(x), vreinterpret_vd_vm(y))); } static INLINE vmask vxor_vm_vo32_vm(vopmask x, vmask y) { return vreinterpret_vm_vd(_mm256_xor_pd(vreinterpret_vd_vm(x), vreinterpret_vd_vm(y))); } static INLINE vopmask vcast_vo32_vo64(vopmask o) { return _mm256_permutevar8x32_epi32(o, _mm256_set_epi32(0, 0, 0, 0, 6, 4, 2, 0)); } static INLINE vopmask vcast_vo64_vo32(vopmask o) { return _mm256_permutevar8x32_epi32(o, _mm256_set_epi32(3, 3, 2, 2, 1, 1, 0, 0)); } // static INLINE vint vrint_vi_vd(vdouble vd) { return
// Continuation: double<->int conversions (cvtpd round/truncate), SSE4.1
// _mm256_round_pd/ps with explicit rounding modes, splats, vcastu_vi2_vi
// (widen i32 to the upper halves of i64 lanes: cvtepi32_epi64 then
// shift-left 32) and its inverse vcastu_vi_vi2 (pick odd 32-bit lanes via
// shuffle_ps), 64-bit lane constants/compare/add, and the basic vdouble
// arithmetic wrappers.
_mm256_cvtpd_epi32(vd); } static INLINE vint vtruncate_vi_vd(vdouble vd) { return _mm256_cvttpd_epi32(vd); } static INLINE vdouble vrint_vd_vd(vdouble vd) { return _mm256_round_pd(vd, _MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC); } static INLINE vfloat vrint_vf_vf(vfloat vd) { return _mm256_round_ps(vd, _MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC); } static INLINE vdouble vtruncate_vd_vd(vdouble vd) { return _mm256_round_pd(vd, _MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC); } static INLINE vfloat vtruncate_vf_vf(vfloat vf) { return _mm256_round_ps(vf, _MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC); } static INLINE vdouble vcast_vd_vi(vint vi) { return _mm256_cvtepi32_pd(vi); } static INLINE vint vcast_vi_i(int i) { return _mm_set1_epi32(i); } static INLINE vint2 vcastu_vi2_vi(vint vi) { return _mm256_slli_epi64(_mm256_cvtepi32_epi64(vi), 32); } static INLINE vint vcastu_vi_vi2(vint2 vi) { return _mm_or_si128(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(_mm256_castsi256_si128(vi)), _mm_set1_ps(0), 0x0d)), _mm_castps_si128(_mm_shuffle_ps(_mm_set1_ps(0), _mm_castsi128_ps(_mm256_extractf128_si256(vi, 1)), 0xd0))); } static INLINE vmask vcast_vm_i_i(int i0, int i1) { return _mm256_set_epi32(i0, i1, i0, i1, i0, i1, i0, i1); } static INLINE vopmask veq64_vo_vm_vm(vmask x, vmask y) { return _mm256_cmpeq_epi64(x, y); } static INLINE vmask vadd64_vm_vm_vm(vmask x, vmask y) { return _mm256_add_epi64(x, y); } // static INLINE vdouble vadd_vd_vd_vd(vdouble x, vdouble y) { return _mm256_add_pd(x, y); } static INLINE vdouble vsub_vd_vd_vd(vdouble x, vdouble y) { return _mm256_sub_pd(x, y); } static INLINE vdouble vmul_vd_vd_vd(vdouble x, vdouble y) { return _mm256_mul_pd(x, y); } static INLINE vdouble vdiv_vd_vd_vd(vdouble x, vdouble y) { return _mm256_div_pd(x, y); } static INLINE vdouble vrec_vd_vd(vdouble x) { return _mm256_div_pd(_mm256_set1_pd(1), x); } static INLINE vdouble vsqrt_vd_vd(vdouble x) { return _mm256_sqrt_pd(x); } static INLINE vdouble vabs_vd_vd(vdouble d) { return
// Continuation: vabs/vneg via the -0.0 sign-bit mask, then the AVX2 fused
// multiply-add family — every vmla*/vfma* variant maps directly to an FMA3
// intrinsic (_mm256_fmadd/fmsub/fnmadd/fnmsub_pd), so mla and fma coincide
// on this target — min/max, and the vdouble comparison wrappers (ordered
// predicates; NEQ unordered as in the AVX helper).
_mm256_andnot_pd(_mm256_set1_pd(-0.0), d); } static INLINE vdouble vneg_vd_vd(vdouble d) { return _mm256_xor_pd(_mm256_set1_pd(-0.0), d); } static INLINE vdouble vmla_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return _mm256_fmadd_pd(x, y, z); } static INLINE vdouble vmlapn_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return _mm256_fmsub_pd(x, y, z); } static INLINE vdouble vmlanp_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return _mm256_fnmadd_pd(x, y, z); } static INLINE vdouble vmax_vd_vd_vd(vdouble x, vdouble y) { return _mm256_max_pd(x, y); } static INLINE vdouble vmin_vd_vd_vd(vdouble x, vdouble y) { return _mm256_min_pd(x, y); } static INLINE vdouble vfma_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return _mm256_fmadd_pd(x, y, z); } static INLINE vdouble vfmapp_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return _mm256_fmadd_pd(x, y, z); } static INLINE vdouble vfmapn_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return _mm256_fmsub_pd(x, y, z); } static INLINE vdouble vfmanp_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return _mm256_fnmadd_pd(x, y, z); } static INLINE vdouble vfmann_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return _mm256_fnmsub_pd(x, y, z); } static INLINE vopmask veq_vo_vd_vd(vdouble x, vdouble y) { return vreinterpret_vm_vd(_mm256_cmp_pd(x, y, _CMP_EQ_OQ)); } static INLINE vopmask vneq_vo_vd_vd(vdouble x, vdouble y) { return vreinterpret_vm_vd(_mm256_cmp_pd(x, y, _CMP_NEQ_UQ)); } static INLINE vopmask vlt_vo_vd_vd(vdouble x, vdouble y) { return vreinterpret_vm_vd(_mm256_cmp_pd(x, y, _CMP_LT_OQ)); } static INLINE vopmask vle_vo_vd_vd(vdouble x, vdouble y) { return vreinterpret_vm_vd(_mm256_cmp_pd(x, y, _CMP_LE_OQ)); } static INLINE vopmask vgt_vo_vd_vd(vdouble x, vdouble y) { return vreinterpret_vm_vd(_mm256_cmp_pd(x, y, _CMP_GT_OQ)); } static INLINE vopmask vge_vo_vd_vd(vdouble x, vdouble y) { return vreinterpret_vm_vd(_mm256_cmp_pd(x, y, _CMP_GE_OQ)); } // static INLINE vint vadd_vi_vi_vi(vint x, vint y) { return
// Continuation: the 128-bit vint ALU/shift/compare wrappers (same shapes as
// the AVX helper), blendv selects, and the AVX2-specific scalar selects:
// vsel_vd_vo_d_d uses _mm256_permutevar_pd on a packed {v1,v0} pair, and
// vsel_vd_vo_vo_vo_d_d_d_d builds a per-lane index vector from the three
// opmasks, then gathers d0..d3 with one cross-lane permutevar8x32.
_mm_add_epi32(x, y); } static INLINE vint vsub_vi_vi_vi(vint x, vint y) { return _mm_sub_epi32(x, y); } static INLINE vint vneg_vi_vi(vint e) { return vsub_vi_vi_vi(vcast_vi_i(0), e); } static INLINE vint vand_vi_vi_vi(vint x, vint y) { return _mm_and_si128(x, y); } static INLINE vint vandnot_vi_vi_vi(vint x, vint y) { return _mm_andnot_si128(x, y); } static INLINE vint vor_vi_vi_vi(vint x, vint y) { return _mm_or_si128(x, y); } static INLINE vint vxor_vi_vi_vi(vint x, vint y) { return _mm_xor_si128(x, y); } static INLINE vint vandnot_vi_vo_vi(vopmask m, vint y) { return _mm_andnot_si128(_mm256_castsi256_si128(m), y); } static INLINE vint vand_vi_vo_vi(vopmask m, vint y) { return _mm_and_si128(_mm256_castsi256_si128(m), y); } static INLINE vint vsll_vi_vi_i(vint x, int c) { return _mm_slli_epi32(x, c); } static INLINE vint vsrl_vi_vi_i(vint x, int c) { return _mm_srli_epi32(x, c); } static INLINE vint vsra_vi_vi_i(vint x, int c) { return _mm_srai_epi32(x, c); } static INLINE vint veq_vi_vi_vi(vint x, vint y) { return _mm_cmpeq_epi32(x, y); } static INLINE vint vgt_vi_vi_vi(vint x, vint y) { return _mm_cmpgt_epi32(x, y); } static INLINE vopmask veq_vo_vi_vi(vint x, vint y) { return _mm256_castsi128_si256(_mm_cmpeq_epi32(x, y)); } static INLINE vopmask vgt_vo_vi_vi(vint x, vint y) { return _mm256_castsi128_si256(_mm_cmpgt_epi32(x, y)); } static INLINE vint vsel_vi_vo_vi_vi(vopmask m, vint x, vint y) { return _mm_blendv_epi8(y, x, _mm256_castsi256_si128(m)); } static INLINE vdouble vsel_vd_vo_vd_vd(vopmask o, vdouble x, vdouble y) { return _mm256_blendv_pd(y, x, _mm256_castsi256_pd(o)); } static INLINE vdouble vsel_vd_vo_d_d(vopmask o, double v1, double v0) { return _mm256_permutevar_pd(_mm256_set_pd(v1, v0, v1, v0), o); } static INLINE vdouble vsel_vd_vo_vo_vo_d_d_d_d(vopmask o0, vopmask o1, vopmask o2, double d0, double d1, double d2, double d3) { __m256i v = _mm256_castpd_si256(vsel_vd_vo_vd_vd(o0, _mm256_castsi256_pd(_mm256_set_epi32(1, 0, 1, 0, 1, 0, 1, 0)),
vsel_vd_vo_vd_vd(o1, _mm256_castsi256_pd(_mm256_set_epi32(3, 2, 3, 2, 3, 2, 3, 2)), vsel_vd_vo_vd_vd(o2, _mm256_castsi256_pd(_mm256_set_epi32(5, 4, 5, 4, 5, 4, 5, 4)), _mm256_castsi256_pd(_mm256_set_epi32(7, 6, 7, 6, 7, 6, 7, 6)))))); return _mm256_castsi256_pd(_mm256_permutevar8x32_epi32(_mm256_castpd_si256(_mm256_set_pd(d3, d2, d1, d0)), v)); } static INLINE vdouble vsel_vd_vo_vo_d_d_d(vopmask o0, vopmask o1, double d0, double d1, double d2) { return vsel_vd_vo_vo_vo_d_d_d_d(o0, o1, o1, d0, d1, d2, d2); } static INLINE vopmask visinf_vo_vd(vdouble d) { return vreinterpret_vm_vd(_mm256_cmp_pd(vabs_vd_vd(d), _mm256_set1_pd(SLEEF_INFINITY), _CMP_EQ_OQ)); } static INLINE vopmask vispinf_vo_vd(vdouble d) { return vreinterpret_vm_vd(_mm256_cmp_pd(d, _mm256_set1_pd(SLEEF_INFINITY), _CMP_EQ_OQ)); } static INLINE vopmask visminf_vo_vd(vdouble d) { return vreinterpret_vm_vd(_mm256_cmp_pd(d, _mm256_set1_pd(-SLEEF_INFINITY), _CMP_EQ_OQ)); } static INLINE vopmask visnan_vo_vd(vdouble d) { return vreinterpret_vm_vd(_mm256_cmp_pd(d, d, _CMP_NEQ_UQ)); } #if defined(_MSC_VER) // This function is needed when debugging on MSVC. 
// Extract lane 0 of a vdouble (debug aid; only compiled under the MSVC guard above).
static INLINE double vcast_d_vd(vdouble v) { double s[4]; _mm256_storeu_pd(s, v); return s[0]; }
#endif
// Loads/stores of 4 doubles (aligned and unaligned) and a 32-bit-indexed gather
// (scale 8 = sizeof(double)).
static INLINE vdouble vload_vd_p(const double *ptr) { return _mm256_load_pd(ptr); }
static INLINE vdouble vloadu_vd_p(const double *ptr) { return _mm256_loadu_pd(ptr); }
static INLINE void vstore_v_p_vd(double *ptr, vdouble v) { _mm256_store_pd(ptr, v); }
static INLINE void vstoreu_v_p_vd(double *ptr, vdouble v) { _mm256_storeu_pd(ptr, v); }
static INLINE vdouble vgather_vd_p_vi(const double *ptr, vint vi) { return _mm256_i32gather_pd(ptr, vi, 8); }

//

// vint2 and vmask are both __m256i on AVX2, so these casts are identity.
static INLINE vint2 vcast_vi2_vm(vmask vm) { return vm; }
static INLINE vmask vcast_vm_vi2(vint2 vi) { return vi; }
// float <-> int32 conversions: round-to-nearest and truncation.
static INLINE vint2 vrint_vi2_vf(vfloat vf) { return vcast_vi2_vm(_mm256_cvtps_epi32(vf)); }
static INLINE vint2 vtruncate_vi2_vf(vfloat vf) { return vcast_vi2_vm(_mm256_cvttps_epi32(vf)); }
static INLINE vfloat vcast_vf_vi2(vint2 vi) { return _mm256_cvtepi32_ps(vcast_vm_vi2(vi)); }
static INLINE vfloat vcast_vf_f(float f) { return _mm256_set1_ps(f); }
static INLINE vint2 vcast_vi2_i(int i) { return _mm256_set1_epi32(i); }
// Bit-pattern reinterpretations (no value conversion).
static INLINE vmask vreinterpret_vm_vf(vfloat vf) { return _mm256_castps_si256(vf); }
static INLINE vfloat vreinterpret_vf_vm(vmask vm) { return _mm256_castsi256_ps(vm); }
static INLINE vfloat vreinterpret_vf_vi2(vint2 vi) { return vreinterpret_vf_vm(vcast_vm_vi2(vi)); }
static INLINE vint2 vreinterpret_vi2_vf(vfloat vf) { return vcast_vi2_vm(vreinterpret_vm_vf(vf)); }
// Single-precision arithmetic on 8 floats.
static INLINE vfloat vadd_vf_vf_vf(vfloat x, vfloat y) { return _mm256_add_ps(x, y); }
static INLINE vfloat vsub_vf_vf_vf(vfloat x, vfloat y) { return _mm256_sub_ps(x, y); }
static INLINE vfloat vmul_vf_vf_vf(vfloat x, vfloat y) { return _mm256_mul_ps(x, y); }
static INLINE vfloat vdiv_vf_vf_vf(vfloat x, vfloat y) { return _mm256_div_ps(x, y); }
static INLINE vfloat vrec_vf_vf(vfloat x) { return vdiv_vf_vf_vf(vcast_vf_f(1.0f), x); } // exact 1/x via division
static INLINE vfloat vsqrt_vf_vf(vfloat x) { return _mm256_sqrt_ps(x); }
// |f| clears the sign bit; -f flips it (pure bit manipulation, no FP ops).
static INLINE vfloat vabs_vf_vf(vfloat f) { return vreinterpret_vf_vm(vandnot_vm_vm_vm(vreinterpret_vm_vf(vcast_vf_f(-0.0f)), vreinterpret_vm_vf(f))); }
static INLINE vfloat vneg_vf_vf(vfloat d) { return vreinterpret_vf_vm(vxor_vm_vm_vm(vreinterpret_vm_vf(vcast_vf_f(-0.0f)), vreinterpret_vm_vf(d))); }
// Fused multiply-add family: vmla = x*y + z, vmlapn = x*y - z, vmlanp = -(x*y) + z.
static INLINE vfloat vmla_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return _mm256_fmadd_ps(x, y, z); }
static INLINE vfloat vmlapn_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return _mm256_fmsub_ps(x, y, z); }
static INLINE vfloat vmlanp_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return _mm256_fnmadd_ps(x, y, z); }
static INLINE vfloat vmax_vf_vf_vf(vfloat x, vfloat y) { return _mm256_max_ps(x, y); }
static INLINE vfloat vmin_vf_vf_vf(vfloat x, vfloat y) { return _mm256_min_ps(x, y); }
// Explicit FMA variants: pp = x*y+z, pn = x*y-z, np = -(x*y)+z, nn = -(x*y)-z.
static INLINE vfloat vfma_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return _mm256_fmadd_ps(x, y, z); }
static INLINE vfloat vfmapp_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return _mm256_fmadd_ps(x, y, z); }
static INLINE vfloat vfmapn_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return _mm256_fmsub_ps(x, y, z); }
static INLINE vfloat vfmanp_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return _mm256_fnmadd_ps(x, y, z); }
static INLINE vfloat vfmann_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return _mm256_fnmsub_ps(x, y, z); }
// Ordered/unordered float comparisons producing full-width lane masks.
static INLINE vopmask veq_vo_vf_vf(vfloat x, vfloat y) { return vreinterpret_vm_vf(_mm256_cmp_ps(x, y, _CMP_EQ_OQ)); }
static INLINE vopmask vneq_vo_vf_vf(vfloat x, vfloat y) { return vreinterpret_vm_vf(_mm256_cmp_ps(x, y, _CMP_NEQ_UQ)); }
static INLINE vopmask vlt_vo_vf_vf(vfloat x, vfloat y) { return vreinterpret_vm_vf(_mm256_cmp_ps(x, y, _CMP_LT_OQ)); }
static INLINE vopmask vle_vo_vf_vf(vfloat x, vfloat y) { return vreinterpret_vm_vf(_mm256_cmp_ps(x, y, _CMP_LE_OQ)); }
static INLINE vopmask vgt_vo_vf_vf(vfloat x, vfloat y) { return vreinterpret_vm_vf(_mm256_cmp_ps(x, y, _CMP_GT_OQ)); }
static INLINE vopmask vge_vo_vf_vf(vfloat x, vfloat y) { return vreinterpret_vm_vf(_mm256_cmp_ps(x, y, _CMP_GE_OQ)); }
// 8 x int32 (vint2 = __m256i) arithmetic and logic.
static INLINE vint2 vadd_vi2_vi2_vi2(vint2 x, vint2 y) { return _mm256_add_epi32(x, y); }
static INLINE vint2 vsub_vi2_vi2_vi2(vint2 x, vint2 y) { return _mm256_sub_epi32(x, y); }
static INLINE vint2 vneg_vi2_vi2(vint2 e) { return vsub_vi2_vi2_vi2(vcast_vi2_i(0), e); }
static INLINE vint2 vand_vi2_vi2_vi2(vint2 x, vint2 y) { return _mm256_and_si256(x, y); }
static INLINE vint2 vandnot_vi2_vi2_vi2(vint2 x, vint2 y) { return _mm256_andnot_si256(x, y); }
static INLINE vint2 vor_vi2_vi2_vi2(vint2 x, vint2 y) { return _mm256_or_si256(x, y); }
static INLINE vint2 vxor_vi2_vi2_vi2(vint2 x, vint2 y) { return _mm256_xor_si256(x, y); }
// Opmask-masked variants reuse the identity vopmask -> vint2 cast.
static INLINE vint2 vand_vi2_vo_vi2(vopmask x, vint2 y) { return vand_vi2_vi2_vi2(vcast_vi2_vm(x), y); }
static INLINE vint2 vandnot_vi2_vo_vi2(vopmask x, vint2 y) { return vandnot_vi2_vi2_vi2(vcast_vi2_vm(x), y); }
static INLINE vint2 vsll_vi2_vi2_i(vint2 x, int c) { return _mm256_slli_epi32(x, c); }
static INLINE vint2 vsrl_vi2_vi2_i(vint2 x, int c) { return _mm256_srli_epi32(x, c); }
static INLINE vint2 vsra_vi2_vi2_i(vint2 x, int c) { return _mm256_srai_epi32(x, c); }
// Comparisons, exposed both as opmask and as vint2 lane masks.
static INLINE vopmask veq_vo_vi2_vi2(vint2 x, vint2 y) { return _mm256_cmpeq_epi32(x, y); }
static INLINE vopmask vgt_vo_vi2_vi2(vint2 x, vint2 y) { return _mm256_cmpgt_epi32(x, y); }
static INLINE vint2 veq_vi2_vi2_vi2(vint2 x, vint2 y) { return _mm256_cmpeq_epi32(x, y); }
static INLINE vint2 vgt_vi2_vi2_vi2(vint2 x, vint2 y) { return _mm256_cmpgt_epi32(x, y); }
// Lane-wise selects: m ? x : y.
static INLINE vint2 vsel_vi2_vo_vi2_vi2(vopmask m, vint2 x, vint2 y) { return _mm256_blendv_epi8(y, x, m); }
static INLINE vfloat vsel_vf_vo_vf_vf(vopmask o, vfloat x, vfloat y) { return _mm256_blendv_ps(y, x, _mm256_castsi256_ps(o)); }

// At this point, the following three functions are implemented in a generic way,
// but I will try target-specific optimization later on.
static INLINE CONST vfloat vsel_vf_vo_f_f(vopmask o, float v1, float v0) { return vsel_vf_vo_vf_vf(o, vcast_vf_f(v1), vcast_vf_f(v0)); } static INLINE vfloat vsel_vf_vo_vo_f_f_f(vopmask o0, vopmask o1, float d0, float d1, float d2) { return vsel_vf_vo_vf_vf(o0, vcast_vf_f(d0), vsel_vf_vo_f_f(o1, d1, d2)); } static INLINE vfloat vsel_vf_vo_vo_vo_f_f_f_f(vopmask o0, vopmask o1, vopmask o2, float d0, float d1, float d2, float d3) { return vsel_vf_vo_vf_vf(o0, vcast_vf_f(d0), vsel_vf_vo_vf_vf(o1, vcast_vf_f(d1), vsel_vf_vo_f_f(o2, d2, d3))); } static INLINE vopmask visinf_vo_vf(vfloat d) { return veq_vo_vf_vf(vabs_vf_vf(d), vcast_vf_f(SLEEF_INFINITYf)); } static INLINE vopmask vispinf_vo_vf(vfloat d) { return veq_vo_vf_vf(d, vcast_vf_f(SLEEF_INFINITYf)); } static INLINE vopmask visminf_vo_vf(vfloat d) { return veq_vo_vf_vf(d, vcast_vf_f(-SLEEF_INFINITYf)); } static INLINE vopmask visnan_vo_vf(vfloat d) { return vneq_vo_vf_vf(d, d); } #ifdef _MSC_VER // This function is needed when debugging on MSVC. 
// Extract lane 0 of a vfloat (debug aid; only compiled under the MSVC guard above).
static INLINE float vcast_f_vf(vfloat v) { float s[8]; _mm256_storeu_ps(s, v); return s[0]; }
#endif
// Loads/stores of 8 floats and a 32-bit-indexed gather (scale 4 = sizeof(float)).
static INLINE vfloat vload_vf_p(const float *ptr) { return _mm256_load_ps(ptr); }
static INLINE vfloat vloadu_vf_p(const float *ptr) { return _mm256_loadu_ps(ptr); }
static INLINE void vstore_v_p_vf(float *ptr, vfloat v) { _mm256_store_ps(ptr, v); }
static INLINE void vstoreu_v_p_vf(float *ptr, vfloat v) { _mm256_storeu_ps(ptr, v); }
static INLINE vfloat vgather_vf_p_vi2(const float *ptr, vint2 vi2) { return _mm256_i32gather_ps(ptr, vi2, 4); }

//

// Alternating sign-bit masks (+,-,+,-) and (-,+,-,+) for the posneg/negpos helpers.
#define PNMASK ((vdouble) { +0.0, -0.0, +0.0, -0.0 })
#define NPMASK ((vdouble) { -0.0, +0.0, -0.0, +0.0 })
#define PNMASKf ((vfloat) { +0.0f, -0.0f, +0.0f, -0.0f, +0.0f, -0.0f, +0.0f, -0.0f })
#define NPMASKf ((vfloat) { -0.0f, +0.0f, -0.0f, +0.0f, -0.0f, +0.0f, -0.0f, +0.0f })

// Negate alternating lanes by XOR-ing the sign bits with the masks above.
static INLINE vdouble vposneg_vd_vd(vdouble d) { return vreinterpret_vd_vm(vxor_vm_vm_vm(vreinterpret_vm_vd(d), vreinterpret_vm_vd(PNMASK))); }
static INLINE vdouble vnegpos_vd_vd(vdouble d) { return vreinterpret_vd_vm(vxor_vm_vm_vm(vreinterpret_vm_vd(d), vreinterpret_vm_vd(NPMASK))); }
static INLINE vfloat vposneg_vf_vf(vfloat d) { return vreinterpret_vf_vm(vxor_vm_vm_vm(vreinterpret_vm_vf(d), vreinterpret_vm_vf(PNMASKf))); }
static INLINE vfloat vnegpos_vf_vf(vfloat d) { return vreinterpret_vf_vm(vxor_vm_vm_vm(vreinterpret_vm_vf(d), vreinterpret_vm_vf(NPMASKf))); }

// addsub: even lanes subtracted, odd lanes added; vmlsubadd combines it with vmla.
static INLINE vdouble vsubadd_vd_vd_vd(vdouble x, vdouble y) { return _mm256_addsub_pd(x, y); }
static INLINE vfloat vsubadd_vf_vf_vf(vfloat x, vfloat y) { return _mm256_addsub_ps(x, y); }
static INLINE vdouble vmlsubadd_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return vmla_vd_vd_vd_vd(x, y, vnegpos_vd_vd(z)); }
static INLINE vfloat vmlsubadd_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return vmla_vf_vf_vf_vf(x, y, vnegpos_vf_vf(z)); }

// Swap each adjacent pair of doubles (in-lane shuffle).
static INLINE vdouble vrev21_vd_vd(vdouble d0) { return _mm256_shuffle_pd(d0, d0, (0 << 3) | (1 << 2) | (0 << 1) | (1 << 0)); }
// Reorder the two-double groups: swap the 128-bit halves, then shuffle within lanes.
static INLINE vdouble vreva2_vd_vd(vdouble d0) { d0 = _mm256_permute2f128_pd(d0, d0, 1); return _mm256_shuffle_pd(d0, d0, (1 << 3) | (0 << 2) | (1 << 1) | (0 << 0)); }

static INLINE void vstream_v_p_vd(double *ptr, vdouble v) { _mm256_stream_pd(ptr, v); } // non-temporal store
// Scatter the vector as two 128-bit (2-double) groups at strided element offsets.
static INLINE void vscatter2_v_p_i_i_vd(double *ptr, int offset, int step, vdouble v) {
  _mm_store_pd(&ptr[(offset + step * 0)*2], _mm256_extractf128_pd(v, 0));
  _mm_store_pd(&ptr[(offset + step * 1)*2], _mm256_extractf128_pd(v, 1));
}
// Same, but with non-temporal (streaming) stores.
static INLINE void vsscatter2_v_p_i_i_vd(double *ptr, int offset, int step, vdouble v) {
  _mm_stream_pd(&ptr[(offset + step * 0)*2], _mm256_extractf128_pd(v, 0));
  _mm_stream_pd(&ptr[(offset + step * 1)*2], _mm256_extractf128_pd(v, 1));
}

//

// Float counterparts of rev21/reva2.
static INLINE vfloat vrev21_vf_vf(vfloat d0) { return _mm256_shuffle_ps(d0, d0, (2 << 6) | (3 << 4) | (0 << 2) | (1 << 0)); }
static INLINE vfloat vreva2_vf_vf(vfloat d0) { d0 = _mm256_permute2f128_ps(d0, d0, 1); return _mm256_shuffle_ps(d0, d0, (1 << 6) | (0 << 4) | (3 << 2) | (2 << 0)); }
static INLINE vint2 vrev21_vi2_vi2(vint2 i) { return vreinterpret_vi2_vf(vrev21_vf_vf(vreinterpret_vf_vi2(i))); }
static INLINE void vstream_v_p_vf(float *ptr, vfloat v) { _mm256_stream_ps(ptr, v); }
// Scatter the vector as four 64-bit (2-float) groups; each pair moves via a double store.
static INLINE void vscatter2_v_p_i_i_vf(float *ptr, int offset, int step, vfloat v) {
  _mm_storel_pd((double *)(ptr+(offset + step * 0)*2), _mm_castsi128_pd(_mm_castps_si128(_mm256_extractf128_ps(v, 0))));
  _mm_storeh_pd((double *)(ptr+(offset + step * 1)*2), _mm_castsi128_pd(_mm_castps_si128(_mm256_extractf128_ps(v, 0))));
  _mm_storel_pd((double *)(ptr+(offset + step * 2)*2), _mm_castsi128_pd(_mm_castps_si128(_mm256_extractf128_ps(v, 1))));
  _mm_storeh_pd((double *)(ptr+(offset + step * 3)*2), _mm_castsi128_pd(_mm_castps_si128(_mm256_extractf128_ps(v, 1))));
}
// No streaming variant for floats; falls back to the regular scatter.
static INLINE void vsscatter2_v_p_i_i_vf(float *ptr, int offset, int step, vfloat v) { vscatter2_v_p_i_i_vf(ptr, offset, step, v); }

//

// 64-bit (de)interleave of a 2 x 256-bit value. NOTE(review): both directions use the
// identical unpacklo/unpackhi pattern — presumably an involution for this lane layout;
// confirm against upstream SLEEF if touched.
static INLINE vmask2 vinterleave_vm2_vm2(vmask2 v) { return (vmask2) { _mm256_unpacklo_epi64(v.x, v.y), _mm256_unpackhi_epi64(v.x, v.y) }; }
static INLINE vmask2 vuninterleave_vm2_vm2(vmask2 v) { return (vmask2) { _mm256_unpacklo_epi64(v.x, v.y), _mm256_unpackhi_epi64(v.x, v.y) }; }
static INLINE vint vuninterleave_vi_vi(vint v) { return _mm_shuffle_epi32(v, (0 << 0) | (2 << 2) | (1 << 4) | (3 << 6)); }
// 64-bit cross-lane permutes; same control word in both directions (self-inverse order 0,2,1,3).
static INLINE vdouble vinterleave_vd_vd(vdouble vd) { return vreinterpret_vd_vm(_mm256_permute4x64_epi64(vreinterpret_vm_vd(vd), (3 << 6) | (1 << 4) | (2 << 2) | (0 << 0))); }
static INLINE vdouble vuninterleave_vd_vd(vdouble vd) { return vreinterpret_vd_vm(_mm256_permute4x64_epi64(vreinterpret_vm_vd(vd), (3 << 6) | (1 << 4) | (2 << 2) | (0 << 0))); }
static INLINE vmask vinterleave_vm_vm(vmask vm) { return _mm256_permute4x64_epi64(vm, (3 << 6) | (1 << 4) | (2 << 2) | (0 << 0)); }
static INLINE vmask vuninterleave_vm_vm(vmask vm) { return _mm256_permute4x64_epi64(vm, (3 << 6) | (1 << 4) | (2 << 2) | (0 << 0)); }
// Unaligned vmask2 load via memcpy (VECTLENDP * 16 bytes = 2 x 256 bits).
static vmask2 vloadu_vm2_p(void *p) { vmask2 vm2; memcpy(&vm2, p, VECTLENDP * 16); return vm2; }

#if !defined(SLEEF_GENHEADER)
// Conversions between the quad-precision argument type and the interleaved vmask2 form.
typedef Sleef_quad4 vargquad;

static INLINE vmask2 vcast_vm2_aq(vargquad aq) { return vinterleave_vm2_vm2(vloadu_vm2_p(&aq)); }

static INLINE vargquad vcast_aq_vm2(vmask2 vm2) {
  vm2 = vuninterleave_vm2_vm2(vm2);
  vargquad aq;
  memcpy(&aq, &vm2, VECTLENDP * 16);
  return aq;
}
#endif // #if !defined(SLEEF_GENHEADER)

// True iff every bit of the opmask is zero (OR the two halves, then byte movemask).
static INLINE int vtestallzeros_i_vo64(vopmask g) { return _mm_movemask_epi8(_mm_or_si128(_mm256_extractf128_si256(g, 0), _mm256_extractf128_si256(g, 1))) == 0; }
// 64-bit-lane select and integer arithmetic on the mask type.
static INLINE vmask vsel_vm_vo64_vm_vm(vopmask o, vmask x, vmask y) { return _mm256_blendv_epi8(y, x, o); }
static INLINE vmask vsub64_vm_vm_vm(vmask x, vmask y) { return _mm256_sub_epi64(x, y); }
static INLINE vmask vneg64_vm_vm(vmask x) { return _mm256_sub_epi64(vcast_vm_i_i(0, 0), x); }
static INLINE vopmask vgt64_vo_vm_vm(vmask x, vmask y) { return _mm256_cmpgt_epi64(x, y); } // signed compare

// 64-bit shift-by-immediate macros; the //@ copies are presumably consumed by SLEEF's
// header generator (SLEEF_GENHEADER) — they are plain comments to the compiler.
#define vsll64_vm_vm_i(x, c) _mm256_slli_epi64(x, c)
#define vsrl64_vm_vm_i(x, c) _mm256_srli_epi64(x, c)
//@#define vsll64_vm_vm_i(x, c) _mm256_slli_epi64(x, c)
//@#define vsrl64_vm_vm_i(x, c) _mm256_srli_epi64(x, c)

// Sign-extend 4 x int32 to 4 x int64.
static INLINE vmask vcast_vm_vi(vint vi) { return _mm256_cvtepi32_epi64(vi); }
// Narrow 4 x int64 back to 4 x int32 by packing the low 32-bit halves of each lane.
static INLINE vint vcast_vi_vm(vmask vm) { return _mm_or_si128(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(_mm256_castsi256_si128(vm)), _mm_set1_ps(0), 0x08)), _mm_castps_si128(_mm_shuffle_ps(_mm_set1_ps(0), _mm_castsi128_ps(_mm256_extractf128_si256(vm, 1)), 0x80))); }


================================================
FILE: src/helperavx512f.h
================================================
// Copyright Naoki Shibata and contributors 2010 - 2020.
// Distributed under the Boost Software License, Version 1.0.
// (See accompanying file LICENSE.txt or copy at
// http://www.boost.org/LICENSE_1_0.txt)

// CONFIG == 1 is the FMA build, CONFIG == 2 the no-FMA (AVX512FNOFMA) build.
#if CONFIG == 1 || CONFIG == 2

#if !defined(__AVX512F__) && !defined(SLEEF_GENHEADER)
#error Please specify -mavx512f.
#endif

#else
#error CONFIG macro invalid or not defined
#endif

// Vector lengths: 8 doubles (2^3) and 16 floats (2^4) per 512-bit vector.
#define ENABLE_DP
//@#define ENABLE_DP
#define LOG2VECTLENDP 3
//@#define LOG2VECTLENDP 3
#define VECTLENDP (1 << LOG2VECTLENDP)
//@#define VECTLENDP (1 << LOG2VECTLENDP)

#define ENABLE_SP
//@#define ENABLE_SP
#define LOG2VECTLENSP (LOG2VECTLENDP+1)
//@#define LOG2VECTLENSP (LOG2VECTLENDP+1)
#define VECTLENSP (1 << LOG2VECTLENSP)
//@#define VECTLENSP (1 << LOG2VECTLENSP)

#if CONFIG == 1
#define ENABLE_FMA_DP
//@#define ENABLE_FMA_DP
#define ENABLE_FMA_SP
//@#define ENABLE_FMA_SP
#endif

#define FULL_FP_ROUNDING
//@#define FULL_FP_ROUNDING
#define ACCURATE_SQRT
//@#define ACCURATE_SQRT

#if !defined(SLEEF_GENHEADER)
#if defined(_MSC_VER)
// NOTE(review): the #include targets below have been stripped (likely by extraction);
// upstream SLEEF uses <intrin.h> here, an x86 intrinsics header in the #else branch,
// and <stdint.h> before "misc.h" — confirm against upstream before building.
#include
#else
#include
#endif
#include
#include "misc.h"
#endif // #if !defined(SLEEF_GENHEADER)

// AVX-512F vector types: 512-bit data vectors, 16-bit k-register opmask, 256-bit vint.
typedef __m512i vmask;
typedef __mmask16 vopmask;

typedef __m512d vdouble;
typedef __m256i vint;

typedef __m512 vfloat;
typedef __m512i vint2;

typedef struct {
  vmask x, y;
} vmask2;

//

#if !defined(SLEEF_GENHEADER)

#ifndef __SLEEF_H__
static inline void Sleef_x86CpuID(int32_t out[4], uint32_t eax, uint32_t ecx) {
  /* We don't care for cpuid detection */
  out[0] = 0xFFFFFFFF;
  out[1] = 0xFFFFFFFF;
  out[2] = 0xFFFFFFFF;
  out[3] = 0xFFFFFFFF;
}
#endif

// Bit 16 of EBX (reg[1]) from CPUID leaf 7 is the AVX-512F feature flag; with the
// stubbed Sleef_x86CpuID above this always reports "supported".
static INLINE int cpuSupportsAVX512F() {
  int32_t reg[4];
  Sleef_x86CpuID(reg, 7, 0);
  return (reg[1] & (1 << 16)) != 0;
}

#if CONFIG == 1 && defined(__AVX512F__)
static INLINE int vavailability_i(int name) {
  int d = cpuSupportsAVX512F();
  return d ? 3 : 0;
}
#define ISANAME "AVX512F"
#define DFTPRIORITY 30
#endif

#if CONFIG == 2 && defined(__AVX512F__)
static INLINE int vavailability_i(int name) {
  int d = cpuSupportsAVX512F();
  return d ? 3 : 0;
}
#define ISANAME "AVX512FNOFMA"
#define DFTPRIORITY 0
#endif

#endif // #if !defined(SLEEF_GENHEADER)

static INLINE void vprefetch_v_p(const void *ptr) { _mm_prefetch(ptr, _MM_HINT_T0); }

// All-lanes-true tests on the k-register mask: 8 lanes (0xff) for doubles,
// 16 lanes (0xffff) for floats. ICC needs mask2int to read the k-register.
#ifdef __INTEL_COMPILER
static INLINE int vtestallones_i_vo64(vopmask g) { return _mm512_mask2int(g) == 0xff; }
static INLINE int vtestallones_i_vo32(vopmask g) { return _mm512_mask2int(g) == 0xffff; }
#else
static INLINE int vtestallones_i_vo64(vopmask g) { return g == 0xff; }
static INLINE int vtestallones_i_vo32(vopmask g) { return g == 0xffff; }
#endif

//

// Unaligned integer vector loads/stores.
static vint2 vloadu_vi2_p(int32_t *p) { return _mm512_loadu_si512((__m512i const *)p); }
static void vstoreu_v_p_vi2(int32_t *p, vint2 v) { _mm512_storeu_si512((__m512i *)p, v); }
static vint vloadu_vi_p(int32_t *p) { return _mm256_loadu_si256((__m256i const *)p); }
static void vstoreu_v_p_vi(int32_t *p, vint v) { _mm256_storeu_si256((__m256i *)p, v); }

//

// Bitwise logic on full 512-bit masks; andnot(x, y) computes (~x) & y.
static INLINE vmask vand_vm_vm_vm(vmask x, vmask y) { return _mm512_and_si512(x, y); }
static INLINE vmask vandnot_vm_vm_vm(vmask x, vmask y) { return _mm512_andnot_si512(x, y); }
static INLINE vmask vor_vm_vm_vm(vmask x, vmask y) { return _mm512_or_si512(x, y); }
static INLINE vmask vxor_vm_vm_vm(vmask x, vmask y) { return _mm512_xor_si512(x, y); }

// Logic on k-register opmasks.
static INLINE vopmask vand_vo_vo_vo(vopmask x, vopmask y) { return _mm512_kand(x, y); }
static INLINE vopmask vandnot_vo_vo_vo(vopmask x, vopmask y) { return _mm512_kandn(x, y); }
static INLINE vopmask vor_vo_vo_vo(vopmask x, vopmask y) { return _mm512_kor(x, y); }
static INLINE vopmask vxor_vo_vo_vo(vopmask x, vopmask y) { return _mm512_kxor(x, y); }

// Opmask x bit-mask combinations via masked and/or:
//   vand:    lanes selected by o keep m, others become 0;
//   vandnot: lanes selected by o become 0, others keep m;
//   vor:     lanes selected by o become all-ones, others keep m.
static INLINE vmask vand_vm_vo64_vm(vopmask o, vmask m) { return _mm512_mask_and_epi64(_mm512_set1_epi32(0), o, m, m); }
static INLINE vmask vandnot_vm_vo64_vm(vopmask o, vmask m) { return _mm512_mask_and_epi64(m, o, _mm512_set1_epi32(0), _mm512_set1_epi32(0)); }
static INLINE vmask vor_vm_vo64_vm(vopmask o, vmask m) { return _mm512_mask_or_epi64(m, o, _mm512_set1_epi32(-1), _mm512_set1_epi32(-1)); }
static INLINE vmask vand_vm_vo32_vm(vopmask o, vmask m) { return _mm512_mask_and_epi32(_mm512_set1_epi32(0), o, m, m); }
static INLINE vmask vandnot_vm_vo32_vm(vopmask o, vmask m) { return _mm512_mask_and_epi32(m, o, _mm512_set1_epi32(0), _mm512_set1_epi32(0)); }
static INLINE vmask vor_vm_vo32_vm(vopmask o, vmask m) { return _mm512_mask_or_epi32(m, o, _mm512_set1_epi32(-1), _mm512_set1_epi32(-1)); }
// The same k-register serves both 32- and 64-bit lane views; casts are identity.
static INLINE vopmask vcast_vo32_vo64(vopmask o) { return o; }
static INLINE vopmask vcast_vo64_vo32(vopmask o) { return o; }

//

// double <-> int32 conversions with explicit rounding control.
static INLINE vint vrint_vi_vd(vdouble vd) { return _mm512_cvt_roundpd_epi32(vd, _MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC); }
static INLINE vint vtruncate_vi_vd(vdouble vd) { return _mm512_cvt_roundpd_epi32(vd, _MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC); }
static INLINE vdouble vcast_vd_vi(vint vi) { return _mm512_cvtepi32_pd(vi); }
static INLINE vint vcast_vi_i(int i) { return _mm256_set1_epi32(i); }
static INLINE vdouble vtruncate_vd_vd(vdouble vd) { return _mm512_roundscale_pd(vd, _MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC); }
static INLINE vdouble vrint_vd_vd(vdouble vd) { return _mm512_roundscale_pd(vd, _MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC); }
// Spread the 8 int32 into the odd 32-bit slots of a 512-bit vector (mask 0xaaaa),
// and collect them back from those slots (mask 0x00ff selects the low 8 results).
static INLINE vint2 vcastu_vi2_vi(vint vi) { return _mm512_maskz_permutexvar_epi32(0xaaaa, _mm512_set_epi32(7, 7, 6, 6, 5, 5, 4, 4, 3, 3, 2, 2, 1, 1, 0, 0), _mm512_castsi256_si512(vi)); }
static INLINE vint vcastu_vi_vi2(vint2 vi) { return _mm512_castsi512_si256(_mm512_maskz_permutexvar_epi32(0x00ff, _mm512_set_epi32(0, 0, 0, 0, 0, 0, 0, 0, 15, 13, 11, 9, 7, 5, 3, 1), vi)); }
// Broadcast the (i0, i1) 32-bit pair into all eight 64-bit lanes.
static INLINE vmask vcast_vm_i_i(int i0, int i1) { return _mm512_set_epi32(i0, i1, i0, i1, i0, i1, i0, i1, i0, i1, i0, i1, i0, i1, i0, i1); }
// 64-bit lane compare and add on the mask type.
static INLINE vopmask veq64_vo_vm_vm(vmask x, vmask y) { return _mm512_cmp_epi64_mask(x, y, _MM_CMPINT_EQ); }
static INLINE vmask vadd64_vm_vm_vm(vmask x, vmask y) { return _mm512_add_epi64(x, y); }

//

static INLINE vdouble vcast_vd_d(double d) { return _mm512_set1_pd(d); }
// Bit-pattern reinterpretations (no value conversion).
static INLINE vmask vreinterpret_vm_vd(vdouble vd) { return _mm512_castpd_si512(vd); }
static INLINE vdouble vreinterpret_vd_vm(vmask vm) { return _mm512_castsi512_pd(vm); }
static INLINE vint2 vreinterpret_vi2_vd(vdouble vd) { return _mm512_castpd_si512(vd); }
static INLINE vdouble vreinterpret_vd_vi2(vint2 vi) { return _mm512_castsi512_pd(vi); }
// Double-precision arithmetic on 8 doubles.
static INLINE vdouble vadd_vd_vd_vd(vdouble x, vdouble y) { return _mm512_add_pd(x, y); }
static INLINE vdouble vsub_vd_vd_vd(vdouble x, vdouble y) { return _mm512_sub_pd(x, y); }
static INLINE vdouble vmul_vd_vd_vd(vdouble x, vdouble y) { return _mm512_mul_pd(x, y); }
static INLINE vdouble vdiv_vd_vd_vd(vdouble x, vdouble y) { return _mm512_div_pd(x, y); }
static INLINE vdouble vrec_vd_vd(vdouble x) { return _mm512_div_pd(_mm512_set1_pd(1), x); } // exact 1/x via division
static INLINE vdouble vsqrt_vd_vd(vdouble x) { return _mm512_sqrt_pd(x); }
// |d| clears the sign bit; -d flips it (pure bit manipulation).
static INLINE vdouble vabs_vd_vd(vdouble d) { return vreinterpret_vd_vm(_mm512_andnot_si512(vreinterpret_vm_vd(_mm512_set1_pd(-0.0)), vreinterpret_vm_vd(d))); }
static INLINE vdouble vneg_vd_vd(vdouble d) { return vreinterpret_vd_vm(_mm512_xor_si512(vreinterpret_vm_vd(_mm512_set1_pd(-0.0)), vreinterpret_vm_vd(d))); }
static INLINE vdouble vmax_vd_vd_vd(vdouble x, vdouble y) { return _mm512_max_pd(x, y); }
static INLINE vdouble vmin_vd_vd_vd(vdouble x, vdouble y) { return _mm512_min_pd(x, y); }

// vmla family: fused under CONFIG == 1, plain mul/add otherwise.
// vmla = x*y + z, vmlapn = x*y - z, vmlanp = -(x*y) + z.
#if CONFIG == 1
static INLINE vdouble vmla_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return _mm512_fmadd_pd(x, y, z); }
static INLINE vdouble vmlapn_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return _mm512_fmsub_pd(x, y, z); }
static INLINE vdouble vmlanp_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return _mm512_fnmadd_pd(x, y, z); }
#else
static INLINE vdouble vmla_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return vadd_vd_vd_vd(vmul_vd_vd_vd(x, y), z); }
static INLINE vdouble vmlapn_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return vsub_vd_vd_vd(vmul_vd_vd_vd(x, y), z); }
#endif

// Always-fused variants: pp = x*y+z, pn = x*y-z, np = -(x*y)+z, nn = -(x*y)-z.
static INLINE vdouble vfma_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return _mm512_fmadd_pd(x, y, z); }
static INLINE vdouble vfmapp_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return _mm512_fmadd_pd(x, y, z); }
static INLINE vdouble vfmapn_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return _mm512_fmsub_pd(x, y, z); }
static INLINE vdouble vfmanp_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return _mm512_fnmadd_pd(x, y, z); }
static INLINE vdouble vfmann_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return _mm512_fnmsub_pd(x, y, z); }

// Double comparisons producing k-register masks.
static INLINE vopmask veq_vo_vd_vd(vdouble x, vdouble y) { return _mm512_cmp_pd_mask(x, y, _CMP_EQ_OQ); }
static INLINE vopmask vneq_vo_vd_vd(vdouble x, vdouble y) { return _mm512_cmp_pd_mask(x, y, _CMP_NEQ_UQ); }
static INLINE vopmask vlt_vo_vd_vd(vdouble x, vdouble y) { return _mm512_cmp_pd_mask(x, y, _CMP_LT_OQ); }
static INLINE vopmask vle_vo_vd_vd(vdouble x, vdouble y) { return _mm512_cmp_pd_mask(x, y, _CMP_LE_OQ); }
static INLINE vopmask vgt_vo_vd_vd(vdouble x, vdouble y) { return _mm512_cmp_pd_mask(x, y, _CMP_GT_OQ); }
static INLINE vopmask vge_vo_vd_vd(vdouble x, vdouble y) { return _mm512_cmp_pd_mask(x, y, _CMP_GE_OQ); }

//

// 8 x int32 (vint = __m256i on AVX-512) arithmetic and logic.
static INLINE vint vadd_vi_vi_vi(vint x, vint y) { return _mm256_add_epi32(x, y); }
static INLINE vint vsub_vi_vi_vi(vint x, vint y) { return _mm256_sub_epi32(x, y); }
static INLINE vint vneg_vi_vi(vint e) { return vsub_vi_vi_vi(vcast_vi_i(0), e); }
static INLINE vint vand_vi_vi_vi(vint x, vint y) { return _mm256_and_si256(x, y); }
static INLINE vint vandnot_vi_vi_vi(vint x, vint y) { return _mm256_andnot_si256(x, y); }
// Opmask-masked variants: widen vint to 512 bits, apply the masked op, narrow back.
// vandnot: lanes selected by o become 0, others keep y; vand: lanes selected by o keep y.
static INLINE vint vandnot_vi_vo_vi(vopmask o, vint y) { return _mm512_castsi512_si256(_mm512_mask_and_epi32(_mm512_castsi256_si512(y), o, _mm512_set1_epi32(0), _mm512_set1_epi32(0))); }
static INLINE vint vand_vi_vo_vi(vopmask o, vint y) { return _mm512_castsi512_si256(_mm512_mask_and_epi32(_mm512_set1_epi32(0), o, _mm512_castsi256_si512(y), _mm512_castsi256_si512(y))); }
static INLINE vint vor_vi_vi_vi(vint x, vint y) { return _mm256_or_si256(x, y); }
static INLINE vint vxor_vi_vi_vi(vint x, vint y) { return _mm256_xor_si256(x, y); }
// Shift-by-immediate macros (//@ copies presumably feed the generated header).
#define vsll_vi_vi_i(x, c) _mm256_slli_epi32(x, c)
#define vsrl_vi_vi_i(x, c) _mm256_srli_epi32(x, c)
#define vsra_vi_vi_i(x, c) _mm256_srai_epi32(x, c)
//@#define vsll_vi_vi_i(x, c) _mm256_slli_epi32(x, c)
//@#define vsrl_vi_vi_i(x, c) _mm256_srli_epi32(x, c)
//@#define vsra_vi_vi_i(x, c) _mm256_srai_epi32(x, c)
static INLINE vint veq_vi_vi_vi(vint x, vint y) { return _mm256_cmpeq_epi32(x, y); }
static INLINE vint vgt_vi_vi_vi(vint x, vint y) { return _mm256_cmpgt_epi32(x, y); }
// Opmask-producing compares; x > y is expressed as y < x.
static INLINE vopmask veq_vo_vi_vi(vint x, vint y) { return _mm512_cmp_epi32_mask(_mm512_castsi256_si512(x), _mm512_castsi256_si512(y), _MM_CMPINT_EQ); }
static INLINE vopmask vgt_vo_vi_vi(vint x, vint y) { return _mm512_cmp_epi32_mask(_mm512_castsi256_si512(y), _mm512_castsi256_si512(x), _MM_CMPINT_LT); }
// Opmask-driven selects: mask ? x : y.
static INLINE vdouble vsel_vd_vo_vd_vd(vopmask mask, vdouble x, vdouble y) { return _mm512_mask_blend_pd(mask, y, x); }
static INLINE CONST vdouble vsel_vd_vo_d_d(vopmask o, double v1, double v0) { return vsel_vd_vo_vd_vd(o, vcast_vd_d(v1), vcast_vd_d(v0)); }

#if 1 // Probably this is faster
static INLINE vdouble vsel_vd_vo_vo_vo_d_d_d_d(vopmask o0, vopmask o1, vopmask o2, double d0, double d1, double d2, double d3) { __m512i v = _mm512_castpd_si512(vsel_vd_vo_vd_vd(o0, _mm512_castsi512_pd(_mm512_set_epi64(0, 0, 0, 0, 0, 0, 0, 0)), vsel_vd_vo_vd_vd(o1, _mm512_castsi512_pd(_mm512_set_epi64(1, 1, 1, 1, 1, 1, 1, 1)), vsel_vd_vo_vd_vd(o2, _mm512_castsi512_pd(_mm512_set_epi64(2, 2, 2, 2, 2, 2, 2, 2)), _mm512_castsi512_pd(_mm512_set_epi64(3, 3, 3, 3, 3, 3, 3, 3)))))); return _mm512_permutexvar_pd(v, _mm512_castpd256_pd512(_mm256_set_pd(d3, d2, d1, d0))); } static INLINE vdouble vsel_vd_vo_vo_d_d_d(vopmask o0, vopmask o1, double d0, double d1, double d2) { return vsel_vd_vo_vo_vo_d_d_d_d(o0, o1, o1, d0, d1, d2, d2); } #else static INLINE vdouble vsel_vd_vo_vo_d_d_d(vopmask o0, vopmask o1, double d0, double d1, double d2) { return vsel_vd_vo_vd_vd(o0, vcast_vd_d(d0), vsel_vd_vo_d_d(o1, d1, d2)); } static INLINE vdouble vsel_vd_vo_vo_vo_d_d_d_d(vopmask o0, vopmask o1, vopmask o2, double d0, double d1, double d2, double d3) { return vsel_vd_vo_vd_vd(o0, vcast_vd_d(d0), vsel_vd_vo_vd_vd(o1, vcast_vd_d(d1), vsel_vd_vo_d_d(o2, d2, d3))); } #endif static INLINE vopmask visinf_vo_vd(vdouble d) { return _mm512_cmp_pd_mask(vabs_vd_vd(d), _mm512_set1_pd(SLEEF_INFINITY), _CMP_EQ_OQ); } static INLINE vopmask vispinf_vo_vd(vdouble d) { return _mm512_cmp_pd_mask(d, _mm512_set1_pd(SLEEF_INFINITY), _CMP_EQ_OQ); } static INLINE vopmask visminf_vo_vd(vdouble d) { return _mm512_cmp_pd_mask(d, _mm512_set1_pd(-SLEEF_INFINITY), _CMP_EQ_OQ); } static INLINE vopmask visnan_vo_vd(vdouble d) { return _mm512_cmp_pd_mask(d, d, _CMP_NEQ_UQ); } static INLINE vint vilogbk_vi_vd(vdouble d) { return vrint_vi_vd(_mm512_getexp_pd(d)); } // vilogb2k_vi_vd is similar to vilogbk_vi_vd, but the argument has to // be a normalized FP value. 
static INLINE vint vilogb2k_vi_vd(vdouble d) { return vrint_vi_vd(_mm512_getexp_pd(d)); }
// Exponent / mantissa extraction using the AVX-512 getexp/getmant instructions;
// getmant normalizes the mantissa into [0.75, 1.5) and propagates NaN signs.
static INLINE vdouble vgetexp_vd_vd(vdouble d) { return _mm512_getexp_pd(d); }
static INLINE vfloat vgetexp_vf_vf(vfloat d) { return _mm512_getexp_ps(d); }

static INLINE vdouble vgetmant_vd_vd(vdouble d) { return _mm512_getmant_pd(d, _MM_MANT_NORM_p75_1p5, _MM_MANT_SIGN_nan); }
static INLINE vfloat vgetmant_vf_vf(vfloat d) { return _mm512_getmant_ps(d, _MM_MANT_NORM_p75_1p5, _MM_MANT_SIGN_nan); }

// fixupimm wrappers; kept as macros since imm must be a compile-time constant.
#define vfixup_vd_vd_vd_vi2_i(a, b, c, imm) _mm512_fixupimm_pd((a), (b), (c), (imm))
#define vfixup_vf_vf_vf_vi2_i(a, b, c, imm) _mm512_fixupimm_ps((a), (b), (c), (imm))
//@#define vfixup_vd_vd_vd_vi2_i(a, b, c, imm) _mm512_fixupimm_pd((a), (b), (c), (imm))
//@#define vfixup_vf_vf_vf_vi2_i(a, b, c, imm) _mm512_fixupimm_ps((a), (b), (c), (imm))

#if defined(_MSC_VER)
// This function is needed when debugging on MSVC.
static INLINE double vcast_d_vd(vdouble v) { double s[VECTLENDP]; _mm512_storeu_pd(s, v); return s[0]; }
#endif

// Loads/stores of 8 doubles; note _mm512_i32gather_pd takes (index, base) order,
// the reverse of the AVX2 intrinsic.
static INLINE vdouble vload_vd_p(const double *ptr) { return _mm512_load_pd(ptr); }
static INLINE vdouble vloadu_vd_p(const double *ptr) { return _mm512_loadu_pd(ptr); }
static INLINE void vstore_v_p_vd(double *ptr, vdouble v) { _mm512_store_pd(ptr, v); }
static INLINE void vstoreu_v_p_vd(double *ptr, vdouble v) { _mm512_storeu_pd(ptr, v); }
static INLINE vdouble vgather_vd_p_vi(const double *ptr, vint vi) { return _mm512_i32gather_pd(vi, ptr, 8); }

//

// Opmask-driven vint select: widen to 512 bits, blend, narrow back.
static INLINE vint vsel_vi_vo_vi_vi(vopmask m, vint x, vint y) { return _mm512_castsi512_si256(_mm512_mask_blend_epi32(m, _mm512_castsi256_si512(y), _mm512_castsi256_si512(x))); }

//

// Bit-pattern reinterpretations between the 512-bit float/int/double views.
static INLINE vmask vreinterpret_vm_vf(vfloat vf) { return _mm512_castps_si512(vf); }
static INLINE vfloat vreinterpret_vf_vm(vmask vm) { return _mm512_castsi512_ps(vm); }
static INLINE vfloat vreinterpret_vf_vi2(vint2 vi) { return _mm512_castsi512_ps(vi); }
static INLINE vint2 vreinterpret_vi2_vf(vfloat vf) { return _mm512_castps_si512(vf); }
static INLINE vdouble vreinterpret_vd_vf(vfloat vf) { return _mm512_castps_pd(vf); }
static INLINE vfloat vreinterpret_vf_vd(vdouble vd) { return _mm512_castpd_ps(vd); }
// vint2 and vmask are both __m512i; identity casts.
static INLINE vint2 vcast_vi2_vm(vmask vm) { return vm; }
static INLINE vmask vcast_vm_vi2(vint2 vi) { return vi; }
static INLINE vfloat vcast_vf_vi2(vint2 vi) { return _mm512_cvtepi32_ps(vcast_vm_vi2(vi)); }

static INLINE vfloat vcast_vf_f(float f) { return _mm512_set1_ps(f); }
static INLINE vint2 vcast_vi2_i(int i) { return _mm512_set1_epi32(i); }
// float <-> int32 conversions and float roundings with explicit modes.
static INLINE vint2 vrint_vi2_vf(vfloat vf) { return vcast_vi2_vm(_mm512_cvtps_epi32(vf)); }
static INLINE vint2 vtruncate_vi2_vf(vfloat vf) { return vcast_vi2_vm(_mm512_cvttps_epi32(vf)); }
static INLINE vfloat vtruncate_vf_vf(vfloat vd) { return _mm512_roundscale_ps(vd, _MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC); }
static INLINE vfloat vrint_vf_vf(vfloat vd) { return _mm512_roundscale_ps(vd, _MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC); }
// Single-precision arithmetic on 16 floats.
static INLINE vfloat vadd_vf_vf_vf(vfloat x, vfloat y) { return _mm512_add_ps(x, y); }
static INLINE vfloat vsub_vf_vf_vf(vfloat x, vfloat y) { return _mm512_sub_ps(x, y); }
static INLINE vfloat vmul_vf_vf_vf(vfloat x, vfloat y) { return _mm512_mul_ps(x, y); }
static INLINE vfloat vdiv_vf_vf_vf(vfloat x, vfloat y) { return _mm512_div_ps(x, y); }
static INLINE vfloat vrec_vf_vf(vfloat x) { return vdiv_vf_vf_vf(vcast_vf_f(1.0f), x); } // exact 1/x via division
static INLINE vfloat vsqrt_vf_vf(vfloat x) { return _mm512_sqrt_ps(x); }
// |f| clears the sign bit; -f flips it (pure bit manipulation).
static INLINE vfloat vabs_vf_vf(vfloat f) { return vreinterpret_vf_vm(vandnot_vm_vm_vm(vreinterpret_vm_vf(vcast_vf_f(-0.0f)), vreinterpret_vm_vf(f))); }
static INLINE vfloat vneg_vf_vf(vfloat d) { return vreinterpret_vf_vm(vxor_vm_vm_vm(vreinterpret_vm_vf(vcast_vf_f(-0.0f)), vreinterpret_vm_vf(d))); }
static INLINE vfloat vmax_vf_vf_vf(vfloat x, vfloat y) { return _mm512_max_ps(x, y); }
static INLINE vfloat vmin_vf_vf_vf(vfloat x, vfloat y) { return _mm512_min_ps(x, y); }

// vmla family: fused under CONFIG == 1, plain mul/add otherwise.
// vmla = x*y + z, vmlapn = x*y - z, vmlanp = z - x*y.
#if CONFIG == 1
static INLINE vfloat vmla_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return _mm512_fmadd_ps(x, y, z); }
static INLINE vfloat vmlapn_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return _mm512_fmsub_ps(x, y, z); }
static INLINE vfloat vmlanp_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return _mm512_fnmadd_ps(x, y, z); }
#else
static INLINE vfloat vmla_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return vadd_vf_vf_vf(vmul_vf_vf_vf(x, y), z); }
static INLINE vfloat vmlanp_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return vsub_vf_vf_vf(z, vmul_vf_vf_vf(x, y)); }
static INLINE vfloat vmlapn_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return vsub_vf_vf_vf(vmul_vf_vf_vf(x, y), z); }
#endif

// Always-fused variants: pp = x*y+z, pn = x*y-z, np = -(x*y)+z, nn = -(x*y)-z.
static INLINE vfloat vfma_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return _mm512_fmadd_ps(x, y, z); }
static INLINE vfloat vfmapp_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return _mm512_fmadd_ps(x, y, z); }
static INLINE vfloat vfmapn_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return _mm512_fmsub_ps(x, y, z); }
static INLINE vfloat vfmanp_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return _mm512_fnmadd_ps(x, y, z); }
static INLINE vfloat vfmann_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return _mm512_fnmsub_ps(x, y, z); }

// Float comparisons producing k-register masks.
static INLINE vopmask veq_vo_vf_vf(vfloat x, vfloat y) { return _mm512_cmp_ps_mask(x, y, _CMP_EQ_OQ); }
static INLINE vopmask vneq_vo_vf_vf(vfloat x, vfloat y) { return _mm512_cmp_ps_mask(x, y, _CMP_NEQ_UQ); }
static INLINE vopmask vlt_vo_vf_vf(vfloat x, vfloat y) { return _mm512_cmp_ps_mask(x, y, _CMP_LT_OQ); }
static INLINE vopmask vle_vo_vf_vf(vfloat x, vfloat y) { return _mm512_cmp_ps_mask(x, y, _CMP_LE_OQ); }
static INLINE vopmask vgt_vo_vf_vf(vfloat x, vfloat y) { return _mm512_cmp_ps_mask(x, y, _CMP_GT_OQ); }
static INLINE vopmask vge_vo_vf_vf(vfloat x, vfloat y) { return _mm512_cmp_ps_mask(x, y, _CMP_GE_OQ); }

// 16 x int32 arithmetic.
static INLINE vint2 vadd_vi2_vi2_vi2(vint2 x, vint2 y) { return _mm512_add_epi32(x, y); }
// Definition continues past this chunk.
static INLINE vint2 vsub_vi2_vi2_vi2(vint2 x, vint2 y) {
return _mm512_sub_epi32(x, y); } static INLINE vint2 vneg_vi2_vi2(vint2 e) { return vsub_vi2_vi2_vi2(vcast_vi2_i(0), e); } static INLINE vint2 vand_vi2_vi2_vi2(vint2 x, vint2 y) { return _mm512_and_si512(x, y); } static INLINE vint2 vandnot_vi2_vi2_vi2(vint2 x, vint2 y) { return _mm512_andnot_si512(x, y); } static INLINE vint2 vor_vi2_vi2_vi2(vint2 x, vint2 y) { return _mm512_or_si512(x, y); } static INLINE vint2 vxor_vi2_vi2_vi2(vint2 x, vint2 y) { return _mm512_xor_si512(x, y); } static INLINE vint2 vand_vi2_vo_vi2(vopmask o, vint2 m) { return _mm512_mask_and_epi32(_mm512_set1_epi32(0), o, m, m); } static INLINE vint2 vandnot_vi2_vo_vi2(vopmask o, vint2 m) { return _mm512_mask_and_epi32(m, o, _mm512_set1_epi32(0), _mm512_set1_epi32(0)); } #define vsll_vi2_vi2_i(x, c) _mm512_slli_epi32(x, c) #define vsrl_vi2_vi2_i(x, c) _mm512_srli_epi32(x, c) #define vsra_vi2_vi2_i(x, c) _mm512_srai_epi32(x, c) //@#define vsll_vi2_vi2_i(x, c) _mm512_slli_epi32(x, c) //@#define vsrl_vi2_vi2_i(x, c) _mm512_srli_epi32(x, c) //@#define vsra_vi2_vi2_i(x, c) _mm512_srai_epi32(x, c) static INLINE vopmask veq_vo_vi2_vi2(vint2 x, vint2 y) { return _mm512_cmpeq_epi32_mask(x, y); } static INLINE vopmask vgt_vo_vi2_vi2(vint2 x, vint2 y) { return _mm512_cmpgt_epi32_mask(x, y); } static INLINE vint2 veq_vi2_vi2_vi2(vint2 x, vint2 y) { __mmask16 m = _mm512_cmp_epi32_mask(x, y, _MM_CMPINT_EQ); return _mm512_mask_and_epi32(_mm512_set1_epi32(0), m, _mm512_set1_epi32(-1), _mm512_set1_epi32(-1)); } static INLINE vint2 vgt_vi2_vi2_vi2(vint2 x, vint2 y) { __mmask16 m = _mm512_cmp_epi32_mask(y, x, _MM_CMPINT_LT); return _mm512_mask_and_epi32(_mm512_set1_epi32(0), m, _mm512_set1_epi32(-1), _mm512_set1_epi32(-1)); } static INLINE vint2 vsel_vi2_vo_vi2_vi2(vopmask m, vint2 x, vint2 y) { return _mm512_mask_blend_epi32(m, y, x); } static INLINE vfloat vsel_vf_vo_vf_vf(vopmask m, vfloat x, vfloat y) { return _mm512_mask_blend_ps(m, y, x); } // At this point, the following three functions are implemented in a 
generic way, // but I will try target-specific optimization later on. static INLINE CONST vfloat vsel_vf_vo_f_f(vopmask o, float v1, float v0) { return vsel_vf_vo_vf_vf(o, vcast_vf_f(v1), vcast_vf_f(v0)); } static INLINE vfloat vsel_vf_vo_vo_f_f_f(vopmask o0, vopmask o1, float d0, float d1, float d2) { return vsel_vf_vo_vf_vf(o0, vcast_vf_f(d0), vsel_vf_vo_f_f(o1, d1, d2)); } static INLINE vfloat vsel_vf_vo_vo_vo_f_f_f_f(vopmask o0, vopmask o1, vopmask o2, float d0, float d1, float d2, float d3) { return vsel_vf_vo_vf_vf(o0, vcast_vf_f(d0), vsel_vf_vo_vf_vf(o1, vcast_vf_f(d1), vsel_vf_vo_f_f(o2, d2, d3))); } static INLINE vopmask visinf_vo_vf(vfloat d) { return veq_vo_vf_vf(vabs_vf_vf(d), vcast_vf_f(SLEEF_INFINITYf)); } static INLINE vopmask vispinf_vo_vf(vfloat d) { return veq_vo_vf_vf(d, vcast_vf_f(SLEEF_INFINITYf)); } static INLINE vopmask visminf_vo_vf(vfloat d) { return veq_vo_vf_vf(d, vcast_vf_f(-SLEEF_INFINITYf)); } static INLINE vopmask visnan_vo_vf(vfloat d) { return vneq_vo_vf_vf(d, d); } static INLINE vint2 vilogbk_vi2_vf(vfloat d) { return vrint_vi2_vf(_mm512_getexp_ps(d)); } static INLINE vint2 vilogb2k_vi2_vf(vfloat d) { return vrint_vi2_vf(_mm512_getexp_ps(d)); } #ifdef _MSC_VER // This function is needed when debugging on MSVC. 
static INLINE float vcast_f_vf(vfloat v) { float s[VECTLENSP]; _mm512_storeu_ps(s, v); return s[0]; } #endif static INLINE vfloat vload_vf_p(const float *ptr) { return _mm512_load_ps(ptr); } static INLINE vfloat vloadu_vf_p(const float *ptr) { return _mm512_loadu_ps(ptr); } static INLINE void vstore_v_p_vf(float *ptr, vfloat v) { _mm512_store_ps(ptr, v); } static INLINE void vstoreu_v_p_vf(float *ptr, vfloat v) { _mm512_storeu_ps(ptr, v); } static INLINE vfloat vgather_vf_p_vi2(const float *ptr, vint2 vi2) { return _mm512_i32gather_ps(vi2, ptr, 4); } // static INLINE vdouble vposneg_vd_vd(vdouble d) { return vreinterpret_vd_vm(_mm512_mask_xor_epi32(vreinterpret_vm_vd(d), 0xcccc, vreinterpret_vm_vd(d), vreinterpret_vm_vd(_mm512_set1_pd(-0.0)))); } static INLINE vdouble vnegpos_vd_vd(vdouble d) { return vreinterpret_vd_vm(_mm512_mask_xor_epi32(vreinterpret_vm_vd(d), 0x3333, vreinterpret_vm_vd(d), vreinterpret_vm_vd(_mm512_set1_pd(-0.0)))); } static INLINE vfloat vposneg_vf_vf(vfloat d) { return vreinterpret_vf_vm(_mm512_mask_xor_epi32(vreinterpret_vm_vf(d), 0xaaaa, vreinterpret_vm_vf(d), vreinterpret_vm_vf(_mm512_set1_ps(-0.0f)))); } static INLINE vfloat vnegpos_vf_vf(vfloat d) { return vreinterpret_vf_vm(_mm512_mask_xor_epi32(vreinterpret_vm_vf(d), 0x5555, vreinterpret_vm_vf(d), vreinterpret_vm_vf(_mm512_set1_ps(-0.0f)))); } static INLINE vdouble vsubadd_vd_vd_vd(vdouble x, vdouble y) { return vadd_vd_vd_vd(x, vnegpos_vd_vd(y)); } static INLINE vfloat vsubadd_vf_vf_vf(vfloat x, vfloat y) { return vadd_vf_vf_vf(x, vnegpos_vf_vf(y)); } static INLINE vdouble vmlsubadd_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return _mm512_fmaddsub_pd(x, y, z); } static INLINE vfloat vmlsubadd_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return _mm512_fmaddsub_ps(x, y, z); } static INLINE vdouble vrev21_vd_vd(vdouble vd) { return _mm512_permute_pd(vd, 0x55); } static INLINE vdouble vreva2_vd_vd(vdouble vd) { return vreinterpret_vd_vm(_mm512_permutexvar_epi32(_mm512_set_epi32(3, 2, 
1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12), vreinterpret_vm_vd(vd))); } static INLINE void vstream_v_p_vd(double *ptr, vdouble v) { _mm512_stream_pd(ptr, v); } static INLINE void vscatter2_v_p_i_i_vd(double *ptr, int offset, int step, vdouble v) { _mm_store_pd(&ptr[(offset + step * 0)*2], _mm_castps_pd(_mm512_extractf32x4_ps(vreinterpret_vf_vd(v), 0))); _mm_store_pd(&ptr[(offset + step * 1)*2], _mm_castps_pd(_mm512_extractf32x4_ps(vreinterpret_vf_vd(v), 1))); _mm_store_pd(&ptr[(offset + step * 2)*2], _mm_castps_pd(_mm512_extractf32x4_ps(vreinterpret_vf_vd(v), 2))); _mm_store_pd(&ptr[(offset + step * 3)*2], _mm_castps_pd(_mm512_extractf32x4_ps(vreinterpret_vf_vd(v), 3))); } static INLINE void vsscatter2_v_p_i_i_vd(double *ptr, int offset, int step, vdouble v) { _mm_stream_pd(&ptr[(offset + step * 0)*2], _mm_castps_pd(_mm512_extractf32x4_ps(vreinterpret_vf_vd(v), 0))); _mm_stream_pd(&ptr[(offset + step * 1)*2], _mm_castps_pd(_mm512_extractf32x4_ps(vreinterpret_vf_vd(v), 1))); _mm_stream_pd(&ptr[(offset + step * 2)*2], _mm_castps_pd(_mm512_extractf32x4_ps(vreinterpret_vf_vd(v), 2))); _mm_stream_pd(&ptr[(offset + step * 3)*2], _mm_castps_pd(_mm512_extractf32x4_ps(vreinterpret_vf_vd(v), 3))); } // static INLINE vfloat vrev21_vf_vf(vfloat vf) { return _mm512_permute_ps(vf, 0xb1); } static INLINE vint2 vrev21_vi2_vi2(vint2 i) { return vreinterpret_vi2_vf(vrev21_vf_vf(vreinterpret_vf_vi2(i))); } static INLINE vfloat vreva2_vf_vf(vfloat vf) { return vreinterpret_vf_vm(_mm512_permutexvar_epi32(_mm512_set_epi32(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14), vreinterpret_vm_vf(vf))); } static INLINE void vstream_v_p_vf(float *ptr, vfloat v) { _mm512_stream_ps(ptr, v); } static INLINE void vscatter2_v_p_i_i_vf(float *ptr, int offset, int step, vfloat v) { _mm_storel_pd((double *)(ptr+(offset + step * 0)*2), _mm_castps_pd(_mm512_extractf32x4_ps(v, 0))); _mm_storeh_pd((double *)(ptr+(offset + step * 1)*2), _mm_castps_pd(_mm512_extractf32x4_ps(v, 0))); 
_mm_storel_pd((double *)(ptr+(offset + step * 2)*2), _mm_castps_pd(_mm512_extractf32x4_ps(v, 1))); _mm_storeh_pd((double *)(ptr+(offset + step * 3)*2), _mm_castps_pd(_mm512_extractf32x4_ps(v, 1))); _mm_storel_pd((double *)(ptr+(offset + step * 4)*2), _mm_castps_pd(_mm512_extractf32x4_ps(v, 2))); _mm_storeh_pd((double *)(ptr+(offset + step * 5)*2), _mm_castps_pd(_mm512_extractf32x4_ps(v, 2))); _mm_storel_pd((double *)(ptr+(offset + step * 6)*2), _mm_castps_pd(_mm512_extractf32x4_ps(v, 3))); _mm_storeh_pd((double *)(ptr+(offset + step * 7)*2), _mm_castps_pd(_mm512_extractf32x4_ps(v, 3))); } static INLINE void vsscatter2_v_p_i_i_vf(float *ptr, int offset, int step, vfloat v) { vscatter2_v_p_i_i_vf(ptr, offset, step, v); } // static INLINE vmask2 vinterleave_vm2_vm2(vmask2 v) { return (vmask2) { _mm512_unpacklo_epi64(v.x, v.y), _mm512_unpackhi_epi64(v.x, v.y) }; } static INLINE vmask2 vuninterleave_vm2_vm2(vmask2 v) { return (vmask2) { _mm512_unpacklo_epi64(v.x, v.y), _mm512_unpackhi_epi64(v.x, v.y) }; } static INLINE vint vuninterleave_vi_vi(vint v) { return _mm256_permutevar8x32_epi32(v, _mm256_set_epi32(7, 5, 3, 1, 6, 4, 2, 0)); } static INLINE vdouble vinterleave_vd_vd(vdouble vd) { return vreinterpret_vd_vm(_mm512_permutexvar_epi32(_mm512_set_epi32(15, 14, 7, 6, 13, 12, 5, 4, 11, 10, 3, 2, 9, 8, 1, 0), vreinterpret_vm_vd(vd))); } static INLINE vdouble vuninterleave_vd_vd(vdouble vd) { return vreinterpret_vd_vm(_mm512_permutexvar_epi32(_mm512_set_epi32(15, 14, 11, 10, 7, 6, 3, 2, 13, 12, 9, 8, 5, 4, 1, 0), vreinterpret_vm_vd(vd))); } static INLINE vmask vinterleave_vm_vm(vmask vm) { return _mm512_permutexvar_epi32(_mm512_set_epi32(15, 14, 7, 6, 13, 12, 5, 4, 11, 10, 3, 2, 9, 8, 1, 0), vm); } static INLINE vmask vuninterleave_vm_vm(vmask vm) { return _mm512_permutexvar_epi32(_mm512_set_epi32(15, 14, 11, 10, 7, 6, 3, 2, 13, 12, 9, 8, 5, 4, 1, 0), vm); } static vmask2 vloadu_vm2_p(void *p) { vmask2 vm2; memcpy(&vm2, p, VECTLENDP * 16); return vm2; } #if 
!defined(SLEEF_GENHEADER) typedef Sleef_quad8 vargquad; static INLINE vmask2 vcast_vm2_aq(vargquad aq) { return vinterleave_vm2_vm2(vloadu_vm2_p(&aq)); } static INLINE vargquad vcast_aq_vm2(vmask2 vm2) { vm2 = vuninterleave_vm2_vm2(vm2); vargquad aq; memcpy(&aq, &vm2, VECTLENDP * 16); return aq; } #endif // #if !defined(SLEEF_GENHEADER) #ifdef __INTEL_COMPILER static INLINE int vtestallzeros_i_vo64(vopmask g) { return _mm512_mask2int(g) == 0; } #else static INLINE int vtestallzeros_i_vo64(vopmask g) { return g == 0; } #endif static INLINE vmask vsel_vm_vo64_vm_vm(vopmask m, vmask x, vmask y) { return _mm512_mask_blend_epi64(m, y, x); } static INLINE vmask vsub64_vm_vm_vm(vmask x, vmask y) { return _mm512_sub_epi64(x, y); } static INLINE vmask vneg64_vm_vm(vmask x) { return _mm512_sub_epi64(vcast_vm_i_i(0, 0), x); } static INLINE vopmask vgt64_vo_vm_vm(vmask x, vmask y) { return _mm512_cmp_epi64_mask(y, x, _MM_CMPINT_LT); } // signed compare #define vsll64_vm_vm_i(x, c) _mm512_slli_epi64(x, c) #define vsrl64_vm_vm_i(x, c) _mm512_srli_epi64(x, c) //@#define vsll64_vm_vm_i(x, c) _mm512_slli_epi64(x, c) //@#define vsrl64_vm_vm_i(x, c) _mm512_srli_epi64(x, c) static INLINE vmask vcast_vm_vi(vint vi) { return _mm512_cvtepi32_epi64(vi); } static INLINE vint vcast_vi_vm(vmask vm) { return _mm512_cvtepi64_epi32(vm); } ================================================ FILE: src/helperneon32.h ================================================ // Copyright Naoki Shibata and contributors 2010 - 2020. // Distributed under the Boost Software License, Version 1.0. // (See accompanying file LICENSE.txt or copy at // http://www.boost.org/LICENSE_1_0.txt) #ifndef __ARM_NEON #error Please specify -mfpu=neon. #endif #ifdef __aarch64__ #warning This implementation is for AARCH32. 
#endif #define ENABLE_SP //@#define ENABLE_SP #define LOG2VECTLENSP 2 //@#define LOG2VECTLENSP 2 #define VECTLENSP (1 << LOG2VECTLENSP) //@#define VECTLENSP (1 << LOG2VECTLENSP) #if CONFIG == 4 #define ISANAME "AARCH32 NEON-VFPV4" #define ENABLE_FMA_SP //@#define ENABLE_FMA_SP #else #define ISANAME "AARCH32 NEON" #endif #define DFTPRIORITY 10 #define ENABLE_RECSQRT_SP //@#define ENABLE_RECSQRT_SP #include #include #include "misc.h" typedef uint32x4_t vmask; typedef uint32x4_t vopmask; //typedef int32x4_t vint; typedef float32x4_t vfloat; typedef int32x4_t vint2; // static INLINE void vprefetch_v_p(const void *ptr) { } static INLINE int vtestallones_i_vo32(vopmask g) { uint32x2_t x0 = vand_u32(vget_low_u32(g), vget_high_u32(g)); uint32x2_t x1 = vpmin_u32(x0, x0); return vget_lane_u32(x1, 0); } static vfloat vloaduf(float *p) { return vld1q_f32(p); } static void vstoreuf(float *p, vfloat v) { vst1q_f32(p, v); } static vint2 vloadu_vi2_p(int32_t *p) { return vld1q_s32(p); } static void vstoreu_v_p_vi2(int32_t *p, vint2 v) { vst1q_s32(p, v); } // static INLINE vmask vand_vm_vm_vm(vmask x, vmask y) { return vandq_u32(x, y); } static INLINE vmask vandnot_vm_vm_vm(vmask x, vmask y) { return vbicq_u32(y, x); } static INLINE vmask vor_vm_vm_vm(vmask x, vmask y) { return vorrq_u32(x, y); } static INLINE vmask vxor_vm_vm_vm(vmask x, vmask y) { return veorq_u32(x, y); } static INLINE vopmask vand_vo_vo_vo(vopmask x, vopmask y) { return vandq_u32(x, y); } static INLINE vopmask vandnot_vo_vo_vo(vopmask x, vopmask y) { return vbicq_u32(y, x); } static INLINE vopmask vor_vo_vo_vo(vopmask x, vopmask y) { return vorrq_u32(x, y); } static INLINE vopmask vxor_vo_vo_vo(vopmask x, vopmask y) { return veorq_u32(x, y); } static INLINE vmask vand_vm_vo64_vm(vopmask x, vmask y) { return vandq_u32(x, y); } static INLINE vmask vandnot_vm_vo64_vm(vopmask x, vmask y) { return vbicq_u32(y, x); } static INLINE vmask vor_vm_vo64_vm(vopmask x, vmask y) { return vorrq_u32(x, y); } static INLINE 
vmask vxor_vm_vo64_vm(vopmask x, vmask y) { return veorq_u32(x, y); } static INLINE vmask vand_vm_vo32_vm(vopmask x, vmask y) { return vandq_u32(x, y); } static INLINE vmask vandnot_vm_vo32_vm(vopmask x, vmask y) { return vbicq_u32(y, x); } static INLINE vmask vor_vm_vo32_vm(vopmask x, vmask y) { return vorrq_u32(x, y); } static INLINE vmask vxor_vm_vo32_vm(vopmask x, vmask y) { return veorq_u32(x, y); } static INLINE vopmask vcast_vo32_vo64(vopmask m) { return vuzpq_u32(m, m).val[0]; } static INLINE vopmask vcast_vo64_vo32(vopmask m) { return vzipq_u32(m, m).val[0]; } // static INLINE vmask vcast_vm_i_i(int i0, int i1) { return (vmask)vdupq_n_u64((uint64_t)i0 | (((uint64_t)i1) << 32)); } static INLINE vopmask veq64_vo_vm_vm(vmask x, vmask y) { uint32x4_t t = vceqq_u32(x, y); return vandq_u32(t, vrev64q_u32(t)); } // static INLINE vint2 vcast_vi2_vm(vmask vm) { return (vint2)vm; } static INLINE vmask vcast_vm_vi2(vint2 vi) { return (vmask)vi; } static INLINE vint2 vrint_vi2_vf(vfloat d) { return vcvtq_s32_f32(vaddq_f32(d, (float32x4_t)vorrq_u32(vandq_u32((uint32x4_t)d, (uint32x4_t)vdupq_n_f32(-0.0f)), (uint32x4_t)vdupq_n_f32(0.5f)))); } static INLINE vint2 vtruncate_vi2_vf(vfloat vf) { return vcvtq_s32_f32(vf); } static INLINE vfloat vcast_vf_vi2(vint2 vi) { return vcvtq_f32_s32(vi); } static INLINE vfloat vtruncate_vf_vf(vfloat vd) { return vcast_vf_vi2(vtruncate_vi2_vf(vd)); } static INLINE vfloat vrint_vf_vf(vfloat vd) { return vcast_vf_vi2(vrint_vi2_vf(vd)); } static INLINE vfloat vcast_vf_f(float f) { return vdupq_n_f32(f); } static INLINE vint2 vcast_vi2_i(int i) { return vdupq_n_s32(i); } static INLINE vmask vreinterpret_vm_vf(vfloat vf) { return (vmask)vf; } static INLINE vfloat vreinterpret_vf_vm(vmask vm) { return (vfloat)vm; } static INLINE vfloat vreinterpret_vf_vi2(vint2 vm) { return (vfloat)vm; } static INLINE vint2 vreinterpret_vi2_vf(vfloat vf) { return (vint2)vf; } static INLINE vfloat vadd_vf_vf_vf(vfloat x, vfloat y) { return vaddq_f32(x, y); } 
static INLINE vfloat vsub_vf_vf_vf(vfloat x, vfloat y) { return vsubq_f32(x, y); } static INLINE vfloat vmul_vf_vf_vf(vfloat x, vfloat y) { return vmulq_f32(x, y); } static INLINE vfloat vabs_vf_vf(vfloat f) { return vabsq_f32(f); } static INLINE vfloat vneg_vf_vf(vfloat f) { return vnegq_f32(f); } #if CONFIG == 4 static INLINE vfloat vmla_vf_vf_vf_vf (vfloat x, vfloat y, vfloat z) { return vfmaq_f32(z, x, y); } static INLINE vfloat vmlanp_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return vfmsq_f32(z, x, y); } static INLINE vfloat vfma_vf_vf_vf_vf (vfloat x, vfloat y, vfloat z) { return vfmaq_f32(z, x, y); } static INLINE vfloat vfmanp_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return vfmsq_f32(z, x, y); } static INLINE vfloat vfmapn_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return vneg_vf_vf(vfmanp_vf_vf_vf_vf(x, y, z)); } static INLINE vfloat vmlapn_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return vneg_vf_vf(vfmanp_vf_vf_vf_vf(x, y, z)); } static INLINE vfloat vdiv_vf_vf_vf(vfloat x, vfloat y) { float32x4_t t = vrecpeq_f32(y), u; t = vmulq_f32(t, vrecpsq_f32(y, t)); t = vfmaq_f32(t, vfmsq_f32(vdupq_n_f32(1.0f), y, t), t); u = vmulq_f32(x, t); return vfmaq_f32(u, vfmsq_f32(x, y, u), t); } static INLINE vfloat vsqrt_vf_vf(vfloat d) { float32x4_t x = vrsqrteq_f32(d); x = vmulq_f32(x, vrsqrtsq_f32(d, vmulq_f32(x, x))); x = vmulq_f32(x, vrsqrtsq_f32(d, vmulq_f32(x, x))); float32x4_t u = vmulq_f32(x, d); u = vfmaq_f32(u, vfmsq_f32(d, u, u), vmulq_f32(x, vdupq_n_f32(0.5))); return vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(u), vceqq_f32(d, vdupq_n_f32(0.0f)))); } static INLINE vfloat vrec_vf_vf(vfloat y) { float32x4_t t = vrecpeq_f32(y), u; t = vmulq_f32(t, vrecpsq_f32(y, t)); t = vfmaq_f32(t, vfmsq_f32(vdupq_n_f32(1.0f), y, t), t); return vfmaq_f32(t, vfmsq_f32(vdupq_n_f32(1.0f), y, t), t); } static INLINE vfloat vrecsqrt_vf_vf(vfloat d) { float32x4_t x = vrsqrteq_f32(d); x = vmulq_f32(x, vrsqrtsq_f32(d, vmulq_f32(x, x))); return vfmaq_f32(x, 
vfmsq_f32(vdupq_n_f32(1), x, vmulq_f32(x, d)), vmulq_f32(x, vdupq_n_f32(0.5))); } #else // #if CONFIG == 4 static INLINE vfloat vmla_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return vmlaq_f32(z, x, y); } static INLINE vfloat vmlanp_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return vmlsq_f32(z, x, y); } static INLINE vfloat vmlapn_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return vneg_vf_vf(vmlsq_f32(z, x, y)); } static INLINE vfloat vdiv_vf_vf_vf(vfloat n, vfloat d) { float32x4_t x = vrecpeq_f32(d); x = vmulq_f32(x, vrecpsq_f32(d, x)); float32x4_t t = vmulq_f32(n, x); return vmlsq_f32(vaddq_f32(t, t), vmulq_f32(t, x), d); } static INLINE vfloat vsqrt_vf_vf(vfloat d) { float32x4_t x = vrsqrteq_f32(d); x = vmulq_f32(x, vrsqrtsq_f32(d, vmulq_f32(x, x))); float32x4_t u = vmulq_f32(x, d); u = vmlaq_f32(u, vmlsq_f32(d, u, u), vmulq_f32(x, vdupq_n_f32(0.5))); return vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(u), vceqq_f32(d, vdupq_n_f32(0.0f)))); } static INLINE vfloat vrec_vf_vf(vfloat d) { float32x4_t x = vrecpeq_f32(d); x = vmulq_f32(x, vrecpsq_f32(d, x)); return vmlsq_f32(vaddq_f32(x, x), vmulq_f32(x, x), d); } static INLINE vfloat vrecsqrt_vf_vf(vfloat d) { float32x4_t x = vrsqrteq_f32(d); x = vmulq_f32(x, vrsqrtsq_f32(d, vmulq_f32(x, x))); return vmlaq_f32(x, vmlsq_f32(vdupq_n_f32(1), x, vmulq_f32(x, d)), vmulq_f32(x, vdupq_n_f32(0.5))); } #endif // #if CONFIG == 4 static INLINE vfloat vmax_vf_vf_vf(vfloat x, vfloat y) { return vmaxq_f32(x, y); } static INLINE vfloat vmin_vf_vf_vf(vfloat x, vfloat y) { return vminq_f32(x, y); } static INLINE vopmask veq_vo_vf_vf(vfloat x, vfloat y) { return vceqq_f32(x, y); } static INLINE vopmask vneq_vo_vf_vf(vfloat x, vfloat y) { return vmvnq_u32(vceqq_f32(x, y)); } static INLINE vopmask vlt_vo_vf_vf(vfloat x, vfloat y) { return vcltq_f32(x, y); } static INLINE vopmask vle_vo_vf_vf(vfloat x, vfloat y) { return vcleq_f32(x, y); } static INLINE vopmask vgt_vo_vf_vf(vfloat x, vfloat y) { return vcgtq_f32(x, y); } 
static INLINE vopmask vge_vo_vf_vf(vfloat x, vfloat y) { return vcgeq_f32(x, y); } static INLINE vint2 vadd_vi2_vi2_vi2(vint2 x, vint2 y) { return vaddq_s32(x, y); } static INLINE vint2 vsub_vi2_vi2_vi2(vint2 x, vint2 y) { return vsubq_s32(x, y); } static INLINE vint2 vneg_vi2_vi2(vint2 e) { return vnegq_s32(e); } static INLINE vint2 vand_vi2_vi2_vi2(vint2 x, vint2 y) { return vandq_s32(x, y); } static INLINE vint2 vandnot_vi2_vi2_vi2(vint2 x, vint2 y) { return vbicq_s32(y, x); } static INLINE vint2 vor_vi2_vi2_vi2(vint2 x, vint2 y) { return vorrq_s32(x, y); } static INLINE vint2 vxor_vi2_vi2_vi2(vint2 x, vint2 y) { return veorq_s32(x, y); } static INLINE vint2 vand_vi2_vo_vi2(vopmask x, vint2 y) { return (vint2)vandq_u32(x, (vopmask)y); } static INLINE vint2 vandnot_vi2_vo_vi2(vopmask x, vint2 y) { return (vint2)vbicq_u32((vopmask)y, x); } #define vsll_vi2_vi2_i(x, c) vshlq_n_s32(x, c) #define vsrl_vi2_vi2_i(x, c) vreinterpretq_s32_u32(vshrq_n_u32(vreinterpretq_u32_s32(x), c)) #define vsra_vi2_vi2_i(x, c) vshrq_n_s32(x, c) //@#define vsll_vi2_vi2_i(x, c) vshlq_n_s32(x, c) //@#define vsrl_vi2_vi2_i(x, c) vreinterpretq_s32_u32(vshrq_n_u32(vreinterpretq_u32_s32(x), c)) //@#define vsra_vi2_vi2_i(x, c) vshrq_n_s32(x, c) static INLINE vopmask veq_vo_vi2_vi2(vint2 x, vint2 y) { return vceqq_s32(x, y); } static INLINE vopmask vgt_vo_vi2_vi2(vint2 x, vint2 y) { return vcgtq_s32(x, y); } static INLINE vint2 veq_vi2_vi2_vi2(vint2 x, vint2 y) { return (vint2)vceqq_s32(x, y); } static INLINE vint2 vgt_vi2_vi2_vi2(vint2 x, vint2 y) { return (vint2)vcgtq_s32(x, y); } static INLINE vint2 vsel_vi2_vo_vi2_vi2(vopmask m, vint2 x, vint2 y) { return (vint2)vbslq_u32(m, (vmask)x, (vmask)y); } static INLINE vfloat vsel_vf_vo_vf_vf(vopmask mask, vfloat x, vfloat y) { return (vfloat)vbslq_u32(mask, (vmask)x, (vmask)y); } static INLINE CONST vfloat vsel_vf_vo_f_f(vopmask o, float v1, float v0) { return vsel_vf_vo_vf_vf(o, vcast_vf_f(v1), vcast_vf_f(v0)); } static INLINE vfloat 
vsel_vf_vo_vo_f_f_f(vopmask o0, vopmask o1, float d0, float d1, float d2) { return vsel_vf_vo_vf_vf(o0, vcast_vf_f(d0), vsel_vf_vo_f_f(o1, d1, d2)); } static INLINE vfloat vsel_vf_vo_vo_vo_f_f_f_f(vopmask o0, vopmask o1, vopmask o2, float d0, float d1, float d2, float d3) { return vsel_vf_vo_vf_vf(o0, vcast_vf_f(d0), vsel_vf_vo_vf_vf(o1, vcast_vf_f(d1), vsel_vf_vo_f_f(o2, d2, d3))); } static INLINE vopmask visinf_vo_vf(vfloat d) { return veq_vo_vf_vf(vabs_vf_vf(d), vcast_vf_f(SLEEF_INFINITYf)); } static INLINE vopmask vispinf_vo_vf(vfloat d) { return veq_vo_vf_vf(d, vcast_vf_f(SLEEF_INFINITYf)); } static INLINE vopmask visminf_vo_vf(vfloat d) { return veq_vo_vf_vf(d, vcast_vf_f(-SLEEF_INFINITYf)); } static INLINE vopmask visnan_vo_vf(vfloat d) { return vneq_vo_vf_vf(d, d); } // This function is needed when debugging on MSVC. static INLINE float vcast_f_vf(vfloat v) { float p[4]; vst1q_f32 (p, v); return p[0]; } static INLINE int vavailability_i(int name) { if (name != 2) return 0; return vcast_f_vf(vadd_vf_vf_vf(vcast_vf_f(name), vcast_vf_f(name))) != 0.0; } static INLINE vfloat vload_vf_p(const float *ptr) { return vld1q_f32(__builtin_assume_aligned(ptr, 16)); } static INLINE vfloat vloadu_vf_p(const float *ptr) { return vld1q_f32(ptr); } static INLINE void vstore_v_p_vf(float *ptr, vfloat v) { vst1q_f32(__builtin_assume_aligned(ptr, 16), v); } static INLINE void vstoreu_v_p_vf(float *ptr, vfloat v) { vst1q_f32(ptr, v); } static INLINE vfloat vgather_vf_p_vi2(const float *ptr, vint2 vi2) { return ((vfloat) { ptr[vgetq_lane_s32(vi2, 0)], ptr[vgetq_lane_s32(vi2, 1)], ptr[vgetq_lane_s32(vi2, 2)], ptr[vgetq_lane_s32(vi2, 3)] }); } #define PNMASKf ((vfloat) { +0.0f, -0.0f, +0.0f, -0.0f }) #define NPMASKf ((vfloat) { -0.0f, +0.0f, -0.0f, +0.0f }) static INLINE vfloat vposneg_vf_vf(vfloat d) { return (vfloat)vxor_vm_vm_vm((vmask)d, (vmask)PNMASKf); } static INLINE vfloat vnegpos_vf_vf(vfloat d) { return (vfloat)vxor_vm_vm_vm((vmask)d, (vmask)NPMASKf); } static INLINE 
vfloat vsubadd_vf_vf_vf(vfloat d0, vfloat d1) { return vadd_vf_vf_vf(d0, vnegpos_vf_vf(d1)); } static INLINE vfloat vmlsubadd_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return vsubadd_vf_vf_vf(vmul_vf_vf_vf(x, y), z); } static INLINE vfloat vrev21_vf_vf(vfloat d0) { return vrev64q_f32(d0); } static INLINE vfloat vreva2_vf_vf(vfloat d0) { return vcombine_f32(vget_high_f32(d0), vget_low_f32(d0)); } static INLINE vint2 vrev21_vi2_vi2(vint2 i) { return vreinterpret_vi2_vf(vrev21_vf_vf(vreinterpret_vf_vi2(i))); } static INLINE void vstream_v_p_vf(float *ptr, vfloat v) { vstore_v_p_vf(ptr, v); } static INLINE void vscatter2_v_p_i_i_vf(float *ptr, int offset, int step, vfloat v) { vst1_f32((float *)(ptr+(offset + step * 0)*2), vget_low_f32(v)); vst1_f32((float *)(ptr+(offset + step * 1)*2), vget_high_f32(v)); } static INLINE void vsscatter2_v_p_i_i_vf(float *ptr, int offset, int step, vfloat v) { vst1_f32((float *)(ptr+(offset + step * 0)*2), vget_low_f32(v)); vst1_f32((float *)(ptr+(offset + step * 1)*2), vget_high_f32(v)); } ================================================ FILE: src/helperpower_128.h ================================================ // Copyright Naoki Shibata and contributors 2010 - 2020. // Distributed under the Boost Software License, Version 1.0. 
// (See accompanying file LICENSE.txt or copy at // http://www.boost.org/LICENSE_1_0.txt) #if CONFIG == 1 || CONFIG == 2 #ifndef __VSX__ #error Please specify -mcpu=power8 or -mcpu=power9 #endif #else #error CONFIG macro invalid or not defined #endif #define ENABLE_DP //@#define ENABLE_DP #define LOG2VECTLENDP 1 //@#define LOG2VECTLENDP 1 #define VECTLENDP (1 << LOG2VECTLENDP) //@#define VECTLENDP (1 << LOG2VECTLENDP) #define ENABLE_SP //@#define ENABLE_SP #define LOG2VECTLENSP (LOG2VECTLENDP+1) //@#define LOG2VECTLENSP (LOG2VECTLENDP+1) #define VECTLENSP (1 << LOG2VECTLENSP) //@#define VECTLENSP (1 << LOG2VECTLENSP) #if CONFIG == 1 #define ENABLE_FMA_DP //@#define ENABLE_FMA_DP #define ENABLE_FMA_SP //@#define ENABLE_FMA_SP #endif #define ACCURATE_SQRT //@#define ACCURATE_SQRT #define FULL_FP_ROUNDING //@#define FULL_FP_ROUNDING #if !defined(SLEEF_GENHEADER) #include // undef altivec types since CPP and C99 use them as compiler tokens // use __vector and __bool instead #undef vector #undef bool #include #include "misc.h" #endif // #if !defined(SLEEF_GENHEADER) #define ISANAME "VSX" #define DFTPRIORITY 25 static INLINE int vavailability_i(int name) { return 3; } static INLINE void vprefetch_v_p(const void *ptr) { } /********************************************** ** Types ***********************************************/ typedef __vector unsigned int vmask; // using __bool with typedef may cause ambiguous errors #define vopmask __vector __bool int //@#define vopmask __vector __bool int typedef __vector signed int vint; typedef __vector signed int vint2; typedef __vector float vfloat; typedef __vector double vdouble; // internal use types typedef __vector unsigned int v__u32; typedef __vector unsigned char v__u8; typedef __vector signed long long v__i64; typedef __vector unsigned long long v__u64; #define v__b64 __vector __bool long long /********************************************** ** Utilities ***********************************************/ #define vset__vi(v0, 
v1) ((vint) {v0, v1, v0, v1}) #define vset__vi2(...) ((vint2) {__VA_ARGS__}) #define vset__vm(...) ((vmask) {__VA_ARGS__}) #define vset__vo(...) ((vopmask) {__VA_ARGS__}) #define vset__vf(...) ((vfloat) {__VA_ARGS__}) #define vset__vd(...) ((vdouble) {__VA_ARGS__}) #define vset__u8(...) ((v__u8) {__VA_ARGS__}) #define vset__u32(...) ((v__u32) {__VA_ARGS__}) #define vset__s64(...) ((v__i64) {__VA_ARGS__}) #define vset__u64(...) ((v__u64) {__VA_ARGS__}) #define vsetall__vi(v) vset__vi(v, v) #define vsetall__vi2(v) vset__vi2(v, v, v, v) #define vsetall__vm(v) vset__vm(v, v, v, v) #define vsetall__vo(v) vset__vo(v, v, v, v) #define vsetall__vf(v) vset__vf(v, v, v, v) #define vsetall__vd(v) vset__vd(v, v) #define vsetall__u8(v) vset__u8(v, v, v, v, v, v, v, v, v, v, v, v, v, v, v, v) #define vsetall__u32(v) vset__u32(v, v, v, v) #define vsetall__s64(v) vset__s64(v, v) #define vsetall__u64(v) vset__u64(v, v) #define vzero__vi() vsetall__vi(0) #define vzero__vi2() vsetall__vi2(0) #define vzero__vm() vsetall__vm(0) #define vzero__vo() vsetall__vo(0) #define vzero__vf() vsetall__vf(0) #define vzero__vd() vsetall__vd(0) #define vzero__u8() vsetall__u8(0) #define vzero__u32() vsetall__u32(0) #define vzero__s64() vsetall__s64(0) #define vzero__u64() vsetall__u64(0) //// Swap doubleword elements #ifdef __clang__ static INLINE v__u64 v__swapd_u64(v__u64 v) { return vec_xxpermdi(v, v, 2); } #else static INLINE v__u64 v__swapd_u64(v__u64 v) { __asm__ __volatile__("xxswapd %x0,%x1" : "=wa" (v) : "wa" (v)); return v; } #endif /********************************************** ** Memory ***********************************************/ ////////////// Unaligned memory access ////////////// /** * It's not safe to use vector assignment via (cast & dereference) for unaligned memory access * with almost all clang versions and gcc8 when VSX3 isn't enabled, * these compilers tends to generate instructions 'lvx/stvx' instead of 'lxvd2x/lxvw4x/stxvd2x/stxvw4x' * for more information check 
https://github.com/seiko2plus/vsx_mem_test * * TODO: check GCC(9, 10) */ //// load #if defined(__POWER9_VECTOR__) || (!defined(__clang__) && defined(__GNUC__) && __GNUC__ < 8) static vint vloadu_vi_p(const int32_t *ptr) { return *((vint*)ptr); } static INLINE vint2 vloadu_vi2_p(const int32_t *ptr) { return *((vint2*)ptr); } static INLINE vfloat vloadu_vf_p(const float *ptr) { return *((vfloat*)ptr); } static INLINE vdouble vloadu_vd_p(const double *ptr) { return *((vdouble*)ptr); } #else static vint vloadu_vi_p(const int32_t *ptr) { return vec_vsx_ld(0, ptr); } static INLINE vint2 vloadu_vi2_p(const int32_t *ptr) { return vec_vsx_ld(0, ptr); } static INLINE vfloat vloadu_vf_p(const float *ptr) { return vec_vsx_ld(0, ptr); } static INLINE vdouble vloadu_vd_p(const double *ptr) { return vec_vsx_ld(0, ptr); } #endif //// store #if defined(__POWER9_VECTOR__) || (!defined(__clang__) && defined(__GNUC__) && __GNUC__ < 8) static void vstoreu_v_p_vi(int32_t *ptr, vint v) { *((vint*)ptr) = v; } static void vstoreu_v_p_vi2(int32_t *ptr, vint2 v) { *((vint2*)ptr) = v; } static INLINE void vstoreu_v_p_vf(float *ptr, vfloat v) { *((vfloat*)ptr) = v; } static INLINE void vstoreu_v_p_vd(double *ptr, vdouble v) { *((vdouble*)ptr) = v; } #else static void vstoreu_v_p_vi(int32_t *ptr, vint v) { vec_vsx_st(v, 0, ptr); } static void vstoreu_v_p_vi2(int32_t *ptr, vint2 v) { vec_vsx_st(v, 0, ptr); } static INLINE void vstoreu_v_p_vf(float *ptr, vfloat v) { vec_vsx_st(v, 0, ptr); } static INLINE void vstoreu_v_p_vd(double *ptr, vdouble v) { vec_vsx_st(v, 0, ptr); } #endif ////////////// aligned memory access ////////////// //// load static INLINE vfloat vload_vf_p(const float *ptr) { return vec_ld(0, ptr); } static INLINE vdouble vload_vd_p(const double *ptr) { return *((vdouble*)ptr); } //// store static INLINE void vstore_v_p_vf(float *ptr, vfloat v) { vec_st(v, 0, ptr); } static INLINE void vstore_v_p_vd(double *ptr, vdouble v) { *((vdouble*)ptr) = v; } ////////////// non-temporal 
memory access ////////////// //// store static INLINE void vstream_v_p_vf(float *ptr, vfloat v) { vstore_v_p_vf(ptr, v); } static INLINE void vstream_v_p_vd(double *ptr, vdouble v) { vstore_v_p_vd(ptr, v); } ////////////// LUT ////////////// //// load static INLINE vdouble vgather_vd_p_vi(const double *ptr, vint vi) { return vset__vd(ptr[vec_extract(vi, 0)], ptr[vec_extract(vi, 1)]); } static INLINE vfloat vgather_vf_p_vi2(const float *ptr, vint2 vi2) { return vset__vf( ptr[vec_extract(vi2, 0)], ptr[vec_extract(vi2, 1)], ptr[vec_extract(vi2, 2)], ptr[vec_extract(vi2, 3)] ); } //// store static INLINE void vscatter2_v_p_i_i_vf(float *ptr, int offset, int step, vfloat v) { const v__u64 vll = (v__u64)v; float *ptr_low = ptr + offset*2; float *ptr_high = ptr + (offset + step)*2; *((uint64_t*)ptr_low) = vec_extract(vll, 0); *((uint64_t*)ptr_high) = vec_extract(vll, 1); } static INLINE void vsscatter2_v_p_i_i_vf(float *ptr, int offset, int step, vfloat v) { vscatter2_v_p_i_i_vf(ptr, offset, step, v); } static INLINE void vscatter2_v_p_i_i_vd(double *ptr, int offset, int step, vdouble v) { vstore_v_p_vd((double *)(&ptr[2*offset]), v); } static INLINE void vsscatter2_v_p_i_i_vd(double *ptr, int offset, int step, vdouble v) { vscatter2_v_p_i_i_vd(ptr, offset, step, v); } /********************************************** ** Misc **********************************************/ // vector with a specific value set to all lanes (Vector Splat) static INLINE vint vcast_vi_i(int i) { return vsetall__vi(i); } static INLINE vint2 vcast_vi2_i(int i) { return vsetall__vi2(i); } static INLINE vfloat vcast_vf_f(float f) { return vsetall__vf(f); } static INLINE vdouble vcast_vd_d(double d) { return vsetall__vd(d); } // cast static INLINE vint2 vcast_vi2_vm(vmask vm) { return (vint2)vm; } static INLINE vmask vcast_vm_vi2(vint2 vi) { return (vmask)vi; } // get the first element static INLINE float vcast_f_vf(vfloat v) { return vec_extract(v, 0); } static INLINE double vcast_d_vd(vdouble v) { 
return vec_extract(v, 0); } static INLINE vmask vreinterpret_vm_vd(vdouble vd) { return (vmask)vd; } static INLINE vdouble vreinterpret_vd_vm(vmask vm) { return (vdouble)vm; } static INLINE vint2 vreinterpret_vi2_vd(vdouble vd) { return (vint2)vd; } static INLINE vdouble vreinterpret_vd_vi2(vint2 vi) { return (vdouble)vi; } static INLINE vmask vreinterpret_vm_vf(vfloat vf) { return (vmask)vf; } static INLINE vfloat vreinterpret_vf_vm(vmask vm) { return (vfloat)vm; } static INLINE vfloat vreinterpret_vf_vi2(vint2 vi) { return (vfloat)vi; } static INLINE vint2 vreinterpret_vi2_vf(vfloat vf) { return (vint2)vf; } // per element select via mask (blend) static INLINE vdouble vsel_vd_vo_vd_vd(vopmask o, vdouble x, vdouble y) { return vec_sel(y, x, (v__b64)o); } static INLINE vfloat vsel_vf_vo_vf_vf(vopmask o, vfloat x, vfloat y) { return vec_sel(y, x, o); } static INLINE vint vsel_vi_vo_vi_vi(vopmask o, vint x, vint y) { return vec_sel(y, x, o); } static INLINE vint2 vsel_vi2_vo_vi2_vi2(vopmask o, vint2 x, vint2 y) { return vec_sel(y, x, o); } static INLINE vfloat vsel_vf_vo_f_f(vopmask o, float v1, float v0) { return vsel_vf_vo_vf_vf(o, vsetall__vf(v1), vsetall__vf(v0)); } static INLINE vfloat vsel_vf_vo_vo_f_f_f(vopmask o0, vopmask o1, float d0, float d1, float d2) { return vsel_vf_vo_vf_vf(o0, vsetall__vf(d0), vsel_vf_vo_f_f(o1, d1, d2)); } static INLINE vfloat vsel_vf_vo_vo_vo_f_f_f_f(vopmask o0, vopmask o1, vopmask o2, float d0, float d1, float d2, float d3) { return vsel_vf_vo_vf_vf(o0, vsetall__vf(d0), vsel_vf_vo_vf_vf(o1, vsetall__vf(d1), vsel_vf_vo_f_f(o2, d2, d3))); } static INLINE vdouble vsel_vd_vo_d_d(vopmask o, double v1, double v0) { return vsel_vd_vo_vd_vd(o, vsetall__vd(v1), vsetall__vd(v0)); } static INLINE vdouble vsel_vd_vo_vo_d_d_d(vopmask o0, vopmask o1, double d0, double d1, double d2) { return vsel_vd_vo_vd_vd(o0, vsetall__vd(d0), vsel_vd_vo_d_d(o1, d1, d2)); } static INLINE vdouble vsel_vd_vo_vo_vo_d_d_d_d(vopmask o0, vopmask o1, vopmask o2, 
double d0, double d1, double d2, double d3) { return vsel_vd_vo_vd_vd(o0, vsetall__vd(d0), vsel_vd_vo_vd_vd(o1, vsetall__vd(d1), vsel_vd_vo_d_d(o2, d2, d3))); } static INLINE int vtestallones_i_vo32(vopmask g) { return vec_all_ne((vint2)g, vzero__vi2()); } static INLINE int vtestallones_i_vo64(vopmask g) { return vec_all_ne((v__i64)g, vzero__s64()); } /********************************************** ** Conversions **********************************************/ ////////////// Numeric ////////////// // pack 64-bit mask to 32-bit static INLINE vopmask vcast_vo32_vo64(vopmask m) { return (vopmask)vec_pack((v__u64)m, (v__u64)m); } // clip 64-bit lanes to lower 32-bit static INLINE vint vcastu_vi_vi2(vint2 vi2) { return vec_mergeo(vi2, vec_splat(vi2, 3)); } // expand lower 32-bit mask static INLINE vopmask vcast_vo64_vo32(vopmask m) { return vec_mergeh(m, m); } // unsigned expand lower 32-bit integer static INLINE vint2 vcastu_vi2_vi(vint vi) { return vec_mergeh(vzero__vi(), vi); } // signed int to single-precision static INLINE vfloat vcast_vf_vi2(vint2 vi) { vfloat ret; #ifdef __clang__ ret = __builtin_convertvector(vi, vfloat); #else __asm__ __volatile__("xvcvsxwsp %x0,%x1" : "=wa" (ret) : "wa" (vi)); #endif return ret; } // lower signed int to double-precision static INLINE vdouble vcast_vd_vi(vint vi) { vdouble ret; vint swap = vec_mergeh(vi, vi); #ifdef __clang__ ret = __builtin_vsx_xvcvsxwdp(swap); #else __asm__ __volatile__("xvcvsxwdp %x0,%x1" : "=wa" (ret) : "wa" (swap)); #endif return ret; } // zip two scalars static INLINE vmask vcast_vm_i_i(int l, int h) { return (vmask)vec_mergeh(vsetall__vi2(h), vsetall__vi2(l)); } ////////////// Truncation ////////////// static INLINE vint2 vtruncate_vi2_vf(vfloat vf) { vint2 ret; #ifdef __clang__ ret = __builtin_convertvector(vf, vint2); #else __asm__ __volatile__("xvcvspsxws %x0,%x1" : "=wa" (ret) : "wa" (vf)); #endif return ret; } static INLINE vint vtruncate_vi_vd(vdouble vd) { vint ret; #ifdef __clang__ ret = 
__builtin_vsx_xvcvdpsxws(vd); #else __asm__ __volatile__("xvcvdpsxws %x0,%x1" : "=wa" (ret) : "wa" (vd)); #endif return vec_mergeo(ret, vec_splat(ret, 3)); } static INLINE vdouble vtruncate_vd_vd(vdouble vd) { return vec_trunc(vd); } static INLINE vfloat vtruncate_vf_vf(vfloat vf) { return vec_trunc(vf); } ////////////// Rounding ////////////// // towards the nearest even static INLINE vint vrint_vi_vd(vdouble vd) { return vtruncate_vi_vd(vec_rint(vd)); } static INLINE vint2 vrint_vi2_vf(vfloat vf) { return vtruncate_vi2_vf(vec_rint(vf)); } static INLINE vdouble vrint_vd_vd(vdouble vd) { return vec_rint(vd); } static INLINE vfloat vrint_vf_vf(vfloat vf) { return vec_rint(vf); } /********************************************** ** Logical **********************************************/ ////////////// And ////////////// static INLINE vint vand_vi_vi_vi(vint x, vint y) { return vec_and(x, y); } static INLINE vint vand_vi_vo_vi(vopmask x, vint y) { return vec_and((vint)x, y); } static INLINE vint2 vand_vi2_vi2_vi2(vint2 x, vint2 y) { return vec_and(x, y); } static INLINE vint2 vand_vi2_vo_vi2(vopmask x, vint2 y) { return (vint2)vec_and((vint2)x, y); } static INLINE vmask vand_vm_vm_vm(vmask x, vmask y) { return vec_and(x, y); } static INLINE vmask vand_vm_vo32_vm(vopmask x, vmask y) { return vec_and((vmask)x, y); } static INLINE vmask vand_vm_vo64_vm(vopmask x, vmask y) { return vec_and((vmask)x, y); } static INLINE vopmask vand_vo_vo_vo(vopmask x, vopmask y) { return vec_and(x, y); } ////////////// Or ////////////// static INLINE vint vor_vi_vi_vi(vint x, vint y) { return vec_or(x, y); } static INLINE vint2 vor_vi2_vi2_vi2(vint2 x, vint2 y) { return vec_or(x, y); } static INLINE vmask vor_vm_vm_vm(vmask x, vmask y) { return vec_or(x, y); } static INLINE vmask vor_vm_vo32_vm(vopmask x, vmask y) { return vec_or((vmask)x, y); } static INLINE vmask vor_vm_vo64_vm(vopmask x, vmask y) { return vec_or((vmask)x, y); } static INLINE vopmask vor_vo_vo_vo(vopmask x, vopmask y) { 
return vec_or(x, y); } ////////////// Xor ////////////// static INLINE vint vxor_vi_vi_vi(vint x, vint y) { return vec_xor(x, y); } static INLINE vint2 vxor_vi2_vi2_vi2(vint2 x, vint2 y) { return vec_xor(x, y); } static INLINE vmask vxor_vm_vm_vm(vmask x, vmask y) { return vec_xor(x, y); } static INLINE vmask vxor_vm_vo32_vm(vopmask x, vmask y) { return vec_xor((vmask)x, y); } static INLINE vmask vxor_vm_vo64_vm(vopmask x, vmask y) { return vec_xor((vmask)x, y); } static INLINE vopmask vxor_vo_vo_vo(vopmask x, vopmask y) { return vec_xor(x, y); } ////////////// Not ////////////// static INLINE vopmask vnot_vo_vo(vopmask o) { return vec_nor(o, o); } ////////////// And Not ((~x) & y) ////////////// static INLINE vint vandnot_vi_vi_vi(vint x, vint y) { return vec_andc(y, x); } static INLINE vint vandnot_vi_vo_vi(vopmask x, vint y) { return vec_andc(y, (vint)x); } static INLINE vint2 vandnot_vi2_vi2_vi2(vint2 x, vint2 y) { return vec_andc(y, x); } static INLINE vmask vandnot_vm_vm_vm(vmask x, vmask y) { return vec_andc(y, x); } static INLINE vmask vandnot_vm_vo64_vm(vopmask x, vmask y) { return vec_andc(y, x); } static INLINE vmask vandnot_vm_vo32_vm(vopmask x, vmask y) { return vec_andc(y, x); } static INLINE vopmask vandnot_vo_vo_vo(vopmask x, vopmask y) { return vec_andc(y, x); } static INLINE vint2 vandnot_vi2_vo_vi2(vopmask x, vint2 y) { return vec_andc(y, (vint2)x); } /********************************************** ** Comparison **********************************************/ ////////////// Equal ////////////// static INLINE vint veq_vi_vi_vi(vint x, vint y) { return (vint)vec_cmpeq(x, y); } static INLINE vopmask veq_vo_vi_vi(vint x, vint y) { return vec_cmpeq(x, y); } static INLINE vopmask veq_vo_vi2_vi2(vint2 x, vint2 y) { return vec_cmpeq(x, y); } static INLINE vint2 veq_vi2_vi2_vi2(vint2 x, vint2 y) { return (vint2)vec_cmpeq(x, y); } static INLINE vopmask veq64_vo_vm_vm(vmask x, vmask y) { return (vopmask)vec_cmpeq((v__u64)x, (v__u64)y); } static INLINE 
vopmask veq_vo_vf_vf(vfloat x, vfloat y) { return vec_cmpeq(x, y); } static INLINE vopmask veq_vo_vd_vd(vdouble x, vdouble y) { return (vopmask)vec_cmpeq(x, y); } ////////////// Not Equal ////////////// static INLINE vopmask vneq_vo_vf_vf(vfloat x, vfloat y) { return vnot_vo_vo(vec_cmpeq(x, y)); } static INLINE vopmask vneq_vo_vd_vd(vdouble x, vdouble y) { return vnot_vo_vo((vopmask)vec_cmpeq(x, y)); } ////////////// Less Than ////////////// static INLINE vopmask vlt_vo_vf_vf(vfloat x, vfloat y) { return vec_cmplt(x, y); } static INLINE vopmask vlt_vo_vd_vd(vdouble x, vdouble y) { return (vopmask)vec_cmplt(x, y); } ////////////// Greater Than ////////////// static INLINE vint vgt_vi_vi_vi(vint x, vint y) { return (vint)vec_cmpgt(x, y); } static INLINE vopmask vgt_vo_vi_vi(vint x, vint y) { return vec_cmpgt(x, y);} static INLINE vint2 vgt_vi2_vi2_vi2(vint2 x, vint2 y) { return (vint2)vec_cmpgt(x, y); } static INLINE vopmask vgt_vo_vi2_vi2(vint2 x, vint2 y) { return vec_cmpgt(x, y); } static INLINE vopmask vgt_vo_vf_vf(vfloat x, vfloat y) { return vec_cmpgt(x, y); } static INLINE vopmask vgt_vo_vd_vd(vdouble x, vdouble y) { return (vopmask)vec_cmpgt(x, y); } ////////////// Less Than Or Equal ////////////// static INLINE vopmask vle_vo_vf_vf(vfloat x, vfloat y) { return vec_cmple(x, y); } static INLINE vopmask vle_vo_vd_vd(vdouble x, vdouble y) { return (vopmask)vec_cmple(x, y); } ////////////// Greater Than Or Equal ////////////// static INLINE vopmask vge_vo_vf_vf(vfloat x, vfloat y) { return vec_cmpge(x, y); } static INLINE vopmask vge_vo_vd_vd(vdouble x, vdouble y) { return (vopmask)vec_cmpge(x, y); } ////////////// Special Cases ////////////// static INLINE vopmask visinf_vo_vf(vfloat d) { return vec_cmpeq(vec_abs(d), vsetall__vf(SLEEF_INFINITYf)); } static INLINE vopmask visinf_vo_vd(vdouble d) { return (vopmask)vec_cmpeq(vec_abs(d), vsetall__vd(SLEEF_INFINITY)); } static INLINE vopmask vispinf_vo_vf(vfloat d) { return vec_cmpeq(d, vsetall__vf(SLEEF_INFINITYf)); 
} static INLINE vopmask vispinf_vo_vd(vdouble d) { return (vopmask)vec_cmpeq(d, vsetall__vd(SLEEF_INFINITY)); } static INLINE vopmask visminf_vo_vf(vfloat d) { return vec_cmpeq(d, vsetall__vf(-SLEEF_INFINITYf)); } static INLINE vopmask visminf_vo_vd(vdouble d) { return (vopmask)vec_cmpeq(d, vsetall__vd(-SLEEF_INFINITY)); } static INLINE vopmask visnan_vo_vf(vfloat d) { return vnot_vo_vo(vec_cmpeq(d, d)); } static INLINE vopmask visnan_vo_vd(vdouble d) { return vnot_vo_vo((vopmask)vec_cmpeq(d, d)); } /********************************************** ** Shift **********************************************/ ////////////// Left ////////////// static INLINE vint vsll_vi_vi_i(vint x, int c) { return vec_sl (x, vsetall__u32(c)); } static INLINE vint2 vsll_vi2_vi2_i(vint2 x, int c) { return vec_sl(x, vsetall__u32(c)); } ////////////// Right ////////////// static INLINE vint vsrl_vi_vi_i(vint x, int c) { return vec_sr(x, vsetall__u32(c)); } static INLINE vint2 vsrl_vi2_vi2_i(vint2 x, int c) { return vec_sr(x, vsetall__u32(c)); } ////////////// Algebraic Right ////////////// static INLINE vint vsra_vi_vi_i(vint x, int c) { return vec_sra(x, vsetall__u32(c)); } static INLINE vint2 vsra_vi2_vi2_i(vint2 x, int c) { return vec_sra(x, vsetall__u32(c)); } /********************************************** ** Reorder **********************************************/ ////////////// Reverse ////////////// // Reverse elements order inside the lower and higher parts static INLINE vint2 vrev21_vi2_vi2(vint2 vi) { return vec_mergee(vec_mergeo(vi, vi), vi); } static INLINE vfloat vrev21_vf_vf(vfloat vf) { return (vfloat)vrev21_vi2_vi2((vint2)vf); } // Swap the lower and higher parts static INLINE vfloat vreva2_vf_vf(vfloat vf) { return (vfloat)v__swapd_u64((v__u64)vf); } static INLINE vdouble vrev21_vd_vd(vdouble vd) { return (vdouble)v__swapd_u64((v__u64)vd); } static INLINE vdouble vreva2_vd_vd(vdouble vd) { return vd; } /********************************************** ** Arithmetic 
**********************************************/ ////////////// Negation ////////////// static INLINE vint vneg_vi_vi(vint e) { #ifdef __clang__ return vec_neg(e); #else return vec_sub(vzero__vi(), e); #endif } static INLINE vint2 vneg_vi2_vi2(vint2 e) { return vneg_vi_vi(e); } static INLINE vfloat vneg_vf_vf(vfloat d) { vfloat ret; #ifdef __clang__ ret = vec_neg(d); #else __asm__ __volatile__("xvnegsp %x0,%x1" : "=wa" (ret) : "wa" (d)); #endif return ret; } static INLINE vdouble vneg_vd_vd(vdouble d) { vdouble ret; #ifdef __clang__ ret = vec_neg(d); #else __asm__ __volatile__("xvnegdp %x0,%x1" : "=wa" (ret) : "wa" (d)); #endif return ret; } static INLINE vfloat vposneg_vf_vf(vfloat d) { return vec_xor(d, vset__vf(+0.0f, -0.0f, +0.0f, -0.0f)); } static INLINE vdouble vposneg_vd_vd(vdouble d) { return vec_xor(d, vset__vd(+0.0, -0.0)); } static INLINE vfloat vnegpos_vf_vf(vfloat d) { return vec_xor(d, vset__vf(-0.0f, +0.0f, -0.0f, +0.0f)); } static INLINE vdouble vnegpos_vd_vd(vdouble d) { return vec_xor(d, vset__vd(-0.0, +0.0)); } ////////////// Addition ////////////// static INLINE vint vadd_vi_vi_vi(vint x, vint y) { return vec_add(x, y); } static INLINE vint2 vadd_vi2_vi2_vi2(vint2 x, vint2 y) { return vec_add(x, y); } static INLINE vfloat vadd_vf_vf_vf(vfloat x, vfloat y) { return vec_add(x, y); } static INLINE vdouble vadd_vd_vd_vd(vdouble x, vdouble y) { return vec_add(x, y); } static INLINE vmask vadd64_vm_vm_vm(vmask x, vmask y) { return (vmask)vec_add((v__i64)x, (v__i64)y); } ////////////// Subtraction ////////////// static INLINE vint vsub_vi_vi_vi(vint x, vint y) { return vec_sub(x, y); } static INLINE vint2 vsub_vi2_vi2_vi2(vint2 x, vint2 y) { return vec_sub(x, y); } static INLINE vfloat vsub_vf_vf_vf(vfloat x, vfloat y) { return vec_sub(x, y); } static INLINE vdouble vsub_vd_vd_vd(vdouble x, vdouble y) { return vec_sub(x, y); } static INLINE vdouble vsubadd_vd_vd_vd(vdouble x, vdouble y) { return vec_add(x, vnegpos_vd_vd(y)); } static INLINE vfloat 
vsubadd_vf_vf_vf(vfloat x, vfloat y) { return vec_add(x, vnegpos_vf_vf(y)); } ////////////// Multiplication ////////////// static INLINE vfloat vmul_vf_vf_vf(vfloat x, vfloat y) { return vec_mul(x, y); } static INLINE vdouble vmul_vd_vd_vd(vdouble x, vdouble y) { return vec_mul(x, y); } static INLINE vfloat vdiv_vf_vf_vf(vfloat x, vfloat y) { return vec_div(x, y); } static INLINE vdouble vdiv_vd_vd_vd(vdouble x, vdouble y) { return vec_div(x, y); } static INLINE vfloat vrec_vf_vf(vfloat x) { return vec_div(vsetall__vf(1.0f), x); } static INLINE vdouble vrec_vd_vd(vdouble x) { return vec_div(vsetall__vd(1.0), x); } /********************************************** ** Math **********************************************/ static INLINE vfloat vmax_vf_vf_vf(vfloat x, vfloat y) { return vec_max(x, y); } static INLINE vdouble vmax_vd_vd_vd(vdouble x, vdouble y) { return vec_max(x, y); } static INLINE vfloat vmin_vf_vf_vf(vfloat x, vfloat y) { return vec_min(x, y); } static INLINE vdouble vmin_vd_vd_vd(vdouble x, vdouble y) { return vec_min(x, y); } static INLINE vfloat vabs_vf_vf(vfloat f) { return vec_abs(f); } static INLINE vdouble vabs_vd_vd(vdouble d) { return vec_abs(d); } static INLINE vfloat vsqrt_vf_vf(vfloat f) { return vec_sqrt(f); } static INLINE vdouble vsqrt_vd_vd(vdouble d) { return vec_sqrt(d); } /********************************************** ** FMA3 **********************************************/ #if CONFIG == 1 static INLINE vfloat vmla_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return vec_madd(x, y, z); } static INLINE vdouble vmla_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return vec_madd(x, y, z); } static INLINE vfloat vmlapn_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return vec_msub(x, y, z); } static INLINE vdouble vmlapn_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return vec_msub(x, y, z); } static INLINE vfloat vmlanp_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return vec_nmsub(x, y, z); } static INLINE vdouble vmlanp_vd_vd_vd_vd(vdouble 
x, vdouble y, vdouble z) { return vec_nmsub(x, y, z); } #else static INLINE vfloat vmla_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return vec_add(vec_mul(x, y), z); } static INLINE vdouble vmla_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return vec_add(vec_mul(x, y), z); } static INLINE vfloat vmlapn_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return vec_sub(vec_mul(x, y), z); } static INLINE vdouble vmlapn_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return vec_sub(vec_mul(x, y), z); } static INLINE vfloat vmlanp_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return vec_sub(z, vec_mul(x, y)); } static INLINE vdouble vmlanp_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return vec_sub(z, vec_mul(x, y)); } #endif static INLINE vfloat vfma_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return vec_madd(x, y, z); } static INLINE vdouble vfma_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return vec_madd(x, y, z); } static INLINE vfloat vfmapp_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return vec_madd(x, y, z); } static INLINE vdouble vfmapp_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return vec_madd(x, y, z); } static INLINE vfloat vfmapn_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return vec_msub(x, y, z); } static INLINE vdouble vfmapn_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return vec_msub(x, y, z); } static INLINE vfloat vfmanp_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return vec_nmsub(x, y, z); } static INLINE vdouble vfmanp_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return vec_nmsub(x, y, z); } static INLINE vfloat vfmann_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return vec_nmadd(x, y, z); } static INLINE vdouble vfmann_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return vec_nmadd(x, y, z); } static INLINE vfloat vmlsubadd_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return vmla_vf_vf_vf_vf(x, y, vnegpos_vf_vf(z)); } static INLINE vdouble vmlsubadd_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return vmla_vd_vd_vd_vd(x, y, vnegpos_vd_vd(z)); 
} ================================================ FILE: src/helpersse2.h ================================================ // Copyright Naoki Shibata and contributors 2010 - 2020. // Distributed under the Boost Software License, Version 1.0. // (See accompanying file LICENSE.txt or copy at // http://www.boost.org/LICENSE_1_0.txt) #if CONFIG == 2 #if !defined(__SSE2__) && !defined(SLEEF_GENHEADER) #error Please specify -msse2. #endif #elif CONFIG == 3 #if (!defined(__SSE2__) || !defined(__SSE3__)) && !defined(SLEEF_GENHEADER) #error Please specify -msse2 and -msse3 #endif #elif CONFIG == 4 #if (!defined(__SSE2__) || !defined(__SSE3__) || !defined(__SSE4_1__)) && !defined(SLEEF_GENHEADER) #error Please specify -msse2, -msse3 and -msse4.1 #endif #else #error CONFIG macro invalid or not defined #endif #define ENABLE_DP //@#define ENABLE_DP #define LOG2VECTLENDP 1 //@#define LOG2VECTLENDP 1 #define VECTLENDP (1 << LOG2VECTLENDP) //@#define VECTLENDP (1 << LOG2VECTLENDP) #define ENABLE_SP //@#define ENABLE_SP #define LOG2VECTLENSP (LOG2VECTLENDP+1) //@#define LOG2VECTLENSP (LOG2VECTLENDP+1) #define VECTLENSP (1 << LOG2VECTLENSP) //@#define VECTLENSP (1 << LOG2VECTLENSP) #define ACCURATE_SQRT //@#define ACCURATE_SQRT #if !defined(SLEEF_GENHEADER) #if defined(_MSC_VER) #include #else #include #endif #include #include "misc.h" #endif // #if !defined(SLEEF_GENHEADER) typedef __m128i vmask; typedef __m128i vopmask; typedef __m128d vdouble; typedef __m128i vint; typedef __m128 vfloat; typedef __m128i vint2; typedef struct { vmask x, y; } vmask2; // #if !defined(SLEEF_GENHEADER) #ifndef __SLEEF_H__ static inline void Sleef_x86CpuID(int32_t out[4], uint32_t eax, uint32_t ecx) { /* We don't care for cpuid detection */ out[0] = 0xFFFFFFFF; out[1] = 0xFFFFFFFF; out[2] = 0xFFFFFFFF; out[3] = 0xFFFFFFFF; } #endif static INLINE int cpuSupportsSSE2() { int32_t reg[4]; Sleef_x86CpuID(reg, 1, 0); return (reg[3] & (1 << 26)) != 0; } static INLINE int cpuSupportsSSE3() { int32_t reg[4]; 
Sleef_x86CpuID(reg, 1, 0); return (reg[2] & (1 << 0)) != 0; } static INLINE int cpuSupportsSSE4_1() { int32_t reg[4]; Sleef_x86CpuID(reg, 1, 0); return (reg[2] & (1 << 19)) != 0; } #if defined(__SSE2__) && defined(__SSE3__) && defined(__SSE4_1__) static INLINE int vavailability_i(int name) { //int d = __builtin_cpu_supports("sse2") && __builtin_cpu_supports("sse3") && __builtin_cpu_supports("sse4.1"); int d = cpuSupportsSSE2() && cpuSupportsSSE3() && cpuSupportsSSE4_1(); return d ? 3 : 0; } #define ISANAME "SSE4.1" #define DFTPRIORITY 12 #elif defined(__SSE2__) && defined(__SSE3__) static INLINE int vavailability_i(int name) { //int d = __builtin_cpu_supports("sse2") && __builtin_cpu_supports("sse3"); int d = cpuSupportsSSE2() && cpuSupportsSSE3(); return d ? 3 : 0; } #define ISANAME "SSE3" #define DFTPRIORITY 11 #else static INLINE int vavailability_i(int name) { int d = cpuSupportsSSE2(); return d ? 3 : 0; } #define ISANAME "SSE2" #define DFTPRIORITY 10 #endif #endif // #if !defined(SLEEF_GENHEADER) static INLINE void vprefetch_v_p(const void *ptr) { _mm_prefetch(ptr, _MM_HINT_T0); } static INLINE int vtestallones_i_vo32(vopmask g) { return _mm_movemask_epi8(g) == 0xFFFF; } static INLINE int vtestallones_i_vo64(vopmask g) { return _mm_movemask_epi8(g) == 0xFFFF; } // static vint2 vloadu_vi2_p(int32_t *p) { return _mm_loadu_si128((__m128i *)p); } static void vstoreu_v_p_vi2(int32_t *p, vint2 v) { _mm_storeu_si128((__m128i *)p, v); } static vint vloadu_vi_p(int32_t *p) { return _mm_loadu_si128((__m128i *)p); } static void vstoreu_v_p_vi(int32_t *p, vint v) { _mm_storeu_si128((__m128i *)p, v); } // static INLINE vmask vand_vm_vm_vm(vmask x, vmask y) { return _mm_and_si128(x, y); } static INLINE vmask vandnot_vm_vm_vm(vmask x, vmask y) { return _mm_andnot_si128(x, y); } static INLINE vmask vor_vm_vm_vm(vmask x, vmask y) { return _mm_or_si128(x, y); } static INLINE vmask vxor_vm_vm_vm(vmask x, vmask y) { return _mm_xor_si128(x, y); } static INLINE vopmask 
vand_vo_vo_vo(vopmask x, vopmask y) { return _mm_and_si128(x, y); } static INLINE vopmask vandnot_vo_vo_vo(vopmask x, vopmask y) { return _mm_andnot_si128(x, y); } static INLINE vopmask vor_vo_vo_vo(vopmask x, vopmask y) { return _mm_or_si128(x, y); } static INLINE vopmask vxor_vo_vo_vo(vopmask x, vopmask y) { return _mm_xor_si128(x, y); } static INLINE vmask vand_vm_vo64_vm(vopmask x, vmask y) { return _mm_and_si128(x, y); } static INLINE vmask vor_vm_vo64_vm(vopmask x, vmask y) { return _mm_or_si128(x, y); } static INLINE vmask vandnot_vm_vo64_vm(vmask x, vmask y) { return _mm_andnot_si128(x, y); } static INLINE vmask vxor_vm_vo64_vm(vmask x, vmask y) { return _mm_xor_si128(x, y); } static INLINE vmask vand_vm_vo32_vm(vopmask x, vmask y) { return _mm_and_si128(x, y); } static INLINE vmask vor_vm_vo32_vm(vopmask x, vmask y) { return _mm_or_si128(x, y); } static INLINE vmask vandnot_vm_vo32_vm(vmask x, vmask y) { return _mm_andnot_si128(x, y); } static INLINE vmask vxor_vm_vo32_vm(vmask x, vmask y) { return _mm_xor_si128(x, y); } static INLINE vopmask vcast_vo32_vo64(vopmask m) { return _mm_shuffle_epi32(m, 0x08); } static INLINE vopmask vcast_vo64_vo32(vopmask m) { return _mm_shuffle_epi32(m, 0x50); } // static INLINE vint vrint_vi_vd(vdouble vd) { return _mm_cvtpd_epi32(vd); } static INLINE vint vtruncate_vi_vd(vdouble vd) { return _mm_cvttpd_epi32(vd); } static INLINE vdouble vcast_vd_vi(vint vi) { return _mm_cvtepi32_pd(vi); } static INLINE vint vcast_vi_i(int i) { return _mm_set_epi32(0, 0, i, i); } static INLINE vint2 vcastu_vi2_vi(vint vi) { return _mm_and_si128(_mm_shuffle_epi32(vi, 0x73), _mm_set_epi32(-1, 0, -1, 0)); } static INLINE vint vcastu_vi_vi2(vint2 vi) { return _mm_shuffle_epi32(vi, 0x0d); } #if CONFIG == 4 static INLINE vdouble vtruncate_vd_vd(vdouble vd) { return _mm_round_pd(vd, _MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC); } static INLINE vdouble vrint_vd_vd(vdouble vd) { return _mm_round_pd(vd, _MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC); } 
static INLINE vfloat vtruncate_vf_vf(vfloat vf) { return _mm_round_ps(vf, _MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC); } static INLINE vfloat vrint_vf_vf(vfloat vd) { return _mm_round_ps(vd, _MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC); } static INLINE vopmask veq64_vo_vm_vm(vmask x, vmask y) { return _mm_cmpeq_epi64(x, y); } #define FULL_FP_ROUNDING //@#define FULL_FP_ROUNDING #else static INLINE vdouble vtruncate_vd_vd(vdouble vd) { return vcast_vd_vi(vtruncate_vi_vd(vd)); } static INLINE vdouble vrint_vd_vd(vdouble vd) { return vcast_vd_vi(vrint_vi_vd(vd)); } static INLINE vopmask veq64_vo_vm_vm(vmask x, vmask y) { vmask t = _mm_cmpeq_epi32(x, y); return vand_vm_vm_vm(t, _mm_shuffle_epi32(t, 0xb1)); } #endif static INLINE vmask vadd64_vm_vm_vm(vmask x, vmask y) { return _mm_add_epi64(x, y); } static INLINE vmask vcast_vm_i_i(int i0, int i1) { return _mm_set_epi32(i0, i1, i0, i1); } // static INLINE vdouble vcast_vd_d(double d) { return _mm_set1_pd(d); } static INLINE vmask vreinterpret_vm_vd(vdouble vd) { return _mm_castpd_si128(vd); } static INLINE vint2 vreinterpret_vi2_vd(vdouble vd) { return _mm_castpd_si128(vd); } static INLINE vdouble vreinterpret_vd_vi2(vint2 vi) { return _mm_castsi128_pd(vi); } static INLINE vdouble vreinterpret_vd_vm(vmask vm) { return _mm_castsi128_pd(vm); } static INLINE vdouble vadd_vd_vd_vd(vdouble x, vdouble y) { return _mm_add_pd(x, y); } static INLINE vdouble vsub_vd_vd_vd(vdouble x, vdouble y) { return _mm_sub_pd(x, y); } static INLINE vdouble vmul_vd_vd_vd(vdouble x, vdouble y) { return _mm_mul_pd(x, y); } static INLINE vdouble vdiv_vd_vd_vd(vdouble x, vdouble y) { return _mm_div_pd(x, y); } static INLINE vdouble vrec_vd_vd(vdouble x) { return _mm_div_pd(_mm_set1_pd(1), x); } static INLINE vdouble vsqrt_vd_vd(vdouble x) { return _mm_sqrt_pd(x); } static INLINE vdouble vabs_vd_vd(vdouble d) { return _mm_andnot_pd(_mm_set1_pd(-0.0), d); } static INLINE vdouble vneg_vd_vd(vdouble d) { return _mm_xor_pd(_mm_set1_pd(-0.0), d); } static 
INLINE vdouble vmla_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return vadd_vd_vd_vd(vmul_vd_vd_vd(x, y), z); } static INLINE vdouble vmlapn_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return vsub_vd_vd_vd(vmul_vd_vd_vd(x, y), z); } static INLINE vdouble vmlanp_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return vsub_vd_vd_vd(z, vmul_vd_vd_vd(x, y)); } static INLINE vdouble vmax_vd_vd_vd(vdouble x, vdouble y) { return _mm_max_pd(x, y); } static INLINE vdouble vmin_vd_vd_vd(vdouble x, vdouble y) { return _mm_min_pd(x, y); } static INLINE vopmask veq_vo_vd_vd(vdouble x, vdouble y) { return _mm_castpd_si128(_mm_cmpeq_pd(x, y)); } static INLINE vopmask vneq_vo_vd_vd(vdouble x, vdouble y) { return _mm_castpd_si128(_mm_cmpneq_pd(x, y)); } static INLINE vopmask vlt_vo_vd_vd(vdouble x, vdouble y) { return _mm_castpd_si128(_mm_cmplt_pd(x, y)); } static INLINE vopmask vle_vo_vd_vd(vdouble x, vdouble y) { return _mm_castpd_si128(_mm_cmple_pd(x, y)); } static INLINE vopmask vgt_vo_vd_vd(vdouble x, vdouble y) { return _mm_castpd_si128(_mm_cmpgt_pd(x, y)); } static INLINE vopmask vge_vo_vd_vd(vdouble x, vdouble y) { return _mm_castpd_si128(_mm_cmpge_pd(x, y)); } static INLINE vint vadd_vi_vi_vi(vint x, vint y) { return _mm_add_epi32(x, y); } static INLINE vint vsub_vi_vi_vi(vint x, vint y) { return _mm_sub_epi32(x, y); } static INLINE vint vneg_vi_vi(vint e) { return vsub_vi_vi_vi(vcast_vi_i(0), e); } static INLINE vint vand_vi_vi_vi(vint x, vint y) { return _mm_and_si128(x, y); } static INLINE vint vandnot_vi_vi_vi(vint x, vint y) { return _mm_andnot_si128(x, y); } static INLINE vint vor_vi_vi_vi(vint x, vint y) { return _mm_or_si128(x, y); } static INLINE vint vxor_vi_vi_vi(vint x, vint y) { return _mm_xor_si128(x, y); } static INLINE vint vand_vi_vo_vi(vopmask x, vint y) { return _mm_and_si128(x, y); } static INLINE vint vandnot_vi_vo_vi(vopmask x, vint y) { return _mm_andnot_si128(x, y); } static INLINE vint vsll_vi_vi_i(vint x, int c) { return _mm_slli_epi32(x, c); 
} // closes a definition that begins before this chunk

/* 32-bit integer shifts and comparisons (SSE2). */
static INLINE vint vsrl_vi_vi_i(vint x, int c) { return _mm_srli_epi32(x, c); } // logical shift right
static INLINE vint vsra_vi_vi_i(vint x, int c) { return _mm_srai_epi32(x, c); } // arithmetic shift right
static INLINE vint veq_vi_vi_vi(vint x, vint y) { return _mm_cmpeq_epi32(x, y); }
static INLINE vint vgt_vi_vi_vi(vint x, vint y) { return _mm_cmpgt_epi32(x, y); }
static INLINE vopmask veq_vo_vi_vi(vint x, vint y) { return _mm_cmpeq_epi32(x, y); }
static INLINE vopmask vgt_vo_vi_vi(vint x, vint y) { return _mm_cmpgt_epi32(x, y); }

#if CONFIG == 4
/* SSE4.1 available: hardware blend.  Note the operand order:
   _mm_blendv_* picks from its SECOND operand where the mask is set,
   hence (y, x, m) selects x where m is true. */
static INLINE vint vsel_vi_vo_vi_vi(vopmask m, vint x, vint y) { return _mm_blendv_epi8(y, x, m); }
static INLINE vdouble vsel_vd_vo_vd_vd(vopmask m, vdouble x, vdouble y) { return _mm_blendv_pd(y, x, _mm_castsi128_pd(m)); }
#else
/* Plain SSE2: emulate select as (m & x) | (~m & y). */
static INLINE vint vsel_vi_vo_vi_vi(vopmask m, vint x, vint y) { return vor_vm_vm_vm(vand_vm_vm_vm(m, x), vandnot_vm_vm_vm(m, y)); }
static INLINE vdouble vsel_vd_vo_vd_vd(vopmask opmask, vdouble x, vdouble y) {
  return _mm_or_pd(_mm_and_pd(_mm_castsi128_pd(opmask), x),
                   _mm_andnot_pd(_mm_castsi128_pd(opmask), y));
}
#endif

/* Selects between scalar constants, cascading one mask at a time. */
static INLINE CONST vdouble vsel_vd_vo_d_d(vopmask o, double v1, double v0) { return vsel_vd_vo_vd_vd(o, vcast_vd_d(v1), vcast_vd_d(v0)); }

static INLINE vdouble vsel_vd_vo_vo_d_d_d(vopmask o0, vopmask o1, double d0, double d1, double d2) {
  return vsel_vd_vo_vd_vd(o0, vcast_vd_d(d0), vsel_vd_vo_d_d(o1, d1, d2));
}

static INLINE vdouble vsel_vd_vo_vo_vo_d_d_d_d(vopmask o0, vopmask o1, vopmask o2, double d0, double d1, double d2, double d3) {
  return vsel_vd_vo_vd_vd(o0, vcast_vd_d(d0), vsel_vd_vo_vd_vd(o1, vcast_vd_d(d1), vsel_vd_vo_d_d(o2, d2, d3)));
}

/* FP classification (double). */
static INLINE vopmask visinf_vo_vd(vdouble d) { return vreinterpret_vm_vd(_mm_cmpeq_pd(vabs_vd_vd(d), _mm_set1_pd(SLEEF_INFINITY))); }
static INLINE vopmask vispinf_vo_vd(vdouble d) { return vreinterpret_vm_vd(_mm_cmpeq_pd(d, _mm_set1_pd(SLEEF_INFINITY))); }
static INLINE vopmask visminf_vo_vd(vdouble d) { return vreinterpret_vm_vd(_mm_cmpeq_pd(d, _mm_set1_pd(-SLEEF_INFINITY))); }
static INLINE vopmask visnan_vo_vd(vdouble d) { return vreinterpret_vm_vd(_mm_cmpneq_pd(d, d)); } // NaN != NaN

//

/* Double-precision loads/stores (aligned and unaligned variants). */
static INLINE vdouble vload_vd_p(const double *ptr) { return _mm_load_pd(ptr); }
static INLINE vdouble vloadu_vd_p(const double *ptr) { return _mm_loadu_pd(ptr); }
static INLINE void vstore_v_p_vd(double *ptr, vdouble v) { _mm_store_pd(ptr, v); }
static INLINE void vstoreu_v_p_vd(double *ptr, vdouble v) { _mm_storeu_pd(ptr, v); }

/* Emulated gather: spill the indices to a stack array, then do scalar loads. */
static INLINE vdouble vgather_vd_p_vi(const double *ptr, vint vi) { int a[sizeof(vint)/sizeof(int)]; vstoreu_v_p_vi(a, vi); return _mm_set_pd(ptr[a[1]], ptr[a[0]]); }

// This function is for debugging
static INLINE double vcast_d_vd(vdouble v) { double a[VECTLENDP]; vstoreu_v_p_vd(a, v); return a[0]; }

//

/* vint2 and vmask are the same 128-bit register type here, so these are identity casts. */
static INLINE vint2 vcast_vi2_vm(vmask vm) { return vm; }
static INLINE vmask vcast_vm_vi2(vint2 vi) { return vi; }

/* float <-> int32 conversions and broadcasts. */
static INLINE vint2 vrint_vi2_vf(vfloat vf) { return _mm_cvtps_epi32(vf); }      // round to nearest
static INLINE vint2 vtruncate_vi2_vf(vfloat vf) { return _mm_cvttps_epi32(vf); } // truncate toward zero
static INLINE vfloat vcast_vf_vi2(vint2 vi) { return _mm_cvtepi32_ps(vcast_vm_vi2(vi)); }
static INLINE vfloat vcast_vf_f(float f) { return _mm_set1_ps(f); }
static INLINE vint2 vcast_vi2_i(int i) { return _mm_set1_epi32(i); }

/* Bit-level reinterpret casts (no value change). */
static INLINE vmask vreinterpret_vm_vf(vfloat vf) { return _mm_castps_si128(vf); }
static INLINE vfloat vreinterpret_vf_vm(vmask vm) { return _mm_castsi128_ps(vm); }
static INLINE vfloat vreinterpret_vf_vi2(vint2 vm) { return _mm_castsi128_ps(vm); }
static INLINE vint2 vreinterpret_vi2_vf(vfloat vf) { return _mm_castps_si128(vf); }

#if CONFIG != 4
/* No SSE4.1 round instructions: round by converting through int32.
   NOTE(review): this is only exact while the value fits in int32 —
   callers are presumably expected to keep arguments in range. */
static INLINE vfloat vtruncate_vf_vf(vfloat vd) { return vcast_vf_vi2(vtruncate_vi2_vf(vd)); }
static INLINE vfloat vrint_vf_vf(vfloat vf) { return vcast_vf_vi2(vrint_vi2_vf(vf)); }
#endif

/* Single-precision arithmetic. */
static INLINE vfloat vadd_vf_vf_vf(vfloat x, vfloat y) { return _mm_add_ps(x, y); }
static INLINE vfloat vsub_vf_vf_vf(vfloat x, vfloat y) { return _mm_sub_ps(x, y); }
static INLINE vfloat vmul_vf_vf_vf(vfloat x, vfloat y) { return _mm_mul_ps(x, y); }
static INLINE vfloat vdiv_vf_vf_vf(vfloat x, vfloat y) { return _mm_div_ps(x, y); }
static INLINE vfloat vrec_vf_vf(vfloat x) { return vdiv_vf_vf_vf(vcast_vf_f(1.0f), x); }
static INLINE vfloat vsqrt_vf_vf(vfloat x) { return _mm_sqrt_ps(x); }

/* |f| clears the sign bit; -d flips it. */
static INLINE vfloat vabs_vf_vf(vfloat f) { return vreinterpret_vf_vm(vandnot_vm_vm_vm(vreinterpret_vm_vf(vcast_vf_f(-0.0f)), vreinterpret_vm_vf(f))); }
static INLINE vfloat vneg_vf_vf(vfloat d) { return vreinterpret_vf_vm(vxor_vm_vm_vm(vreinterpret_vm_vf(vcast_vf_f(-0.0f)), vreinterpret_vm_vf(d))); }

/* Unfused multiply-add variants (two roundings). */
static INLINE vfloat vmla_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return vadd_vf_vf_vf(vmul_vf_vf_vf(x, y), z); }   // x*y + z
static INLINE vfloat vmlapn_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return vsub_vf_vf_vf(vmul_vf_vf_vf(x, y), z); } // x*y - z
static INLINE vfloat vmlanp_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return vsub_vf_vf_vf(z, vmul_vf_vf_vf(x, y)); } // z - x*y

static INLINE vfloat vmax_vf_vf_vf(vfloat x, vfloat y) { return _mm_max_ps(x, y); }
static INLINE vfloat vmin_vf_vf_vf(vfloat x, vfloat y) { return _mm_min_ps(x, y); }

/* Single-precision comparisons, returned as all-ones/all-zeros lane masks. */
static INLINE vopmask veq_vo_vf_vf(vfloat x, vfloat y) { return vreinterpret_vm_vf(_mm_cmpeq_ps(x, y)); }
static INLINE vopmask vneq_vo_vf_vf(vfloat x, vfloat y) { return vreinterpret_vm_vf(_mm_cmpneq_ps(x, y)); }
static INLINE vopmask vlt_vo_vf_vf(vfloat x, vfloat y) { return vreinterpret_vm_vf(_mm_cmplt_ps(x, y)); }
static INLINE vopmask vle_vo_vf_vf(vfloat x, vfloat y) { return vreinterpret_vm_vf(_mm_cmple_ps(x, y)); }
static INLINE vopmask vgt_vo_vf_vf(vfloat x, vfloat y) { return vreinterpret_vm_vf(_mm_cmpgt_ps(x, y)); }
static INLINE vopmask vge_vo_vf_vf(vfloat x, vfloat y) { return vreinterpret_vm_vf(_mm_cmpge_ps(x, y)); }

/* vint2 ops forward to the vint ops (same 128-bit register type). */
static INLINE vint2 vadd_vi2_vi2_vi2(vint2 x, vint2 y) { return vadd_vi_vi_vi(x, y); }
static INLINE vint2 vsub_vi2_vi2_vi2(vint2 x, vint2 y) { return vsub_vi_vi_vi(x, y); }
static INLINE vint2 vneg_vi2_vi2(vint2 e) { return vsub_vi2_vi2_vi2(vcast_vi2_i(0), e); }
static INLINE vint2 vand_vi2_vi2_vi2(vint2 x, vint2 y) { return vand_vi_vi_vi(x, y); }
static INLINE vint2 vandnot_vi2_vi2_vi2(vint2 x, vint2 y) { return vandnot_vi_vi_vi(x, y); }
static INLINE vint2 vor_vi2_vi2_vi2(vint2 x, vint2 y) { return vor_vi_vi_vi(x, y); }
static INLINE vint2 vxor_vi2_vi2_vi2(vint2 x, vint2 y) { return vxor_vi_vi_vi(x, y); }
static INLINE vint2 vand_vi2_vo_vi2(vopmask x, vint2 y) { return vand_vi_vo_vi(x, y); }
static INLINE vint2 vandnot_vi2_vo_vi2(vopmask x, vint2 y) { return vandnot_vi_vo_vi(x, y); }
static INLINE vint2 vsll_vi2_vi2_i(vint2 x, int c) { return vsll_vi_vi_i(x, c); }
static INLINE vint2 vsrl_vi2_vi2_i(vint2 x, int c) { return vsrl_vi_vi_i(x, c); }
static INLINE vint2 vsra_vi2_vi2_i(vint2 x, int c) { return vsra_vi_vi_i(x, c); }
static INLINE vopmask veq_vo_vi2_vi2(vint2 x, vint2 y) { return _mm_cmpeq_epi32(x, y); }
static INLINE vopmask vgt_vo_vi2_vi2(vint2 x, vint2 y) { return _mm_cmpgt_epi32(x, y); }
static INLINE vint2 veq_vi2_vi2_vi2(vint2 x, vint2 y) { return _mm_cmpeq_epi32(x, y); }
static INLINE vint2 vgt_vi2_vi2_vi2(vint2 x, vint2 y) { return _mm_cmpgt_epi32(x, y); }

#if CONFIG == 4
/* SSE4.1 blends again; see vsel_vi_vo_vi_vi above for the operand order. */
static INLINE vint2 vsel_vi2_vo_vi2_vi2(vopmask m, vint2 x, vint2 y) { return _mm_blendv_epi8(y, x, m); }
static INLINE vfloat vsel_vf_vo_vf_vf(vopmask m, vfloat x, vfloat y) { return _mm_blendv_ps(y, x, _mm_castsi128_ps(m)); }
#else
static INLINE vint2 vsel_vi2_vo_vi2_vi2(vopmask m, vint2 x, vint2 y) { return vor_vi2_vi2_vi2(vand_vi2_vi2_vi2(m, x), vandnot_vi2_vi2_vi2(m, y)); }
static INLINE vfloat vsel_vf_vo_vf_vf(vopmask opmask, vfloat x, vfloat y) {
  return _mm_or_ps(_mm_and_ps(_mm_castsi128_ps(opmask), x),
                   _mm_andnot_ps(_mm_castsi128_ps(opmask), y));
}
#endif

/* Selects between scalar float constants, cascading one mask at a time. */
static INLINE CONST vfloat vsel_vf_vo_f_f(vopmask o, float v1, float v0) { return vsel_vf_vo_vf_vf(o, vcast_vf_f(v1), vcast_vf_f(v0)); }

static INLINE vfloat vsel_vf_vo_vo_f_f_f(vopmask o0, vopmask o1, float d0, float d1, float d2) {
  return vsel_vf_vo_vf_vf(o0, vcast_vf_f(d0), vsel_vf_vo_f_f(o1, d1, d2));
}

static INLINE vfloat vsel_vf_vo_vo_vo_f_f_f_f(vopmask o0, vopmask o1, vopmask o2, float d0, float d1, float d2, float d3) {
  return vsel_vf_vo_vf_vf(o0, vcast_vf_f(d0), vsel_vf_vo_vf_vf(o1, vcast_vf_f(d1), vsel_vf_vo_f_f(o2, d2, d3)));
}

/* FP classification (float). */
static INLINE vopmask visinf_vo_vf(vfloat d) { return veq_vo_vf_vf(vabs_vf_vf(d), vcast_vf_f(SLEEF_INFINITYf)); }
static INLINE vopmask vispinf_vo_vf(vfloat d) { return veq_vo_vf_vf(d, vcast_vf_f(SLEEF_INFINITYf)); }
static INLINE vopmask visminf_vo_vf(vfloat d) { return veq_vo_vf_vf(d, vcast_vf_f(-SLEEF_INFINITYf)); }
static INLINE vopmask visnan_vo_vf(vfloat d) { return vneq_vo_vf_vf(d, d); } // NaN != NaN

/* Single-precision loads/stores. */
static INLINE vfloat vload_vf_p(const float *ptr) { return _mm_load_ps(ptr); }
static INLINE vfloat vloadu_vf_p(const float *ptr) { return _mm_loadu_ps(ptr); }
static INLINE void vstore_v_p_vf(float *ptr, vfloat v) { _mm_store_ps(ptr, v); }
static INLINE void vstoreu_v_p_vf(float *ptr, vfloat v) { _mm_storeu_ps(ptr, v); }

/* Emulated gather, as with the double version above. */
static INLINE vfloat vgather_vf_p_vi2(const float *ptr, vint2 vi) { int a[VECTLENSP]; vstoreu_v_p_vi2(a, vi); return _mm_set_ps(ptr[a[3]], ptr[a[2]], ptr[a[1]], ptr[a[0]]); }

// This function is for debugging
static INLINE float vcast_f_vf(vfloat v) { float a[VECTLENSP]; vstoreu_v_p_vf(a, v); return a[0]; }

//

/* Alternating sign-bit patterns used by vposneg/vnegpos (GCC vector literals). */
#define PNMASK ((vdouble) { +0.0, -0.0 })
#define NPMASK ((vdouble) { -0.0, +0.0 })
#define PNMASKf ((vfloat) { +0.0f, -0.0f, +0.0f, -0.0f })
#define NPMASKf ((vfloat) { -0.0f, +0.0f, -0.0f, +0.0f })

/* XOR the alternating sign patterns in: posneg negates odd lanes, negpos even lanes. */
static INLINE vdouble vposneg_vd_vd(vdouble d) { return vreinterpret_vd_vm(vxor_vm_vm_vm(vreinterpret_vm_vd(d), vreinterpret_vm_vd(PNMASK))); }
static INLINE vdouble vnegpos_vd_vd(vdouble d) { return vreinterpret_vd_vm(vxor_vm_vm_vm(vreinterpret_vm_vd(d), vreinterpret_vm_vd(NPMASK))); }
static INLINE vfloat vposneg_vf_vf(vfloat d) { return vreinterpret_vf_vm(vxor_vm_vm_vm(vreinterpret_vm_vf(d), vreinterpret_vm_vf(PNMASKf))); }
static INLINE vfloat vnegpos_vf_vf(vfloat d) { return vreinterpret_vf_vm(vxor_vm_vm_vm(vreinterpret_vm_vf(d), vreinterpret_vm_vf(NPMASKf))); }

#if CONFIG >= 3
/* SSE3 available: hardware addsub (subtract in even lanes, add in odd lanes). */
static INLINE vdouble vsubadd_vd_vd_vd(vdouble x, vdouble y) { return _mm_addsub_pd(x, y); }
static INLINE vfloat vsubadd_vf_vf_vf(vfloat x, vfloat y) { return _mm_addsub_ps(x, y); }
#else
static INLINE vdouble vsubadd_vd_vd_vd(vdouble x, vdouble y) { return vadd_vd_vd_vd(x, vnegpos_vd_vd(y)); }
static INLINE vfloat vsubadd_vf_vf_vf(vfloat x, vfloat y) { return vadd_vf_vf_vf(x, vnegpos_vf_vf(y)); }
#endif

static INLINE vdouble vmlsubadd_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return vsubadd_vd_vd_vd(vmul_vd_vd_vd(x, y), z); }
static INLINE vfloat vmlsubadd_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return vsubadd_vf_vf_vf(vmul_vf_vf_vf(x, y), z); }

/* Lane permutations. */
static INLINE vdouble vrev21_vd_vd(vdouble d0) { return _mm_shuffle_pd(d0, d0, 1); } // swap the two lanes
static INLINE vdouble vreva2_vd_vd(vdouble vd) { return vd; }                        // identity for a 2-lane vector

static INLINE void vstream_v_p_vd(double *ptr, vdouble v) { _mm_stream_pd(ptr, v); } // non-temporal store

/* Scatter of one 2-lane double vector; step is unused with a single vector. */
static INLINE void vscatter2_v_p_i_i_vd(double *ptr, int offset, int step, vdouble v) { vstore_v_p_vd((double *)(&ptr[2*offset]), v); }
static INLINE void vsscatter2_v_p_i_i_vd(double *ptr, int offset, int step, vdouble v) { _mm_stream_pd((double *)(&ptr[2*offset]), v); } // streaming variant

//

static INLINE vfloat vrev21_vf_vf(vfloat d0) { return _mm_shuffle_ps(d0, d0, (2 << 6) | (3 << 4) | (0 << 2) | (1 << 0)); } // swap within each pair
static INLINE vfloat vreva2_vf_vf(vfloat d0) { return _mm_shuffle_ps(d0, d0, (1 << 6) | (0 << 4) | (3 << 2) | (2 << 0)); } // swap the two pairs
static INLINE vint2 vrev21_vi2_vi2(vint2 i) { return vreinterpret_vi2_vf(vrev21_vf_vf(vreinterpret_vf_vi2(i))); }

static INLINE void vstream_v_p_vf(float *ptr, vfloat v) { _mm_stream_ps(ptr, v); }

/* Scatter float pairs: low pair via storel, high pair via storeh. */
static INLINE void vscatter2_v_p_i_i_vf(float *ptr, int offset, int step, vfloat v) {
  _mm_storel_pd((double *)(ptr+(offset + step * 0)*2), vreinterpret_vd_vm(vreinterpret_vm_vf(v)));
  _mm_storeh_pd((double *)(ptr+(offset + step * 1)*2), vreinterpret_vd_vm(vreinterpret_vm_vf(v)));
}

static INLINE void vsscatter2_v_p_i_i_vf(float *ptr, int offset, int step, vfloat v) {
  _mm_storel_pd((double *)(ptr+(offset + step * 0)*2), vreinterpret_vd_vm(vreinterpret_vm_vf(v)));
  _mm_storeh_pd((double *)(ptr+(offset + step * 1)*2), vreinterpret_vd_vm(vreinterpret_vm_vf(v)));
}

//

/* Quad-precision support: (un)interleave the two 64-bit halves of a vmask2. */
static INLINE vmask2 vinterleave_vm2_vm2(vmask2 v) { return (vmask2) { _mm_unpacklo_epi64(v.x, v.y), _mm_unpackhi_epi64(v.x, v.y) }; }
static INLINE vmask2 vuninterleave_vm2_vm2(vmask2 v) { return (vmask2) { _mm_unpacklo_epi64(v.x, v.y), _mm_unpackhi_epi64(v.x, v.y) }; }
static INLINE vint vuninterleave_vi_vi(vint v) { return v; }
static INLINE vdouble vinterleave_vd_vd(vdouble vd) { return vd; }
static INLINE vdouble vuninterleave_vd_vd(vdouble vd) { return vd; }
static INLINE vmask vinterleave_vm_vm(vmask vm) { return vm; }
static INLINE vmask vuninterleave_vm_vm(vmask vm) { return vm; }

static vmask2 vloadu_vm2_p(void *p) { vmask2 vm2; memcpy(&vm2, p, VECTLENDP * 16); return vm2; }

#if !defined(SLEEF_GENHEADER)
typedef Sleef_quad2 vargquad;

/* Convert between the public quad argument type and the internal vmask2 layout. */
static INLINE vmask2 vcast_vm2_aq(vargquad aq) { return vinterleave_vm2_vm2(vloadu_vm2_p(&aq)); }

static INLINE vargquad vcast_aq_vm2(vmask2 vm2) {
  vm2 = vuninterleave_vm2_vm2(vm2);
  vargquad aq;
  memcpy(&aq, &vm2, VECTLENDP * 16);
  return aq;
}
#endif // #if !defined(SLEEF_GENHEADER)

static INLINE int vtestallzeros_i_vo64(vopmask g) { return _mm_movemask_epi8(g) == 0; }

static INLINE vmask vsel_vm_vo64_vm_vm(vopmask o, vmask x, vmask y) { return vor_vm_vm_vm(vand_vm_vm_vm(o, x), vandnot_vm_vm_vm(o, y)); }

/* 64-bit integer arithmetic on the mask type. */
static INLINE vmask vsub64_vm_vm_vm(vmask x, vmask y) { return _mm_sub_epi64(x, y); }
static INLINE vmask vneg64_vm_vm(vmask x) { return _mm_sub_epi64(vcast_vm_i_i(0, 0), x); }

/* The //@ copies below are directives consumed by SLEEF's header
   generator (SLEEF_GENHEADER); keep them in sync with the #define lines. */
#define vsll64_vm_vm_i(x, c) _mm_slli_epi64(x, c)
#define vsrl64_vm_vm_i(x, c) _mm_srli_epi64(x, c)
//@#define vsll64_vm_vm_i(x, c) _mm_slli_epi64(x, c)
//@#define vsrl64_vm_vm_i(x, c) _mm_srli_epi64(x, c)

static INLINE vopmask
vgt64_vo_vm_vm(vmask x, vmask y) { int64_t ax[2], ay[2]; _mm_storeu_si128((__m128i *)ax, x); _mm_storeu_si128((__m128i *)ay, y); return _mm_set_epi64x(ax[1] > ay[1] ? -1 : 0, ax[0] > ay[0] ? -1 : 0); } static INLINE vmask vcast_vm_vi(vint vi) { vmask m = _mm_and_si128(_mm_shuffle_epi32(vi, (0 << 6) | (1 << 4) | (0 << 2) | (0 << 0)), _mm_set_epi32(0, -1, 0, -1)); return vor_vm_vm_vm(vcastu_vi2_vi(vgt_vo_vi_vi(vcast_vi_i(0), vi)), m); } static INLINE vint vcast_vi_vm(vmask vm) { return _mm_shuffle_epi32(vm, 0x08); } ================================================ FILE: src/helpersve.h ================================================ /*********************************************************************/ /* Copyright ARM Ltd. 2010 - 2019. */ /* Distributed under the Boost Software License, Version 1.0. */ /* (See accompanying file LICENSE.txt or copy at */ /* http://www.boost.org/LICENSE_1_0.txt) */ /*********************************************************************/ #if !defined(__ARM_FEATURE_SVE) && !defined(SLEEF_GENHEADER) #error Please specify SVE flags. 
#endif

#if !defined(SLEEF_GENHEADER)
/* NOTE(review): the two header names below are missing — this looks like
   an extraction artifact (angle-bracketed include names stripped);
   restore the original system headers before building. */
#include
#include
#include "misc.h"
#endif // #if !defined(SLEEF_GENHEADER)

#if defined(VECTLENDP) || defined(VECTLENSP)
#error VECTLENDP or VECTLENSP already defined
#endif

/* CONFIG selects the SVE flavor: 1/2 are vector-length agnostic
   (2 disables FMA below); 8-11 are fixed-width builds for the DFT.
   The //@ copies are directives for SLEEF's header generator. */
#if CONFIG == 1 || CONFIG == 2
// Vector length agnostic
#define VECTLENSP (svcntw())
//@#define VECTLENSP (svcntw())
#define VECTLENDP (svcntd())
//@#define VECTLENDP (svcntd())
#define ISANAME "AArch64 SVE"
#define ptrue svptrue_b8()
//@#define ptrue svptrue_b8()
#elif CONFIG == 8
// 256-bit vector length
#define ISANAME "AArch64 SVE 256-bit"
#define LOG2VECTLENDP 2
#define ptrue svptrue_pat_b8(SV_VL32)
#define DFTPRIORITY 20
#elif CONFIG == 9
// 512-bit vector length
#define ISANAME "AArch64 SVE 512-bit"
#define LOG2VECTLENDP 3
#define ptrue svptrue_pat_b8(SV_VL64)
#define DFTPRIORITY 21
#elif CONFIG == 10
// 1024-bit vector length
#define ISANAME "AArch64 SVE 1024-bit"
#define LOG2VECTLENDP 4
#define ptrue svptrue_pat_b8(SV_VL128)
#define DFTPRIORITY 22
#elif CONFIG == 11
// 2048-bit vector length
#define ISANAME "AArch64 SVE 2048-bit"
#define LOG2VECTLENDP 5
#define ptrue svptrue_pat_b8(SV_VL256)
#define DFTPRIORITY 23
#else
#error CONFIG macro invalid or not defined
#endif

#ifdef LOG2VECTLENDP
// For DFT, VECTLENDP and VECTLENSP are not the size of the available
// vector length, but the size of the partial vectors utilized in the
// computation. The appropriate VECTLENDP and VECTLENSP are chosen by
// the dispatcher according to the value of svcntd().
#define LOG2VECTLENSP (LOG2VECTLENDP+1)
#define VECTLENDP (1 << LOG2VECTLENDP)
#define VECTLENSP (1 << LOG2VECTLENSP)

/* Fixed-width build is usable (returns 3) only when the hardware
   vector is at least as wide as the compiled-in width. */
static INLINE int vavailability_i(int name) { return svcntd() >= VECTLENDP ? 3 : 0; }
#else
static INLINE int vavailability_i(int name) { return 3; }
#endif

#define ENABLE_SP
//@#define ENABLE_SP
#define ENABLE_DP
//@#define ENABLE_DP

#if CONFIG != 2
#define ENABLE_FMA_SP
//@#define ENABLE_FMA_SP
#define ENABLE_FMA_DP
//@#define ENABLE_FMA_DP
//#define SPLIT_KERNEL // Benchmark comparison is needed to determine whether this option should be enabled.
#endif

#define FULL_FP_ROUNDING
//@#define FULL_FP_ROUNDING
#define ACCURATE_SQRT
//@#define ACCURATE_SQRT

// Type definitions

// Mask definition
typedef svint32_t vmask;
typedef svbool_t vopmask;

// Single precision definitions
typedef svfloat32_t vfloat;
typedef svint32_t vint2;

// Double precision definitions
typedef svfloat64_t vdouble;
typedef svint32_t vint;

// Double-double data type with setter/getter functions
typedef svfloat64x2_t vdouble2;
static INLINE vdouble vd2getx_vd_vd2(vdouble2 v) { return svget2_f64(v, 0); }
static INLINE vdouble vd2gety_vd_vd2(vdouble2 v) { return svget2_f64(v, 1); }
static INLINE vdouble2 vd2setxy_vd2_vd_vd(vdouble x, vdouble y) { return svcreate2_f64(x, y); }
static INLINE vdouble2 vd2setx_vd2_vd2_vd(vdouble2 v, vdouble d) { return svset2_f64(v, 0, d); }
static INLINE vdouble2 vd2sety_vd2_vd2_vd(vdouble2 v, vdouble d) { return svset2_f64(v, 1, d); }

// Double-float data type with setter/getter functions
typedef svfloat32x2_t vfloat2;
static INLINE vfloat vf2getx_vf_vf2(vfloat2 v) { return svget2_f32(v, 0); }
static INLINE vfloat vf2gety_vf_vf2(vfloat2 v) { return svget2_f32(v, 1); }
static INLINE vfloat2 vf2setxy_vf2_vf_vf(vfloat x, vfloat y) { return svcreate2_f32(x, y); }
static INLINE vfloat2 vf2setx_vf2_vf2_vf(vfloat2 v, vfloat d) { return svset2_f32(v, 0, d); }
static INLINE vfloat2 vf2sety_vf2_vf2_vf(vfloat2 v, vfloat d) { return svset2_f32(v, 1, d); }

// vmask2 is mainly used for quad-precision functions
typedef svint32x2_t vmask2;
static INLINE vmask vm2getx_vm_vm2(vmask2 v) { return svget2_s32(v, 0); }
static INLINE vmask vm2gety_vm_vm2(vmask2 v) { return svget2_s32(v, 1); }
static INLINE vmask2 vm2setxy_vm2_vm_vm(vmask x, vmask y) { return svcreate2_s32(x, y); }
static INLINE vmask2 vm2setx_vm2_vm2_vm(vmask2 v, vmask x) { return svset2_s32(v, 0, x); }
static INLINE vmask2 vm2sety_vm2_vm2_vm(vmask2 v, vmask y) { return svset2_s32(v, 1, y); }

// Auxiliary data types

/* di_t: a double vector paired with an int vector; the int vector is
   stored bit-cast into the second f64 slot of the tuple. */
typedef svfloat64x2_t di_t;
static INLINE vdouble digetd_vd_di(di_t d) { return svget2_f64(d, 0); }
static INLINE vint digeti_vi_di(di_t d) { return svreinterpret_s32_f64(svget2_f64(d, 1)); }
static INLINE di_t disetdi_di_vd_vi(vdouble d, vint i) { return svcreate2_f64(d, svreinterpret_f64_s32(i)); }

//

/* fi_t: float vector + int vector, same bit-cast layout. */
typedef svfloat32x2_t fi_t;
static INLINE vfloat figetd_vf_di(fi_t d) { return svget2_f32(d, 0); }
static INLINE vint2 figeti_vi2_di(fi_t d) { return svreinterpret_s32_f32(svget2_f32(d, 1)); }
static INLINE fi_t fisetdi_fi_vf_vi2(vfloat d, vint2 i) { return svcreate2_f32(d, svreinterpret_f32_s32(i)); }

//

/* ddi_t: double-double value + int vector (int bit-cast into slot 2). */
typedef svfloat64x3_t ddi_t;
static INLINE vdouble2 ddigetdd_vd2_ddi(ddi_t d) { return svcreate2_f64(svget3_f64(d, 0), svget3_f64(d, 1)); }
static INLINE vint ddigeti_vi_ddi(ddi_t d) { return svreinterpret_s32_f64(svget3_f64(d, 2)); }
static INLINE ddi_t ddisetddi_ddi_vd2_vi(vdouble2 v, vint i) { return svcreate3_f64(svget2_f64(v, 0), svget2_f64(v, 1), svreinterpret_f64_s32(i)); }
static INLINE ddi_t ddisetdd_ddi_ddi_vd2(ddi_t ddi, vdouble2 v) { return svcreate3_f64(svget2_f64(v, 0), svget2_f64(v, 1), svget3_f64(ddi, 2)); }

//

/* dfi_t: double-float value + int vector (int bit-cast into slot 2). */
typedef svfloat32x3_t dfi_t;
static INLINE vfloat2 dfigetdf_vf2_dfi(dfi_t d) { return svcreate2_f32(svget3_f32(d, 0), svget3_f32(d, 1)); }
static INLINE vint2 dfigeti_vi2_dfi(dfi_t d) { return svreinterpret_s32_f32(svget3_f32(d, 2)); }
static INLINE dfi_t dfisetdfi_dfi_vf2_vi2(vfloat2 v, vint2 i) { return svcreate3_f32(svget2_f32(v, 0), svget2_f32(v, 1), svreinterpret_f32_s32(i)); }
static INLINE dfi_t dfisetdf_dfi_dfi_vf2(dfi_t dfi, vfloat2 v) { return svcreate3_f32(svget2_f32(v, 0), svget2_f32(v, 1), svget3_f32(dfi, 2)); }
// typedef svfloat64x4_t dd2; static INLINE dd2 dd2setab_dd2_vd2_vd2(vdouble2 a, vdouble2 b) { return svcreate4_f64(svget2_f64(a, 0), svget2_f64(a, 1), svget2_f64(b, 0), svget2_f64(b, 1)); } static INLINE vdouble2 dd2geta_vd2_dd2(dd2 d) { return svcreate2_f64(svget4_f64(d, 0), svget4_f64(d, 1)); } static INLINE vdouble2 dd2getb_vd2_dd2(dd2 d) { return svcreate2_f64(svget4_f64(d, 2), svget4_f64(d, 3)); } // typedef svfloat32x4_t df2; static INLINE df2 df2setab_df2_vf2_vf2(vfloat2 a, vfloat2 b) { return svcreate4_f32(svget2_f32(a, 0), svget2_f32(a, 1), svget2_f32(b, 0), svget2_f32(b, 1)); } static INLINE vfloat2 df2geta_vf2_df2(df2 d) { return svcreate2_f32(svget4_f32(d, 0), svget4_f32(d, 1)); } static INLINE vfloat2 df2getb_vf2_df2(df2 d) { return svcreate2_f32(svget4_f32(d, 2), svget4_f32(d, 3)); } // typedef svfloat64x3_t vdouble3; static INLINE vdouble vd3getx_vd_vd3(vdouble3 v) { return svget3_f64(v, 0); } static INLINE vdouble vd3gety_vd_vd3(vdouble3 v) { return svget3_f64(v, 1); } static INLINE vdouble vd3getz_vd_vd3(vdouble3 v) { return svget3_f64(v, 2); } static INLINE vdouble3 vd3setxyz_vd3_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return svcreate3_f64(x, y, z); } static INLINE vdouble3 vd3setx_vd3_vd3_vd(vdouble3 v, vdouble d) { return svset3_f64(v, 0, d); } static INLINE vdouble3 vd3sety_vd3_vd3_vd(vdouble3 v, vdouble d) { return svset3_f64(v, 1, d); } static INLINE vdouble3 vd3setz_vd3_vd3_vd(vdouble3 v, vdouble d) { return svset3_f64(v, 2, d); } // typedef svfloat64x4_t tdx; static INLINE vmask tdxgete_vm_tdx(tdx t) { return svreinterpret_s32_f64(svget4_f64(t, 0)); } static INLINE vdouble3 tdxgetd3_vd3_tdx(tdx t) { return svcreate3_f64(svget4_f64(t, 1), svget4_f64(t, 2), svget4_f64(t, 3)); } static INLINE vdouble tdxgetd3x_vd_tdx(tdx t) { return svget4_f64(t, 1); } static INLINE vdouble tdxgetd3y_vd_tdx(tdx t) { return svget4_f64(t, 2); } static INLINE vdouble tdxgetd3z_vd_tdx(tdx t) { return svget4_f64(t, 3); } static INLINE tdx 
tdxsete_tdx_tdx_vm(tdx t, vmask e) { return svset4_f64(t, 0, svreinterpret_f64_s32(e)); } static INLINE tdx tdxsetd3_tdx_tdx_vd3(tdx t, vdouble3 d3) { return svcreate4_f64(svget4_f64(t, 0), svget3_f64(d3, 0), svget3_f64(d3, 1), svget3_f64(d3, 2)); } static INLINE tdx tdxsetx_tdx_tdx_vd(tdx t, vdouble x) { return svset4_f64(t, 1, x); } static INLINE tdx tdxsety_tdx_tdx_vd(tdx t, vdouble y) { return svset4_f64(t, 2, y); } static INLINE tdx tdxsetz_tdx_tdx_vd(tdx t, vdouble z) { return svset4_f64(t, 3, z); } static INLINE tdx tdxsetxyz_tdx_tdx_vd_vd_vd(tdx t, vdouble x, vdouble y, vdouble z) { return svcreate4_f64(svget4_f64(t, 0), x, y, z); } static INLINE tdx tdxseted3_tdx_vm_vd3(vmask e, vdouble3 d3) { return svcreate4_f64(svreinterpret_f64_s32(e), svget3_f64(d3, 0), svget3_f64(d3, 1), svget3_f64(d3, 2)); } static INLINE tdx tdxsetexyz_tdx_vm_vd_vd_vd(vmask e, vdouble x, vdouble y, vdouble z) { return svcreate4_f64(svreinterpret_f64_s32(e), x, y, z); } // typedef svfloat64x4_t tdi_t; static INLINE vdouble3 tdigettd_vd3_tdi(tdi_t d) { return svcreate3_f64(svget4_f64(d, 0), svget4_f64(d, 1), svget4_f64(d, 2)); } static INLINE vdouble tdigetx_vd_tdi(tdi_t d) { return svget4_f64(d, 0); } static INLINE vint tdigeti_vi_tdi(tdi_t d) { return svreinterpret_s32_f64(svget4_f64(d, 3)); } static INLINE tdi_t tdisettdi_tdi_vd3_vi(vdouble3 v, vint i) { return svcreate4_f64(svget3_f64(v, 0), svget3_f64(v, 1), svget3_f64(v, 2), svreinterpret_f64_s32(i)); } static INLINE tdi_t tdisettd_tdi_tdi_vd3(tdi_t tdi, vdouble3 v) { return svcreate4_f64(svget3_f64(v, 0), svget3_f64(v, 1), svget3_f64(v, 2), svget4_f64(tdi, 3)); } // // masking predicates #define ALL_TRUE_MASK svdup_n_s32(0xffffffff) #define ALL_FALSE_MASK svdup_n_s32(0x0) //@#define ALL_TRUE_MASK svdup_n_s32(0xffffffff) //@#define ALL_FALSE_MASK svdup_n_s32(0x0) static INLINE void vprefetch_v_p(const void *ptr) {} // // // // Test if all lanes are active // // // static INLINE int vtestallones_i_vo32(vopmask g) { svbool_t pg = 
svptrue_b32(); return (svcntp_b32(pg, g) == svcntw()); } static INLINE int vtestallones_i_vo64(vopmask g) { svbool_t pg = svptrue_b64(); return (svcntp_b64(pg, g) == svcntd()); } // // // // // // // Vector load / store static INLINE void vstoreu_v_p_vi2(int32_t *p, vint2 v) { svst1_s32(ptrue, p, v); } static INLINE vfloat vload_vf_p(const float *ptr) { return svld1_f32(ptrue, ptr); } static INLINE vfloat vloadu_vf_p(const float *ptr) { return svld1_f32(ptrue, ptr); } static INLINE void vstoreu_v_p_vf(float *ptr, vfloat v) { svst1_f32(ptrue, ptr, v); } // Basic logical operations for mask static INLINE vmask vand_vm_vm_vm(vmask x, vmask y) { return svand_s32_x(ptrue, x, y); } static INLINE vmask vandnot_vm_vm_vm(vmask x, vmask y) { return svbic_s32_x(ptrue, y, x); } static INLINE vmask vor_vm_vm_vm(vmask x, vmask y) { return svorr_s32_x(ptrue, x, y); } static INLINE vmask vxor_vm_vm_vm(vmask x, vmask y) { return sveor_s32_x(ptrue, x, y); } static INLINE vmask vadd64_vm_vm_vm(vmask x, vmask y) { return svreinterpret_s32_s64( svadd_s64_x(ptrue, svreinterpret_s64_s32(x), svreinterpret_s64_s32(y))); } // Mask <--> single precision reinterpret static INLINE vmask vreinterpret_vm_vf(vfloat vf) { return svreinterpret_s32_f32(vf); } static INLINE vfloat vreinterpret_vf_vm(vmask vm) { return svreinterpret_f32_s32(vm); } static INLINE vfloat vreinterpret_vf_vi2(vint2 vm) { return svreinterpret_f32_s32(vm); } static INLINE vint2 vreinterpret_vi2_vf(vfloat vf) { return svreinterpret_s32_f32(vf); } static INLINE vint2 vcast_vi2_vm(vmask vm) { return vm; } static INLINE vmask vcast_vm_vi2(vint2 vi) { return vi; } // Conditional select static INLINE vint2 vsel_vi2_vm_vi2_vi2(vmask m, vint2 x, vint2 y) { return svsel_s32(svcmpeq_s32(ptrue, m, ALL_TRUE_MASK), x, y); } /****************************************/ /* Single precision FP operations */ /****************************************/ // Broadcast static INLINE vfloat vcast_vf_f(float f) { return svdup_n_f32(f); } // Add, Sub, 
Mul static INLINE vfloat vadd_vf_vf_vf(vfloat x, vfloat y) { return svadd_f32_x(ptrue, x, y); } static INLINE vfloat vsub_vf_vf_vf(vfloat x, vfloat y) { return svsub_f32_x(ptrue, x, y); } static INLINE vfloat vmul_vf_vf_vf(vfloat x, vfloat y) { return svmul_f32_x(ptrue, x, y); } // |x|, -x static INLINE vfloat vabs_vf_vf(vfloat f) { return svabs_f32_x(ptrue, f); } static INLINE vfloat vneg_vf_vf(vfloat f) { return svneg_f32_x(ptrue, f); } // max, min static INLINE vfloat vmax_vf_vf_vf(vfloat x, vfloat y) { return svmax_f32_x(ptrue, x, y); } static INLINE vfloat vmin_vf_vf_vf(vfloat x, vfloat y) { return svmin_f32_x(ptrue, x, y); } // int <--> float conversions static INLINE vint2 vtruncate_vi2_vf(vfloat vf) { return svcvt_s32_f32_x(ptrue, vf); } static INLINE vfloat vcast_vf_vi2(vint2 vi) { return svcvt_f32_s32_x(ptrue, vi); } static INLINE vint2 vcast_vi2_i(int i) { return svdup_n_s32(i); } static INLINE vint2 vrint_vi2_vf(vfloat d) { return svcvt_s32_f32_x(ptrue, svrintn_f32_x(ptrue, d)); } #if CONFIG == 1 // Multiply accumulate: z = z + x * y static INLINE vfloat vmla_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return svmad_f32_x(ptrue, x, y, z); } // Multiply subtract: z = z - x * y static INLINE vfloat vmlanp_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return svmsb_f32_x(ptrue, x, y, z); } static INLINE vfloat vmlapn_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return svnmsb_f32_x(ptrue, x, y, z); } #else static INLINE vfloat vmla_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return vadd_vf_vf_vf(vmul_vf_vf_vf(x, y), z); } static INLINE vfloat vmlanp_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return vsub_vf_vf_vf(z, vmul_vf_vf_vf(x, y)); } static INLINE vfloat vmlapn_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return vsub_vf_vf_vf(vmul_vf_vf_vf(x, y), z); } #endif // fused multiply add / sub static INLINE vfloat vfma_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { // z + x * y return svmad_f32_x(ptrue, x, y, z); } static INLINE vfloat vfmanp_vf_vf_vf_vf(vfloat x, 
vfloat y, vfloat z) { // z - x * y return svmsb_f32_x(ptrue, x, y, z); } static INLINE vfloat vfmapn_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { // x * y - z return svnmsb_f32_x(ptrue, x, y, z); } // conditional select static INLINE vfloat vsel_vf_vo_vf_vf(vopmask mask, vfloat x, vfloat y) { return svsel_f32(mask, x, y); } // Reciprocal 1/x, Division, Square root static INLINE vfloat vdiv_vf_vf_vf(vfloat n, vfloat d) { #ifndef ENABLE_ALTDIV return svdiv_f32_x(ptrue, n, d); #else // Finite numbers (including denormal) only, gives mostly correctly rounded result vfloat t, u, x, y; svuint32_t i0, i1; i0 = svand_u32_x(ptrue, svreinterpret_u32_f32(n), svdup_n_u32(0x7c000000)); i1 = svand_u32_x(ptrue, svreinterpret_u32_f32(d), svdup_n_u32(0x7c000000)); i0 = svsub_u32_x(ptrue, svdup_n_u32(0x7d000000), svlsr_n_u32_x(ptrue, svadd_u32_x(ptrue, i0, i1), 1)); t = svreinterpret_f32_u32(i0); y = svmul_f32_x(ptrue, d, t); x = svmul_f32_x(ptrue, n, t); t = svrecpe_f32(y); t = svmul_f32_x(ptrue, t, svrecps_f32(y, t)); t = svmul_f32_x(ptrue, t, svrecps_f32(y, t)); u = svmul_f32_x(ptrue, x, t); u = svmad_f32_x(ptrue, svmsb_f32_x(ptrue, y, u, x), t, u); return u; #endif } static INLINE vfloat vrec_vf_vf(vfloat d) { #ifndef ENABLE_ALTDIV return svdivr_n_f32_x(ptrue, d, 1.0f); #else return vsel_vf_vo_vf_vf(svcmpeq_f32(ptrue, vabs_vf_vf(d), vcast_vf_f(SLEEF_INFINITYf)), vcast_vf_f(0), vdiv_vf_vf_vf(vcast_vf_f(1.0f), d)); #endif } static INLINE vfloat vsqrt_vf_vf(vfloat d) { #ifndef ENABLE_ALTSQRT return svsqrt_f32_x(ptrue, d); #else // Gives correctly rounded result for all input range vfloat w, x, y, z; y = svrsqrte_f32(d); x = vmul_vf_vf_vf(d, y); w = vmul_vf_vf_vf(vcast_vf_f(0.5), y); y = vfmanp_vf_vf_vf_vf(x, w, vcast_vf_f(0.5)); x = vfma_vf_vf_vf_vf(x, y, x); w = vfma_vf_vf_vf_vf(w, y, w); y = vfmanp_vf_vf_vf_vf(x, w, vcast_vf_f(1.5)); w = vadd_vf_vf_vf(w, w); w = vmul_vf_vf_vf(w, y); x = vmul_vf_vf_vf(w, d); y = vfmapn_vf_vf_vf_vf(w, d, x); z = vfmanp_vf_vf_vf_vf(w, x, 
vcast_vf_f(1)); z = vfmanp_vf_vf_vf_vf(w, y, z); w = vmul_vf_vf_vf(vcast_vf_f(0.5), x); w = vfma_vf_vf_vf_vf(w, z, y); w = vadd_vf_vf_vf(w, x); return svsel_f32(svorr_b_z(ptrue, svcmpeq_f32(ptrue, d, vcast_vf_f(0)), svcmpeq_f32(ptrue, d, vcast_vf_f(SLEEF_INFINITYf))), d, w); #endif } // // // // // // static INLINE CONST vfloat vsel_vf_vo_f_f(vopmask o, float v1, float v0) { return vsel_vf_vo_vf_vf(o, vcast_vf_f(v1), vcast_vf_f(v0)); } static INLINE vfloat vsel_vf_vo_vo_f_f_f(vopmask o0, vopmask o1, float d0, float d1, float d2) { return vsel_vf_vo_vf_vf(o0, vcast_vf_f(d0), vsel_vf_vo_f_f(o1, d1, d2)); } static INLINE vfloat vsel_vf_vo_vo_vo_f_f_f_f(vopmask o0, vopmask o1, vopmask o2, float d0, float d1, float d2, float d3) { return vsel_vf_vo_vf_vf(o0, vcast_vf_f(d0), vsel_vf_vo_vf_vf(o1, vcast_vf_f(d1), vsel_vf_vo_f_f(o2, d2, d3))); } // // // // // // // truncate static INLINE vfloat vtruncate_vf_vf(vfloat vd) { return svrintz_f32_x(ptrue, vd); } // // // // Round float // // // static INLINE vfloat vrint_vf_vf(vfloat vf) { return svrintn_f32_x(svptrue_b32(), vf); } // // // // // // /***************************************/ /* Single precision integer operations */ /***************************************/ // Add, Sub, Neg (-x) static INLINE vint2 vadd_vi2_vi2_vi2(vint2 x, vint2 y) { return svadd_s32_x(ptrue, x, y); } static INLINE vint2 vsub_vi2_vi2_vi2(vint2 x, vint2 y) { return svsub_s32_x(ptrue, x, y); } static INLINE vint2 vneg_vi2_vi2(vint2 e) { return svneg_s32_x(ptrue, e); } // Logical operations static INLINE vint2 vand_vi2_vi2_vi2(vint2 x, vint2 y) { return svand_s32_x(ptrue, x, y); } static INLINE vint2 vandnot_vi2_vi2_vi2(vint2 x, vint2 y) { return svbic_s32_x(ptrue, y, x); } static INLINE vint2 vor_vi2_vi2_vi2(vint2 x, vint2 y) { return svorr_s32_x(ptrue, x, y); } static INLINE vint2 vxor_vi2_vi2_vi2(vint2 x, vint2 y) { return sveor_s32_x(ptrue, x, y); } // Shifts #define vsll_vi2_vi2_i(x, c) svlsl_n_s32_x(ptrue, x, c) //@#define vsll_vi2_vi2_i(x, 
c) svlsl_n_s32_x(ptrue, x, c) #define vsrl_vi2_vi2_i(x, c) \ svreinterpret_s32_u32(svlsr_n_u32_x(ptrue, svreinterpret_u32_s32(x), c)) //@#define vsrl_vi2_vi2_i(x, c) svreinterpret_s32_u32(svlsr_n_u32_x(ptrue, svreinterpret_u32_s32(x), c)) #define vsra_vi2_vi2_i(x, c) svasr_n_s32_x(ptrue, x, c) //@#define vsra_vi2_vi2_i(x, c) svasr_n_s32_x(ptrue, x, c) // Comparison returning integers static INLINE vint2 vgt_vi2_vi2_vi2(vint2 x, vint2 y) { return svsel_s32(svcmpgt_s32(ptrue, x, y), ALL_TRUE_MASK, ALL_FALSE_MASK); } // conditional select static INLINE vint2 vsel_vi2_vo_vi2_vi2(vopmask m, vint2 x, vint2 y) { return svsel_s32(m, x, y); } /****************************************/ /* opmask operations */ /****************************************/ // single precision FP static INLINE vopmask veq_vo_vf_vf(vfloat x, vfloat y) { return svcmpeq_f32(ptrue, x, y); } static INLINE vopmask vneq_vo_vf_vf(vfloat x, vfloat y) { return svcmpne_f32(ptrue, x, y); } static INLINE vopmask vlt_vo_vf_vf(vfloat x, vfloat y) { return svcmplt_f32(ptrue, x, y); } static INLINE vopmask vle_vo_vf_vf(vfloat x, vfloat y) { return svcmple_f32(ptrue, x, y); } static INLINE vopmask vgt_vo_vf_vf(vfloat x, vfloat y) { return svcmpgt_f32(ptrue, x, y); } static INLINE vopmask vge_vo_vf_vf(vfloat x, vfloat y) { return svcmpge_f32(ptrue, x, y); } static INLINE vopmask visinf_vo_vf(vfloat d) { return svcmpeq_n_f32(ptrue, vabs_vf_vf(d), SLEEF_INFINITYf); } static INLINE vopmask vispinf_vo_vf(vfloat d) { return svcmpeq_n_f32(ptrue, d, SLEEF_INFINITYf); } static INLINE vopmask visminf_vo_vf(vfloat d) { return svcmpeq_n_f32(ptrue, d, -SLEEF_INFINITYf); } static INLINE vopmask visnan_vo_vf(vfloat d) { return vneq_vo_vf_vf(d, d); } // integers static INLINE vopmask veq_vo_vi2_vi2(vint2 x, vint2 y) { return svcmpeq_s32(ptrue, x, y); } static INLINE vopmask vgt_vo_vi2_vi2(vint2 x, vint2 y) { return svcmpgt_s32(ptrue, x, y); } // logical opmask static INLINE vopmask vand_vo_vo_vo(vopmask x, vopmask y) { return 
svand_b_z(ptrue, x, y); } static INLINE vopmask vandnot_vo_vo_vo(vopmask x, vopmask y) { return svbic_b_z(ptrue, y, x); } static INLINE vopmask vor_vo_vo_vo(vopmask x, vopmask y) { return svorr_b_z(ptrue, x, y); } static INLINE vopmask vxor_vo_vo_vo(vopmask x, vopmask y) { return sveor_b_z(ptrue, x, y); } static INLINE vint2 vand_vi2_vo_vi2(vopmask x, vint2 y) { // This needs to be zeroing to prevent asinf and atanf denormal test // failing. return svand_s32_z(x, y, y); } // bitmask logical operations static INLINE vmask vand_vm_vo32_vm(vopmask x, vmask y) { return svsel_s32(x, y, ALL_FALSE_MASK); } static INLINE vmask vandnot_vm_vo32_vm(vopmask x, vmask y) { return svsel_s32(x, ALL_FALSE_MASK, y); } static INLINE vmask vor_vm_vo32_vm(vopmask x, vmask y) { return svsel_s32(x, ALL_TRUE_MASK, y); } // broadcast bitmask static INLINE vmask vcast_vm_i_i(int i0, int i1) { return svreinterpret_s32_u64( svdup_n_u64((0xffffffff & (uint64_t)i1) | (((uint64_t)i0) << 32))); } /*********************************/ /* SVE for double precision math */ /*********************************/ // Vector load/store static INLINE vdouble vload_vd_p(const double *ptr) { return svld1_f64(ptrue, ptr); } static INLINE vdouble vloadu_vd_p(const double *ptr) { return svld1_f64(ptrue, ptr); } static INLINE void vstoreu_v_p_vd(double *ptr, vdouble v) { svst1_f64(ptrue, ptr, v); } static INLINE void vstoreu_v_p_vi(int *ptr, vint v) { svst1w_s64(ptrue, ptr, svreinterpret_s64_s32(v)); } static vint vloadu_vi_p(int32_t *p) { return svreinterpret_s32_s64(svld1uw_s64(ptrue, (uint32_t *)p)); } // Reinterpret static INLINE vdouble vreinterpret_vd_vm(vmask vm) { return svreinterpret_f64_s32(vm); } static INLINE vmask vreinterpret_vm_vd(vdouble vd) { return svreinterpret_s32_f64(vd); } static INLINE vdouble vreinterpret_vd_vi2(vint2 x) { return svreinterpret_f64_s32(x); } static INLINE vint2 vreinterpret_vi2_vd(vdouble x) { return svreinterpret_s32_f64(x); } static INLINE vint2 vcastu_vi2_vi(vint x) { 
// Completion of vcastu_vi2_vi: shift 32-bit payload into the upper half
// of each 64-bit lane.
return svreinterpret_s32_s64(
    svlsl_n_s64_x(ptrue, svreinterpret_s64_s32(x), 32)); }

// Inverse of vcastu_vi2_vi: extract the upper 32 bits of each 64-bit lane.
static INLINE vint vcastu_vi_vi2(vint2 x) {
  return svreinterpret_s32_u64(
      svlsr_n_u64_x(ptrue, svreinterpret_u64_s32(x), 32));
}

static INLINE vdouble vcast_vd_vi(vint vi) {
  return svcvt_f64_s32_x(ptrue, vi);
}

// Splat
static INLINE vdouble vcast_vd_d(double d) { return svdup_n_f64(d); }

// Conditional select
static INLINE vdouble vsel_vd_vo_vd_vd(vopmask o, vdouble x, vdouble y) {
  return svsel_f64(o, x, y);
}

static INLINE CONST vdouble vsel_vd_vo_d_d(vopmask o, double v1, double v0) {
  return vsel_vd_vo_vd_vd(o, vcast_vd_d(v1), vcast_vd_d(v0));
}

// Two- and three-predicate cascaded selects over scalar constants.
static INLINE vdouble vsel_vd_vo_vo_d_d_d(vopmask o0, vopmask o1, double d0,
                                          double d1, double d2) {
  return vsel_vd_vo_vd_vd(o0, vcast_vd_d(d0), vsel_vd_vo_d_d(o1, d1, d2));
}

static INLINE vdouble vsel_vd_vo_vo_vo_d_d_d_d(vopmask o0, vopmask o1,
                                               vopmask o2, double d0,
                                               double d1, double d2,
                                               double d3) {
  return vsel_vd_vo_vd_vd(o0, vcast_vd_d(d0),
                          vsel_vd_vo_vd_vd(o1, vcast_vd_d(d1),
                                           vsel_vd_vo_d_d(o2, d2, d3)));
}

static INLINE vint vsel_vi_vo_vi_vi(vopmask o, vint x, vint y) {
  return svsel_s32(o, x, y);
}

// truncate
// svrintz = round toward zero, svrintn = round to nearest (ties to even).
static INLINE vdouble vtruncate_vd_vd(vdouble vd) {
  return svrintz_f64_x(ptrue, vd);
}
static INLINE vint vtruncate_vi_vd(vdouble vd) {
  return svcvt_s32_f64_x(ptrue, vd);
}
static INLINE vint vrint_vi_vd(vdouble vd) {
  return svcvt_s32_f64_x(ptrue, svrintn_f64_x(ptrue, vd));
}
static INLINE vdouble vrint_vd_vd(vdouble vd) {
  return svrintn_f64_x(ptrue, vd);
}

// FP math operations
static INLINE vdouble vadd_vd_vd_vd(vdouble x, vdouble y) {
  return svadd_f64_x(ptrue, x, y);
}
static INLINE vdouble vsub_vd_vd_vd(vdouble x, vdouble y) {
  return svsub_f64_x(ptrue, x, y);
}
static INLINE vdouble vneg_vd_vd(vdouble x) { return svneg_f64_x(ptrue, x); }
static INLINE vdouble vmul_vd_vd_vd(vdouble x, vdouble y) {
  return svmul_f64_x(ptrue, x, y);
}
static INLINE vdouble vabs_vd_vd(vdouble x) { return svabs_f64_x(ptrue, x); }
// (declaration continues on the next line of the excerpt)
static
// Completion of the declaration begun on the previous line.
INLINE vdouble vmax_vd_vd_vd(vdouble x, vdouble y) {
  return svmax_f64_x(ptrue, x, y);
}
static INLINE vdouble vmin_vd_vd_vd(vdouble x, vdouble y) {
  return svmin_f64_x(ptrue, x, y);
}

#if CONFIG == 1
// Multiply accumulate / subtract
// With CONFIG == 1 the fused SVE instructions are used directly.
static INLINE vdouble vmla_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) {
  // z = x*y + z
  return svmad_f64_x(ptrue, x, y, z);
}

static INLINE vdouble vmlapn_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) {
  // z = x * y - z
  return svnmsb_f64_x(ptrue, x, y, z);
}

static INLINE vdouble vmlanp_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) {
  return svmsb_f64_x(ptrue, x, y, z);
}
#else
// Non-fused fallback: separate multiply then add/subtract.
static INLINE vdouble vmla_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) {
  return vadd_vd_vd_vd(vmul_vd_vd_vd(x, y), z);
}
static INLINE vdouble vmlapn_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) {
  return vsub_vd_vd_vd(vmul_vd_vd_vd(x, y), z);
}
#endif

// True fused operations, always available regardless of CONFIG.
static INLINE vdouble vfma_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) {
  // z + x * y
  return svmad_f64_x(ptrue, x, y, z);
}

static INLINE vdouble vfmanp_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) {
  // z - x * y
  return svmsb_f64_x(ptrue, x, y, z);
}

static INLINE vdouble vfmapn_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) {
  // x * y - z
  return svnmsb_f64_x(ptrue, x, y, z);
}

// Reciprocal 1/x, Division, Square root
static INLINE vdouble vdiv_vd_vd_vd(vdouble n, vdouble d) {
#ifndef ENABLE_ALTDIV
  return svdiv_f64_x(ptrue, n, d);
#else
  // Finite numbers (including denormal) only, gives mostly correctly rounded result
  vdouble t, u, x, y;
  svuint64_t i0, i1;
  // Pre-scale numerator and denominator by a shared power-of-two-ish factor
  // derived from their exponent fields to avoid overflow/underflow, then
  // refine a reciprocal estimate (svrecpe) with svrecps steps.
  i0 = svand_u64_x(ptrue, svreinterpret_u64_f64(n),
                   svdup_n_u64(0x7fc0000000000000L));
  i1 = svand_u64_x(ptrue, svreinterpret_u64_f64(d),
                   svdup_n_u64(0x7fc0000000000000L));
  i0 = svsub_u64_x(ptrue, svdup_n_u64(0x7fd0000000000000L),
                   svlsr_n_u64_x(ptrue, svadd_u64_x(ptrue, i0, i1), 1));
  t = svreinterpret_f64_u64(i0);
  y = svmul_f64_x(ptrue, d, t);
  x = svmul_f64_x(ptrue, n, t);
  t = svrecpe_f64(y);
  t = svmul_f64_x(ptrue, t, svrecps_f64(y, t));
  // (refinement continues on the next line of the excerpt)
  t = svmul_f64_x(ptrue, t,
// Completion of the ENABLE_ALTDIV path begun on the previous line: one more
// reciprocal refinement, then a final FMA correction of the quotient.
svrecps_f64(y, t)); t = svmul_f64_x(ptrue, t, svrecps_f64(y, t));
  u = svmul_f64_x(ptrue, x, t);
  u = svmad_f64_x(ptrue, svmsb_f64_x(ptrue, y, u, x), t, u);
  return u;
#endif
}

static INLINE vdouble vrec_vd_vd(vdouble d) {
#ifndef ENABLE_ALTDIV
  return svdivr_n_f64_x(ptrue, d, 1.0);
#else
  // 1/±inf is forced to 0; everything else goes through the ALTDIV division.
  return vsel_vd_vo_vd_vd(
      svcmpeq_f64(ptrue, vabs_vd_vd(d), vcast_vd_d(SLEEF_INFINITY)),
      vcast_vd_d(0), vdiv_vd_vd_vd(vcast_vd_d(1.0f), d));
#endif
}

static INLINE vdouble vsqrt_vd_vd(vdouble d) {
#ifndef ENABLE_ALTSQRT
  return svsqrt_f64_x(ptrue, d);
#else
  // Gives correctly rounded result for all input range
  // Refines an initial rsqrt estimate (svrsqrte) with Newton steps on both
  // x ~ sqrt(d) and w ~ 1/(2*sqrt(d)), then applies a final correction.
  vdouble w, x, y, z;
  y = svrsqrte_f64(d);
  x = vmul_vd_vd_vd(d, y);
  w = vmul_vd_vd_vd(vcast_vd_d(0.5), y);
  y = vfmanp_vd_vd_vd_vd(x, w, vcast_vd_d(0.5));
  x = vfma_vd_vd_vd_vd(x, y, x);
  w = vfma_vd_vd_vd_vd(w, y, w);
  y = vfmanp_vd_vd_vd_vd(x, w, vcast_vd_d(0.5));
  x = vfma_vd_vd_vd_vd(x, y, x);
  w = vfma_vd_vd_vd_vd(w, y, w);
  y = vfmanp_vd_vd_vd_vd(x, w, vcast_vd_d(1.5));
  w = vadd_vd_vd_vd(w, w);
  w = vmul_vd_vd_vd(w, y);
  x = vmul_vd_vd_vd(w, d);
  y = vfmapn_vd_vd_vd_vd(w, d, x);
  z = vfmanp_vd_vd_vd_vd(w, x, vcast_vd_d(1));
  z = vfmanp_vd_vd_vd_vd(w, y, z);
  w = vmul_vd_vd_vd(vcast_vd_d(0.5), x);
  w = vfma_vd_vd_vd_vd(w, z, y);
  w = vadd_vd_vd_vd(w, x);
  // sqrt(0) and sqrt(inf) pass the input through unchanged.
  return svsel_f64(svorr_b_z(ptrue, svcmpeq_f64(ptrue, d, vcast_vd_d(0)),
                             svcmpeq_f64(ptrue, d,
                                         vcast_vd_d(SLEEF_INFINITY))),
                   d, w);
#endif
}

// Float comparison
static INLINE vopmask vlt_vo_vd_vd(vdouble x, vdouble y) {
  return svcmplt_f64(ptrue, x, y);
}
static INLINE vopmask veq_vo_vd_vd(vdouble x, vdouble y) {
  return svcmpeq_f64(ptrue, x, y);
}
static INLINE vopmask vgt_vo_vd_vd(vdouble x, vdouble y) {
  return svcmpgt_f64(ptrue, x, y);
}
static INLINE vopmask vge_vo_vd_vd(vdouble x, vdouble y) {
  return svcmpge_f64(ptrue, x, y);
}
static INLINE vopmask vneq_vo_vd_vd(vdouble x, vdouble y) {
  return svcmpne_f64(ptrue, x, y);
}
static INLINE vopmask vle_vo_vd_vd(vdouble x, vdouble y) {
  return svcmple_f64(ptrue, x, y);
}

// predicates
// (declaration continues on the next line of the excerpt)
static INLINE
// Completion of the declaration begun on the previous line.
// NaN is the only value for which vd != vd.
vopmask visnan_vo_vd(vdouble vd) { return svcmpne_f64(ptrue, vd, vd); }
static INLINE vopmask visinf_vo_vd(vdouble vd) {
  return svcmpeq_n_f64(ptrue, svabs_f64_x(ptrue, vd), SLEEF_INFINITY);
}
static INLINE vopmask vispinf_vo_vd(vdouble vd) {
  return svcmpeq_n_f64(ptrue, vd, SLEEF_INFINITY);
}
static INLINE vopmask visminf_vo_vd(vdouble vd) {
  return svcmpeq_n_f64(ptrue, vd, -SLEEF_INFINITY);
}

// Comparing bit masks
// Equality of full 64-bit chunks (masks are stored as 32-bit lanes).
static INLINE vopmask veq64_vo_vm_vm(vmask x, vmask y) {
  return svcmpeq_s64(ptrue, svreinterpret_s64_s32(x),
                     svreinterpret_s64_s32(y));
}

// pure predicate operations
// SVE predicates are element-size agnostic here, so these casts are no-ops.
static INLINE vopmask vcast_vo32_vo64(vopmask o) { return o; }
static INLINE vopmask vcast_vo64_vo32(vopmask o) { return o; }

// logical integer operations
static INLINE vint vand_vi_vo_vi(vopmask x, vint y) {
  // This needs to be a zeroing instruction because we need to make
  // sure that the inactive elements for the unpacked integers vector
  // are zero.
  return svand_s32_z(x, y, y);
}

static INLINE vint vandnot_vi_vo_vi(vopmask x, vint y) {
  return svsel_s32(x, ALL_FALSE_MASK, y);
}

#define vsra_vi_vi_i(x, c) svasr_n_s32_x(ptrue, x, c)
//@#define vsra_vi_vi_i(x, c) svasr_n_s32_x(ptrue, x, c)
#define vsll_vi_vi_i(x, c) svlsl_n_s32_x(ptrue, x, c)
//@#define vsll_vi_vi_i(x, c) svlsl_n_s32_x(ptrue, x, c)

// Logical right shift goes through u32 since svlsr is unsigned-only.
static INLINE vint vsrl_vi_vi_i(vint x, int c) {
  return svreinterpret_s32_u32(
      svlsr_n_u32_x(ptrue, svreinterpret_u32_s32(x), c));
}

static INLINE vint vand_vi_vi_vi(vint x, vint y) {
  return svand_s32_x(ptrue, x, y);
}
// AND-NOT: y & ~x (note svbic's operand order).
static INLINE vint vandnot_vi_vi_vi(vint x, vint y) {
  return svbic_s32_x(ptrue, y, x);
}
static INLINE vint vxor_vi_vi_vi(vint x, vint y) {
  return sveor_s32_x(ptrue, x, y);
}

// integer math
static INLINE vint vadd_vi_vi_vi(vint x, vint y) {
  return svadd_s32_x(ptrue, x, y);
}
static INLINE vint vsub_vi_vi_vi(vint x, vint y) {
  return svsub_s32_x(ptrue, x, y);
}
static INLINE vint vneg_vi_vi(vint x) { return svneg_s32_x(ptrue, x); }

// integer comparison
// (declaration continues on the next line of the excerpt)
static INLINE vopmask
// Completion of the declaration begun on the previous line.
vgt_vo_vi_vi(vint x, vint y) { return svcmpgt_s32(ptrue, x, y); }
static INLINE vopmask veq_vo_vi_vi(vint x, vint y) {
  return svcmpeq_s32(ptrue, x, y);
}

// Splat
static INLINE vint vcast_vi_i(int i) { return svdup_n_s32(i); }

// bitmask logical operations
static INLINE vmask vand_vm_vo64_vm(vopmask x, vmask y) {
  // This needs to be a zeroing instruction because we need to make
  // sure that the inactive elements for the unpacked integers vector
  // are zero.
  return svreinterpret_s32_s64(
      svand_s64_z(x, svreinterpret_s64_s32(y), svreinterpret_s64_s32(y)));
}

static INLINE vmask vandnot_vm_vo64_vm(vopmask x, vmask y) {
  return svreinterpret_s32_s64(svsel_s64(
      x, svreinterpret_s64_s32(ALL_FALSE_MASK), svreinterpret_s64_s32(y)));
}

static INLINE vmask vor_vm_vo64_vm(vopmask x, vmask y) {
  return svreinterpret_s32_s64(svsel_s64(
      x, svreinterpret_s64_s32(ALL_TRUE_MASK), svreinterpret_s64_s32(y)));
}

// Swap the two 32-bit halves of every 64-bit chunk (svrevw on u64 view).
static INLINE vfloat vrev21_vf_vf(vfloat vf) {
  return svreinterpret_f32_u64(svrevw_u64_x(ptrue, svreinterpret_u64_f32(vf)));
}

static INLINE vint2 vrev21_vi2_vi2(vint2 i) {
  return vreinterpret_vi2_vf(vrev21_vf_vf(vreinterpret_vf_vi2(i)));
}

// Comparison returning integer
static INLINE vint2 veq_vi2_vi2_vi2(vint2 x, vint2 y) {
  return svsel_s32(svcmpeq_s32(ptrue, x, y), ALL_TRUE_MASK, ALL_FALSE_MASK);
}

// Gather
static INLINE vdouble vgather_vd_p_vi(const double *ptr, vint vi) {
  return svld1_gather_s64index_f64(ptrue, ptr, svreinterpret_s64_s32(vi));
}

static INLINE vfloat vgather_vf_p_vi2(const float *ptr, vint2 vi2) {
  return svld1_gather_s32index_f32(ptrue, ptr, vi2);
}

// Operations for DFT
// Merging negation over a repeating 0/1 predicate pattern: vposneg negates
// odd-indexed lanes, vnegpos negates even-indexed lanes.
static INLINE vdouble vposneg_vd_vd(vdouble d) {
  return svneg_f64_m(d, svdupq_n_b64(0, 1), d);
}
static INLINE vdouble vnegpos_vd_vd(vdouble d) {
  return svneg_f64_m(d, svdupq_n_b64(1, 0), d);
}
static INLINE vfloat vposneg_vf_vf(vfloat d) {
  return svneg_f32_m(d, svdupq_n_b32(0, 1, 0, 1), d);
}
// (call continues on the next line of the excerpt)
static INLINE vfloat vnegpos_vf_vf(vfloat d) {
  return svneg_f32_m(d, svdupq_n_b32(1, 0, 1, 0),
// Completion of vnegpos_vf_vf begun on the previous line.
d); }

// Alternating subtract/add built from the negpos sign-flip helpers.
static INLINE vdouble vsubadd_vd_vd_vd(vdouble x, vdouble y) {
  return vadd_vd_vd_vd(x, vnegpos_vd_vd(y));
}
static INLINE vfloat vsubadd_vf_vf_vf(vfloat d0, vfloat d1) {
  return vadd_vf_vf_vf(d0, vnegpos_vf_vf(d1));
}
static INLINE vdouble vmlsubadd_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) {
  return vfma_vd_vd_vd_vd(x, y, vnegpos_vd_vd(z));
}
static INLINE vfloat vmlsubadd_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) {
  return vfma_vf_vf_vf_vf(x, y, vnegpos_vf_vf(z));
}

//
// Swap each pair of adjacent 64-bit lanes.
static INLINE vdouble vrev21_vd_vd(vdouble x) {
  return svzip1_f64(svuzp2_f64(x, x), svuzp1_f64(x, x));
}

// Reverse the vector pair-wise via a descending index table (svtbl).
static INLINE vdouble vreva2_vd_vd(vdouble vd) {
  svint64_t x = svindex_s64((VECTLENDP-1), -1);
  x = svzip1_s64(svuzp2_s64(x, x), svuzp1_s64(x, x));
  return svtbl_f64(vd, svreinterpret_u64_s64(x));
}

static INLINE vfloat vreva2_vf_vf(vfloat vf) {
  svint32_t x = svindex_s32((VECTLENSP-1), -1);
  x = svzip1_s32(svuzp2_s32(x, x), svuzp1_s32(x, x));
  return svtbl_f32(vf, svreinterpret_u32_s32(x));
}

//
// Scatter pairs of lanes with a stride of step*2 elements starting at
// ptr + offset*2.
static INLINE void vscatter2_v_p_i_i_vd(double *ptr, int offset, int step,
                                        vdouble v) {
  svst1_scatter_u64index_f64(ptrue, ptr + offset*2,
                             svzip1_u64(svindex_u64(0, step*2),
                                        svindex_u64(1, step*2)), v);
}

static INLINE void vscatter2_v_p_i_i_vf(float *ptr, int offset, int step,
                                        vfloat v) {
  svst1_scatter_u32index_f32(ptrue, ptr + offset*2,
                             svzip1_u32(svindex_u32(0, step*2),
                                        svindex_u32(1, step*2)), v);
}

// Aligned/streaming stores alias the unaligned store on SVE.
static INLINE void vstore_v_p_vd(double *ptr, vdouble v) {
  vstoreu_v_p_vd(ptr, v);
}
static INLINE void vstream_v_p_vd(double *ptr, vdouble v) {
  vstore_v_p_vd(ptr, v);
}
static INLINE void vstore_v_p_vf(float *ptr, vfloat v) {
  vstoreu_v_p_vf(ptr, v);
}
static INLINE void vstream_v_p_vf(float *ptr, vfloat v) {
  vstore_v_p_vf(ptr, v);
}
static INLINE void vsscatter2_v_p_i_i_vd(double *ptr, int offset, int step,
                                         vdouble v) {
  vscatter2_v_p_i_i_vd(ptr, offset, step, v);
}
static INLINE void vsscatter2_v_p_i_i_vf(float *ptr, int offset, int step,
                                         vfloat v) {
  vscatter2_v_p_i_i_vf(ptr, offset, step, v);
}

//
These functions are for debugging
// Extract lane 0 by spilling the whole vector to a stack VLA.
static double vcast_d_vd(vdouble v) {
  double a[svcntd()];
  vstoreu_v_p_vd(a, v);
  return a[0];
}

static float vcast_f_vf(vfloat v) {
  float a[svcntw()];
  vstoreu_v_p_vf(a, v);
  return a[0];
}

static int vcast_i_vi(vint v) {
  int a[svcntw()];
  vstoreu_v_p_vi(a, v);
  return a[0];
}

static int vcast_i_vi2(vint2 v) {
  int a[svcntw()];
  vstoreu_v_p_vi2(a, v);
  return a[0];
}

//
// Pair-wise (de)shuffling of the two halves of a vmask2 via 64-bit
// transpose operations (svtrn1/svtrn2).
static INLINE vmask2 vinterleave_vm2_vm2(vmask2 v) {
  return vm2setxy_vm2_vm_vm(
      svreinterpret_s32_u64(
          svtrn1_u64(svreinterpret_u64_s32(vm2getx_vm_vm2(v)),
                     svreinterpret_u64_s32(vm2gety_vm_vm2(v)))),
      svreinterpret_s32_u64(
          svtrn2_u64(svreinterpret_u64_s32(vm2getx_vm_vm2(v)),
                     svreinterpret_u64_s32(vm2gety_vm_vm2(v)))));
}

static INLINE vmask2 vuninterleave_vm2_vm2(vmask2 v) {
  return vm2setxy_vm2_vm_vm(
      svreinterpret_s32_u64(
          svtrn1_u64(svreinterpret_u64_s32(vm2getx_vm_vm2(v)),
                     svreinterpret_u64_s32(vm2gety_vm_vm2(v)))),
      svreinterpret_s32_u64(
          svtrn2_u64(svreinterpret_u64_s32(vm2getx_vm_vm2(v)),
                     svreinterpret_u64_s32(vm2gety_vm_vm2(v)))));
}

static INLINE vint vuninterleave_vi_vi(vint v) {
  return svreinterpret_s32_u64(svuzp1_u64(
      svtrn1_u64(svreinterpret_u64_s32(v), svreinterpret_u64_s32(v)),
      svtrn2_u64(svreinterpret_u64_s32(v), svreinterpret_u64_s32(v))));
}

static INLINE vdouble vinterleave_vd_vd(vdouble vd) {
  return svtrn1_f64(svzip1_f64(vd, vd), svzip2_f64(vd, vd));
}
static INLINE vdouble vuninterleave_vd_vd(vdouble vd) {
  return svuzp1_f64(svtrn1_f64(vd, vd), svtrn2_f64(vd, vd));
}

static INLINE vmask vinterleave_vm_vm(vmask vm) {
  return svreinterpret_s32_u64(svtrn1_u64(
      svzip1_u64(svreinterpret_u64_s32(vm), svreinterpret_u64_s32(vm)),
      svzip2_u64(svreinterpret_u64_s32(vm), svreinterpret_u64_s32(vm))));
}

static INLINE vmask vuninterleave_vm_vm(vmask vm) {
  return svreinterpret_s32_u64(svuzp1_u64(
      svtrn1_u64(svreinterpret_u64_s32(vm), svreinterpret_u64_s32(vm)),
      svtrn2_u64(svreinterpret_u64_s32(vm), svreinterpret_u64_s32(vm))));
}

// Load a vmask2 from unaligned memory.
// (body continues on the next line of the excerpt)
static vmask2 vloadu_vm2_p(void *p) {
  vmask2 vm2;
// Completion of vloadu_vm2_p begun on the previous line.
  memcpy(&vm2, p, VECTLENDP * 16);
  return vm2;
}

#if !defined(SLEEF_GENHEADER)
typedef Sleef_quadx vargquad;

// Convert between the in-memory quad layout and the interleaved vmask2
// working layout.
static INLINE vmask2 vcast_vm2_aq(vargquad aq) {
  return vinterleave_vm2_vm2(vloadu_vm2_p(&aq));
}

static INLINE vargquad vcast_aq_vm2(vmask2 vm2) {
  vm2 = vuninterleave_vm2_vm2(vm2);
  vargquad aq;
  memcpy(&aq, &vm2, VECTLENDP * 16);
  return aq;
}
#endif // #if !defined(SLEEF_GENHEADER)

// True iff no 64-bit lane of the predicate is active.
static INLINE int vtestallzeros_i_vo64(vopmask g) {
  return svcntp_b64(svptrue_b64(), g) == 0;
}

static INLINE vmask vsel_vm_vo64_vm_vm(vopmask o, vmask x, vmask y) {
  return svreinterpret_s32_s64(
      svsel_s64(o, svreinterpret_s64_s32(x), svreinterpret_s64_s32(y)));
}

// 64-bit arithmetic on masks stored as 32-bit lanes.
static INLINE vmask vsub64_vm_vm_vm(vmask x, vmask y) {
  return svreinterpret_s32_s64(
      svsub_s64_x(ptrue, svreinterpret_s64_s32(x), svreinterpret_s64_s32(y)));
}

static INLINE vmask vneg64_vm_vm(vmask x) {
  return svreinterpret_s32_s64(svneg_s64_x(ptrue, svreinterpret_s64_s32(x)));
}

static INLINE vopmask vgt64_vo_vm_vm(vmask x, vmask y) {
  return svcmpgt_s64(ptrue, svreinterpret_s64_s32(x),
                     svreinterpret_s64_s32(y));
}

// 64-bit shifts on the u64 view of the mask.
#define vsll64_vm_vm_i(x, c) svreinterpret_s32_u64(svlsl_n_u64_x(ptrue, svreinterpret_u64_s32(x), c))
//@#define vsll64_vm_vm_i(x, c) svreinterpret_s32_u64(svlsl_n_u64_x(ptrue, svreinterpret_u64_s32(x), c))
#define vsrl64_vm_vm_i(x, c) svreinterpret_s32_u64(svlsr_n_u64_x(ptrue, svreinterpret_u64_s32(x), c))
//@#define vsrl64_vm_vm_i(x, c) svreinterpret_s32_u64(svlsr_n_u64_x(ptrue, svreinterpret_u64_s32(x), c))

// Sign-extend 32-bit ints into 64-bit lanes (svextw).
static INLINE vmask vcast_vm_vi(vint vi) {
  return svreinterpret_s32_s64(
      svextw_s64_z(ptrue, svreinterpret_s64_s32(vi)));
}
// Keep only the low 32 bits of each 64-bit lane.
static INLINE vint vcast_vi_vm(vmask vm) {
  return vand_vm_vm_vm(vm, vcast_vm_i_i(0, 0xffffffff));
}


================================================
FILE: src/memory.cpp
================================================
/*

Copyright (c) 2019 Agenium Scale

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files
(the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ #ifdef NSIMD_IS_MSVC #include #else #ifndef _POSIX_C_SOURCE #define _POSIX_C_SOURCE 200112L #endif #include #endif // ---------------------------------------------------------------------------- #define NSIMD_INSIDE #include // ---------------------------------------------------------------------------- extern "C" { NSIMD_DLLEXPORT void *nsimd_aligned_alloc(nsimd_nat n) { #ifdef NSIMD_IS_MSVC return _aligned_malloc(n, NSIMD_MAX_ALIGNMENT); #else void *ptr; if (posix_memalign(&ptr, NSIMD_MAX_ALIGNMENT, (size_t)n)) { return NULL; } else { return ptr; } #endif } // ---------------------------------------------------------------------------- NSIMD_DLLEXPORT void nsimd_aligned_free(void *ptr) { #ifdef NSIMD_IS_MSVC _aligned_free(ptr); #else free(ptr); #endif } } // extern "C" ================================================ FILE: src/misc.h ================================================ // Copyright Naoki Shibata and contributors 2010 - 2020. // Distributed under the Boost Software License, Version 1.0. 
// (See accompanying file LICENSE.txt or copy at // http://www.boost.org/LICENSE_1_0.txt) // #ifndef __MISC_H__ #define __MISC_H__ #if !defined(SLEEF_GENHEADER) #include #include #endif #ifndef M_PI #define M_PI 3.141592653589793238462643383279502884 #endif #ifndef M_PIl #define M_PIl 3.141592653589793238462643383279502884L #endif #ifndef M_1_PI #define M_1_PI 0.318309886183790671537767526745028724 #endif #ifndef M_1_PIl #define M_1_PIl 0.318309886183790671537767526745028724L #endif #ifndef M_2_PI #define M_2_PI 0.636619772367581343075535053490057448 #endif #ifndef M_2_PIl #define M_2_PIl 0.636619772367581343075535053490057448L #endif #ifndef SLEEF_FP_ILOGB0 #define SLEEF_FP_ILOGB0 ((int)-2147483648) #endif #ifndef SLEEF_FP_ILOGBNAN #define SLEEF_FP_ILOGBNAN ((int)2147483647) #endif #define SLEEF_SNAN (((union { long long int i; double d; }) { .i = INT64_C(0x7ff0000000000001) }).d) #define SLEEF_SNANf (((union { long int i; float f; }) { .i = 0xff800001 }).f) // /* PI_A to PI_D are constants that satisfy the following two conditions. * For PI_A, PI_B and PI_C, the last 28 bits are zero. * PI_A + PI_B + PI_C + PI_D is close to PI as much as possible. The argument of a trig function is multiplied by 1/PI, and the integral part is divided into two parts, each has at most 28 bits. So, the maximum argument that could be correctly reduced should be 2^(28*2-1) PI = 1.1e+17. However, due to internal double precision calculation, the actual maximum argument that can be correctly reduced is around 2^47. */ #define PI_A 3.1415926218032836914 #define PI_B 3.1786509424591713469e-08 #define PI_C 1.2246467864107188502e-16 #define PI_D 1.2736634327021899816e-24 #define TRIGRANGEMAX 1e+14 /* PI_A2 and PI_B2 are constants that satisfy the following two conditions. * The last 3 bits of PI_A2 are zero. * PI_A2 + PI_B2 is close to PI as much as possible. The argument of a trig function is multiplied by 1/PI, and the integral part is multiplied by PI_A2. 
 So, the maximum argument that could
 be correctly reduced should be 2^(3-1) PI = 12.6. By testing, we confirmed
 that it correctly reduces the argument up to around 15.
 */
#define PI_A2 3.141592653589793116
#define PI_B2 1.2246467991473532072e-16
#define TRIGRANGEMAX2 15

// High/low split of 2/pi for extended-precision reduction.
#define M_2_PI_H 0.63661977236758138243
#define M_2_PI_L -3.9357353350364971764e-17

#define SQRT_DBL_MAX 1.3407807929942596355e+154

#define TRIGRANGEMAX3 1e+9

#define M_4_PI 1.273239544735162542821171882678754627704620361328125

// High/low split of ln(2) and of log10(2); R_LN2 = 1/ln(2).
#define L2U .69314718055966295651160180568695068359375
#define L2L .28235290563031577122588448175013436025525412068e-12
#define R_LN2 1.442695040888963407359924681001892137426645954152985934135449406931
#define L10U 0.30102999566383914498 // log 2 / log 10
#define L10L 1.4205023227266099418e-13
#define LOG10_2 3.3219280948873623478703194294893901758648313930
#define L10Uf 0.3010253906f
#define L10Lf 4.605038981e-06f

//

// Single-precision counterparts of the PI splits and range limits above.
#define PI_Af 3.140625f
#define PI_Bf 0.0009670257568359375f
#define PI_Cf 6.2771141529083251953e-07f
#define PI_Df 1.2154201256553420762e-10f
#define TRIGRANGEMAXf 39000

#define PI_A2f 3.1414794921875f
#define PI_B2f 0.00011315941810607910156f
#define PI_C2f 1.9841872589410058936e-09f
#define TRIGRANGEMAX2f 125.0f

#define TRIGRANGEMAX4f 8e+6f

#define SQRT_FLT_MAX 18446743523953729536.0

#define L2Uf 0.693145751953125f
#define L2Lf 1.428606765330187045e-06f

#define R_LN2f 1.442695040888963407359924681001892137426645954152985934135449406931f
#define M_PIf ((float)M_PI)

//

#ifndef MIN
#define MIN(x, y) ((x) < (y) ? (x) : (y))
#endif

#ifndef MAX
#define MAX(x, y) ((x) > (y) ? (x) : (y))
#endif

#ifndef ABS
// (macro body continues on the next line of the excerpt)
#define ABS(x) ((x) < 0 ?
// Completion of the ABS macro begun on the previous line.
-(x) : (x))
#endif

// Two-level expansion so that macro arguments are expanded before
// stringization.
#define stringify(s) stringify_(s)
#define stringify_(s) #s

#if !defined(SLEEF_GENHEADER)
typedef long double longdouble;
#endif

// Two-element value structs used by SLEEF to return (value, error) pairs.
#if !defined(Sleef_double2_DEFINED) && !defined(SLEEF_GENHEADER)
#define Sleef_double2_DEFINED
typedef struct {
  double x, y;
} Sleef_double2;
#endif

#if !defined(Sleef_float2_DEFINED) && !defined(SLEEF_GENHEADER)
#define Sleef_float2_DEFINED
typedef struct {
  float x, y;
} Sleef_float2;
#endif

#if !defined(Sleef_longdouble2_DEFINED) && !defined(SLEEF_GENHEADER)
#define Sleef_longdouble2_DEFINED
typedef struct {
  long double x, y;
} Sleef_longdouble2;
#endif

// Quad precision: native __float128 when available, otherwise a
// double-double struct.
#if !defined(Sleef_quad_DEFINED) && !defined(SLEEF_GENHEADER)
#define Sleef_quad_DEFINED
#if defined(ENABLEFLOAT128)
typedef __float128 Sleef_quad;
#else
typedef struct {
  double x, y;
} Sleef_quad;
#endif
#endif

#if !defined(Sleef_quad1_DEFINED) && !defined(SLEEF_GENHEADER)
#define Sleef_quad1_DEFINED
typedef union {
  struct {
    Sleef_quad x;
  };
  Sleef_quad s[1];
} Sleef_quad1;
#endif

#if !defined(Sleef_quad2_DEFINED) && !defined(SLEEF_GENHEADER)
#define Sleef_quad2_DEFINED
typedef union {
  struct {
    Sleef_quad x, y;
  };
  Sleef_quad s[2];
} Sleef_quad2;
#endif

#if !defined(Sleef_quad4_DEFINED) && !defined(SLEEF_GENHEADER)
#define Sleef_quad4_DEFINED
typedef union {
  struct {
    Sleef_quad x, y, z, w;
  };
  Sleef_quad s[4];
} Sleef_quad4;
#endif

#if !defined(Sleef_quad8_DEFINED) && !defined(SLEEF_GENHEADER)
#define Sleef_quad8_DEFINED
typedef union {
  Sleef_quad s[8];
} Sleef_quad8;
#endif

#if defined(__ARM_FEATURE_SVE) && !defined(Sleef_quadx_DEFINED) && !defined(SLEEF_GENHEADER)
#define Sleef_quadx_DEFINED
typedef union {
  Sleef_quad s[32];
} Sleef_quadx;
#endif

//

// Compiler-feature macros: GCC/Clang/ICC branch.
#if (defined (__GNUC__) || defined (__clang__) || defined(__INTEL_COMPILER)) && !defined(_MSC_VER)
#define LIKELY(condition) __builtin_expect(!!(condition), 1)
#define UNLIKELY(condition) __builtin_expect(!!(condition), 0)
#define RESTRICT __restrict__

#ifndef __arm__
#define ALIGNED(x) __attribute__((aligned(x)))
// (branch continues on the next line of the excerpt)
#else
#define ALIGNED(x) #endif #if defined(SLEEF_GENHEADER) #define INLINE SLEEF_ALWAYS_INLINE #define EXPORT SLEEF_INLINE #define CONST SLEEF_CONST #define NOEXPORT #else // #if defined(SLEEF_GENHEADER) #ifndef __INTEL_COMPILER #define CONST const #else #define CONST #endif #define INLINE __attribute__((always_inline)) #if defined(__MINGW32__) || defined(__MINGW64__) || defined(__CYGWIN__) #ifndef SLEEF_STATIC_LIBS #define EXPORT __stdcall __declspec(dllexport) #define NOEXPORT #else // #ifndef SLEEF_STATIC_LIBS #define EXPORT #define NOEXPORT #endif // #ifndef SLEEF_STATIC_LIBS #else // #if defined(__MINGW32__) || defined(__MINGW64__) || defined(__CYGWIN__) #define EXPORT __attribute__((visibility("default"))) #define NOEXPORT __attribute__ ((visibility ("hidden"))) #endif // #if defined(__MINGW32__) || defined(__MINGW64__) || defined(__CYGWIN__) #endif // #if defined(SLEEF_GENHEADER) #define SLEEF_NAN __builtin_nan("") #define SLEEF_NANf __builtin_nanf("") #define SLEEF_NANl __builtin_nanl("") #define SLEEF_INFINITY __builtin_inf() #define SLEEF_INFINITYf __builtin_inff() #define SLEEF_INFINITYl __builtin_infl() #if defined(__INTEL_COMPILER) || defined (__clang__) #define SLEEF_INFINITYq __builtin_inf() #define SLEEF_NANq __builtin_nan("") #else #define SLEEF_INFINITYq __builtin_infq() #define SLEEF_NANq (SLEEF_INFINITYq - SLEEF_INFINITYq) #endif #elif defined(_MSC_VER) // #if (defined (__GNUC__) || defined (__clang__) || defined(__INTEL_COMPILER)) && !defined(_MSC_VER) #define INLINE __forceinline #define CONST #define RESTRICT #define ALIGNED(x) #define LIKELY(condition) (condition) #define UNLIKELY(condition) (condition) #ifndef SLEEF_STATIC_LIBS #define EXPORT __declspec(dllexport) #define NOEXPORT #else #define EXPORT #define NOEXPORT #endif #if (defined(__GNUC__) || defined(__CLANG__)) && (defined(__i386__) || defined(__x86_64__)) && !defined(SLEEF_GENHEADER) #include #endif #define SLEEF_INFINITY (1e+300 * 1e+300) #define SLEEF_NAN (SLEEF_INFINITY - 
// Completion of the SLEEF_NAN definition begun on the previous line
// (inf - inf yields NaN).
SLEEF_INFINITY)
#define SLEEF_INFINITYf ((float)SLEEF_INFINITY)
#define SLEEF_NANf ((float)SLEEF_NAN)
#define SLEEF_INFINITYl ((long double)SLEEF_INFINITY)
#define SLEEF_NANl ((long double)SLEEF_NAN)

// MSVC does not define __SSE2__ etc.; derive them from its own
// architecture macros (_M_X64 implies SSE2, _M_IX86_FP encodes /arch).
#if (defined(_M_AMD64) || defined(_M_X64))
#ifndef __SSE2__
#define __SSE2__
#define __SSE3__
#define __SSE4_1__
#endif
#elif _M_IX86_FP == 2
#ifndef __SSE2__
#define __SSE2__
#define __SSE3__
#define __SSE4_1__
#endif
#elif _M_IX86_FP == 1
#ifndef __SSE__
#define __SSE__
#endif
#endif

#endif // #elif defined(_MSC_VER) // #if (defined (__GNUC__) || defined (__clang__) || defined(__INTEL_COMPILER)) && !defined(_MSC_VER)

// Non-Linux platforms may lack the float/long-double classification
// functions; emulate them with comparisons (NaN is the only x != x).
#if !defined(__linux__)
#define isinff(x) ((x) == SLEEF_INFINITYf || (x) == -SLEEF_INFINITYf)
#define isinfl(x) ((x) == SLEEF_INFINITYl || (x) == -SLEEF_INFINITYl)
#define isnanf(x) ((x) != (x))
#define isnanl(x) ((x) != (x))
#endif

#endif // #ifndef __MISC_H__

#ifdef ENABLE_AAVPCS
#define VECTOR_CC __attribute__((aarch64_vector_pcs))
#else
#define VECTOR_CC
#endif

/* NSIMD specific */
// nsimd overrides: force plain "inline" and redirect SLEEF's reduction
// tables to the nsimd-provided symbols.
#ifndef NSIMD_SLEEF_MISC_H
#define NSIMD_SLEEF_MISC_H

#ifdef INLINE
#undef INLINE
#endif
#define INLINE inline

#define Sleef_rempitabdp nsimd_sleef_rempitab_f64
#define Sleef_rempitabsp nsimd_sleef_rempitab_f32

#endif


================================================
FILE: src/rempitab.c
================================================
// Copyright Naoki Shibata and contributors 2010 - 2020.
// Distributed under the Boost Software License, Version 1.0.
// (See accompanying file LICENSE.txt or copy at // http://www.boost.org/LICENSE_1_0.txt) #include "misc.h" #if !defined(SLEEF_GENHEADER) #define FUNCATR NOEXPORT ALIGNED(64) #else #define FUNCATR EXPORT ALIGNED(64) #endif FUNCATR const double Sleef_rempitabdp[] = { 0.15915494309189531785, 1.7916237278037667488e-17, 2.5454160968749269937e-33, 2.1132476107887107169e-49, 0.03415494309189533173, 4.0384494702232122736e-18, 1.0046721413651383112e-33, 2.1132476107887107169e-49, 0.03415494309189533173, 4.0384494702232122736e-18, 1.0046721413651383112e-33, 2.1132476107887107169e-49, 0.0029049430918953351999, 5.6900251826959904774e-19, 4.1707169171520598517e-35, -2.496415728504571394e-51, 0.0029049430918953351999, 5.6900251826959904774e-19, 4.1707169171520598517e-35, -2.496415728504571394e-51, 0.0029049430918953351999, 5.6900251826959904774e-19, 4.1707169171520598517e-35, -2.496415728504571394e-51, 0.0029049430918953351999, 5.6900251826959904774e-19, 4.1707169171520598517e-35, -2.496415728504571394e-51, 0.00095181809189533563356, 1.3532164927539732229e-19, -6.4410794381603004826e-36, 1.7634898158762436344e-52, 0.00095181809189533563356, 1.3532164927539732229e-19, -6.4410794381603004826e-36, 1.7634898158762436344e-52, 0.00046353684189533574198, 2.6901432026846872871e-20, -4.2254836195018827479e-37, 9.301187206862134399e-54, 0.00021939621689533574198, 2.6901432026846872871e-20, -4.2254836195018827479e-37, 9.301187206862134399e-54, 9.7325904395335769087e-05, -2.0362228529073840241e-22, 6.2960434583523738135e-40, 2.6283399642369025999e-57, 3.6290748145335769087e-05, -2.0362228529073840241e-22, 6.2960434583523738135e-40, 2.6283399642369025999e-57, 5.7731700203357690874e-06, -2.0362228529073840241e-22, 6.2960434583523738135e-40, 2.6283399642369025999e-57, 5.7731700203357690874e-06, -2.0362228529073840241e-22, 6.2960434583523738135e-40, 2.6283399642369025999e-57, 5.7731700203357690874e-06, -2.0362228529073840241e-22, 6.2960434583523738135e-40, 2.6283399642369025999e-57, 
1.9584727547107690874e-06, -2.0362228529073840241e-22, 6.2960434583523738135e-40, 2.6283399642369025999e-57, 5.1124121898268875627e-08, 8.135951522836682362e-24, 6.2960434583523738135e-40, 2.6283399642369025999e-57, 5.1124121898268875627e-08, 8.135951522836682362e-24, 6.2960434583523738135e-40, 2.6283399642369025999e-57, 5.1124121898268875627e-08, 8.135951522836682362e-24, 6.2960434583523738135e-40, 2.6283399642369025999e-57, 5.1124121898268875627e-08, 8.135951522836682362e-24, 6.2960434583523738135e-40, 2.6283399642369025999e-57, 5.1124121898268875627e-08, 8.135951522836682362e-24, 6.2960434583523738135e-40, 2.6283399642369025999e-57, 5.1124121898268875627e-08, 8.135951522836682362e-24, 6.2960434583523738135e-40, 2.6283399642369025999e-57, 2.1321799510573569745e-08, 1.5185066224124613304e-24, 2.6226236120327253511e-40, 2.6283399642369025999e-57, 6.4206383167259151492e-09, -1.3585460269359374382e-25, -1.3244127270701094468e-41, -2.4695541513869446866e-57, 6.4206383167259151492e-09, -1.3585460269359374382e-25, -1.3244127270701094468e-41, -2.4695541513869446866e-57, 2.6953480182640010867e-09, -1.3585460269359374382e-25, -1.3244127270701094468e-41, -2.4695541513869446866e-57, 8.3270286903304384868e-10, 7.0940550444663151936e-26, 9.7147467687967058732e-42, 7.9392906424978921242e-59, 8.3270286903304384868e-10, 7.0940550444663151936e-26, 9.7147467687967058732e-42, 7.9392906424978921242e-59, 3.6704158172530459087e-10, 7.0940550444663151936e-26, 9.7147467687967058732e-42, 7.9392906424978921242e-59, 1.3421093807143501366e-10, 1.9241762160098927996e-26, 3.9750282589222551507e-42, 7.9392906424978921242e-59, 1.7795616244500218596e-11, -1.452834466126541428e-28, -1.5869767474823787636e-44, -2.6168913164368963837e-61, 1.7795616244500218596e-11, -1.452834466126541428e-28, -1.5869767474823787636e-44, -2.6168913164368963837e-61, 1.7795616244500218596e-11, -1.452834466126541428e-28, -1.5869767474823787636e-44, -2.6168913164368963837e-61, 3.2437010161333667893e-12, 
-1.452834466126541428e-28, -1.5869767474823787636e-44, -2.6168913164368963837e-61, 3.2437010161333667893e-12, -1.452834466126541428e-28, -1.5869767474823787636e-44, -2.6168913164368963837e-61, 3.2437010161333667893e-12, -1.452834466126541428e-28, -1.5869767474823787636e-44, -2.6168913164368963837e-61, 1.4247116125875099096e-12, 2.5861333686050385673e-28, 2.8971783383570358633e-44, -2.6168913164368963837e-61, 5.1521691081458187359e-13, 5.6664945123924856962e-29, 6.5510079543732854985e-45, -2.6168913164368963837e-61, 6.0469559928117805118e-14, 6.1778471897801070206e-30, 9.4581409707401690366e-46, 4.9461632249367446986e-62, 6.0469559928117805118e-14, 6.1778471897801070206e-30, 9.4581409707401690366e-46, 4.9461632249367446986e-62, 6.0469559928117805118e-14, 6.1778471897801070206e-30, 9.4581409707401690366e-46, 4.9461632249367446986e-62, 3.6261410673097965595e-15, -1.3304005198798645927e-31, -1.7578597149294783985e-47, 8.4432539107728104262e-64, 3.6261410673097965595e-15, -1.3304005198798645927e-31, -1.7578597149294783985e-47, 8.4432539107728104262e-64, 3.6261410673097965595e-15, -1.3304005198798645927e-31, -1.7578597149294783985e-47, 8.4432539107728104262e-64, 3.6261410673097965595e-15, -1.3304005198798645927e-31, -1.7578597149294783985e-47, 8.4432539107728104262e-64, 7.3427388509295482183e-17, 1.4871367740953237822e-32, -1.1571307704883330232e-48, -6.7249112515659578102e-65, 7.3427388509295482183e-17, 1.4871367740953237822e-32, -1.1571307704883330232e-48, -6.7249112515659578102e-65, 7.3427388509295482183e-17, 1.4871367740953237822e-32, -1.1571307704883330232e-48, -6.7249112515659578102e-65, 7.3427388509295482183e-17, 1.4871367740953237822e-32, -1.1571307704883330232e-48, -6.7249112515659578102e-65, 7.3427388509295482183e-17, 1.4871367740953237822e-32, -1.1571307704883330232e-48, -6.7249112515659578102e-65, 7.3427388509295482183e-17, 1.4871367740953237822e-32, -1.1571307704883330232e-48, -6.7249112515659578102e-65, 1.7916237278037667488e-17, 2.5454160968749269937e-33, 
2.1132476107887107169e-49, 8.7154294504188129325e-66, 1.7916237278037667488e-17, 2.5454160968749269937e-33, 2.1132476107887107169e-49, 8.7154294504188129325e-66, 4.0384494702232122736e-18, 1.0046721413651383112e-33, 2.1132476107887107169e-49, 8.7154294504188129325e-66, 4.0384494702232122736e-18, 1.0046721413651383112e-33, 2.1132476107887107169e-49, 8.7154294504188129325e-66, 5.6900251826959904774e-19, 4.1707169171520598517e-35, -2.4964157285045710972e-51, -1.866653112309982615e-67, 5.6900251826959904774e-19, 4.1707169171520598517e-35, -2.4964157285045710972e-51, -1.866653112309982615e-67, 5.6900251826959904774e-19, 4.1707169171520598517e-35, -2.4964157285045710972e-51, -1.866653112309982615e-67, 1.3532164927539732229e-19, -6.4410794381603004826e-36, 1.7634898158762432635e-52, 3.5887057810247033998e-68, 1.3532164927539732229e-19, -6.4410794381603004826e-36, 1.7634898158762432635e-52, 3.5887057810247033998e-68, 2.6901432026846872871e-20, -4.2254836195018827479e-37, 9.3011872068621332399e-54, 1.113250147552460308e-69, 2.6901432026846872871e-20, -4.2254836195018827479e-37, 9.3011872068621332399e-54, 1.113250147552460308e-69, 2.6901432026846872871e-20, -4.2254836195018827479e-37, 9.3011872068621332399e-54, 1.113250147552460308e-69, 1.3348904870778067446e-20, -4.2254836195018827479e-37, 9.3011872068621332399e-54, 1.113250147552460308e-69, 6.5726412927436632287e-21, 1.0820844071023395684e-36, 1.7634898158762432635e-52, 3.5887057810247033998e-68, 3.1845095037264626247e-21, 3.2976802257607573031e-37, 9.3011872068621332399e-54, 1.113250147552460308e-69, 1.4904436092178623228e-21, -4.6390169687056261795e-38, -1.1392999419355048437e-54, -4.587677453735884283e-71, 6.4341066196356198368e-22, -4.6390169687056261795e-38, -1.1392999419355048437e-54, -4.587677453735884283e-71, 2.1989418833641172011e-22, 4.7649378378726728402e-38, 9.3011872068621332399e-54, 1.113250147552460308e-69, 8.135951522836682362e-24, 6.2960434583523738135e-40, 2.6283399642369020339e-57, 
5.3358074162805516304e-73, 8.135951522836682362e-24, 6.2960434583523738135e-40, 2.6283399642369020339e-57, 5.3358074162805516304e-73, 8.135951522836682362e-24, 6.2960434583523738135e-40, 2.6283399642369020339e-57, 5.3358074162805516304e-73, 8.135951522836682362e-24, 6.2960434583523738135e-40, 2.6283399642369020339e-57, 5.3358074162805516304e-73, 8.135951522836682362e-24, 6.2960434583523738135e-40, 2.6283399642369020339e-57, 5.3358074162805516304e-73, 1.5185066224124613304e-24, 2.6226236120327253511e-40, 2.6283399642369020339e-57, 5.3358074162805516304e-73, 1.5185066224124613304e-24, 2.6226236120327253511e-40, 2.6283399642369020339e-57, 5.3358074162805516304e-73, 1.5185066224124613304e-24, 2.6226236120327253511e-40, 2.6283399642369020339e-57, 5.3358074162805516304e-73, 6.9132600985943383921e-25, 7.8591368887290111994e-41, 2.6283399642369020339e-57, 5.3358074162805516304e-73, 2.7773570358292009361e-25, -1.3244127270701094468e-41, -2.4695541513869446866e-57, -3.2399200798614356002e-74, 7.0940550444663151936e-26, 9.7147467687967058732e-42, 7.9392906424978921242e-59, 2.9745456030524896742e-75, 7.0940550444663151936e-26, 9.7147467687967058732e-42, 7.9392906424978921242e-59, 2.9745456030524896742e-75, 1.9241762160098927996e-26, 3.9750282589222551507e-42, 7.9392906424978921242e-59, 2.9745456030524896742e-75, 1.9241762160098927996e-26, 3.9750282589222551507e-42, 7.9392906424978921242e-59, 2.9745456030524896742e-75, 6.317065088957874881e-27, -3.2976062348358281152e-43, -2.6168913164368963837e-61, 3.7036201000008290615e-78, 6.317065088957874881e-27, -3.2976062348358281152e-43, -2.6168913164368963837e-61, 3.7036201000008290615e-78, 3.0858908211726098086e-27, 3.8770419025072344914e-43, 7.9392906424978921242e-59, 2.9745456030524896742e-75, 1.4703036872799779898e-27, 2.8971783383570358633e-44, -2.6168913164368963837e-61, 3.7036201000008290615e-78, 6.625101203336619011e-28, 2.8971783383570358633e-44, -2.6168913164368963837e-61, 3.7036201000008290615e-78, 2.5861333686050385673e-28, 
2.8971783383570358633e-44, -2.6168913164368963837e-61, 3.7036201000008290615e-78, 5.6664945123924856962e-29, 6.5510079543732854985e-45, -2.6168913164368963837e-61, 3.7036201000008290615e-78, 5.6664945123924856962e-29, 6.5510079543732854985e-45, -2.6168913164368963837e-61, 3.7036201000008290615e-78, 6.1778471897801070206e-30, 9.4581409707401690366e-46, 4.9461632249367446986e-62, 3.7036201000008290615e-78, 6.1778471897801070206e-30, 9.4581409707401690366e-46, 4.9461632249367446986e-62, 3.7036201000008290615e-78, 6.1778471897801070206e-30, 9.4581409707401690366e-46, 4.9461632249367446986e-62, 3.7036201000008290615e-78, 6.1778471897801070206e-30, 9.4581409707401690366e-46, 4.9461632249367446986e-62, 3.7036201000008290615e-78, 3.0224035688960604996e-30, 2.451648649116083682e-46, 4.9461632249367446986e-62, 3.7036201000008290615e-78, 1.4446817584540368888e-30, 2.451648649116083682e-46, 4.9461632249367446986e-62, 3.7036201000008290615e-78, 6.5582085323302525856e-31, 7.0002556871006273225e-47, 1.0567786762735315635e-62, -6.1446417754639313137e-79, 2.6139040062251944343e-31, -1.7578597149294783985e-47, 8.4432539107728090768e-64, 1.9517662449371102229e-79, 6.4175174317266470186e-32, 4.3166913557804827486e-48, 8.4432539107728090768e-64, 1.9517662449371102229e-79, 6.4175174317266470186e-32, 4.3166913557804827486e-48, 8.4432539107728090768e-64, 1.9517662449371102229e-79, 1.4871367740953237822e-32, -1.1571307704883330232e-48, -6.7249112515659569668e-65, -7.2335760163150273591e-81, 1.4871367740953237822e-32, -1.1571307704883330232e-48, -6.7249112515659569668e-65, -7.2335760163150273591e-81, 2.5454160968749269937e-33, 2.1132476107887107169e-49, 8.7154294504188118783e-66, 1.2001823382693912203e-81, 2.5454160968749269937e-33, 2.1132476107887107169e-49, 8.7154294504188118783e-66, 1.2001823382693912203e-81, 2.5454160968749269937e-33, 2.1132476107887107169e-49, 8.7154294504188118783e-66, 1.2001823382693912203e-81, 1.0046721413651383112e-33, 2.1132476107887107169e-49, 
8.7154294504188118783e-66, 1.2001823382693912203e-81, 2.3430016361024414106e-34, 4.0267819632970559834e-50, -7.8013829534098555144e-67, -1.1759240463442418271e-82, 2.3430016361024414106e-34, 4.0267819632970559834e-50, -7.8013829534098555144e-67, -1.1759240463442418271e-82, 4.1707169171520598517e-35, -2.4964157285045710972e-51, -1.866653112309982615e-67, 1.4185069655957361252e-83, 4.1707169171520598517e-35, -2.4964157285045710972e-51, -1.866653112309982615e-67, 1.4185069655957361252e-83, 4.1707169171520598517e-35, -2.4964157285045710972e-51, -1.866653112309982615e-67, 1.4185069655957361252e-83, 1.7633044866680145008e-35, 2.8491136916798196016e-51, 4.0680767287898916022e-67, 1.4185069655957361252e-83, 5.595982714259923599e-36, 1.7634898158762432635e-52, 3.588705781024702988e-68, 5.9489775128085140685e-84, 5.595982714259923599e-36, 1.7634898158762432635e-52, 3.588705781024702988e-68, 5.9489775128085140685e-84, 2.5867171761548675786e-36, 1.7634898158762432635e-52, 3.588705781024702988e-68, 5.9489775128085140685e-84, 1.0820844071023395684e-36, 1.7634898158762432635e-52, 3.588705781024702988e-68, 5.9489775128085140685e-84, 3.2976802257607573031e-37, 9.3011872068621332399e-54, 1.113250147552460308e-69, 2.9286284920280944778e-86, 3.2976802257607573031e-37, 9.3011872068621332399e-54, 1.113250147552460308e-69, 2.9286284920280944778e-86, 1.4168892644450972904e-37, 9.3011872068621332399e-54, 1.113250147552460308e-69, 2.9286284920280944778e-86, 4.7649378378726728402e-38, 9.3011872068621332399e-54, 1.113250147552460308e-69, 2.9286284920280944778e-86, 6.2960434583523738135e-40, 2.6283399642369020339e-57, 5.3358074162805516304e-73, 4.524218473063975309e-90, 6.2960434583523738135e-40, 2.6283399642369020339e-57, 5.3358074162805516304e-73, 4.524218473063975309e-90, 6.2960434583523738135e-40, 2.6283399642369020339e-57, 5.3358074162805516304e-73, 4.524218473063975309e-90, 6.2960434583523738135e-40, 2.6283399642369020339e-57, 5.3358074162805516304e-73, 4.524218473063975309e-90, 
6.2960434583523738135e-40, 2.6283399642369020339e-57, 5.3358074162805516304e-73, 4.524218473063975309e-90, 6.2960434583523738135e-40, 2.6283399642369020339e-57, 5.3358074162805516304e-73, 4.524218473063975309e-90, 6.2960434583523738135e-40, 2.6283399642369020339e-57, 5.3358074162805516304e-73, 4.524218473063975309e-90, 2.6226236120327253511e-40, 2.6283399642369020339e-57, 5.3358074162805516304e-73, 4.524218473063975309e-90, 7.8591368887290111994e-41, 2.6283399642369020339e-57, 5.3358074162805516304e-73, 4.524218473063975309e-90, 7.8591368887290111994e-41, 2.6283399642369020339e-57, 5.3358074162805516304e-73, 4.524218473063975309e-90, 3.2673620808294506214e-41, 2.6283399642369020339e-57, 5.3358074162805516304e-73, 4.524218473063975309e-90, 9.7147467687967058732e-42, 7.9392906424978921242e-59, 2.9745456030524891833e-75, 5.969437008257943935e-91, 9.7147467687967058732e-42, 7.9392906424978921242e-59, 2.9745456030524891833e-75, 5.969437008257943935e-91, 3.9750282589222551507e-42, 7.9392906424978921242e-59, 2.9745456030524891833e-75, 5.969437008257943935e-91, 1.1051690039850297894e-42, 7.9392906424978921242e-59, 2.9745456030524891833e-75, 5.969437008257943935e-91, 1.1051690039850297894e-42, 7.9392906424978921242e-59, 2.9745456030524891833e-75, 5.969437008257943935e-91, 3.8770419025072344914e-43, 7.9392906424978921242e-59, 2.9745456030524891833e-75, 5.969437008257943935e-91, 2.8971783383570358633e-44, -2.6168913164368963837e-61, 3.7036201000008285821e-78, 5.6554937751584084315e-94, 2.8971783383570358633e-44, -2.6168913164368963837e-61, 3.7036201000008285821e-78, 5.6554937751584084315e-94, 2.8971783383570358633e-44, -2.6168913164368963837e-61, 3.7036201000008285821e-78, 5.6554937751584084315e-94, 2.8971783383570358633e-44, -2.6168913164368963837e-61, 3.7036201000008285821e-78, 5.6554937751584084315e-94, 6.5510079543732854985e-45, -2.6168913164368963837e-61, 3.7036201000008285821e-78, 5.6554937751584084315e-94, 6.5510079543732854985e-45, -2.6168913164368963837e-61, 
3.7036201000008285821e-78, 5.6554937751584084315e-94, 9.4581409707401690366e-46, 4.9461632249367446986e-62, 3.7036201000008285821e-78, 5.6554937751584084315e-94, 9.4581409707401690366e-46, 4.9461632249367446986e-62, 3.7036201000008285821e-78, 5.6554937751584084315e-94, 9.4581409707401690366e-46, 4.9461632249367446986e-62, 3.7036201000008285821e-78, 5.6554937751584084315e-94, 2.451648649116083682e-46, 4.9461632249367446986e-62, 3.7036201000008285821e-78, 5.6554937751584084315e-94, 2.451648649116083682e-46, 4.9461632249367446986e-62, 3.7036201000008285821e-78, 5.6554937751584084315e-94, 7.0002556871006273225e-47, 1.0567786762735315635e-62, -6.1446417754639301152e-79, -1.5355611056488084652e-94, 7.0002556871006273225e-47, 1.0567786762735315635e-62, -6.1446417754639301152e-79, -1.5355611056488084652e-94, 2.6211979860855749482e-47, 8.4432539107728090768e-64, 1.9517662449371099233e-79, 2.62202614552995759e-95, 4.3166913557804827486e-48, 8.4432539107728090768e-64, 1.9517662449371099233e-79, 2.62202614552995759e-95, 4.3166913557804827486e-48, 8.4432539107728090768e-64, 1.9517662449371099233e-79, 2.62202614552995759e-95, 4.3166913557804827486e-48, 8.4432539107728090768e-64, 1.9517662449371099233e-79, 2.62202614552995759e-95, 1.5797802926460750146e-48, 2.3660905534865399025e-64, -7.2335760163150273591e-81, 2.8738690232659205689e-99, 2.1132476107887107169e-49, 8.7154294504188118783e-66, 1.2001823382693912203e-81, 2.8738690232659205689e-99, 2.1132476107887107169e-49, 8.7154294504188118783e-66, 1.2001823382693912203e-81, 2.8738690232659205689e-99, 2.1132476107887107169e-49, 8.7154294504188118783e-66, 1.2001823382693912203e-81, 2.8738690232659205689e-99, 4.0267819632970559834e-50, -7.8013829534098555144e-67, -1.1759240463442418271e-82, 2.8738690232659205689e-99, 4.0267819632970559834e-50, -7.8013829534098555144e-67, -1.1759240463442418271e-82, 2.8738690232659205689e-99, 4.0267819632970559834e-50, -7.8013829534098555144e-67, -1.1759240463442418271e-82, 2.8738690232659205689e-99, 
1.8885701952232994665e-50, -7.8013829534098555144e-67, -1.1759240463442418271e-82, 2.8738690232659205689e-99, 8.1946431118642097069e-51, 1.5937536410989638719e-66, 1.459625439463388979e-82, 2.8738690232659205689e-99, 2.8491136916798196016e-51, 4.0680767287898916022e-67, 1.4185069655957361252e-83, -7.8369062883735917115e-100, 1.7634898158762432635e-52, 3.588705781024702988e-68, 5.9489775128085131541e-84, 1.0450891972142808004e-99, 1.7634898158762432635e-52, 3.588705781024702988e-68, 5.9489775128085131541e-84, 1.0450891972142808004e-99, 1.7634898158762432635e-52, 3.588705781024702988e-68, 5.9489775128085131541e-84, 1.0450891972142808004e-99, 1.7634898158762432635e-52, 3.588705781024702988e-68, 5.9489775128085131541e-84, 1.0450891972142808004e-99, 9.3011872068621332399e-54, 1.113250147552460308e-69, 2.9286284920280941206e-86, 2.1132026692048600853e-102, 9.3011872068621332399e-54, 1.113250147552460308e-69, 2.9286284920280941206e-86, 2.1132026692048600853e-102, 9.3011872068621332399e-54, 1.113250147552460308e-69, 2.9286284920280941206e-86, 2.1132026692048600853e-102, 9.3011872068621332399e-54, 1.113250147552460308e-69, 2.9286284920280941206e-86, 2.1132026692048600853e-102, 9.3011872068621332399e-54, 1.113250147552460308e-69, 2.9286284920280941206e-86, 2.1132026692048600853e-102, 4.0809436324633147776e-54, -4.587677453735884283e-71, -2.8859500138942368532e-87, -5.6567402911297190423e-103, 1.470821845263904967e-54, -4.587677453735884283e-71, -2.8859500138942368532e-87, -5.6567402911297190423e-103, 1.6576095166419998917e-55, 2.6568658093254848067e-71, 5.1571087196495574384e-87, 3.2728487032630537605e-103, 1.6576095166419998917e-55, 2.6568658093254848067e-71, 5.1571087196495574384e-87, 3.2728487032630537605e-103, 1.6576095166419998917e-55, 2.6568658093254848067e-71, 5.1571087196495574384e-87, 3.2728487032630537605e-103, 2.6283399642369020339e-57, 5.3358074162805516304e-73, 4.5242184730639744369e-90, 1.145584788913072936e-105, 2.6283399642369020339e-57, 
5.3358074162805516304e-73, 4.5242184730639744369e-90, 1.145584788913072936e-105, 2.6283399642369020339e-57, 5.3358074162805516304e-73, 4.5242184730639744369e-90, 1.145584788913072936e-105, 2.6283399642369020339e-57, 5.3358074162805516304e-73, 4.5242184730639744369e-90, 1.145584788913072936e-105, 2.6283399642369020339e-57, 5.3358074162805516304e-73, 4.5242184730639744369e-90, 1.145584788913072936e-105, 2.6283399642369020339e-57, 5.3358074162805516304e-73, 4.5242184730639744369e-90, 1.145584788913072936e-105, 7.9392906424978921242e-59, 2.9745456030524891833e-75, 5.969437008257942845e-91, 5.554706987098633963e-107, 7.9392906424978921242e-59, 2.9745456030524891833e-75, 5.969437008257942845e-91, 5.554706987098633963e-107, 7.9392906424978921242e-59, 2.9745456030524891833e-75, 5.969437008257942845e-91, 5.554706987098633963e-107, 7.9392906424978921242e-59, 2.9745456030524891833e-75, 5.969437008257942845e-91, 5.554706987098633963e-107, 7.9392906424978921242e-59, 2.9745456030524891833e-75, 5.969437008257942845e-91, 5.554706987098633963e-107, 7.9392906424978921242e-59, 2.9745456030524891833e-75, 5.969437008257942845e-91, 5.554706987098633963e-107, 3.9565608646667614317e-59, 2.9745456030524891833e-75, 5.969437008257942845e-91, 5.554706987098633963e-107, 1.9651959757511960854e-59, 2.9745456030524891833e-75, 5.969437008257942845e-91, 5.554706987098633963e-107, 9.6951353129341363331e-60, 7.6368645294831185015e-76, 1.0603435429602168369e-91, 1.0451839188820145747e-108, 4.7167230906452229674e-60, 7.6368645294831185015e-76, 1.0603435429602168369e-91, 1.0451839188820145747e-108, 2.2275169795007668372e-60, 2.1097166542226745549e-76, 4.4670685979800101779e-92, 1.0451839188820145747e-108, 9.8291392392853877215e-61, -6.5385728340754726503e-77, -1.3520652573660833788e-93, -2.3220403312043059402e-109, 3.6061239614242446325e-61, 7.2792968540756372162e-77, 1.3988851821689310822e-92, 1.0451839188820145747e-108, 4.9461632249367446986e-62, 3.7036201000008285821e-78, 5.6554937751584084315e-94, 
-1.9306041120023063932e-110, 4.9461632249367446986e-62, 3.7036201000008285821e-78, 5.6554937751584084315e-94, -1.9306041120023063932e-110, 4.9461632249367446986e-62, 3.7036201000008285821e-78, 5.6554937751584084315e-94, -1.9306041120023063932e-110, 1.0567786762735315635e-62, -6.1446417754639301152e-79, -1.535561105648808199e-94, -1.9306041120023063932e-110, 1.0567786762735315635e-62, -6.1446417754639301152e-79, -1.535561105648808199e-94, -1.9306041120023063932e-110, 8.4432539107728090768e-64, 1.9517662449371099233e-79, 2.62202614552995759e-95, 6.5314563001514358328e-112, 8.4432539107728090768e-64, 1.9517662449371099233e-79, 2.62202614552995759e-95, 6.5314563001514358328e-112, 8.4432539107728090768e-64, 1.9517662449371099233e-79, 2.62202614552995759e-95, 6.5314563001514358328e-112, 8.4432539107728090768e-64, 1.9517662449371099233e-79, 2.62202614552995759e-95, 6.5314563001514358328e-112, 2.3660905534865399025e-64, -7.2335760163150273591e-81, 2.8738690232659205689e-99, 1.8395411057335783574e-115, 2.3660905534865399025e-64, -7.2335760163150273591e-81, 2.8738690232659205689e-99, 1.8395411057335783574e-115, 8.4679971416497210292e-65, -7.2335760163150273591e-81, 2.8738690232659205689e-99, 1.8395411057335783574e-115, 8.7154294504188118783e-66, 1.2001823382693912203e-81, 2.8738690232659205689e-99, 1.8395411057335783574e-115, 8.7154294504188118783e-66, 1.2001823382693912203e-81, 2.8738690232659205689e-99, 1.8395411057335783574e-115, 8.7154294504188118783e-66, 1.2001823382693912203e-81, 2.8738690232659205689e-99, 1.8395411057335783574e-115, 8.7154294504188118783e-66, 1.2001823382693912203e-81, 2.8738690232659205689e-99, 1.8395411057335783574e-115, 3.9676455775389135587e-66, 1.459625439463388979e-82, 2.8738690232659205689e-99, 1.8395411057335783574e-115, 1.5937536410989638719e-66, 1.459625439463388979e-82, 2.8738690232659205689e-99, 1.8395411057335783574e-115, 4.0680767287898916022e-67, 1.4185069655957361252e-83, -7.8369062883735917115e-100, -1.9081236411894110579e-116, 
4.0680767287898916022e-67, 1.4185069655957361252e-83, -7.8369062883735917115e-100, -1.9081236411894110579e-116, 1.1007118082399544936e-67, 1.4185069655957361252e-83, -7.8369062883735917115e-100, -1.9081236411894110579e-116, 1.1007118082399544936e-67, 1.4185069655957361252e-83, -7.8369062883735917115e-100, -1.9081236411894110579e-116, 3.588705781024702988e-68, 5.9489775128085131541e-84, 1.0450891972142805974e-99, 1.8395411057335783574e-115, 3.588705781024702988e-68, 5.9489775128085131541e-84, 1.0450891972142805974e-99, 1.8395411057335783574e-115, 1.7341027056809927069e-68, 1.830931441234090934e-84, 1.3069928418846076386e-100, 3.1677600334418876704e-116, 8.0680116800913756637e-69, -2.2809159455312046184e-85, -4.0748824503880445403e-101, -6.3915272253158644628e-117, 3.4315039917320989315e-69, -2.2809159455312046184e-85, -4.0748824503880445403e-101, -6.3915272253158644628e-117, 1.113250147552460308e-69, 2.9286284920280941206e-86, 2.1132026692048600853e-102, -4.6672632026740766185e-119, 1.113250147552460308e-69, 2.9286284920280941206e-86, 2.1132026692048600853e-102, -4.6672632026740766185e-119, 5.3368668650755071652e-70, 2.9286284920280941206e-86, 2.1132026692048600853e-102, -4.6672632026740766185e-119, 2.4390495598509592076e-70, 2.9286284920280941206e-86, 2.1132026692048600853e-102, -4.6672632026740766185e-119, 9.901409072386855505e-71, -2.8859500138942368532e-87, -5.6567402911297190423e-103, -4.6672632026740766185e-119, 2.6568658093254848067e-71, 5.1571087196495574384e-87, 3.2728487032630532648e-103, 5.2465720993401781599e-119, 2.6568658093254848067e-71, 5.1571087196495574384e-87, 3.2728487032630532648e-103, 5.2465720993401781599e-119, 8.4572999356014273536e-72, 1.1355793528776598461e-87, 3.2728487032630532648e-103, 5.2465720993401781599e-119, 8.4572999356014273536e-72, 1.1355793528776598461e-87, 3.2728487032630532648e-103, 5.2465720993401781599e-119, 3.9294603961880721752e-72, 1.3019701118468578292e-88, -7.5747169634236195447e-105, -2.0152904854894729832e-121, 
1.6655406264813940833e-72, 1.3019701118468578292e-88, -7.5747169634236195447e-105, -2.0152904854894729832e-121, 5.3358074162805516304e-73, 4.5242184730639744369e-90, 1.1455847889130727424e-105, 1.8573014293598455046e-121, 5.3358074162805516304e-73, 4.5242184730639744369e-90, 1.1455847889130727424e-105, 1.8573014293598455046e-121, 2.5059077041472040156e-73, 4.5242184730639744369e-90, 1.1455847889130727424e-105, 1.8573014293598455046e-121, 1.0909578480805302081e-73, 4.5242184730639744369e-90, 1.1455847889130727424e-105, 1.8573014293598455046e-121, 3.8348292004719330442e-74, 4.5242184730639744369e-90, 1.1455847889130727424e-105, 1.8573014293598455046e-121, 2.9745456030524891833e-75, 5.969437008257942845e-91, 5.5547069870986327528e-107, 1.6304246661326865276e-122, 2.9745456030524891833e-75, 5.969437008257942845e-91, 5.5547069870986327528e-107, 1.6304246661326865276e-122, 2.9745456030524891833e-75, 5.969437008257942845e-91, 5.5547069870986327528e-107, 1.6304246661326865276e-122, 2.9745456030524891833e-75, 5.969437008257942845e-91, 5.5547069870986327528e-107, 1.6304246661326865276e-122, 7.6368645294831185015e-76, 1.0603435429602168369e-91, 1.0451839188820145747e-108, 4.2386081393205242443e-125, 7.6368645294831185015e-76, 1.0603435429602168369e-91, 1.0451839188820145747e-108, 4.2386081393205242443e-125, 2.1097166542226745549e-76, 4.4670685979800101779e-92, 1.0451839188820145747e-108, 4.2386081393205242443e-125, 2.1097166542226745549e-76, 4.4670685979800101779e-92, 1.0451839188820145747e-108, 4.2386081393205242443e-125, 7.2792968540756372162e-77, 1.3988851821689310822e-92, 1.0451839188820145747e-108, 4.2386081393205242443e-125, 3.7036201000008285821e-78, 5.6554937751584084315e-94, -1.9306041120023063932e-110, 1.0223371855251472933e-126, 3.7036201000008285821e-78, 5.6554937751584084315e-94, -1.9306041120023063932e-110, 1.0223371855251472933e-126, 3.7036201000008285821e-78, 5.6554937751584084315e-94, -1.9306041120023063932e-110, 1.0223371855251472933e-126, 
3.7036201000008285821e-78, 5.6554937751584084315e-94, -1.9306041120023063932e-110, 1.0223371855251472933e-126, 3.7036201000008285821e-78, 5.6554937751584084315e-94, -1.9306041120023063932e-110, 1.0223371855251472933e-126, 1.5445779612272179051e-78, 8.6145718795359707834e-95, 7.3062078800278780675e-111, 1.0223371855251472933e-126, 4.6505689184041232695e-79, 8.6145718795359707834e-95, 7.3062078800278780675e-111, 1.0223371855251472933e-126, 4.6505689184041232695e-79, 8.6145718795359707834e-95, 7.3062078800278780675e-111, 1.0223371855251472933e-126, 1.9517662449371099233e-79, 2.62202614552995759e-95, 6.5314563001514349095e-112, 9.9039323746573674262e-128, 6.0236490820360325022e-80, -3.7424672147304925625e-96, -1.784871512364483542e-112, 6.7095375687163151728e-129, 6.0236490820360325022e-80, -3.7424672147304925625e-96, -1.784871512364483542e-112, 6.7095375687163151728e-129, 2.6501457402022643213e-80, 3.7482149527770239293e-96, 6.5314563001514349095e-112, 9.9039323746573674262e-128, 9.6339406928538097998e-81, 2.8738690232659205689e-99, 1.8395411057335783574e-115, -7.8150389500644475446e-132, 1.2001823382693912203e-81, 2.8738690232659205689e-99, 1.8395411057335783574e-115, -7.8150389500644475446e-132, 1.2001823382693912203e-81, 2.8738690232659205689e-99, 1.8395411057335783574e-115, -7.8150389500644475446e-132, 1.2001823382693912203e-81, 2.8738690232659205689e-99, 1.8395411057335783574e-115, -7.8150389500644475446e-132, 1.459625439463388979e-82, 2.8738690232659205689e-99, 1.8395411057335783574e-115, -7.8150389500644475446e-132, 1.459625439463388979e-82, 2.8738690232659205689e-99, 1.8395411057335783574e-115, -7.8150389500644475446e-132, 1.459625439463388979e-82, 2.8738690232659205689e-99, 1.8395411057335783574e-115, -7.8150389500644475446e-132, 1.4185069655957361252e-83, -7.8369062883735917115e-100, -1.9081236411894107761e-116, -2.1796760241698337334e-132, 1.4185069655957361252e-83, -7.8369062883735917115e-100, -1.9081236411894107761e-116, -2.1796760241698337334e-132, 
1.4185069655957361252e-83, -7.8369062883735917115e-100, -1.9081236411894107761e-116, -2.1796760241698337334e-132, 1.4185069655957361252e-83, -7.8369062883735917115e-100, -1.9081236411894107761e-116, -2.1796760241698337334e-132, 5.9489775128085131541e-84, 1.0450891972142805974e-99, 1.8395411057335783574e-115, -7.8150389500644475446e-132, 1.830931441234090934e-84, 1.3069928418846076386e-100, 3.1677600334418871069e-116, 3.4556869017247800778e-132, 1.830931441234090934e-84, 1.3069928418846076386e-100, 3.1677600334418871069e-116, 3.4556869017247800778e-132, 8.0141992334048515034e-85, 1.3069928418846076386e-100, 3.1677600334418871069e-116, 3.4556869017247800778e-132, 2.8666416439368237283e-85, 1.6400545060233297363e-101, -4.6672632026740766185e-119, -3.755176715260116501e-136, 2.9286284920280941206e-86, 2.1132026692048600853e-102, -4.6672632026740766185e-119, -3.755176715260116501e-136, 2.9286284920280941206e-86, 2.1132026692048600853e-102, -4.6672632026740766185e-119, -3.755176715260116501e-136, 2.9286284920280941206e-86, 2.1132026692048600853e-102, -4.6672632026740766185e-119, -3.755176715260116501e-136, 2.9286284920280941206e-86, 2.1132026692048600853e-102, -4.6672632026740766185e-119, -3.755176715260116501e-136, 1.3200167453193350837e-86, 2.1132026692048600853e-102, -4.6672632026740766185e-119, -3.755176715260116501e-136, 5.1571087196495574384e-87, 3.2728487032630532648e-103, 5.2465720993401781599e-119, -3.755176715260116501e-136, 1.1355793528776598461e-87, 3.2728487032630532648e-103, 5.2465720993401781599e-119, -3.755176715260116501e-136, 1.1355793528776598461e-87, 3.2728487032630532648e-103, 5.2465720993401781599e-119, -3.755176715260116501e-136, 1.3019701118468578292e-88, -7.5747169634236195447e-105, -2.0152904854894725532e-121, -3.1562414818576682143e-137, 1.3019701118468578292e-88, -7.5747169634236195447e-105, -2.0152904854894725532e-121, -3.1562414818576682143e-137, 1.3019701118468578292e-88, -7.5747169634236195447e-105, -2.0152904854894725532e-121, 
-3.1562414818576682143e-137, 4.5242184730639744369e-90, 1.1455847889130727424e-105, 1.8573014293598452896e-121, 1.1431992269852683481e-137, 4.5242184730639744369e-90, 1.1455847889130727424e-105, 1.8573014293598452896e-121, 1.1431992269852683481e-137, 4.5242184730639744369e-90, 1.1455847889130727424e-105, 1.8573014293598452896e-121, 1.1431992269852683481e-137, 4.5242184730639744369e-90, 1.1455847889130727424e-105, 1.8573014293598452896e-121, 1.1431992269852683481e-137, 4.5242184730639744369e-90, 1.1455847889130727424e-105, 1.8573014293598452896e-121, 1.1431992269852683481e-137, 5.969437008257942845e-91, 5.5547069870986327528e-107, 1.6304246661326865276e-122, 6.8339049774534162772e-139, 5.969437008257942845e-91, 5.5547069870986327528e-107, 1.6304246661326865276e-122, 6.8339049774534162772e-139, 5.969437008257942845e-91, 5.5547069870986327528e-107, 1.6304246661326865276e-122, 6.8339049774534162772e-139, 1.0603435429602168369e-91, 1.0451839188820145747e-108, 4.2386081393205242443e-125, 1.1062055705591188256e-141, 1.0603435429602168369e-91, 1.0451839188820145747e-108, 4.2386081393205242443e-125, 1.1062055705591188256e-141, 1.0603435429602168369e-91, 1.0451839188820145747e-108, 4.2386081393205242443e-125, 1.1062055705591188256e-141, 4.4670685979800101779e-92, 1.0451839188820145747e-108, 4.2386081393205242443e-125, 1.1062055705591188256e-141, 1.3988851821689310822e-92, 1.0451839188820145747e-108, 4.2386081393205242443e-125, 1.1062055705591188256e-141, 1.3988851821689310822e-92, 1.0451839188820145747e-108, 4.2386081393205242443e-125, 1.1062055705591188256e-141, 6.3183932821616130831e-93, 1.0451839188820145747e-108, 4.2386081393205242443e-125, 1.1062055705591188256e-141, 2.4831640123977650651e-93, 1.9359195088038447797e-109, -4.8867691298577234423e-126, -2.0587960670007823264e-142, 5.6554937751584084315e-94, -1.9306041120023063932e-110, 1.0223371855251471293e-126, 1.2214168761472102282e-142, 5.6554937751584084315e-94, -1.9306041120023063932e-110, 1.0223371855251471293e-126, 
1.2214168761472102282e-142, 8.6145718795359707834e-95, 7.3062078800278780675e-111, 1.0223371855251471293e-126, 1.2214168761472102282e-142, 8.6145718795359707834e-95, 7.3062078800278780675e-111, 1.0223371855251471293e-126, 1.2214168761472102282e-142, 8.6145718795359707834e-95, 7.3062078800278780675e-111, 1.0223371855251471293e-126, 1.2214168761472102282e-142, 2.62202614552995759e-95, 6.5314563001514349095e-112, 9.9039323746573674262e-128, -8.6629775332868987041e-145, 2.62202614552995759e-95, 6.5314563001514349095e-112, 9.9039323746573674262e-128, -8.6629775332868987041e-145, 1.1238897120284541253e-95, 6.5314563001514349095e-112, 9.9039323746573674262e-128, -8.6629775332868987041e-145, 3.7482149527770239293e-96, 6.5314563001514349095e-112, 9.9039323746573674262e-128, -8.6629775332868987041e-145, 2.8738690232659205689e-99, 1.8395411057335783574e-115, -7.8150389500644475446e-132, -3.9681466199873824165e-148, 2.8738690232659205689e-99, 1.8395411057335783574e-115, -7.8150389500644475446e-132, -3.9681466199873824165e-148, 2.8738690232659205689e-99, 1.8395411057335783574e-115, -7.8150389500644475446e-132, -3.9681466199873824165e-148, 2.8738690232659205689e-99, 1.8395411057335783574e-115, -7.8150389500644475446e-132, -3.9681466199873824165e-148, 2.8738690232659205689e-99, 1.8395411057335783574e-115, -7.8150389500644475446e-132, -3.9681466199873824165e-148, 2.8738690232659205689e-99, 1.8395411057335783574e-115, -7.8150389500644475446e-132, -3.9681466199873824165e-148, 2.8738690232659205689e-99, 1.8395411057335783574e-115, -7.8150389500644475446e-132, -3.9681466199873824165e-148, 2.8738690232659205689e-99, 1.8395411057335783574e-115, -7.8150389500644475446e-132, -3.9681466199873824165e-148, 2.8738690232659205689e-99, 1.8395411057335783574e-115, -7.8150389500644475446e-132, -3.9681466199873824165e-148, 2.8738690232659205689e-99, 1.8395411057335783574e-115, -7.8150389500644475446e-132, -3.9681466199873824165e-148, 2.8738690232659205689e-99, 1.8395411057335783574e-115, 
-7.8150389500644475446e-132, -3.9681466199873824165e-148, 1.0450891972142805974e-99, 1.8395411057335783574e-115, -7.8150389500644475446e-132, -3.9681466199873824165e-148, 1.3069928418846076386e-100, 3.1677600334418871069e-116, 3.4556869017247794521e-132, 8.5448727249069983612e-148, 1.3069928418846076386e-100, 3.1677600334418871069e-116, 3.4556869017247794521e-132, 8.5448727249069983612e-148, 1.3069928418846076386e-100, 3.1677600334418871069e-116, 3.4556869017247794521e-132, 8.5448727249069983612e-148, 1.6400545060233297363e-101, -4.6672632026740766185e-119, -3.755176715260116501e-136, 2.1571619860435652883e-152, 1.6400545060233297363e-101, -4.6672632026740766185e-119, -3.755176715260116501e-136, 2.1571619860435652883e-152, 1.6400545060233297363e-101, -4.6672632026740766185e-119, -3.755176715260116501e-136, 2.1571619860435652883e-152, 2.1132026692048600853e-102, -4.6672632026740766185e-119, -3.755176715260116501e-136, 2.1571619860435652883e-152, 2.1132026692048600853e-102, -4.6672632026740766185e-119, -3.755176715260116501e-136, 2.1571619860435652883e-152, 2.1132026692048600853e-102, -4.6672632026740766185e-119, -3.755176715260116501e-136, 2.1571619860435652883e-152, 3.2728487032630532648e-103, 5.2465720993401781599e-119, -3.755176715260116501e-136, 2.1571619860435652883e-152, 3.2728487032630532648e-103, 5.2465720993401781599e-119, -3.755176715260116501e-136, 2.1571619860435652883e-152, 3.2728487032630532648e-103, 5.2465720993401781599e-119, -3.755176715260116501e-136, 2.1571619860435652883e-152, 1.0404514546648604359e-103, 2.896544483330507019e-120, 3.1239284188885823808e-136, 2.1571619860435652883e-152, 1.0404514546648604359e-103, 2.896544483330507019e-120, 3.1239284188885823808e-136, 2.1571619860435652883e-152, 4.8235214251531210473e-104, 2.896544483330507019e-120, 3.1239284188885823808e-136, 2.1571619860435652883e-152, 2.0330248644053793915e-104, 2.896544483330507019e-120, 3.1239284188885823808e-136, 2.1571619860435652883e-152, 6.3777658403150887343e-105, 
-2.0152904854894725532e-121, -3.156241481857667737e-137, -7.0684085473731388916e-153, 6.3777658403150887343e-105, -2.0152904854894725532e-121, -3.156241481857667737e-137, -7.0684085473731388916e-153, 2.88964513938041089e-105, 5.7298933442091639924e-121, -3.156241481857667737e-137, -7.0684085473731388916e-153, 1.1455847889130727424e-105, 1.8573014293598452896e-121, 1.1431992269852681095e-137, 2.4782675885631257398e-153, 2.7355461367940366859e-106, -7.8994528064813712419e-123, -2.0037599452814940222e-138, 9.1598554579059548847e-155, 2.7355461367940366859e-106, -7.8994528064813712419e-123, -2.0037599452814940222e-138, 9.1598554579059548847e-155, 5.5547069870986327528e-107, 1.6304246661326865276e-122, 6.8339049774534147855e-139, 9.1598554579059548847e-155, 5.5547069870986327528e-107, 1.6304246661326865276e-122, 6.8339049774534147855e-139, 9.1598554579059548847e-155, 1.0451839188820145747e-108, 4.2386081393205242443e-125, 1.1062055705591186799e-141, 1.1734404793201255869e-157, 1.0451839188820145747e-108, 4.2386081393205242443e-125, 1.1062055705591186799e-141, 1.1734404793201255869e-157, 1.0451839188820145747e-108, 4.2386081393205242443e-125, 1.1062055705591186799e-141, 1.1734404793201255869e-157, 1.0451839188820145747e-108, 4.2386081393205242443e-125, 1.1062055705591186799e-141, 1.1734404793201255869e-157, 1.0451839188820145747e-108, 4.2386081393205242443e-125, 1.1062055705591186799e-141, 1.1734404793201255869e-157, 1.0451839188820145747e-108, 4.2386081393205242443e-125, 1.1062055705591186799e-141, 1.1734404793201255869e-157, 1.9359195088038447797e-109, -4.8867691298577234423e-126, -2.0587960670007819622e-142, -2.8326669474241479263e-158, 1.9359195088038447797e-109, -4.8867691298577234423e-126, -2.0587960670007819622e-142, -2.8326669474241479263e-158, 1.9359195088038447797e-109, -4.8867691298577234423e-126, -2.0587960670007819622e-142, -2.8326669474241479263e-158, 8.7142954880180709975e-110, -4.8867691298577234423e-126, -2.0587960670007819622e-142, 
-2.8326669474241479263e-158, 3.3918456880078814158e-110, 6.931443500908017045e-126, 1.1062055705591186799e-141, 1.1734404793201255869e-157, 7.3062078800278780675e-111, 1.0223371855251471293e-126, 1.2214168761472102282e-142, 8.0910098773220312367e-159, 7.3062078800278780675e-111, 1.0223371855251471293e-126, 1.2214168761472102282e-142, 8.0910098773220312367e-159, 6.5314563001514349095e-112, 9.9039323746573674262e-128, -8.6629775332868972816e-145, -1.5987060076657616072e-160, 6.5314563001514349095e-112, 9.9039323746573674262e-128, -8.6629775332868972816e-145, -1.5987060076657616072e-160, 6.5314563001514349095e-112, 9.9039323746573674262e-128, -8.6629775332868972816e-145, -1.5987060076657616072e-160, 6.5314563001514349095e-112, 9.9039323746573674262e-128, -8.6629775332868972816e-145, -1.5987060076657616072e-160, 2.3732923938934761454e-112, 6.7095375687163138915e-129, 1.6963686085056791706e-144, 1.2464251916751375716e-160, 2.9421044076449630171e-113, 6.7095375687163138915e-129, 1.6963686085056791706e-144, 1.2464251916751375716e-160, 2.9421044076449630171e-113, 6.7095375687163138915e-129, 1.6963686085056791706e-144, 1.2464251916751375716e-160, 2.9421044076449630171e-113, 6.7095375687163138915e-129, 1.6963686085056791706e-144, 1.2464251916751375716e-160, 3.4325196623373878948e-114, 9.3892593260023063019e-130, 9.4702132359198537748e-146, 1.7950099192230045857e-161, 3.4325196623373878948e-114, 9.3892593260023063019e-130, 9.4702132359198537748e-146, 1.7950099192230045857e-161, 3.4325196623373878948e-114, 9.3892593260023063019e-130, 9.4702132359198537748e-146, 1.7950099192230045857e-161, 1.8395411057335783574e-115, -7.8150389500644475446e-132, -3.9681466199873824165e-148, 2.9106774506606945839e-164, 1.8395411057335783574e-115, -7.8150389500644475446e-132, -3.9681466199873824165e-148, 2.9106774506606945839e-164, 1.8395411057335783574e-115, -7.8150389500644475446e-132, -3.9681466199873824165e-148, 2.9106774506606945839e-164, 1.8395411057335783574e-115, 
-7.8150389500644475446e-132, -3.9681466199873824165e-148, 2.9106774506606945839e-164, 1.8395411057335783574e-115, -7.8150389500644475446e-132, -3.9681466199873824165e-148, 2.9106774506606945839e-164, 8.2436437080731844263e-116, 1.4726412753514008951e-131, -3.9681466199873824165e-148, 2.9106774506606945839e-164, 3.1677600334418871069e-116, 3.4556869017247794521e-132, 8.544872724906996972e-148, 1.6802919634942429241e-163, 6.2981819612623816536e-117, 6.3800543877747317218e-133, 7.2423563434801054878e-149, 1.1741471776254779927e-164, 6.2981819612623816536e-117, 6.3800543877747317218e-133, 7.2423563434801054878e-149, 1.1741471776254779927e-164, 6.2981819612623816536e-117, 6.3800543877747317218e-133, 7.2423563434801054878e-149, 1.1741471776254779927e-164, 3.1257546646178208289e-117, -6.6414926959353515111e-134, -5.7828074707888119584e-150, -1.2825052715093464343e-165, 1.5395410162955400644e-117, -6.6414926959353515111e-134, -5.7828074707888119584e-150, -1.2825052715093464343e-165, 7.4643419213439950602e-118, 1.0969016447485317626e-133, -5.7828074707888119584e-150, -1.2825052715093464343e-165, 3.4988078005382940294e-118, 2.1637618757749825688e-134, -8.9490928918944555247e-151, -1.9717385086233606481e-166, 1.5160407401354430737e-118, 2.1637618757749825688e-134, -8.9490928918944555247e-151, -1.9717385086233606481e-166, 5.2465720993401781599e-119, -3.755176715260116501e-136, 2.1571619860435648643e-152, 6.3257905089784152346e-168, 2.896544483330507019e-120, 3.1239284188885823808e-136, 2.1571619860435648643e-152, 6.3257905089784152346e-168, 2.896544483330507019e-120, 3.1239284188885823808e-136, 2.1571619860435648643e-152, 6.3257905089784152346e-168, 2.896544483330507019e-120, 3.1239284188885823808e-136, 2.1571619860435648643e-152, 6.3257905089784152346e-168, 2.896544483330507019e-120, 3.1239284188885823808e-136, 2.1571619860435648643e-152, 6.3257905089784152346e-168, 2.896544483330507019e-120, 3.1239284188885823808e-136, 2.1571619860435648643e-152, 6.3257905089784152346e-168, 
1.3475077173907800538e-120, -3.156241481857667737e-137, -7.0684085473731388916e-153, -3.3573283875161501977e-170, 5.7298933442091639924e-121, -3.156241481857667737e-137, -7.0684085473731388916e-153, -3.3573283875161501977e-170, 1.8573014293598452896e-121, 1.1431992269852681095e-137, 2.4782675885631257398e-153, -3.3573283875161501977e-170, 1.8573014293598452896e-121, 1.1431992269852681095e-137, 2.4782675885631257398e-153, -3.3573283875161501977e-170, 8.8915345064751572143e-122, 1.1431992269852681095e-137, 2.4782675885631257398e-153, -3.3573283875161501977e-170, 4.0507946129135104481e-122, 6.8339049774534147855e-139, 9.1598554579059548847e-155, -4.5159745404911825673e-172, 1.6304246661326865276e-122, 6.8339049774534147855e-139, 9.1598554579059548847e-155, -4.5159745404911825673e-172, 4.2023969274227456735e-123, 6.8339049774534147855e-139, 9.1598554579059548847e-155, -4.5159745404911825673e-172, 4.2023969274227456735e-123, 6.8339049774534147855e-139, 9.1598554579059548847e-155, -4.5159745404911825673e-172, 1.1769344939467164447e-123, 1.1602886988632691941e-140, 3.0307583960570927356e-156, 5.8345524661064369683e-172, 1.1769344939467164447e-123, 1.1602886988632691941e-140, 3.0307583960570927356e-156, 5.8345524661064369683e-172, 4.2056888557770896953e-124, 1.1602886988632691941e-140, 3.0307583960570927356e-156, 5.8345524661064369683e-172, 4.2386081393205242443e-125, 1.1062055705591186799e-141, 1.1734404793201255869e-157, 1.2381024895275844856e-174, 4.2386081393205242443e-125, 1.1062055705591186799e-141, 1.1734404793201255869e-157, 1.2381024895275844856e-174, 4.2386081393205242443e-125, 1.1062055705591186799e-141, 1.1734404793201255869e-157, 1.2381024895275844856e-174, 4.2386081393205242443e-125, 1.1062055705591186799e-141, 1.1734404793201255869e-157, 1.2381024895275844856e-174, 1.8749656131673758844e-125, 1.1062055705591186799e-141, 1.1734404793201255869e-157, 1.2381024895275844856e-174, 6.931443500908017045e-126, 1.1062055705591186799e-141, 1.1734404793201255869e-157, 
1.2381024895275844856e-174, 1.0223371855251471293e-126, 1.2214168761472102282e-142, 8.0910098773220302259e-159, 1.2381024895275844856e-174, 1.0223371855251471293e-126, 1.2214168761472102282e-142, 8.0910098773220302259e-159, 1.2381024895275844856e-174, 1.0223371855251471293e-126, 1.2214168761472102282e-142, 8.0910098773220302259e-159, 1.2381024895275844856e-174, 2.8369889610228834887e-127, 4.0136364036021218058e-143, -1.0134099605688458828e-159, -2.5389576707476506925e-176, 2.8369889610228834887e-127, 4.0136364036021218058e-143, -1.0134099605688458828e-159, -2.5389576707476506925e-176, 9.9039323746573674262e-128, -8.6629775332868972816e-145, -1.5987060076657612913e-160, -2.5389576707476506925e-176, 6.7095375687163138915e-129, 1.6963686085056791706e-144, 1.2464251916751375716e-160, 6.197724948400014906e-177, 6.7095375687163138915e-129, 1.6963686085056791706e-144, 1.2464251916751375716e-160, 6.197724948400014906e-177, 6.7095375687163138915e-129, 1.6963686085056791706e-144, 1.2464251916751375716e-160, 6.197724948400014906e-177, 6.7095375687163138915e-129, 1.6963686085056791706e-144, 1.2464251916751375716e-160, 6.197724948400014906e-177, 9.3892593260023063019e-130, 9.4702132359198537748e-146, 1.7950099192230045857e-161, -1.6991004655691155518e-177, 9.3892593260023063019e-130, 9.4702132359198537748e-146, 1.7950099192230045857e-161, -1.6991004655691155518e-177, 9.3892593260023063019e-130, 9.4702132359198537748e-146, 1.7950099192230045857e-161, -1.6991004655691155518e-177, 2.175994780857201024e-130, 1.4618808551874518553e-146, 1.6802919634942426156e-163, 2.8330093736631818036e-179, 2.175994780857201024e-130, 1.4618808551874518553e-146, 1.6802919634942426156e-163, 2.8330093736631818036e-179, 3.7267864457092460442e-131, 4.6083930759590139305e-147, 1.6802919634942426156e-163, 2.8330093736631818036e-179, 3.7267864457092460442e-131, 4.6083930759590139305e-147, 1.6802919634942426156e-163, 2.8330093736631818036e-179, 3.7267864457092460442e-131, 4.6083930759590139305e-147, 
1.6802919634942426156e-163, 2.8330093736631818036e-179, 1.4726412753514008951e-131, -3.9681466199873824165e-148, 2.9106774506606941983e-164, 5.1948630316441296498e-180, 3.4556869017247794521e-132, 8.544872724906996972e-148, 1.6802919634942426156e-163, 2.8330093736631818036e-179, 3.4556869017247794521e-132, 8.544872724906996972e-148, 1.6802919634942426156e-163, 2.8330093736631818036e-179, 6.3800543877747317218e-133, 7.2423563434801054878e-149, 1.1741471776254777999e-164, 1.3389912474795152755e-180, 6.3800543877747317218e-133, 7.2423563434801054878e-149, 1.1741471776254777999e-164, 1.3389912474795152755e-180, 6.3800543877747317218e-133, 7.2423563434801054878e-149, 1.1741471776254777999e-164, 1.3389912474795152755e-180, 2.8579525590905986764e-133, -5.7828074707888119584e-150, -1.2825052715093464343e-165, -1.0696067158221530218e-181, 1.0969016447485317626e-133, -5.7828074707888119584e-150, -1.2825052715093464343e-165, -1.0696067158221530218e-181, 2.1637618757749825688e-134, -8.9490928918944555247e-151, -1.9717385086233606481e-166, 1.3535321672928907047e-182, 2.1637618757749825688e-134, -8.9490928918944555247e-151, -1.9717385086233606481e-166, 1.3535321672928907047e-182, 2.1637618757749825688e-134, -8.9490928918944555247e-151, -1.9717385086233606481e-166, 1.3535321672928907047e-182, 1.0631050543111905033e-134, 1.5490398016102376505e-150, 3.4549185946116918017e-166, 1.3535321672928907047e-182, 5.1277664357929471499e-135, 3.2706525621039604902e-151, 7.4159004299416557678e-167, 1.3535321672928907047e-182, 2.3761243821334675971e-135, 3.2706525621039604902e-151, 7.4159004299416557678e-167, 1.3535321672928907047e-182, 1.0003033553037281263e-135, 2.1571619860435648643e-152, 6.3257905089784152346e-168, 3.5607241064750984115e-184, 3.1239284188885823808e-136, 2.1571619860435648643e-152, 6.3257905089784152346e-168, 3.5607241064750984115e-184, 3.1239284188885823808e-136, 2.1571619860435648643e-152, 6.3257905089784152346e-168, 3.5607241064750984115e-184, 1.4041521353514076604e-136, 
2.1571619860435648643e-152, 6.3257905089784152346e-168, 3.5607241064750984115e-184, 5.4426399358282049106e-137, 2.4782675885631257398e-153, -3.3573283875161501977e-170, 3.0568054078295488291e-186, 1.1431992269852681095e-137, 2.4782675885631257398e-153, -3.3573283875161501977e-170, 3.0568054078295488291e-186, 1.1431992269852681095e-137, 2.4782675885631257398e-153, -3.3573283875161501977e-170, 3.0568054078295488291e-186, 6.8339049774534147855e-139, 9.1598554579059548847e-155, -4.5159745404911819927e-172, -4.5870810097328578981e-188, 6.8339049774534147855e-139, 9.1598554579059548847e-155, -4.5159745404911819927e-172, -4.5870810097328578981e-188, 6.8339049774534147855e-139, 9.1598554579059548847e-155, -4.5159745404911819927e-172, -4.5870810097328578981e-188, 6.8339049774534147855e-139, 9.1598554579059548847e-155, -4.5159745404911819927e-172, -4.5870810097328578981e-188, 1.1602886988632691941e-140, 3.0307583960570927356e-156, 5.8345524661064358191e-172, 6.9043123899963188689e-188, 1.1602886988632691941e-140, 3.0307583960570927356e-156, 5.8345524661064358191e-172, 6.9043123899963188689e-188, 1.1602886988632691941e-140, 3.0307583960570927356e-156, 5.8345524661064358191e-172, 6.9043123899963188689e-188, 1.1602886988632691941e-140, 3.0307583960570927356e-156, 5.8345524661064358191e-172, 6.9043123899963188689e-188, 1.1602886988632691941e-140, 3.0307583960570927356e-156, 5.8345524661064358191e-172, 6.9043123899963188689e-188, 1.1602886988632691941e-140, 3.0307583960570927356e-156, 5.8345524661064358191e-172, 6.9043123899963188689e-188, 1.1062055705591186799e-141, 1.1734404793201255869e-157, 1.2381024895275844856e-174, -8.4789520282639751913e-191, 1.1062055705591186799e-141, 1.1734404793201255869e-157, 1.2381024895275844856e-174, -8.4789520282639751913e-191, 1.1062055705591186799e-141, 1.1734404793201255869e-157, 1.2381024895275844856e-174, -8.4789520282639751913e-191, 1.1062055705591186799e-141, 1.1734404793201255869e-157, 1.2381024895275844856e-174, 
-8.4789520282639751913e-191, 4.5016298192952031469e-142, -2.8326669474241479263e-158, 1.2381024895275844856e-174, -8.4789520282639751913e-191, 1.2214168761472102282e-142, 8.0910098773220302259e-159, 1.2381024895275844856e-174, -8.4789520282639751913e-191, 1.2214168761472102282e-142, 8.0910098773220302259e-159, 1.2381024895275844856e-174, -8.4789520282639751913e-191, 4.0136364036021218058e-143, -1.0134099605688458828e-159, -2.5389576707476506925e-176, -6.2404128071707654958e-193, 4.0136364036021218058e-143, -1.0134099605688458828e-159, -2.5389576707476506925e-176, -6.2404128071707654958e-193, 1.9635033141346264592e-143, -1.0134099605688458828e-159, -2.5389576707476506925e-176, -6.2404128071707654958e-193, 9.3843676940087855824e-144, 1.2626949989038732076e-159, 2.2730883653953564668e-175, 2.7431118386590483722e-191, 4.2590349703400483539e-144, 1.2464251916751375716e-160, 6.1977249484000140293e-177, 1.1294061984896458822e-192, 1.6963686085056791706e-144, 1.2464251916751375716e-160, 6.1977249484000140293e-177, 1.1294061984896458822e-192, 4.1503542758849472122e-145, -1.7614040799531193879e-161, -1.6991004655691153326e-177, -1.856794109153959173e-193, 4.1503542758849472122e-145, -1.7614040799531193879e-161, -1.6991004655691153326e-177, -1.856794109153959173e-193, 9.4702132359198537748e-146, 1.7950099192230045857e-161, -1.6991004655691153326e-177, -1.856794109153959173e-193, 9.4702132359198537748e-146, 1.7950099192230045857e-161, -1.6991004655691153326e-177, -1.856794109153959173e-193, 1.4618808551874518553e-146, 1.6802919634942426156e-163, 2.8330093736631818036e-179, -7.4549709281190454638e-196, 1.4618808551874518553e-146, 1.6802919634942426156e-163, 2.8330093736631818036e-179, -7.4549709281190454638e-196, 1.4618808551874518553e-146, 1.6802919634942426156e-163, 2.8330093736631818036e-179, -7.4549709281190454638e-196, 4.6083930759590139305e-147, 1.6802919634942426156e-163, 2.8330093736631818036e-179, -7.4549709281190454638e-196, 4.6083930759590139305e-147, 
1.6802919634942426156e-163, 2.8330093736631818036e-179, -7.4549709281190454638e-196, 2.105789206980137775e-147, 1.6802919634942426156e-163, 2.8330093736631818036e-179, -7.4549709281190454638e-196, 8.544872724906996972e-148, 1.6802919634942426156e-163, 2.8330093736631818036e-179, -7.4549709281190454638e-196, 2.2883630524598079723e-148, 2.9106774506606941983e-164, 5.1948630316441287936e-180, 9.6685396110091032843e-196, 2.2883630524598079723e-148, 2.9106774506606941983e-164, 5.1948630316441287936e-180, 9.6685396110091032843e-196, 7.2423563434801054878e-149, 1.1741471776254777999e-164, 1.3389912474795150614e-180, 1.1067843414450286726e-196, 7.2423563434801054878e-149, 1.1741471776254777999e-164, 1.3389912474795150614e-180, 1.1067843414450286726e-196, 3.3320377982006123631e-149, 3.0588204110786950436e-165, 3.7502330143836152136e-181, 3.6564932749519464998e-198, 1.3768785255608653665e-149, 3.0588204110786950436e-165, 3.7502330143836152136e-181, 3.6564932749519464998e-198, 3.9929888924099219388e-150, -1.9717385086233606481e-166, 1.3535321672928907047e-182, 3.1205762277848031878e-199, 3.9929888924099219388e-150, -1.9717385086233606481e-166, 1.3535321672928907047e-182, 3.1205762277848031878e-199, 1.5490398016102376505e-150, 3.4549185946116918017e-166, 1.3535321672928907047e-182, 3.1205762277848031878e-199, 3.2706525621039604902e-151, 7.4159004299416557678e-167, 1.3535321672928907047e-182, 3.1205762277848031878e-199, 3.2706525621039604902e-151, 7.4159004299416557678e-167, 1.3535321672928907047e-182, 3.1205762277848031878e-199, 2.1571619860435648643e-152, 6.3257905089784152346e-168, 3.5607241064750984115e-184, -1.4832196127821708615e-201, 2.1571619860435648643e-152, 6.3257905089784152346e-168, 3.5607241064750984115e-184, -1.4832196127821708615e-201, 2.1571619860435648643e-152, 6.3257905089784152346e-168, 3.5607241064750984115e-184, -1.4832196127821708615e-201, 2.1571619860435648643e-152, 6.3257905089784152346e-168, 3.5607241064750984115e-184, -1.4832196127821708615e-201, 
2.4782675885631257398e-153, -3.3573283875161501977e-170, 3.0568054078295488291e-186, 1.4980560800565462618e-202, 2.4782675885631257398e-153, -3.3573283875161501977e-170, 3.0568054078295488291e-186, 1.4980560800565462618e-202, 2.4782675885631257398e-153, -3.3573283875161501977e-170, 3.0568054078295488291e-186, 1.4980560800565462618e-202, 9.1598554579059548847e-155, -4.5159745404911819927e-172, -4.5870810097328572602e-188, -3.2905064432040069127e-204, 9.1598554579059548847e-155, -4.5159745404911819927e-172, -4.5870810097328572602e-188, -3.2905064432040069127e-204, 9.1598554579059548847e-155, -4.5159745404911819927e-172, -4.5870810097328572602e-188, -3.2905064432040069127e-204, 9.1598554579059548847e-155, -4.5159745404911819927e-172, -4.5870810097328572602e-188, -3.2905064432040069127e-204, 9.1598554579059548847e-155, -4.5159745404911819927e-172, -4.5870810097328572602e-188, -3.2905064432040069127e-204, 1.7015147267057481414e-155, -4.5159745404911819927e-172, -4.5870810097328572602e-188, -3.2905064432040069127e-204, 1.7015147267057481414e-155, -4.5159745404911819927e-172, -4.5870810097328572602e-188, -3.2905064432040069127e-204, 1.7015147267057481414e-155, -4.5159745404911819927e-172, -4.5870810097328572602e-188, -3.2905064432040069127e-204, 7.6922213530572229852e-156, -4.5159745404911819927e-172, -4.5870810097328572602e-188, -3.2905064432040069127e-204, 3.0307583960570927356e-156, 5.8345524661064358191e-172, 6.9043123899963188689e-188, -3.2905064432040069127e-204, 7.0002691755702864582e-157, 6.5928896280762691321e-173, 1.1586156901317304854e-188, -1.0100405885278530137e-205, 7.0002691755702864582e-157, 6.5928896280762691321e-173, 1.1586156901317304854e-188, -1.0100405885278530137e-205, 1.1734404793201255869e-157, 1.2381024895275844856e-174, -8.4789520282639751913e-191, -1.3321093418096261919e-207, 1.1734404793201255869e-157, 1.2381024895275844856e-174, -8.4789520282639751913e-191, -1.3321093418096261919e-207, 1.1734404793201255869e-157, 1.2381024895275844856e-174, 
-8.4789520282639751913e-191, -1.3321093418096261919e-207, 4.4508689228885539715e-158, 1.2381024895275844856e-174, -8.4789520282639751913e-191, -1.3321093418096261919e-207, 8.0910098773220302259e-159, 1.2381024895275844856e-174, -8.4789520282639751913e-191, -1.3321093418096261919e-207, 8.0910098773220302259e-159, 1.2381024895275844856e-174, -8.4789520282639751913e-191, -1.3321093418096261919e-207, 8.0910098773220302259e-159, 1.2381024895275844856e-174, -8.4789520282639751913e-191, -1.3321093418096261919e-207, 3.5387999583765925506e-159, 2.2730883653953564668e-175, 2.7431118386590483722e-191, -1.3321093418096261919e-207, 1.2626949989038732076e-159, 2.2730883653953564668e-175, 2.7431118386590483722e-191, -1.3321093418096261919e-207, 1.2464251916751375716e-160, 6.1977249484000140293e-177, 1.1294061984896456875e-192, 2.2526486929936882202e-208, 1.2464251916751375716e-160, 6.1977249484000140293e-177, 1.1294061984896456875e-192, 2.2526486929936882202e-208, 1.2464251916751375716e-160, 6.1977249484000140293e-177, 1.1294061984896456875e-192, 2.2526486929936882202e-208, 1.2464251916751375716e-160, 6.1977249484000140293e-177, 1.1294061984896456875e-192, 2.2526486929936882202e-208, 5.3514239183991277695e-161, 6.1977249484000140293e-177, 1.1294061984896456875e-192, 2.2526486929936882202e-208, 1.7950099192230045857e-161, -1.6991004655691153326e-177, -1.8567941091539589297e-193, -1.8074851186411640793e-209, 1.6802919634942426156e-163, 2.8330093736631818036e-179, -7.4549709281190454638e-196, -1.4481306607622412036e-212, 1.6802919634942426156e-163, 2.8330093736631818036e-179, -7.4549709281190454638e-196, -1.4481306607622412036e-212, 1.6802919634942426156e-163, 2.8330093736631818036e-179, -7.4549709281190454638e-196, -1.4481306607622412036e-212, 1.6802919634942426156e-163, 2.8330093736631818036e-179, -7.4549709281190454638e-196, -1.4481306607622412036e-212, 1.6802919634942426156e-163, 2.8330093736631818036e-179, -7.4549709281190454638e-196, -1.4481306607622412036e-212, 
1.6802919634942426156e-163, 2.8330093736631818036e-179, -7.4549709281190454638e-196, -1.4481306607622412036e-212, 1.6802919634942426156e-163, 2.8330093736631818036e-179, -7.4549709281190454638e-196, -1.4481306607622412036e-212, 2.9106774506606941983e-164, 5.1948630316441287936e-180, 9.6685396110091013832e-196, 1.7562785002189357559e-211, 2.9106774506606941983e-164, 5.1948630316441287936e-180, 9.6685396110091013832e-196, 1.7562785002189357559e-211, 2.9106774506606941983e-164, 5.1948630316441287936e-180, 9.6685396110091013832e-196, 1.7562785002189357559e-211, 1.1741471776254777999e-164, 1.3389912474795150614e-180, 1.106784341445028435e-196, 3.3045982549756583552e-212, 3.0588204110786950436e-165, 3.7502330143836152136e-181, 3.6564932749519464998e-198, 3.7097125405852507464e-214, 3.0588204110786950436e-165, 3.7502330143836152136e-181, 3.6564932749519464998e-198, 3.7097125405852507464e-214, 8.8815756978467430465e-166, 1.3403131492807310959e-181, 3.6564932749519464998e-198, 3.7097125405852507464e-214, 8.8815756978467430465e-166, 1.3403131492807310959e-181, 3.6564932749519464998e-198, 3.7097125405852507464e-214, 3.4549185946116918017e-166, 1.3535321672928907047e-182, 3.1205762277848031878e-199, -3.3569248349832580936e-217, 7.4159004299416557678e-167, 1.3535321672928907047e-182, 3.1205762277848031878e-199, -3.3569248349832580936e-217, 7.4159004299416557678e-167, 1.3535321672928907047e-182, 3.1205762277848031878e-199, -3.3569248349832580936e-217, 6.3257905089784152346e-168, 3.5607241064750984115e-184, -1.4832196127821708615e-201, 2.6911956484118910092e-218, 6.3257905089784152346e-168, 3.5607241064750984115e-184, -1.4832196127821708615e-201, 2.6911956484118910092e-218, 6.3257905089784152346e-168, 3.5607241064750984115e-184, -1.4832196127821708615e-201, 2.6911956484118910092e-218, 6.3257905089784152346e-168, 3.5607241064750984115e-184, -1.4832196127821708615e-201, 2.6911956484118910092e-218, 2.0862146470760309789e-168, -1.146150630053972131e-184, -1.4832196127821708615e-201, 
2.6911956484118910092e-218, 2.0862146470760309789e-168, -1.146150630053972131e-184, -1.4832196127821708615e-201, 2.6911956484118910092e-218, 1.026320681600434562e-168, 1.2072867382105631402e-184, -1.4832196127821708615e-201, 2.6911956484118910092e-218, 4.9637369886263658882e-169, 3.0568054078295488291e-186, 1.4980560800565460352e-202, 2.6911956484118910092e-218, 2.3140020749373754342e-169, 3.0568054078295488291e-186, 1.4980560800565460352e-202, 2.6911956484118910092e-218, 9.8913461809288020723e-170, 3.0568054078295488291e-186, 1.4980560800565460352e-202, 2.6911956484118910092e-218, 3.2670088967063259373e-170, 3.0568054078295488291e-186, 1.4980560800565460352e-202, 2.6911956484118910092e-218, 3.2670088967063259373e-170, 3.0568054078295488291e-186, 1.4980560800565460352e-202, 2.6911956484118910092e-218, 1.6109245756507072713e-170, -6.2044048008378732802e-187, -5.4322544592823556944e-203, 4.2491789852161138683e-219, 7.8288241512289757055e-171, 1.2181824638728806485e-186, 1.4980560800565460352e-202, 2.6911956484118910092e-218, 3.6886133485899290404e-171, 2.9887099189454666024e-187, 4.774153170641553462e-203, 4.2491789852161138683e-219, 1.6185079472704052482e-171, 2.9887099189454666024e-187, 4.774153170641553462e-203, 4.2491789852161138683e-219, 5.8345524661064358191e-172, 6.9043123899963188689e-188, -3.2905064432040069127e-204, -9.1795828160190082842e-224, 6.5928896280762691321e-173, 1.1586156901317304854e-188, -1.0100405885278530137e-205, -9.1795828160190082842e-224, 6.5928896280762691321e-173, 1.1586156901317304854e-188, -1.0100405885278530137e-205, -9.1795828160190082842e-224, 6.5928896280762691321e-173, 1.1586156901317304854e-188, -1.0100405885278530137e-205, -9.1795828160190082842e-224, 1.2381024895275844856e-174, -8.4789520282639751913e-191, -1.332109341809626019e-207, -9.1795828160190082842e-224, 1.2381024895275844856e-174, -8.4789520282639751913e-191, -1.332109341809626019e-207, -9.1795828160190082842e-224, 1.2381024895275844856e-174, 
-8.4789520282639751913e-191, -1.332109341809626019e-207, -9.1795828160190082842e-224, 1.2381024895275844856e-174, -8.4789520282639751913e-191, -1.332109341809626019e-207, -9.1795828160190082842e-224, 1.2381024895275844856e-174, -8.4789520282639751913e-191, -1.332109341809626019e-207, -9.1795828160190082842e-224, 1.2381024895275844856e-174, -8.4789520282639751913e-191, -1.332109341809626019e-207, -9.1795828160190082842e-224, 2.2730883653953564668e-175, 2.7431118386590483722e-191, -1.332109341809626019e-207, -9.1795828160190082842e-224, 2.2730883653953564668e-175, 2.7431118386590483722e-191, -1.332109341809626019e-207, -9.1795828160190082842e-224, 2.2730883653953564668e-175, 2.7431118386590483722e-191, -1.332109341809626019e-207, -9.1795828160190082842e-224, 1.0095962991602958391e-175, -6.2404128071707654958e-193, 3.0593092910744445285e-209, 5.4622616159087170031e-225, 3.7785026604276538491e-176, -6.2404128071707654958e-193, 3.0593092910744445285e-209, 5.4622616159087170031e-225, 6.1977249484000140293e-177, 1.1294061984896456875e-192, 2.2526486929936882202e-208, -5.3441928036578162463e-225, 6.1977249484000140293e-177, 1.1294061984896456875e-192, 2.2526486929936882202e-208, -5.3441928036578162463e-225, 6.1977249484000140293e-177, 1.1294061984896456875e-192, 2.2526486929936882202e-208, -5.3441928036578162463e-225, 2.2493122414154495675e-177, 2.5268245888628466632e-193, 3.0593092910744445285e-209, 5.4622616159087170031e-225, 2.7510588792316711745e-178, 3.3501523985444386676e-194, 6.2591208621664049475e-210, 5.9034406125450500218e-227, 2.7510588792316711745e-178, 3.3501523985444386676e-194, 6.2591208621664049475e-210, 5.9034406125450500218e-227, 2.7510588792316711745e-178, 3.3501523985444386676e-194, 6.2591208621664049475e-210, 5.9034406125450500218e-227, 2.8330093736631818036e-179, -7.4549709281190454638e-196, -1.4481306607622412036e-212, 9.9192633285681635836e-229, 2.8330093736631818036e-179, -7.4549709281190454638e-196, -1.4481306607622412036e-212, 
9.9192633285681635836e-229, 2.8330093736631818036e-179, -7.4549709281190454638e-196, -1.4481306607622412036e-212, 9.9192633285681635836e-229, 2.8330093736631818036e-179, -7.4549709281190454638e-196, -1.4481306607622412036e-212, 9.9192633285681635836e-229, 1.2906606599973359683e-179, -7.4549709281190454638e-196, -1.4481306607622412036e-212, 9.9192633285681635836e-229, 5.1948630316441287936e-180, 9.6685396110091013832e-196, 1.7562785002189355449e-211, 1.6821693549018732055e-227, 1.3389912474795150614e-180, 1.106784341445028435e-196, 3.3045982549756578275e-212, 6.2685154049107876715e-228, 1.3389912474795150614e-180, 1.106784341445028435e-196, 3.3045982549756578275e-212, 6.2685154049107876715e-228, 3.7502330143836152136e-181, 3.6564932749519464998e-198, 3.7097125405852507464e-214, 2.5658818466966882188e-231, 3.7502330143836152136e-181, 3.6564932749519464998e-198, 3.7097125405852507464e-214, 2.5658818466966882188e-231, 1.3403131492807310959e-181, 3.6564932749519464998e-198, 3.7097125405852507464e-214, 2.5658818466966882188e-231, 1.3535321672928907047e-182, 3.1205762277848031878e-199, -3.3569248349832580936e-217, -1.0577661142165146927e-233, 1.3535321672928907047e-182, 3.1205762277848031878e-199, -3.3569248349832580936e-217, -1.0577661142165146927e-233, 1.3535321672928907047e-182, 3.1205762277848031878e-199, -3.3569248349832580936e-217, -1.0577661142165146927e-233, 1.3535321672928907047e-182, 3.1205762277848031878e-199, -3.3569248349832580936e-217, -1.0577661142165146927e-233, 6.0043220944823941786e-183, 3.1205762277848031878e-199, -3.3569248349832580936e-217, -1.0577661142165146927e-233, 2.2388223052591377446e-183, 3.1205762277848031878e-199, -3.3569248349832580936e-217, -1.0577661142165146927e-233, 3.5607241064750984115e-184, -1.4832196127821708615e-201, 2.6911956484118910092e-218, -5.1336618966962585332e-235, 3.5607241064750984115e-184, -1.4832196127821708615e-201, 2.6911956484118910092e-218, -5.1336618966962585332e-235, 3.5607241064750984115e-184, 
-1.4832196127821708615e-201, 2.6911956484118910092e-218, -5.1336618966962585332e-235, 1.2072867382105631402e-184, -1.4832196127821708615e-201, 2.6911956484118910092e-218, -5.1336618966962585332e-235, 3.0568054078295488291e-186, 1.4980560800565460352e-202, 2.6911956484118910092e-218, -5.1336618966962585332e-235, 3.0568054078295488291e-186, 1.4980560800565460352e-202, 2.6911956484118910092e-218, -5.1336618966962585332e-235, 3.0568054078295488291e-186, 1.4980560800565460352e-202, 2.6911956484118910092e-218, -5.1336618966962585332e-235, 3.0568054078295488291e-186, 1.4980560800565460352e-202, 2.6911956484118910092e-218, -5.1336618966962585332e-235, 3.0568054078295488291e-186, 1.4980560800565460352e-202, 2.6911956484118910092e-218, -5.1336618966962585332e-235, 3.0568054078295488291e-186, 1.4980560800565460352e-202, 2.6911956484118910092e-218, -5.1336618966962585332e-235, 1.2181824638728806485e-186, 1.4980560800565460352e-202, 2.6911956484118910092e-218, -5.1336618966962585332e-235, 2.9887099189454666024e-187, 4.774153170641553462e-203, 4.2491789852161132393e-219, 7.4467067939231424594e-235, 2.9887099189454666024e-187, 4.774153170641553462e-203, 4.2491789852161132393e-219, 7.4467067939231424594e-235, 6.9043123899963188689e-188, -3.2905064432040069127e-204, -9.1795828160190063645e-224, -2.3569545504732004486e-239, 6.9043123899963188689e-188, -3.2905064432040069127e-204, -9.1795828160190063645e-224, -2.3569545504732004486e-239, 1.1586156901317304854e-188, -1.0100405885278530137e-205, -9.1795828160190063645e-224, -2.3569545504732004486e-239, 1.1586156901317304854e-188, -1.0100405885278530137e-205, -9.1795828160190063645e-224, -2.3569545504732004486e-239, 1.1586156901317304854e-188, -1.0100405885278530137e-205, -9.1795828160190063645e-224, -2.3569545504732004486e-239, 4.4040360264865697732e-189, -1.0100405885278530137e-205, -9.1795828160190063645e-224, -2.3569545504732004486e-239, 8.129755890712020335e-190, 9.8339840169166049336e-206, -9.1795828160190063645e-224, 
-2.3569545504732004486e-239, 8.129755890712020335e-190, 9.8339840169166049336e-206, -9.1795828160190063645e-224, -2.3569545504732004486e-239, 8.129755890712020335e-190, 9.8339840169166049336e-206, -9.1795828160190063645e-224, -2.3569545504732004486e-239, 3.6409303439428119063e-190, -1.332109341809626019e-207, -9.1795828160190063645e-224, -2.3569545504732004486e-239, 1.3965175705582071936e-190, -1.332109341809626019e-207, -9.1795828160190063645e-224, -2.3569545504732004486e-239, 2.7431118386590483722e-191, -1.332109341809626019e-207, -9.1795828160190063645e-224, -2.3569545504732004486e-239, 2.7431118386590483722e-191, -1.332109341809626019e-207, -9.1795828160190063645e-224, -2.3569545504732004486e-239, 2.7431118386590483722e-191, -1.332109341809626019e-207, -9.1795828160190063645e-224, -2.3569545504732004486e-239, 1.3403538552936701153e-191, 1.7826390804083638359e-207, -9.1795828160190063645e-224, -2.3569545504732004486e-239, 6.389748636109812983e-192, 2.2526486929936882202e-208, -5.3441928036578156465e-225, -7.741539335184153052e-241, 2.8828536776963681193e-192, 2.2526486929936882202e-208, -5.3441928036578156465e-225, -7.741539335184153052e-241, 1.1294061984896456875e-192, 2.2526486929936882202e-208, -5.3441928036578156465e-225, -7.741539335184153052e-241, 2.5268245888628466632e-193, 3.0593092910744445285e-209, 5.4622616159087170031e-225, 4.2560351759808952526e-241, 2.5268245888628466632e-193, 3.0593092910744445285e-209, 5.4622616159087170031e-225, 4.2560351759808952526e-241, 3.3501523985444386676e-194, 6.2591208621664049475e-210, 5.9034406125450490845e-227, 1.3186893776791012681e-242, 3.3501523985444386676e-194, 6.2591208621664049475e-210, 5.9034406125450490845e-227, 1.3186893776791012681e-242, 3.3501523985444386676e-194, 6.2591208621664049475e-210, 5.9034406125450490845e-227, 1.3186893776791012681e-242, 6.1039071228393547627e-195, 1.7562785002189355449e-211, 1.6821693549018732055e-227, -8.7276385348052817035e-244, 6.1039071228393547627e-195, 
1.7562785002189355449e-211, 1.6821693549018732055e-227, -8.7276385348052817035e-244, 6.1039071228393547627e-195, 1.7562785002189355449e-211, 1.6821693549018732055e-227, -8.7276385348052817035e-244, 2.6792050150137250131e-195, 1.7562785002189355449e-211, 1.6821693549018732055e-227, -8.7276385348052817035e-244, 9.6685396110091013832e-196, 1.7562785002189355449e-211, 1.6821693549018732055e-227, -8.7276385348052817035e-244, 2.0416567491425607157e-177, 6.0959078275963141821e-193, 1.156336993964950812e-208, 2.7126166236326293347e-224, 2.0416567491425607157e-177, 6.0959078275963141821e-193, 1.156336993964950812e-208, 2.7126166236326293347e-224, 2.0416567491425607157e-177, 6.0959078275963141821e-193, 1.156336993964950812e-208, 2.7126166236326293347e-224, 6.7450395650278649168e-179, 6.8432117823206978686e-195, 4.7332165749391048364e-212, 4.4984059688774601837e-228, 6.7450395650278649168e-179, 6.8432117823206978686e-195, 4.7332165749391048364e-212, 4.4984059688774601837e-228, 6.7450395650278649168e-179, 6.8432117823206978686e-195, 4.7332165749391048364e-212, 4.4984059688774601837e-228, 6.7450395650278649168e-179, 6.8432117823206978686e-195, 4.7332165749391048364e-212, 4.4984059688774601837e-228, 6.7450395650278649168e-179, 6.8432117823206978686e-195, 4.7332165749391048364e-212, 4.4984059688774601837e-228, 5.756447103644822603e-180, -6.1924333305615830735e-198, -1.9512340798794268979e-214, -3.6162764918921697356e-230, 5.756447103644822603e-180, -6.1924333305615830735e-198, -1.9512340798794268979e-214, -3.6162764918921697356e-230, 5.756447103644822603e-180, -6.1924333305615830735e-198, -1.9512340798794268979e-214, -3.6162764918921697356e-230, 5.756447103644822603e-180, -6.1924333305615830735e-198, -1.9512340798794268979e-214, -3.6162764918921697356e-230, 1.9005753194802080146e-180, -6.1924333305615830735e-198, -1.9512340798794268979e-214, -3.6162764918921697356e-230, 1.9005753194802080146e-180, -6.1924333305615830735e-198, -1.9512340798794268979e-214, 
-3.6162764918921697356e-230, 9.3660737343905436753e-181, -6.1924333305615830735e-198, -1.9512340798794268979e-214, -3.6162764918921697356e-230, 4.5462340041847754398e-181, -6.1924333305615830735e-198, -1.9512340798794268979e-214, -3.6162764918921697356e-230, 2.1363141390818913221e-181, -6.1924333305615830735e-198, -1.9512340798794268979e-214, -3.6162764918921697356e-230, 9.3135420653044926323e-182, -6.1924333305615830735e-198, -1.9512340798794268979e-214, -3.6162764918921697356e-230, 3.2887424025472810002e-182, 7.185309278132283136e-198, -1.9512340798794268979e-214, -3.6162764918921697356e-230, 2.7634257116867652192e-183, 4.9643797378534984559e-199, -9.4699347169310243473e-216, -9.2331809177749095611e-233, 2.7634257116867652192e-183, 4.9643797378534984559e-199, -9.4699347169310243473e-216, -9.2331809177749095611e-233, 2.7634257116867652192e-183, 4.9643797378534984559e-199, -9.4699347169310243473e-216, -9.2331809177749095611e-233, 2.7634257116867652192e-183, 4.9643797378534984559e-199, -9.4699347169310243473e-216, -9.2331809177749095611e-233, 8.806758170751374203e-184, 7.8383517263666503337e-200, 1.3736749441945438342e-215, -9.2331809177749095611e-233, 8.806758170751374203e-184, 7.8383517263666503337e-200, 1.3736749441945438342e-215, -9.2331809177749095611e-233, 4.0998834342223036605e-184, 7.8383517263666503337e-200, 1.3736749441945438342e-215, -9.2331809177749095611e-233, 1.7464460659577689118e-184, 2.612671019845610006e-200, 2.1334073625072069974e-216, -9.2331809177749095611e-233, 5.697273818255015375e-185, -1.6933341491052464293e-204, -4.3478137385944270631e-220, -2.3353910329236990725e-236, 5.697273818255015375e-185, -1.6933341491052464293e-204, -4.3478137385944270631e-220, -2.3353910329236990725e-236, 2.755477107924346286e-185, -1.6933341491052464293e-204, -4.3478137385944270631e-220, -2.3353910329236990725e-236, 1.2845787527590117414e-185, -1.6933341491052464293e-204, -4.3478137385944270631e-220, -2.3353910329236990725e-236, 5.4912957517634446918e-186, 
-1.6933341491052464293e-204, -4.3478137385944270631e-220, -2.3353910329236990725e-236, 1.8140498638501083305e-186, -1.6933341491052464293e-204, -4.3478137385944270631e-220, -2.3353910329236990725e-236, 1.8140498638501083305e-186, -1.6933341491052464293e-204, -4.3478137385944270631e-220, -2.3353910329236990725e-236, 8.9473839187177424013e-187, -1.6933341491052464293e-204, -4.3478137385944270631e-220, -2.3353910329236990725e-236, 4.3508265588260719497e-187, -1.6933341491052464293e-204, -4.3478137385944270631e-220, -2.3353910329236990725e-236, 2.0525478788802367239e-187, -1.6933341491052464293e-204, -4.3478137385944270631e-220, -2.3353910329236990725e-236, 9.0340853890731911095e-188, -1.6933341491052464293e-204, -4.3478137385944270631e-220, -2.3353910329236990725e-236, 3.288388689208603045e-188, -1.6933341491052464293e-204, -4.3478137385944270631e-220, -2.3353910329236990725e-236, 4.1554033927630885323e-189, -9.8582956929636044137e-206, -1.4280619485269765742e-221, 1.2171222696290252021e-237, 4.1554033927630885323e-189, -9.8582956929636044137e-206, -1.4280619485269765742e-221, 1.2171222696290252021e-237, 4.1554033927630885323e-189, -9.8582956929636044137e-206, -1.4280619485269765742e-221, 1.2171222696290252021e-237, 5.643429553477207926e-190, 1.0076094209231528444e-205, 7.8509991660024955813e-222, 1.2171222696290252021e-237, 5.643429553477207926e-190, 1.0076094209231528444e-205, 7.8509991660024955813e-222, 1.2171222696290252021e-237, 5.643429553477207926e-190, 1.0076094209231528444e-205, 7.8509991660024955813e-222, 1.2171222696290252021e-237, 1.1546040067079994973e-190, 1.0889925813396166947e-207, 2.4325525462765697993e-223, -1.1429360314275701698e-239, 1.1546040067079994973e-190, 1.0889925813396166947e-207, 2.4325525462765697993e-223, -1.1429360314275701698e-239, 3.2397620015697148712e-192, 3.1030547578511949035e-208, -1.609965144193984205e-224, -1.8313007053436627876e-240, 3.2397620015697148712e-192, 3.1030547578511949035e-208, -1.609965144193984205e-224, 
-1.8313007053436627876e-240, 3.2397620015697148712e-192, 3.1030547578511949035e-208, -1.609965144193984205e-224, -1.8313007053436627876e-240, 3.2397620015697148712e-192, 3.1030547578511949035e-208, -1.609965144193984205e-224, -1.8313007053436627876e-240, 3.2397620015697148712e-192, 3.1030547578511949035e-208, -1.609965144193984205e-224, -1.8313007053436627876e-240, 3.2397620015697148712e-192, 3.1030547578511949035e-208, -1.609965144193984205e-224, -1.8313007053436627876e-240, 1.4863145223629928288e-192, -7.9038076992129241506e-209, -1.609965144193984205e-224, -1.8313007053436627876e-240, 6.0959078275963141821e-193, 1.156336993964950812e-208, 2.7126166236326293347e-224, -1.8313007053436627876e-240, 1.712289129579509076e-193, 1.8297811202182925249e-209, 1.1003018740995688645e-226, 5.827891678485165325e-243, 1.712289129579509076e-193, 1.8297811202182925249e-209, 1.1003018740995688645e-226, 5.827891678485165325e-243, 6.1638445507530779946e-194, -6.0361608463951204924e-210, 1.1003018740995688645e-226, 5.827891678485165325e-243, 6.8432117823206978686e-195, 4.7332165749391048364e-212, 4.4984059688774601837e-228, -3.029900079464340522e-245, 6.8432117823206978686e-195, 4.7332165749391048364e-212, 4.4984059688774601837e-228, -3.029900079464340522e-245, 6.8432117823206978686e-195, 4.7332165749391048364e-212, 4.4984059688774601837e-228, -3.029900079464340522e-245, 6.8432117823206978686e-195, 4.7332165749391048364e-212, 4.4984059688774601837e-228, -3.029900079464340522e-245, 3.418509674495068119e-195, 4.7332165749391048364e-212, 4.4984059688774601837e-228, -3.029900079464340522e-245, 1.7061586205822532442e-195, 4.7332165749391048364e-212, 4.4984059688774601837e-228, -3.029900079464340522e-245, 8.499830936258458068e-196, 4.7332165749391048364e-212, 4.4984059688774601837e-228, -3.029900079464340522e-245, 4.218953301476420881e-196, 4.7332165749391048364e-212, 4.4984059688774601837e-228, -3.029900079464340522e-245, 2.0785144840854027628e-196, -1.9512340798794268979e-214, 
-3.6162764918921692779e-230, -2.8387319855193022476e-246, 1.008295075389893466e-196, -1.9512340798794268979e-214, -3.6162764918921692779e-230, -2.8387319855193022476e-246, 4.7318537104213881764e-197, -1.9512340798794268979e-214, -3.6162764918921692779e-230, -2.8387319855193022476e-246, 2.0563051886826149345e-197, -1.9512340798794268979e-214, -3.6162764918921692779e-230, -2.8387319855193022476e-246, 7.185309278132283136e-198, -1.9512340798794268979e-214, -3.6162764918921692779e-230, -2.8387319855193022476e-246, 4.9643797378534984559e-199, -9.4699347169310243473e-216, -9.2331809177749077733e-233, -1.4042876247421728101e-248, 4.9643797378534984559e-199, -9.4699347169310243473e-216, -9.2331809177749077733e-233, -1.4042876247421728101e-248, 4.9643797378534984559e-199, -9.4699347169310243473e-216, -9.2331809177749077733e-233, -1.4042876247421728101e-248, 4.9643797378534984559e-199, -9.4699347169310243473e-216, -9.2331809177749077733e-233, -1.4042876247421728101e-248, 7.8383517263666503337e-200, 1.3736749441945438342e-215, -9.2331809177749077733e-233, -1.4042876247421728101e-248, 7.8383517263666503337e-200, 1.3736749441945438342e-215, -9.2331809177749077733e-233, -1.4042876247421728101e-248, 7.8383517263666503337e-200, 1.3736749441945438342e-215, -9.2331809177749077733e-233, -1.4042876247421728101e-248, 2.612671019845610006e-200, 2.1334073625072069974e-216, -9.2331809177749077733e-233, -1.4042876247421728101e-248, 2.612671019845610006e-200, 2.1334073625072069974e-216, -9.2331809177749077733e-233, -1.4042876247421728101e-248, 1.306250843215349634e-200, 2.1334073625072069974e-216, -9.2331809177749077733e-233, -1.4042876247421728101e-248, 6.5304075490021959302e-201, 6.8298960257742791824e-217, 6.8696910062179237095e-233, 3.8349029251851101018e-249, 3.2643571074265457254e-201, -4.2219277387461470355e-218, -1.753154605289404553e-234, -7.5861268822635538093e-251, 1.6313318866387202604e-201, -4.2219277387461470355e-218, -1.753154605289404553e-234, -7.5861268822635538093e-251, 
8.1481927624480752786e-202, -4.2219277387461470355e-218, -1.753154605289404553e-234, -7.5861268822635538093e-251, 4.0656297104785107096e-202, 4.8431832608149701961e-218, 8.3111403472061145651e-234, 1.6001805286092554504e-249, 2.0243481844937293316e-202, 3.1062776103441183191e-219, 7.6291913283447536617e-235, 2.0347903074934629333e-250, 1.0037074215013384159e-202, 3.1062776103441183191e-219, 7.6291913283447536617e-235, 2.0347903074934629333e-250, 4.9338704000514295811e-203, 3.1062776103441183191e-219, 7.6291913283447536617e-235, 2.0347903074934629333e-250, 2.3822684925704522921e-203, 3.1062776103441183191e-219, 7.6291913283447536617e-235, 2.0347903074934629333e-250, 1.1064675388299639308e-203, 2.7343042298126957741e-220, 5.5273393987134252385e-236, 1.1432574793608782288e-251, 4.6856706195971960852e-204, 2.7343042298126957741e-220, 5.5273393987134252385e-236, 1.1432574793608782288e-251, 1.4961682352459748279e-204, -8.0675475439086544798e-221, -3.6970842501441777651e-237, -5.7032870362481275794e-253, 1.4961682352459748279e-204, -8.0675475439086544798e-221, -3.6970842501441777651e-237, -5.7032870362481275794e-253, 6.9879263915816924805e-205, 9.6377473771091526132e-221, 1.5959741828948633012e-236, 2.7031904319843495713e-252, 3.0010484111426663515e-205, 7.8509991660024955813e-222, 1.2171222696290252021e-237, -2.4742181023285720738e-254, 1.0076094209231528444e-205, 7.8509991660024955813e-222, 1.2171222696290252021e-237, -2.4742181023285720738e-254, 1.0889925813396166947e-207, 2.4325525462765697993e-223, -1.1429360314275701698e-239, 8.3218722366085688343e-256, 1.0889925813396166947e-207, 2.4325525462765697993e-223, -1.1429360314275701698e-239, 8.3218722366085688343e-256, 1.0889925813396166947e-207, 2.4325525462765697993e-223, -1.1429360314275701698e-239, 8.3218722366085688343e-256, 1.0889925813396166947e-207, 2.4325525462765697993e-223, -1.1429360314275701698e-239, 8.3218722366085688343e-256, 1.0889925813396166947e-207, 2.4325525462765697993e-223, 
-1.1429360314275701698e-239, 8.3218722366085688343e-256, 1.0889925813396166947e-207, 2.4325525462765697993e-223, -1.1429360314275701698e-239, 8.3218722366085688343e-256, 1.0889925813396166947e-207, 2.4325525462765697993e-223, -1.1429360314275701698e-239, 8.3218722366085688343e-256, 3.1030547578511949035e-208, -1.609965144193984205e-224, -1.8313007053436625212e-240, -2.3341145329525059632e-256, 3.1030547578511949035e-208, -1.609965144193984205e-224, -1.8313007053436625212e-240, -2.3341145329525059632e-256, 1.156336993964950812e-208, 2.7126166236326293347e-224, -1.8313007053436625212e-240, -2.3341145329525059632e-256, 1.8297811202182925249e-209, 1.1003018740995688645e-226, 5.827891678485165325e-243, -3.1174271110208206547e-259, 1.8297811202182925249e-209, 1.1003018740995688645e-226, 5.827891678485165325e-243, -3.1174271110208206547e-259, 1.8297811202182925249e-209, 1.1003018740995688645e-226, 5.827891678485165325e-243, -3.1174271110208206547e-259, 6.1308251778939023781e-210, 1.1003018740995688645e-226, 5.827891678485165325e-243, -3.1174271110208206547e-259, 4.7332165749391048364e-212, 4.4984059688774601837e-228, -3.0299000794643401155e-245, -2.8075477999879273582e-261, 4.7332165749391048364e-212, 4.4984059688774601837e-228, -3.0299000794643401155e-245, -2.8075477999879273582e-261, 4.7332165749391048364e-212, 4.4984059688774601837e-228, -3.0299000794643401155e-245, -2.8075477999879273582e-261, 4.7332165749391048364e-212, 4.4984059688774601837e-228, -3.0299000794643401155e-245, -2.8075477999879273582e-261, 4.7332165749391048364e-212, 4.4984059688774601837e-228, -3.0299000794643401155e-245, -2.8075477999879273582e-261, 4.7332165749391048364e-212, 4.4984059688774601837e-228, -3.0299000794643401155e-245, -2.8075477999879273582e-261, 4.7332165749391048364e-212, 4.4984059688774601837e-228, -3.0299000794643401155e-245, -2.8075477999879273582e-261, 4.7332165749391048364e-212, 4.4984059688774601837e-228, -3.0299000794643401155e-245, -2.8075477999879273582e-261, 
2.3568521170701555846e-212, -7.7818310317651142243e-229, -3.0299000794643401155e-245, -2.8075477999879273582e-261, 1.1686698881356804311e-212, 1.8601114328504743806e-228, -3.0299000794643401155e-245, -2.8075477999879273582e-261, 5.7457877366844311816e-213, 5.409641648369814791e-229, -3.0299000794643401155e-245, -2.8075477999879273582e-261, 2.7753321643482446169e-213, -1.1860946916976500828e-229, 6.3146909508553973881e-246, 1.2573885592501532045e-261, 1.290104378180150675e-213, 2.1117734783360818049e-229, 4.2928382696354204061e-245, -2.8075477999879273582e-261, 5.4749048509610403382e-214, 4.6283939331921604413e-230, 6.3146909508553973881e-246, 1.2573885592501532045e-261, 1.7618353855408067201e-214, 5.060587206499956961e-231, 5.9380161562121075096e-247, -1.2904053011746964278e-263, 1.7618353855408067201e-214, 5.060587206499956961e-231, 5.9380161562121075096e-247, -1.2904053011746964278e-263, 8.3356801918574821257e-215, 5.060587206499956961e-231, 5.9380161562121075096e-247, -1.2904053011746964278e-263, 3.6943433600821895879e-215, 5.060587206499956961e-231, 5.9380161562121075096e-247, -1.2904053011746964278e-263, 1.3736749441945438342e-215, -9.2331809177749077733e-233, -1.4042876247421726117e-248, -9.9505977179164858712e-265, 2.1334073625072069974e-216, -9.2331809177749077733e-233, -1.4042876247421726117e-248, -9.9505977179164858712e-265, 2.1334073625072069974e-216, -9.2331809177749077733e-233, -1.4042876247421726117e-248, -9.9505977179164858712e-265, 2.1334073625072069974e-216, -9.2331809177749077733e-233, -1.4042876247421726117e-248, -9.9505977179164858712e-265, 6.8298960257742791824e-217, 6.8696910062179237095e-233, 3.8349029251851101018e-249, -2.6436684620390282645e-267, 6.8298960257742791824e-217, 6.8696910062179237095e-233, 3.8349029251851101018e-249, -2.6436684620390282645e-267, 3.2038516259498326923e-217, -1.1817449557784924788e-233, -6.3454186796659920093e-250, -2.6436684620390282645e-267, 1.3908294260376086421e-217, 2.8439730252197153919e-233, 
3.8349029251851101018e-249, -2.6436684620390282645e-267, 4.8431832608149701961e-218, 8.3111403472061145651e-234, 1.6001805286092554504e-249, -2.6436684620390282645e-267, 3.1062776103441183191e-219, 7.6291913283447536617e-235, 2.0347903074934629333e-250, -2.6436684620390282645e-267, 3.1062776103441183191e-219, 7.6291913283447536617e-235, 2.0347903074934629333e-250, -2.6436684620390282645e-267, 3.1062776103441183191e-219, 7.6291913283447536617e-235, 2.0347903074934629333e-250, -2.6436684620390282645e-267, 3.1062776103441183191e-219, 7.6291913283447536617e-235, 2.0347903074934629333e-250, -2.6436684620390282645e-267, 2.7343042298126957741e-220, 5.5273393987134252385e-236, 1.1432574793608780349e-251, 1.2329569415922591084e-267, 2.7343042298126957741e-220, 5.5273393987134252385e-236, 1.1432574793608780349e-251, 1.2329569415922591084e-267, 2.7343042298126957741e-220, 5.5273393987134252385e-236, 1.1432574793608780349e-251, 1.2329569415922591084e-267, 2.7343042298126957741e-220, 5.5273393987134252385e-236, 1.1432574793608780349e-251, 1.2329569415922591084e-267, 9.6377473771091526132e-221, 1.5959741828948633012e-236, 2.7031904319843490867e-252, 2.638005906844372114e-268, 7.8509991660024955813e-222, 1.2171222696290252021e-237, -2.4742181023285720738e-254, -1.2030990169203137715e-270, 7.8509991660024955813e-222, 1.2171222696290252021e-237, -2.4742181023285720738e-254, -1.2030990169203137715e-270, 7.8509991660024955813e-222, 1.2171222696290252021e-237, -2.4742181023285720738e-254, -1.2030990169203137715e-270, 7.8509991660024955813e-222, 1.2171222696290252021e-237, -2.4742181023285720738e-254, -1.2030990169203137715e-270, 2.318094503184431479e-222, -1.1429360314275701698e-239, 8.3218722366085688343e-256, -2.0046830753539155726e-272, 2.318094503184431479e-222, -1.1429360314275701698e-239, 8.3218722366085688343e-256, -2.0046830753539155726e-272, 9.3486833747991514629e-223, -1.1429360314275701698e-239, 8.3218722366085688343e-256, -2.0046830753539155726e-272, 
2.4325525462765697993e-223, -1.1429360314275701698e-239, 8.3218722366085688343e-256, -2.0046830753539155726e-272, 2.4325525462765697993e-223, -1.1429360314275701698e-239, 8.3218722366085688343e-256, -2.0046830753539155726e-272, 7.0351983914592419146e-224, 7.766758903588374524e-240, 8.3218722366085688343e-256, -2.0046830753539155726e-272, 7.0351983914592419146e-224, 7.766758903588374524e-240, 8.3218722366085688343e-256, -2.0046830753539155726e-272, 2.7126166236326293347e-224, -1.8313007053436625212e-240, -2.3341145329525056675e-256, -2.0046830753539155726e-272, 5.5132573971932232487e-225, 5.6821419688934674008e-241, 3.2988215943776273615e-257, 2.1353977370878701046e-273, 5.5132573971932232487e-225, 5.6821419688934674008e-241, 3.2988215943776273615e-257, 2.1353977370878701046e-273, 1.1003018740995688645e-226, 5.827891678485165325e-243, -3.117427111020820077e-259, -5.9718623963762788119e-275, 1.1003018740995688645e-226, 5.827891678485165325e-243, -3.117427111020820077e-259, -5.9718623963762788119e-275, 1.1003018740995688645e-226, 5.827891678485165325e-243, -3.117427111020820077e-259, -5.9718623963762788119e-275, 1.1003018740995688645e-226, 5.827891678485165325e-243, -3.117427111020820077e-259, -5.9718623963762788119e-275, 1.1003018740995688645e-226, 5.827891678485165325e-243, -3.117427111020820077e-259, -5.9718623963762788119e-275, 1.1003018740995688645e-226, 5.827891678485165325e-243, -3.117427111020820077e-259, -5.9718623963762788119e-275, 2.560476225709334075e-227, 5.827891678485165325e-243, -3.117427111020820077e-259, -5.9718623963762788119e-275, 2.560476225709334075e-227, 5.827891678485165325e-243, -3.117427111020820077e-259, -5.9718623963762788119e-275, 4.4984059688774601837e-228, -3.0299000794643401155e-245, -2.8075477999879273582e-261, -1.472095602234059958e-277, 4.4984059688774601837e-228, -3.0299000794643401155e-245, -2.8075477999879273582e-261, -1.472095602234059958e-277, 4.4984059688774601837e-228, -3.0299000794643401155e-245, -2.8075477999879273582e-261, 
-1.472095602234059958e-277, 1.8601114328504743806e-228, -3.0299000794643401155e-245, -2.8075477999879273582e-261, -1.472095602234059958e-277, 5.409641648369814791e-229, -3.0299000794643401155e-245, -2.8075477999879273582e-261, -1.472095602234059958e-277, 5.409641648369814791e-229, -3.0299000794643401155e-245, -2.8075477999879273582e-261, -1.472095602234059958e-277, 2.1117734783360818049e-229, 4.2928382696354204061e-245, -2.8075477999879273582e-261, -1.472095602234059958e-277, 4.6283939331921604413e-230, 6.3146909508553973881e-246, 1.2573885592501529789e-261, 3.0408903374280139822e-277, 4.6283939331921604413e-230, 6.3146909508553973881e-246, 1.2573885592501529789e-261, 3.0408903374280139822e-277, 5.060587206499956961e-231, 5.9380161562121075096e-247, -1.2904053011746964278e-263, 8.7279092175580820317e-280, 5.060587206499956961e-231, 5.9380161562121075096e-247, -1.2904053011746964278e-263, 8.7279092175580820317e-280, 5.060587206499956961e-231, 5.9380161562121075096e-247, -1.2904053011746964278e-263, 8.7279092175580820317e-280, 5.060587206499956961e-231, 5.9380161562121075096e-247, -1.2904053011746964278e-263, 8.7279092175580820317e-280, 2.4841276986611042098e-231, 2.1712682097791944335e-248, 2.9746046415267896827e-264, -8.6516445844406224413e-282, 1.1958979447416775482e-231, 2.1712682097791944335e-248, 2.9746046415267896827e-264, -8.6516445844406224413e-282, 5.5178306778196421733e-232, 2.1712682097791944335e-248, 2.9746046415267896827e-264, -8.6516445844406224413e-282, 2.2972562930210755192e-232, 2.1712682097791944335e-248, 2.9746046415267896827e-264, -8.6516445844406224413e-282, 6.8696910062179237095e-233, 3.8349029251851101018e-249, -2.6436684620390282645e-267, -4.3807022524130141006e-284, 6.8696910062179237095e-233, 3.8349029251851101018e-249, -2.6436684620390282645e-267, -4.3807022524130141006e-284, 2.8439730252197153919e-233, 3.8349029251851101018e-249, -2.6436684620390282645e-267, -4.3807022524130141006e-284, 8.3111403472061145651e-234, 
1.6001805286092554504e-249, -2.6436684620390282645e-267, -4.3807022524130141006e-284, 8.3111403472061145651e-234, 1.6001805286092554504e-249, -2.6436684620390282645e-267, -4.3807022524130141006e-284, 3.2789928709583552854e-234, 4.8281933032132812475e-250, -2.6436684620390282645e-267, -4.3807022524130141006e-284, 7.6291913283447536617e-235, 2.0347903074934629333e-250, -2.6436684620390282645e-267, -4.3807022524130141006e-284, 7.6291913283447536617e-235, 2.0347903074934629333e-250, -2.6436684620390282645e-267, -4.3807022524130141006e-284, 1.3390069830350552605e-235, -6.026193929640082176e-252, -7.0535576022338457803e-268, -4.3807022524130141006e-284, 1.3390069830350552605e-235, -6.026193929640082176e-252, -7.0535576022338457803e-268, -4.3807022524130141006e-284, 1.3390069830350552605e-235, -6.026193929640082176e-252, -7.0535576022338457803e-268, -4.3807022524130141006e-284, 5.5273393987134252385e-236, 1.1432574793608780349e-251, 1.2329569415922591084e-267, -4.3807022524130141006e-284, 1.5959741828948633012e-236, 2.7031904319843490867e-252, 2.638005906844371576e-268, 6.3790946999826013345e-284, 1.5959741828948633012e-236, 2.7031904319843490867e-252, 2.638005906844371576e-268, 6.3790946999826013345e-284, 6.1313287894022281692e-237, 5.2084434157824127104e-253, 2.1511502957481757317e-269, 3.2670891426006739096e-285, 1.2171222696290252021e-237, -2.4742181023285720738e-254, -1.2030990169203137715e-270, -9.5347405022956042207e-287, 1.2171222696290252021e-237, -2.4742181023285720738e-254, -1.2030990169203137715e-270, -9.5347405022956042207e-287, 1.2171222696290252021e-237, -2.4742181023285720738e-254, -1.2030990169203137715e-270, -9.5347405022956042207e-287, 6.0284645465737476297e-238, -2.4742181023285720738e-254, -1.2030990169203137715e-270, -9.5347405022956042207e-287, 2.9570854717154947523e-238, 4.3456134301905148502e-254, 6.3684349745470443788e-270, -9.5347405022956042207e-287, 1.4213959342863689955e-238, 9.3569766393097138822e-255, 2.5826679788133653036e-270, 
-9.5347405022956042207e-287, 6.5355116557180594664e-239, 9.3569766393097138822e-255, 2.5826679788133653036e-270, -9.5347405022956042207e-287, 2.6962878121452450746e-239, 8.3218722366085688343e-256, -2.0046830753539152442e-272, -3.4057806738724185961e-288, 7.766758903588374524e-240, 8.3218722366085688343e-256, -2.0046830753539152442e-272, -3.4057806738724185961e-288, 7.766758903588374524e-240, 8.3218722366085688343e-256, -2.0046830753539152442e-272, -3.4057806738724185961e-288, 2.9677290991223565342e-240, -2.3341145329525056675e-256, -2.0046830753539152442e-272, -3.4057806738724185961e-288, 5.6821419688934674008e-241, 3.2988215943776273615e-257, 2.1353977370878701046e-273, -1.2215123283371736879e-289, 5.6821419688934674008e-241, 3.2988215943776273615e-257, 2.1353977370878701046e-273, -1.2215123283371736879e-289, 5.6821419688934674008e-241, 3.2988215943776273615e-257, 2.1353977370878701046e-273, -1.2215123283371736879e-289, 2.6827483411022054912e-241, 3.2988215943776273615e-257, 2.1353977370878701046e-273, -1.2215123283371736879e-289, 1.1830515272065748694e-241, -3.117427111020820077e-259, -5.9718623963762788119e-275, 6.1155422068568954053e-291, 4.3320312025875939195e-242, -3.117427111020820077e-259, -5.9718623963762788119e-275, 6.1155422068568954053e-291, 5.827891678485165325e-243, -3.117427111020820077e-259, -5.9718623963762788119e-275, 6.1155422068568954053e-291, 5.827891678485165325e-243, -3.117427111020820077e-259, -5.9718623963762788119e-275, 6.1155422068568954053e-291, 5.827891678485165325e-243, -3.117427111020820077e-259, -5.9718623963762788119e-275, 6.1155422068568954053e-291, 1.1413391350613183311e-243, -5.1586784110844895013e-260, -1.9524039360882352712e-276, -2.9779654517181717279e-292, 1.1413391350613183311e-243, -5.1586784110844895013e-260, -1.9524039360882352712e-276, -2.9779654517181717279e-292, 1.1413391350613183311e-243, -5.1586784110844895013e-260, -1.9524039360882352712e-276, -2.9779654517181717279e-292, 5.5552006713333735927e-244, 
7.8491179384773690214e-260, -1.9524039360882352712e-276, -2.9779654517181717279e-292, 2.6261053316934700345e-244, 1.345219763696439399e-260, 1.6579848156414234801e-276, 1.0303712682997740506e-292, 1.1615576618735179302e-244, 1.345219763696439399e-260, 1.6579848156414234801e-276, 1.0303712682997740506e-292, 4.2928382696354204061e-245, -2.8075477999879273582e-261, -1.472095602234059958e-277, 2.8287088295287585094e-294, 6.3146909508553973881e-246, 1.2573885592501529789e-261, 3.0408903374280139822e-277, 2.8287088295287585094e-294, 6.3146909508553973881e-246, 1.2573885592501529789e-261, 3.0408903374280139822e-277, 2.8287088295287585094e-294, 6.3146909508553973881e-246, 1.2573885592501529789e-261, 3.0408903374280139822e-277, 2.8287088295287585094e-294, 1.7379794826680480784e-246, 2.4115446944063306384e-262, 2.202741251392177696e-278, 2.8287088295287585094e-294, 1.7379794826680480784e-246, 2.4115446944063306384e-262, 2.202741251392177696e-278, 2.8287088295287585094e-294, 5.9380161562121075096e-247, -1.2904053011746964278e-263, 8.7279092175580810531e-280, 8.8634899828990930877e-296, 2.1712682097791944335e-248, 2.9746046415267896827e-264, -8.6516445844406224413e-282, -5.0528699238150276549e-299, 2.1712682097791944335e-248, 2.9746046415267896827e-264, -8.6516445844406224413e-282, -5.0528699238150276549e-299, 2.1712682097791944335e-248, 2.9746046415267896827e-264, -8.6516445844406224413e-282, -5.0528699238150276549e-299, 2.1712682097791944335e-248, 2.9746046415267896827e-264, -8.6516445844406224413e-282, -5.0528699238150276549e-299, 2.1712682097791944335e-248, 2.9746046415267896827e-264, -8.6516445844406224413e-282, -5.0528699238150276549e-299, 3.8349029251851101018e-249, -2.6436684620390282645e-267, -4.3807022524130141006e-284, -2.7456019707854725967e-300, 3.8349029251851101018e-249, -2.6436684620390282645e-267, -4.3807022524130141006e-284, -2.7456019707854725967e-300, 3.8349029251851101018e-249, -2.6436684620390282645e-267, -4.3807022524130141006e-284, 
-2.7456019707854725967e-300, 1.6001805286092554504e-249, -2.6436684620390282645e-267, -4.3807022524130141006e-284, -2.7456019707854725967e-300, 4.8281933032132812475e-250, -2.6436684620390282645e-267, -4.3807022524130141006e-284, -2.7456019707854725967e-300, 4.8281933032132812475e-250, -2.6436684620390282645e-267, -4.3807022524130141006e-284, -2.7456019707854725967e-300, 2.0347903074934629333e-250, -2.6436684620390282645e-267, -4.3807022524130141006e-284, -2.7456019707854725967e-300, 6.3808880963355377617e-251, -2.6436684620390282645e-267, -4.3807022524130141006e-284, -2.7456019707854725967e-300, 6.3808880963355377617e-251, -2.6436684620390282645e-267, -4.3807022524130141006e-284, -2.7456019707854725967e-300, 2.8891343516857640937e-251, 5.1095823452235464813e-267, -4.3807022524130141006e-284, -2.7456019707854725967e-300, 1.1432574793608780349e-251, 1.2329569415922591084e-267, -4.3807022524130141006e-284, -2.7456019707854725967e-300, 2.7031904319843490867e-252, 2.638005906844371576e-268, 6.3790946999826013345e-284, -2.7456019707854725967e-300, 2.7031904319843490867e-252, 2.638005906844371576e-268, 6.3790946999826013345e-284, -2.7456019707854725967e-300, 5.2084434157824127104e-253, 2.1511502957481757317e-269, 3.2670891426006735363e-285, 2.4084160842482777461e-301, 5.2084434157824127104e-253, 2.1511502957481757317e-269, 3.2670891426006735363e-285, 2.4084160842482777461e-301, 5.2084434157824127104e-253, 2.1511502957481757317e-269, 3.2670891426006735363e-285, 2.4084160842482777461e-301, 2.4805108027747776379e-253, 2.1511502957481757317e-269, 3.2670891426006735363e-285, 2.4084160842482777461e-301, 1.1165444962709601017e-253, 2.1511502957481757317e-269, 3.2670891426006735363e-285, 2.4084160842482777461e-301, 4.3456134301905148502e-254, 6.3684349745470443788e-270, -9.5347405022956030541e-287, -1.5805886663557401565e-302, 9.3569766393097138822e-255, 2.5826679788133653036e-270, -9.5347405022956030541e-287, -1.5805886663557401565e-302, 9.3569766393097138822e-255, 
2.5826679788133653036e-270, -9.5347405022956030541e-287, -1.5805886663557401565e-302, 8.3218722366085688343e-256, -2.0046830753539152442e-272, -3.4057806738724185961e-288, 2.3458177946667328156e-304, 8.3218722366085688343e-256, -2.0046830753539152442e-272, -3.4057806738724185961e-288, 2.3458177946667328156e-304, 8.3218722366085688343e-256, -2.0046830753539152442e-272, -3.4057806738724185961e-288, 2.3458177946667328156e-304, 8.3218722366085688343e-256, -2.0046830753539152442e-272, -3.4057806738724185961e-288, 2.3458177946667328156e-304, 2.9938788518280315834e-256, -2.0046830753539152442e-272, -3.4057806738724185961e-288, 2.3458177946667328156e-304, 3.2988215943776273615e-257, 2.1353977370878701046e-273, -1.2215123283371736879e-289, 6.7342163555358599277e-306, 3.2988215943776273615e-257, 2.1353977370878701046e-273, -1.2215123283371736879e-289, 6.7342163555358599277e-306, 3.2988215943776273615e-257, 2.1353977370878701046e-273, -1.2215123283371736879e-289, 6.7342163555358599277e-306, 3.2988215943776273615e-257, 2.1353977370878701046e-273, -1.2215123283371736879e-289, 6.7342163555358599277e-306, 1.6338236616337094706e-257, 2.1353977370878701046e-273, -1.2215123283371736879e-289, 6.7342163555358599277e-306, 8.0132469526175071002e-258, 2.8687869620228451614e-274, -1.9537812801257956865e-290, 1.0380272777574237546e-306, 3.850752120757712373e-258, 2.8687869620228451614e-274, -1.9537812801257956865e-290, 1.0380272777574237546e-306, 1.7695047048278150093e-258, 2.8687869620228451614e-274, -1.9537812801257956865e-290, 1.0380272777574237546e-306, 7.2888099686286655858e-259, 5.581381609158630475e-275, 6.1155422068568946933e-291, 1.0380272777574237546e-306, 2.0856914288039227544e-259, -1.9524039360882352712e-276, -2.9779654517181712829e-292, -3.000817432603284506e-308, 2.0856914288039227544e-259, -1.9524039360882352712e-276, -2.9779654517181712829e-292, -3.000817432603284506e-308, 7.8491179384773690214e-260, -1.9524039360882352712e-276, -2.9779654517181712829e-292, 
-3.000817432603284506e-308, 1.345219763696439399e-260, 1.6579848156414234801e-276, 1.0303712682997738281e-292, 1.4493302844111182601e-308, 1.345219763696439399e-260, 1.6579848156414234801e-276, 1.0303712682997738281e-292, 1.4493302844111182601e-308, 1.345219763696439399e-260, 1.6579848156414234801e-276, 1.0303712682997738281e-292, 1.4493302844111182601e-308, 5.3223249184882342185e-261, -1.472095602234059958e-277, 2.8287088295287585094e-294, -1.0874435234232647519e-310, 1.2573885592501529789e-261, 3.0408903374280139822e-277, 2.8287088295287585094e-294, -1.0874435234232647519e-310, 1.2573885592501529789e-261, 3.0408903374280139822e-277, 2.8287088295287585094e-294, -1.0874435234232647519e-310, 2.4115446944063306384e-262, 2.202741251392177696e-278, 2.8287088295287585094e-294, -1.0874435234232647519e-310, 2.4115446944063306384e-262, 2.202741251392177696e-278, 2.8287088295287585094e-294, -1.0874435234232647519e-310, 2.4115446944063306384e-262, 2.202741251392177696e-278, 2.8287088295287585094e-294, -1.0874435234232647519e-310, 1.1412520821444306741e-262, -6.1787496089661820348e-279, -3.028042329852615431e-295, -2.182740474438892116e-311, 5.0610577601348040988e-263, 7.9243314524777990283e-279, -3.028042329852615431e-295, -2.182740474438892116e-311, 1.8853262294800541881e-263, 8.7279092175580810531e-280, 8.8634899828990930877e-296, -9.8167844904532653004e-314, 2.9746046415267896827e-264, -8.6516445844406224413e-282, -5.0528699238150265939e-299, -1.3288013265921760399e-314, 2.9746046415267896827e-264, -8.6516445844406224413e-282, -5.0528699238150265939e-299, -1.3288013265921760399e-314, 2.9746046415267896827e-264, -8.6516445844406224413e-282, -5.0528699238150265939e-299, -1.3288013265921760399e-314, 9.8977243486757054781e-265, -8.6516445844406224413e-282, -5.0528699238150265939e-299, -1.3288013265921760399e-314, 9.8977243486757054781e-265, -8.6516445844406224413e-282, -5.0528699238150265939e-299, -1.3288013265921760399e-314, 4.9356438320276576408e-265, 
-8.6516445844406224413e-282, -5.0528699238150265939e-299, -1.3288013265921760399e-314, 2.4546035737036337221e-265, -8.6516445844406224413e-282, -5.0528699238150265939e-299, -1.3288013265921760399e-314, 1.2140834445416214873e-265, 1.8893435613692150014e-281, 3.0075895258731974416e-297, -9.8167844904532653004e-314, 5.9382337996061564537e-266, 5.1208955146257653156e-282, -5.0528699238150265939e-299, -1.3288013265921760399e-314, 2.8369334767011265554e-266, 5.1208955146257653156e-282, -5.0528699238150265939e-299, -1.3288013265921760399e-314, 1.2862833152486119506e-266, 1.6777604898591683764e-282, -5.0528699238150265939e-299, -1.3288013265921760399e-314, 5.1095823452235464813e-267, -4.3807022524130141006e-284, -2.7456019707854725967e-300, -2.5539572388808429997e-317, 1.2329569415922591084e-267, -4.3807022524130141006e-284, -2.7456019707854725967e-300, -2.5539572388808429997e-317, 1.2329569415922591084e-267, -4.3807022524130141006e-284, -2.7456019707854725967e-300, -2.5539572388808429997e-317, 2.638005906844371576e-268, 6.3790946999826013345e-284, -2.7456019707854725967e-300, -2.5539572388808429997e-317, 2.638005906844371576e-268, 6.3790946999826013345e-284, -2.7456019707854725967e-300, -2.5539572388808429997e-317, 2.1511502957481757317e-269, 3.2670891426006735363e-285, 2.4084160842482773317e-301, 5.7350888195772519812e-317, 2.1511502957481757317e-269, 3.2670891426006735363e-285, 2.4084160842482773317e-301, 5.7350888195772519812e-317, 2.1511502957481757317e-269, 3.2670891426006735363e-285, 2.4084160842482773317e-301, 5.7350888195772519812e-317, 2.1511502957481757317e-269, 3.2670891426006735363e-285, 2.4084160842482773317e-301, 5.7350888195772519812e-317, 6.3684349745470443788e-270, -9.5347405022956030541e-287, -1.5805886663557401565e-302, 3.6369654387311681856e-319, 6.3684349745470443788e-270, -9.5347405022956030541e-287, -1.5805886663557401565e-302, 3.6369654387311681856e-319, 2.5826679788133653036e-270, -9.5347405022956030541e-287, -1.5805886663557401565e-302, 
3.6369654387311681856e-319, 6.8978448094652555593e-271, 1.1480487920352081009e-286, 7.5257037990230704094e-303, 3.6369654387311681856e-319, 6.8978448094652555593e-271, 1.1480487920352081009e-286, 7.5257037990230704094e-303, 3.6369654387311681856e-319, 2.1656360647981577662e-271, 9.7287370902823839435e-288, 1.6928061833779524157e-303, 3.6369654387311681856e-319, 2.1656360647981577662e-271, 9.7287370902823839435e-288, 1.6928061833779524157e-303, 3.6369654387311681856e-319, 9.825838786313830552e-272, 9.7287370902823839435e-288, 1.6928061833779524157e-303, 3.6369654387311681856e-319, 3.9105778554799569972e-272, 9.7287370902823839435e-288, 1.6928061833779524157e-303, 3.6369654387311681856e-319, 9.5294739006302120482e-273, -1.2215123283371736879e-289, 6.7342163555358599277e-306, -5.681754927174335258e-322, 9.5294739006302120482e-273, -1.2215123283371736879e-289, 6.7342163555358599277e-306, -5.681754927174335258e-322, 2.1353977370878701046e-273, -1.2215123283371736879e-289, 6.7342163555358599277e-306, -5.681754927174335258e-322, 2.1353977370878701046e-273, -1.2215123283371736879e-289, 6.7342163555358599277e-306, -5.681754927174335258e-322, 2.8687869620228451614e-274, -1.9537812801257956865e-290, 1.0380272777574237546e-306, 6.4228533959362050743e-323, }; NOEXPORT ALIGNED(64) const float Sleef_rempitabsp[] = { 0.159154892, 5.112411827e-08, 3.626141271e-15, -2.036222915e-22, 0.03415493667, 6.420638243e-09, 7.342738037e-17, 8.135951656e-24, 0.03415493667, 6.420638243e-09, 7.342738037e-17, 8.135951656e-24, 0.002904943191, -9.861969574e-11, -9.839336547e-18, -1.790215892e-24, 0.002904943191, -9.861969574e-11, -9.839336547e-18, -1.790215892e-24, 0.002904943191, -9.861969574e-11, -9.839336547e-18, -1.790215892e-24, 0.002904943191, -9.861969574e-11, -9.839336547e-18, -1.790215892e-24, 0.0009518179577, 1.342109202e-10, 1.791623576e-17, 1.518506657e-24, 0.0009518179577, 1.342109202e-10, 1.791623576e-17, 1.518506657e-24, 0.0004635368241, 1.779561221e-11, 4.038449606e-18, 
-1.358546052e-25, 0.0002193961991, 1.779561221e-11, 4.038449606e-18, -1.358546052e-25, 9.73258866e-05, 1.779561221e-11, 4.038449606e-18, -1.358546052e-25, 3.62907449e-05, 3.243700447e-12, 5.690024473e-19, 7.09405479e-26, 5.773168596e-06, 1.424711477e-12, 1.3532163e-19, 1.92417627e-26, 5.773168596e-06, 1.424711477e-12, 1.3532163e-19, 1.92417627e-26, 5.773168596e-06, 1.424711477e-12, 1.3532163e-19, 1.92417627e-26, 1.958472239e-06, 5.152167755e-13, 1.3532163e-19, 1.92417627e-26, 5.112411827e-08, 3.626141271e-15, -2.036222915e-22, 6.177847236e-30, 5.112411827e-08, 3.626141271e-15, -2.036222915e-22, 6.177847236e-30, 5.112411827e-08, 3.626141271e-15, -2.036222915e-22, 6.177847236e-30, 5.112411827e-08, 3.626141271e-15, -2.036222915e-22, 6.177847236e-30, 5.112411827e-08, 3.626141271e-15, -2.036222915e-22, 6.177847236e-30, 5.112411827e-08, 3.626141271e-15, -2.036222915e-22, 6.177847236e-30, 2.132179588e-08, 3.626141271e-15, -2.036222915e-22, 6.177847236e-30, 6.420638243e-09, 7.342738037e-17, 8.135951656e-24, -1.330400526e-31, 6.420638243e-09, 7.342738037e-17, 8.135951656e-24, -1.330400526e-31, 2.695347945e-09, 7.342738037e-17, 8.135951656e-24, -1.330400526e-31, 8.327027956e-10, 7.342738037e-17, 8.135951656e-24, -1.330400526e-31, 8.327027956e-10, 7.342738037e-17, 8.135951656e-24, -1.330400526e-31, 3.670415083e-10, 7.342738037e-17, 8.135951656e-24, -1.330400526e-31, 1.342109202e-10, 1.791623576e-17, 1.518506361e-24, 2.613904e-31, 1.779561221e-11, 4.038449606e-18, -1.358545683e-25, -3.443243946e-32, 1.779561221e-11, 4.038449606e-18, -1.358545683e-25, -3.443243946e-32, 1.779561221e-11, 4.038449606e-18, -1.358545683e-25, -3.443243946e-32, 3.243700447e-12, 5.690024473e-19, 7.094053557e-26, 1.487136711e-32, 3.243700447e-12, 5.690024473e-19, 7.094053557e-26, 1.487136711e-32, 3.243700447e-12, 5.690024473e-19, 7.094053557e-26, 1.487136711e-32, 1.424711477e-12, 1.3532163e-19, 1.924175961e-26, 2.545416018e-33, 5.152167755e-13, 1.3532163e-19, 1.924175961e-26, 2.545416018e-33, 
6.046956013e-14, -2.036222915e-22, 6.177846108e-30, 1.082084378e-36, 6.046956013e-14, -2.036222915e-22, 6.177846108e-30, 1.082084378e-36, 6.046956013e-14, -2.036222915e-22, 6.177846108e-30, 1.082084378e-36, 3.626141271e-15, -2.036222915e-22, 6.177846108e-30, 1.082084378e-36, 3.626141271e-15, -2.036222915e-22, 6.177846108e-30, 1.082084378e-36, 3.626141271e-15, -2.036222915e-22, 6.177846108e-30, 1.082084378e-36, 3.626141271e-15, -2.036222915e-22, 6.177846108e-30, 1.082084378e-36, 7.342738037e-17, 8.135951656e-24, -1.330400526e-31, 6.296048013e-40, 7.342738037e-17, 8.135951656e-24, -1.330400526e-31, 6.296048013e-40, 7.342738037e-17, 8.135951656e-24, -1.330400526e-31, 6.296048013e-40, 7.342738037e-17, 8.135951656e-24, -1.330400526e-31, 6.296048013e-40, 7.342738037e-17, 8.135951656e-24, -1.330400526e-31, 6.296048013e-40, 7.342738037e-17, 8.135951656e-24, -1.330400526e-31, 6.296048013e-40, 1.791623576e-17, 1.518506361e-24, 2.61390353e-31, 4.764937743e-38, 1.791623576e-17, 1.518506361e-24, 2.61390353e-31, 4.764937743e-38, 4.038449606e-18, -1.358545683e-25, -3.443243946e-32, 6.296048013e-40, 4.038449606e-18, -1.358545683e-25, -3.443243946e-32, 6.296048013e-40, 5.690024473e-19, 7.094053557e-26, 1.487136711e-32, 6.296048013e-40, 5.690024473e-19, 7.094053557e-26, 1.487136711e-32, 6.296048013e-40, 5.690024473e-19, 7.094053557e-26, 1.487136711e-32, 6.296048013e-40, 1.3532163e-19, 1.924175961e-26, 2.545415467e-33, 6.296048013e-40, 1.3532163e-19, 1.924175961e-26, 2.545415467e-33, 6.296048013e-40, 2.690143217e-20, -1.452834402e-28, -6.441077673e-36, -1.764234767e-42, 2.690143217e-20, -1.452834402e-28, -6.441077673e-36, -1.764234767e-42, 2.690143217e-20, -1.452834402e-28, -6.441077673e-36, -1.764234767e-42, 1.334890502e-20, -1.452834402e-28, -6.441077673e-36, -1.764234767e-42, 6.572641438e-21, -1.452834402e-28, -6.441077673e-36, -1.764234767e-42, 0.05874381959, 1.222115387e-08, 7.693612965e-16, 1.792054435e-22, 0.02749382704, 4.77057327e-09, 7.693612965e-16, 1.792054435e-22, 
0.01186883077, 1.045283415e-09, 3.252721926e-16, 7.332633139e-23, 0.00405633077, 1.045283415e-09, 3.252721926e-16, 7.332633139e-23, 0.000150081818, -2.454155802e-12, 1.161414894e-20, 1.291319272e-27, 0.000150081818, -2.454155802e-12, 1.161414894e-20, 1.291319272e-27, 0.000150081818, -2.454155802e-12, 1.161414894e-20, 1.291319272e-27, 0.000150081818, -2.454155802e-12, 1.161414894e-20, 1.291319272e-27, 0.000150081818, -2.454155802e-12, 1.161414894e-20, 1.291319272e-27, 2.801149822e-05, 4.821800945e-12, 8.789757674e-19, 1.208447639e-25, 2.801149822e-05, 4.821800945e-12, 8.789757674e-19, 1.208447639e-25, 2.801149822e-05, 4.821800945e-12, 8.789757674e-19, 1.208447639e-25, 1.275271279e-05, 1.183823005e-12, 1.161414894e-20, 1.291319272e-27, 5.12331826e-06, 1.183823005e-12, 1.161414894e-20, 1.291319272e-27, 1.308621904e-06, 2.743283031e-13, 1.161414894e-20, 1.291319272e-27, 1.308621904e-06, 2.743283031e-13, 1.161414894e-20, 1.291319272e-27, 3.549478151e-07, 4.695462769e-14, 1.161414894e-20, 1.291319272e-27, 3.549478151e-07, 4.695462769e-14, 1.161414894e-20, 1.291319272e-27, 1.165292645e-07, 1.853292503e-14, 4.837885366e-21, 1.291319272e-27, 1.165292645e-07, 1.853292503e-14, 4.837885366e-21, 1.291319272e-27, 5.69246339e-08, 4.322073705e-15, 1.449754789e-21, 7.962890365e-29, 2.712231151e-08, 4.322073705e-15, 1.449754789e-21, 7.962890365e-29, 1.222115387e-08, 7.693612965e-16, 1.792054182e-22, 2.91418027e-29, 4.77057327e-09, 7.693612965e-16, 1.792054182e-22, 2.91418027e-29, 1.045283415e-09, 3.252721926e-16, 7.332632508e-23, 3.898253736e-30, 1.045283415e-09, 3.252721926e-16, 7.332632508e-23, 3.898253736e-30, 1.139611461e-10, 1.996093359e-17, 5.344349223e-25, 1.511644828e-31, 1.139611461e-10, 1.996093359e-17, 5.344349223e-25, 1.511644828e-31, 1.139611461e-10, 1.996093359e-17, 5.344349223e-25, 1.511644828e-31, 1.139611461e-10, 1.996093359e-17, 5.344349223e-25, 1.511644828e-31, 5.575349904e-11, 6.083145782e-18, 5.344349223e-25, 1.511644828e-31, 2.664967552e-11, -8.557475018e-19, 
-8.595036458e-26, -2.139883875e-32, 1.209775682e-11, 2.61369883e-18, 5.344349223e-25, 1.511644828e-31, 4.821800945e-12, 8.789757674e-19, 1.208447639e-25, 3.253064536e-33, 1.183823005e-12, 1.161414894e-20, 1.29131908e-27, 1.715766248e-34, 1.183823005e-12, 1.161414894e-20, 1.29131908e-27, 1.715766248e-34, 2.743283031e-13, 1.161414894e-20, 1.29131908e-27, 1.715766248e-34, }; ================================================ FILE: src/rename.h ================================================ #ifndef RENAMESCALAR_H #define RENAMESCALAR_H /* ------------------------------------------------------------------------- */ /* Naming of functions scalar */ #ifdef DETERMINISTIC #define xsin nsimd_sleef_sin_u35d_scalar_f64 #define xsinf nsimd_sleef_sin_u35d_scalar_f32 #define xcos nsimd_sleef_cos_u35d_scalar_f64 #define xcosf nsimd_sleef_cos_u35d_scalar_f32 #define xsincos nsimd_sleef_sincos_u35d_scalar_f64 #define xsincosf nsimd_sleef_sincos_u35d_scalar_f32 #define xtan nsimd_sleef_tan_u35d_scalar_f64 #define xtanf nsimd_sleef_tan_u35d_scalar_f32 #define xasin nsimd_sleef_asin_u35d_scalar_f64 #define xasinf nsimd_sleef_asin_u35d_scalar_f32 #define xacos nsimd_sleef_acos_u35d_scalar_f64 #define xacosf nsimd_sleef_acos_u35d_scalar_f32 #define xatan nsimd_sleef_atan_u35d_scalar_f64 #define xatanf nsimd_sleef_atan_u35d_scalar_f32 #define xatan2 nsimd_sleef_atan2_u35d_scalar_f64 #define xatan2f nsimd_sleef_atan2_u35d_scalar_f32 #define xlog nsimd_sleef_log_u35d_scalar_f64 #define xlogf nsimd_sleef_log_u35d_scalar_f32 #define xcbrt nsimd_sleef_cbrt_u35d_scalar_f64 #define xcbrtf nsimd_sleef_cbrt_u35d_scalar_f32 #define xsin_u1 nsimd_sleef_sin_u10d_scalar_f64 #define xsinf_u1 nsimd_sleef_sin_u10d_scalar_f32 #define xcos_u1 nsimd_sleef_cos_u10d_scalar_f64 #define xcosf_u1 nsimd_sleef_cos_u10d_scalar_f32 #define xsincos_u1 nsimd_sleef_sincos_u10d_scalar_f64 #define xsincosf_u1 nsimd_sleef_sincos_u10d_scalar_f32 #define xtan_u1 nsimd_sleef_tan_u10d_scalar_f64 #define xtanf_u1 
nsimd_sleef_tan_u10d_scalar_f32 #define xasin_u1 nsimd_sleef_asin_u10d_scalar_f64 #define xasinf_u1 nsimd_sleef_asin_u10d_scalar_f32 #define xacos_u1 nsimd_sleef_acos_u10d_scalar_f64 #define xacosf_u1 nsimd_sleef_acos_u10d_scalar_f32 #define xatan_u1 nsimd_sleef_atan_u10d_scalar_f64 #define xatanf_u1 nsimd_sleef_atan_u10d_scalar_f32 #define xatan2_u1 nsimd_sleef_atan2_u10d_scalar_f64 #define xatan2f_u1 nsimd_sleef_atan2_u10d_scalar_f32 #define xlog_u1 nsimd_sleef_log_u10d_scalar_f64 #define xlogf_u1 nsimd_sleef_log_u10d_scalar_f32 #define xcbrt_u1 nsimd_sleef_cbrt_u10d_scalar_f64 #define xcbrtf_u1 nsimd_sleef_cbrt_u10d_scalar_f32 #define xexp nsimd_sleef_exp_u10d_scalar_f64 #define xexpf nsimd_sleef_exp_u10d_scalar_f32 #define xpow nsimd_sleef_pow_u10d_scalar_f64 #define xpowf nsimd_sleef_pow_u10d_scalar_f32 #define xsinh nsimd_sleef_sinh_u10d_scalar_f64 #define xsinhf nsimd_sleef_sinh_u10d_scalar_f32 #define xcosh nsimd_sleef_cosh_u10d_scalar_f64 #define xcoshf nsimd_sleef_cosh_u10d_scalar_f32 #define xtanh nsimd_sleef_tanh_u10d_scalar_f64 #define xtanhf nsimd_sleef_tanh_u10d_scalar_f32 #define xsinh_u35 nsimd_sleef_sinh_u35d_scalar_f64 #define xsinhf_u35 nsimd_sleef_sinh_u35d_scalar_f32 #define xcosh_u35 nsimd_sleef_cosh_u35d_scalar_f64 #define xcoshf_u35 nsimd_sleef_cosh_u35d_scalar_f32 #define xtanh_u35 nsimd_sleef_tanh_u35d_scalar_f64 #define xtanhf_u35 nsimd_sleef_tanh_u35d_scalar_f32 #define xfastsin_u3500 nsimd_sleef_fastsin_u3500d_scalar_f64 #define xfastsinf_u3500 nsimd_sleef_fastsin_u3500d_scalar_f32 #define xfastcos_u3500 nsimd_sleef_fastcos_u3500d_scalar_f64 #define xfastcosf_u3500 nsimd_sleef_fastcos_u3500d_scalar_f32 #define xfastpow_u3500 nsimd_sleef_fastpow_u3500d_scalar_f64 #define xfastpowf_u3500 nsimd_sleef_fastpow_u3500d_scalar_f32 #define xasinh nsimd_sleef_asinh_u10d_scalar_f64 #define xasinhf nsimd_sleef_asinh_u10d_scalar_f32 #define xacosh nsimd_sleef_acosh_u10d_scalar_f64 #define xacoshf nsimd_sleef_acosh_u10d_scalar_f32 #define xatanh 
nsimd_sleef_atanh_u10d_scalar_f64 #define xatanhf nsimd_sleef_atanh_u10d_scalar_f32 #define xexp2 nsimd_sleef_exp2_u10d_scalar_f64 #define xexp2f nsimd_sleef_exp2_u10d_scalar_f32 #define xexp2_u35 nsimd_sleef_exp2_u35d_scalar_f64 #define xexp2f_u35 nsimd_sleef_exp2_u35d_scalar_f32 #define xexp10 nsimd_sleef_exp10_u10d_scalar_f64 #define xexp10f nsimd_sleef_exp10_u10d_scalar_f32 #define xexp10_u35 nsimd_sleef_exp10_u35d_scalar_f64 #define xexp10f_u35 nsimd_sleef_exp10_u35d_scalar_f32 #define xexpm1 nsimd_sleef_expm1_u10d_scalar_f64 #define xexpm1f nsimd_sleef_expm1_u10d_scalar_f32 #define xlog10 nsimd_sleef_log10_u10d_scalar_f64 #define xlog10f nsimd_sleef_log10_u10d_scalar_f32 #define xlog2 nsimd_sleef_log2_u10d_scalar_f64 #define xlog2f nsimd_sleef_log2_u10d_scalar_f32 #define xlog2_u35 nsimd_sleef_log2_u35d_scalar_f64 #define xlog2f_u35 nsimd_sleef_log2_u35d_scalar_f32 #define xlog1p nsimd_sleef_log1p_u10d_scalar_f64 #define xlog1pf nsimd_sleef_log1p_u10d_scalar_f32 #define xsincospi_u05 nsimd_sleef_sincospi_u05d_scalar_f64 #define xsincospif_u05 nsimd_sleef_sincospi_u05d_scalar_f32 #define xsincospi_u35 nsimd_sleef_sincospi_u35d_scalar_f64 #define xsincospif_u35 nsimd_sleef_sincospi_u35d_scalar_f32 #define xsinpi_u05 nsimd_sleef_sinpi_u05d_scalar_f64 #define xsinpif_u05 nsimd_sleef_sinpi_u05d_scalar_f32 #define xcospi_u05 nsimd_sleef_cospi_u05d_scalar_f64 #define xcospif_u05 nsimd_sleef_cospi_u05d_scalar_f32 #define xldexp nsimd_sleef_ldexp_scalar_f64 #define xldexpf nsimd_sleef_ldexp_scalar_f32 #define xilogb nsimd_sleef_ilogb_scalar_f64 #define xilogbf nsimd_sleef_ilogb_scalar_f32 #define xfma nsimd_sleef_fma_scalar_f64 #define xfmaf nsimd_sleef_fma_scalar_f32 #define xsqrt nsimd_sleef_sqrt_scalar_f64 #define xsqrtf nsimd_sleef_sqrt_scalar_f32 #define xsqrt_u05 nsimd_sleef_sqrt_u05d_scalar_f64 #define xsqrtf_u05 nsimd_sleef_sqrt_u05d_scalar_f32 #define xsqrt_u35 nsimd_sleef_sqrt_u35d_scalar_f64 #define xsqrtf_u35 nsimd_sleef_sqrt_u35d_scalar_f32 #define 
xhypot_u05 nsimd_sleef_hypot_u05d_scalar_f64 #define xhypotf_u05 nsimd_sleef_hypot_u05d_scalar_f32 #define xhypot_u35 nsimd_sleef_hypot_u35d_scalar_f64 #define xhypotf_u35 nsimd_sleef_hypot_u35d_scalar_f32 #define xfabs nsimd_sleef_fabs_scalar_f64 #define xfabsf nsimd_sleef_fabs_scalar_f32 #define xcopysign nsimd_sleef_copysign_scalar_f64 #define xcopysignf nsimd_sleef_copysign_scalar_f32 #define xfmax nsimd_sleef_fmax_scalar_f64 #define xfmaxf nsimd_sleef_fmax_scalar_f32 #define xfmin nsimd_sleef_fmin_scalar_f64 #define xfminf nsimd_sleef_fmin_scalar_f32 #define xfdim nsimd_sleef_fdim_scalar_f64 #define xfdimf nsimd_sleef_fdim_scalar_f32 #define xtrunc nsimd_sleef_trunc_scalar_f64 #define xtruncf nsimd_sleef_trunc_scalar_f32 #define xfloor nsimd_sleef_floor_scalar_f64 #define xfloorf nsimd_sleef_floor_scalar_f32 #define xceil nsimd_sleef_ceil_scalar_f64 #define xceilf nsimd_sleef_ceil_scalar_f32 #define xround nsimd_sleef_round_scalar_f64 #define xroundf nsimd_sleef_round_scalar_f32 #define xrint nsimd_sleef_rint_scalar_f64 #define xrintf nsimd_sleef_rint_scalar_f32 #define xnextafter nsimd_sleef_nextafter_scalar_f64 #define xnextafterf nsimd_sleef_nextafter_scalar_f32 #define xfrfrexp nsimd_sleef_frfrexp_scalar_f64 #define xfrfrexpf nsimd_sleef_frfrexp_scalar_f32 #define xexpfrexp nsimd_sleef_expfrexp_scalar_f64 #define xexpfrexpf nsimd_sleef_expfrexp_scalar_f32 #define xfmod nsimd_sleef_fmod_scalar_f64 #define xfmodf nsimd_sleef_fmod_scalar_f32 #define xremainder nsimd_sleef_remainder_scalar_f64 #define xremainderf nsimd_sleef_remainder_scalar_f32 #define xmodf nsimd_sleef_modf_scalar_f64 #define xmodff nsimd_sleef_modf_scalar_f32 #define xlgamma_u1 nsimd_sleef_lgamma_u10d_scalar_f64 #define xlgammaf_u1 nsimd_sleef_lgamma_u10d_scalar_f32 #define xtgamma_u1 nsimd_sleef_tgamma_u10d_scalar_f64 #define xtgammaf_u1 nsimd_sleef_tgamma_u10d_scalar_f32 #define xerf_u1 nsimd_sleef_erf_u10d_scalar_f64 #define xerff_u1 nsimd_sleef_erf_u10d_scalar_f32 #define xerfc_u15 
nsimd_sleef_erfc_u15d_scalar_f64 #define xerfcf_u15 nsimd_sleef_erfc_u15d_scalar_f32 #define xgetInt nsimd_sleef_getInt_scalar_f64 #define xgetIntf nsimd_sleef_getInt_scalar_f32 #define xgetPtr nsimd_sleef_getPtr_scalar_f64 #define xgetPtrf nsimd_sleef_getPtr_scalar_f32 #else #define xsin nsimd_sleef_sin_u35_scalar_f64 #define xsinf nsimd_sleef_sin_u35_scalar_f32 #define xcos nsimd_sleef_cos_u35_scalar_f64 #define xcosf nsimd_sleef_cos_u35_scalar_f32 #define xsincos nsimd_sleef_sincos_u35_scalar_f64 #define xsincosf nsimd_sleef_sincos_u35_scalar_f32 #define xtan nsimd_sleef_tan_u35_scalar_f64 #define xtanf nsimd_sleef_tan_u35_scalar_f32 #define xasin nsimd_sleef_asin_u35_scalar_f64 #define xasinf nsimd_sleef_asin_u35_scalar_f32 #define xacos nsimd_sleef_acos_u35_scalar_f64 #define xacosf nsimd_sleef_acos_u35_scalar_f32 #define xatan nsimd_sleef_atan_u35_scalar_f64 #define xatanf nsimd_sleef_atan_u35_scalar_f32 #define xatan2 nsimd_sleef_atan2_u35_scalar_f64 #define xatan2f nsimd_sleef_atan2_u35_scalar_f32 #define xlog nsimd_sleef_log_u35_scalar_f64 #define xlogf nsimd_sleef_log_u35_scalar_f32 #define xcbrt nsimd_sleef_cbrt_u35_scalar_f64 #define xcbrtf nsimd_sleef_cbrt_u35_scalar_f32 #define xsin_u1 nsimd_sleef_sin_u10_scalar_f64 #define xsinf_u1 nsimd_sleef_sin_u10_scalar_f32 #define xcos_u1 nsimd_sleef_cos_u10_scalar_f64 #define xcosf_u1 nsimd_sleef_cos_u10_scalar_f32 #define xsincos_u1 nsimd_sleef_sincos_u10_scalar_f64 #define xsincosf_u1 nsimd_sleef_sincos_u10_scalar_f32 #define xtan_u1 nsimd_sleef_tan_u10_scalar_f64 #define xtanf_u1 nsimd_sleef_tan_u10_scalar_f32 #define xasin_u1 nsimd_sleef_asin_u10_scalar_f64 #define xasinf_u1 nsimd_sleef_asin_u10_scalar_f32 #define xacos_u1 nsimd_sleef_acos_u10_scalar_f64 #define xacosf_u1 nsimd_sleef_acos_u10_scalar_f32 #define xatan_u1 nsimd_sleef_atan_u10_scalar_f64 #define xatanf_u1 nsimd_sleef_atan_u10_scalar_f32 #define xatan2_u1 nsimd_sleef_atan2_u10_scalar_f64 #define xatan2f_u1 nsimd_sleef_atan2_u10_scalar_f32 
#define xlog_u1 nsimd_sleef_log_u10_scalar_f64 #define xlogf_u1 nsimd_sleef_log_u10_scalar_f32 #define xcbrt_u1 nsimd_sleef_cbrt_u10_scalar_f64 #define xcbrtf_u1 nsimd_sleef_cbrt_u10_scalar_f32 #define xexp nsimd_sleef_exp_u10_scalar_f64 #define xexpf nsimd_sleef_exp_u10_scalar_f32 #define xpow nsimd_sleef_pow_u10_scalar_f64 #define xpowf nsimd_sleef_pow_u10_scalar_f32 #define xsinh nsimd_sleef_sinh_u10_scalar_f64 #define xsinhf nsimd_sleef_sinh_u10_scalar_f32 #define xcosh nsimd_sleef_cosh_u10_scalar_f64 #define xcoshf nsimd_sleef_cosh_u10_scalar_f32 #define xtanh nsimd_sleef_tanh_u10_scalar_f64 #define xtanhf nsimd_sleef_tanh_u10_scalar_f32 #define xsinh_u35 nsimd_sleef_sinh_u35_scalar_f64 #define xsinhf_u35 nsimd_sleef_sinh_u35_scalar_f32 #define xcosh_u35 nsimd_sleef_cosh_u35_scalar_f64 #define xcoshf_u35 nsimd_sleef_cosh_u35_scalar_f32 #define xtanh_u35 nsimd_sleef_tanh_u35_scalar_f64 #define xtanhf_u35 nsimd_sleef_tanh_u35_scalar_f32 #define xfastsin_u3500 nsimd_sleef_fastsin_u3500_scalar_f64 #define xfastsinf_u3500 nsimd_sleef_fastsin_u3500_scalar_f32 #define xfastcos_u3500 nsimd_sleef_fastcos_u3500_scalar_f64 #define xfastcosf_u3500 nsimd_sleef_fastcos_u3500_scalar_f32 #define xfastpow_u3500 nsimd_sleef_fastpow_u3500_scalar_f64 #define xfastpowf_u3500 nsimd_sleef_fastpow_u3500_scalar_f32 #define xasinh nsimd_sleef_asinh_u10_scalar_f64 #define xasinhf nsimd_sleef_asinh_u10_scalar_f32 #define xacosh nsimd_sleef_acosh_u10_scalar_f64 #define xacoshf nsimd_sleef_acosh_u10_scalar_f32 #define xatanh nsimd_sleef_atanh_u10_scalar_f64 #define xatanhf nsimd_sleef_atanh_u10_scalar_f32 #define xexp2 nsimd_sleef_exp2_u10_scalar_f64 #define xexp2f nsimd_sleef_exp2_u10_scalar_f32 #define xexp2_u35 nsimd_sleef_exp2_u35_scalar_f64 #define xexp2f_u35 nsimd_sleef_exp2_u35_scalar_f32 #define xexp10 nsimd_sleef_exp10_u10_scalar_f64 #define xexp10f nsimd_sleef_exp10_u10_scalar_f32 #define xexp10_u35 nsimd_sleef_exp10_u35_scalar_f64 #define xexp10f_u35 
nsimd_sleef_exp10_u35_scalar_f32 #define xexpm1 nsimd_sleef_expm1_u10_scalar_f64 #define xexpm1f nsimd_sleef_expm1_u10_scalar_f32 #define xlog10 nsimd_sleef_log10_u10_scalar_f64 #define xlog10f nsimd_sleef_log10_u10_scalar_f32 #define xlog2 nsimd_sleef_log2_u10_scalar_f64 #define xlog2f nsimd_sleef_log2_u10_scalar_f32 #define xlog2_u35 nsimd_sleef_log2_u35_scalar_f64 #define xlog2f_u35 nsimd_sleef_log2_u35_scalar_f32 #define xlog1p nsimd_sleef_log1p_u10_scalar_f64 #define xlog1pf nsimd_sleef_log1p_u10_scalar_f32 #define xsincospi_u05 nsimd_sleef_sincospi_u05_scalar_f64 #define xsincospif_u05 nsimd_sleef_sincospi_u05_scalar_f32 #define xsincospi_u35 nsimd_sleef_sincospi_u35_scalar_f64 #define xsincospif_u35 nsimd_sleef_sincospi_u35_scalar_f32 #define xsinpi_u05 nsimd_sleef_sinpi_u05_scalar_f64 #define xsinpif_u05 nsimd_sleef_sinpi_u05_scalar_f32 #define xcospi_u05 nsimd_sleef_cospi_u05_scalar_f64 #define xcospif_u05 nsimd_sleef_cospi_u05_scalar_f32 #define xldexp nsimd_sleef_ldexp_scalar_f64 #define xldexpf nsimd_sleef_ldexp_scalar_f32 #define xilogb nsimd_sleef_ilogb_scalar_f64 #define xilogbf nsimd_sleef_ilogb_scalar_f32 #define xfma nsimd_sleef_fma_scalar_f64 #define xfmaf nsimd_sleef_fma_scalar_f32 #define xsqrt nsimd_sleef_sqrt_scalar_f64 #define xsqrtf nsimd_sleef_sqrt_scalar_f32 #define xsqrt_u05 nsimd_sleef_sqrt_u05_scalar_f64 #define xsqrtf_u05 nsimd_sleef_sqrt_u05_scalar_f32 #define xsqrt_u35 nsimd_sleef_sqrt_u35_scalar_f64 #define xsqrtf_u35 nsimd_sleef_sqrt_u35_scalar_f32 #define xhypot_u05 nsimd_sleef_hypot_u05_scalar_f64 #define xhypotf_u05 nsimd_sleef_hypot_u05_scalar_f32 #define xhypot_u35 nsimd_sleef_hypot_u35_scalar_f64 #define xhypotf_u35 nsimd_sleef_hypot_u35_scalar_f32 #define xfabs nsimd_sleef_fabs_scalar_f64 #define xfabsf nsimd_sleef_fabs_scalar_f32 #define xcopysign nsimd_sleef_copysign_scalar_f64 #define xcopysignf nsimd_sleef_copysign_scalar_f32 #define xfmax nsimd_sleef_fmax_scalar_f64 #define xfmaxf nsimd_sleef_fmax_scalar_f32 #define 
xfmin nsimd_sleef_fmin_scalar_f64 #define xfminf nsimd_sleef_fmin_scalar_f32 #define xfdim nsimd_sleef_fdim_scalar_f64 #define xfdimf nsimd_sleef_fdim_scalar_f32 #define xtrunc nsimd_sleef_trunc_scalar_f64 #define xtruncf nsimd_sleef_trunc_scalar_f32 #define xfloor nsimd_sleef_floor_scalar_f64 #define xfloorf nsimd_sleef_floor_scalar_f32 #define xceil nsimd_sleef_ceil_scalar_f64 #define xceilf nsimd_sleef_ceil_scalar_f32 #define xround nsimd_sleef_round_scalar_f64 #define xroundf nsimd_sleef_round_scalar_f32 #define xrint nsimd_sleef_rint_scalar_f64 #define xrintf nsimd_sleef_rint_scalar_f32 #define xnextafter nsimd_sleef_nextafter_scalar_f64 #define xnextafterf nsimd_sleef_nextafter_scalar_f32 #define xfrfrexp nsimd_sleef_frfrexp_scalar_f64 #define xfrfrexpf nsimd_sleef_frfrexp_scalar_f32 #define xexpfrexp nsimd_sleef_expfrexp_scalar_f64 #define xexpfrexpf nsimd_sleef_expfrexp_scalar_f32 #define xfmod nsimd_sleef_fmod_scalar_f64 #define xfmodf nsimd_sleef_fmod_scalar_f32 #define xremainder nsimd_sleef_remainder_scalar_f64 #define xremainderf nsimd_sleef_remainder_scalar_f32 #define xmodf nsimd_sleef_modf_scalar_f64 #define xmodff nsimd_sleef_modf_scalar_f32 #define xlgamma_u1 nsimd_sleef_lgamma_u10_scalar_f64 #define xlgammaf_u1 nsimd_sleef_lgamma_u10_scalar_f32 #define xtgamma_u1 nsimd_sleef_tgamma_u10_scalar_f64 #define xtgammaf_u1 nsimd_sleef_tgamma_u10_scalar_f32 #define xerf_u1 nsimd_sleef_erf_u10_scalar_f64 #define xerff_u1 nsimd_sleef_erf_u10_scalar_f32 #define xerfc_u15 nsimd_sleef_erfc_u15_scalar_f64 #define xerfcf_u15 nsimd_sleef_erfc_u15_scalar_f32 #define xgetInt nsimd_sleef_getInt_scalar_f64 #define xgetIntf nsimd_sleef_getInt_scalar_f32 #define xgetPtr nsimd_sleef_getPtr_scalar_f64 #define xgetPtrf nsimd_sleef_getPtr_scalar_f32 #endif #define rempi nsimd_sleef_rempi_scalar #define rempif nsimd_sleef_rempif_scalar #define rempisub nsimd_sleef_rempisub_scalar #define rempisubf nsimd_sleef_rempisubf_scalar #define gammak nsimd_gammak_scalar #define 
gammafk nsimd_gammafk_scalar #endif ================================================ FILE: src/renameadvsimd.h ================================================ #ifndef RENAMEADVSIMD_H #define RENAMEADVSIMD_H /* ------------------------------------------------------------------------- */ /* Naming of functions aarch64 */ #ifdef NSIMD_AARCH64 #ifdef DETERMINISTIC #define xsin nsimd_sleef_sin_u35d_aarch64_f64 #define xsinf nsimd_sleef_sin_u35d_aarch64_f32 #define xcos nsimd_sleef_cos_u35d_aarch64_f64 #define xcosf nsimd_sleef_cos_u35d_aarch64_f32 #define xsincos nsimd_sleef_sincos_u35d_aarch64_f64 #define xsincosf nsimd_sleef_sincos_u35d_aarch64_f32 #define xtan nsimd_sleef_tan_u35d_aarch64_f64 #define xtanf nsimd_sleef_tan_u35d_aarch64_f32 #define xasin nsimd_sleef_asin_u35d_aarch64_f64 #define xasinf nsimd_sleef_asin_u35d_aarch64_f32 #define xacos nsimd_sleef_acos_u35d_aarch64_f64 #define xacosf nsimd_sleef_acos_u35d_aarch64_f32 #define xatan nsimd_sleef_atan_u35d_aarch64_f64 #define xatanf nsimd_sleef_atan_u35d_aarch64_f32 #define xatan2 nsimd_sleef_atan2_u35d_aarch64_f64 #define xatan2f nsimd_sleef_atan2_u35d_aarch64_f32 #define xlog nsimd_sleef_log_u35d_aarch64_f64 #define xlogf nsimd_sleef_log_u35d_aarch64_f32 #define xcbrt nsimd_sleef_cbrt_u35d_aarch64_f64 #define xcbrtf nsimd_sleef_cbrt_u35d_aarch64_f32 #define xsin_u1 nsimd_sleef_sin_u10d_aarch64_f64 #define xsinf_u1 nsimd_sleef_sin_u10d_aarch64_f32 #define xcos_u1 nsimd_sleef_cos_u10d_aarch64_f64 #define xcosf_u1 nsimd_sleef_cos_u10d_aarch64_f32 #define xsincos_u1 nsimd_sleef_sincos_u10d_aarch64_f64 #define xsincosf_u1 nsimd_sleef_sincos_u10d_aarch64_f32 #define xtan_u1 nsimd_sleef_tan_u10d_aarch64_f64 #define xtanf_u1 nsimd_sleef_tan_u10d_aarch64_f32 #define xasin_u1 nsimd_sleef_asin_u10d_aarch64_f64 #define xasinf_u1 nsimd_sleef_asin_u10d_aarch64_f32 #define xacos_u1 nsimd_sleef_acos_u10d_aarch64_f64 #define xacosf_u1 nsimd_sleef_acos_u10d_aarch64_f32 #define xatan_u1 nsimd_sleef_atan_u10d_aarch64_f64 
#define xatanf_u1 nsimd_sleef_atan_u10d_aarch64_f32 #define xatan2_u1 nsimd_sleef_atan2_u10d_aarch64_f64 #define xatan2f_u1 nsimd_sleef_atan2_u10d_aarch64_f32 #define xlog_u1 nsimd_sleef_log_u10d_aarch64_f64 #define xlogf_u1 nsimd_sleef_log_u10d_aarch64_f32 #define xcbrt_u1 nsimd_sleef_cbrt_u10d_aarch64_f64 #define xcbrtf_u1 nsimd_sleef_cbrt_u10d_aarch64_f32 #define xexp nsimd_sleef_exp_u10d_aarch64_f64 #define xexpf nsimd_sleef_exp_u10d_aarch64_f32 #define xpow nsimd_sleef_pow_u10d_aarch64_f64 #define xpowf nsimd_sleef_pow_u10d_aarch64_f32 #define xsinh nsimd_sleef_sinh_u10d_aarch64_f64 #define xsinhf nsimd_sleef_sinh_u10d_aarch64_f32 #define xcosh nsimd_sleef_cosh_u10d_aarch64_f64 #define xcoshf nsimd_sleef_cosh_u10d_aarch64_f32 #define xtanh nsimd_sleef_tanh_u10d_aarch64_f64 #define xtanhf nsimd_sleef_tanh_u10d_aarch64_f32 #define xsinh_u35 nsimd_sleef_sinh_u35d_aarch64_f64 #define xsinhf_u35 nsimd_sleef_sinh_u35d_aarch64_f32 #define xcosh_u35 nsimd_sleef_cosh_u35d_aarch64_f64 #define xcoshf_u35 nsimd_sleef_cosh_u35d_aarch64_f32 #define xtanh_u35 nsimd_sleef_tanh_u35d_aarch64_f64 #define xtanhf_u35 nsimd_sleef_tanh_u35d_aarch64_f32 #define xfastsin_u3500 nsimd_sleef_fastsin_u3500d_aarch64_f64 #define xfastsinf_u3500 nsimd_sleef_fastsin_u3500d_aarch64_f32 #define xfastcos_u3500 nsimd_sleef_fastcos_u3500d_aarch64_f64 #define xfastcosf_u3500 nsimd_sleef_fastcos_u3500d_aarch64_f32 #define xfastpow_u3500 nsimd_sleef_fastpow_u3500d_aarch64_f64 #define xfastpowf_u3500 nsimd_sleef_fastpow_u3500d_aarch64_f32 #define xasinh nsimd_sleef_asinh_u10d_aarch64_f64 #define xasinhf nsimd_sleef_asinh_u10d_aarch64_f32 #define xacosh nsimd_sleef_acosh_u10d_aarch64_f64 #define xacoshf nsimd_sleef_acosh_u10d_aarch64_f32 #define xatanh nsimd_sleef_atanh_u10d_aarch64_f64 #define xatanhf nsimd_sleef_atanh_u10d_aarch64_f32 #define xexp2 nsimd_sleef_exp2_u10d_aarch64_f64 #define xexp2f nsimd_sleef_exp2_u10d_aarch64_f32 #define xexp2_u35 nsimd_sleef_exp2_u35d_aarch64_f64 #define xexp2f_u35 
nsimd_sleef_exp2_u35d_aarch64_f32 #define xexp10 nsimd_sleef_exp10_u10d_aarch64_f64 #define xexp10f nsimd_sleef_exp10_u10d_aarch64_f32 #define xexp10_u35 nsimd_sleef_exp10_u35d_aarch64_f64 #define xexp10f_u35 nsimd_sleef_exp10_u35d_aarch64_f32 #define xexpm1 nsimd_sleef_expm1_u10d_aarch64_f64 #define xexpm1f nsimd_sleef_expm1_u10d_aarch64_f32 #define xlog10 nsimd_sleef_log10_u10d_aarch64_f64 #define xlog10f nsimd_sleef_log10_u10d_aarch64_f32 #define xlog2 nsimd_sleef_log2_u10d_aarch64_f64 #define xlog2f nsimd_sleef_log2_u10d_aarch64_f32 #define xlog2_u35 nsimd_sleef_log2_u35d_aarch64_f64 #define xlog2f_u35 nsimd_sleef_log2_u35d_aarch64_f32 #define xlog1p nsimd_sleef_log1p_u10d_aarch64_f64 #define xlog1pf nsimd_sleef_log1p_u10d_aarch64_f32 #define xsincospi_u05 nsimd_sleef_sincospi_u05d_aarch64_f64 #define xsincospif_u05 nsimd_sleef_sincospi_u05d_aarch64_f32 #define xsincospi_u35 nsimd_sleef_sincospi_u35d_aarch64_f64 #define xsincospif_u35 nsimd_sleef_sincospi_u35d_aarch64_f32 #define xsinpi_u05 nsimd_sleef_sinpi_u05d_aarch64_f64 #define xsinpif_u05 nsimd_sleef_sinpi_u05d_aarch64_f32 #define xcospi_u05 nsimd_sleef_cospi_u05d_aarch64_f64 #define xcospif_u05 nsimd_sleef_cospi_u05d_aarch64_f32 #define xldexp nsimd_sleef_ldexp_aarch64_f64 #define xldexpf nsimd_sleef_ldexp_aarch64_f32 #define xilogb nsimd_sleef_ilogb_aarch64_f64 #define xilogbf nsimd_sleef_ilogb_aarch64_f32 #define xfma nsimd_sleef_fma_aarch64_f64 #define xfmaf nsimd_sleef_fma_aarch64_f32 #define xsqrt nsimd_sleef_sqrt_aarch64_f64 #define xsqrtf nsimd_sleef_sqrt_aarch64_f32 #define xsqrt_u05 nsimd_sleef_sqrt_u05d_aarch64_f64 #define xsqrtf_u05 nsimd_sleef_sqrt_u05d_aarch64_f32 #define xsqrt_u35 nsimd_sleef_sqrt_u35d_aarch64_f64 #define xsqrtf_u35 nsimd_sleef_sqrt_u35d_aarch64_f32 #define xhypot_u05 nsimd_sleef_hypot_u05d_aarch64_f64 #define xhypotf_u05 nsimd_sleef_hypot_u05d_aarch64_f32 #define xhypot_u35 nsimd_sleef_hypot_u35d_aarch64_f64 #define xhypotf_u35 nsimd_sleef_hypot_u35d_aarch64_f32 #define 
xfabs nsimd_sleef_fabs_aarch64_f64 #define xfabsf nsimd_sleef_fabs_aarch64_f32 #define xcopysign nsimd_sleef_copysign_aarch64_f64 #define xcopysignf nsimd_sleef_copysign_aarch64_f32 #define xfmax nsimd_sleef_fmax_aarch64_f64 #define xfmaxf nsimd_sleef_fmax_aarch64_f32 #define xfmin nsimd_sleef_fmin_aarch64_f64 #define xfminf nsimd_sleef_fmin_aarch64_f32 #define xfdim nsimd_sleef_fdim_aarch64_f64 #define xfdimf nsimd_sleef_fdim_aarch64_f32 #define xtrunc nsimd_sleef_trunc_aarch64_f64 #define xtruncf nsimd_sleef_trunc_aarch64_f32 #define xfloor nsimd_sleef_floor_aarch64_f64 #define xfloorf nsimd_sleef_floor_aarch64_f32 #define xceil nsimd_sleef_ceil_aarch64_f64 #define xceilf nsimd_sleef_ceil_aarch64_f32 #define xround nsimd_sleef_round_aarch64_f64 #define xroundf nsimd_sleef_round_aarch64_f32 #define xrint nsimd_sleef_rint_aarch64_f64 #define xrintf nsimd_sleef_rint_aarch64_f32 #define xnextafter nsimd_sleef_nextafter_aarch64_f64 #define xnextafterf nsimd_sleef_nextafter_aarch64_f32 #define xfrfrexp nsimd_sleef_frfrexp_aarch64_f64 #define xfrfrexpf nsimd_sleef_frfrexp_aarch64_f32 #define xexpfrexp nsimd_sleef_expfrexp_aarch64_f64 #define xexpfrexpf nsimd_sleef_expfrexp_aarch64_f32 #define xfmod nsimd_sleef_fmod_aarch64_f64 #define xfmodf nsimd_sleef_fmod_aarch64_f32 #define xremainder nsimd_sleef_remainder_aarch64_f64 #define xremainderf nsimd_sleef_remainder_aarch64_f32 #define xmodf nsimd_sleef_modf_aarch64_f64 #define xmodff nsimd_sleef_modf_aarch64_f32 #define xlgamma_u1 nsimd_sleef_lgamma_u10d_aarch64_f64 #define xlgammaf_u1 nsimd_sleef_lgamma_u10d_aarch64_f32 #define xtgamma_u1 nsimd_sleef_tgamma_u10d_aarch64_f64 #define xtgammaf_u1 nsimd_sleef_tgamma_u10d_aarch64_f32 #define xerf_u1 nsimd_sleef_erf_u10d_aarch64_f64 #define xerff_u1 nsimd_sleef_erf_u10d_aarch64_f32 #define xerfc_u15 nsimd_sleef_erfc_u15d_aarch64_f64 #define xerfcf_u15 nsimd_sleef_erfc_u15d_aarch64_f32 #define xgetInt nsimd_sleef_getInt_aarch64_f64 #define xgetIntf 
nsimd_sleef_getInt_aarch64_f32 #define xgetPtr nsimd_sleef_getPtr_aarch64_f64 #define xgetPtrf nsimd_sleef_getPtr_aarch64_f32 #else #define xsin nsimd_sleef_sin_u35_aarch64_f64 #define xsinf nsimd_sleef_sin_u35_aarch64_f32 #define xcos nsimd_sleef_cos_u35_aarch64_f64 #define xcosf nsimd_sleef_cos_u35_aarch64_f32 #define xsincos nsimd_sleef_sincos_u35_aarch64_f64 #define xsincosf nsimd_sleef_sincos_u35_aarch64_f32 #define xtan nsimd_sleef_tan_u35_aarch64_f64 #define xtanf nsimd_sleef_tan_u35_aarch64_f32 #define xasin nsimd_sleef_asin_u35_aarch64_f64 #define xasinf nsimd_sleef_asin_u35_aarch64_f32 #define xacos nsimd_sleef_acos_u35_aarch64_f64 #define xacosf nsimd_sleef_acos_u35_aarch64_f32 #define xatan nsimd_sleef_atan_u35_aarch64_f64 #define xatanf nsimd_sleef_atan_u35_aarch64_f32 #define xatan2 nsimd_sleef_atan2_u35_aarch64_f64 #define xatan2f nsimd_sleef_atan2_u35_aarch64_f32 #define xlog nsimd_sleef_log_u35_aarch64_f64 #define xlogf nsimd_sleef_log_u35_aarch64_f32 #define xcbrt nsimd_sleef_cbrt_u35_aarch64_f64 #define xcbrtf nsimd_sleef_cbrt_u35_aarch64_f32 #define xsin_u1 nsimd_sleef_sin_u10_aarch64_f64 #define xsinf_u1 nsimd_sleef_sin_u10_aarch64_f32 #define xcos_u1 nsimd_sleef_cos_u10_aarch64_f64 #define xcosf_u1 nsimd_sleef_cos_u10_aarch64_f32 #define xsincos_u1 nsimd_sleef_sincos_u10_aarch64_f64 #define xsincosf_u1 nsimd_sleef_sincos_u10_aarch64_f32 #define xtan_u1 nsimd_sleef_tan_u10_aarch64_f64 #define xtanf_u1 nsimd_sleef_tan_u10_aarch64_f32 #define xasin_u1 nsimd_sleef_asin_u10_aarch64_f64 #define xasinf_u1 nsimd_sleef_asin_u10_aarch64_f32 #define xacos_u1 nsimd_sleef_acos_u10_aarch64_f64 #define xacosf_u1 nsimd_sleef_acos_u10_aarch64_f32 #define xatan_u1 nsimd_sleef_atan_u10_aarch64_f64 #define xatanf_u1 nsimd_sleef_atan_u10_aarch64_f32 #define xatan2_u1 nsimd_sleef_atan2_u10_aarch64_f64 #define xatan2f_u1 nsimd_sleef_atan2_u10_aarch64_f32 #define xlog_u1 nsimd_sleef_log_u10_aarch64_f64 #define xlogf_u1 nsimd_sleef_log_u10_aarch64_f32 #define xcbrt_u1 
nsimd_sleef_cbrt_u10_aarch64_f64 #define xcbrtf_u1 nsimd_sleef_cbrt_u10_aarch64_f32 #define xexp nsimd_sleef_exp_u10_aarch64_f64 #define xexpf nsimd_sleef_exp_u10_aarch64_f32 #define xpow nsimd_sleef_pow_u10_aarch64_f64 #define xpowf nsimd_sleef_pow_u10_aarch64_f32 #define xsinh nsimd_sleef_sinh_u10_aarch64_f64 #define xsinhf nsimd_sleef_sinh_u10_aarch64_f32 #define xcosh nsimd_sleef_cosh_u10_aarch64_f64 #define xcoshf nsimd_sleef_cosh_u10_aarch64_f32 #define xtanh nsimd_sleef_tanh_u10_aarch64_f64 #define xtanhf nsimd_sleef_tanh_u10_aarch64_f32 #define xsinh_u35 nsimd_sleef_sinh_u35_aarch64_f64 #define xsinhf_u35 nsimd_sleef_sinh_u35_aarch64_f32 #define xcosh_u35 nsimd_sleef_cosh_u35_aarch64_f64 #define xcoshf_u35 nsimd_sleef_cosh_u35_aarch64_f32 #define xtanh_u35 nsimd_sleef_tanh_u35_aarch64_f64 #define xtanhf_u35 nsimd_sleef_tanh_u35_aarch64_f32 #define xfastsin_u3500 nsimd_sleef_fastsin_u3500_aarch64_f64 #define xfastsinf_u3500 nsimd_sleef_fastsin_u3500_aarch64_f32 #define xfastcos_u3500 nsimd_sleef_fastcos_u3500_aarch64_f64 #define xfastcosf_u3500 nsimd_sleef_fastcos_u3500_aarch64_f32 #define xfastpow_u3500 nsimd_sleef_fastpow_u3500_aarch64_f64 #define xfastpowf_u3500 nsimd_sleef_fastpow_u3500_aarch64_f32 #define xasinh nsimd_sleef_asinh_u10_aarch64_f64 #define xasinhf nsimd_sleef_asinh_u10_aarch64_f32 #define xacosh nsimd_sleef_acosh_u10_aarch64_f64 #define xacoshf nsimd_sleef_acosh_u10_aarch64_f32 #define xatanh nsimd_sleef_atanh_u10_aarch64_f64 #define xatanhf nsimd_sleef_atanh_u10_aarch64_f32 #define xexp2 nsimd_sleef_exp2_u10_aarch64_f64 #define xexp2f nsimd_sleef_exp2_u10_aarch64_f32 #define xexp2_u35 nsimd_sleef_exp2_u35_aarch64_f64 #define xexp2f_u35 nsimd_sleef_exp2_u35_aarch64_f32 #define xexp10 nsimd_sleef_exp10_u10_aarch64_f64 #define xexp10f nsimd_sleef_exp10_u10_aarch64_f32 #define xexp10_u35 nsimd_sleef_exp10_u35_aarch64_f64 #define xexp10f_u35 nsimd_sleef_exp10_u35_aarch64_f32 #define xexpm1 nsimd_sleef_expm1_u10_aarch64_f64 #define xexpm1f 
nsimd_sleef_expm1_u10_aarch64_f32 #define xlog10 nsimd_sleef_log10_u10_aarch64_f64 #define xlog10f nsimd_sleef_log10_u10_aarch64_f32 #define xlog2 nsimd_sleef_log2_u10_aarch64_f64 #define xlog2f nsimd_sleef_log2_u10_aarch64_f32 #define xlog2_u35 nsimd_sleef_log2_u35_aarch64_f64 #define xlog2f_u35 nsimd_sleef_log2_u35_aarch64_f32 #define xlog1p nsimd_sleef_log1p_u10_aarch64_f64 #define xlog1pf nsimd_sleef_log1p_u10_aarch64_f32 #define xsincospi_u05 nsimd_sleef_sincospi_u05_aarch64_f64 #define xsincospif_u05 nsimd_sleef_sincospi_u05_aarch64_f32 #define xsincospi_u35 nsimd_sleef_sincospi_u35_aarch64_f64 #define xsincospif_u35 nsimd_sleef_sincospi_u35_aarch64_f32 #define xsinpi_u05 nsimd_sleef_sinpi_u05_aarch64_f64 #define xsinpif_u05 nsimd_sleef_sinpi_u05_aarch64_f32 #define xcospi_u05 nsimd_sleef_cospi_u05_aarch64_f64 #define xcospif_u05 nsimd_sleef_cospi_u05_aarch64_f32 #define xldexp nsimd_sleef_ldexp_aarch64_f64 #define xldexpf nsimd_sleef_ldexp_aarch64_f32 #define xilogb nsimd_sleef_ilogb_aarch64_f64 #define xilogbf nsimd_sleef_ilogb_aarch64_f32 #define xfma nsimd_sleef_fma_aarch64_f64 #define xfmaf nsimd_sleef_fma_aarch64_f32 #define xsqrt nsimd_sleef_sqrt_aarch64_f64 #define xsqrtf nsimd_sleef_sqrt_aarch64_f32 #define xsqrt_u05 nsimd_sleef_sqrt_u05_aarch64_f64 #define xsqrtf_u05 nsimd_sleef_sqrt_u05_aarch64_f32 #define xsqrt_u35 nsimd_sleef_sqrt_u35_aarch64_f64 #define xsqrtf_u35 nsimd_sleef_sqrt_u35_aarch64_f32 #define xhypot_u05 nsimd_sleef_hypot_u05_aarch64_f64 #define xhypotf_u05 nsimd_sleef_hypot_u05_aarch64_f32 #define xhypot_u35 nsimd_sleef_hypot_u35_aarch64_f64 #define xhypotf_u35 nsimd_sleef_hypot_u35_aarch64_f32 #define xfabs nsimd_sleef_fabs_aarch64_f64 #define xfabsf nsimd_sleef_fabs_aarch64_f32 #define xcopysign nsimd_sleef_copysign_aarch64_f64 #define xcopysignf nsimd_sleef_copysign_aarch64_f32 #define xfmax nsimd_sleef_fmax_aarch64_f64 #define xfmaxf nsimd_sleef_fmax_aarch64_f32 #define xfmin nsimd_sleef_fmin_aarch64_f64 #define xfminf 
nsimd_sleef_fmin_aarch64_f32 #define xfdim nsimd_sleef_fdim_aarch64_f64 #define xfdimf nsimd_sleef_fdim_aarch64_f32 #define xtrunc nsimd_sleef_trunc_aarch64_f64 #define xtruncf nsimd_sleef_trunc_aarch64_f32 #define xfloor nsimd_sleef_floor_aarch64_f64 #define xfloorf nsimd_sleef_floor_aarch64_f32 #define xceil nsimd_sleef_ceil_aarch64_f64 #define xceilf nsimd_sleef_ceil_aarch64_f32 #define xround nsimd_sleef_round_aarch64_f64 #define xroundf nsimd_sleef_round_aarch64_f32 #define xrint nsimd_sleef_rint_aarch64_f64 #define xrintf nsimd_sleef_rint_aarch64_f32 #define xnextafter nsimd_sleef_nextafter_aarch64_f64 #define xnextafterf nsimd_sleef_nextafter_aarch64_f32 #define xfrfrexp nsimd_sleef_frfrexp_aarch64_f64 #define xfrfrexpf nsimd_sleef_frfrexp_aarch64_f32 #define xexpfrexp nsimd_sleef_expfrexp_aarch64_f64 #define xexpfrexpf nsimd_sleef_expfrexp_aarch64_f32 #define xfmod nsimd_sleef_fmod_aarch64_f64 #define xfmodf nsimd_sleef_fmod_aarch64_f32 #define xremainder nsimd_sleef_remainder_aarch64_f64 #define xremainderf nsimd_sleef_remainder_aarch64_f32 #define xmodf nsimd_sleef_modf_aarch64_f64 #define xmodff nsimd_sleef_modf_aarch64_f32 #define xlgamma_u1 nsimd_sleef_lgamma_u10_aarch64_f64 #define xlgammaf_u1 nsimd_sleef_lgamma_u10_aarch64_f32 #define xtgamma_u1 nsimd_sleef_tgamma_u10_aarch64_f64 #define xtgammaf_u1 nsimd_sleef_tgamma_u10_aarch64_f32 #define xerf_u1 nsimd_sleef_erf_u10_aarch64_f64 #define xerff_u1 nsimd_sleef_erf_u10_aarch64_f32 #define xerfc_u15 nsimd_sleef_erfc_u15_aarch64_f64 #define xerfcf_u15 nsimd_sleef_erfc_u15_aarch64_f32 #define xgetInt nsimd_sleef_getInt_aarch64_f64 #define xgetIntf nsimd_sleef_getInt_aarch64_f32 #define xgetPtr nsimd_sleef_getPtr_aarch64_f64 #define xgetPtrf nsimd_sleef_getPtr_aarch64_f32 #endif #define rempi nsimd_sleef_rempi_aarch64 #define rempif nsimd_sleef_rempif_aarch64 #define rempisub nsimd_sleef_rempisub_aarch64 #define rempisubf nsimd_sleef_rempisubf_aarch64 #define gammak nsimd_gammak_aarch64 #define gammafk 
nsimd_gammafk_aarch64 #endif #endif ================================================ FILE: src/renameavx.h ================================================ #ifndef RENAMEAVX_H #define RENAMEAVX_H /* ------------------------------------------------------------------------- */ /* Naming of functions avx */ #ifdef NSIMD_AVX #ifdef DETERMINISTIC #define xsin nsimd_sleef_sin_u35d_avx_f64 #define xsinf nsimd_sleef_sin_u35d_avx_f32 #define xcos nsimd_sleef_cos_u35d_avx_f64 #define xcosf nsimd_sleef_cos_u35d_avx_f32 #define xsincos nsimd_sleef_sincos_u35d_avx_f64 #define xsincosf nsimd_sleef_sincos_u35d_avx_f32 #define xtan nsimd_sleef_tan_u35d_avx_f64 #define xtanf nsimd_sleef_tan_u35d_avx_f32 #define xasin nsimd_sleef_asin_u35d_avx_f64 #define xasinf nsimd_sleef_asin_u35d_avx_f32 #define xacos nsimd_sleef_acos_u35d_avx_f64 #define xacosf nsimd_sleef_acos_u35d_avx_f32 #define xatan nsimd_sleef_atan_u35d_avx_f64 #define xatanf nsimd_sleef_atan_u35d_avx_f32 #define xatan2 nsimd_sleef_atan2_u35d_avx_f64 #define xatan2f nsimd_sleef_atan2_u35d_avx_f32 #define xlog nsimd_sleef_log_u35d_avx_f64 #define xlogf nsimd_sleef_log_u35d_avx_f32 #define xcbrt nsimd_sleef_cbrt_u35d_avx_f64 #define xcbrtf nsimd_sleef_cbrt_u35d_avx_f32 #define xsin_u1 nsimd_sleef_sin_u10d_avx_f64 #define xsinf_u1 nsimd_sleef_sin_u10d_avx_f32 #define xcos_u1 nsimd_sleef_cos_u10d_avx_f64 #define xcosf_u1 nsimd_sleef_cos_u10d_avx_f32 #define xsincos_u1 nsimd_sleef_sincos_u10d_avx_f64 #define xsincosf_u1 nsimd_sleef_sincos_u10d_avx_f32 #define xtan_u1 nsimd_sleef_tan_u10d_avx_f64 #define xtanf_u1 nsimd_sleef_tan_u10d_avx_f32 #define xasin_u1 nsimd_sleef_asin_u10d_avx_f64 #define xasinf_u1 nsimd_sleef_asin_u10d_avx_f32 #define xacos_u1 nsimd_sleef_acos_u10d_avx_f64 #define xacosf_u1 nsimd_sleef_acos_u10d_avx_f32 #define xatan_u1 nsimd_sleef_atan_u10d_avx_f64 #define xatanf_u1 nsimd_sleef_atan_u10d_avx_f32 #define xatan2_u1 nsimd_sleef_atan2_u10d_avx_f64 #define xatan2f_u1 nsimd_sleef_atan2_u10d_avx_f32 #define 
xlog_u1 nsimd_sleef_log_u10d_avx_f64 #define xlogf_u1 nsimd_sleef_log_u10d_avx_f32 #define xcbrt_u1 nsimd_sleef_cbrt_u10d_avx_f64 #define xcbrtf_u1 nsimd_sleef_cbrt_u10d_avx_f32 #define xexp nsimd_sleef_exp_u10d_avx_f64 #define xexpf nsimd_sleef_exp_u10d_avx_f32 #define xpow nsimd_sleef_pow_u10d_avx_f64 #define xpowf nsimd_sleef_pow_u10d_avx_f32 #define xsinh nsimd_sleef_sinh_u10d_avx_f64 #define xsinhf nsimd_sleef_sinh_u10d_avx_f32 #define xcosh nsimd_sleef_cosh_u10d_avx_f64 #define xcoshf nsimd_sleef_cosh_u10d_avx_f32 #define xtanh nsimd_sleef_tanh_u10d_avx_f64 #define xtanhf nsimd_sleef_tanh_u10d_avx_f32 #define xsinh_u35 nsimd_sleef_sinh_u35d_avx_f64 #define xsinhf_u35 nsimd_sleef_sinh_u35d_avx_f32 #define xcosh_u35 nsimd_sleef_cosh_u35d_avx_f64 #define xcoshf_u35 nsimd_sleef_cosh_u35d_avx_f32 #define xtanh_u35 nsimd_sleef_tanh_u35d_avx_f64 #define xtanhf_u35 nsimd_sleef_tanh_u35d_avx_f32 #define xfastsin_u3500 nsimd_sleef_fastsin_u3500d_avx_f64 #define xfastsinf_u3500 nsimd_sleef_fastsin_u3500d_avx_f32 #define xfastcos_u3500 nsimd_sleef_fastcos_u3500d_avx_f64 #define xfastcosf_u3500 nsimd_sleef_fastcos_u3500d_avx_f32 #define xfastpow_u3500 nsimd_sleef_fastpow_u3500d_avx_f64 #define xfastpowf_u3500 nsimd_sleef_fastpow_u3500d_avx_f32 #define xasinh nsimd_sleef_asinh_u10d_avx_f64 #define xasinhf nsimd_sleef_asinh_u10d_avx_f32 #define xacosh nsimd_sleef_acosh_u10d_avx_f64 #define xacoshf nsimd_sleef_acosh_u10d_avx_f32 #define xatanh nsimd_sleef_atanh_u10d_avx_f64 #define xatanhf nsimd_sleef_atanh_u10d_avx_f32 #define xexp2 nsimd_sleef_exp2_u10d_avx_f64 #define xexp2f nsimd_sleef_exp2_u10d_avx_f32 #define xexp2_u35 nsimd_sleef_exp2_u35d_avx_f64 #define xexp2f_u35 nsimd_sleef_exp2_u35d_avx_f32 #define xexp10 nsimd_sleef_exp10_u10d_avx_f64 #define xexp10f nsimd_sleef_exp10_u10d_avx_f32 #define xexp10_u35 nsimd_sleef_exp10_u35d_avx_f64 #define xexp10f_u35 nsimd_sleef_exp10_u35d_avx_f32 #define xexpm1 nsimd_sleef_expm1_u10d_avx_f64 #define xexpm1f 
nsimd_sleef_expm1_u10d_avx_f32 #define xlog10 nsimd_sleef_log10_u10d_avx_f64 #define xlog10f nsimd_sleef_log10_u10d_avx_f32 #define xlog2 nsimd_sleef_log2_u10d_avx_f64 #define xlog2f nsimd_sleef_log2_u10d_avx_f32 #define xlog2_u35 nsimd_sleef_log2_u35d_avx_f64 #define xlog2f_u35 nsimd_sleef_log2_u35d_avx_f32 #define xlog1p nsimd_sleef_log1p_u10d_avx_f64 #define xlog1pf nsimd_sleef_log1p_u10d_avx_f32 #define xsincospi_u05 nsimd_sleef_sincospi_u05d_avx_f64 #define xsincospif_u05 nsimd_sleef_sincospi_u05d_avx_f32 #define xsincospi_u35 nsimd_sleef_sincospi_u35d_avx_f64 #define xsincospif_u35 nsimd_sleef_sincospi_u35d_avx_f32 #define xsinpi_u05 nsimd_sleef_sinpi_u05d_avx_f64 #define xsinpif_u05 nsimd_sleef_sinpi_u05d_avx_f32 #define xcospi_u05 nsimd_sleef_cospi_u05d_avx_f64 #define xcospif_u05 nsimd_sleef_cospi_u05d_avx_f32 #define xldexp nsimd_sleef_ldexp_avx_f64 #define xldexpf nsimd_sleef_ldexp_avx_f32 #define xilogb nsimd_sleef_ilogb_avx_f64 #define xilogbf nsimd_sleef_ilogb_avx_f32 #define xfma nsimd_sleef_fma_avx_f64 #define xfmaf nsimd_sleef_fma_avx_f32 #define xsqrt nsimd_sleef_sqrt_avx_f64 #define xsqrtf nsimd_sleef_sqrt_avx_f32 #define xsqrt_u05 nsimd_sleef_sqrt_u05d_avx_f64 #define xsqrtf_u05 nsimd_sleef_sqrt_u05d_avx_f32 #define xsqrt_u35 nsimd_sleef_sqrt_u35d_avx_f64 #define xsqrtf_u35 nsimd_sleef_sqrt_u35d_avx_f32 #define xhypot_u05 nsimd_sleef_hypot_u05d_avx_f64 #define xhypotf_u05 nsimd_sleef_hypot_u05d_avx_f32 #define xhypot_u35 nsimd_sleef_hypot_u35d_avx_f64 #define xhypotf_u35 nsimd_sleef_hypot_u35d_avx_f32 #define xfabs nsimd_sleef_fabs_avx_f64 #define xfabsf nsimd_sleef_fabs_avx_f32 #define xcopysign nsimd_sleef_copysign_avx_f64 #define xcopysignf nsimd_sleef_copysign_avx_f32 #define xfmax nsimd_sleef_fmax_avx_f64 #define xfmaxf nsimd_sleef_fmax_avx_f32 #define xfmin nsimd_sleef_fmin_avx_f64 #define xfminf nsimd_sleef_fmin_avx_f32 #define xfdim nsimd_sleef_fdim_avx_f64 #define xfdimf nsimd_sleef_fdim_avx_f32 #define xtrunc nsimd_sleef_trunc_avx_f64 
/* Slice of the SLEEF->nsimd rename table for AVX (src/renameavx.h).
 * Maps SLEEF-internal short names (xtruncf, xfloor, ...) onto nsimd's
 * exported symbol names of the form nsimd_sleef_<op>[_u<ulp>[d]]_avx_f{32,64}:
 * the "f"-suffixed short name is the f32 variant, the plain name is f64,
 * and the "d" in u10d/u15d marks the DETERMINISTIC build of that accuracy
 * level (u05/u10/u15/u35 encode the guaranteed ULP error bound).
 * This span finishes the `#ifdef DETERMINISTIC` branch (through xgetPtrf)
 * and, after `#else`, begins the default non-deterministic mappings.
 * NOTE(review): in the upstream header each #define sits on its own line;
 * the single-line layout seen here is an extraction artifact of this
 * concatenated dump — do not reflow without restoring one directive per
 * line for the whole file at once. */
#define xtruncf nsimd_sleef_trunc_avx_f32 #define xfloor nsimd_sleef_floor_avx_f64 #define xfloorf nsimd_sleef_floor_avx_f32 #define xceil nsimd_sleef_ceil_avx_f64 #define xceilf nsimd_sleef_ceil_avx_f32 #define xround nsimd_sleef_round_avx_f64 #define xroundf nsimd_sleef_round_avx_f32 #define xrint nsimd_sleef_rint_avx_f64 #define xrintf nsimd_sleef_rint_avx_f32 #define xnextafter nsimd_sleef_nextafter_avx_f64 #define xnextafterf nsimd_sleef_nextafter_avx_f32 #define xfrfrexp nsimd_sleef_frfrexp_avx_f64 #define xfrfrexpf nsimd_sleef_frfrexp_avx_f32 #define xexpfrexp nsimd_sleef_expfrexp_avx_f64 #define xexpfrexpf nsimd_sleef_expfrexp_avx_f32 #define xfmod nsimd_sleef_fmod_avx_f64 #define xfmodf nsimd_sleef_fmod_avx_f32 #define xremainder nsimd_sleef_remainder_avx_f64 #define xremainderf nsimd_sleef_remainder_avx_f32 #define xmodf nsimd_sleef_modf_avx_f64 #define xmodff nsimd_sleef_modf_avx_f32 #define xlgamma_u1 nsimd_sleef_lgamma_u10d_avx_f64 #define xlgammaf_u1 nsimd_sleef_lgamma_u10d_avx_f32 #define xtgamma_u1 nsimd_sleef_tgamma_u10d_avx_f64 #define xtgammaf_u1 nsimd_sleef_tgamma_u10d_avx_f32 #define xerf_u1 nsimd_sleef_erf_u10d_avx_f64 #define xerff_u1 nsimd_sleef_erf_u10d_avx_f32 #define xerfc_u15 nsimd_sleef_erfc_u15d_avx_f64 #define xerfcf_u15 nsimd_sleef_erfc_u15d_avx_f32 #define xgetInt nsimd_sleef_getInt_avx_f64 #define xgetIntf nsimd_sleef_getInt_avx_f32 #define xgetPtr nsimd_sleef_getPtr_avx_f64 #define xgetPtrf nsimd_sleef_getPtr_avx_f32 #else #define xsin nsimd_sleef_sin_u35_avx_f64 #define xsinf nsimd_sleef_sin_u35_avx_f32 #define xcos nsimd_sleef_cos_u35_avx_f64 #define xcosf nsimd_sleef_cos_u35_avx_f32 #define xsincos nsimd_sleef_sincos_u35_avx_f64 #define xsincosf nsimd_sleef_sincos_u35_avx_f32 #define xtan nsimd_sleef_tan_u35_avx_f64 #define xtanf nsimd_sleef_tan_u35_avx_f32 #define xasin nsimd_sleef_asin_u35_avx_f64 #define xasinf nsimd_sleef_asin_u35_avx_f32 #define xacos nsimd_sleef_acos_u35_avx_f64 #define xacosf nsimd_sleef_acos_u35_avx_f32
#define xatan nsimd_sleef_atan_u35_avx_f64 #define xatanf nsimd_sleef_atan_u35_avx_f32 #define xatan2 nsimd_sleef_atan2_u35_avx_f64 #define xatan2f nsimd_sleef_atan2_u35_avx_f32 #define xlog nsimd_sleef_log_u35_avx_f64 #define xlogf nsimd_sleef_log_u35_avx_f32 #define xcbrt nsimd_sleef_cbrt_u35_avx_f64 #define xcbrtf nsimd_sleef_cbrt_u35_avx_f32 #define xsin_u1 nsimd_sleef_sin_u10_avx_f64 #define xsinf_u1 nsimd_sleef_sin_u10_avx_f32 #define xcos_u1 nsimd_sleef_cos_u10_avx_f64 #define xcosf_u1 nsimd_sleef_cos_u10_avx_f32 #define xsincos_u1 nsimd_sleef_sincos_u10_avx_f64 #define xsincosf_u1 nsimd_sleef_sincos_u10_avx_f32 #define xtan_u1 nsimd_sleef_tan_u10_avx_f64 #define xtanf_u1 nsimd_sleef_tan_u10_avx_f32 #define xasin_u1 nsimd_sleef_asin_u10_avx_f64 #define xasinf_u1 nsimd_sleef_asin_u10_avx_f32 #define xacos_u1 nsimd_sleef_acos_u10_avx_f64 #define xacosf_u1 nsimd_sleef_acos_u10_avx_f32 #define xatan_u1 nsimd_sleef_atan_u10_avx_f64 #define xatanf_u1 nsimd_sleef_atan_u10_avx_f32 #define xatan2_u1 nsimd_sleef_atan2_u10_avx_f64 #define xatan2f_u1 nsimd_sleef_atan2_u10_avx_f32 #define xlog_u1 nsimd_sleef_log_u10_avx_f64 #define xlogf_u1 nsimd_sleef_log_u10_avx_f32 #define xcbrt_u1 nsimd_sleef_cbrt_u10_avx_f64 #define xcbrtf_u1 nsimd_sleef_cbrt_u10_avx_f32 #define xexp nsimd_sleef_exp_u10_avx_f64 #define xexpf nsimd_sleef_exp_u10_avx_f32 #define xpow nsimd_sleef_pow_u10_avx_f64 #define xpowf nsimd_sleef_pow_u10_avx_f32 #define xsinh nsimd_sleef_sinh_u10_avx_f64 #define xsinhf nsimd_sleef_sinh_u10_avx_f32 #define xcosh nsimd_sleef_cosh_u10_avx_f64 #define xcoshf nsimd_sleef_cosh_u10_avx_f32 #define xtanh nsimd_sleef_tanh_u10_avx_f64 #define xtanhf nsimd_sleef_tanh_u10_avx_f32 #define xsinh_u35 nsimd_sleef_sinh_u35_avx_f64 #define xsinhf_u35 nsimd_sleef_sinh_u35_avx_f32 #define xcosh_u35 nsimd_sleef_cosh_u35_avx_f64 #define xcoshf_u35 nsimd_sleef_cosh_u35_avx_f32 #define xtanh_u35 nsimd_sleef_tanh_u35_avx_f64 #define xtanhf_u35 nsimd_sleef_tanh_u35_avx_f32 #define 
xfastsin_u3500 nsimd_sleef_fastsin_u3500_avx_f64 #define xfastsinf_u3500 nsimd_sleef_fastsin_u3500_avx_f32 #define xfastcos_u3500 nsimd_sleef_fastcos_u3500_avx_f64 #define xfastcosf_u3500 nsimd_sleef_fastcos_u3500_avx_f32 #define xfastpow_u3500 nsimd_sleef_fastpow_u3500_avx_f64 #define xfastpowf_u3500 nsimd_sleef_fastpow_u3500_avx_f32 #define xasinh nsimd_sleef_asinh_u10_avx_f64 #define xasinhf nsimd_sleef_asinh_u10_avx_f32 #define xacosh nsimd_sleef_acosh_u10_avx_f64 #define xacoshf nsimd_sleef_acosh_u10_avx_f32 #define xatanh nsimd_sleef_atanh_u10_avx_f64 #define xatanhf nsimd_sleef_atanh_u10_avx_f32 #define xexp2 nsimd_sleef_exp2_u10_avx_f64 #define xexp2f nsimd_sleef_exp2_u10_avx_f32 #define xexp2_u35 nsimd_sleef_exp2_u35_avx_f64 #define xexp2f_u35 nsimd_sleef_exp2_u35_avx_f32 #define xexp10 nsimd_sleef_exp10_u10_avx_f64 #define xexp10f nsimd_sleef_exp10_u10_avx_f32 #define xexp10_u35 nsimd_sleef_exp10_u35_avx_f64 #define xexp10f_u35 nsimd_sleef_exp10_u35_avx_f32 #define xexpm1 nsimd_sleef_expm1_u10_avx_f64 #define xexpm1f nsimd_sleef_expm1_u10_avx_f32 #define xlog10 nsimd_sleef_log10_u10_avx_f64 #define xlog10f nsimd_sleef_log10_u10_avx_f32 #define xlog2 nsimd_sleef_log2_u10_avx_f64 #define xlog2f nsimd_sleef_log2_u10_avx_f32 #define xlog2_u35 nsimd_sleef_log2_u35_avx_f64 #define xlog2f_u35 nsimd_sleef_log2_u35_avx_f32 #define xlog1p nsimd_sleef_log1p_u10_avx_f64 #define xlog1pf nsimd_sleef_log1p_u10_avx_f32 #define xsincospi_u05 nsimd_sleef_sincospi_u05_avx_f64 #define xsincospif_u05 nsimd_sleef_sincospi_u05_avx_f32 #define xsincospi_u35 nsimd_sleef_sincospi_u35_avx_f64 #define xsincospif_u35 nsimd_sleef_sincospi_u35_avx_f32 #define xsinpi_u05 nsimd_sleef_sinpi_u05_avx_f64 #define xsinpif_u05 nsimd_sleef_sinpi_u05_avx_f32 #define xcospi_u05 nsimd_sleef_cospi_u05_avx_f64 #define xcospif_u05 nsimd_sleef_cospi_u05_avx_f32 #define xldexp nsimd_sleef_ldexp_avx_f64 #define xldexpf nsimd_sleef_ldexp_avx_f32 #define xilogb nsimd_sleef_ilogb_avx_f64 #define xilogbf 
nsimd_sleef_ilogb_avx_f32 #define xfma nsimd_sleef_fma_avx_f64 #define xfmaf nsimd_sleef_fma_avx_f32 #define xsqrt nsimd_sleef_sqrt_avx_f64 #define xsqrtf nsimd_sleef_sqrt_avx_f32 #define xsqrt_u05 nsimd_sleef_sqrt_u05_avx_f64 #define xsqrtf_u05 nsimd_sleef_sqrt_u05_avx_f32 #define xsqrt_u35 nsimd_sleef_sqrt_u35_avx_f64 #define xsqrtf_u35 nsimd_sleef_sqrt_u35_avx_f32 #define xhypot_u05 nsimd_sleef_hypot_u05_avx_f64 #define xhypotf_u05 nsimd_sleef_hypot_u05_avx_f32 #define xhypot_u35 nsimd_sleef_hypot_u35_avx_f64 #define xhypotf_u35 nsimd_sleef_hypot_u35_avx_f32 #define xfabs nsimd_sleef_fabs_avx_f64 #define xfabsf nsimd_sleef_fabs_avx_f32 #define xcopysign nsimd_sleef_copysign_avx_f64 #define xcopysignf nsimd_sleef_copysign_avx_f32 #define xfmax nsimd_sleef_fmax_avx_f64 #define xfmaxf nsimd_sleef_fmax_avx_f32 #define xfmin nsimd_sleef_fmin_avx_f64 #define xfminf nsimd_sleef_fmin_avx_f32 #define xfdim nsimd_sleef_fdim_avx_f64 #define xfdimf nsimd_sleef_fdim_avx_f32 #define xtrunc nsimd_sleef_trunc_avx_f64 #define xtruncf nsimd_sleef_trunc_avx_f32 #define xfloor nsimd_sleef_floor_avx_f64 #define xfloorf nsimd_sleef_floor_avx_f32 #define xceil nsimd_sleef_ceil_avx_f64 #define xceilf nsimd_sleef_ceil_avx_f32 #define xround nsimd_sleef_round_avx_f64 #define xroundf nsimd_sleef_round_avx_f32 #define xrint nsimd_sleef_rint_avx_f64 #define xrintf nsimd_sleef_rint_avx_f32 #define xnextafter nsimd_sleef_nextafter_avx_f64 #define xnextafterf nsimd_sleef_nextafter_avx_f32 #define xfrfrexp nsimd_sleef_frfrexp_avx_f64 #define xfrfrexpf nsimd_sleef_frfrexp_avx_f32 #define xexpfrexp nsimd_sleef_expfrexp_avx_f64 #define xexpfrexpf nsimd_sleef_expfrexp_avx_f32 #define xfmod nsimd_sleef_fmod_avx_f64 #define xfmodf nsimd_sleef_fmod_avx_f32 #define xremainder nsimd_sleef_remainder_avx_f64 #define xremainderf nsimd_sleef_remainder_avx_f32 #define xmodf nsimd_sleef_modf_avx_f64 #define xmodff nsimd_sleef_modf_avx_f32 #define xlgamma_u1 nsimd_sleef_lgamma_u10_avx_f64 #define xlgammaf_u1 
nsimd_sleef_lgamma_u10_avx_f32 #define xtgamma_u1 nsimd_sleef_tgamma_u10_avx_f64 #define xtgammaf_u1 nsimd_sleef_tgamma_u10_avx_f32 #define xerf_u1 nsimd_sleef_erf_u10_avx_f64 #define xerff_u1 nsimd_sleef_erf_u10_avx_f32 #define xerfc_u15 nsimd_sleef_erfc_u15_avx_f64 #define xerfcf_u15 nsimd_sleef_erfc_u15_avx_f32 #define xgetInt nsimd_sleef_getInt_avx_f64 #define xgetIntf nsimd_sleef_getInt_avx_f32 #define xgetPtr nsimd_sleef_getPtr_avx_f64 #define xgetPtrf nsimd_sleef_getPtr_avx_f32 #endif #define rempi nsimd_sleef_rempi_avx #define rempif nsimd_sleef_rempif_avx #define rempisub nsimd_sleef_rempisub_avx #define rempisubf nsimd_sleef_rempisubf_avx #define gammak nsimd_gammak_avx #define gammafk nsimd_gammafk_avx #endif #endif ================================================ FILE: src/renameavx2.h ================================================ #ifndef RENAMEAVX2_H #define RENAMEAVX2_H /* ------------------------------------------------------------------------- */ /* Naming of functions avx2 */ #ifdef NSIMD_AVX2 #ifdef DETERMINISTIC #define xsin nsimd_sleef_sin_u35d_avx2_f64 #define xsinf nsimd_sleef_sin_u35d_avx2_f32 #define xcos nsimd_sleef_cos_u35d_avx2_f64 #define xcosf nsimd_sleef_cos_u35d_avx2_f32 #define xsincos nsimd_sleef_sincos_u35d_avx2_f64 #define xsincosf nsimd_sleef_sincos_u35d_avx2_f32 #define xtan nsimd_sleef_tan_u35d_avx2_f64 #define xtanf nsimd_sleef_tan_u35d_avx2_f32 #define xasin nsimd_sleef_asin_u35d_avx2_f64 #define xasinf nsimd_sleef_asin_u35d_avx2_f32 #define xacos nsimd_sleef_acos_u35d_avx2_f64 #define xacosf nsimd_sleef_acos_u35d_avx2_f32 #define xatan nsimd_sleef_atan_u35d_avx2_f64 #define xatanf nsimd_sleef_atan_u35d_avx2_f32 #define xatan2 nsimd_sleef_atan2_u35d_avx2_f64 #define xatan2f nsimd_sleef_atan2_u35d_avx2_f32 #define xlog nsimd_sleef_log_u35d_avx2_f64 #define xlogf nsimd_sleef_log_u35d_avx2_f32 #define xcbrt nsimd_sleef_cbrt_u35d_avx2_f64 #define xcbrtf nsimd_sleef_cbrt_u35d_avx2_f32 #define xsin_u1 
nsimd_sleef_sin_u10d_avx2_f64 #define xsinf_u1 nsimd_sleef_sin_u10d_avx2_f32 #define xcos_u1 nsimd_sleef_cos_u10d_avx2_f64 #define xcosf_u1 nsimd_sleef_cos_u10d_avx2_f32 #define xsincos_u1 nsimd_sleef_sincos_u10d_avx2_f64 #define xsincosf_u1 nsimd_sleef_sincos_u10d_avx2_f32 #define xtan_u1 nsimd_sleef_tan_u10d_avx2_f64 #define xtanf_u1 nsimd_sleef_tan_u10d_avx2_f32 #define xasin_u1 nsimd_sleef_asin_u10d_avx2_f64 #define xasinf_u1 nsimd_sleef_asin_u10d_avx2_f32 #define xacos_u1 nsimd_sleef_acos_u10d_avx2_f64 #define xacosf_u1 nsimd_sleef_acos_u10d_avx2_f32 #define xatan_u1 nsimd_sleef_atan_u10d_avx2_f64 #define xatanf_u1 nsimd_sleef_atan_u10d_avx2_f32 #define xatan2_u1 nsimd_sleef_atan2_u10d_avx2_f64 #define xatan2f_u1 nsimd_sleef_atan2_u10d_avx2_f32 #define xlog_u1 nsimd_sleef_log_u10d_avx2_f64 #define xlogf_u1 nsimd_sleef_log_u10d_avx2_f32 #define xcbrt_u1 nsimd_sleef_cbrt_u10d_avx2_f64 #define xcbrtf_u1 nsimd_sleef_cbrt_u10d_avx2_f32 #define xexp nsimd_sleef_exp_u10d_avx2_f64 #define xexpf nsimd_sleef_exp_u10d_avx2_f32 #define xpow nsimd_sleef_pow_u10d_avx2_f64 #define xpowf nsimd_sleef_pow_u10d_avx2_f32 #define xsinh nsimd_sleef_sinh_u10d_avx2_f64 #define xsinhf nsimd_sleef_sinh_u10d_avx2_f32 #define xcosh nsimd_sleef_cosh_u10d_avx2_f64 #define xcoshf nsimd_sleef_cosh_u10d_avx2_f32 #define xtanh nsimd_sleef_tanh_u10d_avx2_f64 #define xtanhf nsimd_sleef_tanh_u10d_avx2_f32 #define xsinh_u35 nsimd_sleef_sinh_u35d_avx2_f64 #define xsinhf_u35 nsimd_sleef_sinh_u35d_avx2_f32 #define xcosh_u35 nsimd_sleef_cosh_u35d_avx2_f64 #define xcoshf_u35 nsimd_sleef_cosh_u35d_avx2_f32 #define xtanh_u35 nsimd_sleef_tanh_u35d_avx2_f64 #define xtanhf_u35 nsimd_sleef_tanh_u35d_avx2_f32 #define xfastsin_u3500 nsimd_sleef_fastsin_u3500d_avx2_f64 #define xfastsinf_u3500 nsimd_sleef_fastsin_u3500d_avx2_f32 #define xfastcos_u3500 nsimd_sleef_fastcos_u3500d_avx2_f64 #define xfastcosf_u3500 nsimd_sleef_fastcos_u3500d_avx2_f32 #define xfastpow_u3500 nsimd_sleef_fastpow_u3500d_avx2_f64 #define 
xfastpowf_u3500 nsimd_sleef_fastpow_u3500d_avx2_f32 #define xasinh nsimd_sleef_asinh_u10d_avx2_f64 #define xasinhf nsimd_sleef_asinh_u10d_avx2_f32 #define xacosh nsimd_sleef_acosh_u10d_avx2_f64 #define xacoshf nsimd_sleef_acosh_u10d_avx2_f32 #define xatanh nsimd_sleef_atanh_u10d_avx2_f64 #define xatanhf nsimd_sleef_atanh_u10d_avx2_f32 #define xexp2 nsimd_sleef_exp2_u10d_avx2_f64 #define xexp2f nsimd_sleef_exp2_u10d_avx2_f32 #define xexp2_u35 nsimd_sleef_exp2_u35d_avx2_f64 #define xexp2f_u35 nsimd_sleef_exp2_u35d_avx2_f32 #define xexp10 nsimd_sleef_exp10_u10d_avx2_f64 #define xexp10f nsimd_sleef_exp10_u10d_avx2_f32 #define xexp10_u35 nsimd_sleef_exp10_u35d_avx2_f64 #define xexp10f_u35 nsimd_sleef_exp10_u35d_avx2_f32 #define xexpm1 nsimd_sleef_expm1_u10d_avx2_f64 #define xexpm1f nsimd_sleef_expm1_u10d_avx2_f32 #define xlog10 nsimd_sleef_log10_u10d_avx2_f64 #define xlog10f nsimd_sleef_log10_u10d_avx2_f32 #define xlog2 nsimd_sleef_log2_u10d_avx2_f64 #define xlog2f nsimd_sleef_log2_u10d_avx2_f32 #define xlog2_u35 nsimd_sleef_log2_u35d_avx2_f64 #define xlog2f_u35 nsimd_sleef_log2_u35d_avx2_f32 #define xlog1p nsimd_sleef_log1p_u10d_avx2_f64 #define xlog1pf nsimd_sleef_log1p_u10d_avx2_f32 #define xsincospi_u05 nsimd_sleef_sincospi_u05d_avx2_f64 #define xsincospif_u05 nsimd_sleef_sincospi_u05d_avx2_f32 #define xsincospi_u35 nsimd_sleef_sincospi_u35d_avx2_f64 #define xsincospif_u35 nsimd_sleef_sincospi_u35d_avx2_f32 #define xsinpi_u05 nsimd_sleef_sinpi_u05d_avx2_f64 #define xsinpif_u05 nsimd_sleef_sinpi_u05d_avx2_f32 #define xcospi_u05 nsimd_sleef_cospi_u05d_avx2_f64 #define xcospif_u05 nsimd_sleef_cospi_u05d_avx2_f32 #define xldexp nsimd_sleef_ldexp_avx2_f64 #define xldexpf nsimd_sleef_ldexp_avx2_f32 #define xilogb nsimd_sleef_ilogb_avx2_f64 #define xilogbf nsimd_sleef_ilogb_avx2_f32 #define xfma nsimd_sleef_fma_avx2_f64 #define xfmaf nsimd_sleef_fma_avx2_f32 #define xsqrt nsimd_sleef_sqrt_avx2_f64 #define xsqrtf nsimd_sleef_sqrt_avx2_f32 #define xsqrt_u05 
nsimd_sleef_sqrt_u05d_avx2_f64 #define xsqrtf_u05 nsimd_sleef_sqrt_u05d_avx2_f32 #define xsqrt_u35 nsimd_sleef_sqrt_u35d_avx2_f64 #define xsqrtf_u35 nsimd_sleef_sqrt_u35d_avx2_f32 #define xhypot_u05 nsimd_sleef_hypot_u05d_avx2_f64 #define xhypotf_u05 nsimd_sleef_hypot_u05d_avx2_f32 #define xhypot_u35 nsimd_sleef_hypot_u35d_avx2_f64 #define xhypotf_u35 nsimd_sleef_hypot_u35d_avx2_f32 #define xfabs nsimd_sleef_fabs_avx2_f64 #define xfabsf nsimd_sleef_fabs_avx2_f32 #define xcopysign nsimd_sleef_copysign_avx2_f64 #define xcopysignf nsimd_sleef_copysign_avx2_f32 #define xfmax nsimd_sleef_fmax_avx2_f64 #define xfmaxf nsimd_sleef_fmax_avx2_f32 #define xfmin nsimd_sleef_fmin_avx2_f64 #define xfminf nsimd_sleef_fmin_avx2_f32 #define xfdim nsimd_sleef_fdim_avx2_f64 #define xfdimf nsimd_sleef_fdim_avx2_f32 #define xtrunc nsimd_sleef_trunc_avx2_f64 #define xtruncf nsimd_sleef_trunc_avx2_f32 #define xfloor nsimd_sleef_floor_avx2_f64 #define xfloorf nsimd_sleef_floor_avx2_f32 #define xceil nsimd_sleef_ceil_avx2_f64 #define xceilf nsimd_sleef_ceil_avx2_f32 #define xround nsimd_sleef_round_avx2_f64 #define xroundf nsimd_sleef_round_avx2_f32 #define xrint nsimd_sleef_rint_avx2_f64 #define xrintf nsimd_sleef_rint_avx2_f32 #define xnextafter nsimd_sleef_nextafter_avx2_f64 #define xnextafterf nsimd_sleef_nextafter_avx2_f32 #define xfrfrexp nsimd_sleef_frfrexp_avx2_f64 #define xfrfrexpf nsimd_sleef_frfrexp_avx2_f32 #define xexpfrexp nsimd_sleef_expfrexp_avx2_f64 #define xexpfrexpf nsimd_sleef_expfrexp_avx2_f32 #define xfmod nsimd_sleef_fmod_avx2_f64 #define xfmodf nsimd_sleef_fmod_avx2_f32 #define xremainder nsimd_sleef_remainder_avx2_f64 #define xremainderf nsimd_sleef_remainder_avx2_f32 #define xmodf nsimd_sleef_modf_avx2_f64 #define xmodff nsimd_sleef_modf_avx2_f32 #define xlgamma_u1 nsimd_sleef_lgamma_u10d_avx2_f64 #define xlgammaf_u1 nsimd_sleef_lgamma_u10d_avx2_f32 #define xtgamma_u1 nsimd_sleef_tgamma_u10d_avx2_f64 #define xtgammaf_u1 nsimd_sleef_tgamma_u10d_avx2_f32 #define 
xerf_u1 nsimd_sleef_erf_u10d_avx2_f64 #define xerff_u1 nsimd_sleef_erf_u10d_avx2_f32 #define xerfc_u15 nsimd_sleef_erfc_u15d_avx2_f64 #define xerfcf_u15 nsimd_sleef_erfc_u15d_avx2_f32 #define xgetInt nsimd_sleef_getInt_avx2_f64 #define xgetIntf nsimd_sleef_getInt_avx2_f32 #define xgetPtr nsimd_sleef_getPtr_avx2_f64 #define xgetPtrf nsimd_sleef_getPtr_avx2_f32 #else #define xsin nsimd_sleef_sin_u35_avx2_f64 #define xsinf nsimd_sleef_sin_u35_avx2_f32 #define xcos nsimd_sleef_cos_u35_avx2_f64 #define xcosf nsimd_sleef_cos_u35_avx2_f32 #define xsincos nsimd_sleef_sincos_u35_avx2_f64 #define xsincosf nsimd_sleef_sincos_u35_avx2_f32 #define xtan nsimd_sleef_tan_u35_avx2_f64 #define xtanf nsimd_sleef_tan_u35_avx2_f32 #define xasin nsimd_sleef_asin_u35_avx2_f64 #define xasinf nsimd_sleef_asin_u35_avx2_f32 #define xacos nsimd_sleef_acos_u35_avx2_f64 #define xacosf nsimd_sleef_acos_u35_avx2_f32 #define xatan nsimd_sleef_atan_u35_avx2_f64 #define xatanf nsimd_sleef_atan_u35_avx2_f32 #define xatan2 nsimd_sleef_atan2_u35_avx2_f64 #define xatan2f nsimd_sleef_atan2_u35_avx2_f32 #define xlog nsimd_sleef_log_u35_avx2_f64 #define xlogf nsimd_sleef_log_u35_avx2_f32 #define xcbrt nsimd_sleef_cbrt_u35_avx2_f64 #define xcbrtf nsimd_sleef_cbrt_u35_avx2_f32 #define xsin_u1 nsimd_sleef_sin_u10_avx2_f64 #define xsinf_u1 nsimd_sleef_sin_u10_avx2_f32 #define xcos_u1 nsimd_sleef_cos_u10_avx2_f64 #define xcosf_u1 nsimd_sleef_cos_u10_avx2_f32 #define xsincos_u1 nsimd_sleef_sincos_u10_avx2_f64 #define xsincosf_u1 nsimd_sleef_sincos_u10_avx2_f32 #define xtan_u1 nsimd_sleef_tan_u10_avx2_f64 #define xtanf_u1 nsimd_sleef_tan_u10_avx2_f32 #define xasin_u1 nsimd_sleef_asin_u10_avx2_f64 #define xasinf_u1 nsimd_sleef_asin_u10_avx2_f32 #define xacos_u1 nsimd_sleef_acos_u10_avx2_f64 #define xacosf_u1 nsimd_sleef_acos_u10_avx2_f32 #define xatan_u1 nsimd_sleef_atan_u10_avx2_f64 #define xatanf_u1 nsimd_sleef_atan_u10_avx2_f32 #define xatan2_u1 nsimd_sleef_atan2_u10_avx2_f64 #define xatan2f_u1 
nsimd_sleef_atan2_u10_avx2_f32 #define xlog_u1 nsimd_sleef_log_u10_avx2_f64 #define xlogf_u1 nsimd_sleef_log_u10_avx2_f32 #define xcbrt_u1 nsimd_sleef_cbrt_u10_avx2_f64 #define xcbrtf_u1 nsimd_sleef_cbrt_u10_avx2_f32 #define xexp nsimd_sleef_exp_u10_avx2_f64 #define xexpf nsimd_sleef_exp_u10_avx2_f32 #define xpow nsimd_sleef_pow_u10_avx2_f64 #define xpowf nsimd_sleef_pow_u10_avx2_f32 #define xsinh nsimd_sleef_sinh_u10_avx2_f64 #define xsinhf nsimd_sleef_sinh_u10_avx2_f32 #define xcosh nsimd_sleef_cosh_u10_avx2_f64 #define xcoshf nsimd_sleef_cosh_u10_avx2_f32 #define xtanh nsimd_sleef_tanh_u10_avx2_f64 #define xtanhf nsimd_sleef_tanh_u10_avx2_f32 #define xsinh_u35 nsimd_sleef_sinh_u35_avx2_f64 #define xsinhf_u35 nsimd_sleef_sinh_u35_avx2_f32 #define xcosh_u35 nsimd_sleef_cosh_u35_avx2_f64 #define xcoshf_u35 nsimd_sleef_cosh_u35_avx2_f32 #define xtanh_u35 nsimd_sleef_tanh_u35_avx2_f64 #define xtanhf_u35 nsimd_sleef_tanh_u35_avx2_f32 #define xfastsin_u3500 nsimd_sleef_fastsin_u3500_avx2_f64 #define xfastsinf_u3500 nsimd_sleef_fastsin_u3500_avx2_f32 #define xfastcos_u3500 nsimd_sleef_fastcos_u3500_avx2_f64 #define xfastcosf_u3500 nsimd_sleef_fastcos_u3500_avx2_f32 #define xfastpow_u3500 nsimd_sleef_fastpow_u3500_avx2_f64 #define xfastpowf_u3500 nsimd_sleef_fastpow_u3500_avx2_f32 #define xasinh nsimd_sleef_asinh_u10_avx2_f64 #define xasinhf nsimd_sleef_asinh_u10_avx2_f32 #define xacosh nsimd_sleef_acosh_u10_avx2_f64 #define xacoshf nsimd_sleef_acosh_u10_avx2_f32 #define xatanh nsimd_sleef_atanh_u10_avx2_f64 #define xatanhf nsimd_sleef_atanh_u10_avx2_f32 #define xexp2 nsimd_sleef_exp2_u10_avx2_f64 #define xexp2f nsimd_sleef_exp2_u10_avx2_f32 #define xexp2_u35 nsimd_sleef_exp2_u35_avx2_f64 #define xexp2f_u35 nsimd_sleef_exp2_u35_avx2_f32 #define xexp10 nsimd_sleef_exp10_u10_avx2_f64 #define xexp10f nsimd_sleef_exp10_u10_avx2_f32 #define xexp10_u35 nsimd_sleef_exp10_u35_avx2_f64 #define xexp10f_u35 nsimd_sleef_exp10_u35_avx2_f32 #define xexpm1 
nsimd_sleef_expm1_u10_avx2_f64 #define xexpm1f nsimd_sleef_expm1_u10_avx2_f32 #define xlog10 nsimd_sleef_log10_u10_avx2_f64 #define xlog10f nsimd_sleef_log10_u10_avx2_f32 #define xlog2 nsimd_sleef_log2_u10_avx2_f64 #define xlog2f nsimd_sleef_log2_u10_avx2_f32 #define xlog2_u35 nsimd_sleef_log2_u35_avx2_f64 #define xlog2f_u35 nsimd_sleef_log2_u35_avx2_f32 #define xlog1p nsimd_sleef_log1p_u10_avx2_f64 #define xlog1pf nsimd_sleef_log1p_u10_avx2_f32 #define xsincospi_u05 nsimd_sleef_sincospi_u05_avx2_f64 #define xsincospif_u05 nsimd_sleef_sincospi_u05_avx2_f32 #define xsincospi_u35 nsimd_sleef_sincospi_u35_avx2_f64 #define xsincospif_u35 nsimd_sleef_sincospi_u35_avx2_f32 #define xsinpi_u05 nsimd_sleef_sinpi_u05_avx2_f64 #define xsinpif_u05 nsimd_sleef_sinpi_u05_avx2_f32 #define xcospi_u05 nsimd_sleef_cospi_u05_avx2_f64 #define xcospif_u05 nsimd_sleef_cospi_u05_avx2_f32 #define xldexp nsimd_sleef_ldexp_avx2_f64 #define xldexpf nsimd_sleef_ldexp_avx2_f32 #define xilogb nsimd_sleef_ilogb_avx2_f64 #define xilogbf nsimd_sleef_ilogb_avx2_f32 #define xfma nsimd_sleef_fma_avx2_f64 #define xfmaf nsimd_sleef_fma_avx2_f32 #define xsqrt nsimd_sleef_sqrt_avx2_f64 #define xsqrtf nsimd_sleef_sqrt_avx2_f32 #define xsqrt_u05 nsimd_sleef_sqrt_u05_avx2_f64 #define xsqrtf_u05 nsimd_sleef_sqrt_u05_avx2_f32 #define xsqrt_u35 nsimd_sleef_sqrt_u35_avx2_f64 #define xsqrtf_u35 nsimd_sleef_sqrt_u35_avx2_f32 #define xhypot_u05 nsimd_sleef_hypot_u05_avx2_f64 #define xhypotf_u05 nsimd_sleef_hypot_u05_avx2_f32 #define xhypot_u35 nsimd_sleef_hypot_u35_avx2_f64 #define xhypotf_u35 nsimd_sleef_hypot_u35_avx2_f32 #define xfabs nsimd_sleef_fabs_avx2_f64 #define xfabsf nsimd_sleef_fabs_avx2_f32 #define xcopysign nsimd_sleef_copysign_avx2_f64 #define xcopysignf nsimd_sleef_copysign_avx2_f32 #define xfmax nsimd_sleef_fmax_avx2_f64 #define xfmaxf nsimd_sleef_fmax_avx2_f32 #define xfmin nsimd_sleef_fmin_avx2_f64 #define xfminf nsimd_sleef_fmin_avx2_f32 #define xfdim nsimd_sleef_fdim_avx2_f64 #define xfdimf 
nsimd_sleef_fdim_avx2_f32 #define xtrunc nsimd_sleef_trunc_avx2_f64 #define xtruncf nsimd_sleef_trunc_avx2_f32 #define xfloor nsimd_sleef_floor_avx2_f64 #define xfloorf nsimd_sleef_floor_avx2_f32 #define xceil nsimd_sleef_ceil_avx2_f64 #define xceilf nsimd_sleef_ceil_avx2_f32 #define xround nsimd_sleef_round_avx2_f64 #define xroundf nsimd_sleef_round_avx2_f32 #define xrint nsimd_sleef_rint_avx2_f64 #define xrintf nsimd_sleef_rint_avx2_f32 #define xnextafter nsimd_sleef_nextafter_avx2_f64 #define xnextafterf nsimd_sleef_nextafter_avx2_f32 #define xfrfrexp nsimd_sleef_frfrexp_avx2_f64 #define xfrfrexpf nsimd_sleef_frfrexp_avx2_f32 #define xexpfrexp nsimd_sleef_expfrexp_avx2_f64 #define xexpfrexpf nsimd_sleef_expfrexp_avx2_f32 #define xfmod nsimd_sleef_fmod_avx2_f64 #define xfmodf nsimd_sleef_fmod_avx2_f32 #define xremainder nsimd_sleef_remainder_avx2_f64 #define xremainderf nsimd_sleef_remainder_avx2_f32 #define xmodf nsimd_sleef_modf_avx2_f64 #define xmodff nsimd_sleef_modf_avx2_f32 #define xlgamma_u1 nsimd_sleef_lgamma_u10_avx2_f64 #define xlgammaf_u1 nsimd_sleef_lgamma_u10_avx2_f32 #define xtgamma_u1 nsimd_sleef_tgamma_u10_avx2_f64 #define xtgammaf_u1 nsimd_sleef_tgamma_u10_avx2_f32 #define xerf_u1 nsimd_sleef_erf_u10_avx2_f64 #define xerff_u1 nsimd_sleef_erf_u10_avx2_f32 #define xerfc_u15 nsimd_sleef_erfc_u15_avx2_f64 #define xerfcf_u15 nsimd_sleef_erfc_u15_avx2_f32 #define xgetInt nsimd_sleef_getInt_avx2_f64 #define xgetIntf nsimd_sleef_getInt_avx2_f32 #define xgetPtr nsimd_sleef_getPtr_avx2_f64 #define xgetPtrf nsimd_sleef_getPtr_avx2_f32 #endif #define rempi nsimd_sleef_rempi_avx2 #define rempif nsimd_sleef_rempif_avx2 #define rempisub nsimd_sleef_rempisub_avx2 #define rempisubf nsimd_sleef_rempisubf_avx2 #define gammak nsimd_gammak_avx2 #define gammafk nsimd_gammafk_avx2 #endif #endif ================================================ FILE: src/renameavx512f.h ================================================ #ifndef RENAMEAVX512F_H #define RENAMEAVX512F_H /* 
------------------------------------------------------------------------- */ /* Naming of functions avx512_knl */ #ifdef NSIMD_AVX512_KNL #ifdef DETERMINISTIC #define xsin nsimd_sleef_sin_u35d_avx512_knl_f64 #define xsinf nsimd_sleef_sin_u35d_avx512_knl_f32 #define xcos nsimd_sleef_cos_u35d_avx512_knl_f64 #define xcosf nsimd_sleef_cos_u35d_avx512_knl_f32 #define xsincos nsimd_sleef_sincos_u35d_avx512_knl_f64 #define xsincosf nsimd_sleef_sincos_u35d_avx512_knl_f32 #define xtan nsimd_sleef_tan_u35d_avx512_knl_f64 #define xtanf nsimd_sleef_tan_u35d_avx512_knl_f32 #define xasin nsimd_sleef_asin_u35d_avx512_knl_f64 #define xasinf nsimd_sleef_asin_u35d_avx512_knl_f32 #define xacos nsimd_sleef_acos_u35d_avx512_knl_f64 #define xacosf nsimd_sleef_acos_u35d_avx512_knl_f32 #define xatan nsimd_sleef_atan_u35d_avx512_knl_f64 #define xatanf nsimd_sleef_atan_u35d_avx512_knl_f32 #define xatan2 nsimd_sleef_atan2_u35d_avx512_knl_f64 #define xatan2f nsimd_sleef_atan2_u35d_avx512_knl_f32 #define xlog nsimd_sleef_log_u35d_avx512_knl_f64 #define xlogf nsimd_sleef_log_u35d_avx512_knl_f32 #define xcbrt nsimd_sleef_cbrt_u35d_avx512_knl_f64 #define xcbrtf nsimd_sleef_cbrt_u35d_avx512_knl_f32 #define xsin_u1 nsimd_sleef_sin_u10d_avx512_knl_f64 #define xsinf_u1 nsimd_sleef_sin_u10d_avx512_knl_f32 #define xcos_u1 nsimd_sleef_cos_u10d_avx512_knl_f64 #define xcosf_u1 nsimd_sleef_cos_u10d_avx512_knl_f32 #define xsincos_u1 nsimd_sleef_sincos_u10d_avx512_knl_f64 #define xsincosf_u1 nsimd_sleef_sincos_u10d_avx512_knl_f32 #define xtan_u1 nsimd_sleef_tan_u10d_avx512_knl_f64 #define xtanf_u1 nsimd_sleef_tan_u10d_avx512_knl_f32 #define xasin_u1 nsimd_sleef_asin_u10d_avx512_knl_f64 #define xasinf_u1 nsimd_sleef_asin_u10d_avx512_knl_f32 #define xacos_u1 nsimd_sleef_acos_u10d_avx512_knl_f64 #define xacosf_u1 nsimd_sleef_acos_u10d_avx512_knl_f32 #define xatan_u1 nsimd_sleef_atan_u10d_avx512_knl_f64 #define xatanf_u1 nsimd_sleef_atan_u10d_avx512_knl_f32 #define xatan2_u1 nsimd_sleef_atan2_u10d_avx512_knl_f64 
/* Auto-generated SLEEF->nsimd rename table for the AVX-512 KNL target
   (continuation of the `#ifdef DETERMINISTIC' branch opened above).
   Each macro maps a generic SLEEF entry point (`x...' / `x...f') onto the
   target-suffixed nsimd wrapper: `..._f64' for the double-precision form,
   `..._f32' for single precision.  In this branch the wrappers carry a `d'
   after the accuracy tag (e.g. `_u10d') selecting the deterministic SLEEF
   variants; functions with no accuracy tag (sqrt, fma, ldexp, fabs, ...)
   have a single shared implementation and no `d' form.  The `#else' branch
   that begins inside this span starts the identical table mapped onto the
   plain (non-deterministic) `_u35'/`_u10' wrappers.  Do not edit by hand:
   this header is generated (see egg/get_sleef_code.py). */
#define xatan2f_u1 nsimd_sleef_atan2_u10d_avx512_knl_f32 #define xlog_u1 nsimd_sleef_log_u10d_avx512_knl_f64 #define xlogf_u1 nsimd_sleef_log_u10d_avx512_knl_f32 #define xcbrt_u1 nsimd_sleef_cbrt_u10d_avx512_knl_f64 #define xcbrtf_u1 nsimd_sleef_cbrt_u10d_avx512_knl_f32 #define xexp nsimd_sleef_exp_u10d_avx512_knl_f64 #define xexpf nsimd_sleef_exp_u10d_avx512_knl_f32 #define xpow nsimd_sleef_pow_u10d_avx512_knl_f64 #define xpowf nsimd_sleef_pow_u10d_avx512_knl_f32 #define xsinh nsimd_sleef_sinh_u10d_avx512_knl_f64 #define xsinhf nsimd_sleef_sinh_u10d_avx512_knl_f32 #define xcosh nsimd_sleef_cosh_u10d_avx512_knl_f64 #define xcoshf nsimd_sleef_cosh_u10d_avx512_knl_f32 #define xtanh nsimd_sleef_tanh_u10d_avx512_knl_f64 #define xtanhf nsimd_sleef_tanh_u10d_avx512_knl_f32 #define xsinh_u35 nsimd_sleef_sinh_u35d_avx512_knl_f64 #define xsinhf_u35 nsimd_sleef_sinh_u35d_avx512_knl_f32 #define xcosh_u35 nsimd_sleef_cosh_u35d_avx512_knl_f64 #define xcoshf_u35 nsimd_sleef_cosh_u35d_avx512_knl_f32 #define xtanh_u35 nsimd_sleef_tanh_u35d_avx512_knl_f64 #define xtanhf_u35 nsimd_sleef_tanh_u35d_avx512_knl_f32 #define xfastsin_u3500 nsimd_sleef_fastsin_u3500d_avx512_knl_f64 #define xfastsinf_u3500 nsimd_sleef_fastsin_u3500d_avx512_knl_f32 #define xfastcos_u3500 nsimd_sleef_fastcos_u3500d_avx512_knl_f64 #define xfastcosf_u3500 nsimd_sleef_fastcos_u3500d_avx512_knl_f32 #define xfastpow_u3500 nsimd_sleef_fastpow_u3500d_avx512_knl_f64 #define xfastpowf_u3500 nsimd_sleef_fastpow_u3500d_avx512_knl_f32 #define xasinh nsimd_sleef_asinh_u10d_avx512_knl_f64 #define xasinhf nsimd_sleef_asinh_u10d_avx512_knl_f32 #define xacosh nsimd_sleef_acosh_u10d_avx512_knl_f64 #define xacoshf nsimd_sleef_acosh_u10d_avx512_knl_f32 #define xatanh nsimd_sleef_atanh_u10d_avx512_knl_f64 #define xatanhf nsimd_sleef_atanh_u10d_avx512_knl_f32 #define xexp2 nsimd_sleef_exp2_u10d_avx512_knl_f64 #define xexp2f nsimd_sleef_exp2_u10d_avx512_knl_f32 #define xexp2_u35 nsimd_sleef_exp2_u35d_avx512_knl_f64 #define
xexp2f_u35 nsimd_sleef_exp2_u35d_avx512_knl_f32 #define xexp10 nsimd_sleef_exp10_u10d_avx512_knl_f64 #define xexp10f nsimd_sleef_exp10_u10d_avx512_knl_f32 #define xexp10_u35 nsimd_sleef_exp10_u35d_avx512_knl_f64 #define xexp10f_u35 nsimd_sleef_exp10_u35d_avx512_knl_f32 #define xexpm1 nsimd_sleef_expm1_u10d_avx512_knl_f64 #define xexpm1f nsimd_sleef_expm1_u10d_avx512_knl_f32 #define xlog10 nsimd_sleef_log10_u10d_avx512_knl_f64 #define xlog10f nsimd_sleef_log10_u10d_avx512_knl_f32 #define xlog2 nsimd_sleef_log2_u10d_avx512_knl_f64 #define xlog2f nsimd_sleef_log2_u10d_avx512_knl_f32 #define xlog2_u35 nsimd_sleef_log2_u35d_avx512_knl_f64 #define xlog2f_u35 nsimd_sleef_log2_u35d_avx512_knl_f32 #define xlog1p nsimd_sleef_log1p_u10d_avx512_knl_f64 #define xlog1pf nsimd_sleef_log1p_u10d_avx512_knl_f32 #define xsincospi_u05 nsimd_sleef_sincospi_u05d_avx512_knl_f64 #define xsincospif_u05 nsimd_sleef_sincospi_u05d_avx512_knl_f32 #define xsincospi_u35 nsimd_sleef_sincospi_u35d_avx512_knl_f64 #define xsincospif_u35 nsimd_sleef_sincospi_u35d_avx512_knl_f32 #define xsinpi_u05 nsimd_sleef_sinpi_u05d_avx512_knl_f64 #define xsinpif_u05 nsimd_sleef_sinpi_u05d_avx512_knl_f32 #define xcospi_u05 nsimd_sleef_cospi_u05d_avx512_knl_f64 #define xcospif_u05 nsimd_sleef_cospi_u05d_avx512_knl_f32 #define xldexp nsimd_sleef_ldexp_avx512_knl_f64 #define xldexpf nsimd_sleef_ldexp_avx512_knl_f32 #define xilogb nsimd_sleef_ilogb_avx512_knl_f64 #define xilogbf nsimd_sleef_ilogb_avx512_knl_f32 #define xfma nsimd_sleef_fma_avx512_knl_f64 #define xfmaf nsimd_sleef_fma_avx512_knl_f32 #define xsqrt nsimd_sleef_sqrt_avx512_knl_f64 #define xsqrtf nsimd_sleef_sqrt_avx512_knl_f32 #define xsqrt_u05 nsimd_sleef_sqrt_u05d_avx512_knl_f64 #define xsqrtf_u05 nsimd_sleef_sqrt_u05d_avx512_knl_f32 #define xsqrt_u35 nsimd_sleef_sqrt_u35d_avx512_knl_f64 #define xsqrtf_u35 nsimd_sleef_sqrt_u35d_avx512_knl_f32 #define xhypot_u05 nsimd_sleef_hypot_u05d_avx512_knl_f64 #define xhypotf_u05
nsimd_sleef_hypot_u05d_avx512_knl_f32 #define xhypot_u35 nsimd_sleef_hypot_u35d_avx512_knl_f64 #define xhypotf_u35 nsimd_sleef_hypot_u35d_avx512_knl_f32 #define xfabs nsimd_sleef_fabs_avx512_knl_f64 #define xfabsf nsimd_sleef_fabs_avx512_knl_f32 #define xcopysign nsimd_sleef_copysign_avx512_knl_f64 #define xcopysignf nsimd_sleef_copysign_avx512_knl_f32 #define xfmax nsimd_sleef_fmax_avx512_knl_f64 #define xfmaxf nsimd_sleef_fmax_avx512_knl_f32 #define xfmin nsimd_sleef_fmin_avx512_knl_f64 #define xfminf nsimd_sleef_fmin_avx512_knl_f32 #define xfdim nsimd_sleef_fdim_avx512_knl_f64 #define xfdimf nsimd_sleef_fdim_avx512_knl_f32 #define xtrunc nsimd_sleef_trunc_avx512_knl_f64 #define xtruncf nsimd_sleef_trunc_avx512_knl_f32 #define xfloor nsimd_sleef_floor_avx512_knl_f64 #define xfloorf nsimd_sleef_floor_avx512_knl_f32 #define xceil nsimd_sleef_ceil_avx512_knl_f64 #define xceilf nsimd_sleef_ceil_avx512_knl_f32 #define xround nsimd_sleef_round_avx512_knl_f64 #define xroundf nsimd_sleef_round_avx512_knl_f32 #define xrint nsimd_sleef_rint_avx512_knl_f64 #define xrintf nsimd_sleef_rint_avx512_knl_f32 #define xnextafter nsimd_sleef_nextafter_avx512_knl_f64 #define xnextafterf nsimd_sleef_nextafter_avx512_knl_f32 #define xfrfrexp nsimd_sleef_frfrexp_avx512_knl_f64 #define xfrfrexpf nsimd_sleef_frfrexp_avx512_knl_f32 #define xexpfrexp nsimd_sleef_expfrexp_avx512_knl_f64 #define xexpfrexpf nsimd_sleef_expfrexp_avx512_knl_f32 #define xfmod nsimd_sleef_fmod_avx512_knl_f64 #define xfmodf nsimd_sleef_fmod_avx512_knl_f32 #define xremainder nsimd_sleef_remainder_avx512_knl_f64 #define xremainderf nsimd_sleef_remainder_avx512_knl_f32 #define xmodf nsimd_sleef_modf_avx512_knl_f64 #define xmodff nsimd_sleef_modf_avx512_knl_f32 #define xlgamma_u1 nsimd_sleef_lgamma_u10d_avx512_knl_f64 #define xlgammaf_u1 nsimd_sleef_lgamma_u10d_avx512_knl_f32 #define xtgamma_u1 nsimd_sleef_tgamma_u10d_avx512_knl_f64 #define xtgammaf_u1 nsimd_sleef_tgamma_u10d_avx512_knl_f32 #define xerf_u1
nsimd_sleef_erf_u10d_avx512_knl_f64 #define xerff_u1 nsimd_sleef_erf_u10d_avx512_knl_f32 #define xerfc_u15 nsimd_sleef_erfc_u15d_avx512_knl_f64 #define xerfcf_u15 nsimd_sleef_erfc_u15d_avx512_knl_f32 #define xgetInt nsimd_sleef_getInt_avx512_knl_f64 #define xgetIntf nsimd_sleef_getInt_avx512_knl_f32 #define xgetPtr nsimd_sleef_getPtr_avx512_knl_f64 #define xgetPtrf nsimd_sleef_getPtr_avx512_knl_f32 #else #define xsin nsimd_sleef_sin_u35_avx512_knl_f64 #define xsinf nsimd_sleef_sin_u35_avx512_knl_f32 #define xcos nsimd_sleef_cos_u35_avx512_knl_f64 #define xcosf nsimd_sleef_cos_u35_avx512_knl_f32 #define xsincos nsimd_sleef_sincos_u35_avx512_knl_f64 #define xsincosf nsimd_sleef_sincos_u35_avx512_knl_f32 #define xtan nsimd_sleef_tan_u35_avx512_knl_f64 #define xtanf nsimd_sleef_tan_u35_avx512_knl_f32 #define xasin nsimd_sleef_asin_u35_avx512_knl_f64 #define xasinf nsimd_sleef_asin_u35_avx512_knl_f32 #define xacos nsimd_sleef_acos_u35_avx512_knl_f64 #define xacosf nsimd_sleef_acos_u35_avx512_knl_f32 #define xatan nsimd_sleef_atan_u35_avx512_knl_f64 #define xatanf nsimd_sleef_atan_u35_avx512_knl_f32 #define xatan2 nsimd_sleef_atan2_u35_avx512_knl_f64 #define xatan2f nsimd_sleef_atan2_u35_avx512_knl_f32 #define xlog nsimd_sleef_log_u35_avx512_knl_f64 #define xlogf nsimd_sleef_log_u35_avx512_knl_f32 #define xcbrt nsimd_sleef_cbrt_u35_avx512_knl_f64 #define xcbrtf nsimd_sleef_cbrt_u35_avx512_knl_f32 #define xsin_u1 nsimd_sleef_sin_u10_avx512_knl_f64 #define xsinf_u1 nsimd_sleef_sin_u10_avx512_knl_f32 #define xcos_u1 nsimd_sleef_cos_u10_avx512_knl_f64 #define xcosf_u1 nsimd_sleef_cos_u10_avx512_knl_f32 #define xsincos_u1 nsimd_sleef_sincos_u10_avx512_knl_f64 #define xsincosf_u1 nsimd_sleef_sincos_u10_avx512_knl_f32 #define xtan_u1 nsimd_sleef_tan_u10_avx512_knl_f64 #define xtanf_u1 nsimd_sleef_tan_u10_avx512_knl_f32 #define xasin_u1 nsimd_sleef_asin_u10_avx512_knl_f64 #define xasinf_u1 nsimd_sleef_asin_u10_avx512_knl_f32 #define xacos_u1 nsimd_sleef_acos_u10_avx512_knl_f64
#define xacosf_u1 nsimd_sleef_acos_u10_avx512_knl_f32 #define xatan_u1 nsimd_sleef_atan_u10_avx512_knl_f64 #define xatanf_u1 nsimd_sleef_atan_u10_avx512_knl_f32 #define xatan2_u1 nsimd_sleef_atan2_u10_avx512_knl_f64 #define xatan2f_u1 nsimd_sleef_atan2_u10_avx512_knl_f32 #define xlog_u1 nsimd_sleef_log_u10_avx512_knl_f64 #define xlogf_u1 nsimd_sleef_log_u10_avx512_knl_f32 #define xcbrt_u1 nsimd_sleef_cbrt_u10_avx512_knl_f64 #define xcbrtf_u1 nsimd_sleef_cbrt_u10_avx512_knl_f32 #define xexp nsimd_sleef_exp_u10_avx512_knl_f64 #define xexpf nsimd_sleef_exp_u10_avx512_knl_f32 #define xpow nsimd_sleef_pow_u10_avx512_knl_f64 #define xpowf nsimd_sleef_pow_u10_avx512_knl_f32 #define xsinh nsimd_sleef_sinh_u10_avx512_knl_f64 #define xsinhf nsimd_sleef_sinh_u10_avx512_knl_f32 #define xcosh nsimd_sleef_cosh_u10_avx512_knl_f64 #define xcoshf nsimd_sleef_cosh_u10_avx512_knl_f32 #define xtanh nsimd_sleef_tanh_u10_avx512_knl_f64 #define xtanhf nsimd_sleef_tanh_u10_avx512_knl_f32 #define xsinh_u35 nsimd_sleef_sinh_u35_avx512_knl_f64 #define xsinhf_u35 nsimd_sleef_sinh_u35_avx512_knl_f32 #define xcosh_u35 nsimd_sleef_cosh_u35_avx512_knl_f64 #define xcoshf_u35 nsimd_sleef_cosh_u35_avx512_knl_f32 #define xtanh_u35 nsimd_sleef_tanh_u35_avx512_knl_f64 #define xtanhf_u35 nsimd_sleef_tanh_u35_avx512_knl_f32 #define xfastsin_u3500 nsimd_sleef_fastsin_u3500_avx512_knl_f64 #define xfastsinf_u3500 nsimd_sleef_fastsin_u3500_avx512_knl_f32 #define xfastcos_u3500 nsimd_sleef_fastcos_u3500_avx512_knl_f64 #define xfastcosf_u3500 nsimd_sleef_fastcos_u3500_avx512_knl_f32 #define xfastpow_u3500 nsimd_sleef_fastpow_u3500_avx512_knl_f64 #define xfastpowf_u3500 nsimd_sleef_fastpow_u3500_avx512_knl_f32 #define xasinh nsimd_sleef_asinh_u10_avx512_knl_f64 #define xasinhf nsimd_sleef_asinh_u10_avx512_knl_f32 #define xacosh nsimd_sleef_acosh_u10_avx512_knl_f64 #define xacoshf nsimd_sleef_acosh_u10_avx512_knl_f32 #define xatanh nsimd_sleef_atanh_u10_avx512_knl_f64 #define xatanhf 
nsimd_sleef_atanh_u10_avx512_knl_f32 #define xexp2 nsimd_sleef_exp2_u10_avx512_knl_f64 #define xexp2f nsimd_sleef_exp2_u10_avx512_knl_f32 #define xexp2_u35 nsimd_sleef_exp2_u35_avx512_knl_f64 #define xexp2f_u35 nsimd_sleef_exp2_u35_avx512_knl_f32 #define xexp10 nsimd_sleef_exp10_u10_avx512_knl_f64 #define xexp10f nsimd_sleef_exp10_u10_avx512_knl_f32 #define xexp10_u35 nsimd_sleef_exp10_u35_avx512_knl_f64 #define xexp10f_u35 nsimd_sleef_exp10_u35_avx512_knl_f32 #define xexpm1 nsimd_sleef_expm1_u10_avx512_knl_f64 #define xexpm1f nsimd_sleef_expm1_u10_avx512_knl_f32 #define xlog10 nsimd_sleef_log10_u10_avx512_knl_f64 #define xlog10f nsimd_sleef_log10_u10_avx512_knl_f32 #define xlog2 nsimd_sleef_log2_u10_avx512_knl_f64 #define xlog2f nsimd_sleef_log2_u10_avx512_knl_f32 #define xlog2_u35 nsimd_sleef_log2_u35_avx512_knl_f64 #define xlog2f_u35 nsimd_sleef_log2_u35_avx512_knl_f32 #define xlog1p nsimd_sleef_log1p_u10_avx512_knl_f64 #define xlog1pf nsimd_sleef_log1p_u10_avx512_knl_f32 #define xsincospi_u05 nsimd_sleef_sincospi_u05_avx512_knl_f64 #define xsincospif_u05 nsimd_sleef_sincospi_u05_avx512_knl_f32 #define xsincospi_u35 nsimd_sleef_sincospi_u35_avx512_knl_f64 #define xsincospif_u35 nsimd_sleef_sincospi_u35_avx512_knl_f32 #define xsinpi_u05 nsimd_sleef_sinpi_u05_avx512_knl_f64 #define xsinpif_u05 nsimd_sleef_sinpi_u05_avx512_knl_f32 #define xcospi_u05 nsimd_sleef_cospi_u05_avx512_knl_f64 #define xcospif_u05 nsimd_sleef_cospi_u05_avx512_knl_f32 #define xldexp nsimd_sleef_ldexp_avx512_knl_f64 #define xldexpf nsimd_sleef_ldexp_avx512_knl_f32 #define xilogb nsimd_sleef_ilogb_avx512_knl_f64 #define xilogbf nsimd_sleef_ilogb_avx512_knl_f32 #define xfma nsimd_sleef_fma_avx512_knl_f64 #define xfmaf nsimd_sleef_fma_avx512_knl_f32 #define xsqrt nsimd_sleef_sqrt_avx512_knl_f64 #define xsqrtf nsimd_sleef_sqrt_avx512_knl_f32 #define xsqrt_u05 nsimd_sleef_sqrt_u05_avx512_knl_f64 #define xsqrtf_u05 nsimd_sleef_sqrt_u05_avx512_knl_f32 #define xsqrt_u35 
nsimd_sleef_sqrt_u35_avx512_knl_f64 #define xsqrtf_u35 nsimd_sleef_sqrt_u35_avx512_knl_f32 #define xhypot_u05 nsimd_sleef_hypot_u05_avx512_knl_f64 #define xhypotf_u05 nsimd_sleef_hypot_u05_avx512_knl_f32 #define xhypot_u35 nsimd_sleef_hypot_u35_avx512_knl_f64 #define xhypotf_u35 nsimd_sleef_hypot_u35_avx512_knl_f32 #define xfabs nsimd_sleef_fabs_avx512_knl_f64 #define xfabsf nsimd_sleef_fabs_avx512_knl_f32 #define xcopysign nsimd_sleef_copysign_avx512_knl_f64 #define xcopysignf nsimd_sleef_copysign_avx512_knl_f32 #define xfmax nsimd_sleef_fmax_avx512_knl_f64 #define xfmaxf nsimd_sleef_fmax_avx512_knl_f32 #define xfmin nsimd_sleef_fmin_avx512_knl_f64 #define xfminf nsimd_sleef_fmin_avx512_knl_f32 #define xfdim nsimd_sleef_fdim_avx512_knl_f64 #define xfdimf nsimd_sleef_fdim_avx512_knl_f32 #define xtrunc nsimd_sleef_trunc_avx512_knl_f64 #define xtruncf nsimd_sleef_trunc_avx512_knl_f32 #define xfloor nsimd_sleef_floor_avx512_knl_f64 #define xfloorf nsimd_sleef_floor_avx512_knl_f32 #define xceil nsimd_sleef_ceil_avx512_knl_f64 #define xceilf nsimd_sleef_ceil_avx512_knl_f32 #define xround nsimd_sleef_round_avx512_knl_f64 #define xroundf nsimd_sleef_round_avx512_knl_f32 #define xrint nsimd_sleef_rint_avx512_knl_f64 #define xrintf nsimd_sleef_rint_avx512_knl_f32 #define xnextafter nsimd_sleef_nextafter_avx512_knl_f64 #define xnextafterf nsimd_sleef_nextafter_avx512_knl_f32 #define xfrfrexp nsimd_sleef_frfrexp_avx512_knl_f64 #define xfrfrexpf nsimd_sleef_frfrexp_avx512_knl_f32 #define xexpfrexp nsimd_sleef_expfrexp_avx512_knl_f64 #define xexpfrexpf nsimd_sleef_expfrexp_avx512_knl_f32 #define xfmod nsimd_sleef_fmod_avx512_knl_f64 #define xfmodf nsimd_sleef_fmod_avx512_knl_f32 #define xremainder nsimd_sleef_remainder_avx512_knl_f64 #define xremainderf nsimd_sleef_remainder_avx512_knl_f32 #define xmodf nsimd_sleef_modf_avx512_knl_f64 #define xmodff nsimd_sleef_modf_avx512_knl_f32 #define xlgamma_u1 nsimd_sleef_lgamma_u10_avx512_knl_f64 #define xlgammaf_u1 
nsimd_sleef_lgamma_u10_avx512_knl_f32 #define xtgamma_u1 nsimd_sleef_tgamma_u10_avx512_knl_f64 #define xtgammaf_u1 nsimd_sleef_tgamma_u10_avx512_knl_f32 #define xerf_u1 nsimd_sleef_erf_u10_avx512_knl_f64 #define xerff_u1 nsimd_sleef_erf_u10_avx512_knl_f32 #define xerfc_u15 nsimd_sleef_erfc_u15_avx512_knl_f64 #define xerfcf_u15 nsimd_sleef_erfc_u15_avx512_knl_f32 #define xgetInt nsimd_sleef_getInt_avx512_knl_f64 #define xgetIntf nsimd_sleef_getInt_avx512_knl_f32 #define xgetPtr nsimd_sleef_getPtr_avx512_knl_f64 #define xgetPtrf nsimd_sleef_getPtr_avx512_knl_f32 #endif #define rempi nsimd_sleef_rempi_avx512_knl #define rempif nsimd_sleef_rempif_avx512_knl #define rempisub nsimd_sleef_rempisub_avx512_knl #define rempisubf nsimd_sleef_rempisubf_avx512_knl #define gammak nsimd_gammak_avx512_knl #define gammafk nsimd_gammafk_avx512_knl #endif /* ------------------------------------------------------------------------- */ /* Naming of functions avx512_skylake */ #ifdef NSIMD_AVX512_SKYLAKE #ifdef DETERMINISTIC #define xsin nsimd_sleef_sin_u35d_avx512_skylake_f64 #define xsinf nsimd_sleef_sin_u35d_avx512_skylake_f32 #define xcos nsimd_sleef_cos_u35d_avx512_skylake_f64 #define xcosf nsimd_sleef_cos_u35d_avx512_skylake_f32 #define xsincos nsimd_sleef_sincos_u35d_avx512_skylake_f64 #define xsincosf nsimd_sleef_sincos_u35d_avx512_skylake_f32 #define xtan nsimd_sleef_tan_u35d_avx512_skylake_f64 #define xtanf nsimd_sleef_tan_u35d_avx512_skylake_f32 #define xasin nsimd_sleef_asin_u35d_avx512_skylake_f64 #define xasinf nsimd_sleef_asin_u35d_avx512_skylake_f32 #define xacos nsimd_sleef_acos_u35d_avx512_skylake_f64 #define xacosf nsimd_sleef_acos_u35d_avx512_skylake_f32 #define xatan nsimd_sleef_atan_u35d_avx512_skylake_f64 #define xatanf nsimd_sleef_atan_u35d_avx512_skylake_f32 #define xatan2 nsimd_sleef_atan2_u35d_avx512_skylake_f64 #define xatan2f nsimd_sleef_atan2_u35d_avx512_skylake_f32 #define xlog nsimd_sleef_log_u35d_avx512_skylake_f64 #define xlogf 
nsimd_sleef_log_u35d_avx512_skylake_f32 #define xcbrt nsimd_sleef_cbrt_u35d_avx512_skylake_f64 #define xcbrtf nsimd_sleef_cbrt_u35d_avx512_skylake_f32 #define xsin_u1 nsimd_sleef_sin_u10d_avx512_skylake_f64 #define xsinf_u1 nsimd_sleef_sin_u10d_avx512_skylake_f32 #define xcos_u1 nsimd_sleef_cos_u10d_avx512_skylake_f64 #define xcosf_u1 nsimd_sleef_cos_u10d_avx512_skylake_f32 #define xsincos_u1 nsimd_sleef_sincos_u10d_avx512_skylake_f64 #define xsincosf_u1 nsimd_sleef_sincos_u10d_avx512_skylake_f32 #define xtan_u1 nsimd_sleef_tan_u10d_avx512_skylake_f64 #define xtanf_u1 nsimd_sleef_tan_u10d_avx512_skylake_f32 #define xasin_u1 nsimd_sleef_asin_u10d_avx512_skylake_f64 #define xasinf_u1 nsimd_sleef_asin_u10d_avx512_skylake_f32 #define xacos_u1 nsimd_sleef_acos_u10d_avx512_skylake_f64 #define xacosf_u1 nsimd_sleef_acos_u10d_avx512_skylake_f32 #define xatan_u1 nsimd_sleef_atan_u10d_avx512_skylake_f64 #define xatanf_u1 nsimd_sleef_atan_u10d_avx512_skylake_f32 #define xatan2_u1 nsimd_sleef_atan2_u10d_avx512_skylake_f64 #define xatan2f_u1 nsimd_sleef_atan2_u10d_avx512_skylake_f32 #define xlog_u1 nsimd_sleef_log_u10d_avx512_skylake_f64 #define xlogf_u1 nsimd_sleef_log_u10d_avx512_skylake_f32 #define xcbrt_u1 nsimd_sleef_cbrt_u10d_avx512_skylake_f64 #define xcbrtf_u1 nsimd_sleef_cbrt_u10d_avx512_skylake_f32 #define xexp nsimd_sleef_exp_u10d_avx512_skylake_f64 #define xexpf nsimd_sleef_exp_u10d_avx512_skylake_f32 #define xpow nsimd_sleef_pow_u10d_avx512_skylake_f64 #define xpowf nsimd_sleef_pow_u10d_avx512_skylake_f32 #define xsinh nsimd_sleef_sinh_u10d_avx512_skylake_f64 #define xsinhf nsimd_sleef_sinh_u10d_avx512_skylake_f32 #define xcosh nsimd_sleef_cosh_u10d_avx512_skylake_f64 #define xcoshf nsimd_sleef_cosh_u10d_avx512_skylake_f32 #define xtanh nsimd_sleef_tanh_u10d_avx512_skylake_f64 #define xtanhf nsimd_sleef_tanh_u10d_avx512_skylake_f32 #define xsinh_u35 nsimd_sleef_sinh_u35d_avx512_skylake_f64 #define xsinhf_u35 nsimd_sleef_sinh_u35d_avx512_skylake_f32 #define 
xcosh_u35 nsimd_sleef_cosh_u35d_avx512_skylake_f64 #define xcoshf_u35 nsimd_sleef_cosh_u35d_avx512_skylake_f32 #define xtanh_u35 nsimd_sleef_tanh_u35d_avx512_skylake_f64 #define xtanhf_u35 nsimd_sleef_tanh_u35d_avx512_skylake_f32 #define xfastsin_u3500 nsimd_sleef_fastsin_u3500d_avx512_skylake_f64 #define xfastsinf_u3500 nsimd_sleef_fastsin_u3500d_avx512_skylake_f32 #define xfastcos_u3500 nsimd_sleef_fastcos_u3500d_avx512_skylake_f64 #define xfastcosf_u3500 nsimd_sleef_fastcos_u3500d_avx512_skylake_f32 #define xfastpow_u3500 nsimd_sleef_fastpow_u3500d_avx512_skylake_f64 #define xfastpowf_u3500 nsimd_sleef_fastpow_u3500d_avx512_skylake_f32 #define xasinh nsimd_sleef_asinh_u10d_avx512_skylake_f64 #define xasinhf nsimd_sleef_asinh_u10d_avx512_skylake_f32 #define xacosh nsimd_sleef_acosh_u10d_avx512_skylake_f64 #define xacoshf nsimd_sleef_acosh_u10d_avx512_skylake_f32 #define xatanh nsimd_sleef_atanh_u10d_avx512_skylake_f64 #define xatanhf nsimd_sleef_atanh_u10d_avx512_skylake_f32 #define xexp2 nsimd_sleef_exp2_u10d_avx512_skylake_f64 #define xexp2f nsimd_sleef_exp2_u10d_avx512_skylake_f32 #define xexp2_u35 nsimd_sleef_exp2_u35d_avx512_skylake_f64 #define xexp2f_u35 nsimd_sleef_exp2_u35d_avx512_skylake_f32 #define xexp10 nsimd_sleef_exp10_u10d_avx512_skylake_f64 #define xexp10f nsimd_sleef_exp10_u10d_avx512_skylake_f32 #define xexp10_u35 nsimd_sleef_exp10_u35d_avx512_skylake_f64 #define xexp10f_u35 nsimd_sleef_exp10_u35d_avx512_skylake_f32 #define xexpm1 nsimd_sleef_expm1_u10d_avx512_skylake_f64 #define xexpm1f nsimd_sleef_expm1_u10d_avx512_skylake_f32 #define xlog10 nsimd_sleef_log10_u10d_avx512_skylake_f64 #define xlog10f nsimd_sleef_log10_u10d_avx512_skylake_f32 #define xlog2 nsimd_sleef_log2_u10d_avx512_skylake_f64 #define xlog2f nsimd_sleef_log2_u10d_avx512_skylake_f32 #define xlog2_u35 nsimd_sleef_log2_u35d_avx512_skylake_f64 #define xlog2f_u35 nsimd_sleef_log2_u35d_avx512_skylake_f32 #define xlog1p nsimd_sleef_log1p_u10d_avx512_skylake_f64 #define xlog1pf 
nsimd_sleef_log1p_u10d_avx512_skylake_f32 #define xsincospi_u05 nsimd_sleef_sincospi_u05d_avx512_skylake_f64 #define xsincospif_u05 nsimd_sleef_sincospi_u05d_avx512_skylake_f32 #define xsincospi_u35 nsimd_sleef_sincospi_u35d_avx512_skylake_f64 #define xsincospif_u35 nsimd_sleef_sincospi_u35d_avx512_skylake_f32 #define xsinpi_u05 nsimd_sleef_sinpi_u05d_avx512_skylake_f64 #define xsinpif_u05 nsimd_sleef_sinpi_u05d_avx512_skylake_f32 #define xcospi_u05 nsimd_sleef_cospi_u05d_avx512_skylake_f64 #define xcospif_u05 nsimd_sleef_cospi_u05d_avx512_skylake_f32 #define xldexp nsimd_sleef_ldexp_avx512_skylake_f64 #define xldexpf nsimd_sleef_ldexp_avx512_skylake_f32 #define xilogb nsimd_sleef_ilogb_avx512_skylake_f64 #define xilogbf nsimd_sleef_ilogb_avx512_skylake_f32 #define xfma nsimd_sleef_fma_avx512_skylake_f64 #define xfmaf nsimd_sleef_fma_avx512_skylake_f32 #define xsqrt nsimd_sleef_sqrt_avx512_skylake_f64 #define xsqrtf nsimd_sleef_sqrt_avx512_skylake_f32 #define xsqrt_u05 nsimd_sleef_sqrt_u05d_avx512_skylake_f64 #define xsqrtf_u05 nsimd_sleef_sqrt_u05d_avx512_skylake_f32 #define xsqrt_u35 nsimd_sleef_sqrt_u35d_avx512_skylake_f64 #define xsqrtf_u35 nsimd_sleef_sqrt_u35d_avx512_skylake_f32 #define xhypot_u05 nsimd_sleef_hypot_u05d_avx512_skylake_f64 #define xhypotf_u05 nsimd_sleef_hypot_u05d_avx512_skylake_f32 #define xhypot_u35 nsimd_sleef_hypot_u35d_avx512_skylake_f64 #define xhypotf_u35 nsimd_sleef_hypot_u35d_avx512_skylake_f32 #define xfabs nsimd_sleef_fabs_avx512_skylake_f64 #define xfabsf nsimd_sleef_fabs_avx512_skylake_f32 #define xcopysign nsimd_sleef_copysign_avx512_skylake_f64 #define xcopysignf nsimd_sleef_copysign_avx512_skylake_f32 #define xfmax nsimd_sleef_fmax_avx512_skylake_f64 #define xfmaxf nsimd_sleef_fmax_avx512_skylake_f32 #define xfmin nsimd_sleef_fmin_avx512_skylake_f64 #define xfminf nsimd_sleef_fmin_avx512_skylake_f32 #define xfdim nsimd_sleef_fdim_avx512_skylake_f64 #define xfdimf nsimd_sleef_fdim_avx512_skylake_f32 #define xtrunc 
nsimd_sleef_trunc_avx512_skylake_f64 #define xtruncf nsimd_sleef_trunc_avx512_skylake_f32 #define xfloor nsimd_sleef_floor_avx512_skylake_f64 #define xfloorf nsimd_sleef_floor_avx512_skylake_f32 #define xceil nsimd_sleef_ceil_avx512_skylake_f64 #define xceilf nsimd_sleef_ceil_avx512_skylake_f32 #define xround nsimd_sleef_round_avx512_skylake_f64 #define xroundf nsimd_sleef_round_avx512_skylake_f32 #define xrint nsimd_sleef_rint_avx512_skylake_f64 #define xrintf nsimd_sleef_rint_avx512_skylake_f32 #define xnextafter nsimd_sleef_nextafter_avx512_skylake_f64 #define xnextafterf nsimd_sleef_nextafter_avx512_skylake_f32 #define xfrfrexp nsimd_sleef_frfrexp_avx512_skylake_f64 #define xfrfrexpf nsimd_sleef_frfrexp_avx512_skylake_f32 #define xexpfrexp nsimd_sleef_expfrexp_avx512_skylake_f64 #define xexpfrexpf nsimd_sleef_expfrexp_avx512_skylake_f32 #define xfmod nsimd_sleef_fmod_avx512_skylake_f64 #define xfmodf nsimd_sleef_fmod_avx512_skylake_f32 #define xremainder nsimd_sleef_remainder_avx512_skylake_f64 #define xremainderf nsimd_sleef_remainder_avx512_skylake_f32 #define xmodf nsimd_sleef_modf_avx512_skylake_f64 #define xmodff nsimd_sleef_modf_avx512_skylake_f32 #define xlgamma_u1 nsimd_sleef_lgamma_u10d_avx512_skylake_f64 #define xlgammaf_u1 nsimd_sleef_lgamma_u10d_avx512_skylake_f32 #define xtgamma_u1 nsimd_sleef_tgamma_u10d_avx512_skylake_f64 #define xtgammaf_u1 nsimd_sleef_tgamma_u10d_avx512_skylake_f32 #define xerf_u1 nsimd_sleef_erf_u10d_avx512_skylake_f64 #define xerff_u1 nsimd_sleef_erf_u10d_avx512_skylake_f32 #define xerfc_u15 nsimd_sleef_erfc_u15d_avx512_skylake_f64 #define xerfcf_u15 nsimd_sleef_erfc_u15d_avx512_skylake_f32 #define xgetInt nsimd_sleef_getInt_avx512_skylake_f64 #define xgetIntf nsimd_sleef_getInt_avx512_skylake_f32 #define xgetPtr nsimd_sleef_getPtr_avx512_skylake_f64 #define xgetPtrf nsimd_sleef_getPtr_avx512_skylake_f32 #else #define xsin nsimd_sleef_sin_u35_avx512_skylake_f64 #define xsinf nsimd_sleef_sin_u35_avx512_skylake_f32 #define xcos 
nsimd_sleef_cos_u35_avx512_skylake_f64 #define xcosf nsimd_sleef_cos_u35_avx512_skylake_f32 #define xsincos nsimd_sleef_sincos_u35_avx512_skylake_f64 #define xsincosf nsimd_sleef_sincos_u35_avx512_skylake_f32 #define xtan nsimd_sleef_tan_u35_avx512_skylake_f64 #define xtanf nsimd_sleef_tan_u35_avx512_skylake_f32 #define xasin nsimd_sleef_asin_u35_avx512_skylake_f64 #define xasinf nsimd_sleef_asin_u35_avx512_skylake_f32 #define xacos nsimd_sleef_acos_u35_avx512_skylake_f64 #define xacosf nsimd_sleef_acos_u35_avx512_skylake_f32 #define xatan nsimd_sleef_atan_u35_avx512_skylake_f64 #define xatanf nsimd_sleef_atan_u35_avx512_skylake_f32 #define xatan2 nsimd_sleef_atan2_u35_avx512_skylake_f64 #define xatan2f nsimd_sleef_atan2_u35_avx512_skylake_f32 #define xlog nsimd_sleef_log_u35_avx512_skylake_f64 #define xlogf nsimd_sleef_log_u35_avx512_skylake_f32 #define xcbrt nsimd_sleef_cbrt_u35_avx512_skylake_f64 #define xcbrtf nsimd_sleef_cbrt_u35_avx512_skylake_f32 #define xsin_u1 nsimd_sleef_sin_u10_avx512_skylake_f64 #define xsinf_u1 nsimd_sleef_sin_u10_avx512_skylake_f32 #define xcos_u1 nsimd_sleef_cos_u10_avx512_skylake_f64 #define xcosf_u1 nsimd_sleef_cos_u10_avx512_skylake_f32 #define xsincos_u1 nsimd_sleef_sincos_u10_avx512_skylake_f64 #define xsincosf_u1 nsimd_sleef_sincos_u10_avx512_skylake_f32 #define xtan_u1 nsimd_sleef_tan_u10_avx512_skylake_f64 #define xtanf_u1 nsimd_sleef_tan_u10_avx512_skylake_f32 #define xasin_u1 nsimd_sleef_asin_u10_avx512_skylake_f64 #define xasinf_u1 nsimd_sleef_asin_u10_avx512_skylake_f32 #define xacos_u1 nsimd_sleef_acos_u10_avx512_skylake_f64 #define xacosf_u1 nsimd_sleef_acos_u10_avx512_skylake_f32 #define xatan_u1 nsimd_sleef_atan_u10_avx512_skylake_f64 #define xatanf_u1 nsimd_sleef_atan_u10_avx512_skylake_f32 #define xatan2_u1 nsimd_sleef_atan2_u10_avx512_skylake_f64 #define xatan2f_u1 nsimd_sleef_atan2_u10_avx512_skylake_f32 #define xlog_u1 nsimd_sleef_log_u10_avx512_skylake_f64 #define xlogf_u1 nsimd_sleef_log_u10_avx512_skylake_f32 
/* SLEEF -> nsimd renaming for AVX-512 (Skylake), non-DETERMINISTIC branch.
   Generated mapping: each SLEEF-internal name (x<op>[f][_u<ulp>]) is renamed
   to the corresponding nsimd_sleef_<op>_<ulp>_avx512_skylake_{f64,f32} symbol.
   Restored to one directive per physical line: a #define's replacement list
   extends to end-of-line, so several directives fused on one line are
   ill-formed / swallow each other. Content is token-identical to the
   original mapping. */
#define xcbrt_u1 nsimd_sleef_cbrt_u10_avx512_skylake_f64
#define xcbrtf_u1 nsimd_sleef_cbrt_u10_avx512_skylake_f32
#define xexp nsimd_sleef_exp_u10_avx512_skylake_f64
#define xexpf nsimd_sleef_exp_u10_avx512_skylake_f32
#define xpow nsimd_sleef_pow_u10_avx512_skylake_f64
#define xpowf nsimd_sleef_pow_u10_avx512_skylake_f32
#define xsinh nsimd_sleef_sinh_u10_avx512_skylake_f64
#define xsinhf nsimd_sleef_sinh_u10_avx512_skylake_f32
#define xcosh nsimd_sleef_cosh_u10_avx512_skylake_f64
#define xcoshf nsimd_sleef_cosh_u10_avx512_skylake_f32
#define xtanh nsimd_sleef_tanh_u10_avx512_skylake_f64
#define xtanhf nsimd_sleef_tanh_u10_avx512_skylake_f32
#define xsinh_u35 nsimd_sleef_sinh_u35_avx512_skylake_f64
#define xsinhf_u35 nsimd_sleef_sinh_u35_avx512_skylake_f32
#define xcosh_u35 nsimd_sleef_cosh_u35_avx512_skylake_f64
#define xcoshf_u35 nsimd_sleef_cosh_u35_avx512_skylake_f32
#define xtanh_u35 nsimd_sleef_tanh_u35_avx512_skylake_f64
#define xtanhf_u35 nsimd_sleef_tanh_u35_avx512_skylake_f32
#define xfastsin_u3500 nsimd_sleef_fastsin_u3500_avx512_skylake_f64
#define xfastsinf_u3500 nsimd_sleef_fastsin_u3500_avx512_skylake_f32
#define xfastcos_u3500 nsimd_sleef_fastcos_u3500_avx512_skylake_f64
#define xfastcosf_u3500 nsimd_sleef_fastcos_u3500_avx512_skylake_f32
#define xfastpow_u3500 nsimd_sleef_fastpow_u3500_avx512_skylake_f64
#define xfastpowf_u3500 nsimd_sleef_fastpow_u3500_avx512_skylake_f32
#define xasinh nsimd_sleef_asinh_u10_avx512_skylake_f64
#define xasinhf nsimd_sleef_asinh_u10_avx512_skylake_f32
#define xacosh nsimd_sleef_acosh_u10_avx512_skylake_f64
#define xacoshf nsimd_sleef_acosh_u10_avx512_skylake_f32
#define xatanh nsimd_sleef_atanh_u10_avx512_skylake_f64
#define xatanhf nsimd_sleef_atanh_u10_avx512_skylake_f32
#define xexp2 nsimd_sleef_exp2_u10_avx512_skylake_f64
#define xexp2f nsimd_sleef_exp2_u10_avx512_skylake_f32
#define xexp2_u35 nsimd_sleef_exp2_u35_avx512_skylake_f64
#define xexp2f_u35 nsimd_sleef_exp2_u35_avx512_skylake_f32
#define xexp10 nsimd_sleef_exp10_u10_avx512_skylake_f64
#define xexp10f nsimd_sleef_exp10_u10_avx512_skylake_f32
#define xexp10_u35 nsimd_sleef_exp10_u35_avx512_skylake_f64
#define xexp10f_u35 nsimd_sleef_exp10_u35_avx512_skylake_f32
#define xexpm1 nsimd_sleef_expm1_u10_avx512_skylake_f64
#define xexpm1f nsimd_sleef_expm1_u10_avx512_skylake_f32
#define xlog10 nsimd_sleef_log10_u10_avx512_skylake_f64
#define xlog10f nsimd_sleef_log10_u10_avx512_skylake_f32
#define xlog2 nsimd_sleef_log2_u10_avx512_skylake_f64
#define xlog2f nsimd_sleef_log2_u10_avx512_skylake_f32
#define xlog2_u35 nsimd_sleef_log2_u35_avx512_skylake_f64
#define xlog2f_u35 nsimd_sleef_log2_u35_avx512_skylake_f32
#define xlog1p nsimd_sleef_log1p_u10_avx512_skylake_f64
#define xlog1pf nsimd_sleef_log1p_u10_avx512_skylake_f32
#define xsincospi_u05 nsimd_sleef_sincospi_u05_avx512_skylake_f64
#define xsincospif_u05 nsimd_sleef_sincospi_u05_avx512_skylake_f32
#define xsincospi_u35 nsimd_sleef_sincospi_u35_avx512_skylake_f64
#define xsincospif_u35 nsimd_sleef_sincospi_u35_avx512_skylake_f32
#define xsinpi_u05 nsimd_sleef_sinpi_u05_avx512_skylake_f64
#define xsinpif_u05 nsimd_sleef_sinpi_u05_avx512_skylake_f32
#define xcospi_u05 nsimd_sleef_cospi_u05_avx512_skylake_f64
#define xcospif_u05 nsimd_sleef_cospi_u05_avx512_skylake_f32
/* exact-result operations below carry no _u<ulp> accuracy suffix */
#define xldexp nsimd_sleef_ldexp_avx512_skylake_f64
#define xldexpf nsimd_sleef_ldexp_avx512_skylake_f32
#define xilogb nsimd_sleef_ilogb_avx512_skylake_f64
#define xilogbf nsimd_sleef_ilogb_avx512_skylake_f32
#define xfma nsimd_sleef_fma_avx512_skylake_f64
#define xfmaf nsimd_sleef_fma_avx512_skylake_f32
#define xsqrt nsimd_sleef_sqrt_avx512_skylake_f64
#define xsqrtf nsimd_sleef_sqrt_avx512_skylake_f32
#define xsqrt_u05 nsimd_sleef_sqrt_u05_avx512_skylake_f64
#define xsqrtf_u05 nsimd_sleef_sqrt_u05_avx512_skylake_f32
#define xsqrt_u35 nsimd_sleef_sqrt_u35_avx512_skylake_f64
#define xsqrtf_u35 nsimd_sleef_sqrt_u35_avx512_skylake_f32
#define xhypot_u05 nsimd_sleef_hypot_u05_avx512_skylake_f64
#define xhypotf_u05 nsimd_sleef_hypot_u05_avx512_skylake_f32 #define xhypot_u35 nsimd_sleef_hypot_u35_avx512_skylake_f64 #define xhypotf_u35 nsimd_sleef_hypot_u35_avx512_skylake_f32 #define xfabs nsimd_sleef_fabs_avx512_skylake_f64 #define xfabsf nsimd_sleef_fabs_avx512_skylake_f32 #define xcopysign nsimd_sleef_copysign_avx512_skylake_f64 #define xcopysignf nsimd_sleef_copysign_avx512_skylake_f32 #define xfmax nsimd_sleef_fmax_avx512_skylake_f64 #define xfmaxf nsimd_sleef_fmax_avx512_skylake_f32 #define xfmin nsimd_sleef_fmin_avx512_skylake_f64 #define xfminf nsimd_sleef_fmin_avx512_skylake_f32 #define xfdim nsimd_sleef_fdim_avx512_skylake_f64 #define xfdimf nsimd_sleef_fdim_avx512_skylake_f32 #define xtrunc nsimd_sleef_trunc_avx512_skylake_f64 #define xtruncf nsimd_sleef_trunc_avx512_skylake_f32 #define xfloor nsimd_sleef_floor_avx512_skylake_f64 #define xfloorf nsimd_sleef_floor_avx512_skylake_f32 #define xceil nsimd_sleef_ceil_avx512_skylake_f64 #define xceilf nsimd_sleef_ceil_avx512_skylake_f32 #define xround nsimd_sleef_round_avx512_skylake_f64 #define xroundf nsimd_sleef_round_avx512_skylake_f32 #define xrint nsimd_sleef_rint_avx512_skylake_f64 #define xrintf nsimd_sleef_rint_avx512_skylake_f32 #define xnextafter nsimd_sleef_nextafter_avx512_skylake_f64 #define xnextafterf nsimd_sleef_nextafter_avx512_skylake_f32 #define xfrfrexp nsimd_sleef_frfrexp_avx512_skylake_f64 #define xfrfrexpf nsimd_sleef_frfrexp_avx512_skylake_f32 #define xexpfrexp nsimd_sleef_expfrexp_avx512_skylake_f64 #define xexpfrexpf nsimd_sleef_expfrexp_avx512_skylake_f32 #define xfmod nsimd_sleef_fmod_avx512_skylake_f64 #define xfmodf nsimd_sleef_fmod_avx512_skylake_f32 #define xremainder nsimd_sleef_remainder_avx512_skylake_f64 #define xremainderf nsimd_sleef_remainder_avx512_skylake_f32 #define xmodf nsimd_sleef_modf_avx512_skylake_f64 #define xmodff nsimd_sleef_modf_avx512_skylake_f32 #define xlgamma_u1 nsimd_sleef_lgamma_u10_avx512_skylake_f64 #define xlgammaf_u1 
nsimd_sleef_lgamma_u10_avx512_skylake_f32 #define xtgamma_u1 nsimd_sleef_tgamma_u10_avx512_skylake_f64 #define xtgammaf_u1 nsimd_sleef_tgamma_u10_avx512_skylake_f32 #define xerf_u1 nsimd_sleef_erf_u10_avx512_skylake_f64 #define xerff_u1 nsimd_sleef_erf_u10_avx512_skylake_f32 #define xerfc_u15 nsimd_sleef_erfc_u15_avx512_skylake_f64 #define xerfcf_u15 nsimd_sleef_erfc_u15_avx512_skylake_f32 #define xgetInt nsimd_sleef_getInt_avx512_skylake_f64 #define xgetIntf nsimd_sleef_getInt_avx512_skylake_f32 #define xgetPtr nsimd_sleef_getPtr_avx512_skylake_f64 #define xgetPtrf nsimd_sleef_getPtr_avx512_skylake_f32 #endif #define rempi nsimd_sleef_rempi_avx512_skylake #define rempif nsimd_sleef_rempif_avx512_skylake #define rempisub nsimd_sleef_rempisub_avx512_skylake #define rempisubf nsimd_sleef_rempisubf_avx512_skylake #define gammak nsimd_gammak_avx512_skylake #define gammafk nsimd_gammafk_avx512_skylake #endif #endif ================================================ FILE: src/renameneon32.h ================================================ #ifndef RENAMENEON32_H #define RENAMENEON32_H /* ------------------------------------------------------------------------- */ /* Naming of functions neon128 */ #ifdef NSIMD_NEON128 #ifdef DETERMINISTIC #define xsin nsimd_sleef_sin_u35d_neon128_f64 #define xsinf nsimd_sleef_sin_u35d_neon128_f32 #define xcos nsimd_sleef_cos_u35d_neon128_f64 #define xcosf nsimd_sleef_cos_u35d_neon128_f32 #define xsincos nsimd_sleef_sincos_u35d_neon128_f64 #define xsincosf nsimd_sleef_sincos_u35d_neon128_f32 #define xtan nsimd_sleef_tan_u35d_neon128_f64 #define xtanf nsimd_sleef_tan_u35d_neon128_f32 #define xasin nsimd_sleef_asin_u35d_neon128_f64 #define xasinf nsimd_sleef_asin_u35d_neon128_f32 #define xacos nsimd_sleef_acos_u35d_neon128_f64 #define xacosf nsimd_sleef_acos_u35d_neon128_f32 #define xatan nsimd_sleef_atan_u35d_neon128_f64 #define xatanf nsimd_sleef_atan_u35d_neon128_f32 #define xatan2 nsimd_sleef_atan2_u35d_neon128_f64 #define xatan2f 
nsimd_sleef_atan2_u35d_neon128_f32 #define xlog nsimd_sleef_log_u35d_neon128_f64 #define xlogf nsimd_sleef_log_u35d_neon128_f32 #define xcbrt nsimd_sleef_cbrt_u35d_neon128_f64 #define xcbrtf nsimd_sleef_cbrt_u35d_neon128_f32 #define xsin_u1 nsimd_sleef_sin_u10d_neon128_f64 #define xsinf_u1 nsimd_sleef_sin_u10d_neon128_f32 #define xcos_u1 nsimd_sleef_cos_u10d_neon128_f64 #define xcosf_u1 nsimd_sleef_cos_u10d_neon128_f32 #define xsincos_u1 nsimd_sleef_sincos_u10d_neon128_f64 #define xsincosf_u1 nsimd_sleef_sincos_u10d_neon128_f32 #define xtan_u1 nsimd_sleef_tan_u10d_neon128_f64 #define xtanf_u1 nsimd_sleef_tan_u10d_neon128_f32 #define xasin_u1 nsimd_sleef_asin_u10d_neon128_f64 #define xasinf_u1 nsimd_sleef_asin_u10d_neon128_f32 #define xacos_u1 nsimd_sleef_acos_u10d_neon128_f64 #define xacosf_u1 nsimd_sleef_acos_u10d_neon128_f32 #define xatan_u1 nsimd_sleef_atan_u10d_neon128_f64 #define xatanf_u1 nsimd_sleef_atan_u10d_neon128_f32 #define xatan2_u1 nsimd_sleef_atan2_u10d_neon128_f64 #define xatan2f_u1 nsimd_sleef_atan2_u10d_neon128_f32 #define xlog_u1 nsimd_sleef_log_u10d_neon128_f64 #define xlogf_u1 nsimd_sleef_log_u10d_neon128_f32 #define xcbrt_u1 nsimd_sleef_cbrt_u10d_neon128_f64 #define xcbrtf_u1 nsimd_sleef_cbrt_u10d_neon128_f32 #define xexp nsimd_sleef_exp_u10d_neon128_f64 #define xexpf nsimd_sleef_exp_u10d_neon128_f32 #define xpow nsimd_sleef_pow_u10d_neon128_f64 #define xpowf nsimd_sleef_pow_u10d_neon128_f32 #define xsinh nsimd_sleef_sinh_u10d_neon128_f64 #define xsinhf nsimd_sleef_sinh_u10d_neon128_f32 #define xcosh nsimd_sleef_cosh_u10d_neon128_f64 #define xcoshf nsimd_sleef_cosh_u10d_neon128_f32 #define xtanh nsimd_sleef_tanh_u10d_neon128_f64 #define xtanhf nsimd_sleef_tanh_u10d_neon128_f32 #define xsinh_u35 nsimd_sleef_sinh_u35d_neon128_f64 #define xsinhf_u35 nsimd_sleef_sinh_u35d_neon128_f32 #define xcosh_u35 nsimd_sleef_cosh_u35d_neon128_f64 #define xcoshf_u35 nsimd_sleef_cosh_u35d_neon128_f32 #define xtanh_u35 nsimd_sleef_tanh_u35d_neon128_f64 #define 
xtanhf_u35 nsimd_sleef_tanh_u35d_neon128_f32 #define xfastsin_u3500 nsimd_sleef_fastsin_u3500d_neon128_f64 #define xfastsinf_u3500 nsimd_sleef_fastsin_u3500d_neon128_f32 #define xfastcos_u3500 nsimd_sleef_fastcos_u3500d_neon128_f64 #define xfastcosf_u3500 nsimd_sleef_fastcos_u3500d_neon128_f32 #define xfastpow_u3500 nsimd_sleef_fastpow_u3500d_neon128_f64 #define xfastpowf_u3500 nsimd_sleef_fastpow_u3500d_neon128_f32 #define xasinh nsimd_sleef_asinh_u10d_neon128_f64 #define xasinhf nsimd_sleef_asinh_u10d_neon128_f32 #define xacosh nsimd_sleef_acosh_u10d_neon128_f64 #define xacoshf nsimd_sleef_acosh_u10d_neon128_f32 #define xatanh nsimd_sleef_atanh_u10d_neon128_f64 #define xatanhf nsimd_sleef_atanh_u10d_neon128_f32 #define xexp2 nsimd_sleef_exp2_u10d_neon128_f64 #define xexp2f nsimd_sleef_exp2_u10d_neon128_f32 #define xexp2_u35 nsimd_sleef_exp2_u35d_neon128_f64 #define xexp2f_u35 nsimd_sleef_exp2_u35d_neon128_f32 #define xexp10 nsimd_sleef_exp10_u10d_neon128_f64 #define xexp10f nsimd_sleef_exp10_u10d_neon128_f32 #define xexp10_u35 nsimd_sleef_exp10_u35d_neon128_f64 #define xexp10f_u35 nsimd_sleef_exp10_u35d_neon128_f32 #define xexpm1 nsimd_sleef_expm1_u10d_neon128_f64 #define xexpm1f nsimd_sleef_expm1_u10d_neon128_f32 #define xlog10 nsimd_sleef_log10_u10d_neon128_f64 #define xlog10f nsimd_sleef_log10_u10d_neon128_f32 #define xlog2 nsimd_sleef_log2_u10d_neon128_f64 #define xlog2f nsimd_sleef_log2_u10d_neon128_f32 #define xlog2_u35 nsimd_sleef_log2_u35d_neon128_f64 #define xlog2f_u35 nsimd_sleef_log2_u35d_neon128_f32 #define xlog1p nsimd_sleef_log1p_u10d_neon128_f64 #define xlog1pf nsimd_sleef_log1p_u10d_neon128_f32 #define xsincospi_u05 nsimd_sleef_sincospi_u05d_neon128_f64 #define xsincospif_u05 nsimd_sleef_sincospi_u05d_neon128_f32 #define xsincospi_u35 nsimd_sleef_sincospi_u35d_neon128_f64 #define xsincospif_u35 nsimd_sleef_sincospi_u35d_neon128_f32 #define xsinpi_u05 nsimd_sleef_sinpi_u05d_neon128_f64 #define xsinpif_u05 nsimd_sleef_sinpi_u05d_neon128_f32 #define 
xcospi_u05 nsimd_sleef_cospi_u05d_neon128_f64 #define xcospif_u05 nsimd_sleef_cospi_u05d_neon128_f32 #define xldexp nsimd_sleef_ldexp_neon128_f64 #define xldexpf nsimd_sleef_ldexp_neon128_f32 #define xilogb nsimd_sleef_ilogb_neon128_f64 #define xilogbf nsimd_sleef_ilogb_neon128_f32 #define xfma nsimd_sleef_fma_neon128_f64 #define xfmaf nsimd_sleef_fma_neon128_f32 #define xsqrt nsimd_sleef_sqrt_neon128_f64 #define xsqrtf nsimd_sleef_sqrt_neon128_f32 #define xsqrt_u05 nsimd_sleef_sqrt_u05d_neon128_f64 #define xsqrtf_u05 nsimd_sleef_sqrt_u05d_neon128_f32 #define xsqrt_u35 nsimd_sleef_sqrt_u35d_neon128_f64 #define xsqrtf_u35 nsimd_sleef_sqrt_u35d_neon128_f32 #define xhypot_u05 nsimd_sleef_hypot_u05d_neon128_f64 #define xhypotf_u05 nsimd_sleef_hypot_u05d_neon128_f32 #define xhypot_u35 nsimd_sleef_hypot_u35d_neon128_f64 #define xhypotf_u35 nsimd_sleef_hypot_u35d_neon128_f32 #define xfabs nsimd_sleef_fabs_neon128_f64 #define xfabsf nsimd_sleef_fabs_neon128_f32 #define xcopysign nsimd_sleef_copysign_neon128_f64 #define xcopysignf nsimd_sleef_copysign_neon128_f32 #define xfmax nsimd_sleef_fmax_neon128_f64 #define xfmaxf nsimd_sleef_fmax_neon128_f32 #define xfmin nsimd_sleef_fmin_neon128_f64 #define xfminf nsimd_sleef_fmin_neon128_f32 #define xfdim nsimd_sleef_fdim_neon128_f64 #define xfdimf nsimd_sleef_fdim_neon128_f32 #define xtrunc nsimd_sleef_trunc_neon128_f64 #define xtruncf nsimd_sleef_trunc_neon128_f32 #define xfloor nsimd_sleef_floor_neon128_f64 #define xfloorf nsimd_sleef_floor_neon128_f32 #define xceil nsimd_sleef_ceil_neon128_f64 #define xceilf nsimd_sleef_ceil_neon128_f32 #define xround nsimd_sleef_round_neon128_f64 #define xroundf nsimd_sleef_round_neon128_f32 #define xrint nsimd_sleef_rint_neon128_f64 #define xrintf nsimd_sleef_rint_neon128_f32 #define xnextafter nsimd_sleef_nextafter_neon128_f64 #define xnextafterf nsimd_sleef_nextafter_neon128_f32 #define xfrfrexp nsimd_sleef_frfrexp_neon128_f64 #define xfrfrexpf nsimd_sleef_frfrexp_neon128_f32 #define 
xexpfrexp nsimd_sleef_expfrexp_neon128_f64 #define xexpfrexpf nsimd_sleef_expfrexp_neon128_f32 #define xfmod nsimd_sleef_fmod_neon128_f64 #define xfmodf nsimd_sleef_fmod_neon128_f32 #define xremainder nsimd_sleef_remainder_neon128_f64 #define xremainderf nsimd_sleef_remainder_neon128_f32 #define xmodf nsimd_sleef_modf_neon128_f64 #define xmodff nsimd_sleef_modf_neon128_f32 #define xlgamma_u1 nsimd_sleef_lgamma_u10d_neon128_f64 #define xlgammaf_u1 nsimd_sleef_lgamma_u10d_neon128_f32 #define xtgamma_u1 nsimd_sleef_tgamma_u10d_neon128_f64 #define xtgammaf_u1 nsimd_sleef_tgamma_u10d_neon128_f32 #define xerf_u1 nsimd_sleef_erf_u10d_neon128_f64 #define xerff_u1 nsimd_sleef_erf_u10d_neon128_f32 #define xerfc_u15 nsimd_sleef_erfc_u15d_neon128_f64 #define xerfcf_u15 nsimd_sleef_erfc_u15d_neon128_f32 #define xgetInt nsimd_sleef_getInt_neon128_f64 #define xgetIntf nsimd_sleef_getInt_neon128_f32 #define xgetPtr nsimd_sleef_getPtr_neon128_f64 #define xgetPtrf nsimd_sleef_getPtr_neon128_f32 #else #define xsin nsimd_sleef_sin_u35_neon128_f64 #define xsinf nsimd_sleef_sin_u35_neon128_f32 #define xcos nsimd_sleef_cos_u35_neon128_f64 #define xcosf nsimd_sleef_cos_u35_neon128_f32 #define xsincos nsimd_sleef_sincos_u35_neon128_f64 #define xsincosf nsimd_sleef_sincos_u35_neon128_f32 #define xtan nsimd_sleef_tan_u35_neon128_f64 #define xtanf nsimd_sleef_tan_u35_neon128_f32 #define xasin nsimd_sleef_asin_u35_neon128_f64 #define xasinf nsimd_sleef_asin_u35_neon128_f32 #define xacos nsimd_sleef_acos_u35_neon128_f64 #define xacosf nsimd_sleef_acos_u35_neon128_f32 #define xatan nsimd_sleef_atan_u35_neon128_f64 #define xatanf nsimd_sleef_atan_u35_neon128_f32 #define xatan2 nsimd_sleef_atan2_u35_neon128_f64 #define xatan2f nsimd_sleef_atan2_u35_neon128_f32 #define xlog nsimd_sleef_log_u35_neon128_f64 #define xlogf nsimd_sleef_log_u35_neon128_f32 #define xcbrt nsimd_sleef_cbrt_u35_neon128_f64 #define xcbrtf nsimd_sleef_cbrt_u35_neon128_f32 #define xsin_u1 nsimd_sleef_sin_u10_neon128_f64 #define 
xsinf_u1 nsimd_sleef_sin_u10_neon128_f32 #define xcos_u1 nsimd_sleef_cos_u10_neon128_f64 #define xcosf_u1 nsimd_sleef_cos_u10_neon128_f32 #define xsincos_u1 nsimd_sleef_sincos_u10_neon128_f64 #define xsincosf_u1 nsimd_sleef_sincos_u10_neon128_f32 #define xtan_u1 nsimd_sleef_tan_u10_neon128_f64 #define xtanf_u1 nsimd_sleef_tan_u10_neon128_f32 #define xasin_u1 nsimd_sleef_asin_u10_neon128_f64 #define xasinf_u1 nsimd_sleef_asin_u10_neon128_f32 #define xacos_u1 nsimd_sleef_acos_u10_neon128_f64 #define xacosf_u1 nsimd_sleef_acos_u10_neon128_f32 #define xatan_u1 nsimd_sleef_atan_u10_neon128_f64 #define xatanf_u1 nsimd_sleef_atan_u10_neon128_f32 #define xatan2_u1 nsimd_sleef_atan2_u10_neon128_f64 #define xatan2f_u1 nsimd_sleef_atan2_u10_neon128_f32 #define xlog_u1 nsimd_sleef_log_u10_neon128_f64 #define xlogf_u1 nsimd_sleef_log_u10_neon128_f32 #define xcbrt_u1 nsimd_sleef_cbrt_u10_neon128_f64 #define xcbrtf_u1 nsimd_sleef_cbrt_u10_neon128_f32 #define xexp nsimd_sleef_exp_u10_neon128_f64 #define xexpf nsimd_sleef_exp_u10_neon128_f32 #define xpow nsimd_sleef_pow_u10_neon128_f64 #define xpowf nsimd_sleef_pow_u10_neon128_f32 #define xsinh nsimd_sleef_sinh_u10_neon128_f64 #define xsinhf nsimd_sleef_sinh_u10_neon128_f32 #define xcosh nsimd_sleef_cosh_u10_neon128_f64 #define xcoshf nsimd_sleef_cosh_u10_neon128_f32 #define xtanh nsimd_sleef_tanh_u10_neon128_f64 #define xtanhf nsimd_sleef_tanh_u10_neon128_f32 #define xsinh_u35 nsimd_sleef_sinh_u35_neon128_f64 #define xsinhf_u35 nsimd_sleef_sinh_u35_neon128_f32 #define xcosh_u35 nsimd_sleef_cosh_u35_neon128_f64 #define xcoshf_u35 nsimd_sleef_cosh_u35_neon128_f32 #define xtanh_u35 nsimd_sleef_tanh_u35_neon128_f64 #define xtanhf_u35 nsimd_sleef_tanh_u35_neon128_f32 #define xfastsin_u3500 nsimd_sleef_fastsin_u3500_neon128_f64 #define xfastsinf_u3500 nsimd_sleef_fastsin_u3500_neon128_f32 #define xfastcos_u3500 nsimd_sleef_fastcos_u3500_neon128_f64 #define xfastcosf_u3500 nsimd_sleef_fastcos_u3500_neon128_f32 #define xfastpow_u3500 
nsimd_sleef_fastpow_u3500_neon128_f64 #define xfastpowf_u3500 nsimd_sleef_fastpow_u3500_neon128_f32 #define xasinh nsimd_sleef_asinh_u10_neon128_f64 #define xasinhf nsimd_sleef_asinh_u10_neon128_f32 #define xacosh nsimd_sleef_acosh_u10_neon128_f64 #define xacoshf nsimd_sleef_acosh_u10_neon128_f32 #define xatanh nsimd_sleef_atanh_u10_neon128_f64 #define xatanhf nsimd_sleef_atanh_u10_neon128_f32 #define xexp2 nsimd_sleef_exp2_u10_neon128_f64 #define xexp2f nsimd_sleef_exp2_u10_neon128_f32 #define xexp2_u35 nsimd_sleef_exp2_u35_neon128_f64 #define xexp2f_u35 nsimd_sleef_exp2_u35_neon128_f32 #define xexp10 nsimd_sleef_exp10_u10_neon128_f64 #define xexp10f nsimd_sleef_exp10_u10_neon128_f32 #define xexp10_u35 nsimd_sleef_exp10_u35_neon128_f64 #define xexp10f_u35 nsimd_sleef_exp10_u35_neon128_f32 #define xexpm1 nsimd_sleef_expm1_u10_neon128_f64 #define xexpm1f nsimd_sleef_expm1_u10_neon128_f32 #define xlog10 nsimd_sleef_log10_u10_neon128_f64 #define xlog10f nsimd_sleef_log10_u10_neon128_f32 #define xlog2 nsimd_sleef_log2_u10_neon128_f64 #define xlog2f nsimd_sleef_log2_u10_neon128_f32 #define xlog2_u35 nsimd_sleef_log2_u35_neon128_f64 #define xlog2f_u35 nsimd_sleef_log2_u35_neon128_f32 #define xlog1p nsimd_sleef_log1p_u10_neon128_f64 #define xlog1pf nsimd_sleef_log1p_u10_neon128_f32 #define xsincospi_u05 nsimd_sleef_sincospi_u05_neon128_f64 #define xsincospif_u05 nsimd_sleef_sincospi_u05_neon128_f32 #define xsincospi_u35 nsimd_sleef_sincospi_u35_neon128_f64 #define xsincospif_u35 nsimd_sleef_sincospi_u35_neon128_f32 #define xsinpi_u05 nsimd_sleef_sinpi_u05_neon128_f64 #define xsinpif_u05 nsimd_sleef_sinpi_u05_neon128_f32 #define xcospi_u05 nsimd_sleef_cospi_u05_neon128_f64 #define xcospif_u05 nsimd_sleef_cospi_u05_neon128_f32 #define xldexp nsimd_sleef_ldexp_neon128_f64 #define xldexpf nsimd_sleef_ldexp_neon128_f32 #define xilogb nsimd_sleef_ilogb_neon128_f64 #define xilogbf nsimd_sleef_ilogb_neon128_f32 #define xfma nsimd_sleef_fma_neon128_f64 #define xfmaf 
nsimd_sleef_fma_neon128_f32 #define xsqrt nsimd_sleef_sqrt_neon128_f64 #define xsqrtf nsimd_sleef_sqrt_neon128_f32 #define xsqrt_u05 nsimd_sleef_sqrt_u05_neon128_f64 #define xsqrtf_u05 nsimd_sleef_sqrt_u05_neon128_f32 #define xsqrt_u35 nsimd_sleef_sqrt_u35_neon128_f64 #define xsqrtf_u35 nsimd_sleef_sqrt_u35_neon128_f32 #define xhypot_u05 nsimd_sleef_hypot_u05_neon128_f64 #define xhypotf_u05 nsimd_sleef_hypot_u05_neon128_f32 #define xhypot_u35 nsimd_sleef_hypot_u35_neon128_f64 #define xhypotf_u35 nsimd_sleef_hypot_u35_neon128_f32 #define xfabs nsimd_sleef_fabs_neon128_f64 #define xfabsf nsimd_sleef_fabs_neon128_f32 #define xcopysign nsimd_sleef_copysign_neon128_f64 #define xcopysignf nsimd_sleef_copysign_neon128_f32 #define xfmax nsimd_sleef_fmax_neon128_f64 #define xfmaxf nsimd_sleef_fmax_neon128_f32 #define xfmin nsimd_sleef_fmin_neon128_f64 #define xfminf nsimd_sleef_fmin_neon128_f32 #define xfdim nsimd_sleef_fdim_neon128_f64 #define xfdimf nsimd_sleef_fdim_neon128_f32 #define xtrunc nsimd_sleef_trunc_neon128_f64 #define xtruncf nsimd_sleef_trunc_neon128_f32 #define xfloor nsimd_sleef_floor_neon128_f64 #define xfloorf nsimd_sleef_floor_neon128_f32 #define xceil nsimd_sleef_ceil_neon128_f64 #define xceilf nsimd_sleef_ceil_neon128_f32 #define xround nsimd_sleef_round_neon128_f64 #define xroundf nsimd_sleef_round_neon128_f32 #define xrint nsimd_sleef_rint_neon128_f64 #define xrintf nsimd_sleef_rint_neon128_f32 #define xnextafter nsimd_sleef_nextafter_neon128_f64 #define xnextafterf nsimd_sleef_nextafter_neon128_f32 #define xfrfrexp nsimd_sleef_frfrexp_neon128_f64 #define xfrfrexpf nsimd_sleef_frfrexp_neon128_f32 #define xexpfrexp nsimd_sleef_expfrexp_neon128_f64 #define xexpfrexpf nsimd_sleef_expfrexp_neon128_f32 #define xfmod nsimd_sleef_fmod_neon128_f64 #define xfmodf nsimd_sleef_fmod_neon128_f32 #define xremainder nsimd_sleef_remainder_neon128_f64 #define xremainderf nsimd_sleef_remainder_neon128_f32 #define xmodf nsimd_sleef_modf_neon128_f64 #define xmodff 
nsimd_sleef_modf_neon128_f32 #define xlgamma_u1 nsimd_sleef_lgamma_u10_neon128_f64 #define xlgammaf_u1 nsimd_sleef_lgamma_u10_neon128_f32 #define xtgamma_u1 nsimd_sleef_tgamma_u10_neon128_f64 #define xtgammaf_u1 nsimd_sleef_tgamma_u10_neon128_f32 #define xerf_u1 nsimd_sleef_erf_u10_neon128_f64 #define xerff_u1 nsimd_sleef_erf_u10_neon128_f32 #define xerfc_u15 nsimd_sleef_erfc_u15_neon128_f64 #define xerfcf_u15 nsimd_sleef_erfc_u15_neon128_f32 #define xgetInt nsimd_sleef_getInt_neon128_f64 #define xgetIntf nsimd_sleef_getInt_neon128_f32 #define xgetPtr nsimd_sleef_getPtr_neon128_f64 #define xgetPtrf nsimd_sleef_getPtr_neon128_f32 #endif #define rempi nsimd_sleef_rempi_neon128 #define rempif nsimd_sleef_rempif_neon128 #define rempisub nsimd_sleef_rempisub_neon128 #define rempisubf nsimd_sleef_rempisubf_neon128 #define gammak nsimd_gammak_neon128 #define gammafk nsimd_gammafk_neon128 #endif #endif ================================================ FILE: src/renamesse2.h ================================================ #ifndef RENAMESSE2_H #define RENAMESSE2_H /* ------------------------------------------------------------------------- */ /* Naming of functions sse2 */ #ifdef NSIMD_SSE2 #ifdef DETERMINISTIC #define xsin nsimd_sleef_sin_u35d_sse2_f64 #define xsinf nsimd_sleef_sin_u35d_sse2_f32 #define xcos nsimd_sleef_cos_u35d_sse2_f64 #define xcosf nsimd_sleef_cos_u35d_sse2_f32 #define xsincos nsimd_sleef_sincos_u35d_sse2_f64 #define xsincosf nsimd_sleef_sincos_u35d_sse2_f32 #define xtan nsimd_sleef_tan_u35d_sse2_f64 #define xtanf nsimd_sleef_tan_u35d_sse2_f32 #define xasin nsimd_sleef_asin_u35d_sse2_f64 #define xasinf nsimd_sleef_asin_u35d_sse2_f32 #define xacos nsimd_sleef_acos_u35d_sse2_f64 #define xacosf nsimd_sleef_acos_u35d_sse2_f32 #define xatan nsimd_sleef_atan_u35d_sse2_f64 #define xatanf nsimd_sleef_atan_u35d_sse2_f32 #define xatan2 nsimd_sleef_atan2_u35d_sse2_f64 #define xatan2f nsimd_sleef_atan2_u35d_sse2_f32 #define xlog nsimd_sleef_log_u35d_sse2_f64 #define 
xlogf nsimd_sleef_log_u35d_sse2_f32 #define xcbrt nsimd_sleef_cbrt_u35d_sse2_f64 #define xcbrtf nsimd_sleef_cbrt_u35d_sse2_f32 #define xsin_u1 nsimd_sleef_sin_u10d_sse2_f64 #define xsinf_u1 nsimd_sleef_sin_u10d_sse2_f32 #define xcos_u1 nsimd_sleef_cos_u10d_sse2_f64 #define xcosf_u1 nsimd_sleef_cos_u10d_sse2_f32 #define xsincos_u1 nsimd_sleef_sincos_u10d_sse2_f64 #define xsincosf_u1 nsimd_sleef_sincos_u10d_sse2_f32 #define xtan_u1 nsimd_sleef_tan_u10d_sse2_f64 #define xtanf_u1 nsimd_sleef_tan_u10d_sse2_f32 #define xasin_u1 nsimd_sleef_asin_u10d_sse2_f64 #define xasinf_u1 nsimd_sleef_asin_u10d_sse2_f32 #define xacos_u1 nsimd_sleef_acos_u10d_sse2_f64 #define xacosf_u1 nsimd_sleef_acos_u10d_sse2_f32 #define xatan_u1 nsimd_sleef_atan_u10d_sse2_f64 #define xatanf_u1 nsimd_sleef_atan_u10d_sse2_f32 #define xatan2_u1 nsimd_sleef_atan2_u10d_sse2_f64 #define xatan2f_u1 nsimd_sleef_atan2_u10d_sse2_f32 #define xlog_u1 nsimd_sleef_log_u10d_sse2_f64 #define xlogf_u1 nsimd_sleef_log_u10d_sse2_f32 #define xcbrt_u1 nsimd_sleef_cbrt_u10d_sse2_f64 #define xcbrtf_u1 nsimd_sleef_cbrt_u10d_sse2_f32 #define xexp nsimd_sleef_exp_u10d_sse2_f64 #define xexpf nsimd_sleef_exp_u10d_sse2_f32 #define xpow nsimd_sleef_pow_u10d_sse2_f64 #define xpowf nsimd_sleef_pow_u10d_sse2_f32 #define xsinh nsimd_sleef_sinh_u10d_sse2_f64 #define xsinhf nsimd_sleef_sinh_u10d_sse2_f32 #define xcosh nsimd_sleef_cosh_u10d_sse2_f64 #define xcoshf nsimd_sleef_cosh_u10d_sse2_f32 #define xtanh nsimd_sleef_tanh_u10d_sse2_f64 #define xtanhf nsimd_sleef_tanh_u10d_sse2_f32 #define xsinh_u35 nsimd_sleef_sinh_u35d_sse2_f64 #define xsinhf_u35 nsimd_sleef_sinh_u35d_sse2_f32 #define xcosh_u35 nsimd_sleef_cosh_u35d_sse2_f64 #define xcoshf_u35 nsimd_sleef_cosh_u35d_sse2_f32 #define xtanh_u35 nsimd_sleef_tanh_u35d_sse2_f64 #define xtanhf_u35 nsimd_sleef_tanh_u35d_sse2_f32 #define xfastsin_u3500 nsimd_sleef_fastsin_u3500d_sse2_f64 #define xfastsinf_u3500 nsimd_sleef_fastsin_u3500d_sse2_f32 #define xfastcos_u3500 
nsimd_sleef_fastcos_u3500d_sse2_f64 #define xfastcosf_u3500 nsimd_sleef_fastcos_u3500d_sse2_f32 #define xfastpow_u3500 nsimd_sleef_fastpow_u3500d_sse2_f64 #define xfastpowf_u3500 nsimd_sleef_fastpow_u3500d_sse2_f32 #define xasinh nsimd_sleef_asinh_u10d_sse2_f64 #define xasinhf nsimd_sleef_asinh_u10d_sse2_f32 #define xacosh nsimd_sleef_acosh_u10d_sse2_f64 #define xacoshf nsimd_sleef_acosh_u10d_sse2_f32 #define xatanh nsimd_sleef_atanh_u10d_sse2_f64 #define xatanhf nsimd_sleef_atanh_u10d_sse2_f32 #define xexp2 nsimd_sleef_exp2_u10d_sse2_f64 #define xexp2f nsimd_sleef_exp2_u10d_sse2_f32 #define xexp2_u35 nsimd_sleef_exp2_u35d_sse2_f64 #define xexp2f_u35 nsimd_sleef_exp2_u35d_sse2_f32 #define xexp10 nsimd_sleef_exp10_u10d_sse2_f64 #define xexp10f nsimd_sleef_exp10_u10d_sse2_f32 #define xexp10_u35 nsimd_sleef_exp10_u35d_sse2_f64 #define xexp10f_u35 nsimd_sleef_exp10_u35d_sse2_f32 #define xexpm1 nsimd_sleef_expm1_u10d_sse2_f64 #define xexpm1f nsimd_sleef_expm1_u10d_sse2_f32 #define xlog10 nsimd_sleef_log10_u10d_sse2_f64 #define xlog10f nsimd_sleef_log10_u10d_sse2_f32 #define xlog2 nsimd_sleef_log2_u10d_sse2_f64 #define xlog2f nsimd_sleef_log2_u10d_sse2_f32 #define xlog2_u35 nsimd_sleef_log2_u35d_sse2_f64 #define xlog2f_u35 nsimd_sleef_log2_u35d_sse2_f32 #define xlog1p nsimd_sleef_log1p_u10d_sse2_f64 #define xlog1pf nsimd_sleef_log1p_u10d_sse2_f32 #define xsincospi_u05 nsimd_sleef_sincospi_u05d_sse2_f64 #define xsincospif_u05 nsimd_sleef_sincospi_u05d_sse2_f32 #define xsincospi_u35 nsimd_sleef_sincospi_u35d_sse2_f64 #define xsincospif_u35 nsimd_sleef_sincospi_u35d_sse2_f32 #define xsinpi_u05 nsimd_sleef_sinpi_u05d_sse2_f64 #define xsinpif_u05 nsimd_sleef_sinpi_u05d_sse2_f32 #define xcospi_u05 nsimd_sleef_cospi_u05d_sse2_f64 #define xcospif_u05 nsimd_sleef_cospi_u05d_sse2_f32 #define xldexp nsimd_sleef_ldexp_sse2_f64 #define xldexpf nsimd_sleef_ldexp_sse2_f32 #define xilogb nsimd_sleef_ilogb_sse2_f64 #define xilogbf nsimd_sleef_ilogb_sse2_f32 #define xfma 
nsimd_sleef_fma_sse2_f64 #define xfmaf nsimd_sleef_fma_sse2_f32 #define xsqrt nsimd_sleef_sqrt_sse2_f64 #define xsqrtf nsimd_sleef_sqrt_sse2_f32 #define xsqrt_u05 nsimd_sleef_sqrt_u05d_sse2_f64 #define xsqrtf_u05 nsimd_sleef_sqrt_u05d_sse2_f32 #define xsqrt_u35 nsimd_sleef_sqrt_u35d_sse2_f64 #define xsqrtf_u35 nsimd_sleef_sqrt_u35d_sse2_f32 #define xhypot_u05 nsimd_sleef_hypot_u05d_sse2_f64 #define xhypotf_u05 nsimd_sleef_hypot_u05d_sse2_f32 #define xhypot_u35 nsimd_sleef_hypot_u35d_sse2_f64 #define xhypotf_u35 nsimd_sleef_hypot_u35d_sse2_f32 #define xfabs nsimd_sleef_fabs_sse2_f64 #define xfabsf nsimd_sleef_fabs_sse2_f32 #define xcopysign nsimd_sleef_copysign_sse2_f64 #define xcopysignf nsimd_sleef_copysign_sse2_f32 #define xfmax nsimd_sleef_fmax_sse2_f64 #define xfmaxf nsimd_sleef_fmax_sse2_f32 #define xfmin nsimd_sleef_fmin_sse2_f64 #define xfminf nsimd_sleef_fmin_sse2_f32 #define xfdim nsimd_sleef_fdim_sse2_f64 #define xfdimf nsimd_sleef_fdim_sse2_f32 #define xtrunc nsimd_sleef_trunc_sse2_f64 #define xtruncf nsimd_sleef_trunc_sse2_f32 #define xfloor nsimd_sleef_floor_sse2_f64 #define xfloorf nsimd_sleef_floor_sse2_f32 #define xceil nsimd_sleef_ceil_sse2_f64 #define xceilf nsimd_sleef_ceil_sse2_f32 #define xround nsimd_sleef_round_sse2_f64 #define xroundf nsimd_sleef_round_sse2_f32 #define xrint nsimd_sleef_rint_sse2_f64 #define xrintf nsimd_sleef_rint_sse2_f32 #define xnextafter nsimd_sleef_nextafter_sse2_f64 #define xnextafterf nsimd_sleef_nextafter_sse2_f32 #define xfrfrexp nsimd_sleef_frfrexp_sse2_f64 #define xfrfrexpf nsimd_sleef_frfrexp_sse2_f32 #define xexpfrexp nsimd_sleef_expfrexp_sse2_f64 #define xexpfrexpf nsimd_sleef_expfrexp_sse2_f32 #define xfmod nsimd_sleef_fmod_sse2_f64 #define xfmodf nsimd_sleef_fmod_sse2_f32 #define xremainder nsimd_sleef_remainder_sse2_f64 #define xremainderf nsimd_sleef_remainder_sse2_f32 #define xmodf nsimd_sleef_modf_sse2_f64 #define xmodff nsimd_sleef_modf_sse2_f32 #define xlgamma_u1 nsimd_sleef_lgamma_u10d_sse2_f64 
#define xlgammaf_u1 nsimd_sleef_lgamma_u10d_sse2_f32 #define xtgamma_u1 nsimd_sleef_tgamma_u10d_sse2_f64 #define xtgammaf_u1 nsimd_sleef_tgamma_u10d_sse2_f32 #define xerf_u1 nsimd_sleef_erf_u10d_sse2_f64 #define xerff_u1 nsimd_sleef_erf_u10d_sse2_f32 #define xerfc_u15 nsimd_sleef_erfc_u15d_sse2_f64 #define xerfcf_u15 nsimd_sleef_erfc_u15d_sse2_f32 #define xgetInt nsimd_sleef_getInt_sse2_f64 #define xgetIntf nsimd_sleef_getInt_sse2_f32 #define xgetPtr nsimd_sleef_getPtr_sse2_f64 #define xgetPtrf nsimd_sleef_getPtr_sse2_f32 #else #define xsin nsimd_sleef_sin_u35_sse2_f64 #define xsinf nsimd_sleef_sin_u35_sse2_f32 #define xcos nsimd_sleef_cos_u35_sse2_f64 #define xcosf nsimd_sleef_cos_u35_sse2_f32 #define xsincos nsimd_sleef_sincos_u35_sse2_f64 #define xsincosf nsimd_sleef_sincos_u35_sse2_f32 #define xtan nsimd_sleef_tan_u35_sse2_f64 #define xtanf nsimd_sleef_tan_u35_sse2_f32 #define xasin nsimd_sleef_asin_u35_sse2_f64 #define xasinf nsimd_sleef_asin_u35_sse2_f32 #define xacos nsimd_sleef_acos_u35_sse2_f64 #define xacosf nsimd_sleef_acos_u35_sse2_f32 #define xatan nsimd_sleef_atan_u35_sse2_f64 #define xatanf nsimd_sleef_atan_u35_sse2_f32 #define xatan2 nsimd_sleef_atan2_u35_sse2_f64 #define xatan2f nsimd_sleef_atan2_u35_sse2_f32 #define xlog nsimd_sleef_log_u35_sse2_f64 #define xlogf nsimd_sleef_log_u35_sse2_f32 #define xcbrt nsimd_sleef_cbrt_u35_sse2_f64 #define xcbrtf nsimd_sleef_cbrt_u35_sse2_f32 #define xsin_u1 nsimd_sleef_sin_u10_sse2_f64 #define xsinf_u1 nsimd_sleef_sin_u10_sse2_f32 #define xcos_u1 nsimd_sleef_cos_u10_sse2_f64 #define xcosf_u1 nsimd_sleef_cos_u10_sse2_f32 #define xsincos_u1 nsimd_sleef_sincos_u10_sse2_f64 #define xsincosf_u1 nsimd_sleef_sincos_u10_sse2_f32 #define xtan_u1 nsimd_sleef_tan_u10_sse2_f64 #define xtanf_u1 nsimd_sleef_tan_u10_sse2_f32 #define xasin_u1 nsimd_sleef_asin_u10_sse2_f64 #define xasinf_u1 nsimd_sleef_asin_u10_sse2_f32 #define xacos_u1 nsimd_sleef_acos_u10_sse2_f64 #define xacosf_u1 nsimd_sleef_acos_u10_sse2_f32 #define 
xatan_u1 nsimd_sleef_atan_u10_sse2_f64 #define xatanf_u1 nsimd_sleef_atan_u10_sse2_f32 #define xatan2_u1 nsimd_sleef_atan2_u10_sse2_f64 #define xatan2f_u1 nsimd_sleef_atan2_u10_sse2_f32 #define xlog_u1 nsimd_sleef_log_u10_sse2_f64 #define xlogf_u1 nsimd_sleef_log_u10_sse2_f32 #define xcbrt_u1 nsimd_sleef_cbrt_u10_sse2_f64 #define xcbrtf_u1 nsimd_sleef_cbrt_u10_sse2_f32 #define xexp nsimd_sleef_exp_u10_sse2_f64 #define xexpf nsimd_sleef_exp_u10_sse2_f32 #define xpow nsimd_sleef_pow_u10_sse2_f64 #define xpowf nsimd_sleef_pow_u10_sse2_f32 #define xsinh nsimd_sleef_sinh_u10_sse2_f64 #define xsinhf nsimd_sleef_sinh_u10_sse2_f32 #define xcosh nsimd_sleef_cosh_u10_sse2_f64 #define xcoshf nsimd_sleef_cosh_u10_sse2_f32 #define xtanh nsimd_sleef_tanh_u10_sse2_f64 #define xtanhf nsimd_sleef_tanh_u10_sse2_f32 #define xsinh_u35 nsimd_sleef_sinh_u35_sse2_f64 #define xsinhf_u35 nsimd_sleef_sinh_u35_sse2_f32 #define xcosh_u35 nsimd_sleef_cosh_u35_sse2_f64 #define xcoshf_u35 nsimd_sleef_cosh_u35_sse2_f32 #define xtanh_u35 nsimd_sleef_tanh_u35_sse2_f64 #define xtanhf_u35 nsimd_sleef_tanh_u35_sse2_f32 #define xfastsin_u3500 nsimd_sleef_fastsin_u3500_sse2_f64 #define xfastsinf_u3500 nsimd_sleef_fastsin_u3500_sse2_f32 #define xfastcos_u3500 nsimd_sleef_fastcos_u3500_sse2_f64 #define xfastcosf_u3500 nsimd_sleef_fastcos_u3500_sse2_f32 #define xfastpow_u3500 nsimd_sleef_fastpow_u3500_sse2_f64 #define xfastpowf_u3500 nsimd_sleef_fastpow_u3500_sse2_f32 #define xasinh nsimd_sleef_asinh_u10_sse2_f64 #define xasinhf nsimd_sleef_asinh_u10_sse2_f32 #define xacosh nsimd_sleef_acosh_u10_sse2_f64 #define xacoshf nsimd_sleef_acosh_u10_sse2_f32 #define xatanh nsimd_sleef_atanh_u10_sse2_f64 #define xatanhf nsimd_sleef_atanh_u10_sse2_f32 #define xexp2 nsimd_sleef_exp2_u10_sse2_f64 #define xexp2f nsimd_sleef_exp2_u10_sse2_f32 #define xexp2_u35 nsimd_sleef_exp2_u35_sse2_f64 #define xexp2f_u35 nsimd_sleef_exp2_u35_sse2_f32 #define xexp10 nsimd_sleef_exp10_u10_sse2_f64 #define xexp10f 
nsimd_sleef_exp10_u10_sse2_f32 #define xexp10_u35 nsimd_sleef_exp10_u35_sse2_f64 #define xexp10f_u35 nsimd_sleef_exp10_u35_sse2_f32 #define xexpm1 nsimd_sleef_expm1_u10_sse2_f64 #define xexpm1f nsimd_sleef_expm1_u10_sse2_f32 #define xlog10 nsimd_sleef_log10_u10_sse2_f64 #define xlog10f nsimd_sleef_log10_u10_sse2_f32 #define xlog2 nsimd_sleef_log2_u10_sse2_f64 #define xlog2f nsimd_sleef_log2_u10_sse2_f32 #define xlog2_u35 nsimd_sleef_log2_u35_sse2_f64 #define xlog2f_u35 nsimd_sleef_log2_u35_sse2_f32 #define xlog1p nsimd_sleef_log1p_u10_sse2_f64 #define xlog1pf nsimd_sleef_log1p_u10_sse2_f32 #define xsincospi_u05 nsimd_sleef_sincospi_u05_sse2_f64 #define xsincospif_u05 nsimd_sleef_sincospi_u05_sse2_f32 #define xsincospi_u35 nsimd_sleef_sincospi_u35_sse2_f64 #define xsincospif_u35 nsimd_sleef_sincospi_u35_sse2_f32 #define xsinpi_u05 nsimd_sleef_sinpi_u05_sse2_f64 #define xsinpif_u05 nsimd_sleef_sinpi_u05_sse2_f32 #define xcospi_u05 nsimd_sleef_cospi_u05_sse2_f64 #define xcospif_u05 nsimd_sleef_cospi_u05_sse2_f32 #define xldexp nsimd_sleef_ldexp_sse2_f64 #define xldexpf nsimd_sleef_ldexp_sse2_f32 #define xilogb nsimd_sleef_ilogb_sse2_f64 #define xilogbf nsimd_sleef_ilogb_sse2_f32 #define xfma nsimd_sleef_fma_sse2_f64 #define xfmaf nsimd_sleef_fma_sse2_f32 #define xsqrt nsimd_sleef_sqrt_sse2_f64 #define xsqrtf nsimd_sleef_sqrt_sse2_f32 #define xsqrt_u05 nsimd_sleef_sqrt_u05_sse2_f64 #define xsqrtf_u05 nsimd_sleef_sqrt_u05_sse2_f32 #define xsqrt_u35 nsimd_sleef_sqrt_u35_sse2_f64 #define xsqrtf_u35 nsimd_sleef_sqrt_u35_sse2_f32 #define xhypot_u05 nsimd_sleef_hypot_u05_sse2_f64 #define xhypotf_u05 nsimd_sleef_hypot_u05_sse2_f32 #define xhypot_u35 nsimd_sleef_hypot_u35_sse2_f64 #define xhypotf_u35 nsimd_sleef_hypot_u35_sse2_f32 #define xfabs nsimd_sleef_fabs_sse2_f64 #define xfabsf nsimd_sleef_fabs_sse2_f32 #define xcopysign nsimd_sleef_copysign_sse2_f64 #define xcopysignf nsimd_sleef_copysign_sse2_f32 #define xfmax nsimd_sleef_fmax_sse2_f64 #define xfmaxf 
nsimd_sleef_fmax_sse2_f32 #define xfmin nsimd_sleef_fmin_sse2_f64 #define xfminf nsimd_sleef_fmin_sse2_f32 #define xfdim nsimd_sleef_fdim_sse2_f64 #define xfdimf nsimd_sleef_fdim_sse2_f32 #define xtrunc nsimd_sleef_trunc_sse2_f64 #define xtruncf nsimd_sleef_trunc_sse2_f32 #define xfloor nsimd_sleef_floor_sse2_f64 #define xfloorf nsimd_sleef_floor_sse2_f32 #define xceil nsimd_sleef_ceil_sse2_f64 #define xceilf nsimd_sleef_ceil_sse2_f32 #define xround nsimd_sleef_round_sse2_f64 #define xroundf nsimd_sleef_round_sse2_f32 #define xrint nsimd_sleef_rint_sse2_f64 #define xrintf nsimd_sleef_rint_sse2_f32 #define xnextafter nsimd_sleef_nextafter_sse2_f64 #define xnextafterf nsimd_sleef_nextafter_sse2_f32 #define xfrfrexp nsimd_sleef_frfrexp_sse2_f64 #define xfrfrexpf nsimd_sleef_frfrexp_sse2_f32 #define xexpfrexp nsimd_sleef_expfrexp_sse2_f64 #define xexpfrexpf nsimd_sleef_expfrexp_sse2_f32 #define xfmod nsimd_sleef_fmod_sse2_f64 #define xfmodf nsimd_sleef_fmod_sse2_f32 #define xremainder nsimd_sleef_remainder_sse2_f64 #define xremainderf nsimd_sleef_remainder_sse2_f32 #define xmodf nsimd_sleef_modf_sse2_f64 #define xmodff nsimd_sleef_modf_sse2_f32 #define xlgamma_u1 nsimd_sleef_lgamma_u10_sse2_f64 #define xlgammaf_u1 nsimd_sleef_lgamma_u10_sse2_f32 #define xtgamma_u1 nsimd_sleef_tgamma_u10_sse2_f64 #define xtgammaf_u1 nsimd_sleef_tgamma_u10_sse2_f32 #define xerf_u1 nsimd_sleef_erf_u10_sse2_f64 #define xerff_u1 nsimd_sleef_erf_u10_sse2_f32 #define xerfc_u15 nsimd_sleef_erfc_u15_sse2_f64 #define xerfcf_u15 nsimd_sleef_erfc_u15_sse2_f32 #define xgetInt nsimd_sleef_getInt_sse2_f64 #define xgetIntf nsimd_sleef_getInt_sse2_f32 #define xgetPtr nsimd_sleef_getPtr_sse2_f64 #define xgetPtrf nsimd_sleef_getPtr_sse2_f32 #endif #define rempi nsimd_sleef_rempi_sse2 #define rempif nsimd_sleef_rempif_sse2 #define rempisub nsimd_sleef_rempisub_sse2 #define rempisubf nsimd_sleef_rempisubf_sse2 #define gammak nsimd_gammak_sse2 #define gammafk nsimd_gammafk_sse2 #endif #endif 
================================================ FILE: src/renamesse4.h ================================================ #ifndef RENAMESSE4_H
#define RENAMESSE4_H

/* ------------------------------------------------------------------------- */
/* Naming of functions sse42 */

/* Auto-generated rename header: maps SLEEF's internal short function names
   (xsin, xsinf, xsqrt_u05, ...) onto NSIMD-namespaced symbols for the
   SSE4.2 target, so that the same SLEEF kernel sources can be compiled once
   per SIMD extension without symbol clashes.

   Conventions visible in the mappings below:
   - trailing 'f' on the short name selects the f32 variant, otherwise f64;
   - when DETERMINISTIC is defined, the aliases point at the 'd'-suffixed
     accuracy tags (u35d, u10d, ...) instead of the plain ones — presumably
     the bit-reproducible SLEEF kernels (TODO confirm against the generator
     in egg/get_sleef_code.py);
   - the _u05/_u10/_u15/_u35/_u3500 tags follow SLEEF's accuracy-suffix
     convention (NOTE(review): in SLEEF these denote max error in units of
     0.05 ULP steps, e.g. u10 = 1.0 ULP — confirm against SLEEF docs);
   - functions with a single variant (ldexp, ilogb, fma, fabs, copysign,
     fmax/fmin, trunc/floor/ceil/round/rint, nextafter, frfrexp/expfrexp,
     fmod/remainder/modf, getInt/getPtr) carry no accuracy tag and no 'd'
     suffix in either branch.

   Do not edit by hand: this file is generated (see egg/ scripts). */

#ifdef NSIMD_SSE42

/* DETERMINISTIC branch: aliases resolve to the 'd'-tagged kernels. */
#ifdef DETERMINISTIC
#define xsin nsimd_sleef_sin_u35d_sse42_f64
#define xsinf nsimd_sleef_sin_u35d_sse42_f32
#define xcos nsimd_sleef_cos_u35d_sse42_f64
#define xcosf nsimd_sleef_cos_u35d_sse42_f32
#define xsincos nsimd_sleef_sincos_u35d_sse42_f64
#define xsincosf nsimd_sleef_sincos_u35d_sse42_f32
#define xtan nsimd_sleef_tan_u35d_sse42_f64
#define xtanf nsimd_sleef_tan_u35d_sse42_f32
#define xasin nsimd_sleef_asin_u35d_sse42_f64
#define xasinf nsimd_sleef_asin_u35d_sse42_f32
#define xacos nsimd_sleef_acos_u35d_sse42_f64
#define xacosf nsimd_sleef_acos_u35d_sse42_f32
#define xatan nsimd_sleef_atan_u35d_sse42_f64
#define xatanf nsimd_sleef_atan_u35d_sse42_f32
#define xatan2 nsimd_sleef_atan2_u35d_sse42_f64
#define xatan2f nsimd_sleef_atan2_u35d_sse42_f32
#define xlog nsimd_sleef_log_u35d_sse42_f64
#define xlogf nsimd_sleef_log_u35d_sse42_f32
#define xcbrt nsimd_sleef_cbrt_u35d_sse42_f64
#define xcbrtf nsimd_sleef_cbrt_u35d_sse42_f32
/* _u1 short names select the higher-accuracy u10 kernels. */
#define xsin_u1 nsimd_sleef_sin_u10d_sse42_f64
#define xsinf_u1 nsimd_sleef_sin_u10d_sse42_f32
#define xcos_u1 nsimd_sleef_cos_u10d_sse42_f64
#define xcosf_u1 nsimd_sleef_cos_u10d_sse42_f32
#define xsincos_u1 nsimd_sleef_sincos_u10d_sse42_f64
#define xsincosf_u1 nsimd_sleef_sincos_u10d_sse42_f32
#define xtan_u1 nsimd_sleef_tan_u10d_sse42_f64
#define xtanf_u1 nsimd_sleef_tan_u10d_sse42_f32
#define xasin_u1 nsimd_sleef_asin_u10d_sse42_f64
#define xasinf_u1 nsimd_sleef_asin_u10d_sse42_f32
#define xacos_u1 nsimd_sleef_acos_u10d_sse42_f64
#define xacosf_u1 nsimd_sleef_acos_u10d_sse42_f32
#define xatan_u1 nsimd_sleef_atan_u10d_sse42_f64
#define xatanf_u1 nsimd_sleef_atan_u10d_sse42_f32
#define xatan2_u1 nsimd_sleef_atan2_u10d_sse42_f64
#define xatan2f_u1 nsimd_sleef_atan2_u10d_sse42_f32
#define xlog_u1 nsimd_sleef_log_u10d_sse42_f64
#define xlogf_u1 nsimd_sleef_log_u10d_sse42_f32
#define xcbrt_u1 nsimd_sleef_cbrt_u10d_sse42_f64
#define xcbrtf_u1 nsimd_sleef_cbrt_u10d_sse42_f32
#define xexp nsimd_sleef_exp_u10d_sse42_f64
#define xexpf nsimd_sleef_exp_u10d_sse42_f32
#define xpow nsimd_sleef_pow_u10d_sse42_f64
#define xpowf nsimd_sleef_pow_u10d_sse42_f32
#define xsinh nsimd_sleef_sinh_u10d_sse42_f64
#define xsinhf nsimd_sleef_sinh_u10d_sse42_f32
#define xcosh nsimd_sleef_cosh_u10d_sse42_f64
#define xcoshf nsimd_sleef_cosh_u10d_sse42_f32
#define xtanh nsimd_sleef_tanh_u10d_sse42_f64
#define xtanhf nsimd_sleef_tanh_u10d_sse42_f32
#define xsinh_u35 nsimd_sleef_sinh_u35d_sse42_f64
#define xsinhf_u35 nsimd_sleef_sinh_u35d_sse42_f32
#define xcosh_u35 nsimd_sleef_cosh_u35d_sse42_f64
#define xcoshf_u35 nsimd_sleef_cosh_u35d_sse42_f32
#define xtanh_u35 nsimd_sleef_tanh_u35d_sse42_f64
#define xtanhf_u35 nsimd_sleef_tanh_u35d_sse42_f32
#define xfastsin_u3500 nsimd_sleef_fastsin_u3500d_sse42_f64
#define xfastsinf_u3500 nsimd_sleef_fastsin_u3500d_sse42_f32
#define xfastcos_u3500 nsimd_sleef_fastcos_u3500d_sse42_f64
#define xfastcosf_u3500 nsimd_sleef_fastcos_u3500d_sse42_f32
#define xfastpow_u3500 nsimd_sleef_fastpow_u3500d_sse42_f64
#define xfastpowf_u3500 nsimd_sleef_fastpow_u3500d_sse42_f32
#define xasinh nsimd_sleef_asinh_u10d_sse42_f64
#define xasinhf nsimd_sleef_asinh_u10d_sse42_f32
#define xacosh nsimd_sleef_acosh_u10d_sse42_f64
#define xacoshf nsimd_sleef_acosh_u10d_sse42_f32
#define xatanh nsimd_sleef_atanh_u10d_sse42_f64
#define xatanhf nsimd_sleef_atanh_u10d_sse42_f32
#define xexp2 nsimd_sleef_exp2_u10d_sse42_f64
#define xexp2f nsimd_sleef_exp2_u10d_sse42_f32
#define xexp2_u35 nsimd_sleef_exp2_u35d_sse42_f64
#define xexp2f_u35 nsimd_sleef_exp2_u35d_sse42_f32
#define xexp10 nsimd_sleef_exp10_u10d_sse42_f64
#define xexp10f nsimd_sleef_exp10_u10d_sse42_f32
#define xexp10_u35 nsimd_sleef_exp10_u35d_sse42_f64
#define xexp10f_u35 nsimd_sleef_exp10_u35d_sse42_f32
#define xexpm1 nsimd_sleef_expm1_u10d_sse42_f64
#define xexpm1f nsimd_sleef_expm1_u10d_sse42_f32
#define xlog10 nsimd_sleef_log10_u10d_sse42_f64
#define xlog10f nsimd_sleef_log10_u10d_sse42_f32
#define xlog2 nsimd_sleef_log2_u10d_sse42_f64
#define xlog2f nsimd_sleef_log2_u10d_sse42_f32
#define xlog2_u35 nsimd_sleef_log2_u35d_sse42_f64
#define xlog2f_u35 nsimd_sleef_log2_u35d_sse42_f32
#define xlog1p nsimd_sleef_log1p_u10d_sse42_f64
#define xlog1pf nsimd_sleef_log1p_u10d_sse42_f32
#define xsincospi_u05 nsimd_sleef_sincospi_u05d_sse42_f64
#define xsincospif_u05 nsimd_sleef_sincospi_u05d_sse42_f32
#define xsincospi_u35 nsimd_sleef_sincospi_u35d_sse42_f64
#define xsincospif_u35 nsimd_sleef_sincospi_u35d_sse42_f32
#define xsinpi_u05 nsimd_sleef_sinpi_u05d_sse42_f64
#define xsinpif_u05 nsimd_sleef_sinpi_u05d_sse42_f32
#define xcospi_u05 nsimd_sleef_cospi_u05d_sse42_f64
#define xcospif_u05 nsimd_sleef_cospi_u05d_sse42_f32
/* Single-variant functions: no accuracy tag, identical in both branches. */
#define xldexp nsimd_sleef_ldexp_sse42_f64
#define xldexpf nsimd_sleef_ldexp_sse42_f32
#define xilogb nsimd_sleef_ilogb_sse42_f64
#define xilogbf nsimd_sleef_ilogb_sse42_f32
#define xfma nsimd_sleef_fma_sse42_f64
#define xfmaf nsimd_sleef_fma_sse42_f32
#define xsqrt nsimd_sleef_sqrt_sse42_f64
#define xsqrtf nsimd_sleef_sqrt_sse42_f32
#define xsqrt_u05 nsimd_sleef_sqrt_u05d_sse42_f64
#define xsqrtf_u05 nsimd_sleef_sqrt_u05d_sse42_f32
#define xsqrt_u35 nsimd_sleef_sqrt_u35d_sse42_f64
#define xsqrtf_u35 nsimd_sleef_sqrt_u35d_sse42_f32
#define xhypot_u05 nsimd_sleef_hypot_u05d_sse42_f64
#define xhypotf_u05 nsimd_sleef_hypot_u05d_sse42_f32
#define xhypot_u35 nsimd_sleef_hypot_u35d_sse42_f64
#define xhypotf_u35 nsimd_sleef_hypot_u35d_sse42_f32
#define xfabs nsimd_sleef_fabs_sse42_f64
#define xfabsf nsimd_sleef_fabs_sse42_f32
#define xcopysign nsimd_sleef_copysign_sse42_f64
#define xcopysignf nsimd_sleef_copysign_sse42_f32
#define xfmax nsimd_sleef_fmax_sse42_f64
#define xfmaxf nsimd_sleef_fmax_sse42_f32
#define xfmin nsimd_sleef_fmin_sse42_f64
#define xfminf nsimd_sleef_fmin_sse42_f32
#define xfdim nsimd_sleef_fdim_sse42_f64
#define xfdimf nsimd_sleef_fdim_sse42_f32
#define xtrunc nsimd_sleef_trunc_sse42_f64
#define xtruncf nsimd_sleef_trunc_sse42_f32
#define xfloor nsimd_sleef_floor_sse42_f64
#define xfloorf nsimd_sleef_floor_sse42_f32
#define xceil nsimd_sleef_ceil_sse42_f64
#define xceilf nsimd_sleef_ceil_sse42_f32
#define xround nsimd_sleef_round_sse42_f64
#define xroundf nsimd_sleef_round_sse42_f32
#define xrint nsimd_sleef_rint_sse42_f64
#define xrintf nsimd_sleef_rint_sse42_f32
#define xnextafter nsimd_sleef_nextafter_sse42_f64
#define xnextafterf nsimd_sleef_nextafter_sse42_f32
#define xfrfrexp nsimd_sleef_frfrexp_sse42_f64
#define xfrfrexpf nsimd_sleef_frfrexp_sse42_f32
#define xexpfrexp nsimd_sleef_expfrexp_sse42_f64
#define xexpfrexpf nsimd_sleef_expfrexp_sse42_f32
#define xfmod nsimd_sleef_fmod_sse42_f64
#define xfmodf nsimd_sleef_fmod_sse42_f32
#define xremainder nsimd_sleef_remainder_sse42_f64
#define xremainderf nsimd_sleef_remainder_sse42_f32
#define xmodf nsimd_sleef_modf_sse42_f64
#define xmodff nsimd_sleef_modf_sse42_f32
#define xlgamma_u1 nsimd_sleef_lgamma_u10d_sse42_f64
#define xlgammaf_u1 nsimd_sleef_lgamma_u10d_sse42_f32
#define xtgamma_u1 nsimd_sleef_tgamma_u10d_sse42_f64
#define xtgammaf_u1 nsimd_sleef_tgamma_u10d_sse42_f32
#define xerf_u1 nsimd_sleef_erf_u10d_sse42_f64
#define xerff_u1 nsimd_sleef_erf_u10d_sse42_f32
#define xerfc_u15 nsimd_sleef_erfc_u15d_sse42_f64
#define xerfcf_u15 nsimd_sleef_erfc_u15d_sse42_f32
#define xgetInt nsimd_sleef_getInt_sse42_f64
#define xgetIntf nsimd_sleef_getInt_sse42_f32
#define xgetPtr nsimd_sleef_getPtr_sse42_f64
#define xgetPtrf nsimd_sleef_getPtr_sse42_f32
#else
/* Non-deterministic branch: same mapping without the 'd' suffix. */
#define xsin nsimd_sleef_sin_u35_sse42_f64
#define xsinf nsimd_sleef_sin_u35_sse42_f32
#define xcos nsimd_sleef_cos_u35_sse42_f64
#define xcosf nsimd_sleef_cos_u35_sse42_f32
#define xsincos nsimd_sleef_sincos_u35_sse42_f64
#define xsincosf nsimd_sleef_sincos_u35_sse42_f32
#define xtan nsimd_sleef_tan_u35_sse42_f64
#define xtanf nsimd_sleef_tan_u35_sse42_f32
#define xasin nsimd_sleef_asin_u35_sse42_f64
#define xasinf nsimd_sleef_asin_u35_sse42_f32
#define xacos nsimd_sleef_acos_u35_sse42_f64
#define xacosf nsimd_sleef_acos_u35_sse42_f32
#define xatan nsimd_sleef_atan_u35_sse42_f64
#define xatanf nsimd_sleef_atan_u35_sse42_f32
#define xatan2 nsimd_sleef_atan2_u35_sse42_f64
#define xatan2f nsimd_sleef_atan2_u35_sse42_f32
#define xlog nsimd_sleef_log_u35_sse42_f64
#define xlogf nsimd_sleef_log_u35_sse42_f32
#define xcbrt nsimd_sleef_cbrt_u35_sse42_f64
#define xcbrtf nsimd_sleef_cbrt_u35_sse42_f32
#define xsin_u1 nsimd_sleef_sin_u10_sse42_f64
#define xsinf_u1 nsimd_sleef_sin_u10_sse42_f32
#define xcos_u1 nsimd_sleef_cos_u10_sse42_f64
#define xcosf_u1 nsimd_sleef_cos_u10_sse42_f32
#define xsincos_u1 nsimd_sleef_sincos_u10_sse42_f64
#define xsincosf_u1 nsimd_sleef_sincos_u10_sse42_f32
#define xtan_u1 nsimd_sleef_tan_u10_sse42_f64
#define xtanf_u1 nsimd_sleef_tan_u10_sse42_f32
#define xasin_u1 nsimd_sleef_asin_u10_sse42_f64
#define xasinf_u1 nsimd_sleef_asin_u10_sse42_f32
#define xacos_u1 nsimd_sleef_acos_u10_sse42_f64
#define xacosf_u1 nsimd_sleef_acos_u10_sse42_f32
#define xatan_u1 nsimd_sleef_atan_u10_sse42_f64
#define xatanf_u1 nsimd_sleef_atan_u10_sse42_f32
#define xatan2_u1 nsimd_sleef_atan2_u10_sse42_f64
#define xatan2f_u1 nsimd_sleef_atan2_u10_sse42_f32
#define xlog_u1 nsimd_sleef_log_u10_sse42_f64
#define xlogf_u1 nsimd_sleef_log_u10_sse42_f32
#define xcbrt_u1 nsimd_sleef_cbrt_u10_sse42_f64
#define xcbrtf_u1 nsimd_sleef_cbrt_u10_sse42_f32
#define xexp nsimd_sleef_exp_u10_sse42_f64
#define xexpf nsimd_sleef_exp_u10_sse42_f32
#define xpow nsimd_sleef_pow_u10_sse42_f64
#define xpowf nsimd_sleef_pow_u10_sse42_f32
#define xsinh nsimd_sleef_sinh_u10_sse42_f64
#define xsinhf nsimd_sleef_sinh_u10_sse42_f32
#define xcosh nsimd_sleef_cosh_u10_sse42_f64
#define xcoshf nsimd_sleef_cosh_u10_sse42_f32
#define xtanh nsimd_sleef_tanh_u10_sse42_f64
#define xtanhf nsimd_sleef_tanh_u10_sse42_f32
#define xsinh_u35 nsimd_sleef_sinh_u35_sse42_f64
#define xsinhf_u35 nsimd_sleef_sinh_u35_sse42_f32
#define xcosh_u35 nsimd_sleef_cosh_u35_sse42_f64
#define xcoshf_u35 nsimd_sleef_cosh_u35_sse42_f32
#define xtanh_u35 nsimd_sleef_tanh_u35_sse42_f64
#define xtanhf_u35 nsimd_sleef_tanh_u35_sse42_f32
#define xfastsin_u3500 nsimd_sleef_fastsin_u3500_sse42_f64
#define xfastsinf_u3500 nsimd_sleef_fastsin_u3500_sse42_f32
#define xfastcos_u3500 nsimd_sleef_fastcos_u3500_sse42_f64
#define xfastcosf_u3500 nsimd_sleef_fastcos_u3500_sse42_f32
#define xfastpow_u3500 nsimd_sleef_fastpow_u3500_sse42_f64
#define xfastpowf_u3500 nsimd_sleef_fastpow_u3500_sse42_f32
#define xasinh nsimd_sleef_asinh_u10_sse42_f64
#define xasinhf nsimd_sleef_asinh_u10_sse42_f32
#define xacosh nsimd_sleef_acosh_u10_sse42_f64
#define xacoshf nsimd_sleef_acosh_u10_sse42_f32
#define xatanh nsimd_sleef_atanh_u10_sse42_f64
#define xatanhf nsimd_sleef_atanh_u10_sse42_f32
#define xexp2 nsimd_sleef_exp2_u10_sse42_f64
#define xexp2f nsimd_sleef_exp2_u10_sse42_f32
#define xexp2_u35 nsimd_sleef_exp2_u35_sse42_f64
#define xexp2f_u35 nsimd_sleef_exp2_u35_sse42_f32
#define xexp10 nsimd_sleef_exp10_u10_sse42_f64
#define xexp10f nsimd_sleef_exp10_u10_sse42_f32
#define xexp10_u35 nsimd_sleef_exp10_u35_sse42_f64
#define xexp10f_u35 nsimd_sleef_exp10_u35_sse42_f32
#define xexpm1 nsimd_sleef_expm1_u10_sse42_f64
#define xexpm1f nsimd_sleef_expm1_u10_sse42_f32
#define xlog10 nsimd_sleef_log10_u10_sse42_f64
#define xlog10f nsimd_sleef_log10_u10_sse42_f32
#define xlog2 nsimd_sleef_log2_u10_sse42_f64
#define xlog2f nsimd_sleef_log2_u10_sse42_f32
#define xlog2_u35 nsimd_sleef_log2_u35_sse42_f64
#define xlog2f_u35 nsimd_sleef_log2_u35_sse42_f32
#define xlog1p nsimd_sleef_log1p_u10_sse42_f64
#define xlog1pf nsimd_sleef_log1p_u10_sse42_f32
#define xsincospi_u05 nsimd_sleef_sincospi_u05_sse42_f64
#define xsincospif_u05 nsimd_sleef_sincospi_u05_sse42_f32
#define xsincospi_u35 nsimd_sleef_sincospi_u35_sse42_f64
#define xsincospif_u35 nsimd_sleef_sincospi_u35_sse42_f32
#define xsinpi_u05 nsimd_sleef_sinpi_u05_sse42_f64
#define xsinpif_u05 nsimd_sleef_sinpi_u05_sse42_f32
#define xcospi_u05 nsimd_sleef_cospi_u05_sse42_f64
#define xcospif_u05 nsimd_sleef_cospi_u05_sse42_f32
#define xldexp nsimd_sleef_ldexp_sse42_f64
#define xldexpf nsimd_sleef_ldexp_sse42_f32
#define xilogb nsimd_sleef_ilogb_sse42_f64
#define xilogbf nsimd_sleef_ilogb_sse42_f32
#define xfma nsimd_sleef_fma_sse42_f64
#define xfmaf nsimd_sleef_fma_sse42_f32
#define xsqrt nsimd_sleef_sqrt_sse42_f64
#define xsqrtf nsimd_sleef_sqrt_sse42_f32
#define xsqrt_u05 nsimd_sleef_sqrt_u05_sse42_f64
#define xsqrtf_u05 nsimd_sleef_sqrt_u05_sse42_f32
#define xsqrt_u35 nsimd_sleef_sqrt_u35_sse42_f64
#define xsqrtf_u35 nsimd_sleef_sqrt_u35_sse42_f32
#define xhypot_u05 nsimd_sleef_hypot_u05_sse42_f64
#define xhypotf_u05 nsimd_sleef_hypot_u05_sse42_f32
#define xhypot_u35 nsimd_sleef_hypot_u35_sse42_f64
#define xhypotf_u35 nsimd_sleef_hypot_u35_sse42_f32
#define xfabs nsimd_sleef_fabs_sse42_f64
#define xfabsf nsimd_sleef_fabs_sse42_f32
#define xcopysign nsimd_sleef_copysign_sse42_f64
#define xcopysignf nsimd_sleef_copysign_sse42_f32
#define xfmax nsimd_sleef_fmax_sse42_f64
#define xfmaxf nsimd_sleef_fmax_sse42_f32
#define xfmin nsimd_sleef_fmin_sse42_f64
#define xfminf nsimd_sleef_fmin_sse42_f32
#define xfdim nsimd_sleef_fdim_sse42_f64
#define xfdimf nsimd_sleef_fdim_sse42_f32
#define xtrunc nsimd_sleef_trunc_sse42_f64
#define xtruncf nsimd_sleef_trunc_sse42_f32
#define xfloor nsimd_sleef_floor_sse42_f64
#define xfloorf nsimd_sleef_floor_sse42_f32
#define xceil nsimd_sleef_ceil_sse42_f64
#define xceilf nsimd_sleef_ceil_sse42_f32
#define xround nsimd_sleef_round_sse42_f64
#define xroundf nsimd_sleef_round_sse42_f32
#define xrint nsimd_sleef_rint_sse42_f64
#define xrintf nsimd_sleef_rint_sse42_f32
#define xnextafter nsimd_sleef_nextafter_sse42_f64
#define xnextafterf nsimd_sleef_nextafter_sse42_f32
#define xfrfrexp nsimd_sleef_frfrexp_sse42_f64
#define xfrfrexpf nsimd_sleef_frfrexp_sse42_f32
#define xexpfrexp nsimd_sleef_expfrexp_sse42_f64
#define xexpfrexpf nsimd_sleef_expfrexp_sse42_f32
#define xfmod nsimd_sleef_fmod_sse42_f64
#define xfmodf nsimd_sleef_fmod_sse42_f32
#define xremainder nsimd_sleef_remainder_sse42_f64
#define xremainderf nsimd_sleef_remainder_sse42_f32
#define xmodf nsimd_sleef_modf_sse42_f64
#define xmodff nsimd_sleef_modf_sse42_f32
#define xlgamma_u1 nsimd_sleef_lgamma_u10_sse42_f64
#define xlgammaf_u1 nsimd_sleef_lgamma_u10_sse42_f32
#define xtgamma_u1 nsimd_sleef_tgamma_u10_sse42_f64
#define xtgammaf_u1 nsimd_sleef_tgamma_u10_sse42_f32
#define xerf_u1 nsimd_sleef_erf_u10_sse42_f64
#define xerff_u1 nsimd_sleef_erf_u10_sse42_f32
#define xerfc_u15 nsimd_sleef_erfc_u15_sse42_f64
#define xerfcf_u15 nsimd_sleef_erfc_u15_sse42_f32
#define xgetInt nsimd_sleef_getInt_sse42_f64
#define xgetIntf nsimd_sleef_getInt_sse42_f32
#define xgetPtr nsimd_sleef_getPtr_sse42_f64
#define xgetPtrf nsimd_sleef_getPtr_sse42_f32
#endif

/* Internal SLEEF helpers (Payne-Hanek reduction tables and gamma kernels
   per their names — NOTE(review): semantics inferred from SLEEF naming,
   confirm against SLEEF sources): shared by both branches. */
#define rempi nsimd_sleef_rempi_sse42
#define rempif nsimd_sleef_rempif_sse42
#define rempisub nsimd_sleef_rempisub_sse42
#define rempisubf nsimd_sleef_rempisubf_sse42
#define gammak nsimd_gammak_sse42
#define gammafk nsimd_gammafk_sse42

#endif

#endif
================================================ FILE: src/renamesve.h ================================================ #ifndef RENAMESVE_H
#define RENAMESVE_H

/* ------------------------------------------------------------------------- */
/* Naming of functions sve128 */

#ifdef NSIMD_SVE128
#ifdef DETERMINISTIC
#define xsin nsimd_sleef_sin_u35d_sve128_f64
#define xsinf nsimd_sleef_sin_u35d_sve128_f32
#define xcos nsimd_sleef_cos_u35d_sve128_f64
#define xcosf nsimd_sleef_cos_u35d_sve128_f32
#define xsincos nsimd_sleef_sincos_u35d_sve128_f64
#define xsincosf nsimd_sleef_sincos_u35d_sve128_f32
#define xtan nsimd_sleef_tan_u35d_sve128_f64
#define xtanf
nsimd_sleef_tan_u35d_sve128_f32 #define xasin nsimd_sleef_asin_u35d_sve128_f64 #define xasinf nsimd_sleef_asin_u35d_sve128_f32 #define xacos nsimd_sleef_acos_u35d_sve128_f64 #define xacosf nsimd_sleef_acos_u35d_sve128_f32 #define xatan nsimd_sleef_atan_u35d_sve128_f64 #define xatanf nsimd_sleef_atan_u35d_sve128_f32 #define xatan2 nsimd_sleef_atan2_u35d_sve128_f64 #define xatan2f nsimd_sleef_atan2_u35d_sve128_f32 #define xlog nsimd_sleef_log_u35d_sve128_f64 #define xlogf nsimd_sleef_log_u35d_sve128_f32 #define xcbrt nsimd_sleef_cbrt_u35d_sve128_f64 #define xcbrtf nsimd_sleef_cbrt_u35d_sve128_f32 #define xsin_u1 nsimd_sleef_sin_u10d_sve128_f64 #define xsinf_u1 nsimd_sleef_sin_u10d_sve128_f32 #define xcos_u1 nsimd_sleef_cos_u10d_sve128_f64 #define xcosf_u1 nsimd_sleef_cos_u10d_sve128_f32 #define xsincos_u1 nsimd_sleef_sincos_u10d_sve128_f64 #define xsincosf_u1 nsimd_sleef_sincos_u10d_sve128_f32 #define xtan_u1 nsimd_sleef_tan_u10d_sve128_f64 #define xtanf_u1 nsimd_sleef_tan_u10d_sve128_f32 #define xasin_u1 nsimd_sleef_asin_u10d_sve128_f64 #define xasinf_u1 nsimd_sleef_asin_u10d_sve128_f32 #define xacos_u1 nsimd_sleef_acos_u10d_sve128_f64 #define xacosf_u1 nsimd_sleef_acos_u10d_sve128_f32 #define xatan_u1 nsimd_sleef_atan_u10d_sve128_f64 #define xatanf_u1 nsimd_sleef_atan_u10d_sve128_f32 #define xatan2_u1 nsimd_sleef_atan2_u10d_sve128_f64 #define xatan2f_u1 nsimd_sleef_atan2_u10d_sve128_f32 #define xlog_u1 nsimd_sleef_log_u10d_sve128_f64 #define xlogf_u1 nsimd_sleef_log_u10d_sve128_f32 #define xcbrt_u1 nsimd_sleef_cbrt_u10d_sve128_f64 #define xcbrtf_u1 nsimd_sleef_cbrt_u10d_sve128_f32 #define xexp nsimd_sleef_exp_u10d_sve128_f64 #define xexpf nsimd_sleef_exp_u10d_sve128_f32 #define xpow nsimd_sleef_pow_u10d_sve128_f64 #define xpowf nsimd_sleef_pow_u10d_sve128_f32 #define xsinh nsimd_sleef_sinh_u10d_sve128_f64 #define xsinhf nsimd_sleef_sinh_u10d_sve128_f32 #define xcosh nsimd_sleef_cosh_u10d_sve128_f64 #define xcoshf nsimd_sleef_cosh_u10d_sve128_f32 #define xtanh 
nsimd_sleef_tanh_u10d_sve128_f64 #define xtanhf nsimd_sleef_tanh_u10d_sve128_f32 #define xsinh_u35 nsimd_sleef_sinh_u35d_sve128_f64 #define xsinhf_u35 nsimd_sleef_sinh_u35d_sve128_f32 #define xcosh_u35 nsimd_sleef_cosh_u35d_sve128_f64 #define xcoshf_u35 nsimd_sleef_cosh_u35d_sve128_f32 #define xtanh_u35 nsimd_sleef_tanh_u35d_sve128_f64 #define xtanhf_u35 nsimd_sleef_tanh_u35d_sve128_f32 #define xfastsin_u3500 nsimd_sleef_fastsin_u3500d_sve128_f64 #define xfastsinf_u3500 nsimd_sleef_fastsin_u3500d_sve128_f32 #define xfastcos_u3500 nsimd_sleef_fastcos_u3500d_sve128_f64 #define xfastcosf_u3500 nsimd_sleef_fastcos_u3500d_sve128_f32 #define xfastpow_u3500 nsimd_sleef_fastpow_u3500d_sve128_f64 #define xfastpowf_u3500 nsimd_sleef_fastpow_u3500d_sve128_f32 #define xasinh nsimd_sleef_asinh_u10d_sve128_f64 #define xasinhf nsimd_sleef_asinh_u10d_sve128_f32 #define xacosh nsimd_sleef_acosh_u10d_sve128_f64 #define xacoshf nsimd_sleef_acosh_u10d_sve128_f32 #define xatanh nsimd_sleef_atanh_u10d_sve128_f64 #define xatanhf nsimd_sleef_atanh_u10d_sve128_f32 #define xexp2 nsimd_sleef_exp2_u10d_sve128_f64 #define xexp2f nsimd_sleef_exp2_u10d_sve128_f32 #define xexp2_u35 nsimd_sleef_exp2_u35d_sve128_f64 #define xexp2f_u35 nsimd_sleef_exp2_u35d_sve128_f32 #define xexp10 nsimd_sleef_exp10_u10d_sve128_f64 #define xexp10f nsimd_sleef_exp10_u10d_sve128_f32 #define xexp10_u35 nsimd_sleef_exp10_u35d_sve128_f64 #define xexp10f_u35 nsimd_sleef_exp10_u35d_sve128_f32 #define xexpm1 nsimd_sleef_expm1_u10d_sve128_f64 #define xexpm1f nsimd_sleef_expm1_u10d_sve128_f32 #define xlog10 nsimd_sleef_log10_u10d_sve128_f64 #define xlog10f nsimd_sleef_log10_u10d_sve128_f32 #define xlog2 nsimd_sleef_log2_u10d_sve128_f64 #define xlog2f nsimd_sleef_log2_u10d_sve128_f32 #define xlog2_u35 nsimd_sleef_log2_u35d_sve128_f64 #define xlog2f_u35 nsimd_sleef_log2_u35d_sve128_f32 #define xlog1p nsimd_sleef_log1p_u10d_sve128_f64 #define xlog1pf nsimd_sleef_log1p_u10d_sve128_f32 #define xsincospi_u05 
nsimd_sleef_sincospi_u05d_sve128_f64 #define xsincospif_u05 nsimd_sleef_sincospi_u05d_sve128_f32 #define xsincospi_u35 nsimd_sleef_sincospi_u35d_sve128_f64 #define xsincospif_u35 nsimd_sleef_sincospi_u35d_sve128_f32 #define xsinpi_u05 nsimd_sleef_sinpi_u05d_sve128_f64 #define xsinpif_u05 nsimd_sleef_sinpi_u05d_sve128_f32 #define xcospi_u05 nsimd_sleef_cospi_u05d_sve128_f64 #define xcospif_u05 nsimd_sleef_cospi_u05d_sve128_f32 #define xldexp nsimd_sleef_ldexp_sve128_f64 #define xldexpf nsimd_sleef_ldexp_sve128_f32 #define xilogb nsimd_sleef_ilogb_sve128_f64 #define xilogbf nsimd_sleef_ilogb_sve128_f32 #define xfma nsimd_sleef_fma_sve128_f64 #define xfmaf nsimd_sleef_fma_sve128_f32 #define xsqrt nsimd_sleef_sqrt_sve128_f64 #define xsqrtf nsimd_sleef_sqrt_sve128_f32 #define xsqrt_u05 nsimd_sleef_sqrt_u05d_sve128_f64 #define xsqrtf_u05 nsimd_sleef_sqrt_u05d_sve128_f32 #define xsqrt_u35 nsimd_sleef_sqrt_u35d_sve128_f64 #define xsqrtf_u35 nsimd_sleef_sqrt_u35d_sve128_f32 #define xhypot_u05 nsimd_sleef_hypot_u05d_sve128_f64 #define xhypotf_u05 nsimd_sleef_hypot_u05d_sve128_f32 #define xhypot_u35 nsimd_sleef_hypot_u35d_sve128_f64 #define xhypotf_u35 nsimd_sleef_hypot_u35d_sve128_f32 #define xfabs nsimd_sleef_fabs_sve128_f64 #define xfabsf nsimd_sleef_fabs_sve128_f32 #define xcopysign nsimd_sleef_copysign_sve128_f64 #define xcopysignf nsimd_sleef_copysign_sve128_f32 #define xfmax nsimd_sleef_fmax_sve128_f64 #define xfmaxf nsimd_sleef_fmax_sve128_f32 #define xfmin nsimd_sleef_fmin_sve128_f64 #define xfminf nsimd_sleef_fmin_sve128_f32 #define xfdim nsimd_sleef_fdim_sve128_f64 #define xfdimf nsimd_sleef_fdim_sve128_f32 #define xtrunc nsimd_sleef_trunc_sve128_f64 #define xtruncf nsimd_sleef_trunc_sve128_f32 #define xfloor nsimd_sleef_floor_sve128_f64 #define xfloorf nsimd_sleef_floor_sve128_f32 #define xceil nsimd_sleef_ceil_sve128_f64 #define xceilf nsimd_sleef_ceil_sve128_f32 #define xround nsimd_sleef_round_sve128_f64 #define xroundf nsimd_sleef_round_sve128_f32 #define 
xrint nsimd_sleef_rint_sve128_f64 #define xrintf nsimd_sleef_rint_sve128_f32 #define xnextafter nsimd_sleef_nextafter_sve128_f64 #define xnextafterf nsimd_sleef_nextafter_sve128_f32 #define xfrfrexp nsimd_sleef_frfrexp_sve128_f64 #define xfrfrexpf nsimd_sleef_frfrexp_sve128_f32 #define xexpfrexp nsimd_sleef_expfrexp_sve128_f64 #define xexpfrexpf nsimd_sleef_expfrexp_sve128_f32 #define xfmod nsimd_sleef_fmod_sve128_f64 #define xfmodf nsimd_sleef_fmod_sve128_f32 #define xremainder nsimd_sleef_remainder_sve128_f64 #define xremainderf nsimd_sleef_remainder_sve128_f32 #define xmodf nsimd_sleef_modf_sve128_f64 #define xmodff nsimd_sleef_modf_sve128_f32 #define xlgamma_u1 nsimd_sleef_lgamma_u10d_sve128_f64 #define xlgammaf_u1 nsimd_sleef_lgamma_u10d_sve128_f32 #define xtgamma_u1 nsimd_sleef_tgamma_u10d_sve128_f64 #define xtgammaf_u1 nsimd_sleef_tgamma_u10d_sve128_f32 #define xerf_u1 nsimd_sleef_erf_u10d_sve128_f64 #define xerff_u1 nsimd_sleef_erf_u10d_sve128_f32 #define xerfc_u15 nsimd_sleef_erfc_u15d_sve128_f64 #define xerfcf_u15 nsimd_sleef_erfc_u15d_sve128_f32 #define xgetInt nsimd_sleef_getInt_sve128_f64 #define xgetIntf nsimd_sleef_getInt_sve128_f32 #define xgetPtr nsimd_sleef_getPtr_sve128_f64 #define xgetPtrf nsimd_sleef_getPtr_sve128_f32 #else #define xsin nsimd_sleef_sin_u35_sve128_f64 #define xsinf nsimd_sleef_sin_u35_sve128_f32 #define xcos nsimd_sleef_cos_u35_sve128_f64 #define xcosf nsimd_sleef_cos_u35_sve128_f32 #define xsincos nsimd_sleef_sincos_u35_sve128_f64 #define xsincosf nsimd_sleef_sincos_u35_sve128_f32 #define xtan nsimd_sleef_tan_u35_sve128_f64 #define xtanf nsimd_sleef_tan_u35_sve128_f32 #define xasin nsimd_sleef_asin_u35_sve128_f64 #define xasinf nsimd_sleef_asin_u35_sve128_f32 #define xacos nsimd_sleef_acos_u35_sve128_f64 #define xacosf nsimd_sleef_acos_u35_sve128_f32 #define xatan nsimd_sleef_atan_u35_sve128_f64 #define xatanf nsimd_sleef_atan_u35_sve128_f32 #define xatan2 nsimd_sleef_atan2_u35_sve128_f64 #define xatan2f 
nsimd_sleef_atan2_u35_sve128_f32 #define xlog nsimd_sleef_log_u35_sve128_f64 #define xlogf nsimd_sleef_log_u35_sve128_f32 #define xcbrt nsimd_sleef_cbrt_u35_sve128_f64 #define xcbrtf nsimd_sleef_cbrt_u35_sve128_f32 #define xsin_u1 nsimd_sleef_sin_u10_sve128_f64 #define xsinf_u1 nsimd_sleef_sin_u10_sve128_f32 #define xcos_u1 nsimd_sleef_cos_u10_sve128_f64 #define xcosf_u1 nsimd_sleef_cos_u10_sve128_f32 #define xsincos_u1 nsimd_sleef_sincos_u10_sve128_f64 #define xsincosf_u1 nsimd_sleef_sincos_u10_sve128_f32 #define xtan_u1 nsimd_sleef_tan_u10_sve128_f64 #define xtanf_u1 nsimd_sleef_tan_u10_sve128_f32 #define xasin_u1 nsimd_sleef_asin_u10_sve128_f64 #define xasinf_u1 nsimd_sleef_asin_u10_sve128_f32 #define xacos_u1 nsimd_sleef_acos_u10_sve128_f64 #define xacosf_u1 nsimd_sleef_acos_u10_sve128_f32 #define xatan_u1 nsimd_sleef_atan_u10_sve128_f64 #define xatanf_u1 nsimd_sleef_atan_u10_sve128_f32 #define xatan2_u1 nsimd_sleef_atan2_u10_sve128_f64 #define xatan2f_u1 nsimd_sleef_atan2_u10_sve128_f32 #define xlog_u1 nsimd_sleef_log_u10_sve128_f64 #define xlogf_u1 nsimd_sleef_log_u10_sve128_f32 #define xcbrt_u1 nsimd_sleef_cbrt_u10_sve128_f64 #define xcbrtf_u1 nsimd_sleef_cbrt_u10_sve128_f32 #define xexp nsimd_sleef_exp_u10_sve128_f64 #define xexpf nsimd_sleef_exp_u10_sve128_f32 #define xpow nsimd_sleef_pow_u10_sve128_f64 #define xpowf nsimd_sleef_pow_u10_sve128_f32 #define xsinh nsimd_sleef_sinh_u10_sve128_f64 #define xsinhf nsimd_sleef_sinh_u10_sve128_f32 #define xcosh nsimd_sleef_cosh_u10_sve128_f64 #define xcoshf nsimd_sleef_cosh_u10_sve128_f32 #define xtanh nsimd_sleef_tanh_u10_sve128_f64 #define xtanhf nsimd_sleef_tanh_u10_sve128_f32 #define xsinh_u35 nsimd_sleef_sinh_u35_sve128_f64 #define xsinhf_u35 nsimd_sleef_sinh_u35_sve128_f32 #define xcosh_u35 nsimd_sleef_cosh_u35_sve128_f64 #define xcoshf_u35 nsimd_sleef_cosh_u35_sve128_f32 #define xtanh_u35 nsimd_sleef_tanh_u35_sve128_f64 #define xtanhf_u35 nsimd_sleef_tanh_u35_sve128_f32 #define xfastsin_u3500 
nsimd_sleef_fastsin_u3500_sve128_f64 #define xfastsinf_u3500 nsimd_sleef_fastsin_u3500_sve128_f32 #define xfastcos_u3500 nsimd_sleef_fastcos_u3500_sve128_f64 #define xfastcosf_u3500 nsimd_sleef_fastcos_u3500_sve128_f32 #define xfastpow_u3500 nsimd_sleef_fastpow_u3500_sve128_f64 #define xfastpowf_u3500 nsimd_sleef_fastpow_u3500_sve128_f32 #define xasinh nsimd_sleef_asinh_u10_sve128_f64 #define xasinhf nsimd_sleef_asinh_u10_sve128_f32 #define xacosh nsimd_sleef_acosh_u10_sve128_f64 #define xacoshf nsimd_sleef_acosh_u10_sve128_f32 #define xatanh nsimd_sleef_atanh_u10_sve128_f64 #define xatanhf nsimd_sleef_atanh_u10_sve128_f32 #define xexp2 nsimd_sleef_exp2_u10_sve128_f64 #define xexp2f nsimd_sleef_exp2_u10_sve128_f32 #define xexp2_u35 nsimd_sleef_exp2_u35_sve128_f64 #define xexp2f_u35 nsimd_sleef_exp2_u35_sve128_f32 #define xexp10 nsimd_sleef_exp10_u10_sve128_f64 #define xexp10f nsimd_sleef_exp10_u10_sve128_f32 #define xexp10_u35 nsimd_sleef_exp10_u35_sve128_f64 #define xexp10f_u35 nsimd_sleef_exp10_u35_sve128_f32 #define xexpm1 nsimd_sleef_expm1_u10_sve128_f64 #define xexpm1f nsimd_sleef_expm1_u10_sve128_f32 #define xlog10 nsimd_sleef_log10_u10_sve128_f64 #define xlog10f nsimd_sleef_log10_u10_sve128_f32 #define xlog2 nsimd_sleef_log2_u10_sve128_f64 #define xlog2f nsimd_sleef_log2_u10_sve128_f32 #define xlog2_u35 nsimd_sleef_log2_u35_sve128_f64 #define xlog2f_u35 nsimd_sleef_log2_u35_sve128_f32 #define xlog1p nsimd_sleef_log1p_u10_sve128_f64 #define xlog1pf nsimd_sleef_log1p_u10_sve128_f32 #define xsincospi_u05 nsimd_sleef_sincospi_u05_sve128_f64 #define xsincospif_u05 nsimd_sleef_sincospi_u05_sve128_f32 #define xsincospi_u35 nsimd_sleef_sincospi_u35_sve128_f64 #define xsincospif_u35 nsimd_sleef_sincospi_u35_sve128_f32 #define xsinpi_u05 nsimd_sleef_sinpi_u05_sve128_f64 #define xsinpif_u05 nsimd_sleef_sinpi_u05_sve128_f32 #define xcospi_u05 nsimd_sleef_cospi_u05_sve128_f64 #define xcospif_u05 nsimd_sleef_cospi_u05_sve128_f32 #define xldexp nsimd_sleef_ldexp_sve128_f64 
#define xldexpf nsimd_sleef_ldexp_sve128_f32 #define xilogb nsimd_sleef_ilogb_sve128_f64 #define xilogbf nsimd_sleef_ilogb_sve128_f32 #define xfma nsimd_sleef_fma_sve128_f64 #define xfmaf nsimd_sleef_fma_sve128_f32 #define xsqrt nsimd_sleef_sqrt_sve128_f64 #define xsqrtf nsimd_sleef_sqrt_sve128_f32 #define xsqrt_u05 nsimd_sleef_sqrt_u05_sve128_f64 #define xsqrtf_u05 nsimd_sleef_sqrt_u05_sve128_f32 #define xsqrt_u35 nsimd_sleef_sqrt_u35_sve128_f64 #define xsqrtf_u35 nsimd_sleef_sqrt_u35_sve128_f32 #define xhypot_u05 nsimd_sleef_hypot_u05_sve128_f64 #define xhypotf_u05 nsimd_sleef_hypot_u05_sve128_f32 #define xhypot_u35 nsimd_sleef_hypot_u35_sve128_f64 #define xhypotf_u35 nsimd_sleef_hypot_u35_sve128_f32 #define xfabs nsimd_sleef_fabs_sve128_f64 #define xfabsf nsimd_sleef_fabs_sve128_f32 #define xcopysign nsimd_sleef_copysign_sve128_f64 #define xcopysignf nsimd_sleef_copysign_sve128_f32 #define xfmax nsimd_sleef_fmax_sve128_f64 #define xfmaxf nsimd_sleef_fmax_sve128_f32 #define xfmin nsimd_sleef_fmin_sve128_f64 #define xfminf nsimd_sleef_fmin_sve128_f32 #define xfdim nsimd_sleef_fdim_sve128_f64 #define xfdimf nsimd_sleef_fdim_sve128_f32 #define xtrunc nsimd_sleef_trunc_sve128_f64 #define xtruncf nsimd_sleef_trunc_sve128_f32 #define xfloor nsimd_sleef_floor_sve128_f64 #define xfloorf nsimd_sleef_floor_sve128_f32 #define xceil nsimd_sleef_ceil_sve128_f64 #define xceilf nsimd_sleef_ceil_sve128_f32 #define xround nsimd_sleef_round_sve128_f64 #define xroundf nsimd_sleef_round_sve128_f32 #define xrint nsimd_sleef_rint_sve128_f64 #define xrintf nsimd_sleef_rint_sve128_f32 #define xnextafter nsimd_sleef_nextafter_sve128_f64 #define xnextafterf nsimd_sleef_nextafter_sve128_f32 #define xfrfrexp nsimd_sleef_frfrexp_sve128_f64 #define xfrfrexpf nsimd_sleef_frfrexp_sve128_f32 #define xexpfrexp nsimd_sleef_expfrexp_sve128_f64 #define xexpfrexpf nsimd_sleef_expfrexp_sve128_f32 #define xfmod nsimd_sleef_fmod_sve128_f64 #define xfmodf nsimd_sleef_fmod_sve128_f32 #define xremainder 
nsimd_sleef_remainder_sve128_f64 #define xremainderf nsimd_sleef_remainder_sve128_f32 #define xmodf nsimd_sleef_modf_sve128_f64 #define xmodff nsimd_sleef_modf_sve128_f32 #define xlgamma_u1 nsimd_sleef_lgamma_u10_sve128_f64 #define xlgammaf_u1 nsimd_sleef_lgamma_u10_sve128_f32 #define xtgamma_u1 nsimd_sleef_tgamma_u10_sve128_f64 #define xtgammaf_u1 nsimd_sleef_tgamma_u10_sve128_f32 #define xerf_u1 nsimd_sleef_erf_u10_sve128_f64 #define xerff_u1 nsimd_sleef_erf_u10_sve128_f32 #define xerfc_u15 nsimd_sleef_erfc_u15_sve128_f64 #define xerfcf_u15 nsimd_sleef_erfc_u15_sve128_f32 #define xgetInt nsimd_sleef_getInt_sve128_f64 #define xgetIntf nsimd_sleef_getInt_sve128_f32 #define xgetPtr nsimd_sleef_getPtr_sve128_f64 #define xgetPtrf nsimd_sleef_getPtr_sve128_f32 #endif #define rempi nsimd_sleef_rempi_sve128 #define rempif nsimd_sleef_rempif_sve128 #define rempisub nsimd_sleef_rempisub_sve128 #define rempisubf nsimd_sleef_rempisubf_sve128 #define gammak nsimd_gammak_sve128 #define gammafk nsimd_gammafk_sve128 #endif /* ------------------------------------------------------------------------- */ /* Naming of functions sve256 */ #ifdef NSIMD_SVE256 #ifdef DETERMINISTIC #define xsin nsimd_sleef_sin_u35d_sve256_f64 #define xsinf nsimd_sleef_sin_u35d_sve256_f32 #define xcos nsimd_sleef_cos_u35d_sve256_f64 #define xcosf nsimd_sleef_cos_u35d_sve256_f32 #define xsincos nsimd_sleef_sincos_u35d_sve256_f64 #define xsincosf nsimd_sleef_sincos_u35d_sve256_f32 #define xtan nsimd_sleef_tan_u35d_sve256_f64 #define xtanf nsimd_sleef_tan_u35d_sve256_f32 #define xasin nsimd_sleef_asin_u35d_sve256_f64 #define xasinf nsimd_sleef_asin_u35d_sve256_f32 #define xacos nsimd_sleef_acos_u35d_sve256_f64 #define xacosf nsimd_sleef_acos_u35d_sve256_f32 #define xatan nsimd_sleef_atan_u35d_sve256_f64 #define xatanf nsimd_sleef_atan_u35d_sve256_f32 #define xatan2 nsimd_sleef_atan2_u35d_sve256_f64 #define xatan2f nsimd_sleef_atan2_u35d_sve256_f32 #define xlog nsimd_sleef_log_u35d_sve256_f64 #define xlogf 
nsimd_sleef_log_u35d_sve256_f32 #define xcbrt nsimd_sleef_cbrt_u35d_sve256_f64 #define xcbrtf nsimd_sleef_cbrt_u35d_sve256_f32 #define xsin_u1 nsimd_sleef_sin_u10d_sve256_f64 #define xsinf_u1 nsimd_sleef_sin_u10d_sve256_f32 #define xcos_u1 nsimd_sleef_cos_u10d_sve256_f64 #define xcosf_u1 nsimd_sleef_cos_u10d_sve256_f32 #define xsincos_u1 nsimd_sleef_sincos_u10d_sve256_f64 #define xsincosf_u1 nsimd_sleef_sincos_u10d_sve256_f32 #define xtan_u1 nsimd_sleef_tan_u10d_sve256_f64 #define xtanf_u1 nsimd_sleef_tan_u10d_sve256_f32 #define xasin_u1 nsimd_sleef_asin_u10d_sve256_f64 #define xasinf_u1 nsimd_sleef_asin_u10d_sve256_f32 #define xacos_u1 nsimd_sleef_acos_u10d_sve256_f64 #define xacosf_u1 nsimd_sleef_acos_u10d_sve256_f32 #define xatan_u1 nsimd_sleef_atan_u10d_sve256_f64 #define xatanf_u1 nsimd_sleef_atan_u10d_sve256_f32 #define xatan2_u1 nsimd_sleef_atan2_u10d_sve256_f64 #define xatan2f_u1 nsimd_sleef_atan2_u10d_sve256_f32 #define xlog_u1 nsimd_sleef_log_u10d_sve256_f64 #define xlogf_u1 nsimd_sleef_log_u10d_sve256_f32 #define xcbrt_u1 nsimd_sleef_cbrt_u10d_sve256_f64 #define xcbrtf_u1 nsimd_sleef_cbrt_u10d_sve256_f32 #define xexp nsimd_sleef_exp_u10d_sve256_f64 #define xexpf nsimd_sleef_exp_u10d_sve256_f32 #define xpow nsimd_sleef_pow_u10d_sve256_f64 #define xpowf nsimd_sleef_pow_u10d_sve256_f32 #define xsinh nsimd_sleef_sinh_u10d_sve256_f64 #define xsinhf nsimd_sleef_sinh_u10d_sve256_f32 #define xcosh nsimd_sleef_cosh_u10d_sve256_f64 #define xcoshf nsimd_sleef_cosh_u10d_sve256_f32 #define xtanh nsimd_sleef_tanh_u10d_sve256_f64 #define xtanhf nsimd_sleef_tanh_u10d_sve256_f32 #define xsinh_u35 nsimd_sleef_sinh_u35d_sve256_f64 #define xsinhf_u35 nsimd_sleef_sinh_u35d_sve256_f32 #define xcosh_u35 nsimd_sleef_cosh_u35d_sve256_f64 #define xcoshf_u35 nsimd_sleef_cosh_u35d_sve256_f32 #define xtanh_u35 nsimd_sleef_tanh_u35d_sve256_f64 #define xtanhf_u35 nsimd_sleef_tanh_u35d_sve256_f32 #define xfastsin_u3500 nsimd_sleef_fastsin_u3500d_sve256_f64 #define xfastsinf_u3500 
nsimd_sleef_fastsin_u3500d_sve256_f32 #define xfastcos_u3500 nsimd_sleef_fastcos_u3500d_sve256_f64 #define xfastcosf_u3500 nsimd_sleef_fastcos_u3500d_sve256_f32 #define xfastpow_u3500 nsimd_sleef_fastpow_u3500d_sve256_f64 #define xfastpowf_u3500 nsimd_sleef_fastpow_u3500d_sve256_f32 #define xasinh nsimd_sleef_asinh_u10d_sve256_f64 #define xasinhf nsimd_sleef_asinh_u10d_sve256_f32 #define xacosh nsimd_sleef_acosh_u10d_sve256_f64 #define xacoshf nsimd_sleef_acosh_u10d_sve256_f32 #define xatanh nsimd_sleef_atanh_u10d_sve256_f64 #define xatanhf nsimd_sleef_atanh_u10d_sve256_f32 #define xexp2 nsimd_sleef_exp2_u10d_sve256_f64 #define xexp2f nsimd_sleef_exp2_u10d_sve256_f32 #define xexp2_u35 nsimd_sleef_exp2_u35d_sve256_f64 #define xexp2f_u35 nsimd_sleef_exp2_u35d_sve256_f32 #define xexp10 nsimd_sleef_exp10_u10d_sve256_f64 #define xexp10f nsimd_sleef_exp10_u10d_sve256_f32 #define xexp10_u35 nsimd_sleef_exp10_u35d_sve256_f64 #define xexp10f_u35 nsimd_sleef_exp10_u35d_sve256_f32 #define xexpm1 nsimd_sleef_expm1_u10d_sve256_f64 #define xexpm1f nsimd_sleef_expm1_u10d_sve256_f32 #define xlog10 nsimd_sleef_log10_u10d_sve256_f64 #define xlog10f nsimd_sleef_log10_u10d_sve256_f32 #define xlog2 nsimd_sleef_log2_u10d_sve256_f64 #define xlog2f nsimd_sleef_log2_u10d_sve256_f32 #define xlog2_u35 nsimd_sleef_log2_u35d_sve256_f64 #define xlog2f_u35 nsimd_sleef_log2_u35d_sve256_f32 #define xlog1p nsimd_sleef_log1p_u10d_sve256_f64 #define xlog1pf nsimd_sleef_log1p_u10d_sve256_f32 #define xsincospi_u05 nsimd_sleef_sincospi_u05d_sve256_f64 #define xsincospif_u05 nsimd_sleef_sincospi_u05d_sve256_f32 #define xsincospi_u35 nsimd_sleef_sincospi_u35d_sve256_f64 #define xsincospif_u35 nsimd_sleef_sincospi_u35d_sve256_f32 #define xsinpi_u05 nsimd_sleef_sinpi_u05d_sve256_f64 #define xsinpif_u05 nsimd_sleef_sinpi_u05d_sve256_f32 #define xcospi_u05 nsimd_sleef_cospi_u05d_sve256_f64 #define xcospif_u05 nsimd_sleef_cospi_u05d_sve256_f32 #define xldexp nsimd_sleef_ldexp_sve256_f64 #define xldexpf 
nsimd_sleef_ldexp_sve256_f32 #define xilogb nsimd_sleef_ilogb_sve256_f64 #define xilogbf nsimd_sleef_ilogb_sve256_f32 #define xfma nsimd_sleef_fma_sve256_f64 #define xfmaf nsimd_sleef_fma_sve256_f32 #define xsqrt nsimd_sleef_sqrt_sve256_f64 #define xsqrtf nsimd_sleef_sqrt_sve256_f32 #define xsqrt_u05 nsimd_sleef_sqrt_u05d_sve256_f64 #define xsqrtf_u05 nsimd_sleef_sqrt_u05d_sve256_f32 #define xsqrt_u35 nsimd_sleef_sqrt_u35d_sve256_f64 #define xsqrtf_u35 nsimd_sleef_sqrt_u35d_sve256_f32 #define xhypot_u05 nsimd_sleef_hypot_u05d_sve256_f64 #define xhypotf_u05 nsimd_sleef_hypot_u05d_sve256_f32 #define xhypot_u35 nsimd_sleef_hypot_u35d_sve256_f64 #define xhypotf_u35 nsimd_sleef_hypot_u35d_sve256_f32 #define xfabs nsimd_sleef_fabs_sve256_f64 #define xfabsf nsimd_sleef_fabs_sve256_f32 #define xcopysign nsimd_sleef_copysign_sve256_f64 #define xcopysignf nsimd_sleef_copysign_sve256_f32 #define xfmax nsimd_sleef_fmax_sve256_f64 #define xfmaxf nsimd_sleef_fmax_sve256_f32 #define xfmin nsimd_sleef_fmin_sve256_f64 #define xfminf nsimd_sleef_fmin_sve256_f32 #define xfdim nsimd_sleef_fdim_sve256_f64 #define xfdimf nsimd_sleef_fdim_sve256_f32 #define xtrunc nsimd_sleef_trunc_sve256_f64 #define xtruncf nsimd_sleef_trunc_sve256_f32 #define xfloor nsimd_sleef_floor_sve256_f64 #define xfloorf nsimd_sleef_floor_sve256_f32 #define xceil nsimd_sleef_ceil_sve256_f64 #define xceilf nsimd_sleef_ceil_sve256_f32 #define xround nsimd_sleef_round_sve256_f64 #define xroundf nsimd_sleef_round_sve256_f32 #define xrint nsimd_sleef_rint_sve256_f64 #define xrintf nsimd_sleef_rint_sve256_f32 #define xnextafter nsimd_sleef_nextafter_sve256_f64 #define xnextafterf nsimd_sleef_nextafter_sve256_f32 #define xfrfrexp nsimd_sleef_frfrexp_sve256_f64 #define xfrfrexpf nsimd_sleef_frfrexp_sve256_f32 #define xexpfrexp nsimd_sleef_expfrexp_sve256_f64 #define xexpfrexpf nsimd_sleef_expfrexp_sve256_f32 #define xfmod nsimd_sleef_fmod_sve256_f64 #define xfmodf nsimd_sleef_fmod_sve256_f32 #define xremainder 
nsimd_sleef_remainder_sve256_f64 #define xremainderf nsimd_sleef_remainder_sve256_f32 #define xmodf nsimd_sleef_modf_sve256_f64 #define xmodff nsimd_sleef_modf_sve256_f32 #define xlgamma_u1 nsimd_sleef_lgamma_u10d_sve256_f64 #define xlgammaf_u1 nsimd_sleef_lgamma_u10d_sve256_f32 #define xtgamma_u1 nsimd_sleef_tgamma_u10d_sve256_f64 #define xtgammaf_u1 nsimd_sleef_tgamma_u10d_sve256_f32 #define xerf_u1 nsimd_sleef_erf_u10d_sve256_f64 #define xerff_u1 nsimd_sleef_erf_u10d_sve256_f32 #define xerfc_u15 nsimd_sleef_erfc_u15d_sve256_f64 #define xerfcf_u15 nsimd_sleef_erfc_u15d_sve256_f32 #define xgetInt nsimd_sleef_getInt_sve256_f64 #define xgetIntf nsimd_sleef_getInt_sve256_f32 #define xgetPtr nsimd_sleef_getPtr_sve256_f64 #define xgetPtrf nsimd_sleef_getPtr_sve256_f32 #else #define xsin nsimd_sleef_sin_u35_sve256_f64 #define xsinf nsimd_sleef_sin_u35_sve256_f32 #define xcos nsimd_sleef_cos_u35_sve256_f64 #define xcosf nsimd_sleef_cos_u35_sve256_f32 #define xsincos nsimd_sleef_sincos_u35_sve256_f64 #define xsincosf nsimd_sleef_sincos_u35_sve256_f32 #define xtan nsimd_sleef_tan_u35_sve256_f64 #define xtanf nsimd_sleef_tan_u35_sve256_f32 #define xasin nsimd_sleef_asin_u35_sve256_f64 #define xasinf nsimd_sleef_asin_u35_sve256_f32 #define xacos nsimd_sleef_acos_u35_sve256_f64 #define xacosf nsimd_sleef_acos_u35_sve256_f32 #define xatan nsimd_sleef_atan_u35_sve256_f64 #define xatanf nsimd_sleef_atan_u35_sve256_f32 #define xatan2 nsimd_sleef_atan2_u35_sve256_f64 #define xatan2f nsimd_sleef_atan2_u35_sve256_f32 #define xlog nsimd_sleef_log_u35_sve256_f64 #define xlogf nsimd_sleef_log_u35_sve256_f32 #define xcbrt nsimd_sleef_cbrt_u35_sve256_f64 #define xcbrtf nsimd_sleef_cbrt_u35_sve256_f32 #define xsin_u1 nsimd_sleef_sin_u10_sve256_f64 #define xsinf_u1 nsimd_sleef_sin_u10_sve256_f32 #define xcos_u1 nsimd_sleef_cos_u10_sve256_f64 #define xcosf_u1 nsimd_sleef_cos_u10_sve256_f32 #define xsincos_u1 nsimd_sleef_sincos_u10_sve256_f64 #define xsincosf_u1 
nsimd_sleef_sincos_u10_sve256_f32 #define xtan_u1 nsimd_sleef_tan_u10_sve256_f64 #define xtanf_u1 nsimd_sleef_tan_u10_sve256_f32 #define xasin_u1 nsimd_sleef_asin_u10_sve256_f64 #define xasinf_u1 nsimd_sleef_asin_u10_sve256_f32 #define xacos_u1 nsimd_sleef_acos_u10_sve256_f64 #define xacosf_u1 nsimd_sleef_acos_u10_sve256_f32 #define xatan_u1 nsimd_sleef_atan_u10_sve256_f64 #define xatanf_u1 nsimd_sleef_atan_u10_sve256_f32 #define xatan2_u1 nsimd_sleef_atan2_u10_sve256_f64 #define xatan2f_u1 nsimd_sleef_atan2_u10_sve256_f32 #define xlog_u1 nsimd_sleef_log_u10_sve256_f64 #define xlogf_u1 nsimd_sleef_log_u10_sve256_f32 #define xcbrt_u1 nsimd_sleef_cbrt_u10_sve256_f64 #define xcbrtf_u1 nsimd_sleef_cbrt_u10_sve256_f32 #define xexp nsimd_sleef_exp_u10_sve256_f64 #define xexpf nsimd_sleef_exp_u10_sve256_f32 #define xpow nsimd_sleef_pow_u10_sve256_f64 #define xpowf nsimd_sleef_pow_u10_sve256_f32 #define xsinh nsimd_sleef_sinh_u10_sve256_f64 #define xsinhf nsimd_sleef_sinh_u10_sve256_f32 #define xcosh nsimd_sleef_cosh_u10_sve256_f64 #define xcoshf nsimd_sleef_cosh_u10_sve256_f32 #define xtanh nsimd_sleef_tanh_u10_sve256_f64 #define xtanhf nsimd_sleef_tanh_u10_sve256_f32 #define xsinh_u35 nsimd_sleef_sinh_u35_sve256_f64 #define xsinhf_u35 nsimd_sleef_sinh_u35_sve256_f32 #define xcosh_u35 nsimd_sleef_cosh_u35_sve256_f64 #define xcoshf_u35 nsimd_sleef_cosh_u35_sve256_f32 #define xtanh_u35 nsimd_sleef_tanh_u35_sve256_f64 #define xtanhf_u35 nsimd_sleef_tanh_u35_sve256_f32 #define xfastsin_u3500 nsimd_sleef_fastsin_u3500_sve256_f64 #define xfastsinf_u3500 nsimd_sleef_fastsin_u3500_sve256_f32 #define xfastcos_u3500 nsimd_sleef_fastcos_u3500_sve256_f64 #define xfastcosf_u3500 nsimd_sleef_fastcos_u3500_sve256_f32 #define xfastpow_u3500 nsimd_sleef_fastpow_u3500_sve256_f64 #define xfastpowf_u3500 nsimd_sleef_fastpow_u3500_sve256_f32 #define xasinh nsimd_sleef_asinh_u10_sve256_f64 #define xasinhf nsimd_sleef_asinh_u10_sve256_f32 #define xacosh nsimd_sleef_acosh_u10_sve256_f64 #define 
xacoshf nsimd_sleef_acosh_u10_sve256_f32 #define xatanh nsimd_sleef_atanh_u10_sve256_f64 #define xatanhf nsimd_sleef_atanh_u10_sve256_f32 #define xexp2 nsimd_sleef_exp2_u10_sve256_f64 #define xexp2f nsimd_sleef_exp2_u10_sve256_f32 #define xexp2_u35 nsimd_sleef_exp2_u35_sve256_f64 #define xexp2f_u35 nsimd_sleef_exp2_u35_sve256_f32 #define xexp10 nsimd_sleef_exp10_u10_sve256_f64 #define xexp10f nsimd_sleef_exp10_u10_sve256_f32 #define xexp10_u35 nsimd_sleef_exp10_u35_sve256_f64 #define xexp10f_u35 nsimd_sleef_exp10_u35_sve256_f32 #define xexpm1 nsimd_sleef_expm1_u10_sve256_f64 #define xexpm1f nsimd_sleef_expm1_u10_sve256_f32 #define xlog10 nsimd_sleef_log10_u10_sve256_f64 #define xlog10f nsimd_sleef_log10_u10_sve256_f32 #define xlog2 nsimd_sleef_log2_u10_sve256_f64 #define xlog2f nsimd_sleef_log2_u10_sve256_f32 #define xlog2_u35 nsimd_sleef_log2_u35_sve256_f64 #define xlog2f_u35 nsimd_sleef_log2_u35_sve256_f32 #define xlog1p nsimd_sleef_log1p_u10_sve256_f64 #define xlog1pf nsimd_sleef_log1p_u10_sve256_f32 #define xsincospi_u05 nsimd_sleef_sincospi_u05_sve256_f64 #define xsincospif_u05 nsimd_sleef_sincospi_u05_sve256_f32 #define xsincospi_u35 nsimd_sleef_sincospi_u35_sve256_f64 #define xsincospif_u35 nsimd_sleef_sincospi_u35_sve256_f32 #define xsinpi_u05 nsimd_sleef_sinpi_u05_sve256_f64 #define xsinpif_u05 nsimd_sleef_sinpi_u05_sve256_f32 #define xcospi_u05 nsimd_sleef_cospi_u05_sve256_f64 #define xcospif_u05 nsimd_sleef_cospi_u05_sve256_f32 #define xldexp nsimd_sleef_ldexp_sve256_f64 #define xldexpf nsimd_sleef_ldexp_sve256_f32 #define xilogb nsimd_sleef_ilogb_sve256_f64 #define xilogbf nsimd_sleef_ilogb_sve256_f32 #define xfma nsimd_sleef_fma_sve256_f64 #define xfmaf nsimd_sleef_fma_sve256_f32 #define xsqrt nsimd_sleef_sqrt_sve256_f64 #define xsqrtf nsimd_sleef_sqrt_sve256_f32 #define xsqrt_u05 nsimd_sleef_sqrt_u05_sve256_f64 #define xsqrtf_u05 nsimd_sleef_sqrt_u05_sve256_f32 #define xsqrt_u35 nsimd_sleef_sqrt_u35_sve256_f64 #define xsqrtf_u35 
nsimd_sleef_sqrt_u35_sve256_f32 #define xhypot_u05 nsimd_sleef_hypot_u05_sve256_f64 #define xhypotf_u05 nsimd_sleef_hypot_u05_sve256_f32 #define xhypot_u35 nsimd_sleef_hypot_u35_sve256_f64 #define xhypotf_u35 nsimd_sleef_hypot_u35_sve256_f32 #define xfabs nsimd_sleef_fabs_sve256_f64 #define xfabsf nsimd_sleef_fabs_sve256_f32 #define xcopysign nsimd_sleef_copysign_sve256_f64 #define xcopysignf nsimd_sleef_copysign_sve256_f32 #define xfmax nsimd_sleef_fmax_sve256_f64 #define xfmaxf nsimd_sleef_fmax_sve256_f32 #define xfmin nsimd_sleef_fmin_sve256_f64 #define xfminf nsimd_sleef_fmin_sve256_f32 #define xfdim nsimd_sleef_fdim_sve256_f64 #define xfdimf nsimd_sleef_fdim_sve256_f32 #define xtrunc nsimd_sleef_trunc_sve256_f64 #define xtruncf nsimd_sleef_trunc_sve256_f32 #define xfloor nsimd_sleef_floor_sve256_f64 #define xfloorf nsimd_sleef_floor_sve256_f32 #define xceil nsimd_sleef_ceil_sve256_f64 #define xceilf nsimd_sleef_ceil_sve256_f32 #define xround nsimd_sleef_round_sve256_f64 #define xroundf nsimd_sleef_round_sve256_f32 #define xrint nsimd_sleef_rint_sve256_f64 #define xrintf nsimd_sleef_rint_sve256_f32 #define xnextafter nsimd_sleef_nextafter_sve256_f64 #define xnextafterf nsimd_sleef_nextafter_sve256_f32 #define xfrfrexp nsimd_sleef_frfrexp_sve256_f64 #define xfrfrexpf nsimd_sleef_frfrexp_sve256_f32 #define xexpfrexp nsimd_sleef_expfrexp_sve256_f64 #define xexpfrexpf nsimd_sleef_expfrexp_sve256_f32 #define xfmod nsimd_sleef_fmod_sve256_f64 #define xfmodf nsimd_sleef_fmod_sve256_f32 #define xremainder nsimd_sleef_remainder_sve256_f64 #define xremainderf nsimd_sleef_remainder_sve256_f32 #define xmodf nsimd_sleef_modf_sve256_f64 #define xmodff nsimd_sleef_modf_sve256_f32 #define xlgamma_u1 nsimd_sleef_lgamma_u10_sve256_f64 #define xlgammaf_u1 nsimd_sleef_lgamma_u10_sve256_f32 #define xtgamma_u1 nsimd_sleef_tgamma_u10_sve256_f64 #define xtgammaf_u1 nsimd_sleef_tgamma_u10_sve256_f32 #define xerf_u1 nsimd_sleef_erf_u10_sve256_f64 #define xerff_u1 
nsimd_sleef_erf_u10_sve256_f32 #define xerfc_u15 nsimd_sleef_erfc_u15_sve256_f64 #define xerfcf_u15 nsimd_sleef_erfc_u15_sve256_f32 #define xgetInt nsimd_sleef_getInt_sve256_f64 #define xgetIntf nsimd_sleef_getInt_sve256_f32 #define xgetPtr nsimd_sleef_getPtr_sve256_f64 #define xgetPtrf nsimd_sleef_getPtr_sve256_f32 #endif #define rempi nsimd_sleef_rempi_sve256 #define rempif nsimd_sleef_rempif_sve256 #define rempisub nsimd_sleef_rempisub_sve256 #define rempisubf nsimd_sleef_rempisubf_sve256 #define gammak nsimd_gammak_sve256 #define gammafk nsimd_gammafk_sve256 #endif /* ------------------------------------------------------------------------- */ /* Naming of functions sve512 */ #ifdef NSIMD_SVE512 #ifdef DETERMINISTIC #define xsin nsimd_sleef_sin_u35d_sve512_f64 #define xsinf nsimd_sleef_sin_u35d_sve512_f32 #define xcos nsimd_sleef_cos_u35d_sve512_f64 #define xcosf nsimd_sleef_cos_u35d_sve512_f32 #define xsincos nsimd_sleef_sincos_u35d_sve512_f64 #define xsincosf nsimd_sleef_sincos_u35d_sve512_f32 #define xtan nsimd_sleef_tan_u35d_sve512_f64 #define xtanf nsimd_sleef_tan_u35d_sve512_f32 #define xasin nsimd_sleef_asin_u35d_sve512_f64 #define xasinf nsimd_sleef_asin_u35d_sve512_f32 #define xacos nsimd_sleef_acos_u35d_sve512_f64 #define xacosf nsimd_sleef_acos_u35d_sve512_f32 #define xatan nsimd_sleef_atan_u35d_sve512_f64 #define xatanf nsimd_sleef_atan_u35d_sve512_f32 #define xatan2 nsimd_sleef_atan2_u35d_sve512_f64 #define xatan2f nsimd_sleef_atan2_u35d_sve512_f32 #define xlog nsimd_sleef_log_u35d_sve512_f64 #define xlogf nsimd_sleef_log_u35d_sve512_f32 #define xcbrt nsimd_sleef_cbrt_u35d_sve512_f64 #define xcbrtf nsimd_sleef_cbrt_u35d_sve512_f32 #define xsin_u1 nsimd_sleef_sin_u10d_sve512_f64 #define xsinf_u1 nsimd_sleef_sin_u10d_sve512_f32 #define xcos_u1 nsimd_sleef_cos_u10d_sve512_f64 #define xcosf_u1 nsimd_sleef_cos_u10d_sve512_f32 #define xsincos_u1 nsimd_sleef_sincos_u10d_sve512_f64 #define xsincosf_u1 nsimd_sleef_sincos_u10d_sve512_f32 #define xtan_u1 
nsimd_sleef_tan_u10d_sve512_f64 #define xtanf_u1 nsimd_sleef_tan_u10d_sve512_f32 #define xasin_u1 nsimd_sleef_asin_u10d_sve512_f64 #define xasinf_u1 nsimd_sleef_asin_u10d_sve512_f32 #define xacos_u1 nsimd_sleef_acos_u10d_sve512_f64 #define xacosf_u1 nsimd_sleef_acos_u10d_sve512_f32 #define xatan_u1 nsimd_sleef_atan_u10d_sve512_f64 #define xatanf_u1 nsimd_sleef_atan_u10d_sve512_f32 #define xatan2_u1 nsimd_sleef_atan2_u10d_sve512_f64 #define xatan2f_u1 nsimd_sleef_atan2_u10d_sve512_f32 #define xlog_u1 nsimd_sleef_log_u10d_sve512_f64 #define xlogf_u1 nsimd_sleef_log_u10d_sve512_f32 #define xcbrt_u1 nsimd_sleef_cbrt_u10d_sve512_f64 #define xcbrtf_u1 nsimd_sleef_cbrt_u10d_sve512_f32 #define xexp nsimd_sleef_exp_u10d_sve512_f64 #define xexpf nsimd_sleef_exp_u10d_sve512_f32 #define xpow nsimd_sleef_pow_u10d_sve512_f64 #define xpowf nsimd_sleef_pow_u10d_sve512_f32 #define xsinh nsimd_sleef_sinh_u10d_sve512_f64 #define xsinhf nsimd_sleef_sinh_u10d_sve512_f32 #define xcosh nsimd_sleef_cosh_u10d_sve512_f64 #define xcoshf nsimd_sleef_cosh_u10d_sve512_f32 #define xtanh nsimd_sleef_tanh_u10d_sve512_f64 #define xtanhf nsimd_sleef_tanh_u10d_sve512_f32 #define xsinh_u35 nsimd_sleef_sinh_u35d_sve512_f64 #define xsinhf_u35 nsimd_sleef_sinh_u35d_sve512_f32 #define xcosh_u35 nsimd_sleef_cosh_u35d_sve512_f64 #define xcoshf_u35 nsimd_sleef_cosh_u35d_sve512_f32 #define xtanh_u35 nsimd_sleef_tanh_u35d_sve512_f64 #define xtanhf_u35 nsimd_sleef_tanh_u35d_sve512_f32 #define xfastsin_u3500 nsimd_sleef_fastsin_u3500d_sve512_f64 #define xfastsinf_u3500 nsimd_sleef_fastsin_u3500d_sve512_f32 #define xfastcos_u3500 nsimd_sleef_fastcos_u3500d_sve512_f64 #define xfastcosf_u3500 nsimd_sleef_fastcos_u3500d_sve512_f32 #define xfastpow_u3500 nsimd_sleef_fastpow_u3500d_sve512_f64 #define xfastpowf_u3500 nsimd_sleef_fastpow_u3500d_sve512_f32 #define xasinh nsimd_sleef_asinh_u10d_sve512_f64 #define xasinhf nsimd_sleef_asinh_u10d_sve512_f32 #define xacosh nsimd_sleef_acosh_u10d_sve512_f64 #define xacoshf 
nsimd_sleef_acosh_u10d_sve512_f32 #define xatanh nsimd_sleef_atanh_u10d_sve512_f64 #define xatanhf nsimd_sleef_atanh_u10d_sve512_f32 #define xexp2 nsimd_sleef_exp2_u10d_sve512_f64 #define xexp2f nsimd_sleef_exp2_u10d_sve512_f32 #define xexp2_u35 nsimd_sleef_exp2_u35d_sve512_f64 #define xexp2f_u35 nsimd_sleef_exp2_u35d_sve512_f32 #define xexp10 nsimd_sleef_exp10_u10d_sve512_f64 #define xexp10f nsimd_sleef_exp10_u10d_sve512_f32 #define xexp10_u35 nsimd_sleef_exp10_u35d_sve512_f64 #define xexp10f_u35 nsimd_sleef_exp10_u35d_sve512_f32 #define xexpm1 nsimd_sleef_expm1_u10d_sve512_f64 #define xexpm1f nsimd_sleef_expm1_u10d_sve512_f32 #define xlog10 nsimd_sleef_log10_u10d_sve512_f64 #define xlog10f nsimd_sleef_log10_u10d_sve512_f32 #define xlog2 nsimd_sleef_log2_u10d_sve512_f64 #define xlog2f nsimd_sleef_log2_u10d_sve512_f32 #define xlog2_u35 nsimd_sleef_log2_u35d_sve512_f64 #define xlog2f_u35 nsimd_sleef_log2_u35d_sve512_f32 #define xlog1p nsimd_sleef_log1p_u10d_sve512_f64 #define xlog1pf nsimd_sleef_log1p_u10d_sve512_f32 #define xsincospi_u05 nsimd_sleef_sincospi_u05d_sve512_f64 #define xsincospif_u05 nsimd_sleef_sincospi_u05d_sve512_f32 #define xsincospi_u35 nsimd_sleef_sincospi_u35d_sve512_f64 #define xsincospif_u35 nsimd_sleef_sincospi_u35d_sve512_f32 #define xsinpi_u05 nsimd_sleef_sinpi_u05d_sve512_f64 #define xsinpif_u05 nsimd_sleef_sinpi_u05d_sve512_f32 #define xcospi_u05 nsimd_sleef_cospi_u05d_sve512_f64 #define xcospif_u05 nsimd_sleef_cospi_u05d_sve512_f32 #define xldexp nsimd_sleef_ldexp_sve512_f64 #define xldexpf nsimd_sleef_ldexp_sve512_f32 #define xilogb nsimd_sleef_ilogb_sve512_f64 #define xilogbf nsimd_sleef_ilogb_sve512_f32 #define xfma nsimd_sleef_fma_sve512_f64 #define xfmaf nsimd_sleef_fma_sve512_f32 #define xsqrt nsimd_sleef_sqrt_sve512_f64 #define xsqrtf nsimd_sleef_sqrt_sve512_f32 #define xsqrt_u05 nsimd_sleef_sqrt_u05d_sve512_f64 #define xsqrtf_u05 nsimd_sleef_sqrt_u05d_sve512_f32 #define xsqrt_u35 nsimd_sleef_sqrt_u35d_sve512_f64 #define 
xsqrtf_u35 nsimd_sleef_sqrt_u35d_sve512_f32 #define xhypot_u05 nsimd_sleef_hypot_u05d_sve512_f64 #define xhypotf_u05 nsimd_sleef_hypot_u05d_sve512_f32 #define xhypot_u35 nsimd_sleef_hypot_u35d_sve512_f64 #define xhypotf_u35 nsimd_sleef_hypot_u35d_sve512_f32 #define xfabs nsimd_sleef_fabs_sve512_f64 #define xfabsf nsimd_sleef_fabs_sve512_f32 #define xcopysign nsimd_sleef_copysign_sve512_f64 #define xcopysignf nsimd_sleef_copysign_sve512_f32 #define xfmax nsimd_sleef_fmax_sve512_f64 #define xfmaxf nsimd_sleef_fmax_sve512_f32 #define xfmin nsimd_sleef_fmin_sve512_f64 #define xfminf nsimd_sleef_fmin_sve512_f32 #define xfdim nsimd_sleef_fdim_sve512_f64 #define xfdimf nsimd_sleef_fdim_sve512_f32 #define xtrunc nsimd_sleef_trunc_sve512_f64 #define xtruncf nsimd_sleef_trunc_sve512_f32 #define xfloor nsimd_sleef_floor_sve512_f64 #define xfloorf nsimd_sleef_floor_sve512_f32 #define xceil nsimd_sleef_ceil_sve512_f64 #define xceilf nsimd_sleef_ceil_sve512_f32 #define xround nsimd_sleef_round_sve512_f64 #define xroundf nsimd_sleef_round_sve512_f32 #define xrint nsimd_sleef_rint_sve512_f64 #define xrintf nsimd_sleef_rint_sve512_f32 #define xnextafter nsimd_sleef_nextafter_sve512_f64 #define xnextafterf nsimd_sleef_nextafter_sve512_f32 #define xfrfrexp nsimd_sleef_frfrexp_sve512_f64 #define xfrfrexpf nsimd_sleef_frfrexp_sve512_f32 #define xexpfrexp nsimd_sleef_expfrexp_sve512_f64 #define xexpfrexpf nsimd_sleef_expfrexp_sve512_f32 #define xfmod nsimd_sleef_fmod_sve512_f64 #define xfmodf nsimd_sleef_fmod_sve512_f32 #define xremainder nsimd_sleef_remainder_sve512_f64 #define xremainderf nsimd_sleef_remainder_sve512_f32 #define xmodf nsimd_sleef_modf_sve512_f64 #define xmodff nsimd_sleef_modf_sve512_f32 #define xlgamma_u1 nsimd_sleef_lgamma_u10d_sve512_f64 #define xlgammaf_u1 nsimd_sleef_lgamma_u10d_sve512_f32 #define xtgamma_u1 nsimd_sleef_tgamma_u10d_sve512_f64 #define xtgammaf_u1 nsimd_sleef_tgamma_u10d_sve512_f32 #define xerf_u1 nsimd_sleef_erf_u10d_sve512_f64 #define xerff_u1 
nsimd_sleef_erf_u10d_sve512_f32 #define xerfc_u15 nsimd_sleef_erfc_u15d_sve512_f64 #define xerfcf_u15 nsimd_sleef_erfc_u15d_sve512_f32 #define xgetInt nsimd_sleef_getInt_sve512_f64 #define xgetIntf nsimd_sleef_getInt_sve512_f32 #define xgetPtr nsimd_sleef_getPtr_sve512_f64 #define xgetPtrf nsimd_sleef_getPtr_sve512_f32 #else #define xsin nsimd_sleef_sin_u35_sve512_f64 #define xsinf nsimd_sleef_sin_u35_sve512_f32 #define xcos nsimd_sleef_cos_u35_sve512_f64 #define xcosf nsimd_sleef_cos_u35_sve512_f32 #define xsincos nsimd_sleef_sincos_u35_sve512_f64 #define xsincosf nsimd_sleef_sincos_u35_sve512_f32 #define xtan nsimd_sleef_tan_u35_sve512_f64 #define xtanf nsimd_sleef_tan_u35_sve512_f32 #define xasin nsimd_sleef_asin_u35_sve512_f64 #define xasinf nsimd_sleef_asin_u35_sve512_f32 #define xacos nsimd_sleef_acos_u35_sve512_f64 #define xacosf nsimd_sleef_acos_u35_sve512_f32 #define xatan nsimd_sleef_atan_u35_sve512_f64 #define xatanf nsimd_sleef_atan_u35_sve512_f32 #define xatan2 nsimd_sleef_atan2_u35_sve512_f64 #define xatan2f nsimd_sleef_atan2_u35_sve512_f32 #define xlog nsimd_sleef_log_u35_sve512_f64 #define xlogf nsimd_sleef_log_u35_sve512_f32 #define xcbrt nsimd_sleef_cbrt_u35_sve512_f64 #define xcbrtf nsimd_sleef_cbrt_u35_sve512_f32 #define xsin_u1 nsimd_sleef_sin_u10_sve512_f64 #define xsinf_u1 nsimd_sleef_sin_u10_sve512_f32 #define xcos_u1 nsimd_sleef_cos_u10_sve512_f64 #define xcosf_u1 nsimd_sleef_cos_u10_sve512_f32 #define xsincos_u1 nsimd_sleef_sincos_u10_sve512_f64 #define xsincosf_u1 nsimd_sleef_sincos_u10_sve512_f32 #define xtan_u1 nsimd_sleef_tan_u10_sve512_f64 #define xtanf_u1 nsimd_sleef_tan_u10_sve512_f32 #define xasin_u1 nsimd_sleef_asin_u10_sve512_f64 #define xasinf_u1 nsimd_sleef_asin_u10_sve512_f32 #define xacos_u1 nsimd_sleef_acos_u10_sve512_f64 #define xacosf_u1 nsimd_sleef_acos_u10_sve512_f32 #define xatan_u1 nsimd_sleef_atan_u10_sve512_f64 #define xatanf_u1 nsimd_sleef_atan_u10_sve512_f32 #define xatan2_u1 nsimd_sleef_atan2_u10_sve512_f64 
/* Auto-generated rename table (do not edit by hand): maps SLEEF public
 * names ("x" prefix, "f" suffix = float32 variant, "_uNN" = accuracy in
 * ULPs) to the nsimd-prefixed symbols of the 512-bit SVE build.  This run
 * sits in the non-DETERMINISTIC (#else) branch of the sve512 section. */
#define xatan2f_u1 nsimd_sleef_atan2_u10_sve512_f32
#define xlog_u1 nsimd_sleef_log_u10_sve512_f64
#define xlogf_u1 nsimd_sleef_log_u10_sve512_f32
#define xcbrt_u1 nsimd_sleef_cbrt_u10_sve512_f64
#define xcbrtf_u1 nsimd_sleef_cbrt_u10_sve512_f32
#define xexp nsimd_sleef_exp_u10_sve512_f64
#define xexpf nsimd_sleef_exp_u10_sve512_f32
#define xpow nsimd_sleef_pow_u10_sve512_f64
#define xpowf nsimd_sleef_pow_u10_sve512_f32
#define xsinh nsimd_sleef_sinh_u10_sve512_f64
#define xsinhf nsimd_sleef_sinh_u10_sve512_f32
#define xcosh nsimd_sleef_cosh_u10_sve512_f64
#define xcoshf nsimd_sleef_cosh_u10_sve512_f32
#define xtanh nsimd_sleef_tanh_u10_sve512_f64
#define xtanhf nsimd_sleef_tanh_u10_sve512_f32
#define xsinh_u35 nsimd_sleef_sinh_u35_sve512_f64
#define xsinhf_u35 nsimd_sleef_sinh_u35_sve512_f32
#define xcosh_u35 nsimd_sleef_cosh_u35_sve512_f64
#define xcoshf_u35 nsimd_sleef_cosh_u35_sve512_f32
#define xtanh_u35 nsimd_sleef_tanh_u35_sve512_f64
#define xtanhf_u35 nsimd_sleef_tanh_u35_sve512_f32
/* "fast" variants trade accuracy (3500 = 35.0 ULPs) for speed. */
#define xfastsin_u3500 nsimd_sleef_fastsin_u3500_sve512_f64
#define xfastsinf_u3500 nsimd_sleef_fastsin_u3500_sve512_f32
#define xfastcos_u3500 nsimd_sleef_fastcos_u3500_sve512_f64
#define xfastcosf_u3500 nsimd_sleef_fastcos_u3500_sve512_f32
#define xfastpow_u3500 nsimd_sleef_fastpow_u3500_sve512_f64
#define xfastpowf_u3500 nsimd_sleef_fastpow_u3500_sve512_f32
#define xasinh nsimd_sleef_asinh_u10_sve512_f64
#define xasinhf nsimd_sleef_asinh_u10_sve512_f32
#define xacosh nsimd_sleef_acosh_u10_sve512_f64
#define xacoshf nsimd_sleef_acosh_u10_sve512_f32
#define xatanh nsimd_sleef_atanh_u10_sve512_f64
#define xatanhf nsimd_sleef_atanh_u10_sve512_f32
#define xexp2 nsimd_sleef_exp2_u10_sve512_f64
#define xexp2f nsimd_sleef_exp2_u10_sve512_f32
#define xexp2_u35 nsimd_sleef_exp2_u35_sve512_f64
#define xexp2f_u35 nsimd_sleef_exp2_u35_sve512_f32
#define xexp10 nsimd_sleef_exp10_u10_sve512_f64
#define xexp10f nsimd_sleef_exp10_u10_sve512_f32
#define xexp10_u35 nsimd_sleef_exp10_u35_sve512_f64
#define xexp10f_u35 nsimd_sleef_exp10_u35_sve512_f32
#define xexpm1 nsimd_sleef_expm1_u10_sve512_f64
#define xexpm1f nsimd_sleef_expm1_u10_sve512_f32
#define xlog10 nsimd_sleef_log10_u10_sve512_f64
#define xlog10f nsimd_sleef_log10_u10_sve512_f32
#define xlog2 nsimd_sleef_log2_u10_sve512_f64
#define xlog2f nsimd_sleef_log2_u10_sve512_f32
#define xlog2_u35 nsimd_sleef_log2_u35_sve512_f64
#define xlog2f_u35 nsimd_sleef_log2_u35_sve512_f32
#define xlog1p nsimd_sleef_log1p_u10_sve512_f64
#define xlog1pf nsimd_sleef_log1p_u10_sve512_f32
#define xsincospi_u05 nsimd_sleef_sincospi_u05_sve512_f64
#define xsincospif_u05 nsimd_sleef_sincospi_u05_sve512_f32
#define xsincospi_u35 nsimd_sleef_sincospi_u35_sve512_f64
#define xsincospif_u35 nsimd_sleef_sincospi_u35_sve512_f32
#define xsinpi_u05 nsimd_sleef_sinpi_u05_sve512_f64
#define xsinpif_u05 nsimd_sleef_sinpi_u05_sve512_f32
#define xcospi_u05 nsimd_sleef_cospi_u05_sve512_f64
#define xcospif_u05 nsimd_sleef_cospi_u05_sve512_f32
/* Exactly-rounded / bit-manipulation functions carry no _uNN accuracy
 * suffix in the SLEEF naming scheme. */
#define xldexp nsimd_sleef_ldexp_sve512_f64
#define xldexpf nsimd_sleef_ldexp_sve512_f32
#define xilogb nsimd_sleef_ilogb_sve512_f64
#define xilogbf nsimd_sleef_ilogb_sve512_f32
#define xfma nsimd_sleef_fma_sve512_f64
#define xfmaf nsimd_sleef_fma_sve512_f32
#define xsqrt nsimd_sleef_sqrt_sve512_f64
#define xsqrtf nsimd_sleef_sqrt_sve512_f32
#define xsqrt_u05 nsimd_sleef_sqrt_u05_sve512_f64
#define xsqrtf_u05 nsimd_sleef_sqrt_u05_sve512_f32
#define xsqrt_u35 nsimd_sleef_sqrt_u35_sve512_f64
#define xsqrtf_u35 nsimd_sleef_sqrt_u35_sve512_f32
#define xhypot_u05 nsimd_sleef_hypot_u05_sve512_f64
#define xhypotf_u05 nsimd_sleef_hypot_u05_sve512_f32
#define xhypot_u35 nsimd_sleef_hypot_u35_sve512_f64
#define xhypotf_u35 nsimd_sleef_hypot_u35_sve512_f32
#define xfabs nsimd_sleef_fabs_sve512_f64
#define xfabsf nsimd_sleef_fabs_sve512_f32
#define xcopysign nsimd_sleef_copysign_sve512_f64
#define xcopysignf nsimd_sleef_copysign_sve512_f32
#define xfmax nsimd_sleef_fmax_sve512_f64
#define xfmaxf nsimd_sleef_fmax_sve512_f32 #define xfmin nsimd_sleef_fmin_sve512_f64 #define xfminf nsimd_sleef_fmin_sve512_f32 #define xfdim nsimd_sleef_fdim_sve512_f64 #define xfdimf nsimd_sleef_fdim_sve512_f32 #define xtrunc nsimd_sleef_trunc_sve512_f64 #define xtruncf nsimd_sleef_trunc_sve512_f32 #define xfloor nsimd_sleef_floor_sve512_f64 #define xfloorf nsimd_sleef_floor_sve512_f32 #define xceil nsimd_sleef_ceil_sve512_f64 #define xceilf nsimd_sleef_ceil_sve512_f32 #define xround nsimd_sleef_round_sve512_f64 #define xroundf nsimd_sleef_round_sve512_f32 #define xrint nsimd_sleef_rint_sve512_f64 #define xrintf nsimd_sleef_rint_sve512_f32 #define xnextafter nsimd_sleef_nextafter_sve512_f64 #define xnextafterf nsimd_sleef_nextafter_sve512_f32 #define xfrfrexp nsimd_sleef_frfrexp_sve512_f64 #define xfrfrexpf nsimd_sleef_frfrexp_sve512_f32 #define xexpfrexp nsimd_sleef_expfrexp_sve512_f64 #define xexpfrexpf nsimd_sleef_expfrexp_sve512_f32 #define xfmod nsimd_sleef_fmod_sve512_f64 #define xfmodf nsimd_sleef_fmod_sve512_f32 #define xremainder nsimd_sleef_remainder_sve512_f64 #define xremainderf nsimd_sleef_remainder_sve512_f32 #define xmodf nsimd_sleef_modf_sve512_f64 #define xmodff nsimd_sleef_modf_sve512_f32 #define xlgamma_u1 nsimd_sleef_lgamma_u10_sve512_f64 #define xlgammaf_u1 nsimd_sleef_lgamma_u10_sve512_f32 #define xtgamma_u1 nsimd_sleef_tgamma_u10_sve512_f64 #define xtgammaf_u1 nsimd_sleef_tgamma_u10_sve512_f32 #define xerf_u1 nsimd_sleef_erf_u10_sve512_f64 #define xerff_u1 nsimd_sleef_erf_u10_sve512_f32 #define xerfc_u15 nsimd_sleef_erfc_u15_sve512_f64 #define xerfcf_u15 nsimd_sleef_erfc_u15_sve512_f32 #define xgetInt nsimd_sleef_getInt_sve512_f64 #define xgetIntf nsimd_sleef_getInt_sve512_f32 #define xgetPtr nsimd_sleef_getPtr_sve512_f64 #define xgetPtrf nsimd_sleef_getPtr_sve512_f32 #endif #define rempi nsimd_sleef_rempi_sve512 #define rempif nsimd_sleef_rempif_sve512 #define rempisub nsimd_sleef_rempisub_sve512 #define rempisubf 
nsimd_sleef_rempisubf_sve512 #define gammak nsimd_gammak_sve512 #define gammafk nsimd_gammafk_sve512 #endif /* ------------------------------------------------------------------------- */ /* Naming of functions sve1024 */ #ifdef NSIMD_SVE1024 #ifdef DETERMINISTIC #define xsin nsimd_sleef_sin_u35d_sve1024_f64 #define xsinf nsimd_sleef_sin_u35d_sve1024_f32 #define xcos nsimd_sleef_cos_u35d_sve1024_f64 #define xcosf nsimd_sleef_cos_u35d_sve1024_f32 #define xsincos nsimd_sleef_sincos_u35d_sve1024_f64 #define xsincosf nsimd_sleef_sincos_u35d_sve1024_f32 #define xtan nsimd_sleef_tan_u35d_sve1024_f64 #define xtanf nsimd_sleef_tan_u35d_sve1024_f32 #define xasin nsimd_sleef_asin_u35d_sve1024_f64 #define xasinf nsimd_sleef_asin_u35d_sve1024_f32 #define xacos nsimd_sleef_acos_u35d_sve1024_f64 #define xacosf nsimd_sleef_acos_u35d_sve1024_f32 #define xatan nsimd_sleef_atan_u35d_sve1024_f64 #define xatanf nsimd_sleef_atan_u35d_sve1024_f32 #define xatan2 nsimd_sleef_atan2_u35d_sve1024_f64 #define xatan2f nsimd_sleef_atan2_u35d_sve1024_f32 #define xlog nsimd_sleef_log_u35d_sve1024_f64 #define xlogf nsimd_sleef_log_u35d_sve1024_f32 #define xcbrt nsimd_sleef_cbrt_u35d_sve1024_f64 #define xcbrtf nsimd_sleef_cbrt_u35d_sve1024_f32 #define xsin_u1 nsimd_sleef_sin_u10d_sve1024_f64 #define xsinf_u1 nsimd_sleef_sin_u10d_sve1024_f32 #define xcos_u1 nsimd_sleef_cos_u10d_sve1024_f64 #define xcosf_u1 nsimd_sleef_cos_u10d_sve1024_f32 #define xsincos_u1 nsimd_sleef_sincos_u10d_sve1024_f64 #define xsincosf_u1 nsimd_sleef_sincos_u10d_sve1024_f32 #define xtan_u1 nsimd_sleef_tan_u10d_sve1024_f64 #define xtanf_u1 nsimd_sleef_tan_u10d_sve1024_f32 #define xasin_u1 nsimd_sleef_asin_u10d_sve1024_f64 #define xasinf_u1 nsimd_sleef_asin_u10d_sve1024_f32 #define xacos_u1 nsimd_sleef_acos_u10d_sve1024_f64 #define xacosf_u1 nsimd_sleef_acos_u10d_sve1024_f32 #define xatan_u1 nsimd_sleef_atan_u10d_sve1024_f64 #define xatanf_u1 nsimd_sleef_atan_u10d_sve1024_f32 #define xatan2_u1 nsimd_sleef_atan2_u10d_sve1024_f64 
#define xatan2f_u1 nsimd_sleef_atan2_u10d_sve1024_f32 #define xlog_u1 nsimd_sleef_log_u10d_sve1024_f64 #define xlogf_u1 nsimd_sleef_log_u10d_sve1024_f32 #define xcbrt_u1 nsimd_sleef_cbrt_u10d_sve1024_f64 #define xcbrtf_u1 nsimd_sleef_cbrt_u10d_sve1024_f32 #define xexp nsimd_sleef_exp_u10d_sve1024_f64 #define xexpf nsimd_sleef_exp_u10d_sve1024_f32 #define xpow nsimd_sleef_pow_u10d_sve1024_f64 #define xpowf nsimd_sleef_pow_u10d_sve1024_f32 #define xsinh nsimd_sleef_sinh_u10d_sve1024_f64 #define xsinhf nsimd_sleef_sinh_u10d_sve1024_f32 #define xcosh nsimd_sleef_cosh_u10d_sve1024_f64 #define xcoshf nsimd_sleef_cosh_u10d_sve1024_f32 #define xtanh nsimd_sleef_tanh_u10d_sve1024_f64 #define xtanhf nsimd_sleef_tanh_u10d_sve1024_f32 #define xsinh_u35 nsimd_sleef_sinh_u35d_sve1024_f64 #define xsinhf_u35 nsimd_sleef_sinh_u35d_sve1024_f32 #define xcosh_u35 nsimd_sleef_cosh_u35d_sve1024_f64 #define xcoshf_u35 nsimd_sleef_cosh_u35d_sve1024_f32 #define xtanh_u35 nsimd_sleef_tanh_u35d_sve1024_f64 #define xtanhf_u35 nsimd_sleef_tanh_u35d_sve1024_f32 #define xfastsin_u3500 nsimd_sleef_fastsin_u3500d_sve1024_f64 #define xfastsinf_u3500 nsimd_sleef_fastsin_u3500d_sve1024_f32 #define xfastcos_u3500 nsimd_sleef_fastcos_u3500d_sve1024_f64 #define xfastcosf_u3500 nsimd_sleef_fastcos_u3500d_sve1024_f32 #define xfastpow_u3500 nsimd_sleef_fastpow_u3500d_sve1024_f64 #define xfastpowf_u3500 nsimd_sleef_fastpow_u3500d_sve1024_f32 #define xasinh nsimd_sleef_asinh_u10d_sve1024_f64 #define xasinhf nsimd_sleef_asinh_u10d_sve1024_f32 #define xacosh nsimd_sleef_acosh_u10d_sve1024_f64 #define xacoshf nsimd_sleef_acosh_u10d_sve1024_f32 #define xatanh nsimd_sleef_atanh_u10d_sve1024_f64 #define xatanhf nsimd_sleef_atanh_u10d_sve1024_f32 #define xexp2 nsimd_sleef_exp2_u10d_sve1024_f64 #define xexp2f nsimd_sleef_exp2_u10d_sve1024_f32 #define xexp2_u35 nsimd_sleef_exp2_u35d_sve1024_f64 #define xexp2f_u35 nsimd_sleef_exp2_u35d_sve1024_f32 #define xexp10 nsimd_sleef_exp10_u10d_sve1024_f64 #define xexp10f 
nsimd_sleef_exp10_u10d_sve1024_f32 #define xexp10_u35 nsimd_sleef_exp10_u35d_sve1024_f64 #define xexp10f_u35 nsimd_sleef_exp10_u35d_sve1024_f32 #define xexpm1 nsimd_sleef_expm1_u10d_sve1024_f64 #define xexpm1f nsimd_sleef_expm1_u10d_sve1024_f32 #define xlog10 nsimd_sleef_log10_u10d_sve1024_f64 #define xlog10f nsimd_sleef_log10_u10d_sve1024_f32 #define xlog2 nsimd_sleef_log2_u10d_sve1024_f64 #define xlog2f nsimd_sleef_log2_u10d_sve1024_f32 #define xlog2_u35 nsimd_sleef_log2_u35d_sve1024_f64 #define xlog2f_u35 nsimd_sleef_log2_u35d_sve1024_f32 #define xlog1p nsimd_sleef_log1p_u10d_sve1024_f64 #define xlog1pf nsimd_sleef_log1p_u10d_sve1024_f32 #define xsincospi_u05 nsimd_sleef_sincospi_u05d_sve1024_f64 #define xsincospif_u05 nsimd_sleef_sincospi_u05d_sve1024_f32 #define xsincospi_u35 nsimd_sleef_sincospi_u35d_sve1024_f64 #define xsincospif_u35 nsimd_sleef_sincospi_u35d_sve1024_f32 #define xsinpi_u05 nsimd_sleef_sinpi_u05d_sve1024_f64 #define xsinpif_u05 nsimd_sleef_sinpi_u05d_sve1024_f32 #define xcospi_u05 nsimd_sleef_cospi_u05d_sve1024_f64 #define xcospif_u05 nsimd_sleef_cospi_u05d_sve1024_f32 #define xldexp nsimd_sleef_ldexp_sve1024_f64 #define xldexpf nsimd_sleef_ldexp_sve1024_f32 #define xilogb nsimd_sleef_ilogb_sve1024_f64 #define xilogbf nsimd_sleef_ilogb_sve1024_f32 #define xfma nsimd_sleef_fma_sve1024_f64 #define xfmaf nsimd_sleef_fma_sve1024_f32 #define xsqrt nsimd_sleef_sqrt_sve1024_f64 #define xsqrtf nsimd_sleef_sqrt_sve1024_f32 #define xsqrt_u05 nsimd_sleef_sqrt_u05d_sve1024_f64 #define xsqrtf_u05 nsimd_sleef_sqrt_u05d_sve1024_f32 #define xsqrt_u35 nsimd_sleef_sqrt_u35d_sve1024_f64 #define xsqrtf_u35 nsimd_sleef_sqrt_u35d_sve1024_f32 #define xhypot_u05 nsimd_sleef_hypot_u05d_sve1024_f64 #define xhypotf_u05 nsimd_sleef_hypot_u05d_sve1024_f32 #define xhypot_u35 nsimd_sleef_hypot_u35d_sve1024_f64 #define xhypotf_u35 nsimd_sleef_hypot_u35d_sve1024_f32 #define xfabs nsimd_sleef_fabs_sve1024_f64 #define xfabsf nsimd_sleef_fabs_sve1024_f32 #define xcopysign 
nsimd_sleef_copysign_sve1024_f64 #define xcopysignf nsimd_sleef_copysign_sve1024_f32 #define xfmax nsimd_sleef_fmax_sve1024_f64 #define xfmaxf nsimd_sleef_fmax_sve1024_f32 #define xfmin nsimd_sleef_fmin_sve1024_f64 #define xfminf nsimd_sleef_fmin_sve1024_f32 #define xfdim nsimd_sleef_fdim_sve1024_f64 #define xfdimf nsimd_sleef_fdim_sve1024_f32 #define xtrunc nsimd_sleef_trunc_sve1024_f64 #define xtruncf nsimd_sleef_trunc_sve1024_f32 #define xfloor nsimd_sleef_floor_sve1024_f64 #define xfloorf nsimd_sleef_floor_sve1024_f32 #define xceil nsimd_sleef_ceil_sve1024_f64 #define xceilf nsimd_sleef_ceil_sve1024_f32 #define xround nsimd_sleef_round_sve1024_f64 #define xroundf nsimd_sleef_round_sve1024_f32 #define xrint nsimd_sleef_rint_sve1024_f64 #define xrintf nsimd_sleef_rint_sve1024_f32 #define xnextafter nsimd_sleef_nextafter_sve1024_f64 #define xnextafterf nsimd_sleef_nextafter_sve1024_f32 #define xfrfrexp nsimd_sleef_frfrexp_sve1024_f64 #define xfrfrexpf nsimd_sleef_frfrexp_sve1024_f32 #define xexpfrexp nsimd_sleef_expfrexp_sve1024_f64 #define xexpfrexpf nsimd_sleef_expfrexp_sve1024_f32 #define xfmod nsimd_sleef_fmod_sve1024_f64 #define xfmodf nsimd_sleef_fmod_sve1024_f32 #define xremainder nsimd_sleef_remainder_sve1024_f64 #define xremainderf nsimd_sleef_remainder_sve1024_f32 #define xmodf nsimd_sleef_modf_sve1024_f64 #define xmodff nsimd_sleef_modf_sve1024_f32 #define xlgamma_u1 nsimd_sleef_lgamma_u10d_sve1024_f64 #define xlgammaf_u1 nsimd_sleef_lgamma_u10d_sve1024_f32 #define xtgamma_u1 nsimd_sleef_tgamma_u10d_sve1024_f64 #define xtgammaf_u1 nsimd_sleef_tgamma_u10d_sve1024_f32 #define xerf_u1 nsimd_sleef_erf_u10d_sve1024_f64 #define xerff_u1 nsimd_sleef_erf_u10d_sve1024_f32 #define xerfc_u15 nsimd_sleef_erfc_u15d_sve1024_f64 #define xerfcf_u15 nsimd_sleef_erfc_u15d_sve1024_f32 #define xgetInt nsimd_sleef_getInt_sve1024_f64 #define xgetIntf nsimd_sleef_getInt_sve1024_f32 #define xgetPtr nsimd_sleef_getPtr_sve1024_f64 #define xgetPtrf nsimd_sleef_getPtr_sve1024_f32 
#else #define xsin nsimd_sleef_sin_u35_sve1024_f64 #define xsinf nsimd_sleef_sin_u35_sve1024_f32 #define xcos nsimd_sleef_cos_u35_sve1024_f64 #define xcosf nsimd_sleef_cos_u35_sve1024_f32 #define xsincos nsimd_sleef_sincos_u35_sve1024_f64 #define xsincosf nsimd_sleef_sincos_u35_sve1024_f32 #define xtan nsimd_sleef_tan_u35_sve1024_f64 #define xtanf nsimd_sleef_tan_u35_sve1024_f32 #define xasin nsimd_sleef_asin_u35_sve1024_f64 #define xasinf nsimd_sleef_asin_u35_sve1024_f32 #define xacos nsimd_sleef_acos_u35_sve1024_f64 #define xacosf nsimd_sleef_acos_u35_sve1024_f32 #define xatan nsimd_sleef_atan_u35_sve1024_f64 #define xatanf nsimd_sleef_atan_u35_sve1024_f32 #define xatan2 nsimd_sleef_atan2_u35_sve1024_f64 #define xatan2f nsimd_sleef_atan2_u35_sve1024_f32 #define xlog nsimd_sleef_log_u35_sve1024_f64 #define xlogf nsimd_sleef_log_u35_sve1024_f32 #define xcbrt nsimd_sleef_cbrt_u35_sve1024_f64 #define xcbrtf nsimd_sleef_cbrt_u35_sve1024_f32 #define xsin_u1 nsimd_sleef_sin_u10_sve1024_f64 #define xsinf_u1 nsimd_sleef_sin_u10_sve1024_f32 #define xcos_u1 nsimd_sleef_cos_u10_sve1024_f64 #define xcosf_u1 nsimd_sleef_cos_u10_sve1024_f32 #define xsincos_u1 nsimd_sleef_sincos_u10_sve1024_f64 #define xsincosf_u1 nsimd_sleef_sincos_u10_sve1024_f32 #define xtan_u1 nsimd_sleef_tan_u10_sve1024_f64 #define xtanf_u1 nsimd_sleef_tan_u10_sve1024_f32 #define xasin_u1 nsimd_sleef_asin_u10_sve1024_f64 #define xasinf_u1 nsimd_sleef_asin_u10_sve1024_f32 #define xacos_u1 nsimd_sleef_acos_u10_sve1024_f64 #define xacosf_u1 nsimd_sleef_acos_u10_sve1024_f32 #define xatan_u1 nsimd_sleef_atan_u10_sve1024_f64 #define xatanf_u1 nsimd_sleef_atan_u10_sve1024_f32 #define xatan2_u1 nsimd_sleef_atan2_u10_sve1024_f64 #define xatan2f_u1 nsimd_sleef_atan2_u10_sve1024_f32 #define xlog_u1 nsimd_sleef_log_u10_sve1024_f64 #define xlogf_u1 nsimd_sleef_log_u10_sve1024_f32 #define xcbrt_u1 nsimd_sleef_cbrt_u10_sve1024_f64 #define xcbrtf_u1 nsimd_sleef_cbrt_u10_sve1024_f32 #define xexp 
nsimd_sleef_exp_u10_sve1024_f64 #define xexpf nsimd_sleef_exp_u10_sve1024_f32 #define xpow nsimd_sleef_pow_u10_sve1024_f64 #define xpowf nsimd_sleef_pow_u10_sve1024_f32 #define xsinh nsimd_sleef_sinh_u10_sve1024_f64 #define xsinhf nsimd_sleef_sinh_u10_sve1024_f32 #define xcosh nsimd_sleef_cosh_u10_sve1024_f64 #define xcoshf nsimd_sleef_cosh_u10_sve1024_f32 #define xtanh nsimd_sleef_tanh_u10_sve1024_f64 #define xtanhf nsimd_sleef_tanh_u10_sve1024_f32 #define xsinh_u35 nsimd_sleef_sinh_u35_sve1024_f64 #define xsinhf_u35 nsimd_sleef_sinh_u35_sve1024_f32 #define xcosh_u35 nsimd_sleef_cosh_u35_sve1024_f64 #define xcoshf_u35 nsimd_sleef_cosh_u35_sve1024_f32 #define xtanh_u35 nsimd_sleef_tanh_u35_sve1024_f64 #define xtanhf_u35 nsimd_sleef_tanh_u35_sve1024_f32 #define xfastsin_u3500 nsimd_sleef_fastsin_u3500_sve1024_f64 #define xfastsinf_u3500 nsimd_sleef_fastsin_u3500_sve1024_f32 #define xfastcos_u3500 nsimd_sleef_fastcos_u3500_sve1024_f64 #define xfastcosf_u3500 nsimd_sleef_fastcos_u3500_sve1024_f32 #define xfastpow_u3500 nsimd_sleef_fastpow_u3500_sve1024_f64 #define xfastpowf_u3500 nsimd_sleef_fastpow_u3500_sve1024_f32 #define xasinh nsimd_sleef_asinh_u10_sve1024_f64 #define xasinhf nsimd_sleef_asinh_u10_sve1024_f32 #define xacosh nsimd_sleef_acosh_u10_sve1024_f64 #define xacoshf nsimd_sleef_acosh_u10_sve1024_f32 #define xatanh nsimd_sleef_atanh_u10_sve1024_f64 #define xatanhf nsimd_sleef_atanh_u10_sve1024_f32 #define xexp2 nsimd_sleef_exp2_u10_sve1024_f64 #define xexp2f nsimd_sleef_exp2_u10_sve1024_f32 #define xexp2_u35 nsimd_sleef_exp2_u35_sve1024_f64 #define xexp2f_u35 nsimd_sleef_exp2_u35_sve1024_f32 #define xexp10 nsimd_sleef_exp10_u10_sve1024_f64 #define xexp10f nsimd_sleef_exp10_u10_sve1024_f32 #define xexp10_u35 nsimd_sleef_exp10_u35_sve1024_f64 #define xexp10f_u35 nsimd_sleef_exp10_u35_sve1024_f32 #define xexpm1 nsimd_sleef_expm1_u10_sve1024_f64 #define xexpm1f nsimd_sleef_expm1_u10_sve1024_f32 #define xlog10 nsimd_sleef_log10_u10_sve1024_f64 #define xlog10f 
nsimd_sleef_log10_u10_sve1024_f32 #define xlog2 nsimd_sleef_log2_u10_sve1024_f64 #define xlog2f nsimd_sleef_log2_u10_sve1024_f32 #define xlog2_u35 nsimd_sleef_log2_u35_sve1024_f64 #define xlog2f_u35 nsimd_sleef_log2_u35_sve1024_f32 #define xlog1p nsimd_sleef_log1p_u10_sve1024_f64 #define xlog1pf nsimd_sleef_log1p_u10_sve1024_f32 #define xsincospi_u05 nsimd_sleef_sincospi_u05_sve1024_f64 #define xsincospif_u05 nsimd_sleef_sincospi_u05_sve1024_f32 #define xsincospi_u35 nsimd_sleef_sincospi_u35_sve1024_f64 #define xsincospif_u35 nsimd_sleef_sincospi_u35_sve1024_f32 #define xsinpi_u05 nsimd_sleef_sinpi_u05_sve1024_f64 #define xsinpif_u05 nsimd_sleef_sinpi_u05_sve1024_f32 #define xcospi_u05 nsimd_sleef_cospi_u05_sve1024_f64 #define xcospif_u05 nsimd_sleef_cospi_u05_sve1024_f32 #define xldexp nsimd_sleef_ldexp_sve1024_f64 #define xldexpf nsimd_sleef_ldexp_sve1024_f32 #define xilogb nsimd_sleef_ilogb_sve1024_f64 #define xilogbf nsimd_sleef_ilogb_sve1024_f32 #define xfma nsimd_sleef_fma_sve1024_f64 #define xfmaf nsimd_sleef_fma_sve1024_f32 #define xsqrt nsimd_sleef_sqrt_sve1024_f64 #define xsqrtf nsimd_sleef_sqrt_sve1024_f32 #define xsqrt_u05 nsimd_sleef_sqrt_u05_sve1024_f64 #define xsqrtf_u05 nsimd_sleef_sqrt_u05_sve1024_f32 #define xsqrt_u35 nsimd_sleef_sqrt_u35_sve1024_f64 #define xsqrtf_u35 nsimd_sleef_sqrt_u35_sve1024_f32 #define xhypot_u05 nsimd_sleef_hypot_u05_sve1024_f64 #define xhypotf_u05 nsimd_sleef_hypot_u05_sve1024_f32 #define xhypot_u35 nsimd_sleef_hypot_u35_sve1024_f64 #define xhypotf_u35 nsimd_sleef_hypot_u35_sve1024_f32 #define xfabs nsimd_sleef_fabs_sve1024_f64 #define xfabsf nsimd_sleef_fabs_sve1024_f32 #define xcopysign nsimd_sleef_copysign_sve1024_f64 #define xcopysignf nsimd_sleef_copysign_sve1024_f32 #define xfmax nsimd_sleef_fmax_sve1024_f64 #define xfmaxf nsimd_sleef_fmax_sve1024_f32 #define xfmin nsimd_sleef_fmin_sve1024_f64 #define xfminf nsimd_sleef_fmin_sve1024_f32 #define xfdim nsimd_sleef_fdim_sve1024_f64 #define xfdimf 
nsimd_sleef_fdim_sve1024_f32 #define xtrunc nsimd_sleef_trunc_sve1024_f64 #define xtruncf nsimd_sleef_trunc_sve1024_f32 #define xfloor nsimd_sleef_floor_sve1024_f64 #define xfloorf nsimd_sleef_floor_sve1024_f32 #define xceil nsimd_sleef_ceil_sve1024_f64 #define xceilf nsimd_sleef_ceil_sve1024_f32 #define xround nsimd_sleef_round_sve1024_f64 #define xroundf nsimd_sleef_round_sve1024_f32 #define xrint nsimd_sleef_rint_sve1024_f64 #define xrintf nsimd_sleef_rint_sve1024_f32 #define xnextafter nsimd_sleef_nextafter_sve1024_f64 #define xnextafterf nsimd_sleef_nextafter_sve1024_f32 #define xfrfrexp nsimd_sleef_frfrexp_sve1024_f64 #define xfrfrexpf nsimd_sleef_frfrexp_sve1024_f32 #define xexpfrexp nsimd_sleef_expfrexp_sve1024_f64 #define xexpfrexpf nsimd_sleef_expfrexp_sve1024_f32 #define xfmod nsimd_sleef_fmod_sve1024_f64 #define xfmodf nsimd_sleef_fmod_sve1024_f32 #define xremainder nsimd_sleef_remainder_sve1024_f64 #define xremainderf nsimd_sleef_remainder_sve1024_f32 #define xmodf nsimd_sleef_modf_sve1024_f64 #define xmodff nsimd_sleef_modf_sve1024_f32 #define xlgamma_u1 nsimd_sleef_lgamma_u10_sve1024_f64 #define xlgammaf_u1 nsimd_sleef_lgamma_u10_sve1024_f32 #define xtgamma_u1 nsimd_sleef_tgamma_u10_sve1024_f64 #define xtgammaf_u1 nsimd_sleef_tgamma_u10_sve1024_f32 #define xerf_u1 nsimd_sleef_erf_u10_sve1024_f64 #define xerff_u1 nsimd_sleef_erf_u10_sve1024_f32 #define xerfc_u15 nsimd_sleef_erfc_u15_sve1024_f64 #define xerfcf_u15 nsimd_sleef_erfc_u15_sve1024_f32 #define xgetInt nsimd_sleef_getInt_sve1024_f64 #define xgetIntf nsimd_sleef_getInt_sve1024_f32 #define xgetPtr nsimd_sleef_getPtr_sve1024_f64 #define xgetPtrf nsimd_sleef_getPtr_sve1024_f32 #endif #define rempi nsimd_sleef_rempi_sve1024 #define rempif nsimd_sleef_rempif_sve1024 #define rempisub nsimd_sleef_rempisub_sve1024 #define rempisubf nsimd_sleef_rempisubf_sve1024 #define gammak nsimd_gammak_sve1024 #define gammafk nsimd_gammafk_sve1024 #endif /* 
------------------------------------------------------------------------- */ /* Naming of functions sve2048 */ #ifdef NSIMD_SVE2048 #ifdef DETERMINISTIC #define xsin nsimd_sleef_sin_u35d_sve2048_f64 #define xsinf nsimd_sleef_sin_u35d_sve2048_f32 #define xcos nsimd_sleef_cos_u35d_sve2048_f64 #define xcosf nsimd_sleef_cos_u35d_sve2048_f32 #define xsincos nsimd_sleef_sincos_u35d_sve2048_f64 #define xsincosf nsimd_sleef_sincos_u35d_sve2048_f32 #define xtan nsimd_sleef_tan_u35d_sve2048_f64 #define xtanf nsimd_sleef_tan_u35d_sve2048_f32 #define xasin nsimd_sleef_asin_u35d_sve2048_f64 #define xasinf nsimd_sleef_asin_u35d_sve2048_f32 #define xacos nsimd_sleef_acos_u35d_sve2048_f64 #define xacosf nsimd_sleef_acos_u35d_sve2048_f32 #define xatan nsimd_sleef_atan_u35d_sve2048_f64 #define xatanf nsimd_sleef_atan_u35d_sve2048_f32 #define xatan2 nsimd_sleef_atan2_u35d_sve2048_f64 #define xatan2f nsimd_sleef_atan2_u35d_sve2048_f32 #define xlog nsimd_sleef_log_u35d_sve2048_f64 #define xlogf nsimd_sleef_log_u35d_sve2048_f32 #define xcbrt nsimd_sleef_cbrt_u35d_sve2048_f64 #define xcbrtf nsimd_sleef_cbrt_u35d_sve2048_f32 #define xsin_u1 nsimd_sleef_sin_u10d_sve2048_f64 #define xsinf_u1 nsimd_sleef_sin_u10d_sve2048_f32 #define xcos_u1 nsimd_sleef_cos_u10d_sve2048_f64 #define xcosf_u1 nsimd_sleef_cos_u10d_sve2048_f32 #define xsincos_u1 nsimd_sleef_sincos_u10d_sve2048_f64 #define xsincosf_u1 nsimd_sleef_sincos_u10d_sve2048_f32 #define xtan_u1 nsimd_sleef_tan_u10d_sve2048_f64 #define xtanf_u1 nsimd_sleef_tan_u10d_sve2048_f32 #define xasin_u1 nsimd_sleef_asin_u10d_sve2048_f64 #define xasinf_u1 nsimd_sleef_asin_u10d_sve2048_f32 #define xacos_u1 nsimd_sleef_acos_u10d_sve2048_f64 #define xacosf_u1 nsimd_sleef_acos_u10d_sve2048_f32 #define xatan_u1 nsimd_sleef_atan_u10d_sve2048_f64 #define xatanf_u1 nsimd_sleef_atan_u10d_sve2048_f32 #define xatan2_u1 nsimd_sleef_atan2_u10d_sve2048_f64 #define xatan2f_u1 nsimd_sleef_atan2_u10d_sve2048_f32 #define xlog_u1 nsimd_sleef_log_u10d_sve2048_f64 #define 
xlogf_u1 nsimd_sleef_log_u10d_sve2048_f32 #define xcbrt_u1 nsimd_sleef_cbrt_u10d_sve2048_f64 #define xcbrtf_u1 nsimd_sleef_cbrt_u10d_sve2048_f32 #define xexp nsimd_sleef_exp_u10d_sve2048_f64 #define xexpf nsimd_sleef_exp_u10d_sve2048_f32 #define xpow nsimd_sleef_pow_u10d_sve2048_f64 #define xpowf nsimd_sleef_pow_u10d_sve2048_f32 #define xsinh nsimd_sleef_sinh_u10d_sve2048_f64 #define xsinhf nsimd_sleef_sinh_u10d_sve2048_f32 #define xcosh nsimd_sleef_cosh_u10d_sve2048_f64 #define xcoshf nsimd_sleef_cosh_u10d_sve2048_f32 #define xtanh nsimd_sleef_tanh_u10d_sve2048_f64 #define xtanhf nsimd_sleef_tanh_u10d_sve2048_f32 #define xsinh_u35 nsimd_sleef_sinh_u35d_sve2048_f64 #define xsinhf_u35 nsimd_sleef_sinh_u35d_sve2048_f32 #define xcosh_u35 nsimd_sleef_cosh_u35d_sve2048_f64 #define xcoshf_u35 nsimd_sleef_cosh_u35d_sve2048_f32 #define xtanh_u35 nsimd_sleef_tanh_u35d_sve2048_f64 #define xtanhf_u35 nsimd_sleef_tanh_u35d_sve2048_f32 #define xfastsin_u3500 nsimd_sleef_fastsin_u3500d_sve2048_f64 #define xfastsinf_u3500 nsimd_sleef_fastsin_u3500d_sve2048_f32 #define xfastcos_u3500 nsimd_sleef_fastcos_u3500d_sve2048_f64 #define xfastcosf_u3500 nsimd_sleef_fastcos_u3500d_sve2048_f32 #define xfastpow_u3500 nsimd_sleef_fastpow_u3500d_sve2048_f64 #define xfastpowf_u3500 nsimd_sleef_fastpow_u3500d_sve2048_f32 #define xasinh nsimd_sleef_asinh_u10d_sve2048_f64 #define xasinhf nsimd_sleef_asinh_u10d_sve2048_f32 #define xacosh nsimd_sleef_acosh_u10d_sve2048_f64 #define xacoshf nsimd_sleef_acosh_u10d_sve2048_f32 #define xatanh nsimd_sleef_atanh_u10d_sve2048_f64 #define xatanhf nsimd_sleef_atanh_u10d_sve2048_f32 #define xexp2 nsimd_sleef_exp2_u10d_sve2048_f64 #define xexp2f nsimd_sleef_exp2_u10d_sve2048_f32 #define xexp2_u35 nsimd_sleef_exp2_u35d_sve2048_f64 #define xexp2f_u35 nsimd_sleef_exp2_u35d_sve2048_f32 #define xexp10 nsimd_sleef_exp10_u10d_sve2048_f64 #define xexp10f nsimd_sleef_exp10_u10d_sve2048_f32 #define xexp10_u35 nsimd_sleef_exp10_u35d_sve2048_f64 #define xexp10f_u35 
nsimd_sleef_exp10_u35d_sve2048_f32 #define xexpm1 nsimd_sleef_expm1_u10d_sve2048_f64 #define xexpm1f nsimd_sleef_expm1_u10d_sve2048_f32 #define xlog10 nsimd_sleef_log10_u10d_sve2048_f64 #define xlog10f nsimd_sleef_log10_u10d_sve2048_f32 #define xlog2 nsimd_sleef_log2_u10d_sve2048_f64 #define xlog2f nsimd_sleef_log2_u10d_sve2048_f32 #define xlog2_u35 nsimd_sleef_log2_u35d_sve2048_f64 #define xlog2f_u35 nsimd_sleef_log2_u35d_sve2048_f32 #define xlog1p nsimd_sleef_log1p_u10d_sve2048_f64 #define xlog1pf nsimd_sleef_log1p_u10d_sve2048_f32 #define xsincospi_u05 nsimd_sleef_sincospi_u05d_sve2048_f64 #define xsincospif_u05 nsimd_sleef_sincospi_u05d_sve2048_f32 #define xsincospi_u35 nsimd_sleef_sincospi_u35d_sve2048_f64 #define xsincospif_u35 nsimd_sleef_sincospi_u35d_sve2048_f32 #define xsinpi_u05 nsimd_sleef_sinpi_u05d_sve2048_f64 #define xsinpif_u05 nsimd_sleef_sinpi_u05d_sve2048_f32 #define xcospi_u05 nsimd_sleef_cospi_u05d_sve2048_f64 #define xcospif_u05 nsimd_sleef_cospi_u05d_sve2048_f32 #define xldexp nsimd_sleef_ldexp_sve2048_f64 #define xldexpf nsimd_sleef_ldexp_sve2048_f32 #define xilogb nsimd_sleef_ilogb_sve2048_f64 #define xilogbf nsimd_sleef_ilogb_sve2048_f32 #define xfma nsimd_sleef_fma_sve2048_f64 #define xfmaf nsimd_sleef_fma_sve2048_f32 #define xsqrt nsimd_sleef_sqrt_sve2048_f64 #define xsqrtf nsimd_sleef_sqrt_sve2048_f32 #define xsqrt_u05 nsimd_sleef_sqrt_u05d_sve2048_f64 #define xsqrtf_u05 nsimd_sleef_sqrt_u05d_sve2048_f32 #define xsqrt_u35 nsimd_sleef_sqrt_u35d_sve2048_f64 #define xsqrtf_u35 nsimd_sleef_sqrt_u35d_sve2048_f32 #define xhypot_u05 nsimd_sleef_hypot_u05d_sve2048_f64 #define xhypotf_u05 nsimd_sleef_hypot_u05d_sve2048_f32 #define xhypot_u35 nsimd_sleef_hypot_u35d_sve2048_f64 #define xhypotf_u35 nsimd_sleef_hypot_u35d_sve2048_f32 #define xfabs nsimd_sleef_fabs_sve2048_f64 #define xfabsf nsimd_sleef_fabs_sve2048_f32 #define xcopysign nsimd_sleef_copysign_sve2048_f64 #define xcopysignf nsimd_sleef_copysign_sve2048_f32 #define xfmax 
nsimd_sleef_fmax_sve2048_f64 #define xfmaxf nsimd_sleef_fmax_sve2048_f32 #define xfmin nsimd_sleef_fmin_sve2048_f64 #define xfminf nsimd_sleef_fmin_sve2048_f32 #define xfdim nsimd_sleef_fdim_sve2048_f64 #define xfdimf nsimd_sleef_fdim_sve2048_f32 #define xtrunc nsimd_sleef_trunc_sve2048_f64 #define xtruncf nsimd_sleef_trunc_sve2048_f32 #define xfloor nsimd_sleef_floor_sve2048_f64 #define xfloorf nsimd_sleef_floor_sve2048_f32 #define xceil nsimd_sleef_ceil_sve2048_f64 #define xceilf nsimd_sleef_ceil_sve2048_f32 #define xround nsimd_sleef_round_sve2048_f64 #define xroundf nsimd_sleef_round_sve2048_f32 #define xrint nsimd_sleef_rint_sve2048_f64 #define xrintf nsimd_sleef_rint_sve2048_f32 #define xnextafter nsimd_sleef_nextafter_sve2048_f64 #define xnextafterf nsimd_sleef_nextafter_sve2048_f32 #define xfrfrexp nsimd_sleef_frfrexp_sve2048_f64 #define xfrfrexpf nsimd_sleef_frfrexp_sve2048_f32 #define xexpfrexp nsimd_sleef_expfrexp_sve2048_f64 #define xexpfrexpf nsimd_sleef_expfrexp_sve2048_f32 #define xfmod nsimd_sleef_fmod_sve2048_f64 #define xfmodf nsimd_sleef_fmod_sve2048_f32 #define xremainder nsimd_sleef_remainder_sve2048_f64 #define xremainderf nsimd_sleef_remainder_sve2048_f32 #define xmodf nsimd_sleef_modf_sve2048_f64 #define xmodff nsimd_sleef_modf_sve2048_f32 #define xlgamma_u1 nsimd_sleef_lgamma_u10d_sve2048_f64 #define xlgammaf_u1 nsimd_sleef_lgamma_u10d_sve2048_f32 #define xtgamma_u1 nsimd_sleef_tgamma_u10d_sve2048_f64 #define xtgammaf_u1 nsimd_sleef_tgamma_u10d_sve2048_f32 #define xerf_u1 nsimd_sleef_erf_u10d_sve2048_f64 #define xerff_u1 nsimd_sleef_erf_u10d_sve2048_f32 #define xerfc_u15 nsimd_sleef_erfc_u15d_sve2048_f64 #define xerfcf_u15 nsimd_sleef_erfc_u15d_sve2048_f32 #define xgetInt nsimd_sleef_getInt_sve2048_f64 #define xgetIntf nsimd_sleef_getInt_sve2048_f32 #define xgetPtr nsimd_sleef_getPtr_sve2048_f64 #define xgetPtrf nsimd_sleef_getPtr_sve2048_f32 #else #define xsin nsimd_sleef_sin_u35_sve2048_f64 #define xsinf nsimd_sleef_sin_u35_sve2048_f32 
#define xcos nsimd_sleef_cos_u35_sve2048_f64 #define xcosf nsimd_sleef_cos_u35_sve2048_f32 #define xsincos nsimd_sleef_sincos_u35_sve2048_f64 #define xsincosf nsimd_sleef_sincos_u35_sve2048_f32 #define xtan nsimd_sleef_tan_u35_sve2048_f64 #define xtanf nsimd_sleef_tan_u35_sve2048_f32 #define xasin nsimd_sleef_asin_u35_sve2048_f64 #define xasinf nsimd_sleef_asin_u35_sve2048_f32 #define xacos nsimd_sleef_acos_u35_sve2048_f64 #define xacosf nsimd_sleef_acos_u35_sve2048_f32 #define xatan nsimd_sleef_atan_u35_sve2048_f64 #define xatanf nsimd_sleef_atan_u35_sve2048_f32 #define xatan2 nsimd_sleef_atan2_u35_sve2048_f64 #define xatan2f nsimd_sleef_atan2_u35_sve2048_f32 #define xlog nsimd_sleef_log_u35_sve2048_f64 #define xlogf nsimd_sleef_log_u35_sve2048_f32 #define xcbrt nsimd_sleef_cbrt_u35_sve2048_f64 #define xcbrtf nsimd_sleef_cbrt_u35_sve2048_f32 #define xsin_u1 nsimd_sleef_sin_u10_sve2048_f64 #define xsinf_u1 nsimd_sleef_sin_u10_sve2048_f32 #define xcos_u1 nsimd_sleef_cos_u10_sve2048_f64 #define xcosf_u1 nsimd_sleef_cos_u10_sve2048_f32 #define xsincos_u1 nsimd_sleef_sincos_u10_sve2048_f64 #define xsincosf_u1 nsimd_sleef_sincos_u10_sve2048_f32 #define xtan_u1 nsimd_sleef_tan_u10_sve2048_f64 #define xtanf_u1 nsimd_sleef_tan_u10_sve2048_f32 #define xasin_u1 nsimd_sleef_asin_u10_sve2048_f64 #define xasinf_u1 nsimd_sleef_asin_u10_sve2048_f32 #define xacos_u1 nsimd_sleef_acos_u10_sve2048_f64 #define xacosf_u1 nsimd_sleef_acos_u10_sve2048_f32 #define xatan_u1 nsimd_sleef_atan_u10_sve2048_f64 #define xatanf_u1 nsimd_sleef_atan_u10_sve2048_f32 #define xatan2_u1 nsimd_sleef_atan2_u10_sve2048_f64 #define xatan2f_u1 nsimd_sleef_atan2_u10_sve2048_f32 #define xlog_u1 nsimd_sleef_log_u10_sve2048_f64 #define xlogf_u1 nsimd_sleef_log_u10_sve2048_f32 #define xcbrt_u1 nsimd_sleef_cbrt_u10_sve2048_f64 #define xcbrtf_u1 nsimd_sleef_cbrt_u10_sve2048_f32 #define xexp nsimd_sleef_exp_u10_sve2048_f64 #define xexpf nsimd_sleef_exp_u10_sve2048_f32 #define xpow nsimd_sleef_pow_u10_sve2048_f64 
#define xpowf nsimd_sleef_pow_u10_sve2048_f32 #define xsinh nsimd_sleef_sinh_u10_sve2048_f64 #define xsinhf nsimd_sleef_sinh_u10_sve2048_f32 #define xcosh nsimd_sleef_cosh_u10_sve2048_f64 #define xcoshf nsimd_sleef_cosh_u10_sve2048_f32 #define xtanh nsimd_sleef_tanh_u10_sve2048_f64 #define xtanhf nsimd_sleef_tanh_u10_sve2048_f32 #define xsinh_u35 nsimd_sleef_sinh_u35_sve2048_f64 #define xsinhf_u35 nsimd_sleef_sinh_u35_sve2048_f32 #define xcosh_u35 nsimd_sleef_cosh_u35_sve2048_f64 #define xcoshf_u35 nsimd_sleef_cosh_u35_sve2048_f32 #define xtanh_u35 nsimd_sleef_tanh_u35_sve2048_f64 #define xtanhf_u35 nsimd_sleef_tanh_u35_sve2048_f32 #define xfastsin_u3500 nsimd_sleef_fastsin_u3500_sve2048_f64 #define xfastsinf_u3500 nsimd_sleef_fastsin_u3500_sve2048_f32 #define xfastcos_u3500 nsimd_sleef_fastcos_u3500_sve2048_f64 #define xfastcosf_u3500 nsimd_sleef_fastcos_u3500_sve2048_f32 #define xfastpow_u3500 nsimd_sleef_fastpow_u3500_sve2048_f64 #define xfastpowf_u3500 nsimd_sleef_fastpow_u3500_sve2048_f32 #define xasinh nsimd_sleef_asinh_u10_sve2048_f64 #define xasinhf nsimd_sleef_asinh_u10_sve2048_f32 #define xacosh nsimd_sleef_acosh_u10_sve2048_f64 #define xacoshf nsimd_sleef_acosh_u10_sve2048_f32 #define xatanh nsimd_sleef_atanh_u10_sve2048_f64 #define xatanhf nsimd_sleef_atanh_u10_sve2048_f32 #define xexp2 nsimd_sleef_exp2_u10_sve2048_f64 #define xexp2f nsimd_sleef_exp2_u10_sve2048_f32 #define xexp2_u35 nsimd_sleef_exp2_u35_sve2048_f64 #define xexp2f_u35 nsimd_sleef_exp2_u35_sve2048_f32 #define xexp10 nsimd_sleef_exp10_u10_sve2048_f64 #define xexp10f nsimd_sleef_exp10_u10_sve2048_f32 #define xexp10_u35 nsimd_sleef_exp10_u35_sve2048_f64 #define xexp10f_u35 nsimd_sleef_exp10_u35_sve2048_f32 #define xexpm1 nsimd_sleef_expm1_u10_sve2048_f64 #define xexpm1f nsimd_sleef_expm1_u10_sve2048_f32 #define xlog10 nsimd_sleef_log10_u10_sve2048_f64 #define xlog10f nsimd_sleef_log10_u10_sve2048_f32 #define xlog2 nsimd_sleef_log2_u10_sve2048_f64 #define xlog2f 
nsimd_sleef_log2_u10_sve2048_f32 #define xlog2_u35 nsimd_sleef_log2_u35_sve2048_f64 #define xlog2f_u35 nsimd_sleef_log2_u35_sve2048_f32 #define xlog1p nsimd_sleef_log1p_u10_sve2048_f64 #define xlog1pf nsimd_sleef_log1p_u10_sve2048_f32 #define xsincospi_u05 nsimd_sleef_sincospi_u05_sve2048_f64 #define xsincospif_u05 nsimd_sleef_sincospi_u05_sve2048_f32 #define xsincospi_u35 nsimd_sleef_sincospi_u35_sve2048_f64 #define xsincospif_u35 nsimd_sleef_sincospi_u35_sve2048_f32 #define xsinpi_u05 nsimd_sleef_sinpi_u05_sve2048_f64 #define xsinpif_u05 nsimd_sleef_sinpi_u05_sve2048_f32 #define xcospi_u05 nsimd_sleef_cospi_u05_sve2048_f64 #define xcospif_u05 nsimd_sleef_cospi_u05_sve2048_f32 #define xldexp nsimd_sleef_ldexp_sve2048_f64 #define xldexpf nsimd_sleef_ldexp_sve2048_f32 #define xilogb nsimd_sleef_ilogb_sve2048_f64 #define xilogbf nsimd_sleef_ilogb_sve2048_f32 #define xfma nsimd_sleef_fma_sve2048_f64 #define xfmaf nsimd_sleef_fma_sve2048_f32 #define xsqrt nsimd_sleef_sqrt_sve2048_f64 #define xsqrtf nsimd_sleef_sqrt_sve2048_f32 #define xsqrt_u05 nsimd_sleef_sqrt_u05_sve2048_f64 #define xsqrtf_u05 nsimd_sleef_sqrt_u05_sve2048_f32 #define xsqrt_u35 nsimd_sleef_sqrt_u35_sve2048_f64 #define xsqrtf_u35 nsimd_sleef_sqrt_u35_sve2048_f32 #define xhypot_u05 nsimd_sleef_hypot_u05_sve2048_f64 #define xhypotf_u05 nsimd_sleef_hypot_u05_sve2048_f32 #define xhypot_u35 nsimd_sleef_hypot_u35_sve2048_f64 #define xhypotf_u35 nsimd_sleef_hypot_u35_sve2048_f32 #define xfabs nsimd_sleef_fabs_sve2048_f64 #define xfabsf nsimd_sleef_fabs_sve2048_f32 #define xcopysign nsimd_sleef_copysign_sve2048_f64 #define xcopysignf nsimd_sleef_copysign_sve2048_f32 #define xfmax nsimd_sleef_fmax_sve2048_f64 #define xfmaxf nsimd_sleef_fmax_sve2048_f32 #define xfmin nsimd_sleef_fmin_sve2048_f64 #define xfminf nsimd_sleef_fmin_sve2048_f32 #define xfdim nsimd_sleef_fdim_sve2048_f64 #define xfdimf nsimd_sleef_fdim_sve2048_f32 #define xtrunc nsimd_sleef_trunc_sve2048_f64 #define xtruncf 
nsimd_sleef_trunc_sve2048_f32 #define xfloor nsimd_sleef_floor_sve2048_f64 #define xfloorf nsimd_sleef_floor_sve2048_f32 #define xceil nsimd_sleef_ceil_sve2048_f64 #define xceilf nsimd_sleef_ceil_sve2048_f32 #define xround nsimd_sleef_round_sve2048_f64 #define xroundf nsimd_sleef_round_sve2048_f32 #define xrint nsimd_sleef_rint_sve2048_f64 #define xrintf nsimd_sleef_rint_sve2048_f32 #define xnextafter nsimd_sleef_nextafter_sve2048_f64 #define xnextafterf nsimd_sleef_nextafter_sve2048_f32 #define xfrfrexp nsimd_sleef_frfrexp_sve2048_f64 #define xfrfrexpf nsimd_sleef_frfrexp_sve2048_f32 #define xexpfrexp nsimd_sleef_expfrexp_sve2048_f64 #define xexpfrexpf nsimd_sleef_expfrexp_sve2048_f32 #define xfmod nsimd_sleef_fmod_sve2048_f64 #define xfmodf nsimd_sleef_fmod_sve2048_f32 #define xremainder nsimd_sleef_remainder_sve2048_f64 #define xremainderf nsimd_sleef_remainder_sve2048_f32 #define xmodf nsimd_sleef_modf_sve2048_f64 #define xmodff nsimd_sleef_modf_sve2048_f32 #define xlgamma_u1 nsimd_sleef_lgamma_u10_sve2048_f64 #define xlgammaf_u1 nsimd_sleef_lgamma_u10_sve2048_f32 #define xtgamma_u1 nsimd_sleef_tgamma_u10_sve2048_f64 #define xtgammaf_u1 nsimd_sleef_tgamma_u10_sve2048_f32 #define xerf_u1 nsimd_sleef_erf_u10_sve2048_f64 #define xerff_u1 nsimd_sleef_erf_u10_sve2048_f32 #define xerfc_u15 nsimd_sleef_erfc_u15_sve2048_f64 #define xerfcf_u15 nsimd_sleef_erfc_u15_sve2048_f32 #define xgetInt nsimd_sleef_getInt_sve2048_f64 #define xgetIntf nsimd_sleef_getInt_sve2048_f32 #define xgetPtr nsimd_sleef_getPtr_sve2048_f64 #define xgetPtrf nsimd_sleef_getPtr_sve2048_f32 #endif #define rempi nsimd_sleef_rempi_sve2048 #define rempif nsimd_sleef_rempif_sve2048 #define rempisub nsimd_sleef_rempisub_sve2048 #define rempisubf nsimd_sleef_rempisubf_sve2048 #define gammak nsimd_gammak_sve2048 #define gammafk nsimd_gammafk_sve2048 #endif #endif ================================================ FILE: src/renamevsx.h ================================================ #ifndef RENAMEVSX_H 
#define RENAMEVSX_H /* ------------------------------------------------------------------------- */ /* Naming of functions vmx */ #ifdef NSIMD_VMX #ifdef DETERMINISTIC #define xsin nsimd_sleef_sin_u35d_vmx_f64 #define xsinf nsimd_sleef_sin_u35d_vmx_f32 #define xcos nsimd_sleef_cos_u35d_vmx_f64 #define xcosf nsimd_sleef_cos_u35d_vmx_f32 #define xsincos nsimd_sleef_sincos_u35d_vmx_f64 #define xsincosf nsimd_sleef_sincos_u35d_vmx_f32 #define xtan nsimd_sleef_tan_u35d_vmx_f64 #define xtanf nsimd_sleef_tan_u35d_vmx_f32 #define xasin nsimd_sleef_asin_u35d_vmx_f64 #define xasinf nsimd_sleef_asin_u35d_vmx_f32 #define xacos nsimd_sleef_acos_u35d_vmx_f64 #define xacosf nsimd_sleef_acos_u35d_vmx_f32 #define xatan nsimd_sleef_atan_u35d_vmx_f64 #define xatanf nsimd_sleef_atan_u35d_vmx_f32 #define xatan2 nsimd_sleef_atan2_u35d_vmx_f64 #define xatan2f nsimd_sleef_atan2_u35d_vmx_f32 #define xlog nsimd_sleef_log_u35d_vmx_f64 #define xlogf nsimd_sleef_log_u35d_vmx_f32 #define xcbrt nsimd_sleef_cbrt_u35d_vmx_f64 #define xcbrtf nsimd_sleef_cbrt_u35d_vmx_f32 #define xsin_u1 nsimd_sleef_sin_u10d_vmx_f64 #define xsinf_u1 nsimd_sleef_sin_u10d_vmx_f32 #define xcos_u1 nsimd_sleef_cos_u10d_vmx_f64 #define xcosf_u1 nsimd_sleef_cos_u10d_vmx_f32 #define xsincos_u1 nsimd_sleef_sincos_u10d_vmx_f64 #define xsincosf_u1 nsimd_sleef_sincos_u10d_vmx_f32 #define xtan_u1 nsimd_sleef_tan_u10d_vmx_f64 #define xtanf_u1 nsimd_sleef_tan_u10d_vmx_f32 #define xasin_u1 nsimd_sleef_asin_u10d_vmx_f64 #define xasinf_u1 nsimd_sleef_asin_u10d_vmx_f32 #define xacos_u1 nsimd_sleef_acos_u10d_vmx_f64 #define xacosf_u1 nsimd_sleef_acos_u10d_vmx_f32 #define xatan_u1 nsimd_sleef_atan_u10d_vmx_f64 #define xatanf_u1 nsimd_sleef_atan_u10d_vmx_f32 #define xatan2_u1 nsimd_sleef_atan2_u10d_vmx_f64 #define xatan2f_u1 nsimd_sleef_atan2_u10d_vmx_f32 #define xlog_u1 nsimd_sleef_log_u10d_vmx_f64 #define xlogf_u1 nsimd_sleef_log_u10d_vmx_f32 #define xcbrt_u1 nsimd_sleef_cbrt_u10d_vmx_f64 #define xcbrtf_u1 nsimd_sleef_cbrt_u10d_vmx_f32 
#define xexp nsimd_sleef_exp_u10d_vmx_f64 #define xexpf nsimd_sleef_exp_u10d_vmx_f32 #define xpow nsimd_sleef_pow_u10d_vmx_f64 #define xpowf nsimd_sleef_pow_u10d_vmx_f32 #define xsinh nsimd_sleef_sinh_u10d_vmx_f64 #define xsinhf nsimd_sleef_sinh_u10d_vmx_f32 #define xcosh nsimd_sleef_cosh_u10d_vmx_f64 #define xcoshf nsimd_sleef_cosh_u10d_vmx_f32 #define xtanh nsimd_sleef_tanh_u10d_vmx_f64 #define xtanhf nsimd_sleef_tanh_u10d_vmx_f32 #define xsinh_u35 nsimd_sleef_sinh_u35d_vmx_f64 #define xsinhf_u35 nsimd_sleef_sinh_u35d_vmx_f32 #define xcosh_u35 nsimd_sleef_cosh_u35d_vmx_f64 #define xcoshf_u35 nsimd_sleef_cosh_u35d_vmx_f32 #define xtanh_u35 nsimd_sleef_tanh_u35d_vmx_f64 #define xtanhf_u35 nsimd_sleef_tanh_u35d_vmx_f32 #define xfastsin_u3500 nsimd_sleef_fastsin_u3500d_vmx_f64 #define xfastsinf_u3500 nsimd_sleef_fastsin_u3500d_vmx_f32 #define xfastcos_u3500 nsimd_sleef_fastcos_u3500d_vmx_f64 #define xfastcosf_u3500 nsimd_sleef_fastcos_u3500d_vmx_f32 #define xfastpow_u3500 nsimd_sleef_fastpow_u3500d_vmx_f64 #define xfastpowf_u3500 nsimd_sleef_fastpow_u3500d_vmx_f32 #define xasinh nsimd_sleef_asinh_u10d_vmx_f64 #define xasinhf nsimd_sleef_asinh_u10d_vmx_f32 #define xacosh nsimd_sleef_acosh_u10d_vmx_f64 #define xacoshf nsimd_sleef_acosh_u10d_vmx_f32 #define xatanh nsimd_sleef_atanh_u10d_vmx_f64 #define xatanhf nsimd_sleef_atanh_u10d_vmx_f32 #define xexp2 nsimd_sleef_exp2_u10d_vmx_f64 #define xexp2f nsimd_sleef_exp2_u10d_vmx_f32 #define xexp2_u35 nsimd_sleef_exp2_u35d_vmx_f64 #define xexp2f_u35 nsimd_sleef_exp2_u35d_vmx_f32 #define xexp10 nsimd_sleef_exp10_u10d_vmx_f64 #define xexp10f nsimd_sleef_exp10_u10d_vmx_f32 #define xexp10_u35 nsimd_sleef_exp10_u35d_vmx_f64 #define xexp10f_u35 nsimd_sleef_exp10_u35d_vmx_f32 #define xexpm1 nsimd_sleef_expm1_u10d_vmx_f64 #define xexpm1f nsimd_sleef_expm1_u10d_vmx_f32 #define xlog10 nsimd_sleef_log10_u10d_vmx_f64 #define xlog10f nsimd_sleef_log10_u10d_vmx_f32 #define xlog2 nsimd_sleef_log2_u10d_vmx_f64 #define xlog2f 
nsimd_sleef_log2_u10d_vmx_f32 #define xlog2_u35 nsimd_sleef_log2_u35d_vmx_f64 #define xlog2f_u35 nsimd_sleef_log2_u35d_vmx_f32 #define xlog1p nsimd_sleef_log1p_u10d_vmx_f64 #define xlog1pf nsimd_sleef_log1p_u10d_vmx_f32 #define xsincospi_u05 nsimd_sleef_sincospi_u05d_vmx_f64 #define xsincospif_u05 nsimd_sleef_sincospi_u05d_vmx_f32 #define xsincospi_u35 nsimd_sleef_sincospi_u35d_vmx_f64 #define xsincospif_u35 nsimd_sleef_sincospi_u35d_vmx_f32 #define xsinpi_u05 nsimd_sleef_sinpi_u05d_vmx_f64 #define xsinpif_u05 nsimd_sleef_sinpi_u05d_vmx_f32 #define xcospi_u05 nsimd_sleef_cospi_u05d_vmx_f64 #define xcospif_u05 nsimd_sleef_cospi_u05d_vmx_f32 #define xldexp nsimd_sleef_ldexp_vmx_f64 #define xldexpf nsimd_sleef_ldexp_vmx_f32 #define xilogb nsimd_sleef_ilogb_vmx_f64 #define xilogbf nsimd_sleef_ilogb_vmx_f32 #define xfma nsimd_sleef_fma_vmx_f64 #define xfmaf nsimd_sleef_fma_vmx_f32 #define xsqrt nsimd_sleef_sqrt_vmx_f64 #define xsqrtf nsimd_sleef_sqrt_vmx_f32 #define xsqrt_u05 nsimd_sleef_sqrt_u05d_vmx_f64 #define xsqrtf_u05 nsimd_sleef_sqrt_u05d_vmx_f32 #define xsqrt_u35 nsimd_sleef_sqrt_u35d_vmx_f64 #define xsqrtf_u35 nsimd_sleef_sqrt_u35d_vmx_f32 #define xhypot_u05 nsimd_sleef_hypot_u05d_vmx_f64 #define xhypotf_u05 nsimd_sleef_hypot_u05d_vmx_f32 #define xhypot_u35 nsimd_sleef_hypot_u35d_vmx_f64 #define xhypotf_u35 nsimd_sleef_hypot_u35d_vmx_f32 #define xfabs nsimd_sleef_fabs_vmx_f64 #define xfabsf nsimd_sleef_fabs_vmx_f32 #define xcopysign nsimd_sleef_copysign_vmx_f64 #define xcopysignf nsimd_sleef_copysign_vmx_f32 #define xfmax nsimd_sleef_fmax_vmx_f64 #define xfmaxf nsimd_sleef_fmax_vmx_f32 #define xfmin nsimd_sleef_fmin_vmx_f64 #define xfminf nsimd_sleef_fmin_vmx_f32 #define xfdim nsimd_sleef_fdim_vmx_f64 #define xfdimf nsimd_sleef_fdim_vmx_f32 #define xtrunc nsimd_sleef_trunc_vmx_f64 #define xtruncf nsimd_sleef_trunc_vmx_f32 #define xfloor nsimd_sleef_floor_vmx_f64 #define xfloorf nsimd_sleef_floor_vmx_f32 #define xceil nsimd_sleef_ceil_vmx_f64 #define xceilf 
nsimd_sleef_ceil_vmx_f32 #define xround nsimd_sleef_round_vmx_f64 #define xroundf nsimd_sleef_round_vmx_f32 #define xrint nsimd_sleef_rint_vmx_f64 #define xrintf nsimd_sleef_rint_vmx_f32 #define xnextafter nsimd_sleef_nextafter_vmx_f64 #define xnextafterf nsimd_sleef_nextafter_vmx_f32 #define xfrfrexp nsimd_sleef_frfrexp_vmx_f64 #define xfrfrexpf nsimd_sleef_frfrexp_vmx_f32 #define xexpfrexp nsimd_sleef_expfrexp_vmx_f64 #define xexpfrexpf nsimd_sleef_expfrexp_vmx_f32 #define xfmod nsimd_sleef_fmod_vmx_f64 #define xfmodf nsimd_sleef_fmod_vmx_f32 #define xremainder nsimd_sleef_remainder_vmx_f64 #define xremainderf nsimd_sleef_remainder_vmx_f32 #define xmodf nsimd_sleef_modf_vmx_f64 #define xmodff nsimd_sleef_modf_vmx_f32 #define xlgamma_u1 nsimd_sleef_lgamma_u10d_vmx_f64 #define xlgammaf_u1 nsimd_sleef_lgamma_u10d_vmx_f32 #define xtgamma_u1 nsimd_sleef_tgamma_u10d_vmx_f64 #define xtgammaf_u1 nsimd_sleef_tgamma_u10d_vmx_f32 #define xerf_u1 nsimd_sleef_erf_u10d_vmx_f64 #define xerff_u1 nsimd_sleef_erf_u10d_vmx_f32 #define xerfc_u15 nsimd_sleef_erfc_u15d_vmx_f64 #define xerfcf_u15 nsimd_sleef_erfc_u15d_vmx_f32 #define xgetInt nsimd_sleef_getInt_vmx_f64 #define xgetIntf nsimd_sleef_getInt_vmx_f32 #define xgetPtr nsimd_sleef_getPtr_vmx_f64 #define xgetPtrf nsimd_sleef_getPtr_vmx_f32 #else #define xsin nsimd_sleef_sin_u35_vmx_f64 #define xsinf nsimd_sleef_sin_u35_vmx_f32 #define xcos nsimd_sleef_cos_u35_vmx_f64 #define xcosf nsimd_sleef_cos_u35_vmx_f32 #define xsincos nsimd_sleef_sincos_u35_vmx_f64 #define xsincosf nsimd_sleef_sincos_u35_vmx_f32 #define xtan nsimd_sleef_tan_u35_vmx_f64 #define xtanf nsimd_sleef_tan_u35_vmx_f32 #define xasin nsimd_sleef_asin_u35_vmx_f64 #define xasinf nsimd_sleef_asin_u35_vmx_f32 #define xacos nsimd_sleef_acos_u35_vmx_f64 #define xacosf nsimd_sleef_acos_u35_vmx_f32 #define xatan nsimd_sleef_atan_u35_vmx_f64 #define xatanf nsimd_sleef_atan_u35_vmx_f32 #define xatan2 nsimd_sleef_atan2_u35_vmx_f64 #define xatan2f nsimd_sleef_atan2_u35_vmx_f32 
#define xlog nsimd_sleef_log_u35_vmx_f64 #define xlogf nsimd_sleef_log_u35_vmx_f32 #define xcbrt nsimd_sleef_cbrt_u35_vmx_f64 #define xcbrtf nsimd_sleef_cbrt_u35_vmx_f32 #define xsin_u1 nsimd_sleef_sin_u10_vmx_f64 #define xsinf_u1 nsimd_sleef_sin_u10_vmx_f32 #define xcos_u1 nsimd_sleef_cos_u10_vmx_f64 #define xcosf_u1 nsimd_sleef_cos_u10_vmx_f32 #define xsincos_u1 nsimd_sleef_sincos_u10_vmx_f64 #define xsincosf_u1 nsimd_sleef_sincos_u10_vmx_f32 #define xtan_u1 nsimd_sleef_tan_u10_vmx_f64 #define xtanf_u1 nsimd_sleef_tan_u10_vmx_f32 #define xasin_u1 nsimd_sleef_asin_u10_vmx_f64 #define xasinf_u1 nsimd_sleef_asin_u10_vmx_f32 #define xacos_u1 nsimd_sleef_acos_u10_vmx_f64 #define xacosf_u1 nsimd_sleef_acos_u10_vmx_f32 #define xatan_u1 nsimd_sleef_atan_u10_vmx_f64 #define xatanf_u1 nsimd_sleef_atan_u10_vmx_f32 #define xatan2_u1 nsimd_sleef_atan2_u10_vmx_f64 #define xatan2f_u1 nsimd_sleef_atan2_u10_vmx_f32 #define xlog_u1 nsimd_sleef_log_u10_vmx_f64 #define xlogf_u1 nsimd_sleef_log_u10_vmx_f32 #define xcbrt_u1 nsimd_sleef_cbrt_u10_vmx_f64 #define xcbrtf_u1 nsimd_sleef_cbrt_u10_vmx_f32 #define xexp nsimd_sleef_exp_u10_vmx_f64 #define xexpf nsimd_sleef_exp_u10_vmx_f32 #define xpow nsimd_sleef_pow_u10_vmx_f64 #define xpowf nsimd_sleef_pow_u10_vmx_f32 #define xsinh nsimd_sleef_sinh_u10_vmx_f64 #define xsinhf nsimd_sleef_sinh_u10_vmx_f32 #define xcosh nsimd_sleef_cosh_u10_vmx_f64 #define xcoshf nsimd_sleef_cosh_u10_vmx_f32 #define xtanh nsimd_sleef_tanh_u10_vmx_f64 #define xtanhf nsimd_sleef_tanh_u10_vmx_f32 #define xsinh_u35 nsimd_sleef_sinh_u35_vmx_f64 #define xsinhf_u35 nsimd_sleef_sinh_u35_vmx_f32 #define xcosh_u35 nsimd_sleef_cosh_u35_vmx_f64 #define xcoshf_u35 nsimd_sleef_cosh_u35_vmx_f32 #define xtanh_u35 nsimd_sleef_tanh_u35_vmx_f64 #define xtanhf_u35 nsimd_sleef_tanh_u35_vmx_f32 #define xfastsin_u3500 nsimd_sleef_fastsin_u3500_vmx_f64 #define xfastsinf_u3500 nsimd_sleef_fastsin_u3500_vmx_f32 #define xfastcos_u3500 nsimd_sleef_fastcos_u3500_vmx_f64 #define 
xfastcosf_u3500 nsimd_sleef_fastcos_u3500_vmx_f32 #define xfastpow_u3500 nsimd_sleef_fastpow_u3500_vmx_f64 #define xfastpowf_u3500 nsimd_sleef_fastpow_u3500_vmx_f32 #define xasinh nsimd_sleef_asinh_u10_vmx_f64 #define xasinhf nsimd_sleef_asinh_u10_vmx_f32 #define xacosh nsimd_sleef_acosh_u10_vmx_f64 #define xacoshf nsimd_sleef_acosh_u10_vmx_f32 #define xatanh nsimd_sleef_atanh_u10_vmx_f64 #define xatanhf nsimd_sleef_atanh_u10_vmx_f32 #define xexp2 nsimd_sleef_exp2_u10_vmx_f64 #define xexp2f nsimd_sleef_exp2_u10_vmx_f32 #define xexp2_u35 nsimd_sleef_exp2_u35_vmx_f64 #define xexp2f_u35 nsimd_sleef_exp2_u35_vmx_f32 #define xexp10 nsimd_sleef_exp10_u10_vmx_f64 #define xexp10f nsimd_sleef_exp10_u10_vmx_f32 #define xexp10_u35 nsimd_sleef_exp10_u35_vmx_f64 #define xexp10f_u35 nsimd_sleef_exp10_u35_vmx_f32 #define xexpm1 nsimd_sleef_expm1_u10_vmx_f64 #define xexpm1f nsimd_sleef_expm1_u10_vmx_f32 #define xlog10 nsimd_sleef_log10_u10_vmx_f64 #define xlog10f nsimd_sleef_log10_u10_vmx_f32 #define xlog2 nsimd_sleef_log2_u10_vmx_f64 #define xlog2f nsimd_sleef_log2_u10_vmx_f32 #define xlog2_u35 nsimd_sleef_log2_u35_vmx_f64 #define xlog2f_u35 nsimd_sleef_log2_u35_vmx_f32 #define xlog1p nsimd_sleef_log1p_u10_vmx_f64 #define xlog1pf nsimd_sleef_log1p_u10_vmx_f32 #define xsincospi_u05 nsimd_sleef_sincospi_u05_vmx_f64 #define xsincospif_u05 nsimd_sleef_sincospi_u05_vmx_f32 #define xsincospi_u35 nsimd_sleef_sincospi_u35_vmx_f64 #define xsincospif_u35 nsimd_sleef_sincospi_u35_vmx_f32 #define xsinpi_u05 nsimd_sleef_sinpi_u05_vmx_f64 #define xsinpif_u05 nsimd_sleef_sinpi_u05_vmx_f32 #define xcospi_u05 nsimd_sleef_cospi_u05_vmx_f64 #define xcospif_u05 nsimd_sleef_cospi_u05_vmx_f32 #define xldexp nsimd_sleef_ldexp_vmx_f64 #define xldexpf nsimd_sleef_ldexp_vmx_f32 #define xilogb nsimd_sleef_ilogb_vmx_f64 #define xilogbf nsimd_sleef_ilogb_vmx_f32 #define xfma nsimd_sleef_fma_vmx_f64 #define xfmaf nsimd_sleef_fma_vmx_f32 #define xsqrt nsimd_sleef_sqrt_vmx_f64 #define xsqrtf 
nsimd_sleef_sqrt_vmx_f32 #define xsqrt_u05 nsimd_sleef_sqrt_u05_vmx_f64 #define xsqrtf_u05 nsimd_sleef_sqrt_u05_vmx_f32 #define xsqrt_u35 nsimd_sleef_sqrt_u35_vmx_f64 #define xsqrtf_u35 nsimd_sleef_sqrt_u35_vmx_f32 #define xhypot_u05 nsimd_sleef_hypot_u05_vmx_f64 #define xhypotf_u05 nsimd_sleef_hypot_u05_vmx_f32 #define xhypot_u35 nsimd_sleef_hypot_u35_vmx_f64 #define xhypotf_u35 nsimd_sleef_hypot_u35_vmx_f32 #define xfabs nsimd_sleef_fabs_vmx_f64 #define xfabsf nsimd_sleef_fabs_vmx_f32 #define xcopysign nsimd_sleef_copysign_vmx_f64 #define xcopysignf nsimd_sleef_copysign_vmx_f32 #define xfmax nsimd_sleef_fmax_vmx_f64 #define xfmaxf nsimd_sleef_fmax_vmx_f32 #define xfmin nsimd_sleef_fmin_vmx_f64 #define xfminf nsimd_sleef_fmin_vmx_f32 #define xfdim nsimd_sleef_fdim_vmx_f64 #define xfdimf nsimd_sleef_fdim_vmx_f32 #define xtrunc nsimd_sleef_trunc_vmx_f64 #define xtruncf nsimd_sleef_trunc_vmx_f32 #define xfloor nsimd_sleef_floor_vmx_f64 #define xfloorf nsimd_sleef_floor_vmx_f32 #define xceil nsimd_sleef_ceil_vmx_f64 #define xceilf nsimd_sleef_ceil_vmx_f32 #define xround nsimd_sleef_round_vmx_f64 #define xroundf nsimd_sleef_round_vmx_f32 #define xrint nsimd_sleef_rint_vmx_f64 #define xrintf nsimd_sleef_rint_vmx_f32 #define xnextafter nsimd_sleef_nextafter_vmx_f64 #define xnextafterf nsimd_sleef_nextafter_vmx_f32 #define xfrfrexp nsimd_sleef_frfrexp_vmx_f64 #define xfrfrexpf nsimd_sleef_frfrexp_vmx_f32 #define xexpfrexp nsimd_sleef_expfrexp_vmx_f64 #define xexpfrexpf nsimd_sleef_expfrexp_vmx_f32 #define xfmod nsimd_sleef_fmod_vmx_f64 #define xfmodf nsimd_sleef_fmod_vmx_f32 #define xremainder nsimd_sleef_remainder_vmx_f64 #define xremainderf nsimd_sleef_remainder_vmx_f32 #define xmodf nsimd_sleef_modf_vmx_f64 #define xmodff nsimd_sleef_modf_vmx_f32 #define xlgamma_u1 nsimd_sleef_lgamma_u10_vmx_f64 #define xlgammaf_u1 nsimd_sleef_lgamma_u10_vmx_f32 #define xtgamma_u1 nsimd_sleef_tgamma_u10_vmx_f64 #define xtgammaf_u1 nsimd_sleef_tgamma_u10_vmx_f32 #define xerf_u1 
nsimd_sleef_erf_u10_vmx_f64 #define xerff_u1 nsimd_sleef_erf_u10_vmx_f32 #define xerfc_u15 nsimd_sleef_erfc_u15_vmx_f64 #define xerfcf_u15 nsimd_sleef_erfc_u15_vmx_f32 #define xgetInt nsimd_sleef_getInt_vmx_f64 #define xgetIntf nsimd_sleef_getInt_vmx_f32 #define xgetPtr nsimd_sleef_getPtr_vmx_f64 #define xgetPtrf nsimd_sleef_getPtr_vmx_f32 #endif #define rempi nsimd_sleef_rempi_vmx #define rempif nsimd_sleef_rempif_vmx #define rempisub nsimd_sleef_rempisub_vmx #define rempisubf nsimd_sleef_rempisubf_vmx #define gammak nsimd_gammak_vmx #define gammafk nsimd_gammafk_vmx #endif /* ------------------------------------------------------------------------- */ /* Naming of functions vsx */ #ifdef NSIMD_VSX #ifdef DETERMINISTIC #define xsin nsimd_sleef_sin_u35d_vsx_f64 #define xsinf nsimd_sleef_sin_u35d_vsx_f32 #define xcos nsimd_sleef_cos_u35d_vsx_f64 #define xcosf nsimd_sleef_cos_u35d_vsx_f32 #define xsincos nsimd_sleef_sincos_u35d_vsx_f64 #define xsincosf nsimd_sleef_sincos_u35d_vsx_f32 #define xtan nsimd_sleef_tan_u35d_vsx_f64 #define xtanf nsimd_sleef_tan_u35d_vsx_f32 #define xasin nsimd_sleef_asin_u35d_vsx_f64 #define xasinf nsimd_sleef_asin_u35d_vsx_f32 #define xacos nsimd_sleef_acos_u35d_vsx_f64 #define xacosf nsimd_sleef_acos_u35d_vsx_f32 #define xatan nsimd_sleef_atan_u35d_vsx_f64 #define xatanf nsimd_sleef_atan_u35d_vsx_f32 #define xatan2 nsimd_sleef_atan2_u35d_vsx_f64 #define xatan2f nsimd_sleef_atan2_u35d_vsx_f32 #define xlog nsimd_sleef_log_u35d_vsx_f64 #define xlogf nsimd_sleef_log_u35d_vsx_f32 #define xcbrt nsimd_sleef_cbrt_u35d_vsx_f64 #define xcbrtf nsimd_sleef_cbrt_u35d_vsx_f32 #define xsin_u1 nsimd_sleef_sin_u10d_vsx_f64 #define xsinf_u1 nsimd_sleef_sin_u10d_vsx_f32 #define xcos_u1 nsimd_sleef_cos_u10d_vsx_f64 #define xcosf_u1 nsimd_sleef_cos_u10d_vsx_f32 #define xsincos_u1 nsimd_sleef_sincos_u10d_vsx_f64 #define xsincosf_u1 nsimd_sleef_sincos_u10d_vsx_f32 #define xtan_u1 nsimd_sleef_tan_u10d_vsx_f64 #define xtanf_u1 nsimd_sleef_tan_u10d_vsx_f32 #define 
xasin_u1 nsimd_sleef_asin_u10d_vsx_f64 #define xasinf_u1 nsimd_sleef_asin_u10d_vsx_f32 #define xacos_u1 nsimd_sleef_acos_u10d_vsx_f64 #define xacosf_u1 nsimd_sleef_acos_u10d_vsx_f32 #define xatan_u1 nsimd_sleef_atan_u10d_vsx_f64 #define xatanf_u1 nsimd_sleef_atan_u10d_vsx_f32 #define xatan2_u1 nsimd_sleef_atan2_u10d_vsx_f64 #define xatan2f_u1 nsimd_sleef_atan2_u10d_vsx_f32 #define xlog_u1 nsimd_sleef_log_u10d_vsx_f64 #define xlogf_u1 nsimd_sleef_log_u10d_vsx_f32 #define xcbrt_u1 nsimd_sleef_cbrt_u10d_vsx_f64 #define xcbrtf_u1 nsimd_sleef_cbrt_u10d_vsx_f32 #define xexp nsimd_sleef_exp_u10d_vsx_f64 #define xexpf nsimd_sleef_exp_u10d_vsx_f32 #define xpow nsimd_sleef_pow_u10d_vsx_f64 #define xpowf nsimd_sleef_pow_u10d_vsx_f32 #define xsinh nsimd_sleef_sinh_u10d_vsx_f64 #define xsinhf nsimd_sleef_sinh_u10d_vsx_f32 #define xcosh nsimd_sleef_cosh_u10d_vsx_f64 #define xcoshf nsimd_sleef_cosh_u10d_vsx_f32 #define xtanh nsimd_sleef_tanh_u10d_vsx_f64 #define xtanhf nsimd_sleef_tanh_u10d_vsx_f32 #define xsinh_u35 nsimd_sleef_sinh_u35d_vsx_f64 #define xsinhf_u35 nsimd_sleef_sinh_u35d_vsx_f32 #define xcosh_u35 nsimd_sleef_cosh_u35d_vsx_f64 #define xcoshf_u35 nsimd_sleef_cosh_u35d_vsx_f32 #define xtanh_u35 nsimd_sleef_tanh_u35d_vsx_f64 #define xtanhf_u35 nsimd_sleef_tanh_u35d_vsx_f32 #define xfastsin_u3500 nsimd_sleef_fastsin_u3500d_vsx_f64 #define xfastsinf_u3500 nsimd_sleef_fastsin_u3500d_vsx_f32 #define xfastcos_u3500 nsimd_sleef_fastcos_u3500d_vsx_f64 #define xfastcosf_u3500 nsimd_sleef_fastcos_u3500d_vsx_f32 #define xfastpow_u3500 nsimd_sleef_fastpow_u3500d_vsx_f64 #define xfastpowf_u3500 nsimd_sleef_fastpow_u3500d_vsx_f32 #define xasinh nsimd_sleef_asinh_u10d_vsx_f64 #define xasinhf nsimd_sleef_asinh_u10d_vsx_f32 #define xacosh nsimd_sleef_acosh_u10d_vsx_f64 #define xacoshf nsimd_sleef_acosh_u10d_vsx_f32 #define xatanh nsimd_sleef_atanh_u10d_vsx_f64 #define xatanhf nsimd_sleef_atanh_u10d_vsx_f32 #define xexp2 nsimd_sleef_exp2_u10d_vsx_f64 #define xexp2f 
nsimd_sleef_exp2_u10d_vsx_f32 #define xexp2_u35 nsimd_sleef_exp2_u35d_vsx_f64 #define xexp2f_u35 nsimd_sleef_exp2_u35d_vsx_f32 #define xexp10 nsimd_sleef_exp10_u10d_vsx_f64 #define xexp10f nsimd_sleef_exp10_u10d_vsx_f32 #define xexp10_u35 nsimd_sleef_exp10_u35d_vsx_f64 #define xexp10f_u35 nsimd_sleef_exp10_u35d_vsx_f32 #define xexpm1 nsimd_sleef_expm1_u10d_vsx_f64 #define xexpm1f nsimd_sleef_expm1_u10d_vsx_f32 #define xlog10 nsimd_sleef_log10_u10d_vsx_f64 #define xlog10f nsimd_sleef_log10_u10d_vsx_f32 #define xlog2 nsimd_sleef_log2_u10d_vsx_f64 #define xlog2f nsimd_sleef_log2_u10d_vsx_f32 #define xlog2_u35 nsimd_sleef_log2_u35d_vsx_f64 #define xlog2f_u35 nsimd_sleef_log2_u35d_vsx_f32 #define xlog1p nsimd_sleef_log1p_u10d_vsx_f64 #define xlog1pf nsimd_sleef_log1p_u10d_vsx_f32 #define xsincospi_u05 nsimd_sleef_sincospi_u05d_vsx_f64 #define xsincospif_u05 nsimd_sleef_sincospi_u05d_vsx_f32 #define xsincospi_u35 nsimd_sleef_sincospi_u35d_vsx_f64 #define xsincospif_u35 nsimd_sleef_sincospi_u35d_vsx_f32 #define xsinpi_u05 nsimd_sleef_sinpi_u05d_vsx_f64 #define xsinpif_u05 nsimd_sleef_sinpi_u05d_vsx_f32 #define xcospi_u05 nsimd_sleef_cospi_u05d_vsx_f64 #define xcospif_u05 nsimd_sleef_cospi_u05d_vsx_f32 #define xldexp nsimd_sleef_ldexp_vsx_f64 #define xldexpf nsimd_sleef_ldexp_vsx_f32 #define xilogb nsimd_sleef_ilogb_vsx_f64 #define xilogbf nsimd_sleef_ilogb_vsx_f32 #define xfma nsimd_sleef_fma_vsx_f64 #define xfmaf nsimd_sleef_fma_vsx_f32 #define xsqrt nsimd_sleef_sqrt_vsx_f64 #define xsqrtf nsimd_sleef_sqrt_vsx_f32 #define xsqrt_u05 nsimd_sleef_sqrt_u05d_vsx_f64 #define xsqrtf_u05 nsimd_sleef_sqrt_u05d_vsx_f32 #define xsqrt_u35 nsimd_sleef_sqrt_u35d_vsx_f64 #define xsqrtf_u35 nsimd_sleef_sqrt_u35d_vsx_f32 #define xhypot_u05 nsimd_sleef_hypot_u05d_vsx_f64 #define xhypotf_u05 nsimd_sleef_hypot_u05d_vsx_f32 #define xhypot_u35 nsimd_sleef_hypot_u35d_vsx_f64 #define xhypotf_u35 nsimd_sleef_hypot_u35d_vsx_f32 #define xfabs nsimd_sleef_fabs_vsx_f64 #define xfabsf 
nsimd_sleef_fabs_vsx_f32 #define xcopysign nsimd_sleef_copysign_vsx_f64 #define xcopysignf nsimd_sleef_copysign_vsx_f32 #define xfmax nsimd_sleef_fmax_vsx_f64 #define xfmaxf nsimd_sleef_fmax_vsx_f32 #define xfmin nsimd_sleef_fmin_vsx_f64 #define xfminf nsimd_sleef_fmin_vsx_f32 #define xfdim nsimd_sleef_fdim_vsx_f64 #define xfdimf nsimd_sleef_fdim_vsx_f32 #define xtrunc nsimd_sleef_trunc_vsx_f64 #define xtruncf nsimd_sleef_trunc_vsx_f32 #define xfloor nsimd_sleef_floor_vsx_f64 #define xfloorf nsimd_sleef_floor_vsx_f32 #define xceil nsimd_sleef_ceil_vsx_f64 #define xceilf nsimd_sleef_ceil_vsx_f32 #define xround nsimd_sleef_round_vsx_f64 #define xroundf nsimd_sleef_round_vsx_f32 #define xrint nsimd_sleef_rint_vsx_f64 #define xrintf nsimd_sleef_rint_vsx_f32 #define xnextafter nsimd_sleef_nextafter_vsx_f64 #define xnextafterf nsimd_sleef_nextafter_vsx_f32 #define xfrfrexp nsimd_sleef_frfrexp_vsx_f64 #define xfrfrexpf nsimd_sleef_frfrexp_vsx_f32 #define xexpfrexp nsimd_sleef_expfrexp_vsx_f64 #define xexpfrexpf nsimd_sleef_expfrexp_vsx_f32 #define xfmod nsimd_sleef_fmod_vsx_f64 #define xfmodf nsimd_sleef_fmod_vsx_f32 #define xremainder nsimd_sleef_remainder_vsx_f64 #define xremainderf nsimd_sleef_remainder_vsx_f32 #define xmodf nsimd_sleef_modf_vsx_f64 #define xmodff nsimd_sleef_modf_vsx_f32 #define xlgamma_u1 nsimd_sleef_lgamma_u10d_vsx_f64 #define xlgammaf_u1 nsimd_sleef_lgamma_u10d_vsx_f32 #define xtgamma_u1 nsimd_sleef_tgamma_u10d_vsx_f64 #define xtgammaf_u1 nsimd_sleef_tgamma_u10d_vsx_f32 #define xerf_u1 nsimd_sleef_erf_u10d_vsx_f64 #define xerff_u1 nsimd_sleef_erf_u10d_vsx_f32 #define xerfc_u15 nsimd_sleef_erfc_u15d_vsx_f64 #define xerfcf_u15 nsimd_sleef_erfc_u15d_vsx_f32 #define xgetInt nsimd_sleef_getInt_vsx_f64 #define xgetIntf nsimd_sleef_getInt_vsx_f32 #define xgetPtr nsimd_sleef_getPtr_vsx_f64 #define xgetPtrf nsimd_sleef_getPtr_vsx_f32 #else #define xsin nsimd_sleef_sin_u35_vsx_f64 #define xsinf nsimd_sleef_sin_u35_vsx_f32 #define xcos 
nsimd_sleef_cos_u35_vsx_f64 #define xcosf nsimd_sleef_cos_u35_vsx_f32 #define xsincos nsimd_sleef_sincos_u35_vsx_f64 #define xsincosf nsimd_sleef_sincos_u35_vsx_f32 #define xtan nsimd_sleef_tan_u35_vsx_f64 #define xtanf nsimd_sleef_tan_u35_vsx_f32 #define xasin nsimd_sleef_asin_u35_vsx_f64 #define xasinf nsimd_sleef_asin_u35_vsx_f32 #define xacos nsimd_sleef_acos_u35_vsx_f64 #define xacosf nsimd_sleef_acos_u35_vsx_f32 #define xatan nsimd_sleef_atan_u35_vsx_f64 #define xatanf nsimd_sleef_atan_u35_vsx_f32 #define xatan2 nsimd_sleef_atan2_u35_vsx_f64 #define xatan2f nsimd_sleef_atan2_u35_vsx_f32 #define xlog nsimd_sleef_log_u35_vsx_f64 #define xlogf nsimd_sleef_log_u35_vsx_f32 #define xcbrt nsimd_sleef_cbrt_u35_vsx_f64 #define xcbrtf nsimd_sleef_cbrt_u35_vsx_f32 #define xsin_u1 nsimd_sleef_sin_u10_vsx_f64 #define xsinf_u1 nsimd_sleef_sin_u10_vsx_f32 #define xcos_u1 nsimd_sleef_cos_u10_vsx_f64 #define xcosf_u1 nsimd_sleef_cos_u10_vsx_f32 #define xsincos_u1 nsimd_sleef_sincos_u10_vsx_f64 #define xsincosf_u1 nsimd_sleef_sincos_u10_vsx_f32 #define xtan_u1 nsimd_sleef_tan_u10_vsx_f64 #define xtanf_u1 nsimd_sleef_tan_u10_vsx_f32 #define xasin_u1 nsimd_sleef_asin_u10_vsx_f64 #define xasinf_u1 nsimd_sleef_asin_u10_vsx_f32 #define xacos_u1 nsimd_sleef_acos_u10_vsx_f64 #define xacosf_u1 nsimd_sleef_acos_u10_vsx_f32 #define xatan_u1 nsimd_sleef_atan_u10_vsx_f64 #define xatanf_u1 nsimd_sleef_atan_u10_vsx_f32 #define xatan2_u1 nsimd_sleef_atan2_u10_vsx_f64 #define xatan2f_u1 nsimd_sleef_atan2_u10_vsx_f32 #define xlog_u1 nsimd_sleef_log_u10_vsx_f64 #define xlogf_u1 nsimd_sleef_log_u10_vsx_f32 #define xcbrt_u1 nsimd_sleef_cbrt_u10_vsx_f64 #define xcbrtf_u1 nsimd_sleef_cbrt_u10_vsx_f32 #define xexp nsimd_sleef_exp_u10_vsx_f64 #define xexpf nsimd_sleef_exp_u10_vsx_f32 #define xpow nsimd_sleef_pow_u10_vsx_f64 #define xpowf nsimd_sleef_pow_u10_vsx_f32 #define xsinh nsimd_sleef_sinh_u10_vsx_f64 #define xsinhf nsimd_sleef_sinh_u10_vsx_f32 #define xcosh nsimd_sleef_cosh_u10_vsx_f64 #define 
xcoshf nsimd_sleef_cosh_u10_vsx_f32 #define xtanh nsimd_sleef_tanh_u10_vsx_f64 #define xtanhf nsimd_sleef_tanh_u10_vsx_f32 #define xsinh_u35 nsimd_sleef_sinh_u35_vsx_f64 #define xsinhf_u35 nsimd_sleef_sinh_u35_vsx_f32 #define xcosh_u35 nsimd_sleef_cosh_u35_vsx_f64 #define xcoshf_u35 nsimd_sleef_cosh_u35_vsx_f32 #define xtanh_u35 nsimd_sleef_tanh_u35_vsx_f64 #define xtanhf_u35 nsimd_sleef_tanh_u35_vsx_f32 #define xfastsin_u3500 nsimd_sleef_fastsin_u3500_vsx_f64 #define xfastsinf_u3500 nsimd_sleef_fastsin_u3500_vsx_f32 #define xfastcos_u3500 nsimd_sleef_fastcos_u3500_vsx_f64 #define xfastcosf_u3500 nsimd_sleef_fastcos_u3500_vsx_f32 #define xfastpow_u3500 nsimd_sleef_fastpow_u3500_vsx_f64 #define xfastpowf_u3500 nsimd_sleef_fastpow_u3500_vsx_f32 #define xasinh nsimd_sleef_asinh_u10_vsx_f64 #define xasinhf nsimd_sleef_asinh_u10_vsx_f32 #define xacosh nsimd_sleef_acosh_u10_vsx_f64 #define xacoshf nsimd_sleef_acosh_u10_vsx_f32 #define xatanh nsimd_sleef_atanh_u10_vsx_f64 #define xatanhf nsimd_sleef_atanh_u10_vsx_f32 #define xexp2 nsimd_sleef_exp2_u10_vsx_f64 #define xexp2f nsimd_sleef_exp2_u10_vsx_f32 #define xexp2_u35 nsimd_sleef_exp2_u35_vsx_f64 #define xexp2f_u35 nsimd_sleef_exp2_u35_vsx_f32 #define xexp10 nsimd_sleef_exp10_u10_vsx_f64 #define xexp10f nsimd_sleef_exp10_u10_vsx_f32 #define xexp10_u35 nsimd_sleef_exp10_u35_vsx_f64 #define xexp10f_u35 nsimd_sleef_exp10_u35_vsx_f32 #define xexpm1 nsimd_sleef_expm1_u10_vsx_f64 #define xexpm1f nsimd_sleef_expm1_u10_vsx_f32 #define xlog10 nsimd_sleef_log10_u10_vsx_f64 #define xlog10f nsimd_sleef_log10_u10_vsx_f32 #define xlog2 nsimd_sleef_log2_u10_vsx_f64 #define xlog2f nsimd_sleef_log2_u10_vsx_f32 #define xlog2_u35 nsimd_sleef_log2_u35_vsx_f64 #define xlog2f_u35 nsimd_sleef_log2_u35_vsx_f32 #define xlog1p nsimd_sleef_log1p_u10_vsx_f64 #define xlog1pf nsimd_sleef_log1p_u10_vsx_f32 #define xsincospi_u05 nsimd_sleef_sincospi_u05_vsx_f64 #define xsincospif_u05 nsimd_sleef_sincospi_u05_vsx_f32 #define xsincospi_u35 
nsimd_sleef_sincospi_u35_vsx_f64 #define xsincospif_u35 nsimd_sleef_sincospi_u35_vsx_f32 #define xsinpi_u05 nsimd_sleef_sinpi_u05_vsx_f64 #define xsinpif_u05 nsimd_sleef_sinpi_u05_vsx_f32 #define xcospi_u05 nsimd_sleef_cospi_u05_vsx_f64 #define xcospif_u05 nsimd_sleef_cospi_u05_vsx_f32 #define xldexp nsimd_sleef_ldexp_vsx_f64 #define xldexpf nsimd_sleef_ldexp_vsx_f32 #define xilogb nsimd_sleef_ilogb_vsx_f64 #define xilogbf nsimd_sleef_ilogb_vsx_f32 #define xfma nsimd_sleef_fma_vsx_f64 #define xfmaf nsimd_sleef_fma_vsx_f32 #define xsqrt nsimd_sleef_sqrt_vsx_f64 #define xsqrtf nsimd_sleef_sqrt_vsx_f32 #define xsqrt_u05 nsimd_sleef_sqrt_u05_vsx_f64 #define xsqrtf_u05 nsimd_sleef_sqrt_u05_vsx_f32 #define xsqrt_u35 nsimd_sleef_sqrt_u35_vsx_f64 #define xsqrtf_u35 nsimd_sleef_sqrt_u35_vsx_f32 #define xhypot_u05 nsimd_sleef_hypot_u05_vsx_f64 #define xhypotf_u05 nsimd_sleef_hypot_u05_vsx_f32 #define xhypot_u35 nsimd_sleef_hypot_u35_vsx_f64 #define xhypotf_u35 nsimd_sleef_hypot_u35_vsx_f32 #define xfabs nsimd_sleef_fabs_vsx_f64 #define xfabsf nsimd_sleef_fabs_vsx_f32 #define xcopysign nsimd_sleef_copysign_vsx_f64 #define xcopysignf nsimd_sleef_copysign_vsx_f32 #define xfmax nsimd_sleef_fmax_vsx_f64 #define xfmaxf nsimd_sleef_fmax_vsx_f32 #define xfmin nsimd_sleef_fmin_vsx_f64 #define xfminf nsimd_sleef_fmin_vsx_f32 #define xfdim nsimd_sleef_fdim_vsx_f64 #define xfdimf nsimd_sleef_fdim_vsx_f32 #define xtrunc nsimd_sleef_trunc_vsx_f64 #define xtruncf nsimd_sleef_trunc_vsx_f32 #define xfloor nsimd_sleef_floor_vsx_f64 #define xfloorf nsimd_sleef_floor_vsx_f32 #define xceil nsimd_sleef_ceil_vsx_f64 #define xceilf nsimd_sleef_ceil_vsx_f32 #define xround nsimd_sleef_round_vsx_f64 #define xroundf nsimd_sleef_round_vsx_f32 #define xrint nsimd_sleef_rint_vsx_f64 #define xrintf nsimd_sleef_rint_vsx_f32 #define xnextafter nsimd_sleef_nextafter_vsx_f64 #define xnextafterf nsimd_sleef_nextafter_vsx_f32 #define xfrfrexp nsimd_sleef_frfrexp_vsx_f64 #define xfrfrexpf 
nsimd_sleef_frfrexp_vsx_f32 #define xexpfrexp nsimd_sleef_expfrexp_vsx_f64 #define xexpfrexpf nsimd_sleef_expfrexp_vsx_f32 #define xfmod nsimd_sleef_fmod_vsx_f64 #define xfmodf nsimd_sleef_fmod_vsx_f32 #define xremainder nsimd_sleef_remainder_vsx_f64 #define xremainderf nsimd_sleef_remainder_vsx_f32 #define xmodf nsimd_sleef_modf_vsx_f64 #define xmodff nsimd_sleef_modf_vsx_f32 #define xlgamma_u1 nsimd_sleef_lgamma_u10_vsx_f64 #define xlgammaf_u1 nsimd_sleef_lgamma_u10_vsx_f32 #define xtgamma_u1 nsimd_sleef_tgamma_u10_vsx_f64 #define xtgammaf_u1 nsimd_sleef_tgamma_u10_vsx_f32 #define xerf_u1 nsimd_sleef_erf_u10_vsx_f64 #define xerff_u1 nsimd_sleef_erf_u10_vsx_f32 #define xerfc_u15 nsimd_sleef_erfc_u15_vsx_f64 #define xerfcf_u15 nsimd_sleef_erfc_u15_vsx_f32 #define xgetInt nsimd_sleef_getInt_vsx_f64 #define xgetIntf nsimd_sleef_getInt_vsx_f32 #define xgetPtr nsimd_sleef_getPtr_vsx_f64 #define xgetPtrf nsimd_sleef_getPtr_vsx_f32 #endif #define rempi nsimd_sleef_rempi_vsx #define rempif nsimd_sleef_rempif_vsx #define rempisub nsimd_sleef_rempisub_vsx #define rempisubf nsimd_sleef_rempisubf_vsx #define gammak nsimd_gammak_vsx #define gammafk nsimd_gammafk_vsx #endif #endif ================================================ FILE: src/sleefdp.c ================================================ // Copyright Naoki Shibata and contributors 2010 - 2020. // Distributed under the Boost Software License, Version 1.0. // (See accompanying file LICENSE.txt or copy at // http://www.boost.org/LICENSE_1_0.txt) // Always use -ffp-contract=off option to compile SLEEF. 
/* NOTE(review): the operands of the bare #include directives below were
 * lost when this file was flattened (angle-bracketed header names were
 * stripped by the extraction).  The code visibly needs at least <stdio.h>
 * (fprintf/fflush), <stdint.h> (int64_t/INT64_C) and <limits.h> (INT_MAX);
 * restore the exact list from upstream SLEEF before compiling. */
#include
#include
#include
#include
#include

#ifndef ENABLE_BUILTIN_MATH
#include
#define SQRT sqrt
#else
#define SQRT __builtin_sqrt
#endif

#include "misc.h"

/* Payne-Hanek argument-reduction table, defined in a separate TU. */
extern const double Sleef_rempitabdp[];

#ifdef DORENAME
#include "rename.h"
#endif

/* SLEEF requires exact mul-then-add sequences; forbid FP contraction on
 * MSVC (other compilers are handled by building with -ffp-contract=off,
 * per the header comment of this file). */
#if (defined(_MSC_VER))
#pragma fp_contract (off)
#endif

#define MLA mla
#define C2V(x) (x)
#include "estrin.h"

/* Reinterpret the bits of a double as a 64-bit integer (union type-pun). */
static INLINE CONST int64_t doubleToRawLongBits(double d) {
  union {
    double f;
    int64_t i;
  } tmp;
  tmp.f = d;
  return tmp.i;
}

/* Inverse of doubleToRawLongBits. */
static INLINE CONST double longBitsToDouble(int64_t i) {
  union {
    double f;
    int64_t i;
  } tmp;
  tmp.i = i;
  return tmp.f;
}

/* |x| by clearing the sign bit. */
static INLINE CONST double fabsk(double x) {
  return longBitsToDouble(INT64_C(0x7fffffffffffffff) & doubleToRawLongBits(x));
}

/* x with its sign flipped iff y is negative (bitwise x * sign(y)). */
static INLINE CONST double mulsign(double x, double y) {
  return longBitsToDouble(doubleToRawLongBits(x) ^
                          (doubleToRawLongBits(y) & (INT64_C(1) << 63)));
}

/* Bitwise copysign: magnitude of x, sign of y. */
static INLINE CONST double copysignk(double x, double y) {
  return longBitsToDouble((doubleToRawLongBits(x) & ~(INT64_C(1) << 63)) ^
                          (doubleToRawLongBits(y) & (INT64_C(1) << 63)));
}

/* +1.0 or -1.0 following the sign bit of d. */
static INLINE CONST double sign(double d) { return mulsign(1, d); }

/* Multiply-add; stays a separate mul and add (contraction is disabled). */
static INLINE CONST double mla(double x, double y, double z) { return x * y + z; }

/* Round to nearest, ties away from zero; result must fit in an int. */
static INLINE CONST double rintk(double x) {
  return x < 0 ? (int)(x - 0.5) : (int)(x + 0.5);
}

/* Like ceil except that a non-negative integral x yields x + 1 (for x < 0
 * it is plain truncation).  Callers mask the result (e.g. ceilk(u) & ~1),
 * so this is intentional -- do not "fix" it to a textbook ceil. */
static INLINE CONST int ceilk(double x) { return (int)x + (x < 0 ? 0 : 1); }

/* Truncate toward zero through an int cast (valid only in int range). */
static INLINE CONST double trunck(double x) { return (double)(int)x; }

/* Branchy min/max without IEEE NaN special-casing. */
static INLINE CONST double fmink(double x, double y) { return x < y ? x : y; }
static INLINE CONST double fmaxk(double x, double y) { return x > y ? x : y; }

/* Quiet classification helpers that avoid libm calls. */
static INLINE CONST int xisnan(double x) { return x != x; }
static INLINE CONST int xisinf(double x) { return x == SLEEF_INFINITY || x == -SLEEF_INFINITY; }
static INLINE CONST int xisminf(double x) { return x == -SLEEF_INFINITY; }
static INLINE CONST int xispinf(double x) { return x == SLEEF_INFINITY; }
static INLINE CONST int xisnegzero(double x) { return doubleToRawLongBits(x) == doubleToRawLongBits(-0.0); }
static INLINE CONST int xisnumber(double x) { return !xisinf(x) && !xisnan(x); }

/* Integer test: d is reduced modulo 2^31 so the int cast cannot overflow;
 * every value with |d| >= 2^53 is integral by construction. */
static INLINE CONST int xisint(double d) {
  double x = d - (double)(INT64_C(1) << 31) * (int)(d * (1.0 / (INT64_C(1) << 31)));
  return (x == (int)x) || (fabsk(d) >= (double)(INT64_C(1) << 53));
}

/* Odd-integer test; only meaningful below 2^53 where parity is representable. */
static INLINE CONST int xisodd(double d) {
  double x = d - (double)(INT64_C(1) << 31) * (int)(d * (1.0 / (INT64_C(1) << 31)));
  return (1 & (int)x) != 0 && fabsk(d) < (double)(INT64_C(1) << 53);
}

/* 2^q assembled directly in the exponent field; q must keep the biased
 * exponent inside [0, 0x7ff]. */
static INLINE CONST double pow2i(int q) {
  return longBitsToDouble(((int64_t)(q + 0x3ff)) << 52);
}

/* Full-range ldexp: the exponent is applied as u^4 times a remainder
 * factor, with the biased exponent clamped to [0, 0x7ff], so overflow
 * and gradual underflow behave. */
static INLINE CONST double ldexpk(double x, int q) {
  double u;
  int m;
  m = q >> 31;
  m = (((m + q) >> 9) - m) << 7;
  q = q - (m << 2);
  m += 0x3ff;
  m = m < 0 ? 0 : m;
  m = m > 0x7ff ? 0x7ff : m;
  u = longBitsToDouble(((int64_t)m) << 52);
  x = x * u * u * u * u;
  u = longBitsToDouble(((int64_t)(q + 0x3ff)) << 52);
  return x * u;
}

static INLINE CONST double ldexp2k(double d, int e) { // faster than ldexpk, short reach
  return d * pow2i(e >> 1) * pow2i(e - (e >> 1));
}

static INLINE CONST double ldexp3k(double d, int e) { // very fast, no denormal
  return longBitsToDouble(doubleToRawLongBits(d) + (((int64_t)e) << 52));
}

/* Public ldexp: clamp the exponent, then scale as pow2i(e1) * pow2i(e0)^4
 * so denormal results are still produced correctly. */
EXPORT CONST double xldexp(double x, int exp) {
  if (exp > 2100) exp = 2100;
  if (exp < -2100) exp = -2100;

  int e0 = exp >> 2;
  if (exp < 0) e0++;
  if (-100 < exp && exp < 100) e0 = 0;
  int e1 = exp - (e0 << 2);

  double p = pow2i(e0);
  double ret = x * pow2i(e1) * p * p * p * p;

  return ret;
}

/* ilogb on positive inputs; values below 2^-300 (4.909...E-91) are
 * pre-scaled by 2^300 (2.037...E90) so denormals report their true
 * exponent -- see the matching "q - (300 + 0x3ff)" adjustment. */
static INLINE CONST int ilogbk(double d) {
  int m = d < 4.9090934652977266E-91;
  d = m ?
2.037035976334486E90 * d : d;
  int q = (doubleToRawLongBits(d) >> 52) & 0x7ff;
  q = m ? q - (300 + 0x03ff) : q - 0x03ff;
  return q;
}

// ilogb2k is similar to ilogbk, but the argument has to be a
// normalized FP value.
static INLINE CONST int ilogb2k(double d) {
  return ((doubleToRawLongBits(d) >> 52) & 0x7ff) - 0x3ff;
}

/* Public ilogb with the C99 special cases (zero, NaN, infinity). */
EXPORT CONST int xilogb(double d) {
  int e = ilogbk(fabsk(d));
  e = d == 0.0 ? SLEEF_FP_ILOGB0 : e;
  e = xisnan(d) ? SLEEF_FP_ILOGBNAN : e;
  e = xisinf(d) ? INT_MAX : e;
  return e;
}

//

/* NOTE(review): in the flattened dump this guard appeared fused as
 * "// #ifndef NDEBUG"; the "//" is upstream SLEEF's bare separator line
 * and the directive must be on its own line, otherwise the #endif below
 * is unmatched -- restored accordingly. */
#ifndef NDEBUG
/* Debug-only helper: nonzero when x is Inf or NaN (used by the ddadd
 * precondition checks below). */
static int checkfp(double x) {
  if (xisinf(x) || xisnan(x)) return 1;
  return 0;
}
#endif

/* Head of d with the low 27 mantissa bits cleared: Dekker splitting so
 * partial products in the dd kernels are exact. */
static INLINE CONST double upper(double d) {
  return longBitsToDouble(doubleToRawLongBits(d) & INT64_C(0xfffffffff8000000));
}

/* Build a double-double value (x = head, y = tail). */
static INLINE CONST Sleef_double2 dd(double h, double l) {
  Sleef_double2 ret;
  ret.x = h;
  ret.y = l;
  return ret;
}

/* Renormalize so the tail is small relative to the head. */
static INLINE CONST Sleef_double2 ddnormalize_d2_d2(Sleef_double2 t) {
  Sleef_double2 s;
  s.x = t.x + t.y;
  s.y = t.x - s.x + t.y;
  return s;
}

/* Scale both limbs; exact when s is a power of two (callers use 0.5, 2). */
static INLINE CONST Sleef_double2 ddscale_d2_d2_d(Sleef_double2 d, double s) {
  Sleef_double2 r;
  r.x = d.x * s;
  r.y = d.y * s;
  return r;
}

static INLINE CONST Sleef_double2 ddneg_d2_d2(Sleef_double2 d) {
  Sleef_double2 r;
  r.x = -d.x;
  r.y = -d.y;
  return r;
}

/* |x| for a dd: both limbs flip together with the head's sign. */
static INLINE CONST Sleef_double2 ddabs_d2_d2(Sleef_double2 x) {
  return dd(x.x < 0 ? -x.x : x.x,
            x.x < 0 ? -x.y : x.y);
}

/*
 * ddadd and ddadd2 are functions for double-double addition.  ddadd
 * is simpler and faster than ddadd2, but it requires the absolute
 * value of first argument to be larger than the second argument.  The
 * exact condition that should be met is checked if NDEBUG macro is
 * not defined.
 *
 * Please note that if the results won't be used, it is no problem to
 * feed arguments that do not meet this condition.  You will see
 * warning messages if you turn off NDEBUG macro and run tester2, but
 * this is normal.
 *
 * Please see :
 * Jonathan Richard Shewchuk, Adaptive Precision Floating-Point
 * Arithmetic and Fast Robust Geometric Predicates, Discrete &
 * Computational Geometry 18:305-363, 1997.
 */

/* Fast (Dekker) two-sum of two doubles. */
static INLINE CONST Sleef_double2 ddadd_d2_d_d(double x, double y) {
  // |x| >= |y|
  Sleef_double2 r;

#ifndef NDEBUG
  if (!(checkfp(x) || checkfp(y) || fabsk(x) >= fabsk(y) ||
        (fabsk(x+y) <= fabsk(x) && fabsk(x+y) <= fabsk(y)))) {
    fprintf(stderr, "[ddadd_d2_d_d : %g, %g]\n", x, y);
    fflush(stderr);
  }
#endif

  r.x = x + y;
  r.y = x - r.x + y;
  return r;
}

/* Robust (Knuth) two-sum: no ordering requirement on |x|, |y|. */
static INLINE CONST Sleef_double2 ddadd2_d2_d_d(double x, double y) {
  Sleef_double2 r;
  r.x = x + y;
  double v = r.x - x;
  r.y = (x - (r.x - v)) + (y - v);
  return r;
}

/* dd + double, fast path. */
static INLINE CONST Sleef_double2 ddadd_d2_d2_d(Sleef_double2 x, double y) {
  // |x| >= |y|
  Sleef_double2 r;

#ifndef NDEBUG
  if (!(checkfp(x.x) || checkfp(y) || fabsk(x.x) >= fabsk(y) ||
        (fabsk(x.x+y) <= fabsk(x.x) && fabsk(x.x+y) <= fabsk(y)))) {
    fprintf(stderr, "[ddadd_d2_d2_d : %g %g]\n", x.x, y);
    fflush(stderr);
  }
#endif

  r.x = x.x + y;
  r.y = x.x - r.x + y + x.y;
  return r;
}

/* dd + double, robust path. */
static INLINE CONST Sleef_double2 ddadd2_d2_d2_d(Sleef_double2 x, double y) {
  Sleef_double2 r;
  r.x = x.x + y;
  double v = r.x - x.x;
  r.y = (x.x - (r.x - v)) + (y - v);
  r.y += x.y;
  return r;
}

/* double + dd, fast path. */
static INLINE CONST Sleef_double2 ddadd_d2_d_d2(double x, Sleef_double2 y) {
  // |x| >= |y|
  Sleef_double2 r;

#ifndef NDEBUG
  if (!(checkfp(x) || checkfp(y.x) || fabsk(x) >= fabsk(y.x) ||
        (fabsk(x+y.x) <= fabsk(x) && fabsk(x+y.x) <= fabsk(y.x)))) {
    fprintf(stderr, "[ddadd_d2_d_d2 : %g %g]\n", x, y.x);
    fflush(stderr);
  }
#endif

  r.x = x + y.x;
  r.y = x - r.x + y.x + y.y;
  return r;
}

/* double + dd, robust path. */
static INLINE CONST Sleef_double2 ddadd2_d2_d_d2(double x, Sleef_double2 y) {
  Sleef_double2 r;
  r.x = x + y.x;
  double v = r.x - x;
  r.y = (x - (r.x - v)) + (y.x - v) + y.y;
  return r;
}

/* double + dd collapsed to a plain double, tail added first. */
static INLINE CONST double ddadd2_d_d_d2(double x, Sleef_double2 y) {
  return y.y + y.x + x;
}

static INLINE CONST Sleef_double2 ddadd_d2_d2_d2(Sleef_double2 x,
Sleef_double2 y) { // |x| >= |y|
  Sleef_double2 r;

#ifndef NDEBUG
  if (!(x.x == 0 || checkfp(x.x) || checkfp(y.x) || fabsk(x.x) >= fabsk(y.x) ||
        (fabsk(x.x+y.x) <= fabsk(x.x) && fabsk(x.x+y.x) <= fabsk(y.x)))) {
    fprintf(stderr, "[ddadd_d2_d2_d2 : %g %g]\n", x.x, y.x);
    fflush(stderr);
  }
#endif

  r.x = x.x + y.x;
  r.y = x.x - r.x + y.x + x.y + y.y;
  return r;
}

/* dd + dd, robust path (Knuth two-sum on the heads). */
static INLINE CONST Sleef_double2 ddadd2_d2_d2_d2(Sleef_double2 x, Sleef_double2 y) {
  Sleef_double2 r;
  r.x = x.x + y.x;
  double v = r.x - x.x;
  r.y = (x.x - (r.x - v)) + (y.x - v);
  r.y += x.y + y.y;
  return r;
}

/* dd - dd, fast path: requires |x| >= |y| (checked when !NDEBUG). */
static INLINE CONST Sleef_double2 ddsub_d2_d2_d2(Sleef_double2 x, Sleef_double2 y) {
  // |x| >= |y|
  Sleef_double2 r;

#ifndef NDEBUG
  if (!(checkfp(x.x) || checkfp(y.x) || fabsk(x.x) >= fabsk(y.x) ||
        (fabsk(x.x-y.x) <= fabsk(x.x) && fabsk(x.x-y.x) <= fabsk(y.x)))) {
    fprintf(stderr, "[ddsub_d2_d2_d2 : %g %g]\n", x.x, y.x);
    fflush(stderr);
  }
#endif

  r.x = x.x - y.x;
  r.y = x.x - r.x - y.x + x.y - y.y;
  return r;
}

/* dd / dd: one correction step around t = 1/d.x; limbs are split with
 * upper() so every partial product is exact. */
static INLINE CONST Sleef_double2 dddiv_d2_d2_d2(Sleef_double2 n, Sleef_double2 d) {
  double t = 1.0 / d.x;
  double dh  = upper(d.x), dl  = d.x - dh;
  double th  = upper(t  ), tl  = t   - th;
  double nhh = upper(n.x), nhl = n.x - nhh;

  Sleef_double2 q;
  q.x = n.x * t;
  double u = -q.x + nhh * th + nhh * tl + nhl * th + nhl * tl +
    q.x * (1 - dh * th - dh * tl - dl * th - dl * tl);
  q.y = t * (n.y - q.x * d.y) + u;
  return q;
}

/* Exact product of two doubles as a dd (Dekker splitting). */
static INLINE CONST Sleef_double2 ddmul_d2_d_d(double x, double y) {
  double xh = upper(x), xl = x - xh;
  double yh = upper(y), yl = y - yh;
  Sleef_double2 r;
  r.x = x * y;
  r.y = xh * yh - r.x + xl * yh + xh * yl + xl * yl;
  return r;
}

/* dd * double. */
static INLINE CONST Sleef_double2 ddmul_d2_d2_d(Sleef_double2 x, double y) {
  double xh = upper(x.x), xl = x.x - xh;
  double yh = upper(y  ), yl = y   - yh;
  Sleef_double2 r;
  r.x = x.x * y;
  r.y = xh * yh - r.x + xl * yh + xh * yl + xl * yl + x.y * y;
  return r;
}

/* dd * dd. */
static INLINE CONST Sleef_double2 ddmul_d2_d2_d2(Sleef_double2 x, Sleef_double2 y) {
  double xh = upper(x.x), xl = x.x - xh;
  double yh = upper(y.x), yl = y.x - yh;
  Sleef_double2 r;
  r.x = x.x * y.x;
  r.y = xh * yh - r.x + xl * yh + xh * yl + xl * yl + x.x * y.y + x.y * y.x;
  return r;
}

/* dd * dd collapsed to one double; terms are summed smallest-first. */
static INLINE CONST double ddmul_d_d2_d2(Sleef_double2 x, Sleef_double2 y) {
  double xh = upper(x.x), xl = x.x - xh;
  double yh = upper(y.x), yl = y.x - yh;
  return x.y * yh + xh * y.y + xl * yl + xh * yl + xl * yh + xh * yh;
}

/* Squaring, cheaper than ddmul_d2_d2_d2(x, x). */
static INLINE CONST Sleef_double2 ddsqu_d2_d2(Sleef_double2 x) {
  double xh = upper(x.x), xl = x.x - xh;
  Sleef_double2 r;
  r.x = x.x * x.x;
  r.y = xh * xh - r.x + (xh + xh) * xl + xl * xl + x.x * (x.y + x.y);
  return r;
}

/* Squaring collapsed to one double. */
static INLINE CONST double ddsqu_d_d2(Sleef_double2 x) {
  double xh = upper(x.x), xl = x.x - xh;
  return xh * x.y + xh * x.y + xl * xl + (xh * xl + xh * xl) + xh * xh;
}

/* 1/d as a dd with one correction term. */
static INLINE CONST Sleef_double2 ddrec_d2_d(double d) {
  double t = 1.0 / d;
  double dh = upper(d), dl = d - dh;
  double th = upper(t), tl = t - th;
  Sleef_double2 q;
  q.x = t;
  q.y = t * (1 - dh * th - dh * tl - dl * th - dl * tl);
  return q;
}

/* 1/d for a dd argument; the tail d.y enters the correction term. */
static INLINE CONST Sleef_double2 ddrec_d2_d2(Sleef_double2 d) {
  double t = 1.0 / d.x;
  double dh = upper(d.x), dl = d.x - dh;
  double th = upper(t  ), tl = t   - th;
  Sleef_double2 q;
  q.x = t;
  q.y = t * (1 - dh * th - dh * tl - dl * th - dl * tl - d.y * t);
  return q;
}

/* sqrt of a dd: refine the scalar SQRT with one Newton step carried out
 * in dd arithmetic, i.e. 0.5 * (d + t^2) / t. */
static INLINE CONST Sleef_double2 ddsqrt_d2_d2(Sleef_double2 d) {
  double t = SQRT(d.x + d.y);
  return ddscale_d2_d2_d(ddmul_d2_d2_d2(ddadd2_d2_d2_d2(d, ddmul_d2_d_d(t, t)),
                                        ddrec_d2_d(t)),
                         0.5);
}

/* sqrt of a double returned as a dd (same Newton refinement). */
static INLINE CONST Sleef_double2 ddsqrt_d2_d(double d) {
  double t = SQRT(d);
  return ddscale_d2_d2_d(ddmul_d2_d2_d2(ddadd2_d2_d_d2(d, ddmul_d2_d_d(t, t)),
                                        ddrec_d2_d(t)),
                         0.5);
}

//

/* Core atan2 kernel (~3.5 ulp path).  Reduces (y, x) to the first octant
 * with q recording the quadrant adjustments, then evaluates a degree-19
 * polynomial in s = y/x via the estrin.h POLY19 helper. */
static INLINE CONST double atan2k(double y, double x) {
  double s, t, u;
  int q = 0;

  if (x < 0) { x = -x; q = -2; }
  if (y > x) { t = x; x = y; y = -t; q += 1; }

  s = y / x;
  t = s * s;

  double t2 = t * t, t4 = t2 * t2, t8 = t4 * t4, t16 = t8 * t8;
  u = POLY19(t, t2, t4, t8, t16,
             -1.88796008463073496563746e-05,
             0.000209850076645816976906797,
-0.00110611831486672482563471,
             0.00370026744188713119232403,
             -0.00889896195887655491740809,
             0.016599329773529201970117,
             -0.0254517624932312641616861,
             0.0337852580001353069993897,
             -0.0407629191276836500001934,
             0.0466667150077840625632675,
             -0.0523674852303482457616113,
             0.0587666392926673580854313,
             -0.0666573579361080525984562,
             0.0769219538311769618355029,
             -0.090908995008245008229153,
             0.111111105648261418443745,
             -0.14285714266771329383765,
             0.199999999996591265594148,
             -0.333333333333311110369124);

  t = u * t * s + s;
  t = q * (M_PI/2) + t;

  return t;
}

/* atan2, ~3.5 ulp.  The kernel runs on |y|; Inf/zero arguments are
 * patched afterwards and the sign of y is re-applied last. */
EXPORT CONST double xatan2(double y, double x) {
  double r = atan2k(fabsk(y), x);

  r = mulsign(r, x);
  if (xisinf(x) || x == 0) r = M_PI/2 - (xisinf(x) ? (sign(x) * (M_PI /2)) : 0);
  if (xisinf(y)          ) r = M_PI/2 - (xisinf(x) ? (sign(x) * (M_PI*1/4)) : 0);
  if (             y == 0) r = (sign(x) == -1 ? M_PI : 0);

  return xisnan(x) || xisnan(y) ? SLEEF_NAN : mulsign(r, y);
}

/* asin, ~3.5 ulp: |d| < 0.5 evaluates the polynomial at d directly,
 * otherwise the identity asin(d) = pi/2 - 2*asin(sqrt((1-|d|)/2)) is
 * applied (same 12 coefficients in both branches). */
EXPORT CONST double xasin(double d) {
  int o = fabsk(d) < 0.5;
  double x2 = o ? (d*d) : ((1-fabsk(d))*0.5), x = o ? fabsk(d) : SQRT(x2), u;

  double x4 = x2 * x2, x8 = x4 * x4, x16 = x8 * x8;
  u = POLY12(x2, x4, x8, x16,
             +0.3161587650653934628e-1,
             -0.1581918243329996643e-1,
             +0.1929045477267910674e-1,
             +0.6606077476277170610e-2,
             +0.1215360525577377331e-1,
             +0.1388715184501609218e-1,
             +0.1735956991223614604e-1,
             +0.2237176181932048341e-1,
             +0.3038195928038132237e-1,
             +0.4464285681377102438e-1,
             +0.7500000000378581611e-1,
             +0.1666666666666497543e+0);

  u = mla(u, x * x2, x);
  double r = o ? u : (M_PI/2 - 2*u);
  r = mulsign(r, d);

  return r;
}

/* acos, ~3.5 ulp: same reduction and coefficients as xasin; for the
 * |d| >= 0.5, d < 0 branch pi is folded in through a double-double
 * subtraction to preserve accuracy near the ends of the domain. */
EXPORT CONST double xacos(double d) {
  int o = fabsk(d) < 0.5;
  double x2 = o ? (d*d) : ((1-fabsk(d))*0.5), u;
  double x = o ? fabsk(d) : SQRT(x2);
  x = fabsk(d) == 1.0 ? 0 : x;

  double x4 = x2 * x2, x8 = x4 * x4, x16 = x8 * x8;
  u = POLY12(x2, x4, x8, x16,
             +0.3161587650653934628e-1,
             -0.1581918243329996643e-1,
             +0.1929045477267910674e-1,
             +0.6606077476277170610e-2,
             +0.1215360525577377331e-1,
             +0.1388715184501609218e-1,
             +0.1735956991223614604e-1,
             +0.2237176181932048341e-1,
             +0.3038195928038132237e-1,
             +0.4464285681377102438e-1,
             +0.7500000000378581611e-1,
             +0.1666666666666497543e+0);

  u *= x * x2;

  double y = 3.1415926535897932/2 - (mulsign(x, d) + mulsign(u, d));
  x += u;
  double r = o ? y : (x*2);
  if (!o && d < 0) r = ddadd_d2_d2_d(dd(3.141592653589793116, 1.2246467991473532072e-16), -r).x;

  return r;
}

/* atan, ~3.5 ulp: reduce the argument to [0, 1] via 1/s with the sign
 * and reciprocal flags packed into q, then undo them at the end. */
EXPORT CONST double xatan(double s) {
  double t, u;
  int q = 0;

  if (sign(s) == -1) { s = -s; q = 2; }
  if (s > 1) { s = 1.0 / s; q |= 1; }

  t = s * s;

  double t2 = t * t, t4 = t2 * t2, t8 = t4 * t4, t16 = t8 * t8;
  u = POLY19(t, t2, t4, t8, t16,
             -1.88796008463073496563746e-05,
             0.000209850076645816976906797,
             -0.00110611831486672482563471,
             0.00370026744188713119232403,
             -0.00889896195887655491740809,
             0.016599329773529201970117,
             -0.0254517624932312641616861,
             0.0337852580001353069993897,
             -0.0407629191276836500001934,
             0.0466667150077840625632675,
             -0.0523674852303482457616113,
             0.0587666392926673580854313,
             -0.0666573579361080525984562,
             0.0769219538311769618355029,
             -0.090908995008245008229153,
             0.111111105648261418443745,
             -0.14285714266771329383765,
             0.199999999996591265594148,
             -0.333333333333311110369124);

  t = s + s * (t * u);

  if ((q & 1) != 0) t = 1.570796326794896557998982 - t;
  if ((q & 2) != 0) t = -t;

  return t;
}

/* 1-ulp atan2 kernel: same octant reduction as atan2k, but s and t are
 * double-double values and the last four coefficients are applied in dd
 * arithmetic after this POLY16 prefix. */
static Sleef_double2 atan2k_u1(Sleef_double2 y, Sleef_double2 x) {
  double u;
  Sleef_double2 s, t;
  int q = 0;

  if (x.x < 0) { x.x = -x.x; x.y = -x.y; q = -2; }
  if (y.x > x.x) { t = x; x = y; y.x = -t.x; y.y = -t.y; q += 1; }

  s = dddiv_d2_d2_d2(y, x);
  t = ddsqu_d2_d2(s);
  t = ddnormalize_d2_d2(t);

  double t2 = t.x * t.x, t4 = t2 * t2, t8 = t4 * t4, t16 = t8 * t8;
  u = POLY16(t.x, t2, t4, t8,
             1.06298484191448746607415e-05,
             -0.000125620649967286867384336,
0.00070557664296393412389774, -0.00251865614498713360352999, 0.00646262899036991172313504, -0.0128281333663399031014274, 0.0208024799924145797902497, -0.0289002344784740315686289, 0.0359785005035104590853656, -0.041848579703592507506027, 0.0470843011653283988193763, -0.0524914210588448421068719, 0.0587946590969581003860434, -0.0666620884778795497194182, 0.0769225330296203768654095, -0.0909090442773387574781907); u = mla(u, t.x, 0.111111108376896236538123); u = mla(u, t.x, -0.142857142756268568062339); u = mla(u, t.x, 0.199999999997977351284817); u = mla(u, t.x, -0.333333333333317605173818); t = ddadd_d2_d2_d2(s, ddmul_d2_d2_d(ddmul_d2_d2_d2(s, t), u)); if (fabsk(s.x) < 1e-200) t = s; t = ddadd2_d2_d2_d2(ddmul_d2_d2_d(dd(1.570796326794896557998982, 6.12323399573676603586882e-17), q), t); return t; } EXPORT CONST double xatan2_u1(double y, double x) { if (fabsk(x) < 5.5626846462680083984e-309) { y *= (UINT64_C(1) << 53); x *= (UINT64_C(1) << 53); } // nexttoward((1.0 / DBL_MAX), 1) Sleef_double2 d = atan2k_u1(dd(fabsk(y), 0), dd(x, 0)); double r = d.x + d.y; r = mulsign(r, x); if (xisinf(x) || x == 0) r = M_PI/2 - (xisinf(x) ? (sign(x) * (M_PI /2)) : 0); if (xisinf(y) ) r = M_PI/2 - (xisinf(x) ? (sign(x) * (M_PI*1/4)) : 0); if ( y == 0) r = (sign(x) == -1 ? M_PI : 0); return xisnan(x) || xisnan(y) ? SLEEF_NAN : mulsign(r, y); } EXPORT CONST double xasin_u1(double d) { int o = fabsk(d) < 0.5; double x2 = o ? (d*d) : ((1-fabsk(d))*0.5), u; Sleef_double2 x = o ? dd(fabsk(d), 0) : ddsqrt_d2_d(x2); x = fabsk(d) == 1.0 ? 
dd(0, 0) : x; double x4 = x2 * x2, x8 = x4 * x4, x16 = x8 * x8; u = POLY12(x2, x4, x8, x16, +0.3161587650653934628e-1, -0.1581918243329996643e-1, +0.1929045477267910674e-1, +0.6606077476277170610e-2, +0.1215360525577377331e-1, +0.1388715184501609218e-1, +0.1735956991223614604e-1, +0.2237176181932048341e-1, +0.3038195928038132237e-1, +0.4464285681377102438e-1, +0.7500000000378581611e-1, +0.1666666666666497543e+0); u *= x2 * x.x; Sleef_double2 y = ddadd_d2_d2_d(ddsub_d2_d2_d2(dd(3.141592653589793116/4, 1.2246467991473532072e-16/4), x), -u); double r = o ? (u + x.x) : ((y.x + y.y)*2); r = mulsign(r, d); return r; } EXPORT CONST double xacos_u1(double d) { int o = fabsk(d) < 0.5; double x2 = o ? (d*d) : ((1-fabsk(d))*0.5), u; Sleef_double2 x = o ? dd(fabsk(d), 0) : ddsqrt_d2_d(x2), w; x = fabsk(d) == 1.0 ? dd(0, 0) : x; double x4 = x2 * x2, x8 = x4 * x4, x16 = x8 * x8; u = POLY12(x2, x4, x8, x16, +0.3161587650653934628e-1, -0.1581918243329996643e-1, +0.1929045477267910674e-1, +0.6606077476277170610e-2, +0.1215360525577377331e-1, +0.1388715184501609218e-1, +0.1735956991223614604e-1, +0.2237176181932048341e-1, +0.3038195928038132237e-1, +0.4464285681377102438e-1, +0.7500000000378581611e-1, +0.1666666666666497543e+0); u *= x.x * x2; Sleef_double2 y = ddsub_d2_d2_d2(dd(3.141592653589793116/2, 1.2246467991473532072e-16/2), ddadd_d2_d_d(mulsign(x.x, d), mulsign(u, d))); x = ddadd_d2_d2_d(x, u); y = o ? 
y : ddscale_d2_d2_d(x, 2); if (!o && d < 0) y = ddsub_d2_d2_d2(dd(3.141592653589793116, 1.2246467991473532072e-16), y); return y.x + y.y; } EXPORT CONST double xatan_u1(double d) { Sleef_double2 d2 = atan2k_u1(dd(fabsk(d), 0), dd(1, 0)); double r = d2.x + d2.y; if (xisinf(d)) r = 1.570796326794896557998982; return mulsign(r, d); } typedef struct { double d; int32_t i; } di_t; typedef struct { Sleef_double2 dd; int32_t i; } ddi_t; static INLINE CONST double orsign(double x, double y) { return longBitsToDouble(doubleToRawLongBits(x) | (doubleToRawLongBits(y) & (INT64_C(1) << 63))); } static CONST di_t rempisub(double x) { // This function is equivalent to : // di_t ret = { x - rint(4 * x) * 0.25, (int32_t)(rint(4 * x) - rint(x) * 4) }; di_t ret; double c = mulsign(INT64_C(1) << 52, x); double rint4x = fabsk(4*x) > INT64_C(1) << 52 ? (4*x) : orsign(mla(4, x, c) - c, x); double rintx = fabsk( x) > INT64_C(1) << 52 ? x : orsign(x + c - c , x); ret.d = mla(-0.25, rint4x, x); ret.i = mla(-4 , rintx , rint4x); return ret; } // Payne-Hanek like argument reduction static CONST ddi_t rempi(double a) { Sleef_double2 x, y, z; di_t di; double t; int ex = ilogb2k(a) - 55, q = ex > (700-55) ? -64 : 0; a = ldexp3k(a, q); if (ex < 0) ex = 0; ex *= 4; x = ddmul_d2_d_d(a, Sleef_rempitabdp[ex]); di = rempisub(x.x); q = di.i; x.x = di.d; x = ddnormalize_d2_d2(x); y = ddmul_d2_d_d(a, Sleef_rempitabdp[ex+1]); x = ddadd2_d2_d2_d2(x, y); di = rempisub(x.x); q += di.i; x.x = di.d; x = ddnormalize_d2_d2(x); y = ddmul_d2_d2_d(dd(Sleef_rempitabdp[ex+2], Sleef_rempitabdp[ex+3]), a); x = ddadd2_d2_d2_d2(x, y); x = ddnormalize_d2_d2(x); x = ddmul_d2_d2_d2(x, dd(3.141592653589793116*2, 1.2246467991473532072e-16*2)); ddi_t ret = { fabsk(a) < 0.7 ? 
dd(a, 0) : x, q }; return ret; } EXPORT CONST double xsin(double d) { double u, s, t = d; int ql; if (fabsk(d) < TRIGRANGEMAX2) { ql = rintk(d * M_1_PI); d = mla(ql, -PI_A2, d); d = mla(ql, -PI_B2, d); } else if (fabsk(d) < TRIGRANGEMAX) { double dqh = trunck(d * (M_1_PI / (1 << 24))) * (double)(1 << 24); ql = rintk(mla(d, M_1_PI, -dqh)); d = mla(dqh, -PI_A, d); d = mla( ql, -PI_A, d); d = mla(dqh, -PI_B, d); d = mla( ql, -PI_B, d); d = mla(dqh, -PI_C, d); d = mla( ql, -PI_C, d); d = mla(dqh + ql, -PI_D, d); } else { ddi_t ddi = rempi(t); ql = ((ddi.i & 3) * 2 + (ddi.dd.x > 0) + 1) >> 2; if ((ddi.i & 1) != 0) { ddi.dd = ddadd2_d2_d2_d2(ddi.dd, dd(mulsign(3.141592653589793116*-0.5, ddi.dd.x), mulsign(1.2246467991473532072e-16*-0.5, ddi.dd.x))); } d = ddi.dd.x + ddi.dd.y; if (xisinf(t) || xisnan(t)) d = SLEEF_NAN; } s = d * d; if ((ql & 1) != 0) d = -d; double s2 = s * s, s4 = s2 * s2; u = POLY8(s, s2, s4, -7.97255955009037868891952e-18, 2.81009972710863200091251e-15, -7.64712219118158833288484e-13, 1.60590430605664501629054e-10, -2.50521083763502045810755e-08, 2.75573192239198747630416e-06, -0.000198412698412696162806809, 0.00833333333333332974823815); u = mla(u, s, -0.166666666666666657414808); u = mla(s, u * d, d); if (xisnegzero(t)) u = t; return u; } EXPORT CONST double xsin_u1(double d) { double u; Sleef_double2 s, t, x; int ql; if (fabsk(d) < TRIGRANGEMAX2) { ql = rintk(d * M_1_PI); u = mla(ql, -PI_A2, d); s = ddadd_d2_d_d (u, ql * -PI_B2); } else if (fabsk(d) < TRIGRANGEMAX) { const double dqh = trunck(d * (M_1_PI / (1 << 24))) * (double)(1 << 24); ql = rintk(mla(d, M_1_PI, -dqh)); u = mla(dqh, -PI_A, d); s = ddadd_d2_d_d (u, ql * -PI_A); s = ddadd2_d2_d2_d(s, dqh * -PI_B); s = ddadd2_d2_d2_d(s, ql * -PI_B); s = ddadd2_d2_d2_d(s, dqh * -PI_C); s = ddadd2_d2_d2_d(s, ql * -PI_C); s = ddadd_d2_d2_d (s, (dqh + ql) * -PI_D); } else { ddi_t ddi = rempi(d); ql = ((ddi.i & 3) * 2 + (ddi.dd.x > 0) + 1) >> 2; if ((ddi.i & 1) != 0) { ddi.dd = ddadd2_d2_d2_d2(ddi.dd, 
dd(mulsign(3.141592653589793116*-0.5, ddi.dd.x), mulsign(1.2246467991473532072e-16*-0.5, ddi.dd.x))); } s = ddnormalize_d2_d2(ddi.dd); if (xisinf(d) || xisnan(d)) s.x = SLEEF_NAN; } t = s; s = ddsqu_d2_d2(s); double s2 = s.x * s.x, s4 = s2 * s2; u = POLY6(s.x, s2, s4, 2.72052416138529567917983e-15, -7.6429259411395447190023e-13, 1.60589370117277896211623e-10, -2.5052106814843123359368e-08, 2.75573192104428224777379e-06, -0.000198412698412046454654947); u = mla(u, s.x, 0.00833333333333318056201922); x = ddadd_d2_d_d2(1, ddmul_d2_d2_d2(ddadd_d2_d_d(-0.166666666666666657414808, u * s.x), s)); u = ddmul_d_d2_d2(t, x); if ((ql & 1) != 0) u = -u; if (xisnegzero(d)) u = d; return u; } EXPORT CONST double xcos(double d) { double u, s, t = d; int ql; if (fabsk(d) < TRIGRANGEMAX2) { ql = mla(2, rintk(d * M_1_PI - 0.5), 1); d = mla(ql, -PI_A2*0.5, d); d = mla(ql, -PI_B2*0.5, d); } else if (fabsk(d) < TRIGRANGEMAX) { double dqh = trunck(d * (M_1_PI / (INT64_C(1) << 23)) - 0.5 * (M_1_PI / (INT64_C(1) << 23))); ql = 2*rintk(d * M_1_PI - 0.5 - dqh * (double)(INT64_C(1) << 23))+1; dqh *= 1 << 24; d = mla(dqh, -PI_A*0.5, d); d = mla( ql, -PI_A*0.5, d); d = mla(dqh, -PI_B*0.5, d); d = mla( ql, -PI_B*0.5, d); d = mla(dqh, -PI_C*0.5, d); d = mla( ql, -PI_C*0.5, d); d = mla(dqh + ql , -PI_D*0.5, d); } else { ddi_t ddi = rempi(t); ql = ((ddi.i & 3) * 2 + (ddi.dd.x > 0) + 7) >> 1; if ((ddi.i & 1) == 0) { ddi.dd = ddadd2_d2_d2_d2(ddi.dd, dd(mulsign(3.141592653589793116*-0.5, ddi.dd.x > 0 ? 1 : -1), mulsign(1.2246467991473532072e-16*-0.5, ddi.dd.x > 0 ? 
1 : -1))); } d = ddi.dd.x + ddi.dd.y; if (xisinf(t) || xisnan(t)) d = SLEEF_NAN; } s = d * d; if ((ql & 2) == 0) d = -d; double s2 = s * s, s4 = s2 * s2; u = POLY8(s, s2, s4, -7.97255955009037868891952e-18, 2.81009972710863200091251e-15, -7.64712219118158833288484e-13, 1.60590430605664501629054e-10, -2.50521083763502045810755e-08, 2.75573192239198747630416e-06, -0.000198412698412696162806809, 0.00833333333333332974823815); u = mla(u, s, -0.166666666666666657414808); u = mla(s, u * d, d); return u; } EXPORT CONST double xcos_u1(double d) { double u; Sleef_double2 s, t, x; int ql; d = fabsk(d); if (d < TRIGRANGEMAX2) { ql = mla(2, rintk(d * M_1_PI - 0.5), 1); s = ddadd2_d2_d_d(d, ql * (-PI_A2*0.5)); s = ddadd_d2_d2_d(s, ql * (-PI_B2*0.5)); } else if (d < TRIGRANGEMAX) { double dqh = trunck(d * (M_1_PI / (INT64_C(1) << 23)) - 0.5 * (M_1_PI / (INT64_C(1) << 23))); ql = 2*rintk(d * M_1_PI - 0.5 - dqh * (double)(INT64_C(1) << 23))+1; dqh *= 1 << 24; u = mla(dqh, -PI_A*0.5, d); s = ddadd2_d2_d_d (u, ql * (-PI_A*0.5)); s = ddadd2_d2_d2_d(s, dqh * (-PI_B*0.5)); s = ddadd2_d2_d2_d(s, ql * (-PI_B*0.5)); s = ddadd2_d2_d2_d(s, dqh * (-PI_C*0.5)); s = ddadd2_d2_d2_d(s, ql * (-PI_C*0.5)); s = ddadd_d2_d2_d(s, (dqh + ql) * (-PI_D*0.5)); } else { ddi_t ddi = rempi(d); ql = ((ddi.i & 3) * 2 + (ddi.dd.x > 0) + 7) >> 1; if ((ddi.i & 1) == 0) { ddi.dd = ddadd2_d2_d2_d2(ddi.dd, dd(mulsign(3.141592653589793116*-0.5, ddi.dd.x > 0 ? 1 : -1), mulsign(1.2246467991473532072e-16*-0.5, ddi.dd.x > 0 ? 
1 : -1))); } s = ddnormalize_d2_d2(ddi.dd); if (xisinf(d) || xisnan(d)) s.x = SLEEF_NAN; } t = s; s = ddsqu_d2_d2(s); double s2 = s.x * s.x, s4 = s2 * s2; u = POLY6(s.x, s2, s4, 2.72052416138529567917983e-15, -7.6429259411395447190023e-13, 1.60589370117277896211623e-10, -2.5052106814843123359368e-08, 2.75573192104428224777379e-06, -0.000198412698412046454654947); u = mla(u, s.x, 0.00833333333333318056201922); x = ddadd_d2_d_d2(1, ddmul_d2_d2_d2(ddadd_d2_d_d(-0.166666666666666657414808, u * s.x), s)); u = ddmul_d_d2_d2(t, x); if ((((int)ql) & 2) == 0) u = -u; return u; } EXPORT CONST Sleef_double2 xsincos(double d) { double u, s, t; Sleef_double2 r; int ql; s = d; if (fabsk(d) < TRIGRANGEMAX2) { ql = rintk(s * (2 * M_1_PI)); s = mla(ql, -PI_A2*0.5, s); s = mla(ql, -PI_B2*0.5, s); } else if (fabsk(d) < TRIGRANGEMAX) { double dqh = trunck(d * ((2 * M_1_PI) / (1 << 24))) * (double)(1 << 24); ql = rintk(d * (2 * M_1_PI) - dqh); s = mla(dqh, -PI_A * 0.5, s); s = mla( ql, -PI_A * 0.5, s); s = mla(dqh, -PI_B * 0.5, s); s = mla( ql, -PI_B * 0.5, s); s = mla(dqh, -PI_C * 0.5, s); s = mla( ql, -PI_C * 0.5, s); s = mla(dqh + ql, -PI_D * 0.5, s); } else { ddi_t ddi = rempi(d); ql = ddi.i; s = ddi.dd.x + ddi.dd.y; if (xisinf(d) || xisnan(d)) s = SLEEF_NAN; } t = s; s = s * s; u = 1.58938307283228937328511e-10; u = mla(u, s, -2.50506943502539773349318e-08); u = mla(u, s, 2.75573131776846360512547e-06); u = mla(u, s, -0.000198412698278911770864914); u = mla(u, s, 0.0083333333333191845961746); u = mla(u, s, -0.166666666666666130709393); u = u * s * t; r.x = t + u; if (xisnegzero(d)) r.x = -0.0; u = -1.13615350239097429531523e-11; u = mla(u, s, 2.08757471207040055479366e-09); u = mla(u, s, -2.75573144028847567498567e-07); u = mla(u, s, 2.48015872890001867311915e-05); u = mla(u, s, -0.00138888888888714019282329); u = mla(u, s, 0.0416666666666665519592062); u = mla(u, s, -0.5); r.y = u * s + 1; if ((ql & 1) != 0) { s = r.y; r.y = r.x; r.x = s; } if ((ql & 2) != 0) { r.x = -r.x; } if 
(((ql+1) & 2) != 0) { r.y = -r.y; } return r; } EXPORT CONST Sleef_double2 xsincos_u1(double d) { double u; Sleef_double2 r, s, t, x; int ql; if (fabsk(d) < TRIGRANGEMAX2) { ql = rintk(d * (2 * M_1_PI)); u = mla(ql, -PI_A2*0.5, d); s = ddadd_d2_d_d (u, ql * (-PI_B2*0.5)); } else if (fabsk(d) < TRIGRANGEMAX) { const double dqh = trunck(d * ((2 * M_1_PI) / (1 << 24))) * (double)(1 << 24); ql = rintk(d * (2 * M_1_PI) - dqh); u = mla(dqh, -PI_A*0.5, d); s = ddadd_d2_d_d(u, ql * (-PI_A*0.5)); s = ddadd2_d2_d2_d(s, dqh * (-PI_B*0.5)); s = ddadd2_d2_d2_d(s, ql * (-PI_B*0.5)); s = ddadd2_d2_d2_d(s, dqh * (-PI_C*0.5)); s = ddadd2_d2_d2_d(s, ql * (-PI_C*0.5)); s = ddadd_d2_d2_d(s, (dqh + ql) * (-PI_D*0.5)); } else { ddi_t ddi = rempi(d); ql = ddi.i; s = ddi.dd; if (xisinf(d) || xisnan(d)) s = dd(SLEEF_NAN, SLEEF_NAN); } t = s; s.x = ddsqu_d_d2(s); u = 1.58938307283228937328511e-10; u = mla(u, s.x, -2.50506943502539773349318e-08); u = mla(u, s.x, 2.75573131776846360512547e-06); u = mla(u, s.x, -0.000198412698278911770864914); u = mla(u, s.x, 0.0083333333333191845961746); u = mla(u, s.x, -0.166666666666666130709393); u *= s.x * t.x; x = ddadd_d2_d2_d(t, u); r.x = x.x + x.y; if (xisnegzero(d)) r.x = -0.0; u = -1.13615350239097429531523e-11; u = mla(u, s.x, 2.08757471207040055479366e-09); u = mla(u, s.x, -2.75573144028847567498567e-07); u = mla(u, s.x, 2.48015872890001867311915e-05); u = mla(u, s.x, -0.00138888888888714019282329); u = mla(u, s.x, 0.0416666666666665519592062); u = mla(u, s.x, -0.5); x = ddadd_d2_d_d2(1, ddmul_d2_d_d(s.x, u)); r.y = x.x + x.y; if ((ql & 1) != 0) { u = r.y; r.y = r.x; r.x = u; } if ((ql & 2) != 0) { r.x = -r.x; } if (((ql+1) & 2) != 0) { r.y = -r.y; } return r; } EXPORT CONST Sleef_double2 xsincospi_u05(double d) { double u, s, t; Sleef_double2 r, x, s2; u = d * 4; int q = ceilk(u) & ~(int)1; s = u - (double)q; t = s; s = s * s; s2 = ddmul_d2_d_d(t, t); // u = -2.02461120785182399295868e-14; u = mla(u, s, 6.94821830580179461327784e-12); u = mla(u, 
s, -1.75724749952853179952664e-09); u = mla(u, s, 3.13361688966868392878422e-07); u = mla(u, s, -3.6576204182161551920361e-05); u = mla(u, s, 0.00249039457019271850274356); x = ddadd2_d2_d_d2(u * s, dd(-0.0807455121882807852484731, 3.61852475067037104849987e-18)); x = ddadd2_d2_d2_d2(ddmul_d2_d2_d2(s2, x), dd(0.785398163397448278999491, 3.06287113727155002607105e-17)); x = ddmul_d2_d2_d(x, t); r.x = x.x + x.y; if (xisnegzero(d)) r.x = -0.0; // u = 9.94480387626843774090208e-16; u = mla(u, s, -3.89796226062932799164047e-13); u = mla(u, s, 1.15011582539996035266901e-10); u = mla(u, s, -2.4611369501044697495359e-08); u = mla(u, s, 3.59086044859052754005062e-06); u = mla(u, s, -0.000325991886927389905997954); x = ddadd2_d2_d_d2(u * s, dd(0.0158543442438155018914259, -1.04693272280631521908845e-18)); x = ddadd2_d2_d2_d2(ddmul_d2_d2_d2(s2, x), dd(-0.308425137534042437259529, -1.95698492133633550338345e-17)); x = ddadd2_d2_d2_d(ddmul_d2_d2_d2(x, s2), 1); r.y = x.x + x.y; // if ((q & 2) != 0) { s = r.y; r.y = r.x; r.x = s; } if ((q & 4) != 0) { r.x = -r.x; } if (((q+2) & 4) != 0) { r.y = -r.y; } if (fabsk(d) > TRIGRANGEMAX3/4) { r.x = 0; r.y = 1; } if (xisinf(d)) { r.x = r.y = SLEEF_NAN; } return r; } EXPORT CONST Sleef_double2 xsincospi_u35(double d) { double u, s, t; Sleef_double2 r; u = d * 4; int q = ceilk(u) & ~(int)1; s = u - (double)q; t = s; s = s * s; // u = +0.6880638894766060136e-11; u = mla(u, s, -0.1757159564542310199e-8); u = mla(u, s, +0.3133616327257867311e-6); u = mla(u, s, -0.3657620416388486452e-4); u = mla(u, s, +0.2490394570189932103e-2); u = mla(u, s, -0.8074551218828056320e-1); u = mla(u, s, +0.7853981633974482790e+0); r.x = u * t; // u = -0.3860141213683794352e-12; u = mla(u, s, +0.1150057888029681415e-9); u = mla(u, s, -0.2461136493006663553e-7); u = mla(u, s, +0.3590860446623516713e-5); u = mla(u, s, -0.3259918869269435942e-3); u = mla(u, s, +0.1585434424381541169e-1); u = mla(u, s, -0.3084251375340424373e+0); u = mla(u, s, 1); r.y = u; // if ((q 
& 2) != 0) { s = r.y; r.y = r.x; r.x = s; } if ((q & 4) != 0) { r.x = -r.x; } if (((q+2) & 4) != 0) { r.y = -r.y; } if (fabsk(d) > TRIGRANGEMAX3/4) { r.x = 0; r.y = 1; } if (xisinf(d)) { r.x = r.y = SLEEF_NAN; } return r; } static INLINE CONST Sleef_double2 sinpik(double d) { double u, s, t; Sleef_double2 x, s2; u = d * 4; int q = ceilk(u) & ~1; int o = (q & 2) != 0; s = u - (double)q; t = s; s = s * s; s2 = ddmul_d2_d_d(t, t); // u = o ? 9.94480387626843774090208e-16 : -2.02461120785182399295868e-14; u = mla(u, s, o ? -3.89796226062932799164047e-13 : 6.94821830580179461327784e-12); u = mla(u, s, o ? 1.15011582539996035266901e-10 : -1.75724749952853179952664e-09); u = mla(u, s, o ? -2.4611369501044697495359e-08 : 3.13361688966868392878422e-07); u = mla(u, s, o ? 3.59086044859052754005062e-06 : -3.6576204182161551920361e-05); u = mla(u, s, o ? -0.000325991886927389905997954 : 0.00249039457019271850274356); x = ddadd2_d2_d_d2(u * s, o ? dd(0.0158543442438155018914259, -1.04693272280631521908845e-18) : dd(-0.0807455121882807852484731, 3.61852475067037104849987e-18)); x = ddadd2_d2_d2_d2(ddmul_d2_d2_d2(s2, x), o ? dd(-0.308425137534042437259529, -1.95698492133633550338345e-17) : dd(0.785398163397448278999491, 3.06287113727155002607105e-17)); x = ddmul_d2_d2_d2(x, o ? s2 : dd(t, 0)); x = o ? ddadd2_d2_d2_d(x, 1) : x; // if ((q & 4) != 0) { x.x = -x.x; x.y = -x.y; } return x; } EXPORT CONST double xsinpi_u05(double d) { Sleef_double2 x = sinpik(d); double r = x.x + x.y; if (xisnegzero(d)) r = -0.0; if (fabsk(d) > TRIGRANGEMAX3/4) r = 0; if (xisinf(d)) r = SLEEF_NAN; return r; } static INLINE CONST Sleef_double2 cospik(double d) { double u, s, t; Sleef_double2 x, s2; u = d * 4; int q = ceilk(u) & ~1; int o = (q & 2) == 0; s = u - (double)q; t = s; s = s * s; s2 = ddmul_d2_d_d(t, t); // u = o ? 9.94480387626843774090208e-16 : -2.02461120785182399295868e-14; u = mla(u, s, o ? -3.89796226062932799164047e-13 : 6.94821830580179461327784e-12); u = mla(u, s, o ? 
1.15011582539996035266901e-10 : -1.75724749952853179952664e-09);
  u = mla(u, s, o ? -2.4611369501044697495359e-08 : 3.13361688966868392878422e-07);
  u = mla(u, s, o ? 3.59086044859052754005062e-06 : -3.6576204182161551920361e-05);
  u = mla(u, s, o ? -0.000325991886927389905997954 : 0.00249039457019271850274356);
  x = ddadd2_d2_d_d2(u * s, o ? dd(0.0158543442438155018914259, -1.04693272280631521908845e-18) : dd(-0.0807455121882807852484731, 3.61852475067037104849987e-18));
  x = ddadd2_d2_d2_d2(ddmul_d2_d2_d2(s2, x), o ? dd(-0.308425137534042437259529, -1.95698492133633550338345e-17) : dd(0.785398163397448278999491, 3.06287113727155002607105e-17));
  x = ddmul_d2_d2_d2(x, o ? s2 : dd(t, 0));
  x = o ? ddadd2_d2_d2_d(x, 1) : x;

  // Sign flip for the second half of the period (phase-shifted vs sinpik).
  if (((q+2) & 4) != 0) { x.x = -x.x; x.y = -x.y; }
  return x;
}

/* xcospi_u05: cos(pi*d) via the double-double kernel above. */
EXPORT CONST double xcospi_u05(double d) {
  Sleef_double2 x = cospik(d);
  double r = x.x + x.y;
  if (fabsk(d) > TRIGRANGEMAX3/4) r = 1;
  if (xisinf(d)) r = SLEEF_NAN;
  return r;
}

/* xtan: tangent via the half-angle identity tan(d) = 2u/(1-u^2), u = tan(d/2).
   Three-tier argument reduction: Cody–Waite with 2 constants for small args,
   split-high/low Cody–Waite up to 1e6, and full rempi() reduction beyond. */
EXPORT CONST double xtan(double d) {
  double u, s, x, y;
  int ql;

  if (fabsk(d) < TRIGRANGEMAX2) {
    ql = rintk(d * (2 * M_1_PI));
    x = mla(ql, -PI_A2*0.5, d);
    x = mla(ql, -PI_B2*0.5, x);
  } else if (fabsk(d) < 1e+6) {
    // Split the quotient into a high part (multiple of 2^24) and a low part so
    // each mla against the PI_A..PI_D pieces stays exact.
    double dqh = trunck(d * ((2 * M_1_PI) / (1 << 24))) * (double)(1 << 24);
    ql = rintk(d * (2 * M_1_PI) - dqh);
    x = mla(dqh, -PI_A * 0.5, d);
    x = mla( ql, -PI_A * 0.5, x);
    x = mla(dqh, -PI_B * 0.5, x);
    x = mla( ql, -PI_B * 0.5, x);
    x = mla(dqh, -PI_C * 0.5, x);
    x = mla( ql, -PI_C * 0.5, x);
    x = mla(dqh + ql, -PI_D * 0.5, x);
  } else {
    // Huge arguments: Payne–Hanek style reduction.
    ddi_t ddi = rempi(d);
    ql = ddi.i;
    x = ddi.dd.x + ddi.dd.y;
    if (xisinf(d) || xisnan(d)) x = SLEEF_NAN;
  }

  x *= 0.5;                    // half angle
  s = x * x;
  double s2 = s * s, s4 = s2 * s2;
  u = POLY8(s, s2, s4,
            +0.3245098826639276316e-3, +0.5619219738114323735e-3,
            +0.1460781502402784494e-2, +0.3591611540792499519e-2,
            +0.8863268409563113126e-2, +0.2186948728185535498e-1,
            +0.5396825399517272970e-1, +0.1333333333330500581e+0);
  u = mla(u, s, +0.3333333333333343695e+0);
  u = mla(s, u * x, x);        // u ~= tan(x)
  y = mla(u, u, -1);           // y = u^2 - 1
  x = -2 * u;
  if ((ql & 1) != 0) { double t = x; x = y; y = -t; }  // odd quadrant: cotangent form
  u = x / y;
  return u;
}

/* xtan_u1: tangent, double-double variant ("_u1" is SLEEF's ~1-ULP tier naming
   — TODO confirm). Same half-angle scheme as xtan with dd arithmetic. */
EXPORT CONST double xtan_u1(double d) {
  double u;
  Sleef_double2 s, t, x, y;
  int ql;

  if (fabsk(d) < TRIGRANGEMAX2) {
    ql = rintk(d * (2 * M_1_PI));
    u = mla(ql, -PI_A2*0.5, d);
    s = ddadd_d2_d_d(u, ql * (-PI_B2*0.5));
  } else if (fabsk(d) < TRIGRANGEMAX) {
    // High/low split reduction carried in double-double.
    const double dqh = trunck(d * (M_2_PI / (1 << 24))) * (double)(1 << 24);
    s = ddadd2_d2_d2_d(ddmul_d2_d2_d(dd(M_2_PI_H, M_2_PI_L), d), (d < 0 ? -0.5 : 0.5) - dqh);
    ql = s.x + s.y;
    u = mla(dqh, -PI_A*0.5, d);
    s = ddadd_d2_d_d (u, ql * (-PI_A*0.5));
    s = ddadd2_d2_d2_d(s, dqh * (-PI_B*0.5));
    s = ddadd2_d2_d2_d(s, ql * (-PI_B*0.5));
    s = ddadd2_d2_d2_d(s, dqh * (-PI_C*0.5));
    s = ddadd2_d2_d2_d(s, ql * (-PI_C*0.5));
    s = ddadd_d2_d2_d(s, (dqh + ql) * (-PI_D*0.5));
  } else {
    ddi_t ddi = rempi(d);
    ql = ddi.i;
    s = ddi.dd;
    if (xisinf(d) || xisnan(d)) s.x = SLEEF_NAN;
  }

  t = ddscale_d2_d2_d(s, 0.5);
  s = ddsqu_d2_d2(t);
  double s2 = s.x * s.x, s4 = s2 * s2;
  u = POLY8(s.x, s2, s4,
            +0.3245098826639276316e-3, +0.5619219738114323735e-3,
            +0.1460781502402784494e-2, +0.3591611540792499519e-2,
            +0.8863268409563113126e-2, +0.2186948728185535498e-1,
            +0.5396825399517272970e-1, +0.1333333333330500581e+0);
  u = mla(u, s.x, +0.3333333333333343695e+0);
  x = ddadd_d2_d2_d2(t, ddmul_d2_d2_d(ddmul_d2_d2_d2(s, t), u));  // x ~= tan(t) in dd
  y = ddadd_d2_d_d2(-1, ddsqu_d2_d2(x));
  x = ddscale_d2_d2_d(x, -2);
  if ((ql & 1) != 0) { t = x; x = y; y = ddneg_d2_d2(t); }
  x = dddiv_d2_d2_d2(x, y);
  u = x.x + x.y;
  if (xisnegzero(d)) u = d;    // preserve -0.0
  return u;
}

/* xlog: natural logarithm. Denormals are prescaled by 2^64 (o), the argument
   is split as d = m * 2^e with m near 1, and log(m) is evaluated through the
   atanh-style series in x = (m-1)/(m+1). */
EXPORT CONST double xlog(double d) {
  double x, x2, t, m;
  int e;

  int o = d < DBL_MIN;
  if (o) d *= (double)(INT64_C(1) << 32) * (double)(INT64_C(1) << 32);

  e = ilogb2k(d * (1.0/0.75));
  m = ldexp3k(d, -e);
  if (o) e -= 64;              // undo the denormal prescale

  x = (m-1) / (m+1);
  x2 = x * x;
  double x4 = x2 * x2, x8 = x4 * x4;
  t = POLY7(x2, x4, x8,
            0.153487338491425068243146, 0.152519917006351951593857,
            0.181863266251982985677316, 0.222221366518767365905163,
            0.285714294746548025383248, 0.399999999950799600689777,
            0.6666666666667778740063);
  x = x * 2 + 0.693147180559945286226764 * e + x * x2 * t;  // 2*atanh + e*ln2

  if (xisinf(d)) x = SLEEF_INFINITY;
  if (d < 0 || xisnan(d)) x = SLEEF_NAN;
  if (d == 0) x = -SLEEF_INFINITY;
  return x;
}

/* xexp: exponential. Cody–Waite reduction by ln2 (q = round(d/ln2)), degree-12
   polynomial on the remainder, then scale by 2^q. */
EXPORT CONST double xexp(double d) {
  int q = (int)rintk(d * R_LN2);
  double s, u;

  s = mla(q, -L2U, d);
  s = mla(q, -L2L, s);

  double s2 = s * s, s4 = s2 * s2, s8 = s4 * s4;
  u = POLY10(s, s2, s4, s8,
             2.08860621107283687536341e-09, 2.51112930892876518610661e-08,
             2.75573911234900471893338e-07, 2.75572362911928827629423e-06,
             2.4801587159235472998791e-05, 0.000198412698960509205564975,
             0.00138888888889774492207962, 0.00833333333331652721664984,
             0.0416666666666665047591422, 0.166666666666666851703837);
  u = mla(u, s, +0.5);
  u = s * s * u + s + 1;
  u = ldexp2k(u, q);

  if (d > 709.78271114955742909217217426) u = SLEEF_INFINITY;  // overflow threshold ~ log(DBL_MAX)
  if (d < -1000) u = 0;
  return u;
}

/* expm1k: exp(d) - 1 kernel; shares xexp's reduction/polynomial but keeps the
   "- 1" exact when q != 0 by scaling u + 1 and subtracting 1 afterwards. */
static INLINE CONST double expm1k(double d) {
  int q = (int)rintk(d * R_LN2);
  double s, u;

  s = mla(q, -L2U, d);
  s = mla(q, -L2L, s);

  double s2 = s * s, s4 = s2 * s2, s8 = s4 * s4;
  u = POLY10(s, s2, s4, s8,
             2.08860621107283687536341e-09, 2.51112930892876518610661e-08,
             2.75573911234900471893338e-07, 2.75572362911928827629423e-06,
             2.4801587159235472998791e-05, 0.000198412698960509205564975,
             0.00138888888889774492207962, 0.00833333333331652721664984,
             0.0416666666666665047591422, 0.166666666666666851703837);
  u = mla(s2, 0.5, s2 * s * u) + s;

  if (q != 0) u = ldexp2k(u + 1, q) - 1;
  return u;
}

/* logk: log(d) in full double-double precision; used by xpow and the inverse
   hyperbolics. Same m * 2^e split as xlog, dd series in (m-1)/(m+1).
   Body continues on the next chunk line. */
static INLINE CONST Sleef_double2 logk(double d) {
  Sleef_double2 x, x2, s;
  double m, t;
  int e;

  int o = d < DBL_MIN;
  if (o) d *= (double)(INT64_C(1) << 32) * (double)(INT64_C(1) << 32);

  e = ilogb2k(d * (1.0/0.75));
  m = ldexp3k(d, -e);
  if (o) e -= 64;

  x = dddiv_d2_d2_d2(ddadd2_d2_d_d(-1, m), ddadd2_d2_d_d(1, m));
  x2 = ddsqu_d2_d2(x);

  double x4 = x2.x * x2.x, x8 = x4 * x4, x16 = x8 * x8;
  t = POLY9(x2.x, x4, x8, x16,
            0.116255524079935043668677, 0.103239680901072952701192,
            0.117754809412463995466069,
0.13332981086846273921509, 0.153846227114512262845736,
            0.181818180850050775676507, 0.222222222230083560345903,
            0.285714285714249172087875, 0.400000000000000077715612);
  // Low-order terms carried in double-double: s = e*ln2 + 2x + x^3*c + x^5*t.
  Sleef_double2 c = dd(0.666666666666666629659233, 3.80554962542412056336616e-17);
  s = ddmul_d2_d2_d(dd(0.693147180559945286226764, 2.319046813846299558417771e-17), e);
  s = ddadd_d2_d2_d2(s, ddscale_d2_d2_d(x, 2));
  x = ddmul_d2_d2_d2(x2, x);
  s = ddadd_d2_d2_d2(s, ddmul_d2_d2_d2(x, c));
  x = ddmul_d2_d2_d2(x2, x);
  s = ddadd_d2_d2_d2(s, ddmul_d2_d2_d(x, t));
  return s;
}

/* xlog_u1: natural logarithm, double-double tier. High-order series terms in
   plain doubles, low-order terms accumulated in double-double. */
EXPORT CONST double xlog_u1(double d) {
  Sleef_double2 x, s;
  double m, t, x2;
  int e;

  int o = d < DBL_MIN;
  if (o) d *= (double)(INT64_C(1) << 32) * (double)(INT64_C(1) << 32);

  e = ilogb2k(d * (1.0/0.75));
  m = ldexp3k(d, -e);
  if (o) e -= 64;

  x = dddiv_d2_d2_d2(ddadd2_d2_d_d(-1, m), ddadd2_d2_d_d(1, m));
  x2 = x.x * x.x;
  double x4 = x2 * x2, x8 = x4 * x4;
  t = POLY7(x2, x4, x8,
            0.1532076988502701353e+0, 0.1525629051003428716e+0,
            0.1818605932937785996e+0, 0.2222214519839380009e+0,
            0.2857142932794299317e+0, 0.3999999999635251990e+0,
            0.6666666666667333541e+0);
  s = ddmul_d2_d2_d(dd(0.693147180559945286226764, 2.319046813846299558417771e-17), (double)e);
  s = ddadd_d2_d2_d2(s, ddscale_d2_d2_d(x, 2));
  s = ddadd_d2_d2_d(s, x2 * x.x * t);

  double r = s.x + s.y;
  if (xisinf(d)) r = SLEEF_INFINITY;
  if (d < 0 || xisnan(d)) r = SLEEF_NAN;
  if (d == 0) r = -SLEEF_INFINITY;
  return r;
}

/* expk: exp of a double-double argument, returning a plain double; counterpart
   of logk, used by xpow. */
static INLINE CONST double expk(Sleef_double2 d) {
  int q = (int)rintk((d.x + d.y) * R_LN2);
  Sleef_double2 s, t;
  double u;

  s = ddadd2_d2_d2_d(d, q * -L2U);
  s = ddadd2_d2_d2_d(s, q * -L2L);
  s = ddnormalize_d2_d2(s);

  double s2 = s.x * s.x, s4 = s2 * s2, s8 = s4 * s4;
  u = POLY10(s.x, s2, s4, s8,
             2.51069683420950419527139e-08, 2.76286166770270649116855e-07,
             2.75572496725023574143864e-06, 2.48014973989819794114153e-05,
             0.000198412698809069797676111, 0.0013888888939977128960529,
             0.00833333333332371417601081, 0.0416666666665409524128449,
             0.166666666666666740681535, 0.500000000000000999200722);
  t = ddadd_d2_d_d2(1, s);
  t = ddadd_d2_d2_d2(t, ddmul_d2_d2_d(ddsqu_d2_d2(s), u));
  u = ldexpk(t.x + t.y, q);

  if (d.x < -1000) u = 0;
  return u;
}

/* xpow: x^y computed as expk(y * logk(|x|)) with explicit IEEE-754 special
   cases: integer/odd-y sign rules, infinities, zeros, NaNs, and y==0 / x==1. */
EXPORT CONST double xpow(double x, double y) {
  int yisint = xisint(y);
  int yisodd = yisint && xisodd(y);

  Sleef_double2 d = ddmul_d2_d2_d(logk(fabsk(x)), y);
  double result = expk(d);
  if (d.x > 709.78271114955742909217217426) result = SLEEF_INFINITY;

  result = xisnan(result) ? SLEEF_INFINITY : result;
  // Sign: negative base is only valid for integer exponents (odd -> negative).
  result *= (x > 0 ? 1 : (!yisint ? SLEEF_NAN : (yisodd ? -1 : 1)));

  double efx = mulsign(fabsk(x) - 1, y);
  if (xisinf(y)) result = efx < 0 ? 0.0 : (efx == 0 ? 1.0 : SLEEF_INFINITY);
  if (xisinf(x) || x == 0) result = (yisodd ? sign(x) : 1) * ((x == 0 ? -y : y) < 0 ? 0 : SLEEF_INFINITY);
  if (xisnan(x) || xisnan(y)) result = SLEEF_NAN;
  if (y == 0 || x == 1) result = 1;

  return result;
}

/* expk2: exp of a double-double argument returned as a double-double; used by
   the hyperbolic functions and expm1/tgamma. */
static INLINE CONST Sleef_double2 expk2(Sleef_double2 d) {
  int q = (int)rintk((d.x + d.y) * R_LN2);
  Sleef_double2 s, t;
  double u;

  s = ddadd2_d2_d2_d(d, q * -L2U);
  s = ddadd2_d2_d2_d(s, q * -L2L);

  u = +0.1602472219709932072e-9;
  u = mla(u, s.x, +0.2092255183563157007e-8);
  u = mla(u, s.x, +0.2505230023782644465e-7);
  u = mla(u, s.x, +0.2755724800902135303e-6);
  u = mla(u, s.x, +0.2755731892386044373e-5);
  u = mla(u, s.x, +0.2480158735605815065e-4);
  u = mla(u, s.x, +0.1984126984148071858e-3);
  u = mla(u, s.x, +0.1388888888886763255e-2);
  u = mla(u, s.x, +0.8333333333333347095e-2);
  u = mla(u, s.x, +0.4166666666666669905e-1);

  t = ddadd2_d2_d2_d(ddmul_d2_d2_d(s, u), +0.1666666666666666574e+0);
  t = ddadd2_d2_d2_d(ddmul_d2_d2_d2(s, t), 0.5);
  t = ddadd2_d2_d2_d2(s, ddmul_d2_d2_d2(ddsqu_d2_d2(s), t));
  t = ddadd2_d2_d_d2(1, t);

  t.x = ldexp2k(t.x, q);
  t.y = ldexp2k(t.y, q);

  return d.x < -1000 ? dd(0, 0) : t;
}

/* xsinh: sinh(x) = (e^|x| - e^-|x|)/2 in double-double, sign restored at end. */
EXPORT CONST double xsinh(double x) {
  double y = fabsk(x);
  Sleef_double2 d = expk2(dd(y, 0));
  d = ddsub_d2_d2_d2(d, ddrec_d2_d2(d));
  y = (d.x + d.y) * 0.5;

  y = fabsk(x) > 710 ? SLEEF_INFINITY : y;   // e^710 overflows double
  y = xisnan(y) ? SLEEF_INFINITY : y;
  y = mulsign(y, x);
  y = xisnan(x) ? SLEEF_NAN : y;

  return y;
}

/* xcosh: cosh(x) = (e^|x| + e^-|x|)/2 in double-double. */
EXPORT CONST double xcosh(double x) {
  double y = fabsk(x);
  Sleef_double2 d = expk2(dd(y, 0));
  d = ddadd_d2_d2_d2(d, ddrec_d2_d2(d));
  y = (d.x + d.y) * 0.5;

  y = fabsk(x) > 710 ? SLEEF_INFINITY : y;
  y = xisnan(y) ? SLEEF_INFINITY : y;
  y = xisnan(x) ? SLEEF_NAN : y;

  return y;
}

/* xtanh: tanh(x) = (e^x - e^-x)/(e^x + e^-x); saturates to +-1 past ~18.715. */
EXPORT CONST double xtanh(double x) {
  double y = fabsk(x);
  Sleef_double2 d = expk2(dd(y, 0));
  Sleef_double2 e = ddrec_d2_d2(d);
  d = dddiv_d2_d2_d2(ddsub_d2_d2_d2(d, e), ddadd_d2_d2_d2(d, e));
  y = d.x + d.y;

  y = fabsk(x) > 18.714973875 ? 1.0 : y;
  y = xisnan(y) ? 1.0 : y;
  y = mulsign(y, x);
  y = xisnan(x) ? SLEEF_NAN : y;

  return y;
}

/* xsinh_u35: faster sinh via expm1k: sinh = (e+2)/(e+1) * e/2, e = e^|x| - 1. */
EXPORT CONST double xsinh_u35(double x) {
  double e = expm1k(fabsk(x));
  double y = (e + 2) / (e + 1) * (0.5 * e);

  y = fabsk(x) > 709 ? SLEEF_INFINITY : y;
  y = xisnan(y) ? SLEEF_INFINITY : y;
  y = mulsign(y, x);
  y = xisnan(x) ? SLEEF_NAN : y;

  return y;
}

/* xcosh_u35: faster cosh directly from xexp: (1/e + e)/2. */
EXPORT CONST double xcosh_u35(double x) {
  double e = xexp(fabsk(x));
  double y = 0.5 / e + 0.5 * e;

  y = fabsk(x) > 709 ? SLEEF_INFINITY : y;
  y = xisnan(y) ? SLEEF_INFINITY : y;
  y = xisnan(x) ? SLEEF_NAN : y;

  return y;
}

/* xtanh_u35: faster tanh via expm1k: d/(d+2) with d = e^(2|x|) - 1. */
EXPORT CONST double xtanh_u35(double x) {
  double y = fabsk(x);
  double d = expm1k(2*y);
  y = d / (d + 2);

  y = fabsk(x) > 18.714973875 ? 1.0 : y;
  y = xisnan(y) ? 1.0 : y;
  y = mulsign(y, x);
  y = xisnan(x) ?
    SLEEF_NAN : y;

  return y;
}

/* logk2: log of a double-double argument, double-double result; used by the
   inverse hyperbolics and lgamma. Same (m-1)/(m+1) series as logk. */
static INLINE CONST Sleef_double2 logk2(Sleef_double2 d) {
  Sleef_double2 x, x2, m, s;
  double t;
  int e;

  e = ilogbk(d.x * (1.0/0.75));

  m.x = ldexp2k(d.x, -e);
  m.y = ldexp2k(d.y, -e);

  x = dddiv_d2_d2_d2(ddadd2_d2_d2_d(m, -1), ddadd2_d2_d2_d(m, 1));
  x2 = ddsqu_d2_d2(x);

  double x4 = x2.x * x2.x, x8 = x4 * x4;
  t = POLY7(x2.x, x4, x8,
            0.13860436390467167910856, 0.131699838841615374240845,
            0.153914168346271945653214, 0.181816523941564611721589,
            0.22222224632662035403996, 0.285714285511134091777308,
            0.400000000000914013309483);
  t = mla(t, x2.x, 0.666666666666664853302393);

  s = ddmul_d2_d2_d(dd(0.693147180559945286226764, 2.319046813846299558417771e-17), e);
  s = ddadd_d2_d2_d2(s, ddscale_d2_d2_d(x, 2));
  s = ddadd_d2_d2_d2(s, ddmul_d2_d2_d(ddmul_d2_d2_d2(x2, x), t));

  return s;
}

/* xasinh: asinh(x) = log(x + sqrt(x^2+1)), with a 1/x reformulation for |x|>1
   to avoid cancellation, all in double-double. */
EXPORT CONST double xasinh(double x) {
  double y = fabsk(x);
  Sleef_double2 d;

  d = y > 1 ? ddrec_d2_d(x) : dd(y, 0);
  d = ddsqrt_d2_d2(ddadd2_d2_d2_d(ddsqu_d2_d2(d), 1));
  d = y > 1 ? ddmul_d2_d2_d(d, y) : d;

  d = logk2(ddnormalize_d2_d2(ddadd_d2_d2_d(d, x)));
  y = d.x + d.y;

  y = (fabsk(x) > SQRT_DBL_MAX || xisnan(y)) ? mulsign(SLEEF_INFINITY, x) : y;
  y = xisnan(x) ? SLEEF_NAN : y;
  y = xisnegzero(x) ? -0.0 : y;

  return y;
}

/* xacosh: acosh(x) = log(x + sqrt(x-1)*sqrt(x+1)); NaN for x < 1. */
EXPORT CONST double xacosh(double x) {
  Sleef_double2 d = logk2(ddadd2_d2_d2_d(ddmul_d2_d2_d2(ddsqrt_d2_d2(ddadd2_d2_d_d(x, 1)), ddsqrt_d2_d2(ddadd2_d2_d_d(x, -1))), x));
  double y = d.x + d.y;

  y = (x > SQRT_DBL_MAX || xisnan(y)) ? SLEEF_INFINITY : y;
  y = x == 1.0 ? 0.0 : y;
  y = x < 1.0 ? SLEEF_NAN : y;
  y = xisnan(x) ? SLEEF_NAN : y;

  return y;
}

/* xatanh: atanh(x) = log((1+x)/(1-x))/2 on |x|; sign restored afterwards.
   Tail of this function continues on the next chunk line. */
EXPORT CONST double xatanh(double x) {
  double y = fabsk(x);
  Sleef_double2 d = logk2(dddiv_d2_d2_d2(ddadd2_d2_d_d(1, y), ddadd2_d2_d_d(1, -y)));
  y = y > 1.0 ? SLEEF_NAN : (y == 1.0 ? SLEEF_INFINITY : (d.x + d.y) * 0.5);

  y = mulsign(y, x);
  y = (xisinf(x) || xisnan(y)) ?
SLEEF_NAN : y;
  return y;
}

/* xcbrt: cube root, max error 2 ulps (per the original comment). Splits the
   exponent into a multiple of 3 plus a remainder r, seeds with a degree-5
   polynomial approximation of d^(-2/3), then applies Newton steps. */
EXPORT CONST double xcbrt(double d) { // max error : 2 ulps
  double x, y, q = 1.0;
  int e, r;

  e = ilogbk(fabsk(d))+1;
  d = ldexp2k(d, -e);
  r = (e + 6144) % 3;          // +6144 keeps the operands of % non-negative
  q = (r == 1) ? 1.2599210498948731647672106 : q;   // 2^(1/3)
  q = (r == 2) ? 1.5874010519681994747517056 : q;   // 2^(2/3)
  q = ldexp2k(q, (e + 6144) / 3 - 2048);

  q = mulsign(q, d);
  d = fabsk(d);

  // Initial approximation polynomial.
  x = -0.640245898480692909870982;
  x = mla(x, d, 2.96155103020039511818595);
  x = mla(x, d, -5.73353060922947843636166);
  x = mla(x, d, 6.03990368989458747961407);
  x = mla(x, d, -3.85841935510444988821632);
  x = mla(x, d, 2.2307275302496609725722);

  // Newton refinement.
  y = x * x; y = y * y; x -= (d * y - x) * (1.0 / 3.0);
  y = d * x * x;
  y = (y - (2.0 / 3.0) * y * (y * x - 1)) * q;
  return y;
}

/* xcbrt_u1: cube root, double-double tier; same seed polynomial, final Newton
   step carried in double-double. */
EXPORT CONST double xcbrt_u1(double d) {
  double x, y, z;
  Sleef_double2 q2 = dd(1, 0), u, v;
  int e, r;

  e = ilogbk(fabsk(d))+1;
  d = ldexp2k(d, -e);
  r = (e + 6144) % 3;
  q2 = (r == 1) ? dd(1.2599210498948731907, -2.5899333753005069177e-17) : q2;
  q2 = (r == 2) ? dd(1.5874010519681995834, -1.0869008194197822986e-16) : q2;

  q2.x = mulsign(q2.x, d);
  q2.y = mulsign(q2.y, d);
  d = fabsk(d);

  x = -0.640245898480692909870982;
  x = mla(x, d, 2.96155103020039511818595);
  x = mla(x, d, -5.73353060922947843636166);
  x = mla(x, d, 6.03990368989458747961407);
  x = mla(x, d, -3.85841935510444988821632);
  x = mla(x, d, 2.2307275302496609725722);

  y = x * x; y = y * y; x -= (d * y - x) * (1.0 / 3.0);

  z = x;

  u = ddmul_d2_d_d(x, x);
  u = ddmul_d2_d2_d2(u, u);
  u = ddmul_d2_d2_d(u, d);
  u = ddadd2_d2_d2_d(u, -x);
  y = u.x + u.y;

  y = -2.0 / 3.0 * y * z;
  v = ddadd2_d2_d2_d(ddmul_d2_d_d(z, z), y);
  v = ddmul_d2_d2_d(v, d);
  v = ddmul_d2_d2_d2(v, q2);
  z = ldexp2k(v.x + v.y, (e + 6144) / 3 - 2048);

  if (xisinf(d)) { z = mulsign(SLEEF_INFINITY, q2.x); }
  if (d == 0) { z = mulsign(0, q2.x); }

  return z;
}

/* xexp2: 2^d. Integer/fraction split, polynomial in the fraction, dd-corrected
   final add of 1, scale by 2^q. */
EXPORT CONST double xexp2(double d) {
  int q = (int)rintk(d);
  double s, u;

  s = d - q;

  double s2 = s * s, s4 = s2 * s2, s8 = s4 * s4;
  u = POLY10(s, s2, s4, s8,
             +0.4434359082926529454e-9, +0.7073164598085707425e-8,
             +0.1017819260921760451e-6, +0.1321543872511327615e-5,
             +0.1525273353517584730e-4, +0.1540353045101147808e-3,
             +0.1333355814670499073e-2, +0.9618129107597600536e-2,
             +0.5550410866482046596e-1, +0.2402265069591012214e+0);
  u = mla(u, s, +0.6931471805599452862e+0);
  u = ddnormalize_d2_d2(ddadd_d2_d_d2(1, ddmul_d2_d_d(u, s))).x;
  u = ldexp2k(u, q);

  if (d >= 1024) u = SLEEF_INFINITY;
  if (d < -2000) u = 0;

  return u;
}

/* xexp2_u35: 2^d, faster tier — pure mla chain, no double-double correction. */
EXPORT CONST double xexp2_u35(double d) {
  int q = (int)rintk(d);
  double s, u;

  s = d - q;

  u = +0.4434359082926529454e-9;
  u = mla(u, s, +0.7073164598085707425e-8);
  u = mla(u, s, +0.1017819260921760451e-6);
  u = mla(u, s, +0.1321543872511327615e-5);
  u = mla(u, s, +0.1525273353517584730e-4);
  u = mla(u, s, +0.1540353045101147808e-3);
  u = mla(u, s, +0.1333355814670499073e-2);
  u = mla(u, s, +0.9618129107597600536e-2);
  u = mla(u, s, +0.5550410866482046596e-1);
  u = mla(u, s, +0.2402265069591012214e+0);
  u = mla(u, s, +0.6931471805599452862e+0);
  u = mla(u, s, +0.1000000000000000000e+1);
  u = ldexp2k(u, q);

  if (d >= 1024) u = SLEEF_INFINITY;
  if (d < -2000) u = 0;

  return u;
}

/* xexp10: 10^d. Cody–Waite reduction by log10(2) (L10U/L10L), polynomial in
   the remainder, dd-corrected final add of 1. */
EXPORT CONST double xexp10(double d) {
  int q = (int)rintk(d * LOG10_2);
  double s, u;

  s = mla(q, -L10U, d);
  s = mla(q, -L10L, s);

  u = +0.2411463498334267652e-3;
  u = mla(u, s, +0.1157488415217187375e-2);
  u = mla(u, s, +0.5013975546789733659e-2);
  u = mla(u, s, +0.1959762320720533080e-1);
  u = mla(u, s, +0.6808936399446784138e-1);
  u = mla(u, s, +0.2069958494722676234e+0);
  u = mla(u, s, +0.5393829292058536229e+0);
  u = mla(u, s, +0.1171255148908541655e+1);
  u = mla(u, s, +0.2034678592293432953e+1);
  u = mla(u, s, +0.2650949055239205876e+1);
  u = mla(u, s, +0.2302585092994045901e+1);
  u = ddnormalize_d2_d2(ddadd_d2_d_d2(1, ddmul_d2_d_d(u, s))).x;
  u = ldexp2k(u, q);

  if (d > 308.25471555991671) u = SLEEF_INFINITY; // log10(DBL_MAX)
  if (d < -350) u = 0;

  return u;
}

/* xexp10_u35: 10^d, faster tier — same coefficients, plain mla finish. */
EXPORT CONST double xexp10_u35(double d) {
  int q = (int)rintk(d * LOG10_2);
  double s, u;

  s = mla(q, -L10U, d);
  s = mla(q, -L10L, s);

  u = +0.2411463498334267652e-3;
  u = mla(u, s, +0.1157488415217187375e-2);
  u = mla(u, s, +0.5013975546789733659e-2);
  u = mla(u, s, +0.1959762320720533080e-1);
  u = mla(u, s, +0.6808936399446784138e-1);
  u = mla(u, s, +0.2069958494722676234e+0);
  u = mla(u, s, +0.5393829292058536229e+0);
  u = mla(u, s, +0.1171255148908541655e+1);
  u = mla(u, s, +0.2034678592293432953e+1);
  u = mla(u, s, +0.2650949055239205876e+1);
  u = mla(u, s, +0.2302585092994045901e+1);
  u = mla(u, s, +0.1000000000000000000e+1);
  u = ldexp2k(u, q);

  if (d > 308.25471555991671) u = SLEEF_INFINITY;
  if (d < -350) u = 0;

  return u;
}

/* xexpm1: exp(a) - 1 via the double-double expk2 kernel. */
EXPORT CONST double xexpm1(double a) {
  Sleef_double2 d = ddadd2_d2_d2_d(expk2(dd(a, 0)), -1.0);
  double x = d.x + d.y;
  if (a > 709.782712893383996732223) x = SLEEF_INFINITY; // log(DBL_MAX)
  if (a < -36.736800569677101399113302437) x = -1; // log(1 - nexttoward(1, 0))
  if (xisnegzero(a)) x = -0.0;
  return x;
}

/* xlog10: base-10 logarithm; same structure as xlog_u1 with log10 constants
   (ln(2)/ln(10) and 2/ln(10) carried in double-double). */
EXPORT CONST double xlog10(double d) {
  Sleef_double2 x, s;
  double m, t, x2;
  int e;

  int o = d < DBL_MIN;
  if (o) d *= (double)(INT64_C(1) << 32) * (double)(INT64_C(1) << 32);

  e = ilogb2k(d * (1.0/0.75));
  m = ldexp3k(d, -e);
  if (o) e -= 64;

  x = dddiv_d2_d2_d2(ddadd2_d2_d_d(-1, m), ddadd2_d2_d_d(1, m));
  x2 = x.x * x.x;
  double x4 = x2 * x2, x8 = x4 * x4;
  t = POLY7(x2, x4, x8,
            +0.6653725819576758460e-1, +0.6625722782820833712e-1,
            +0.7898105214313944078e-1, +0.9650955035715275132e-1,
            +0.1240841409721444993e+0, +0.1737177927454605086e+0,
            +0.2895296546021972617e+0);
  s = ddmul_d2_d2_d(dd(0.30102999566398119802, -2.803728127785170339e-18), (double)e);
  s = ddadd_d2_d2_d2(s, ddmul_d2_d2_d2(x, dd(0.86858896380650363334, 1.1430059694096389311e-17)));
  s = ddadd_d2_d2_d(s, x2 * x.x * t);

  double r = s.x + s.y;
  if (xisinf(d)) r = SLEEF_INFINITY;
  if (d < 0 || xisnan(d)) r = SLEEF_NAN;
  if (d == 0) r = -SLEEF_INFINITY;
  return r;
}

/* xlog2: base-2 logarithm, double-double tier; e is exact, 2/ln(2) in dd. */
EXPORT CONST double xlog2(double d) {
  Sleef_double2 x, s;
  double m, t, x2;
  int e;

  int o = d < DBL_MIN;
  if (o) d *= (double)(INT64_C(1) << 32) * (double)(INT64_C(1) << 32);

  e = ilogb2k(d * (1.0/0.75));
  m = ldexp3k(d, -e);
  if (o) e -= 64;

  x = dddiv_d2_d2_d2(ddadd2_d2_d_d(-1, m), ddadd2_d2_d_d(1, m));
  x2 = x.x * x.x;
  double x4 = x2 * x2, x8 = x4 * x4;
  t = POLY7(x2, x4, x8,
            +0.2211941750456081490e+0, +0.2200768693152277689e+0,
            +0.2623708057488514656e+0, +0.3205977477944495502e+0,
            +0.4121985945485324709e+0, +0.5770780162997058982e+0,
            +0.96179669392608091449);
  s = ddadd2_d2_d_d2(e, ddmul_d2_d2_d2(x, dd(2.885390081777926774, 6.0561604995516736434e-18)));
  s = ddadd2_d2_d2_d(s, x2 * x.x * t);

  double r = s.x + s.y;
  if (xisinf(d)) r = SLEEF_INFINITY;
  if (d < 0 || xisnan(d)) r = SLEEF_NAN;
  if (d == 0) r = -SLEEF_INFINITY;
  return r;
}

/* xlog2_u35: base-2 logarithm, faster tier — plain-double series, dd only for
   the leading e + x*2/ln(2) term. */
EXPORT CONST double xlog2_u35(double d) {
  double m, t, x, x2;
  int e;

  int o = d < DBL_MIN;
  if (o) d *= (double)(INT64_C(1) << 32) * (double)(INT64_C(1) << 32);

  e = ilogb2k(d * (1.0/0.75));
  m = ldexp3k(d, -e);
  if (o) e -= 64;

  x = (m - 1) / (m + 1);
  x2 = x * x;

  t = +0.2211941750456081490e+0;
  t = mla(t, x2, +0.2200768693152277689e+0);
  t = mla(t, x2, +0.2623708057488514656e+0);
  t = mla(t, x2, +0.3205977477944495502e+0);
  t = mla(t, x2, +0.4121985945485324709e+0);
  t = mla(t, x2, +0.5770780162997058982e+0);
  t = mla(t, x2, +0.96179669392608091449 );

  Sleef_double2 s = ddadd_d2_d_d2(e, ddmul_d2_d_d(2.885390081777926774, x));
  double r = mla(t, x * x2, s.x + s.y);

  if (xisinf(d)) r = SLEEF_INFINITY;
  if (d < 0 || xisnan(d)) r = SLEEF_NAN;
  if (d == 0) r = -SLEEF_INFINITY;
  return r;
}

/* xlog1p: log(1+d). Splits 1+d as m * 2^e with m = d*t + (t-1) computed so the
   "+1" is exact; series evaluated on m/(2+m). */
EXPORT CONST double xlog1p(double d) {
  Sleef_double2 x, s;
  double m, t, x2;
  int e;

  double dp1 = d + 1;

  int o = dp1 < DBL_MIN;
  if (o) dp1 *= (double)(INT64_C(1) << 32) * (double)(INT64_C(1) << 32);

  e = ilogb2k(dp1 * (1.0/0.75));

  t = ldexp3k(1, -e);
  m = mla(d, t, t - 1);

  if (o) e -= 64;

  x = dddiv_d2_d2_d2(dd(m, 0), ddadd_d2_d_d(2, m));
  x2 = x.x * x.x;
  double x4 = x2 * x2, x8 = x4 * x4;
  t = POLY7(x2, x4, x8,
            0.1532076988502701353e+0, 0.1525629051003428716e+0,
            0.1818605932937785996e+0, 0.2222214519839380009e+0,
            0.2857142932794299317e+0, 0.3999999999635251990e+0,
            0.6666666666667333541e+0);
  s = ddmul_d2_d2_d(dd(0.693147180559945286226764, 2.319046813846299558417771e-17), (double)e);
  s = ddadd_d2_d2_d2(s, ddscale_d2_d2_d(x, 2));
  s = ddadd_d2_d2_d(s, x2 * x.x * t);

  double r = s.x + s.y;
  if (d > 1e+307) r = SLEEF_INFINITY;
  if (d < -1 || xisnan(d)) r = SLEEF_NAN;
  if (d == -1) r = -SLEEF_INFINITY;
  if (xisnegzero(d)) r = -0.0;
  return r;
}

/* xfma: fused multiply-add emulated via double-double. Pre-scales operands
   when x*y+z is near under/overflow so the dd product stays in range, and
   defers to the hardware result h2 for inf/NaN cases. */
EXPORT CONST double xfma(double x, double y, double z) {
  double h2 = x * y + z, q = 1;
  if (fabsk(h2) < 1e-300) {       // near underflow: scale up, compensate via q
    const double c0 = UINT64_C(1) << 54, c1 = c0 * c0, c2 = c1 * c1;
    x *= c1;
    y *= c1;
    z *= c2;
    q = 1.0 / c2;
  }
  if (fabsk(h2) > 1e+299) {       // near overflow: scale down, compensate via q
    const double c0 = UINT64_C(1) << 54, c1 = c0 * c0, c2 = c1 * c1;
    x *= 1.0 / c1;
    y *= 1.0 / c1;
    z *= 1. / c2;
    q = c2;
  }
  Sleef_double2 d = ddmul_d2_d_d(x, y);
  d = ddadd2_d2_d2_d(d, z);
  double ret = (x == 0 || y == 0) ? z : (d.x + d.y);
  if ((xisinf(z) && !xisinf(x) && !xisnan(x) && !xisinf(y) && !xisnan(y))) h2 = z;
  return (xisinf(h2) || xisnan(h2)) ? h2 : ret*q;
}

/* xsqrt_u05: square root. Rescales extreme inputs, seeds with the classic
   bit-trick inverse-sqrt estimate, three Newton steps, then one double-double
   correction. Tail continues on the next chunk line. */
EXPORT CONST double xsqrt_u05(double d) {
  double q = 0.5;

  d = d < 0 ? SLEEF_NAN : d;

  if (d < 8.636168555094445E-78) {   // subnormal range: scale by 2^256
    d *= 1.157920892373162E77;
    q = 2.9387358770557188E-39 * 0.5;
  }

  if (d > 1.3407807929942597e+154) { // huge range: scale down
    d *= 7.4583407312002070e-155;
    q = 1.1579208923731620e+77 * 0.5;
  }

  // http://en.wikipedia.org/wiki/Fast_inverse_square_root
  double x = longBitsToDouble(0x5fe6ec85e7de30da - (doubleToRawLongBits(d + 1e-320) >> 1));

  x = x * (1.5 - 0.5 * d * x * x);
  x = x * (1.5 - 0.5 * d * x * x);
  x = x * (1.5 - 0.5 * d * x * x) * d;

  Sleef_double2 d2 = ddmul_d2_d2_d2(ddadd2_d2_d_d2(d, ddmul_d2_d_d(x, x)), ddrec_d2_d(x));

  double ret = (d2.x + d2.y) * q;

  ret = d == SLEEF_INFINITY ? SLEEF_INFINITY : ret;
  ret = d == 0 ?
d : ret;                     // d == 0 (incl. -0): return d itself

  return ret;
}

/* xsqrt_u35: alias of the u05 implementation. */
EXPORT CONST double xsqrt_u35(double d) { return xsqrt_u05(d); }

/* xsqrt: delegate to the platform sqrt instruction/intrinsic. */
EXPORT CONST double xsqrt(double d) { return SQRT(d); }

/* xfabs / xcopysign: thin wrappers over the bit-level helpers. */
EXPORT CONST double xfabs(double x) { return fabsk(x); }

EXPORT CONST double xcopysign(double x, double y) { return copysignk(x, y); }

/* xfmax / xfmin: C99 fmax/fmin semantics — if one operand is NaN, return the
   other (the y != y test catches NaN in y; NaN in x falls through the compare). */
EXPORT CONST double xfmax(double x, double y) {
  return y != y ? x : (x > y ? x : y);
}

EXPORT CONST double xfmin(double x, double y) {
  return y != y ? x : (x < y ? x : y);
}

/* xfdim: positive difference, fdim semantics: max(x - y, 0). */
EXPORT CONST double xfdim(double x, double y) {
  double ret = x - y;
  if (ret < 0 || x == y) ret = 0;
  return ret;
}

/* xtrunc: truncate toward zero. The 2^31 split handles magnitudes beyond the
   int32 range; values >= 2^52 are already integral. */
EXPORT CONST double xtrunc(double x) {
  double fr = x - (double)(INT64_C(1) << 31) * (int32_t)(x * (1.0 / (INT64_C(1) << 31)));
  fr = fr - (int32_t)fr;
  return (xisinf(x) || fabsk(x) >= (double)(INT64_C(1) << 52)) ? x : copysignk(x - fr, x);
}

/* xfloor: round toward -infinity, same 2^31/2^52 decomposition as xtrunc. */
EXPORT CONST double xfloor(double x) {
  double fr = x - (double)(INT64_C(1) << 31) * (int32_t)(x * (1.0 / (INT64_C(1) << 31)));
  fr = fr - (int32_t)fr;
  fr = fr < 0 ? fr+1.0 : fr;
  return (xisinf(x) || fabsk(x) >= (double)(INT64_C(1) << 52)) ? x : copysignk(x - fr, x);
}

/* xceil: round toward +infinity. */
EXPORT CONST double xceil(double x) {
  double fr = x - (double)(INT64_C(1) << 31) * (int32_t)(x * (1.0 / (INT64_C(1) << 31)));
  fr = fr - (int32_t)fr;
  fr = fr <= 0 ? fr : fr-1.0;
  return (xisinf(x) || fabsk(x) >= (double)(INT64_C(1) << 52)) ? x : copysignk(x - fr, x);
}

/* xround: round half away from zero; the nextafter(0.5, 0) guard keeps the
   largest value below 0.5 from rounding up through the d + 0.5 bias. */
EXPORT CONST double xround(double d) {
  double x = d + 0.5;
  double fr = x - (double)(INT64_C(1) << 31) * (int32_t)(x * (1.0 / (INT64_C(1) << 31)));
  fr = fr - (int32_t)fr;
  if (fr == 0 && x <= 0) x--;
  fr = fr < 0 ? fr+1.0 : fr;
  x = d == 0.49999999999999994449 ? 0 : x;  // nextafter(0.5, 0)
  return (xisinf(d) || fabsk(d) >= (double)(INT64_C(1) << 52)) ? d : copysignk(x - fr, d);
}

/* xrint: round to nearest even via the add-then-subtract-2^52 trick; orsign
   restores the sign of zero results. */
EXPORT CONST double xrint(double d) {
  double c = mulsign(INT64_C(1) << 52, d);
  return fabsk(d) > INT64_C(1) << 52 ?
    d : orsign(d + c - c, d);
}

/* xhypot_u05: sqrt(x^2 + y^2) without intermediate overflow/underflow —
   computed as max * sqrt(1 + (min/max)^2) in double-double, with a subnormal
   prescale by 2^54. */
EXPORT CONST double xhypot_u05(double x, double y) {
  x = fabsk(x);
  y = fabsk(y);
  double min = fmink(x, y), n = min;
  double max = fmaxk(x, y), d = max;

  if (max < DBL_MIN) { n *= UINT64_C(1) << 54; d *= UINT64_C(1) << 54; }
  Sleef_double2 t = dddiv_d2_d2_d2(dd(n, 0), dd(d, 0));
  t = ddmul_d2_d2_d(ddsqrt_d2_d2(ddadd2_d2_d2_d(ddsqu_d2_d2(t), 1)), max);
  double ret = t.x + t.y;
  if (xisnan(ret)) ret = SLEEF_INFINITY;
  if (min == 0) ret = max;
  if (xisnan(x) || xisnan(y)) ret = SLEEF_NAN;
  if (x == SLEEF_INFINITY || y == SLEEF_INFINITY) ret = SLEEF_INFINITY;
  return ret;
}

/* xhypot_u35: faster hypot, plain-double max * sqrt(1 + (min/max)^2). */
EXPORT CONST double xhypot_u35(double x, double y) {
  x = fabsk(x);
  y = fabsk(y);
  double min = fmink(x, y);
  double max = fmaxk(x, y);

  double t = min / max;
  double ret = max * SQRT(1 + t*t);
  if (min == 0) ret = max;
  if (xisnan(x) || xisnan(y)) ret = SLEEF_NAN;
  if (x == SLEEF_INFINITY || y == SLEEF_INFINITY) ret = SLEEF_INFINITY;
  return ret;
}

/* xnextafter: step x one ULP toward y by incrementing/decrementing the bit
   pattern; the sign-magnitude <-> two's-complement flip (c) makes a single
   integer decrement correct on both sides of zero. */
EXPORT CONST double xnextafter(double x, double y) {
  union {
    double f;
    int64_t i;
  } cx;

  x = x == 0 ? mulsign(0, y) : x;
  cx.f = x;
  int c = (cx.i < 0) == (y < x);
  if (c) cx.i = -(cx.i ^ (UINT64_C(1) << 63));

  if (x != y) cx.i--;

  if (c) cx.i = -(cx.i ^ (UINT64_C(1) << 63));

  if (cx.f == 0 && x != 0) cx.f = mulsign(0, x);
  if (x == 0 && y == 0) cx.f = y;
  if (xisnan(x) || xisnan(y)) cx.f = SLEEF_NAN;

  return cx.f;
}

/* xfrfrexp: fraction part of frexp — force the exponent field to 0x3fe so the
   result lies in [0.5, 1); subnormals are prescaled by 2^63 first. */
EXPORT CONST double xfrfrexp(double x) {
  union {
    double f;
    uint64_t u;
  } cx;

  if (fabsk(x) < DBL_MIN) x *= (UINT64_C(1) << 63);

  cx.f = x;
  cx.u &= ~UINT64_C(0x7ff0000000000000);
  cx.u |= UINT64_C(0x3fe0000000000000);

  if (xisinf(x)) cx.f = mulsign(SLEEF_INFINITY, x);
  if (x == 0) cx.f = x;

  return cx.f;
}

/* xexpfrexp: exponent part of frexp, extracted from the biased exponent field
   (subnormals compensated by the 2^63 prescale / -63 adjustment). */
EXPORT CONST int xexpfrexp(double x) {
  union {
    double f;
    uint64_t u;
  } cx;

  int ret = 0;

  if (fabsk(x) < DBL_MIN) { x *= (UINT64_C(1) << 63); ret = -63; }

  cx.f = x;
  ret += (int32_t)(((cx.u >> 52) & 0x7ff)) - 0x3fe;

  if (x == 0 || xisnan(x) || xisinf(x)) ret = 0;

  return ret;
}

/* toward0: next representable value toward zero (0 maps to 0). */
static INLINE CONST double toward0(double d) {
  return d == 0 ?
    0 : longBitsToDouble(doubleToRawLongBits(d)-1);
}

/* removelsb: clear the least-significant mantissa bit (keeps q exactly
   representable in the fmod/remainder loops below). */
static INLINE CONST double removelsb(double d) {
  return longBitsToDouble(doubleToRawLongBits(d) & INT64_C(0xfffffffffffffffe));
}

/* ptrunc: truncation used by xfmod's quotient estimate (same 2^31/2^52
   decomposition as xtrunc, positive-path only). */
static INLINE CONST double ptrunc(double x) {
  double fr = mla(-(double)(INT64_C(1) << 31), (int32_t)(x * (1.0 / (INT64_C(1) << 31))), x);
  return fabsk(x) >= (double)(INT64_C(1) << 52) ? x : (x - (fr - (int32_t)fr));
}

/* xfmod: floating-point remainder with truncated quotient. Iterates at most
   ceil(log2(DBL_MAX)/52) = 21 times, subtracting q*d in double-double each
   round; q is conservatively rounded toward zero so the remainder stays >= 0. */
EXPORT CONST double xfmod(double x, double y) {
  double n = fabsk(x), d = fabsk(y), s = 1, q;
  if (d < DBL_MIN) { n *= UINT64_C(1) << 54; d *= UINT64_C(1) << 54; s = 1.0 / (UINT64_C(1) << 54); }
  Sleef_double2 r = dd(n, 0);
  double rd = toward0(1.0 / d);

  for(int i=0;i < 21;i++) { // ceil(log2(DBL_MAX) / 52)
    q = removelsb(ptrunc(toward0(r.x) * rd));
    q = (3*d > r.x && r.x > d) ? 2 : q;
    q = (2*d > r.x && r.x > d) ? 1 : q;
    q = r.x == d ? (r.y >= 0 ? 1 : 0) : q;
    r = ddnormalize_d2_d2(ddadd2_d2_d2_d2(r, ddmul_d2_d_d(q, -d)));
    if (r.x < d) break;
  }

  double ret = r.x * s;
  if (r.x + r.y == d) ret = 0;
  ret = mulsign(ret, x);
  if (n < d) ret = x;
  if (d == 0) ret = SLEEF_NAN;

  return ret;
}

/* rintk2: round to nearest even (same trick as xrint; local copy for
   xremainder's quotient estimate). */
static INLINE CONST double rintk2(double d) {
  double c = mulsign(INT64_C(1) << 52, d);
  return fabsk(d) > INT64_C(1) << 52 ? d : orsign(d + c - c, d);
}

/* xremainder: IEEE remainder (round-to-nearest quotient); tracks quotient
   parity (qisodd) to break the halfway ties. Tail continues on the next
   chunk line. */
EXPORT CONST double xremainder(double x, double y) {
  double n = fabsk(x), d = fabsk(y), s = 1, q;
  if (d < DBL_MIN*2) { n *= UINT64_C(1) << 54; d *= UINT64_C(1) << 54; s = 1.0 / (UINT64_C(1) << 54); }
  double rd = 1.0 / d;
  Sleef_double2 r = dd(n, 0);
  int qisodd = 0;

  for(int i=0;i < 21;i++) { // ceil(log2(DBL_MAX) / 52)
    q = removelsb(rintk2(r.x * rd));
    if (fabsk(r.x) < 1.5 * d) q = r.x < 0 ? -1 : 1;
    if (fabsk(r.x) < 0.5 * d || (fabsk(r.x) == 0.5 * d && !qisodd)) q = 0;
    if (q == 0) break;
    if (xisinf(q * -d)) q = q + mulsign(-1, r.x);
    qisodd ^= xisodd(q);
    r = ddnormalize_d2_d2(ddadd2_d2_d2_d2(r, ddmul_d2_d_d(q, -d)));
  }

  double ret = r.x * s;
  ret = mulsign(ret, x);
  if (xisinf(y)) ret = xisinf(x) ?
SLEEF_NAN : x;               // remainder(x, inf) = x for finite x, NaN otherwise
  if (d == 0) ret = SLEEF_NAN;

  return ret;
}

/* xmodf: split x into fractional (ret.x) and integral (ret.y) parts, both
   carrying the sign of x; values >= 2^52 have no fractional part. */
EXPORT CONST Sleef_double2 xmodf(double x) {
  double fr = x - (double)(INT64_C(1) << 31) * (int32_t)(x * (1.0 / (INT64_C(1) << 31)));
  fr = fr - (int32_t)fr;
  fr = fabsk(x) >= (double)(INT64_C(1) << 52) ? 0 : fr;
  Sleef_double2 ret = { copysignk(fr, x), copysignk(x - fr, x) };
  return ret;
}

/* dd2: pair of double-doubles — gammak returns {log-part a, factor b} so that
   gamma(x) = exp(a) * b and lgamma(x) = a + log(|b|). */
typedef struct {
  Sleef_double2 a, b;
} dd2;

/* gammak: shared kernel for tgamma/lgamma. Uses the reflection formula for
   a < 0.5 (oref), the recurrence gamma(x+1) = x*gamma(x) to shift the argument
   up past 7 (the y/clln products), then either a Stirling-type expansion in
   1/x (o2) or rational-series branches for x near 1 (o0) / near 2. The huge
   ternary chains pick the coefficient set for the active branch. */
static CONST dd2 gammak(double a) {
  Sleef_double2 clc = dd(0, 0), clln = dd(1, 0), clld = dd(1, 0), v = dd(1, 0), x, y, z;
  double t, u;

  int otiny = fabsk(a) < 1e-306, oref = a < 0.5;

  x = otiny ? dd(0, 0) : (oref ? ddadd2_d2_d_d(1, -a) : dd(a, 0));

  int o0 = (0.5 <= x.x && x.x <= 1.1), o2 = 2.3 < x.x;

  // Shift the argument up by 5 via the recurrence; the product goes to clln.
  y = ddnormalize_d2_d2(ddmul_d2_d2_d2(ddadd2_d2_d2_d(x, 1), x));
  y = ddnormalize_d2_d2(ddmul_d2_d2_d2(ddadd2_d2_d2_d(x, 2), y));
  y = ddnormalize_d2_d2(ddmul_d2_d2_d2(ddadd2_d2_d2_d(x, 3), y));
  y = ddnormalize_d2_d2(ddmul_d2_d2_d2(ddadd2_d2_d2_d(x, 4), y));

  clln = (o2 && x.x <= 7) ? y : clln;

  x = (o2 && x.x <= 7) ? ddadd2_d2_d2_d(x, 5) : x;
  t = o2 ? (1.0 / x.x) : ddnormalize_d2_d2(ddadd2_d2_d2_d(x, o0 ? -1 : -2)).x;

  // Branch-merged polynomial: o2 -> Stirling series in 1/x, o0 -> series
  // around 1, otherwise series around 2.
  u = o2 ? -156.801412704022726379848862 : (o0 ? +0.2947916772827614196e+2 : +0.7074816000864609279e-7);
  u = mla(u, t, o2 ? +1.120804464289911606838558160000 : (o0 ? +0.1281459691827820109e+3 : +0.4009244333008730443e-6));
  u = mla(u, t, o2 ? +13.39798545514258921833306020000 : (o0 ? +0.2617544025784515043e+3 : +0.1040114641628246946e-5));
  u = mla(u, t, o2 ? -0.116546276599463200848033357000 : (o0 ? +0.3287022855685790432e+3 : +0.1508349150733329167e-5));
  u = mla(u, t, o2 ? -1.391801093265337481495562410000 : (o0 ? +0.2818145867730348186e+3 : +0.1288143074933901020e-5));
  u = mla(u, t, o2 ? +0.015056113040026424412918973400 : (o0 ? +0.1728670414673559605e+3 : +0.4744167749884993937e-6));
  u = mla(u, t, o2 ? +0.179540117061234856098844714000 : (o0 ? +0.7748735764030416817e+2 : -0.6554816306542489902e-7));
  u = mla(u, t, o2 ? -0.002481743600264997730942489280 : (o0 ? +0.2512856643080930752e+2 : -0.3189252471452599844e-6));
  u = mla(u, t, o2 ? -0.029527880945699120504851034100 : (o0 ? +0.5766792106140076868e+1 : +0.1358883821470355377e-6));
  u = mla(u, t, o2 ? +0.000540164767892604515196325186 : (o0 ? +0.7270275473996180571e+0 : -0.4343931277157336040e-6));
  u = mla(u, t, o2 ? +0.006403362833808069794787256200 : (o0 ? +0.8396709124579147809e-1 : +0.9724785897406779555e-6));
  u = mla(u, t, o2 ? -0.000162516262783915816896611252 : (o0 ? -0.8211558669746804595e-1 : -0.2036886057225966011e-5));
  u = mla(u, t, o2 ? -0.001914438498565477526465972390 : (o0 ? +0.6828831828341884458e-1 : +0.4373363141819725815e-5));
  u = mla(u, t, o2 ? +7.20489541602001055898311517e-05 : (o0 ? -0.7712481339961671511e-1 : -0.9439951268304008677e-5));
  u = mla(u, t, o2 ? +0.000839498720672087279971000786 : (o0 ? +0.8337492023017314957e-1 : +0.2050727030376389804e-4));
  u = mla(u, t, o2 ? -5.17179090826059219329394422e-05 : (o0 ? -0.9094964931456242518e-1 : -0.4492620183431184018e-4));
  u = mla(u, t, o2 ? -0.000592166437353693882857342347 : (o0 ? +0.1000996313575929358e+0 : +0.9945751236071875931e-4));
  u = mla(u, t, o2 ? +6.97281375836585777403743539e-05 : (o0 ? -0.1113342861544207724e+0 : -0.2231547599034983196e-3));
  u = mla(u, t, o2 ? +0.000784039221720066627493314301 : (o0 ? +0.1255096673213020875e+0 : +0.5096695247101967622e-3));
  u = mla(u, t, o2 ? -0.000229472093621399176949318732 : (o0 ? -0.1440498967843054368e+0 : -0.1192753911667886971e-2));
  u = mla(u, t, o2 ? -0.002681327160493827160473958490 : (o0 ? +0.1695571770041949811e+0 : +0.2890510330742210310e-2));
  u = mla(u, t, o2 ? +0.003472222222222222222175164840 : (o0 ? -0.2073855510284092762e+0 : -0.7385551028674461858e-2));
  u = mla(u, t, o2 ? +0.083333333333333333335592087900 : (o0 ?
    +0.2705808084277815939e+0 : +0.2058080842778455335e-1));

  // Stirling branch: clc = (x - 0.5)*log(x) - x + log(sqrt(2*pi)).
  y = ddmul_d2_d2_d2(ddadd2_d2_d2_d(x, -0.5), logk2(x));
  y = ddadd2_d2_d2_d2(y, ddneg_d2_d2(x));
  y = ddadd2_d2_d2_d2(y, dd(0.91893853320467278056, -3.8782941580672414498e-17)); // 0.5*log(2*M_PI)

  // Series branches: finish the low-order terms in double-double.
  z = ddadd2_d2_d2_d(ddmul_d2_d_d (u, t), o0 ? -0.4006856343865314862e+0 : -0.6735230105319810201e-1);
  z = ddadd2_d2_d2_d(ddmul_d2_d2_d(z, t), o0 ? +0.8224670334241132030e+0 : +0.3224670334241132030e+0);
  z = ddadd2_d2_d2_d(ddmul_d2_d2_d(z, t), o0 ? -0.5772156649015328655e+0 : +0.4227843350984671345e+0);
  z = ddmul_d2_d2_d(z, t);

  clc = o2 ? y : z;

  clld = o2 ? ddadd2_d2_d2_d(ddmul_d2_d_d(u, t), 1) : clld;

  y = clln;

  clc = otiny ? dd(83.1776616671934334590333, 3.67103459631568507221878e-15) : // log(2^120)
    (oref ? ddadd2_d2_d2_d2(dd(1.1447298858494001639, 1.026595116270782638e-17), ddneg_d2_d2(clc)) : clc); // log(M_PI)
  clln = otiny ? dd(1, 0) : (oref ? clln : clld);

  // Reflection: gamma(a) = pi / (sin(pi*a) * gamma(1-a)); argument of sinpik
  // is reduced mod 2^28 to keep it exact.
  if (oref) x = ddmul_d2_d2_d2(clld, sinpik(a - (double)(INT64_C(1) << 28) * (int32_t)(a * (1.0 / (INT64_C(1) << 28)))));

  clld = otiny ? dd(a*((INT64_C(1) << 60)*(double)(INT64_C(1) << 60)), 0) : (oref ? x : y);

  dd2 ret = { clc, dddiv_d2_d2_d2(clln, clld) };

  return ret;
}

/* xtgamma_u1: gamma(a) = exp(clc) * (clln/clld), with pole (non-positive
   integer), overflow, and NaN fixups. */
EXPORT CONST double xtgamma_u1(double a) {
  dd2 d = gammak(a);
  Sleef_double2 y = ddmul_d2_d2_d2(expk2(d.a), d.b);
  double r = y.x + y.y;
  r = (a == -SLEEF_INFINITY || (a < 0 && xisint(a)) || (xisnumber(a) && a < 0 && xisnan(r))) ? SLEEF_NAN : r;
  r = ((a == SLEEF_INFINITY || xisnumber(a)) && a >= -DBL_MIN && (a == 0 || a > 200 || xisnan(r))) ? mulsign(SLEEF_INFINITY, a) : r;
  return r;
}

/* xlgamma_u1: log|gamma(a)| = clc + log|clln/clld|; +inf at the poles. */
EXPORT CONST double xlgamma_u1(double a) {
  dd2 d = gammak(a);
  Sleef_double2 y = ddadd2_d2_d2_d2(d.a, logk2(ddabs_d2_d2(d.b)));
  double r = y.x + y.y;
  r = (xisinf(a) || (a <= 0 && xisint(a)) || (xisnumber(a) && xisnan(r))) ? SLEEF_INFINITY : r;
  return r;
}

/* xerf_u1: error function; three coefficient sets selected by |a| < 1, < 3.7,
   < 6. Definition continues past the end of this chunk. */
EXPORT CONST double xerf_u1(double a) {
  double s = a, t, u;
  Sleef_double2 d;

  a = fabsk(a);
  int o0 = a < 1.0, o1 = a < 3.7, o2 = a < 6.0;
  u = o0 ? (a*a) : a;

  t = o0 ? +0.6801072401395392157e-20 : o1 ? +0.2830954522087717660e-13 : -0.5846750404269610493e-17;
  t = mla(t, u, o0 ? -0.2161766247570056391e-18 : o1 ? -0.1509491946179481940e-11 : +0.6076691048812607898e-15);
  t = mla(t, u, o0 ? +0.4695919173301598752e-17 : o1 ? +0.3827857177807173152e-10 : -0.3007518609604893831e-13);
  t = mla(t, u, o0 ? -0.9049140419888010819e-16 : o1 ? -0.6139733921558987241e-09 : +0.9427906260824646063e-12);
  t = mla(t, u, o0 ? +0.1634018903557411517e-14 : o1 ? +0.6985387934608038824e-08 : -0.2100110908269393629e-10);
  t = mla(t, u, o0 ? -0.2783485786333455216e-13 : o1 ? -0.5988224513034371474e-07 : +0.3534639523461223473e-09);
  t = mla(t, u, o0 ? +0.4463221276786412722e-12 : o1 ? +0.4005716952355346640e-06 : -0.4664967728285395926e-08);
  t = mla(t, u, o0 ? -0.6711366622850138987e-11 : o1 ? -0.2132190104575784400e-05 : +0.4943823283769000532e-07);
  t = mla(t, u, o0 ? +0.9422759050232658346e-10 : o1 ? +0.9092461304042630325e-05 : -0.4271203394761148254e-06);
  t = mla(t, u, o0 ? -0.1229055530100228477e-08 : o1 ? -0.3079188080966205457e-04 : +0.3034067677404915895e-05);
  t = mla(t, u, o0 ? +0.1480719281585085023e-07 : o1 ? +0.7971413443082370762e-04 : -0.1776295289066871135e-04);
  t = mla(t, u, o0 ? -0.1636584469123402714e-06 : o1 ? -0.1387853215225442864e-03 : +0.8524547630559505050e-04);
  t = mla(t, u, o0 ? +0.1646211436588923363e-05 : o1 ? +0.6469678026257590965e-04 : -0.3290582944961784398e-03);
  t = mla(t, u, o0 ? -0.1492565035840624866e-04 : o1 ? +0.4996645280372945860e-03 : +0.9696966068789101157e-03);
  t = mla(t, u, o0 ? +0.1205533298178966496e-03 : o1 ? -0.1622802482842520535e-02 : -0.1812527628046986137e-02);
  t = mla(t, u, o0 ? -0.8548327023450851166e-03 : o1 ? +0.1615320557049377171e-03 : -0.4725409828123619017e-03);
  t = mla(t, u, o0 ? +0.5223977625442188799e-02 : o1 ? +0.1915262325574875607e-01 : +0.2090315427924229266e-01);
  t = mla(t, u, o0 ? -0.2686617064513125569e-01 : o1 ? -0.1027818298486033455e+00 : -0.1052041921842776645e+00);
  t = mla(t, u, o0 ?
+0.1128379167095512753e+00 : o1 ? -0.6366172819842503827e+00 : -0.6345351808766568347e+00); t = mla(t, u, o0 ? -0.3761263890318375380e+00 : o1 ? -0.1128379590648910469e+01 : -0.1129442929103524396e+01); d = ddmul_d2_d_d(t, u); d = ddadd2_d2_d2_d2(d, o0 ? dd(1.1283791670955125586, 1.5335459613165822674e-17) : o1 ? dd(3.4110644736196137587e-08, -2.4875650708323294246e-24) : dd(0.00024963035690526438285, -5.4362665034856259795e-21)); d = o0 ? ddmul_d2_d2_d(d, a) : ddadd_d2_d_d2(1.0, ddneg_d2_d2(expk2(d))); u = mulsign(o2 ? (d.x + d.y) : 1, s); u = xisnan(a) ? SLEEF_NAN : u; return u; } EXPORT CONST double xerfc_u15(double a) { double s = a, r = 0, t; Sleef_double2 u, d, x; a = fabsk(a); int o0 = a < 1.0, o1 = a < 2.2, o2 = a < 4.2, o3 = a < 27.3; u = o0 ? ddmul_d2_d_d(a, a) : o1 ? dd(a, 0) : dddiv_d2_d2_d2(dd(1, 0), dd(a, 0)); t = o0 ? +0.6801072401395386139e-20 : o1 ? +0.3438010341362585303e-12 : o2 ? -0.5757819536420710449e+2 : +0.2334249729638701319e+5; t = mla(t, u.x, o0 ? -0.2161766247570055669e-18 : o1 ? -0.1237021188160598264e-10 : o2 ? +0.4669289654498104483e+3 : -0.4695661044933107769e+5); t = mla(t, u.x, o0 ? +0.4695919173301595670e-17 : o1 ? +0.2117985839877627852e-09 : o2 ? -0.1796329879461355858e+4 : +0.3173403108748643353e+5); t = mla(t, u.x, o0 ? -0.9049140419888007122e-16 : o1 ? -0.2290560929177369506e-08 : o2 ? +0.4355892193699575728e+4 : +0.3242982786959573787e+4); t = mla(t, u.x, o0 ? +0.1634018903557410728e-14 : o1 ? +0.1748931621698149538e-07 : o2 ? -0.7456258884965764992e+4 : -0.2014717999760347811e+5); t = mla(t, u.x, o0 ? -0.2783485786333451745e-13 : o1 ? -0.9956602606623249195e-07 : o2 ? +0.9553977358167021521e+4 : +0.1554006970967118286e+5); t = mla(t, u.x, o0 ? +0.4463221276786415752e-12 : o1 ? +0.4330010240640327080e-06 : o2 ? -0.9470019905444229153e+4 : -0.6150874190563554293e+4); t = mla(t, u.x, o0 ? -0.6711366622850136563e-11 : o1 ? -0.1435050600991763331e-05 : o2 ? 
+0.7387344321849855078e+4 : +0.1240047765634815732e+4); t = mla(t, u.x, o0 ? +0.9422759050232662223e-10 : o1 ? +0.3460139479650695662e-05 : o2 ? -0.4557713054166382790e+4 : -0.8210325475752699731e+2); t = mla(t, u.x, o0 ? -0.1229055530100229098e-08 : o1 ? -0.4988908180632898173e-05 : o2 ? +0.2207866967354055305e+4 : +0.3242443880839930870e+2); t = mla(t, u.x, o0 ? +0.1480719281585086512e-07 : o1 ? -0.1308775976326352012e-05 : o2 ? -0.8217975658621754746e+3 : -0.2923418863833160586e+2); t = mla(t, u.x, o0 ? -0.1636584469123399803e-06 : o1 ? +0.2825086540850310103e-04 : o2 ? +0.2268659483507917400e+3 : +0.3457461732814383071e+0); t = mla(t, u.x, o0 ? +0.1646211436588923575e-05 : o1 ? -0.6393913713069986071e-04 : o2 ? -0.4633361260318560682e+2 : +0.5489730155952392998e+1); t = mla(t, u.x, o0 ? -0.1492565035840623511e-04 : o1 ? -0.2566436514695078926e-04 : o2 ? +0.9557380123733945965e+1 : +0.1559934132251294134e-2); t = mla(t, u.x, o0 ? +0.1205533298178967851e-03 : o1 ? +0.5895792375659440364e-03 : o2 ? -0.2958429331939661289e+1 : -0.1541741566831520638e+1); t = mla(t, u.x, o0 ? -0.8548327023450850081e-03 : o1 ? -0.1695715579163588598e-02 : o2 ? +0.1670329508092765480e+0 : +0.2823152230558364186e-5); t = mla(t, u.x, o0 ? +0.5223977625442187932e-02 : o1 ? +0.2089116434918055149e-03 : o2 ? +0.6096615680115419211e+0 : +0.6249999184195342838e+0); t = mla(t, u.x, o0 ? -0.2686617064513125222e-01 : o1 ? +0.1912855949584917753e-01 : o2 ? +0.1059212443193543585e-2 : +0.1741749416408701288e-8); d = ddmul_d2_d2_d(u, t); d = ddadd2_d2_d2_d2(d, o0 ? dd(0.11283791670955126141, -4.0175691625932118483e-18) : o1 ? dd(-0.10277263343147646779, -6.2338714083404900225e-18) : o2 ? dd(-0.50005180473999022439, 2.6362140569041995803e-17) : dd(-0.5000000000258444377, -4.0074044712386992281e-17)); d = ddmul_d2_d2_d2(d, u); d = ddadd2_d2_d2_d2(d, o0 ? dd(-0.37612638903183753802, 1.3391897206042552387e-17) : o1 ? dd(-0.63661976742916359662, 7.6321019159085724662e-18) : o2 ? 
dd(1.601106273924963368e-06, 1.1974001857764476775e-23) : dd(2.3761973137523364792e-13, -1.1670076950531026582e-29)); d = ddmul_d2_d2_d2(d, u); d = ddadd2_d2_d2_d2(d, o0 ? dd(1.1283791670955125586, 1.5335459613165822674e-17) : o1 ? dd(-1.1283791674717296161, 8.0896847755965377194e-17) : o2 ? dd(-0.57236496645145429341, 3.0704553245872027258e-17) : dd(-0.57236494292470108114, -2.3984352208056898003e-17)); x = ddmul_d2_d2_d(o1 ? d : dd(-a, 0), a); x = o1 ? x : ddadd2_d2_d2_d2(x, d); x = o0 ? ddsub_d2_d2_d2(dd(1, 0), x) : expk2(x); x = o1 ? x : ddmul_d2_d2_d2(x, u); r = o3 ? (x.x + x.y) : 0; if (s < 0) r = 2 - r; r = xisnan(s) ? SLEEF_NAN : r; return r; } #ifdef ENABLE_MAIN // gcc -w -DENABLE_MAIN -I../common sleefdp.c rempitab.c -lm #include int main(int argc, char **argv) { double d1 = atof(argv[1]); printf("arg1 = %.20g\n", d1); //int i1 = atoi(argv[1]); //double d2 = atof(argv[2]); //printf("arg2 = %.20g\n", d2); //printf("%d\n", (int)d2); #if 0 double d3 = atof(argv[3]); printf("arg3 = %.20g\n", d3); #endif //printf("%g\n", pow2i(i1)); //int exp = xexpfrexp(d1); //double r = xnextafter(d1, d2); //double r = xfma(d1, d2, d3); printf("test = %.20g\n", xcos_u1(d1)); //printf("test = %.20g\n", xlog(d1)); //r = nextafter(d1, d2); printf("corr = %.20g\n", cos(d1)); //printf("%.20g %.20g\n", xround(d1), xrint(d1)); //Sleef_double2 r = xsincospi_u35(d); //printf("%g, %g\n", (double)r.x, (double)r.y); } #endif ================================================ FILE: src/sleefsimddp.c ================================================ // Copyright Naoki Shibata and contributors 2010 - 2020. // Distributed under the Boost Software License, Version 1.0. // (See accompanying file LICENSE.txt or copy at // http://www.boost.org/LICENSE_1_0.txt) // Always use -ffp-contract=off option to compile SLEEF. 
#if !defined(SLEEF_GENHEADER) #include #include #include #include #endif #include "misc.h" extern const double Sleef_rempitabdp[]; #define __SLEEFSIMDDP_C__ #if (defined(_MSC_VER)) #pragma fp_contract (off) #endif // Intel #ifdef ENABLE_SSE2 #define CONFIG 2 #include "helpersse2.h" #ifdef DORENAME #ifdef ENABLE_GNUABI #include "renamesse2_gnuabi.h" #else #include "renamesse2.h" #endif #endif #endif #ifdef ENABLE_SSE4 #define CONFIG 4 #include "helpersse2.h" #ifdef DORENAME #include "renamesse4.h" #endif #endif #ifdef ENABLE_AVX #define CONFIG 1 #include "helperavx.h" #ifdef DORENAME #ifdef ENABLE_GNUABI #include "renameavx_gnuabi.h" #else #include "renameavx.h" #endif #endif #endif #ifdef ENABLE_FMA4 #define CONFIG 4 #include "helperavx.h" #ifdef DORENAME #ifdef ENABLE_GNUABI #include "renamefma4_gnuabi.h" #else #include "renamefma4.h" #endif #endif #endif #ifdef ENABLE_AVX2 #define CONFIG 1 #include "helperavx2.h" #ifdef DORENAME #ifdef ENABLE_GNUABI #include "renameavx2_gnuabi.h" #else #include "renameavx2.h" #endif #endif #endif #ifdef ENABLE_AVX2128 #define CONFIG 1 #include "helperavx2_128.h" #ifdef DORENAME #include "renameavx2128.h" #endif #endif #ifdef ENABLE_AVX512F #define CONFIG 1 #include "helperavx512f.h" #ifdef DORENAME #ifdef ENABLE_GNUABI #include "renameavx512f_gnuabi.h" #else #include "renameavx512f.h" #endif #endif #endif #ifdef ENABLE_AVX512FNOFMA #define CONFIG 2 #include "helperavx512f.h" #ifdef DORENAME #include "renameavx512fnofma.h" #endif #endif // Arm #ifdef ENABLE_ADVSIMD #define CONFIG 1 #include "helperadvsimd.h" #ifdef DORENAME #ifdef ENABLE_GNUABI #include "renameadvsimd_gnuabi.h" #else #include "renameadvsimd.h" #endif #endif #endif #ifdef ENABLE_ADVSIMDNOFMA #define CONFIG 2 #include "helperadvsimd.h" #ifdef DORENAME #include "renameadvsimdnofma.h" #endif #endif #ifdef ENABLE_SVE #define CONFIG 1 #include "helpersve.h" #ifdef DORENAME #ifdef ENABLE_GNUABI #include "renamesve_gnuabi.h" #else #include "renamesve.h" #endif /* 
ENABLE_GNUABI */ #endif /* DORENAME */ #endif /* ENABLE_SVE */

// One CONFIG/helper/rename triple per target ISA; exactly one ENABLE_* macro
// is defined per compilation of this translation unit.
#ifdef ENABLE_SVENOFMA
#define CONFIG 2
#include "helpersve.h"
#ifdef DORENAME
#include "renamesvenofma.h"
#endif /* DORENAME */
#endif /* ENABLE_SVE */

// IBM
#ifdef ENABLE_VSX
#define CONFIG 1
#include "helperpower_128.h"
#ifdef DORENAME
#include "renamevsx.h"
#endif
#endif

#ifdef ENABLE_VSXNOFMA
#define CONFIG 2
#include "helperpower_128.h"
#ifdef DORENAME
#include "renamevsxnofma.h"
#endif
#endif

#ifdef ENABLE_ZVECTOR2
#define CONFIG 140
#include "helpers390x_128.h"
#ifdef DORENAME
#include "renamezvector2.h"
#endif
#endif

#ifdef ENABLE_ZVECTOR2NOFMA
#define CONFIG 141
#include "helpers390x_128.h"
#ifdef DORENAME
#include "renamezvector2nofma.h"
#endif
#endif

// Generic
#ifdef ENABLE_VECEXT
#define CONFIG 1
#include "helpervecext.h"
#ifdef DORENAME
#include "renamevecext.h"
#endif
#endif

#ifdef ENABLE_PUREC
#define CONFIG 1
#include "helperpurec.h"
#ifdef DORENAME
#include "renamepurec.h"
#endif
#endif

#ifdef ENABLE_PUREC_SCALAR
#define CONFIG 1
#include "helperpurec_scalar.h"
#ifdef DORENAME
#include "renamepurec_scalar.h"
#endif
#endif

#ifdef ENABLE_PURECFMA_SCALAR
#define CONFIG 2
#include "helperpurec_scalar.h"
#ifdef DORENAME
#include "renamepurecfma_scalar.h"
#endif
#endif

//

// MLA/C2V adapt the estrin.h polynomial macros (POLY6/POLY8 below) to the
// vector helper API of the selected ISA.
#define MLA(x, y, z) vmla_vd_vd_vd_vd((x), (y), (z))
#define C2V(c) vcast_vd_d(c)
#include "estrin.h"

//

#include "dd.h"

//

// Logical NOT of a 64-bit comparison mask (xor against all-ones).
static INLINE VECTOR_CC vopmask vnot_vo64_vo64(vopmask x) {
  return vxor_vo_vo_vo(x, veq64_vo_vm_vm(vcast_vm_i_i(0, 0), vcast_vm_i_i(0, 0)));
}

// Per-lane mask: sign bit of d is set (true for -0.0 too).
static INLINE CONST VECTOR_CC vopmask vsignbit_vo_vd(vdouble d) {
  return veq64_vo_vm_vm(vand_vm_vm_vm(vreinterpret_vm_vd(d), vreinterpret_vm_vd(vcast_vd_d(-0.0))), vreinterpret_vm_vd(vcast_vd_d(-0.0)));
}

// return d0 < d1 ? x : y
static INLINE CONST VECTOR_CC vint vsel_vi_vd_vd_vi_vi(vdouble d0, vdouble d1, vint x, vint y) { return vsel_vi_vo_vi_vi(vcast_vo32_vo64(vlt_vo_vd_vd(d0, d1)), x, y); }

// return d0 < 0 ? x : 0
static INLINE CONST VECTOR_CC vint vsel_vi_vd_vi(vdouble d, vint x) { return vand_vi_vo_vi(vcast_vo32_vo64(vsignbit_vo_vd(d)), x); }

// Per-lane mask: d is exactly -0.0 (bit-pattern compare).
static INLINE CONST VECTOR_CC vopmask visnegzero_vo_vd(vdouble d) {
  return veq64_vo_vm_vm(vreinterpret_vm_vd(d), vreinterpret_vm_vd(vcast_vd_d(-0.0)));
}

// Per-lane mask: x is finite (not inf, not NaN).
static INLINE CONST VECTOR_CC vopmask visnumber_vo_vd(vdouble x) {
  return vandnot_vo_vo_vo(visinf_vo_vd(x), veq_vo_vd_vd(x, x));
}

// Isolate the sign bit of each lane as a raw integer mask.
static INLINE CONST VECTOR_CC vmask vsignbit_vm_vd(vdouble d) {
  return vand_vm_vm_vm(vreinterpret_vm_vd(d), vreinterpret_vm_vd(vcast_vd_d(-0.0)));
}

// x with its sign flipped wherever y is negative (x * sign(y), bitwise).
static INLINE CONST VECTOR_CC vdouble vmulsign_vd_vd_vd(vdouble x, vdouble y) {
  return vreinterpret_vd_vm(vxor_vm_vm_vm(vreinterpret_vm_vd(x), vsignbit_vm_vd(y)));
}

// Magnitude of x combined with the sign of y (vector copysign).
static INLINE CONST VECTOR_CC vdouble vcopysign_vd_vd_vd(vdouble x, vdouble y) {
  return vreinterpret_vd_vm(vxor_vm_vm_vm(vandnot_vm_vm_vm(vreinterpret_vm_vd(vcast_vd_d(-0.0)), vreinterpret_vm_vd(x)),
					  vand_vm_vm_vm   (vreinterpret_vm_vd(vcast_vd_d(-0.0)), vreinterpret_vm_vd(y))));
}

// +-1.0 carrying the sign of d.
static INLINE CONST VECTOR_CC vdouble vsign_vd_vd(vdouble d) {
  return vmulsign_vd_vd_vd(vcast_vd_d(1.0), d);
}

// 2^q built directly in the exponent field (biased exponent shifted into
// the high 32 bits of each lane).
static INLINE CONST VECTOR_CC vdouble vpow2i_vd_vi(vint q) {
  q = vadd_vi_vi_vi(vcast_vi_i(0x3ff), q);
  vint2 r = vcastu_vi2_vi(q);
  return vreinterpret_vd_vi2(vsll_vi2_vi2_i(r, 20));
}

// x * 2^q with a wide valid range for q: the scale is applied as four equal
// factors y plus a residual 2^q so intermediate products cannot overflow or
// flush prematurely; the exponent is clamped into [0, 0x7ff].
static INLINE CONST VECTOR_CC vdouble vldexp_vd_vd_vi(vdouble x, vint q) {
  vint m = vsra_vi_vi_i(q, 31);
  m = vsll_vi_vi_i(vsub_vi_vi_vi(vsra_vi_vi_i(vadd_vi_vi_vi(m, q), 9), m), 7);
  q = vsub_vi_vi_vi(q, vsll_vi_vi_i(m, 2));
  m = vadd_vi_vi_vi(vcast_vi_i(0x3ff), m);
  m = vandnot_vi_vo_vi(vgt_vo_vi_vi(vcast_vi_i(0), m), m);
  m = vsel_vi_vo_vi_vi(vgt_vo_vi_vi(m, vcast_vi_i(0x7ff)), vcast_vi_i(0x7ff), m);
  vint2 r = vcastu_vi2_vi(m);
  vdouble y = vreinterpret_vd_vi2(vsll_vi2_vi2_i(r, 20));
  return vmul_vd_vd_vd(vmul_vd_vd_vd(vmul_vd_vd_vd(vmul_vd_vd_vd(vmul_vd_vd_vd(x, y), y), y), y), vpow2i_vd_vi(q));
}

// d * 2^e via two half-sized power-of-two factors (cheaper, narrower range).
static INLINE CONST VECTOR_CC vdouble vldexp2_vd_vd_vi(vdouble d, vint e) {
  return vmul_vd_vd_vd(vmul_vd_vd_vd(d, vpow2i_vd_vi(vsra_vi_vi_i(e, 1))), vpow2i_vd_vi(vsub_vi_vi_vi(e, vsra_vi_vi_i(e, 1))));
}

// d * 2^q by adding q directly to the exponent bits — fastest variant;
// valid only when the result stays normal (no range checks).
static INLINE CONST VECTOR_CC vdouble vldexp3_vd_vd_vi(vdouble d, vint q) {
  return vreinterpret_vd_vi2(vadd_vi2_vi2_vi2(vreinterpret_vi2_vd(d), vsll_vi2_vi2_i(vcastu_vi2_vi(q), 20)));
}

#if !defined(ENABLE_AVX512F) && !defined(ENABLE_AVX512FNOFMA)
// ilogb kernel: extracts the unbiased exponent; subnormals are first scaled
// by 2^300 so their exponent field becomes usable, then 300 is subtracted.
static INLINE CONST VECTOR_CC vint vilogbk_vi_vd(vdouble d) {
  vopmask o = vlt_vo_vd_vd(d, vcast_vd_d(4.9090934652977266E-91));
  d = vsel_vd_vo_vd_vd(o, vmul_vd_vd_vd(vcast_vd_d(2.037035976334486E90), d), d);
  vint q = vcastu_vi_vi2(vreinterpret_vi2_vd(d));
  q = vand_vi_vi_vi(q, vcast_vi_i(((1 << 12)-1) << 20));
  q = vsrl_vi_vi_i(q, 20);
  q = vsub_vi_vi_vi(q, vsel_vi_vo_vi_vi(vcast_vo32_vo64(o), vcast_vi_i(300 + 0x3ff), vcast_vi_i(0x3ff)));
  return q;
}

// Cheaper exponent extraction: raw biased exponent minus 0x3ff, no
// subnormal handling.
static INLINE CONST VECTOR_CC vint vilogb2k_vi_vd(vdouble d) {
  vint q = vcastu_vi_vi2(vreinterpret_vi2_vd(d));
  q = vsrl_vi_vi_i(q, 20);
  q = vand_vi_vi_vi(q, vcast_vi_i(0x7ff));
  q = vsub_vi_vi_vi(q, vcast_vi_i(0x3ff));
  return q;
}
#endif

// Per-lane mask: d holds an integral value. Without FULL_FP_ROUNDING the
// test is split around 2^31 so the truncation stays exact; |d| > 2^53 is
// always integral.
static INLINE CONST VECTOR_CC vopmask visint_vo_vd(vdouble d) {
#ifdef FULL_FP_ROUNDING
  return veq_vo_vd_vd(vtruncate_vd_vd(d), d);
#else
  vdouble x = vtruncate_vd_vd(vmul_vd_vd_vd(d, vcast_vd_d(1.0 / (INT64_C(1) << 31))));
  x = vmla_vd_vd_vd_vd(vcast_vd_d(-(double)(INT64_C(1) << 31)), x, d);
  return vor_vo_vo_vo(veq_vo_vd_vd(vtruncate_vd_vd(x), x),
		      vgt_vo_vd_vd(vabs_vd_vd(d), vcast_vd_d(INT64_C(1) << 53)));
#endif
}

// Per-lane mask: d is an odd integer (|d| must be < 2^53 in the fallback
// path, since larger doubles cannot represent odd values).
static INLINE CONST VECTOR_CC vopmask visodd_vo_vd(vdouble d) {
#ifdef FULL_FP_ROUNDING
  vdouble x = vmul_vd_vd_vd(d, vcast_vd_d(0.5));
  return vneq_vo_vd_vd(vtruncate_vd_vd(x), x);
#else
  vdouble x = vtruncate_vd_vd(vmul_vd_vd_vd(d, vcast_vd_d(1.0 / (INT64_C(1) << 31))));
  x = vmla_vd_vd_vd_vd(vcast_vd_d(-(double)(INT64_C(1) << 31)), x, d);
  return vand_vo_vo_vo(vcast_vo64_vo32(veq_vo_vi_vi(vand_vi_vi_vi(vtruncate_vi_vd(x), vcast_vi_i(1)), vcast_vi_i(1))),
		       vlt_vo_vd_vd(vabs_vd_vd(d), vcast_vd_d(INT64_C(1) << 53)));
#endif
}

//

// Public ldexp: defers to the wide-range kernel above.
EXPORT CONST VECTOR_CC vdouble xldexp(vdouble x, vint q) { return vldexp_vd_vd_vi(x, q); }

// Public ilogb with the C99 special cases: ILOGB0 for zero, ILOGBNAN for
// NaN, INT_MAX for infinities.
EXPORT CONST VECTOR_CC vint xilogb(vdouble d) {
  vdouble e = vcast_vd_vi(vilogbk_vi_vd(vabs_vd_vd(d)));
  e = vsel_vd_vo_vd_vd(veq_vo_vd_vd(d, vcast_vd_d(0)), vcast_vd_d(SLEEF_FP_ILOGB0), e);
  e = vsel_vd_vo_vd_vd(visnan_vo_vd(d), vcast_vd_d(SLEEF_FP_ILOGBNAN), e);
  e = vsel_vd_vo_vd_vd(visinf_vo_vd(d), vcast_vd_d(INT_MAX), e);
  return vrint_vi_vd(e);
}

#if !(defined(ENABLE_SVE) || defined(ENABLE_SVENOFMA))
// (vdouble, vint) pair plus trivial accessors — SVE targets supply their own
// sizeless-type-compatible definitions, hence the guard.
typedef struct {
  vdouble d;
  vint i;
} di_t;

static vdouble digetd_vd_di(di_t d) { return d.d; }
static vint digeti_vi_di(di_t d) { return d.i; }
static di_t disetdi_di_vd_vi(vdouble d, vint i) {
  di_t r = { d, i };
  return r;
}

// (vdouble2, vint) pair: double-double value plus quadrant integer.
typedef struct {
  vdouble2 dd;
  vint i;
} ddi_t;

static vdouble2 ddigetdd_vd2_ddi(ddi_t d) { return d.dd; }
static vint ddigeti_vi_ddi(ddi_t d) { return d.i; }
static ddi_t ddisetddi_ddi_vd2_vi(vdouble2 v, vint i) {
  ddi_t r = { v, i };
  return r;
}
static ddi_t ddisetdd_ddi_ddi_vd2(ddi_t ddi, vdouble2 v) {
  ddi.dd = v;
  return ddi;
}
#endif

// Bitwise OR of x with the sign bit of y.
static INLINE CONST VECTOR_CC vdouble vorsign_vd_vd_vd(vdouble x, vdouble y) {
  return vreinterpret_vd_vm(vor_vm_vm_vm(vreinterpret_vm_vd(x), vsignbit_vm_vd(y)));
}

// Reduces x to [-0.25, 0.25) returning the remainder and the integer part of
// 4*x (mod the lane's int range). The non-FULL_FP_ROUNDING path emulates
// round-to-nearest by adding/subtracting a sign-matched 2^52 constant.
static INLINE CONST di_t rempisub(vdouble x) {
#ifdef FULL_FP_ROUNDING
  vdouble y = vrint_vd_vd(vmul_vd_vd_vd(x, vcast_vd_d(4)));
  vint vi = vtruncate_vi_vd(vsub_vd_vd_vd(y, vmul_vd_vd_vd(vrint_vd_vd(x), vcast_vd_d(4))));
  return disetdi_di_vd_vi(vsub_vd_vd_vd(x, vmul_vd_vd_vd(y, vcast_vd_d(0.25))), vi);
#else
  vdouble c = vmulsign_vd_vd_vd(vcast_vd_d(INT64_C(1) << 52), x);
  vdouble rint4x = vsel_vd_vo_vd_vd(vgt_vo_vd_vd(vabs_vd_vd(vmul_vd_vd_vd(vcast_vd_d(4), x)), vcast_vd_d(INT64_C(1) << 52)),
				    vmul_vd_vd_vd(vcast_vd_d(4), x),
				    vorsign_vd_vd_vd(vsub_vd_vd_vd(vmla_vd_vd_vd_vd(vcast_vd_d(4), x, c), c), x));
  vdouble rintx  = vsel_vd_vo_vd_vd(vgt_vo_vd_vd(vabs_vd_vd(x), vcast_vd_d(INT64_C(1) << 52)),
				    x, vorsign_vd_vd_vd(vsub_vd_vd_vd(vadd_vd_vd_vd(x, c), c), x));
  return disetdi_di_vd_vi(vmla_vd_vd_vd_vd(vcast_vd_d(-0.25), rint4x, x),
			  vtruncate_vi_vd(vmla_vd_vd_vd_vd(vcast_vd_d(-4), rintx, rint4x)));
#endif
}

// Table-driven large-argument trig reduction (uses the precomputed
// Sleef_rempitabdp table of 2/pi fragments, indexed by exponent). Returns the
// reduced argument times pi/2 as a double-double plus the quadrant count.
static INLINE CONST ddi_t rempi(vdouble a) {
  vdouble2 x, y, z;
  vint ex = vilogb2k_vi_vd(a);
#if defined(ENABLE_AVX512F) || defined(ENABLE_AVX512FNOFMA)
  ex = vandnot_vi_vi_vi(vsra_vi_vi_i(ex, 31), ex);
  ex = vand_vi_vi_vi(ex, vcast_vi_i(1023));
#endif
  ex = vsub_vi_vi_vi(ex, vcast_vi_i(55));
  // Pre-scale huge arguments down by 2^64 so table products stay finite.
  vint q = vand_vi_vo_vi(vgt_vo_vi_vi(ex, vcast_vi_i(700-55)), vcast_vi_i(-64));
  a = vldexp3_vd_vd_vi(a, q);
  ex = vandnot_vi_vi_vi(vsra_vi_vi_i(ex, 31), ex);
  ex = vsll_vi_vi_i(ex, 2);
  // Three-stage multiply-accumulate against consecutive table entries,
  // renormalizing and folding whole quadrants out after each stage.
  x = ddmul_vd2_vd_vd(a, vgather_vd_p_vi(Sleef_rempitabdp, ex));
  di_t di = rempisub(vd2getx_vd_vd2(x));
  q = digeti_vi_di(di);
  x = vd2setx_vd2_vd2_vd(x, digetd_vd_di(di));
  x = ddnormalize_vd2_vd2(x);
  y = ddmul_vd2_vd_vd(a, vgather_vd_p_vi(Sleef_rempitabdp+1, ex));
  x = ddadd2_vd2_vd2_vd2(x, y);
  di = rempisub(vd2getx_vd_vd2(x));
  q = vadd_vi_vi_vi(q, digeti_vi_di(di));
  x = vd2setx_vd2_vd2_vd(x, digetd_vd_di(di));
  x = ddnormalize_vd2_vd2(x);
  y = vcast_vd2_vd_vd(vgather_vd_p_vi(Sleef_rempitabdp+2, ex), vgather_vd_p_vi(Sleef_rempitabdp+3, ex));
  y = ddmul_vd2_vd2_vd(y, a);
  x = ddadd2_vd2_vd2_vd2(x, y);
  x = ddnormalize_vd2_vd2(x);
  x = ddmul_vd2_vd2_vd2(x, vcast_vd2_d_d(3.141592653589793116*2, 1.2246467991473532072e-16*2));
  // Small inputs (|a| < 0.7) bypass the reduction entirely.
  vopmask o = vlt_vo_vd_vd(vabs_vd_vd(a), vcast_vd_d(0.7));
  x = vd2setx_vd2_vd2_vd(x, vsel_vd_vo_vd_vd(o, a, vd2getx_vd_vd2(x)));
  x = vd2sety_vd2_vd2_vd(x, vreinterpret_vd_vm(vandnot_vm_vo64_vm(o, vreinterpret_vm_vd(vd2gety_vd_vd2(x)))));
  return ddisetddi_ddi_vd2_vi(x, q);
}

// Vector sin, 3.5-ULP accuracy class. Three-tier argument reduction:
// Cody-Waite with 2 constants (|d| < TRIGRANGEMAX2), Cody-Waite with 4 split
// constants (|d| < TRIGRANGEMAX), then full table reduction via rempi().
EXPORT CONST VECTOR_CC vdouble xsin(vdouble d) {
#if !defined(DETERMINISTIC)
  // The SIMD source files(sleefsimd?p.c) are compiled twice for each
  // vector extension, with DETERMINISTIC macro turned on and off.
  // Below is the normal(faster) implementation of sin function. The
  // function name xsin will be renamed to Sleef_sind2_u35sse2 with
  // renamesse2.h, for example.
  vdouble u, s, r = d;
  vint ql;

  if (LIKELY(vtestallones_i_vo64(vlt_vo_vd_vd(vabs_vd_vd(d), vcast_vd_d(TRIGRANGEMAX2))))) {
    vdouble dql = vrint_vd_vd(vmul_vd_vd_vd(d, vcast_vd_d(M_1_PI)));
    ql = vrint_vi_vd(dql);
    d = vmla_vd_vd_vd_vd(dql, vcast_vd_d(-PI_A2), d);
    d = vmla_vd_vd_vd_vd(dql, vcast_vd_d(-PI_B2), d);
  } else if (LIKELY(vtestallones_i_vo64(vlt_vo_vd_vd(vabs_vd_vd(d), vcast_vd_d(TRIGRANGEMAX))))) {
    // Split the quotient into a high part (multiple of 2^24) and a low part
    // so each PI_A..PI_D product is exact.
    vdouble dqh = vtruncate_vd_vd(vmul_vd_vd_vd(d, vcast_vd_d(M_1_PI / (1 << 24))));
    dqh = vmul_vd_vd_vd(dqh, vcast_vd_d(1 << 24));
    vdouble dql = vrint_vd_vd(vmlapn_vd_vd_vd_vd(d, vcast_vd_d(M_1_PI), dqh));
    ql = vrint_vi_vd(dql);

    d = vmla_vd_vd_vd_vd(dqh, vcast_vd_d(-PI_A), d);
    d = vmla_vd_vd_vd_vd(dql, vcast_vd_d(-PI_A), d);
    d = vmla_vd_vd_vd_vd(dqh, vcast_vd_d(-PI_B), d);
    d = vmla_vd_vd_vd_vd(dql, vcast_vd_d(-PI_B), d);
    d = vmla_vd_vd_vd_vd(dqh, vcast_vd_d(-PI_C), d);
    d = vmla_vd_vd_vd_vd(dql, vcast_vd_d(-PI_C), d);
    d = vmla_vd_vd_vd_vd(vadd_vd_vd_vd(dqh, dql), vcast_vd_d(-PI_D), d);
  } else {
    // rempi returns quadrants of pi/2; convert to quadrants of pi and fold
    // odd pi/2-quadrants back by subtracting pi/2 with the matching sign.
    ddi_t ddi = rempi(d);
    ql = vand_vi_vi_vi(ddigeti_vi_ddi(ddi), vcast_vi_i(3));
    ql = vadd_vi_vi_vi(vadd_vi_vi_vi(ql, ql), vsel_vi_vo_vi_vi(vcast_vo32_vo64(vgt_vo_vd_vd(vd2getx_vd_vd2(ddigetdd_vd2_ddi(ddi)), vcast_vd_d(0))), vcast_vi_i(2), vcast_vi_i(1)));
    ql = vsra_vi_vi_i(ql, 2);
    vopmask o = veq_vo_vi_vi(vand_vi_vi_vi(ddigeti_vi_ddi(ddi), vcast_vi_i(1)), vcast_vi_i(1));
    vdouble2 x = vcast_vd2_vd_vd(vmulsign_vd_vd_vd(vcast_vd_d(-3.141592653589793116 * 0.5), vd2getx_vd_vd2(ddigetdd_vd2_ddi(ddi))),
				 vmulsign_vd_vd_vd(vcast_vd_d(-1.2246467991473532072e-16 * 0.5), vd2getx_vd_vd2(ddigetdd_vd2_ddi(ddi))));
    x = ddadd2_vd2_vd2_vd2(ddigetdd_vd2_ddi(ddi), x);
    ddi = ddisetdd_ddi_ddi_vd2(ddi, vsel_vd2_vo_vd2_vd2(vcast_vo64_vo32(o), x, ddigetdd_vd2_ddi(ddi)));
    d = vadd_vd_vd_vd(vd2getx_vd_vd2(ddigetdd_vd2_ddi(ddi)), vd2gety_vd_vd2(ddigetdd_vd2_ddi(ddi)));
    // inf/NaN inputs propagate to NaN.
    d = vreinterpret_vd_vm(vor_vm_vo64_vm(vor_vo_vo_vo(visinf_vo_vd(r), visnan_vo_vd(r)), vreinterpret_vm_vd(d)));
  }

  s = vmul_vd_vd_vd(d, d);

  // Odd quadrants negate the reduced argument (sin(x + pi) = -sin(x)).
  d = vreinterpret_vd_vm(vxor_vm_vm_vm(vand_vm_vo64_vm(vcast_vo64_vo32(veq_vo_vi_vi(vand_vi_vi_vi(ql, vcast_vi_i(1)), vcast_vi_i(1))), vreinterpret_vm_vd(vcast_vd_d(-0.0))), vreinterpret_vm_vd(d)));

  vdouble s2 = vmul_vd_vd_vd(s, s), s4 = vmul_vd_vd_vd(s2, s2);
  u = POLY8(s, s2, s4,
	    -7.97255955009037868891952e-18,
	    2.81009972710863200091251e-15,
	    -7.64712219118158833288484e-13,
	    1.60590430605664501629054e-10,
	    -2.50521083763502045810755e-08,
	    2.75573192239198747630416e-06,
	    -0.000198412698412696162806809,
	    0.00833333333333332974823815);
  u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(-0.166666666666666657414808));

  u = vadd_vd_vd_vd(vmul_vd_vd_vd(s, vmul_vd_vd_vd(u, d)), d);

  u = vsel_vd_vo_vd_vd(visnegzero_vo_vd(r), r, u); // sin(-0.0) = -0.0

  return u;
#else // #if !defined(DETERMINISTIC)

  // This is the deterministic implementation of sin function. Returned
  // values from deterministic functions are bitwise consistent across
  // all platforms. The function name xsin will be renamed to
  // Sleef_cinz_sind2_u35sse2 with renamesse2.h, for example. The
  // renaming by rename*.h is switched according to DETERMINISTIC macro.
  vdouble u, s, r = d;
  vint ql;

  // Always run the cheap reduction, then patch lanes that needed more via
  // per-lane selects — branch outcomes never affect lane values.
  vdouble dql = vrint_vd_vd(vmul_vd_vd_vd(d, vcast_vd_d(M_1_PI)));
  ql = vrint_vi_vd(dql);
  d = vmla_vd_vd_vd_vd(dql, vcast_vd_d(-PI_A2), d);
  d = vmla_vd_vd_vd_vd(dql, vcast_vd_d(-PI_B2), d);
  vopmask g = vlt_vo_vd_vd(vabs_vd_vd(r), vcast_vd_d(TRIGRANGEMAX2));

  if (!LIKELY(vtestallones_i_vo64(g))) {
    vdouble dqh = vtruncate_vd_vd(vmul_vd_vd_vd(r, vcast_vd_d(M_1_PI / (1 << 24))));
    dqh = vmul_vd_vd_vd(dqh, vcast_vd_d(1 << 24));
    vdouble dql = vrint_vd_vd(vmlapn_vd_vd_vd_vd(r, vcast_vd_d(M_1_PI), dqh));

    u = vmla_vd_vd_vd_vd(dqh, vcast_vd_d(-PI_A), r);
    u = vmla_vd_vd_vd_vd(dql, vcast_vd_d(-PI_A), u);
    u = vmla_vd_vd_vd_vd(dqh, vcast_vd_d(-PI_B), u);
    u = vmla_vd_vd_vd_vd(dql, vcast_vd_d(-PI_B), u);
    u = vmla_vd_vd_vd_vd(dqh, vcast_vd_d(-PI_C), u);
    u = vmla_vd_vd_vd_vd(dql, vcast_vd_d(-PI_C), u);
    u = vmla_vd_vd_vd_vd(vadd_vd_vd_vd(dqh, dql), vcast_vd_d(-PI_D), u);

    ql = vsel_vi_vo_vi_vi(vcast_vo32_vo64(g), ql, vrint_vi_vd(dql));
    d = vsel_vd_vo_vd_vd(g, d, u);
    g = vlt_vo_vd_vd(vabs_vd_vd(r), vcast_vd_d(TRIGRANGEMAX));

    if (!LIKELY(vtestallones_i_vo64(g))) {
      ddi_t ddi = rempi(r);
      vint ql2 = vand_vi_vi_vi(ddigeti_vi_ddi(ddi), vcast_vi_i(3));
      ql2 = vadd_vi_vi_vi(vadd_vi_vi_vi(ql2, ql2), vsel_vi_vo_vi_vi(vcast_vo32_vo64(vgt_vo_vd_vd(vd2getx_vd_vd2(ddigetdd_vd2_ddi(ddi)), vcast_vd_d(0))), vcast_vi_i(2), vcast_vi_i(1)));
      ql2 = vsra_vi_vi_i(ql2, 2);
      vopmask o = veq_vo_vi_vi(vand_vi_vi_vi(ddigeti_vi_ddi(ddi), vcast_vi_i(1)), vcast_vi_i(1));
      vdouble2 x = vcast_vd2_vd_vd(vmulsign_vd_vd_vd(vcast_vd_d(-3.141592653589793116 * 0.5), vd2getx_vd_vd2(ddigetdd_vd2_ddi(ddi))),
				   vmulsign_vd_vd_vd(vcast_vd_d(-1.2246467991473532072e-16 * 0.5), vd2getx_vd_vd2(ddigetdd_vd2_ddi(ddi))));
      x = ddadd2_vd2_vd2_vd2(ddigetdd_vd2_ddi(ddi), x);
      ddi = ddisetdd_ddi_ddi_vd2(ddi, vsel_vd2_vo_vd2_vd2(vcast_vo64_vo32(o), x, ddigetdd_vd2_ddi(ddi)));
      u = vadd_vd_vd_vd(vd2getx_vd_vd2(ddigetdd_vd2_ddi(ddi)), vd2gety_vd_vd2(ddigetdd_vd2_ddi(ddi)));
      ql = vsel_vi_vo_vi_vi(vcast_vo32_vo64(g), ql, ql2);
      d = vsel_vd_vo_vd_vd(g, d, u);
      d = vreinterpret_vd_vm(vor_vm_vo64_vm(vor_vo_vo_vo(visinf_vo_vd(r), visnan_vo_vd(r)), vreinterpret_vm_vd(d)));
    }
  }

  s = vmul_vd_vd_vd(d, d);

  d = vreinterpret_vd_vm(vxor_vm_vm_vm(vand_vm_vo64_vm(vcast_vo64_vo32(veq_vo_vi_vi(vand_vi_vi_vi(ql, vcast_vi_i(1)), vcast_vi_i(1))), vreinterpret_vm_vd(vcast_vd_d(-0.0))), vreinterpret_vm_vd(d)));

  vdouble s2 = vmul_vd_vd_vd(s, s), s4 = vmul_vd_vd_vd(s2, s2);
  u = POLY8(s, s2, s4,
	    -7.97255955009037868891952e-18,
	    2.81009972710863200091251e-15,
	    -7.64712219118158833288484e-13,
	    1.60590430605664501629054e-10,
	    -2.50521083763502045810755e-08,
	    2.75573192239198747630416e-06,
	    -0.000198412698412696162806809,
	    0.00833333333333332974823815);
  u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(-0.166666666666666657414808));

  u = vadd_vd_vd_vd(vmul_vd_vd_vd(s, vmul_vd_vd_vd(u, d)), d);

  u = vsel_vd_vo_vd_vd(visnegzero_vo_vd(r), r, u);

  return u;
#endif // #if !defined(DETERMINISTIC)
}

// Vector sin, 1-ULP accuracy class: same three-tier reduction as xsin but
// the reduced argument and the final polynomial are carried in
// double-double (vdouble2) precision.
EXPORT CONST VECTOR_CC vdouble xsin_u1(vdouble d) {
#if !defined(DETERMINISTIC)
  vdouble u;
  vdouble2 s, t, x;
  vint ql;

  if (LIKELY(vtestallones_i_vo64(vlt_vo_vd_vd(vabs_vd_vd(d), vcast_vd_d(TRIGRANGEMAX2))))) {
    const vdouble dql = vrint_vd_vd(vmul_vd_vd_vd(d, vcast_vd_d(M_1_PI)));
    ql = vrint_vi_vd(dql);
    u = vmla_vd_vd_vd_vd(dql, vcast_vd_d(-PI_A2), d);
    s = ddadd_vd2_vd_vd (u, vmul_vd_vd_vd(dql, vcast_vd_d(-PI_B2)));
  } else if (LIKELY(vtestallones_i_vo64(vlt_vo_vd_vd(vabs_vd_vd(d), vcast_vd_d(TRIGRANGEMAX))))) {
    vdouble dqh = vtruncate_vd_vd(vmul_vd_vd_vd(d, vcast_vd_d(M_1_PI / (1 << 24))));
    dqh = vmul_vd_vd_vd(dqh, vcast_vd_d(1 << 24));
    const vdouble dql = vrint_vd_vd(vmlapn_vd_vd_vd_vd(d, vcast_vd_d(M_1_PI), dqh));
    ql = vrint_vi_vd(dql);

    u = vmla_vd_vd_vd_vd(dqh, vcast_vd_d(-PI_A), d);
    s = ddadd_vd2_vd_vd  (u, vmul_vd_vd_vd(dql, vcast_vd_d(-PI_A)));
    s = ddadd2_vd2_vd2_vd(s, vmul_vd_vd_vd(dqh, vcast_vd_d(-PI_B)));
    s = ddadd2_vd2_vd2_vd(s, vmul_vd_vd_vd(dql, vcast_vd_d(-PI_B)));
    s = ddadd2_vd2_vd2_vd(s, vmul_vd_vd_vd(dqh, vcast_vd_d(-PI_C)));
    s = ddadd2_vd2_vd2_vd(s, vmul_vd_vd_vd(dql, vcast_vd_d(-PI_C)));
    s = ddadd_vd2_vd2_vd (s, vmul_vd_vd_vd(vadd_vd_vd_vd(dqh, dql), vcast_vd_d(-PI_D)));
  } else {
    ddi_t ddi = rempi(d);
    ql = vand_vi_vi_vi(ddigeti_vi_ddi(ddi), vcast_vi_i(3));
    ql = vadd_vi_vi_vi(vadd_vi_vi_vi(ql, ql), vsel_vi_vo_vi_vi(vcast_vo32_vo64(vgt_vo_vd_vd(vd2getx_vd_vd2(ddigetdd_vd2_ddi(ddi)), vcast_vd_d(0))), vcast_vi_i(2), vcast_vi_i(1)));
    ql = vsra_vi_vi_i(ql, 2);
    vopmask o = veq_vo_vi_vi(vand_vi_vi_vi(ddigeti_vi_ddi(ddi), vcast_vi_i(1)), vcast_vi_i(1));
    vdouble2 x = vcast_vd2_vd_vd(vmulsign_vd_vd_vd(vcast_vd_d(-3.141592653589793116 * 0.5), vd2getx_vd_vd2(ddigetdd_vd2_ddi(ddi))),
				 vmulsign_vd_vd_vd(vcast_vd_d(-1.2246467991473532072e-16 * 0.5), vd2getx_vd_vd2(ddigetdd_vd2_ddi(ddi))));
    x = ddadd2_vd2_vd2_vd2(ddigetdd_vd2_ddi(ddi), x);
    ddi = ddisetdd_ddi_ddi_vd2(ddi, vsel_vd2_vo_vd2_vd2(vcast_vo64_vo32(o), x, ddigetdd_vd2_ddi(ddi)));
    s = ddnormalize_vd2_vd2(ddigetdd_vd2_ddi(ddi));
    s = vd2setx_vd2_vd2_vd(s, vreinterpret_vd_vm(vor_vm_vo64_vm(vor_vo_vo_vo(visinf_vo_vd(d), visnan_vo_vd(d)), vreinterpret_vm_vd(vd2getx_vd_vd2(s)))));
  }

  t = s;
  s = ddsqu_vd2_vd2(s);

  vdouble s2 = vmul_vd_vd_vd(vd2getx_vd_vd2(s), vd2getx_vd_vd2(s)), s4 = vmul_vd_vd_vd(s2, s2);
  u = POLY6(vd2getx_vd_vd2(s), s2, s4,
	    2.72052416138529567917983e-15,
	    -7.6429259411395447190023e-13,
	    1.60589370117277896211623e-10,
	    -2.5052106814843123359368e-08,
	    2.75573192104428224777379e-06,
	    -0.000198412698412046454654947);
  u = vmla_vd_vd_vd_vd(u, vd2getx_vd_vd2(s), vcast_vd_d(0.00833333333333318056201922));

  x = ddadd_vd2_vd_vd2(vcast_vd_d(1), ddmul_vd2_vd2_vd2(ddadd_vd2_vd_vd(vcast_vd_d(-0.166666666666666657414808), vmul_vd_vd_vd(u, vd2getx_vd_vd2(s))), s));
  u = ddmul_vd_vd2_vd2(t, x);

  u = vreinterpret_vd_vm(vxor_vm_vm_vm(vand_vm_vo64_vm(vcast_vo64_vo32(veq_vo_vi_vi(vand_vi_vi_vi(ql, vcast_vi_i(1)), vcast_vi_i(1))), vreinterpret_vm_vd(vcast_vd_d(-0.0))), vreinterpret_vm_vd(u)));
  u = vsel_vd_vo_vd_vd(veq_vo_vd_vd(d, vcast_vd_d(0)), d, u); // preserve +-0.0

  return u;
#else // #if !defined(DETERMINISTIC)

  // Deterministic variant: cheap reduction always runs, wider reductions
  // patched in per-lane (see xsin above for the pattern).
  vdouble u;
  vdouble2 s, t, x;
  vint ql;

  vopmask g = vlt_vo_vd_vd(vabs_vd_vd(d), vcast_vd_d(TRIGRANGEMAX2));
  vdouble dql = vrint_vd_vd(vmul_vd_vd_vd(d, vcast_vd_d(M_1_PI)));
  ql = vrint_vi_vd(dql);
  u = vmla_vd_vd_vd_vd(dql, vcast_vd_d(-PI_A2), d);
  x = ddadd_vd2_vd_vd (u, vmul_vd_vd_vd(dql, vcast_vd_d(-PI_B2)));

  if (!LIKELY(vtestallones_i_vo64(g))) {
    vdouble dqh = vtruncate_vd_vd(vmul_vd_vd_vd(d, vcast_vd_d(M_1_PI / (1 << 24))));
    dqh = vmul_vd_vd_vd(dqh, vcast_vd_d(1 << 24));
    const vdouble dql = vrint_vd_vd(vmlapn_vd_vd_vd_vd(d, vcast_vd_d(M_1_PI), dqh));

    u = vmla_vd_vd_vd_vd(dqh, vcast_vd_d(-PI_A), d);
    s = ddadd_vd2_vd_vd  (u, vmul_vd_vd_vd(dql, vcast_vd_d(-PI_A)));
    s = ddadd2_vd2_vd2_vd(s, vmul_vd_vd_vd(dqh, vcast_vd_d(-PI_B)));
    s = ddadd2_vd2_vd2_vd(s, vmul_vd_vd_vd(dql, vcast_vd_d(-PI_B)));
    s = ddadd2_vd2_vd2_vd(s, vmul_vd_vd_vd(dqh, vcast_vd_d(-PI_C)));
    s = ddadd2_vd2_vd2_vd(s, vmul_vd_vd_vd(dql, vcast_vd_d(-PI_C)));
    s = ddadd_vd2_vd2_vd (s, vmul_vd_vd_vd(vadd_vd_vd_vd(dqh, dql), vcast_vd_d(-PI_D)));

    ql = vsel_vi_vo_vi_vi(vcast_vo32_vo64(g), ql, vrint_vi_vd(dql));
    x = vsel_vd2_vo_vd2_vd2(g, x, s);
    g = vlt_vo_vd_vd(vabs_vd_vd(d), vcast_vd_d(TRIGRANGEMAX));

    if (!LIKELY(vtestallones_i_vo64(g))) {
      ddi_t ddi = rempi(d);
      vint ql2 = vand_vi_vi_vi(ddigeti_vi_ddi(ddi), vcast_vi_i(3));
      ql2 = vadd_vi_vi_vi(vadd_vi_vi_vi(ql2, ql2), vsel_vi_vo_vi_vi(vcast_vo32_vo64(vgt_vo_vd_vd(vd2getx_vd_vd2(ddigetdd_vd2_ddi(ddi)), vcast_vd_d(0))), vcast_vi_i(2), vcast_vi_i(1)));
      ql2 = vsra_vi_vi_i(ql2, 2);
      vopmask o = veq_vo_vi_vi(vand_vi_vi_vi(ddigeti_vi_ddi(ddi), vcast_vi_i(1)), vcast_vi_i(1));
      vdouble2 t = vcast_vd2_vd_vd(vmulsign_vd_vd_vd(vcast_vd_d(-3.141592653589793116 * 0.5), vd2getx_vd_vd2(ddigetdd_vd2_ddi(ddi))),
				   vmulsign_vd_vd_vd(vcast_vd_d(-1.2246467991473532072e-16 * 0.5), vd2getx_vd_vd2(ddigetdd_vd2_ddi(ddi))));
      t = ddadd2_vd2_vd2_vd2(ddigetdd_vd2_ddi(ddi), t);
      ddi = ddisetdd_ddi_ddi_vd2(ddi, vsel_vd2_vo_vd2_vd2(vcast_vo64_vo32(o), t, ddigetdd_vd2_ddi(ddi)));
      s = ddnormalize_vd2_vd2(ddigetdd_vd2_ddi(ddi));
      ql = vsel_vi_vo_vi_vi(vcast_vo32_vo64(g), ql, ql2);
      x = vsel_vd2_vo_vd2_vd2(g, x, s);
      x = vd2setx_vd2_vd2_vd(x, vreinterpret_vd_vm(vor_vm_vo64_vm(vor_vo_vo_vo(visinf_vo_vd(d), visnan_vo_vd(d)), vreinterpret_vm_vd(vd2getx_vd_vd2(x)))));
    }
  }

  t = x;
  s = ddsqu_vd2_vd2(x);

  vdouble s2 = vmul_vd_vd_vd(vd2getx_vd_vd2(s), vd2getx_vd_vd2(s)), s4 = vmul_vd_vd_vd(s2, s2);
  u = POLY6(vd2getx_vd_vd2(s), s2, s4,
	    2.72052416138529567917983e-15,
	    -7.6429259411395447190023e-13,
	    1.60589370117277896211623e-10,
	    -2.5052106814843123359368e-08,
	    2.75573192104428224777379e-06,
	    -0.000198412698412046454654947);
  u = vmla_vd_vd_vd_vd(u, vd2getx_vd_vd2(s), vcast_vd_d(0.00833333333333318056201922));

  x = ddadd_vd2_vd_vd2(vcast_vd_d(1), ddmul_vd2_vd2_vd2(ddadd_vd2_vd_vd(vcast_vd_d(-0.166666666666666657414808), vmul_vd_vd_vd(u, vd2getx_vd_vd2(s))), s));
  u = ddmul_vd_vd2_vd2(t, x);

  u = vreinterpret_vd_vm(vxor_vm_vm_vm(vand_vm_vo64_vm(vcast_vo64_vo32(veq_vo_vi_vi(vand_vi_vi_vi(ql, vcast_vi_i(1)), vcast_vi_i(1))), vreinterpret_vm_vd(vcast_vd_d(-0.0))), vreinterpret_vm_vd(u)));
  u = vsel_vd_vo_vd_vd(veq_vo_vd_vd(d, vcast_vd_d(0)), d, u);

  return u;
#endif // #if !defined(DETERMINISTIC)
}

// NOTE(review): xcos continues past the end of this chunk — head left verbatim.
EXPORT CONST VECTOR_CC vdouble xcos(vdouble d) {
#if !defined(DETERMINISTIC)
  vdouble u, s, r = d;
  vint ql;

  if (LIKELY(vtestallones_i_vo64(vlt_vo_vd_vd(vabs_vd_vd(d), vcast_vd_d(TRIGRANGEMAX2))))) {
    vdouble dql = vmla_vd_vd_vd_vd(vcast_vd_d(2), vrint_vd_vd(vmla_vd_vd_vd_vd(d, vcast_vd_d(M_1_PI), vcast_vd_d(-0.5))), vcast_vd_d(1));
    ql = vrint_vi_vd(dql);
    d = vmla_vd_vd_vd_vd(dql, vcast_vd_d(-PI_A2 * 0.5), d);
    d = vmla_vd_vd_vd_vd(dql, vcast_vd_d(-PI_B2 * 0.5), d);
  } else if (LIKELY(vtestallones_i_vo64(vlt_vo_vd_vd(vabs_vd_vd(d), vcast_vd_d(TRIGRANGEMAX))))) {
    vdouble dqh = vtruncate_vd_vd(vmla_vd_vd_vd_vd(d, vcast_vd_d(M_1_PI / (1 << 23)), vcast_vd_d(-M_1_PI / (1 << 24))));
    ql = vrint_vi_vd(vadd_vd_vd_vd(vmul_vd_vd_vd(d, vcast_vd_d(M_1_PI)),
vmla_vd_vd_vd_vd(dqh, vcast_vd_d(-(1 << 23)), vcast_vd_d(-0.5)))); dqh = vmul_vd_vd_vd(dqh, vcast_vd_d(1 << 24)); ql = vadd_vi_vi_vi(vadd_vi_vi_vi(ql, ql), vcast_vi_i(1)); vdouble dql = vcast_vd_vi(ql); d = vmla_vd_vd_vd_vd(dqh, vcast_vd_d(-PI_A * 0.5), d); d = vmla_vd_vd_vd_vd(dql, vcast_vd_d(-PI_A * 0.5), d); d = vmla_vd_vd_vd_vd(dqh, vcast_vd_d(-PI_B * 0.5), d); d = vmla_vd_vd_vd_vd(dql, vcast_vd_d(-PI_B * 0.5), d); d = vmla_vd_vd_vd_vd(dqh, vcast_vd_d(-PI_C * 0.5), d); d = vmla_vd_vd_vd_vd(dql, vcast_vd_d(-PI_C * 0.5), d); d = vmla_vd_vd_vd_vd(vadd_vd_vd_vd(dqh, dql), vcast_vd_d(-PI_D * 0.5), d); } else { ddi_t ddi = rempi(d); ql = vand_vi_vi_vi(ddigeti_vi_ddi(ddi), vcast_vi_i(3)); ql = vadd_vi_vi_vi(vadd_vi_vi_vi(ql, ql), vsel_vi_vo_vi_vi(vcast_vo32_vo64(vgt_vo_vd_vd(vd2getx_vd_vd2(ddigetdd_vd2_ddi(ddi)), vcast_vd_d(0))), vcast_vi_i(8), vcast_vi_i(7))); ql = vsra_vi_vi_i(ql, 1); vopmask o = veq_vo_vi_vi(vand_vi_vi_vi(ddigeti_vi_ddi(ddi), vcast_vi_i(1)), vcast_vi_i(0)); vdouble y = vsel_vd_vo_vd_vd(vgt_vo_vd_vd(vd2getx_vd_vd2(ddigetdd_vd2_ddi(ddi)), vcast_vd_d(0)), vcast_vd_d(0), vcast_vd_d(-1)); vdouble2 x = vcast_vd2_vd_vd(vmulsign_vd_vd_vd(vcast_vd_d(-3.141592653589793116 * 0.5), y), vmulsign_vd_vd_vd(vcast_vd_d(-1.2246467991473532072e-16 * 0.5), y)); x = ddadd2_vd2_vd2_vd2(ddigetdd_vd2_ddi(ddi), x); ddi = ddisetdd_ddi_ddi_vd2(ddi, vsel_vd2_vo_vd2_vd2(vcast_vo64_vo32(o), x, ddigetdd_vd2_ddi(ddi))); d = vadd_vd_vd_vd(vd2getx_vd_vd2(ddigetdd_vd2_ddi(ddi)), vd2gety_vd_vd2(ddigetdd_vd2_ddi(ddi))); d = vreinterpret_vd_vm(vor_vm_vo64_vm(vor_vo_vo_vo(visinf_vo_vd(r), visnan_vo_vd(r)), vreinterpret_vm_vd(d))); } s = vmul_vd_vd_vd(d, d); d = vreinterpret_vd_vm(vxor_vm_vm_vm(vand_vm_vo64_vm(vcast_vo64_vo32(veq_vo_vi_vi(vand_vi_vi_vi(ql, vcast_vi_i(2)), vcast_vi_i(0))), vreinterpret_vm_vd(vcast_vd_d(-0.0))), vreinterpret_vm_vd(d))); vdouble s2 = vmul_vd_vd_vd(s, s), s4 = vmul_vd_vd_vd(s2, s2); u = POLY8(s, s2, s4, -7.97255955009037868891952e-18, 
2.81009972710863200091251e-15, -7.64712219118158833288484e-13, 1.60590430605664501629054e-10, -2.50521083763502045810755e-08, 2.75573192239198747630416e-06, -0.000198412698412696162806809, 0.00833333333333332974823815); u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(-0.166666666666666657414808)); u = vadd_vd_vd_vd(vmul_vd_vd_vd(s, vmul_vd_vd_vd(u, d)), d); return u; #else // #if !defined(DETERMINISTIC) vdouble u, s, r = d; vint ql; vopmask g = vlt_vo_vd_vd(vabs_vd_vd(d), vcast_vd_d(TRIGRANGEMAX2)); vdouble dql = vmla_vd_vd_vd_vd(vcast_vd_d(2), vrint_vd_vd(vmla_vd_vd_vd_vd(d, vcast_vd_d(M_1_PI), vcast_vd_d(-0.5))), vcast_vd_d(1)); ql = vrint_vi_vd(dql); d = vmla_vd_vd_vd_vd(dql, vcast_vd_d(-PI_A2 * 0.5), d); d = vmla_vd_vd_vd_vd(dql, vcast_vd_d(-PI_B2 * 0.5), d); if (!LIKELY(vtestallones_i_vo64(g))) { vdouble dqh = vtruncate_vd_vd(vmla_vd_vd_vd_vd(r, vcast_vd_d(M_1_PI / (1 << 23)), vcast_vd_d(-M_1_PI / (1 << 24)))); vint ql2 = vrint_vi_vd(vadd_vd_vd_vd(vmul_vd_vd_vd(r, vcast_vd_d(M_1_PI)), vmla_vd_vd_vd_vd(dqh, vcast_vd_d(-(1 << 23)), vcast_vd_d(-0.5)))); dqh = vmul_vd_vd_vd(dqh, vcast_vd_d(1 << 24)); ql2 = vadd_vi_vi_vi(vadd_vi_vi_vi(ql2, ql2), vcast_vi_i(1)); vdouble dql = vcast_vd_vi(ql2); u = vmla_vd_vd_vd_vd(dqh, vcast_vd_d(-PI_A * 0.5), r); u = vmla_vd_vd_vd_vd(dql, vcast_vd_d(-PI_A * 0.5), u); u = vmla_vd_vd_vd_vd(dqh, vcast_vd_d(-PI_B * 0.5), u); u = vmla_vd_vd_vd_vd(dql, vcast_vd_d(-PI_B * 0.5), u); u = vmla_vd_vd_vd_vd(dqh, vcast_vd_d(-PI_C * 0.5), u); u = vmla_vd_vd_vd_vd(dql, vcast_vd_d(-PI_C * 0.5), u); u = vmla_vd_vd_vd_vd(vadd_vd_vd_vd(dqh, dql), vcast_vd_d(-PI_D * 0.5), u); ql = vsel_vi_vo_vi_vi(vcast_vo32_vo64(g), ql, ql2); d = vsel_vd_vo_vd_vd(g, d, u); g = vlt_vo_vd_vd(vabs_vd_vd(r), vcast_vd_d(TRIGRANGEMAX)); if (!LIKELY(vtestallones_i_vo64(g))) { ddi_t ddi = rempi(r); vint ql2 = vand_vi_vi_vi(ddigeti_vi_ddi(ddi), vcast_vi_i(3)); ql2 = vadd_vi_vi_vi(vadd_vi_vi_vi(ql2, ql2), 
vsel_vi_vo_vi_vi(vcast_vo32_vo64(vgt_vo_vd_vd(vd2getx_vd_vd2(ddigetdd_vd2_ddi(ddi)), vcast_vd_d(0))), vcast_vi_i(8), vcast_vi_i(7))); ql2 = vsra_vi_vi_i(ql2, 1); vopmask o = veq_vo_vi_vi(vand_vi_vi_vi(ddigeti_vi_ddi(ddi), vcast_vi_i(1)), vcast_vi_i(0)); vdouble y = vsel_vd_vo_vd_vd(vgt_vo_vd_vd(vd2getx_vd_vd2(ddigetdd_vd2_ddi(ddi)), vcast_vd_d(0)), vcast_vd_d(0), vcast_vd_d(-1)); vdouble2 x = vcast_vd2_vd_vd(vmulsign_vd_vd_vd(vcast_vd_d(-3.141592653589793116 * 0.5), y), vmulsign_vd_vd_vd(vcast_vd_d(-1.2246467991473532072e-16 * 0.5), y)); x = ddadd2_vd2_vd2_vd2(ddigetdd_vd2_ddi(ddi), x); ddi = ddisetdd_ddi_ddi_vd2(ddi, vsel_vd2_vo_vd2_vd2(vcast_vo64_vo32(o), x, ddigetdd_vd2_ddi(ddi))); u = vadd_vd_vd_vd(vd2getx_vd_vd2(ddigetdd_vd2_ddi(ddi)), vd2gety_vd_vd2(ddigetdd_vd2_ddi(ddi))); ql = vsel_vi_vo_vi_vi(vcast_vo32_vo64(g), ql, ql2); d = vsel_vd_vo_vd_vd(g, d, u); d = vreinterpret_vd_vm(vor_vm_vo64_vm(vor_vo_vo_vo(visinf_vo_vd(r), visnan_vo_vd(r)), vreinterpret_vm_vd(d))); } } s = vmul_vd_vd_vd(d, d); d = vreinterpret_vd_vm(vxor_vm_vm_vm(vand_vm_vo64_vm(vcast_vo64_vo32(veq_vo_vi_vi(vand_vi_vi_vi(ql, vcast_vi_i(2)), vcast_vi_i(0))), vreinterpret_vm_vd(vcast_vd_d(-0.0))), vreinterpret_vm_vd(d))); vdouble s2 = vmul_vd_vd_vd(s, s), s4 = vmul_vd_vd_vd(s2, s2); u = POLY8(s, s2, s4, -7.97255955009037868891952e-18, 2.81009972710863200091251e-15, -7.64712219118158833288484e-13, 1.60590430605664501629054e-10, -2.50521083763502045810755e-08, 2.75573192239198747630416e-06, -0.000198412698412696162806809, 0.00833333333333332974823815); u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(-0.166666666666666657414808)); u = vadd_vd_vd_vd(vmul_vd_vd_vd(s, vmul_vd_vd_vd(u, d)), d); return u; #endif // #if !defined(DETERMINISTIC) } EXPORT CONST VECTOR_CC vdouble xcos_u1(vdouble d) { #if !defined(DETERMINISTIC) vdouble u; vdouble2 s, t, x; vint ql; if (LIKELY(vtestallones_i_vo64(vlt_vo_vd_vd(vabs_vd_vd(d), vcast_vd_d(TRIGRANGEMAX2))))) { vdouble dql = vrint_vd_vd(vmla_vd_vd_vd_vd(d, 
vcast_vd_d(M_1_PI), vcast_vd_d(-0.5)));
  // ---- xcos_u1 fast path continues: make dql the nearest odd multiple of
  // pi/2 and reduce with the two-part split, accumulating in double-double.
  dql = vmla_vd_vd_vd_vd(vcast_vd_d(2), dql, vcast_vd_d(1));
  ql = vrint_vi_vd(dql);
  s = ddadd2_vd2_vd_vd(d, vmul_vd_vd_vd(dql, vcast_vd_d(-PI_A2*0.5)));
  s = ddadd_vd2_vd2_vd(s, vmul_vd_vd_vd(dql, vcast_vd_d(-PI_B2*0.5)));
  } else if (LIKELY(vtestallones_i_vo64(vlt_vo_vd_vd(vabs_vd_vd(d), vcast_vd_d(TRIGRANGEMAX))))) {
    // Medium path: split quotient (dqh/dql) and four-part pi/2 reduction in
    // double-double.
    vdouble dqh = vtruncate_vd_vd(vmla_vd_vd_vd_vd(d, vcast_vd_d(M_1_PI / (1 << 23)), vcast_vd_d(-M_1_PI / (1 << 24))));
    ql = vrint_vi_vd(vadd_vd_vd_vd(vmul_vd_vd_vd(d, vcast_vd_d(M_1_PI)), vmla_vd_vd_vd_vd(dqh, vcast_vd_d(-(1 << 23)), vcast_vd_d(-0.5))));
    dqh = vmul_vd_vd_vd(dqh, vcast_vd_d(1 << 24));
    ql = vadd_vi_vi_vi(vadd_vi_vi_vi(ql, ql), vcast_vi_i(1));
    const vdouble dql = vcast_vd_vi(ql);
    u = vmla_vd_vd_vd_vd(dqh, vcast_vd_d(-PI_A * 0.5), d);
    s = ddadd2_vd2_vd_vd(u, vmul_vd_vd_vd(dql, vcast_vd_d(-PI_A*0.5)));
    s = ddadd2_vd2_vd2_vd(s, vmul_vd_vd_vd(dqh, vcast_vd_d(-PI_B*0.5)));
    s = ddadd2_vd2_vd2_vd(s, vmul_vd_vd_vd(dql, vcast_vd_d(-PI_B*0.5)));
    s = ddadd2_vd2_vd2_vd(s, vmul_vd_vd_vd(dqh, vcast_vd_d(-PI_C*0.5)));
    s = ddadd2_vd2_vd2_vd(s, vmul_vd_vd_vd(dql, vcast_vd_d(-PI_C*0.5)));
    s = ddadd_vd2_vd2_vd(s, vmul_vd_vd_vd(vadd_vd_vd_vd(dqh, dql), vcast_vd_d(-PI_D*0.5)));
  } else {
    // Huge-argument fallback via rempi (full-precision range reduction).
    ddi_t ddi = rempi(d);
    ql = vand_vi_vi_vi(ddigeti_vi_ddi(ddi), vcast_vi_i(3));
    ql = vadd_vi_vi_vi(vadd_vi_vi_vi(ql, ql), vsel_vi_vo_vi_vi(vcast_vo32_vo64(vgt_vo_vd_vd(vd2getx_vd_vd2(ddigetdd_vd2_ddi(ddi)), vcast_vd_d(0))), vcast_vi_i(8), vcast_vi_i(7)));
    ql = vsra_vi_vi_i(ql, 1);
    // For even quadrant bits, shift the remainder by +-pi/2 (head + tail).
    vopmask o = veq_vo_vi_vi(vand_vi_vi_vi(ddigeti_vi_ddi(ddi), vcast_vi_i(1)), vcast_vi_i(0));
    vdouble y = vsel_vd_vo_vd_vd(vgt_vo_vd_vd(vd2getx_vd_vd2(ddigetdd_vd2_ddi(ddi)), vcast_vd_d(0)), vcast_vd_d(0), vcast_vd_d(-1));
    vdouble2 x = vcast_vd2_vd_vd(vmulsign_vd_vd_vd(vcast_vd_d(-3.141592653589793116 * 0.5), y), vmulsign_vd_vd_vd(vcast_vd_d(-1.2246467991473532072e-16 * 0.5), y));
    x = ddadd2_vd2_vd2_vd2(ddigetdd_vd2_ddi(ddi), x);
    ddi = ddisetdd_ddi_ddi_vd2(ddi, vsel_vd2_vo_vd2_vd2(vcast_vo64_vo32(o), x, ddigetdd_vd2_ddi(ddi)));
    s = ddnormalize_vd2_vd2(ddigetdd_vd2_ddi(ddi));
    // Inf/NaN inputs -> NaN result.
    s = vd2setx_vd2_vd2_vd(s, vreinterpret_vd_vm(vor_vm_vo64_vm(vor_vo_vo_vo(visinf_vo_vd(d), visnan_vo_vd(d)), vreinterpret_vm_vd(vd2getx_vd_vd2(s)))));
  }
  // Double-double sine-polynomial core on the reduced argument.
  t = s;
  s = ddsqu_vd2_vd2(s);
  vdouble s2 = vmul_vd_vd_vd(vd2getx_vd_vd2(s), vd2getx_vd_vd2(s)), s4 = vmul_vd_vd_vd(s2, s2);
  u = POLY6(vd2getx_vd_vd2(s), s2, s4,
    2.72052416138529567917983e-15, -7.6429259411395447190023e-13,
    1.60589370117277896211623e-10, -2.5052106814843123359368e-08,
    2.75573192104428224777379e-06, -0.000198412698412046454654947);
  u = vmla_vd_vd_vd_vd(u, vd2getx_vd_vd2(s), vcast_vd_d(0.00833333333333318056201922));
  x = ddadd_vd2_vd_vd2(vcast_vd_d(1), ddmul_vd2_vd2_vd2(ddadd_vd2_vd_vd(vcast_vd_d(-0.166666666666666657414808), vmul_vd_vd_vd(u, vd2getx_vd_vd2(s))), s));
  u = ddmul_vd_vd2_vd2(t, x);
  // Flip the sign when (ql & 2) == 0.
  u = vreinterpret_vd_vm(vxor_vm_vm_vm(vand_vm_vo64_vm(vcast_vo64_vo32(veq_vo_vi_vi(vand_vi_vi_vi(ql, vcast_vi_i(2)), vcast_vi_i(0))), vreinterpret_vm_vd(vcast_vd_d(-0.0))), vreinterpret_vm_vd(u)));
  return u;
#else // #if !defined(DETERMINISTIC)
  // Deterministic variant of xcos_u1: unconditional fast-path reduction,
  // wider reductions merged per-lane via selects.
  vdouble u;
  vdouble2 s, t, x;
  vint ql;
  vopmask g = vlt_vo_vd_vd(vabs_vd_vd(d), vcast_vd_d(TRIGRANGEMAX2));
  vdouble dql = vrint_vd_vd(vmla_vd_vd_vd_vd(d, vcast_vd_d(M_1_PI), vcast_vd_d(-0.5)));
  dql = vmla_vd_vd_vd_vd(vcast_vd_d(2), dql, vcast_vd_d(1));
  ql = vrint_vi_vd(dql);
  x = ddadd2_vd2_vd_vd(d, vmul_vd_vd_vd(dql, vcast_vd_d(-PI_A2*0.5)));
  x = ddadd_vd2_vd2_vd(x, vmul_vd_vd_vd(dql, vcast_vd_d(-PI_B2*0.5)));
  if (!LIKELY(vtestallones_i_vo64(g))) {
    // Medium-range reduction for out-of-range lanes.
    vdouble dqh = vtruncate_vd_vd(vmla_vd_vd_vd_vd(d, vcast_vd_d(M_1_PI / (1 << 23)), vcast_vd_d(-M_1_PI / (1 << 24))));
    vint ql2 = vrint_vi_vd(vadd_vd_vd_vd(vmul_vd_vd_vd(d, vcast_vd_d(M_1_PI)), vmla_vd_vd_vd_vd(dqh, vcast_vd_d(-(1 << 23)), vcast_vd_d(-0.5))));
    dqh = vmul_vd_vd_vd(dqh, vcast_vd_d(1 << 24));
    ql2 = vadd_vi_vi_vi(vadd_vi_vi_vi(ql2, ql2), vcast_vi_i(1));
    const vdouble dql = vcast_vd_vi(ql2);
    u = vmla_vd_vd_vd_vd(dqh, vcast_vd_d(-PI_A * 0.5), d);
    s = ddadd2_vd2_vd_vd(u, vmul_vd_vd_vd(dql, vcast_vd_d(-PI_A*0.5)));
    s = ddadd2_vd2_vd2_vd(s, vmul_vd_vd_vd(dqh, vcast_vd_d(-PI_B*0.5)));
    s = ddadd2_vd2_vd2_vd(s, vmul_vd_vd_vd(dql, vcast_vd_d(-PI_B*0.5)));
    s = ddadd2_vd2_vd2_vd(s, vmul_vd_vd_vd(dqh, vcast_vd_d(-PI_C*0.5)));
    s = ddadd2_vd2_vd2_vd(s, vmul_vd_vd_vd(dql, vcast_vd_d(-PI_C*0.5)));
    s = ddadd_vd2_vd2_vd(s, vmul_vd_vd_vd(vadd_vd_vd_vd(dqh, dql), vcast_vd_d(-PI_D*0.5)));
    ql = vsel_vi_vo_vi_vi(vcast_vo32_vo64(g), ql, ql2);
    x = vsel_vd2_vo_vd2_vd2(g, x, s);
    g = vlt_vo_vd_vd(vabs_vd_vd(d), vcast_vd_d(TRIGRANGEMAX));
    if (!LIKELY(vtestallones_i_vo64(g))) {
      // Huge-argument fallback via rempi, merged by mask g.
      ddi_t ddi = rempi(d);
      vint ql2 = vand_vi_vi_vi(ddigeti_vi_ddi(ddi), vcast_vi_i(3));
      ql2 = vadd_vi_vi_vi(vadd_vi_vi_vi(ql2, ql2), vsel_vi_vo_vi_vi(vcast_vo32_vo64(vgt_vo_vd_vd(vd2getx_vd_vd2(ddigetdd_vd2_ddi(ddi)), vcast_vd_d(0))), vcast_vi_i(8), vcast_vi_i(7)));
      ql2 = vsra_vi_vi_i(ql2, 1);
      vopmask o = veq_vo_vi_vi(vand_vi_vi_vi(ddigeti_vi_ddi(ddi), vcast_vi_i(1)), vcast_vi_i(0));
      vdouble y = vsel_vd_vo_vd_vd(vgt_vo_vd_vd(vd2getx_vd_vd2(ddigetdd_vd2_ddi(ddi)), vcast_vd_d(0)), vcast_vd_d(0), vcast_vd_d(-1));
      vdouble2 t = vcast_vd2_vd_vd(vmulsign_vd_vd_vd(vcast_vd_d(-3.141592653589793116 * 0.5), y), vmulsign_vd_vd_vd(vcast_vd_d(-1.2246467991473532072e-16 * 0.5), y));
      t = ddadd2_vd2_vd2_vd2(ddigetdd_vd2_ddi(ddi), t);
      ddi = ddisetdd_ddi_ddi_vd2(ddi, vsel_vd2_vo_vd2_vd2(vcast_vo64_vo32(o), t, ddigetdd_vd2_ddi(ddi)));
      s = ddnormalize_vd2_vd2(ddigetdd_vd2_ddi(ddi));
      ql = vsel_vi_vo_vi_vi(vcast_vo32_vo64(g), ql, ql2);
      x = vsel_vd2_vo_vd2_vd2(g, x, s);
      x = vd2setx_vd2_vd2_vd(x, vreinterpret_vd_vm(vor_vm_vo64_vm(vor_vo_vo_vo(visinf_vo_vd(d), visnan_vo_vd(d)), vreinterpret_vm_vd(vd2getx_vd_vd2(x)))));
    }
  }
  // Same polynomial core and sign flip as the non-deterministic branch.
  t = x;
  s = ddsqu_vd2_vd2(x);
  vdouble s2 = vmul_vd_vd_vd(vd2getx_vd_vd2(s), vd2getx_vd_vd2(s)), s4 = vmul_vd_vd_vd(s2, s2);
  u = POLY6(vd2getx_vd_vd2(s), s2, s4,
    2.72052416138529567917983e-15, -7.6429259411395447190023e-13,
    1.60589370117277896211623e-10, -2.5052106814843123359368e-08,
    2.75573192104428224777379e-06, -0.000198412698412046454654947);
  u = vmla_vd_vd_vd_vd(u, vd2getx_vd_vd2(s), vcast_vd_d(0.00833333333333318056201922));
  x = ddadd_vd2_vd_vd2(vcast_vd_d(1), ddmul_vd2_vd2_vd2(ddadd_vd2_vd_vd(vcast_vd_d(-0.166666666666666657414808), vmul_vd_vd_vd(u, vd2getx_vd_vd2(s))), s));
  u = ddmul_vd_vd2_vd2(t, x);
  u = vreinterpret_vd_vm(vxor_vm_vm_vm(vand_vm_vo64_vm(vcast_vo64_vo32(veq_vo_vi_vi(vand_vi_vi_vi(ql, vcast_vi_i(2)), vcast_vi_i(0))), vreinterpret_vm_vd(vcast_vd_d(-0.0))), vreinterpret_vm_vd(u)));
  return u;
#endif // #if !defined(DETERMINISTIC)
}

// Under ENABLE_GNUABI the sincos/modf kernels are emitted as internal
// helpers (sincosk, modfk, ...) that the GNU-ABI entry points below wrap;
// otherwise they are exported directly under their x-prefixed names.
#ifdef ENABLE_GNUABI
#define TYPE2_FUNCATR static INLINE CONST
#define TYPE6_FUNCATR static INLINE CONST
#define SQRTU05_FUNCATR static INLINE CONST
#define XSINCOS sincosk
#define XSINCOS_U1 sincosk_u1
#define XSINCOSPI_U05 sincospik_u05
#define XSINCOSPI_U35 sincospik_u35
#define XMODF modfk
#else
#define TYPE2_FUNCATR EXPORT
#define TYPE6_FUNCATR EXPORT CONST
#define SQRTU05_FUNCATR EXPORT CONST
#define XSINCOS xsincos
#define XSINCOS_U1 xsincos_u1
#define XSINCOSPI_U05 xsincospi_u05
#define XSINCOSPI_U35 xsincospi_u35
#define XMODF xmodf
#endif

// Simultaneous sin(d) and cos(d) (faster variant).  Returns a vdouble2 whose
// x component holds the sine and y the cosine; the quadrant index ql swaps
// and sign-corrects the two polynomial results.
TYPE2_FUNCATR VECTOR_CC vdouble2 XSINCOS(vdouble d) {
#if !defined(DETERMINISTIC)
  vopmask o;
  vdouble u, t, rx, ry, s;
  vdouble2 r;
  vint ql;
  if (LIKELY(vtestallones_i_vo64(vlt_vo_vd_vd(vabs_vd_vd(d), vcast_vd_d(TRIGRANGEMAX2))))) {
    // Fast path: reduce modulo pi/2.
    vdouble dql = vrint_vd_vd(vmul_vd_vd_vd(d, vcast_vd_d(2 * M_1_PI)));
    ql = vrint_vi_vd(dql);
    s = vmla_vd_vd_vd_vd(dql, vcast_vd_d(-PI_A2 * 0.5), d);
    s = vmla_vd_vd_vd_vd(dql, vcast_vd_d(-PI_B2 * 0.5), s);
  } else if (LIKELY(vtestallones_i_vo64(vlt_vo_vd_vd(vabs_vd_vd(d), vcast_vd_d(TRIGRANGEMAX))))) {
    // Medium path: split quotient, four-part pi/2 reduction.
    vdouble dqh = vtruncate_vd_vd(vmul_vd_vd_vd(d, vcast_vd_d(2*M_1_PI / (1 << 24))));
    dqh = vmul_vd_vd_vd(dqh, vcast_vd_d(1 << 24));
    vdouble dql = vrint_vd_vd(vsub_vd_vd_vd(vmul_vd_vd_vd(d, vcast_vd_d(2*M_1_PI)), dqh));
    ql = vrint_vi_vd(dql);
    s = vmla_vd_vd_vd_vd(dqh, vcast_vd_d(-PI_A * 0.5), d);
    s = vmla_vd_vd_vd_vd(dql, vcast_vd_d(-PI_A * 0.5), s);
    s = vmla_vd_vd_vd_vd(dqh, vcast_vd_d(-PI_B * 0.5), s);
    s = vmla_vd_vd_vd_vd(dql, vcast_vd_d(-PI_B * 0.5), s);
    s = vmla_vd_vd_vd_vd(dqh, vcast_vd_d(-PI_C * 0.5), s);
    s = vmla_vd_vd_vd_vd(dql, vcast_vd_d(-PI_C * 0.5), s);
    s = vmla_vd_vd_vd_vd(vadd_vd_vd_vd(dqh, dql), vcast_vd_d(-PI_D * 0.5), s);
  } else {
    // Huge-argument fallback via rempi; collapse the double-double remainder
    // to a plain double and force NaN on Inf/NaN inputs.
    ddi_t ddi = rempi(d);
    ql = ddigeti_vi_ddi(ddi);
    s = vadd_vd_vd_vd(vd2getx_vd_vd2(ddigetdd_vd2_ddi(ddi)), vd2gety_vd_vd2(ddigetdd_vd2_ddi(ddi)));
    s = vreinterpret_vd_vm(vor_vm_vo64_vm(vor_vo_vo_vo(visinf_vo_vd(d), visnan_vo_vd(d)), vreinterpret_vm_vd(s)));
  }
  // Sine polynomial (odd, in s = t^2) -> rx; keep -0.0 for d == -0.0.
  t = s;
  s = vmul_vd_vd_vd(s, s);
  u = vcast_vd_d(1.58938307283228937328511e-10);
  u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(-2.50506943502539773349318e-08));
  u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(2.75573131776846360512547e-06));
  u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(-0.000198412698278911770864914));
  u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(0.0083333333333191845961746));
  u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(-0.166666666666666130709393));
  rx = vmla_vd_vd_vd_vd(vmul_vd_vd_vd(u, s), t, t);
  rx = vsel_vd_vo_vd_vd(visnegzero_vo_vd(d), vcast_vd_d(-0.0), rx);
  // Cosine polynomial (even) -> ry.
  u = vcast_vd_d(-1.13615350239097429531523e-11);
  u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(2.08757471207040055479366e-09));
  u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(-2.75573144028847567498567e-07));
  u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(2.48015872890001867311915e-05));
  u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(-0.00138888888888714019282329));
  u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(0.0416666666666665519592062));
  u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(-0.5));
  ry = vmla_vd_vd_vd_vd(s, u, vcast_vd_d(1));
  // Place/sign the two results by quadrant: swap when ql is odd, negate sin
  // when (ql & 2) == 2, negate cos when ((ql+1) & 2) == 2.
  o = vcast_vo64_vo32(veq_vo_vi_vi(vand_vi_vi_vi(ql, vcast_vi_i(1)), vcast_vi_i(0)));
  r = vd2setxy_vd2_vd_vd(vsel_vd_vo_vd_vd(o, rx, ry), vsel_vd_vo_vd_vd(o, ry, rx));
  o = vcast_vo64_vo32(veq_vo_vi_vi(vand_vi_vi_vi(ql, vcast_vi_i(2)), vcast_vi_i(2)));
  r = vd2setx_vd2_vd2_vd(r, vreinterpret_vd_vm(vxor_vm_vm_vm(vand_vm_vo64_vm(o, vreinterpret_vm_vd(vcast_vd_d(-0.0))), vreinterpret_vm_vd(vd2getx_vd_vd2(r)))));
  o = vcast_vo64_vo32(veq_vo_vi_vi(vand_vi_vi_vi(vadd_vi_vi_vi(ql, vcast_vi_i(1)), vcast_vi_i(2)), vcast_vi_i(2)));
  r = vd2sety_vd2_vd2_vd(r, vreinterpret_vd_vm(vxor_vm_vm_vm(vand_vm_vo64_vm(o, vreinterpret_vm_vd(vcast_vd_d(-0.0))), vreinterpret_vm_vd(vd2gety_vd_vd2(r)))));
  return r;
#else // #if !defined(DETERMINISTIC)
  // Deterministic variant: fast-path reduction always computed, wider
  // reductions merged per-lane via selects.
  vopmask o;
  vdouble u, t, rx, ry, s = d;
  vdouble2 r;
  vint ql;
  vdouble dql = vrint_vd_vd(vmul_vd_vd_vd(s, vcast_vd_d(2 * M_1_PI)));
  ql = vrint_vi_vd(dql);
  s = vmla_vd_vd_vd_vd(dql, vcast_vd_d(-PI_A2 * 0.5), s);
  s = vmla_vd_vd_vd_vd(dql, vcast_vd_d(-PI_B2 * 0.5), s);
  vopmask g = vlt_vo_vd_vd(vabs_vd_vd(d), vcast_vd_d(TRIGRANGEMAX2));
  if (!LIKELY(vtestallones_i_vo64(g))) {
    vdouble dqh = vtruncate_vd_vd(vmul_vd_vd_vd(d, vcast_vd_d(2*M_1_PI / (1 << 24))));
    dqh = vmul_vd_vd_vd(dqh, vcast_vd_d(1 << 24));
    vdouble dql = vrint_vd_vd(vsub_vd_vd_vd(vmul_vd_vd_vd(d, vcast_vd_d(2*M_1_PI)), dqh));
    u = vmla_vd_vd_vd_vd(dqh, vcast_vd_d(-PI_A * 0.5), d);
    u = vmla_vd_vd_vd_vd(dql, vcast_vd_d(-PI_A * 0.5), u);
    u = vmla_vd_vd_vd_vd(dqh, vcast_vd_d(-PI_B * 0.5), u);
    u = vmla_vd_vd_vd_vd(dql, vcast_vd_d(-PI_B * 0.5), u);
    u = vmla_vd_vd_vd_vd(dqh, vcast_vd_d(-PI_C * 0.5), u);
    u = vmla_vd_vd_vd_vd(dql, vcast_vd_d(-PI_C * 0.5), u);
    u = vmla_vd_vd_vd_vd(vadd_vd_vd_vd(dqh, dql), vcast_vd_d(-PI_D * 0.5), u);
    ql = vsel_vi_vo_vi_vi(vcast_vo32_vo64(g), ql, vrint_vi_vd(dql));
    s = vsel_vd_vo_vd_vd(g, s, u);
    g = vlt_vo_vd_vd(vabs_vd_vd(d), vcast_vd_d(TRIGRANGEMAX));
    if (!LIKELY(vtestallones_i_vo64(g))) {
      ddi_t ddi = rempi(d);
      u = vadd_vd_vd_vd(vd2getx_vd_vd2(ddigetdd_vd2_ddi(ddi)), vd2gety_vd_vd2(ddigetdd_vd2_ddi(ddi)));
      u = vreinterpret_vd_vm(vor_vm_vo64_vm(vor_vo_vo_vo(visinf_vo_vd(d), visnan_vo_vd(d)), vreinterpret_vm_vd(u)));
      ql = vsel_vi_vo_vi_vi(vcast_vo32_vo64(g), ql, ddigeti_vi_ddi(ddi));
      s = vsel_vd_vo_vd_vd(g, s, u);
    }
  }
  // Same polynomial cores and quadrant placement as above.
  t = s;
  s =
const vdouble dql = vrint_vd_vd(vmul_vd_vd_vd(d, vcast_vd_d(2 * M_1_PI))); ql = vrint_vi_vd(dql); u = vmla_vd_vd_vd_vd(dql, vcast_vd_d(-PI_A2*0.5), d); s = ddadd_vd2_vd_vd (u, vmul_vd_vd_vd(dql, vcast_vd_d(-PI_B2*0.5))); } else if (LIKELY(vtestallones_i_vo64(vlt_vo_vd_vd(vabs_vd_vd(d), vcast_vd_d(TRIGRANGEMAX))))) { vdouble dqh = vtruncate_vd_vd(vmul_vd_vd_vd(d, vcast_vd_d(2*M_1_PI / (1 << 24)))); dqh = vmul_vd_vd_vd(dqh, vcast_vd_d(1 << 24)); const vdouble dql = vrint_vd_vd(vsub_vd_vd_vd(vmul_vd_vd_vd(d, vcast_vd_d(2*M_1_PI)), dqh)); ql = vrint_vi_vd(dql); u = vmla_vd_vd_vd_vd(dqh, vcast_vd_d(-PI_A * 0.5), d); s = ddadd_vd2_vd_vd(u, vmul_vd_vd_vd(dql, vcast_vd_d(-PI_A*0.5))); s = ddadd2_vd2_vd2_vd(s, vmul_vd_vd_vd(dqh, vcast_vd_d(-PI_B*0.5))); s = ddadd2_vd2_vd2_vd(s, vmul_vd_vd_vd(dql, vcast_vd_d(-PI_B*0.5))); s = ddadd2_vd2_vd2_vd(s, vmul_vd_vd_vd(dqh, vcast_vd_d(-PI_C*0.5))); s = ddadd2_vd2_vd2_vd(s, vmul_vd_vd_vd(dql, vcast_vd_d(-PI_C*0.5))); s = ddadd_vd2_vd2_vd(s, vmul_vd_vd_vd(vadd_vd_vd_vd(dqh, dql), vcast_vd_d(-PI_D*0.5))); } else { ddi_t ddi = rempi(d); ql = ddigeti_vi_ddi(ddi); s = ddigetdd_vd2_ddi(ddi); o = vor_vo_vo_vo(visinf_vo_vd(d), visnan_vo_vd(d)); s = vd2setxy_vd2_vd_vd(vreinterpret_vd_vm(vor_vm_vo64_vm(o, vreinterpret_vm_vd(vd2getx_vd_vd2(s)))), vreinterpret_vd_vm(vor_vm_vo64_vm(o, vreinterpret_vm_vd(vd2gety_vd_vd2(s))))); } t = s; s = vd2setx_vd2_vd2_vd(s, ddsqu_vd_vd2(s)); u = vcast_vd_d(1.58938307283228937328511e-10); u = vmla_vd_vd_vd_vd(u, vd2getx_vd_vd2(s), vcast_vd_d(-2.50506943502539773349318e-08)); u = vmla_vd_vd_vd_vd(u, vd2getx_vd_vd2(s), vcast_vd_d(2.75573131776846360512547e-06)); u = vmla_vd_vd_vd_vd(u, vd2getx_vd_vd2(s), vcast_vd_d(-0.000198412698278911770864914)); u = vmla_vd_vd_vd_vd(u, vd2getx_vd_vd2(s), vcast_vd_d(0.0083333333333191845961746)); u = vmla_vd_vd_vd_vd(u, vd2getx_vd_vd2(s), vcast_vd_d(-0.166666666666666130709393)); u = vmul_vd_vd_vd(u, vmul_vd_vd_vd(vd2getx_vd_vd2(s), vd2getx_vd_vd2(t))); x = ddadd_vd2_vd2_vd(t, 
u); rx = vadd_vd_vd_vd(vd2getx_vd_vd2(x), vd2gety_vd_vd2(x)); rx = vsel_vd_vo_vd_vd(visnegzero_vo_vd(d), vcast_vd_d(-0.0), rx); u = vcast_vd_d(-1.13615350239097429531523e-11); u = vmla_vd_vd_vd_vd(u, vd2getx_vd_vd2(s), vcast_vd_d(2.08757471207040055479366e-09)); u = vmla_vd_vd_vd_vd(u, vd2getx_vd_vd2(s), vcast_vd_d(-2.75573144028847567498567e-07)); u = vmla_vd_vd_vd_vd(u, vd2getx_vd_vd2(s), vcast_vd_d(2.48015872890001867311915e-05)); u = vmla_vd_vd_vd_vd(u, vd2getx_vd_vd2(s), vcast_vd_d(-0.00138888888888714019282329)); u = vmla_vd_vd_vd_vd(u, vd2getx_vd_vd2(s), vcast_vd_d(0.0416666666666665519592062)); u = vmla_vd_vd_vd_vd(u, vd2getx_vd_vd2(s), vcast_vd_d(-0.5)); x = ddadd_vd2_vd_vd2(vcast_vd_d(1), ddmul_vd2_vd_vd(vd2getx_vd_vd2(s), u)); ry = vadd_vd_vd_vd(vd2getx_vd_vd2(x), vd2gety_vd_vd2(x)); o = vcast_vo64_vo32(veq_vo_vi_vi(vand_vi_vi_vi(ql, vcast_vi_i(1)), vcast_vi_i(0))); r = vd2setxy_vd2_vd_vd(vsel_vd_vo_vd_vd(o, rx, ry), vsel_vd_vo_vd_vd(o, ry, rx)); o = vcast_vo64_vo32(veq_vo_vi_vi(vand_vi_vi_vi(ql, vcast_vi_i(2)), vcast_vi_i(2))); r = vd2setx_vd2_vd2_vd(r, vreinterpret_vd_vm(vxor_vm_vm_vm(vand_vm_vo64_vm(o, vreinterpret_vm_vd(vcast_vd_d(-0.0))), vreinterpret_vm_vd(vd2getx_vd_vd2(r))))); o = vcast_vo64_vo32(veq_vo_vi_vi(vand_vi_vi_vi(vadd_vi_vi_vi(ql, vcast_vi_i(1)), vcast_vi_i(2)), vcast_vi_i(2))); r = vd2sety_vd2_vd2_vd(r, vreinterpret_vd_vm(vxor_vm_vm_vm(vand_vm_vo64_vm(o, vreinterpret_vm_vd(vcast_vd_d(-0.0))), vreinterpret_vm_vd(vd2gety_vd_vd2(r))))); return r; #else // #if !defined(DETERMINISTIC) vopmask o; vdouble u, rx, ry; vdouble2 r, s, t, x; vint ql; const vdouble dql = vrint_vd_vd(vmul_vd_vd_vd(d, vcast_vd_d(2 * M_1_PI))); ql = vrint_vi_vd(dql); u = vmla_vd_vd_vd_vd(dql, vcast_vd_d(-PI_A2*0.5), d); s = ddadd_vd2_vd_vd (u, vmul_vd_vd_vd(dql, vcast_vd_d(-PI_B2*0.5))); vopmask g = vlt_vo_vd_vd(vabs_vd_vd(d), vcast_vd_d(TRIGRANGEMAX2)); if (!LIKELY(vtestallones_i_vo64(g))) { vdouble dqh = vtruncate_vd_vd(vmul_vd_vd_vd(d, vcast_vd_d(2*M_1_PI / (1 << 
24)))); dqh = vmul_vd_vd_vd(dqh, vcast_vd_d(1 << 24)); const vdouble dql = vrint_vd_vd(vsub_vd_vd_vd(vmul_vd_vd_vd(d, vcast_vd_d(2*M_1_PI)), dqh)); u = vmla_vd_vd_vd_vd(dqh, vcast_vd_d(-PI_A * 0.5), d); x = ddadd_vd2_vd_vd(u, vmul_vd_vd_vd(dql, vcast_vd_d(-PI_A*0.5))); x = ddadd2_vd2_vd2_vd(x, vmul_vd_vd_vd(dqh, vcast_vd_d(-PI_B*0.5))); x = ddadd2_vd2_vd2_vd(x, vmul_vd_vd_vd(dql, vcast_vd_d(-PI_B*0.5))); x = ddadd2_vd2_vd2_vd(x, vmul_vd_vd_vd(dqh, vcast_vd_d(-PI_C*0.5))); x = ddadd2_vd2_vd2_vd(x, vmul_vd_vd_vd(dql, vcast_vd_d(-PI_C*0.5))); x = ddadd_vd2_vd2_vd(x, vmul_vd_vd_vd(vadd_vd_vd_vd(dqh, dql), vcast_vd_d(-PI_D*0.5))); ql = vsel_vi_vo_vi_vi(vcast_vo32_vo64(g), ql, vrint_vi_vd(dql)); s = vsel_vd2_vo_vd2_vd2(g, s, x); g = vlt_vo_vd_vd(vabs_vd_vd(d), vcast_vd_d(TRIGRANGEMAX)); if (!LIKELY(vtestallones_i_vo64(g))) { ddi_t ddi = rempi(d); x = ddigetdd_vd2_ddi(ddi); o = vor_vo_vo_vo(visinf_vo_vd(d), visnan_vo_vd(d)); x = vd2setx_vd2_vd2_vd(x, vreinterpret_vd_vm(vor_vm_vo64_vm(o, vreinterpret_vm_vd(vd2getx_vd_vd2(x))))); x = vd2sety_vd2_vd2_vd(x, vreinterpret_vd_vm(vor_vm_vo64_vm(o, vreinterpret_vm_vd(vd2gety_vd_vd2(x))))); ql = vsel_vi_vo_vi_vi(vcast_vo32_vo64(g), ql, ddigeti_vi_ddi(ddi)); s = vsel_vd2_vo_vd2_vd2(g, s, x); } } t = s; s = vd2setx_vd2_vd2_vd(s, ddsqu_vd_vd2(s)); u = vcast_vd_d(1.58938307283228937328511e-10); u = vmla_vd_vd_vd_vd(u, vd2getx_vd_vd2(s), vcast_vd_d(-2.50506943502539773349318e-08)); u = vmla_vd_vd_vd_vd(u, vd2getx_vd_vd2(s), vcast_vd_d(2.75573131776846360512547e-06)); u = vmla_vd_vd_vd_vd(u, vd2getx_vd_vd2(s), vcast_vd_d(-0.000198412698278911770864914)); u = vmla_vd_vd_vd_vd(u, vd2getx_vd_vd2(s), vcast_vd_d(0.0083333333333191845961746)); u = vmla_vd_vd_vd_vd(u, vd2getx_vd_vd2(s), vcast_vd_d(-0.166666666666666130709393)); u = vmul_vd_vd_vd(u, vmul_vd_vd_vd(vd2getx_vd_vd2(s), vd2getx_vd_vd2(t))); x = ddadd_vd2_vd2_vd(t, u); rx = vadd_vd_vd_vd(vd2getx_vd_vd2(x), vd2gety_vd_vd2(x)); rx = vsel_vd_vo_vd_vd(visnegzero_vo_vd(d), 
vcast_vd_d(-0.0), rx);

  // (Continuation of the preceding sincos kernel.)
  // Cosine polynomial, Horner form with fused multiply-add style steps.
  u = vcast_vd_d(-1.13615350239097429531523e-11);
  u = vmla_vd_vd_vd_vd(u, vd2getx_vd_vd2(s), vcast_vd_d(2.08757471207040055479366e-09));
  u = vmla_vd_vd_vd_vd(u, vd2getx_vd_vd2(s), vcast_vd_d(-2.75573144028847567498567e-07));
  u = vmla_vd_vd_vd_vd(u, vd2getx_vd_vd2(s), vcast_vd_d(2.48015872890001867311915e-05));
  u = vmla_vd_vd_vd_vd(u, vd2getx_vd_vd2(s), vcast_vd_d(-0.00138888888888714019282329));
  u = vmla_vd_vd_vd_vd(u, vd2getx_vd_vd2(s), vcast_vd_d(0.0416666666666665519592062));
  u = vmla_vd_vd_vd_vd(u, vd2getx_vd_vd2(s), vcast_vd_d(-0.5));

  x = ddadd_vd2_vd_vd2(vcast_vd_d(1), ddmul_vd2_vd_vd(vd2getx_vd_vd2(s), u));
  ry = vadd_vd_vd_vd(vd2getx_vd_vd2(x), vd2gety_vd_vd2(x));

  // Bit 0 of ql decides which of (rx, ry) is the sine and which the cosine.
  o = vcast_vo64_vo32(veq_vo_vi_vi(vand_vi_vi_vi(ql, vcast_vi_i(1)), vcast_vi_i(0)));
  r = vd2setxy_vd2_vd_vd(vsel_vd_vo_vd_vd(o, rx, ry), vsel_vd_vo_vd_vd(o, ry, rx));

  // Negate the sine result when bit 1 of ql is set (XOR of the sign bit
  // in the mask domain).
  o = vcast_vo64_vo32(veq_vo_vi_vi(vand_vi_vi_vi(ql, vcast_vi_i(2)), vcast_vi_i(2)));
  r = vd2setx_vd2_vd2_vd(r, vreinterpret_vd_vm(vxor_vm_vm_vm(vand_vm_vo64_vm(o, vreinterpret_vm_vd(vcast_vd_d(-0.0))), vreinterpret_vm_vd(vd2getx_vd_vd2(r)))));

  // Negate the cosine result when bit 1 of (ql + 1) is set.
  o = vcast_vo64_vo32(veq_vo_vi_vi(vand_vi_vi_vi(vadd_vi_vi_vi(ql, vcast_vi_i(1)), vcast_vi_i(2)), vcast_vi_i(2)));
  r = vd2sety_vd2_vd2_vd(r, vreinterpret_vd_vm(vxor_vm_vm_vm(vand_vm_vo64_vm(o, vreinterpret_vm_vd(vcast_vd_d(-0.0))), vreinterpret_vm_vd(vd2gety_vd_vd2(r)))));

  return r;
#endif // #if !defined(DETERMINISTIC)
}

#if !defined(DETERMINISTIC)
// Simultaneous sin(pi*d) / cos(pi*d); x lane of the result is the sine,
// y lane the cosine.  The _U05 suffix is SLEEF's higher-accuracy tier
// (about 0.5 ULP per the SLEEF docs -- confirm against upstream).
TYPE2_FUNCATR VECTOR_CC vdouble2 XSINCOSPI_U05(vdouble d) {
  vopmask o;
  vdouble u, s, t, rx, ry;
  vdouble2 r, x, s2;

  // Reduce to quarter turns: q = 4*d rounded to an even integer,
  // s = 4*d - q is the residual in [-1, 1].
  u = vmul_vd_vd_vd(d, vcast_vd_d(4.0));
  vint q = vtruncate_vi_vd(u);
  q = vand_vi_vi_vi(vadd_vi_vi_vi(q, vxor_vi_vi_vi(vsrl_vi_vi_i(q, 31), vcast_vi_i(1))), vcast_vi_i(~1));
  s = vsub_vd_vd_vd(u, vcast_vd_vi(q));

  t = s;
  s = vmul_vd_vd_vd(s, s);
  s2 = ddmul_vd2_vd_vd(t, t); // double-double t*t for the high-accuracy tail

  // Sine polynomial: low-order terms in plain double precision.
  u = vcast_vd_d(-2.02461120785182399295868e-14);
  u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(6.94821830580179461327784e-12));
  u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(-1.75724749952853179952664e-09));
  u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(3.13361688966868392878422e-07));
  u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(-3.6576204182161551920361e-05));
  u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(0.00249039457019271850274356));
  // Last coefficients carried in double-double to preserve accuracy.
  x = ddadd2_vd2_vd_vd2(vmul_vd_vd_vd(u, s), vcast_vd2_d_d(-0.0807455121882807852484731, 3.61852475067037104849987e-18));
  x = ddadd2_vd2_vd2_vd2(ddmul_vd2_vd2_vd2(s2, x), vcast_vd2_d_d(0.785398163397448278999491, 3.06287113727155002607105e-17));

  x = ddmul_vd2_vd2_vd(x, t);
  rx = vadd_vd_vd_vd(vd2getx_vd_vd2(x), vd2gety_vd_vd2(x));

  rx = vsel_vd_vo_vd_vd(visnegzero_vo_vd(d), vcast_vd_d(-0.0), rx); // sinpi(-0) = -0

  // Cosine polynomial, same scheme.
  u = vcast_vd_d(9.94480387626843774090208e-16);
  u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(-3.89796226062932799164047e-13));
  u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(1.15011582539996035266901e-10));
  u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(-2.4611369501044697495359e-08));
  u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(3.59086044859052754005062e-06));
  u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(-0.000325991886927389905997954));
  x = ddadd2_vd2_vd_vd2(vmul_vd_vd_vd(u, s), vcast_vd2_d_d(0.0158543442438155018914259, -1.04693272280631521908845e-18));
  x = ddadd2_vd2_vd2_vd2(ddmul_vd2_vd2_vd2(s2, x), vcast_vd2_d_d(-0.308425137534042437259529, -1.95698492133633550338345e-17));
  x = ddadd2_vd2_vd2_vd(ddmul_vd2_vd2_vd2(x, s2), vcast_vd_d(1));

  ry = vadd_vd_vd_vd(vd2getx_vd_vd2(x), vd2gety_vd_vd2(x));

  // Quadrant fix-up: swap / negate sin and cos according to q.
  o = vcast_vo64_vo32(veq_vo_vi_vi(vand_vi_vi_vi(q, vcast_vi_i(2)), vcast_vi_i(0)));
  r = vd2setxy_vd2_vd_vd(vsel_vd_vo_vd_vd(o, rx, ry), vsel_vd_vo_vd_vd(o, ry, rx));

  o = vcast_vo64_vo32(veq_vo_vi_vi(vand_vi_vi_vi(q, vcast_vi_i(4)), vcast_vi_i(4)));
  r = vd2setx_vd2_vd2_vd(r, vreinterpret_vd_vm(vxor_vm_vm_vm(vand_vm_vo64_vm(o, vreinterpret_vm_vd(vcast_vd_d(-0.0))), vreinterpret_vm_vd(vd2getx_vd_vd2(r)))));

  o = vcast_vo64_vo32(veq_vo_vi_vi(vand_vi_vi_vi(vadd_vi_vi_vi(q, vcast_vi_i(2)), vcast_vi_i(4)), vcast_vi_i(4)));
  r = vd2sety_vd2_vd2_vd(r, vreinterpret_vd_vm(vxor_vm_vm_vm(vand_vm_vo64_vm(o, vreinterpret_vm_vd(vcast_vd_d(-0.0))), vreinterpret_vm_vd(vd2gety_vd_vd2(r)))));

  // Out of range (|d| > TRIGRANGEMAX3/4): force sin -> 0, cos -> 1.
  o = vgt_vo_vd_vd(vabs_vd_vd(d), vcast_vd_d(TRIGRANGEMAX3/4));
  r = vd2setx_vd2_vd2_vd(r, vreinterpret_vd_vm(vandnot_vm_vo64_vm(o, vreinterpret_vm_vd(vd2getx_vd_vd2(r)))));
  r = vd2sety_vd2_vd2_vd(r, vsel_vd_vo_vd_vd(o, vcast_vd_d(1), vd2gety_vd_vd2(r)));

  // Infinite input: OR-in an all-ones mask, i.e. produce NaN in both lanes.
  o = visinf_vo_vd(d);
  r = vd2setx_vd2_vd2_vd(r, vreinterpret_vd_vm(vor_vm_vo64_vm(o, vreinterpret_vm_vd(vd2getx_vd_vd2(r)))));
  r = vd2sety_vd2_vd2_vd(r, vreinterpret_vd_vm(vor_vm_vo64_vm(o, vreinterpret_vm_vd(vd2gety_vd_vd2(r)))));

  return r;
}

// Faster, lower-accuracy variant of the above (_U35 tier, plain double
// polynomials instead of double-double tails).
TYPE2_FUNCATR VECTOR_CC vdouble2 XSINCOSPI_U35(vdouble d) {
  vopmask o;
  vdouble u, s, t, rx, ry;
  vdouble2 r;

  // Same quarter-turn reduction as the _U05 variant.
  u = vmul_vd_vd_vd(d, vcast_vd_d(4.0));
  vint q = vtruncate_vi_vd(u);
  q = vand_vi_vi_vi(vadd_vi_vi_vi(q, vxor_vi_vi_vi(vsrl_vi_vi_i(q, 31), vcast_vi_i(1))), vcast_vi_i(~1));
  s = vsub_vd_vd_vd(u, vcast_vd_vi(q));

  t = s;
  s = vmul_vd_vd_vd(s, s);

  // Sine polynomial (plain double throughout).
  u = vcast_vd_d(+0.6880638894766060136e-11);
  u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(-0.1757159564542310199e-8));
  u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(+0.3133616327257867311e-6));
  u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(-0.3657620416388486452e-4));
  u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(+0.2490394570189932103e-2));
  u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(-0.8074551218828056320e-1));
  u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(+0.7853981633974482790e+0));

  rx = vmul_vd_vd_vd(u, t);

  // Cosine polynomial.
  u = vcast_vd_d(-0.3860141213683794352e-12);
  u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(+0.1150057888029681415e-9));
  u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(-0.2461136493006663553e-7));
  u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(+0.3590860446623516713e-5));
  u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(-0.3259918869269435942e-3));
  u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(+0.1585434424381541169e-1));
  u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(-0.3084251375340424373e+0));
  u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(1));

  ry = u;

  // Quadrant fix-up (continues in the following statements).
  o =
vcast_vo64_vo32(veq_vo_vi_vi(vand_vi_vi_vi(q, vcast_vi_i(2)), vcast_vi_i(0)));
  r = vd2setxy_vd2_vd_vd(vsel_vd_vo_vd_vd(o, rx, ry), vsel_vd_vo_vd_vd(o, ry, rx));

  // Sign fix-up per quadrant, same XOR-the-sign-bit trick as the _U05 variant.
  o = vcast_vo64_vo32(veq_vo_vi_vi(vand_vi_vi_vi(q, vcast_vi_i(4)), vcast_vi_i(4)));
  r = vd2setx_vd2_vd2_vd(r, vreinterpret_vd_vm(vxor_vm_vm_vm(vand_vm_vo64_vm(o, vreinterpret_vm_vd(vcast_vd_d(-0.0))), vreinterpret_vm_vd(vd2getx_vd_vd2(r)))));

  o = vcast_vo64_vo32(veq_vo_vi_vi(vand_vi_vi_vi(vadd_vi_vi_vi(q, vcast_vi_i(2)), vcast_vi_i(4)), vcast_vi_i(4)));
  r = vd2sety_vd2_vd2_vd(r, vreinterpret_vd_vm(vxor_vm_vm_vm(vand_vm_vo64_vm(o, vreinterpret_vm_vd(vcast_vd_d(-0.0))), vreinterpret_vm_vd(vd2gety_vd_vd2(r)))));

  // Out of range: zero both lanes (note: unlike _U05, cos is zeroed, not 1).
  o = vgt_vo_vd_vd(vabs_vd_vd(d), vcast_vd_d(TRIGRANGEMAX3/4));
  r = vd2setx_vd2_vd2_vd(r, vreinterpret_vd_vm(vandnot_vm_vo64_vm(o, vreinterpret_vm_vd(vd2getx_vd_vd2(r)))));
  r = vd2sety_vd2_vd2_vd(r, vreinterpret_vd_vm(vandnot_vm_vo64_vm(o, vreinterpret_vm_vd(vd2gety_vd_vd2(r)))));

  // Infinite input -> NaN in both lanes.
  o = visinf_vo_vd(d);
  r = vd2setx_vd2_vd2_vd(r, vreinterpret_vd_vm(vor_vm_vo64_vm(o, vreinterpret_vm_vd(vd2getx_vd_vd2(r)))));
  r = vd2sety_vd2_vd2_vd(r, vreinterpret_vd_vm(vor_vm_vo64_vm(o, vreinterpret_vm_vd(vd2gety_vd_vd2(r)))));

  return r;
}

// modf-style split: returns {fractional part, integral part}, each carrying
// the sign of x.  The two-stage truncation handles magnitudes beyond the
// 32-bit integer conversion range; |x| > 2^52 has no fractional part.
TYPE6_FUNCATR VECTOR_CC vdouble2 XMODF(vdouble x) {
  // Remove the multiple of 2^31 first so the 32-bit truncation below is exact.
  vdouble fr = vsub_vd_vd_vd(x, vmul_vd_vd_vd(vcast_vd_d(INT64_C(1) << 31), vcast_vd_vi(vtruncate_vi_vd(vmul_vd_vd_vd(x, vcast_vd_d(1.0 / (INT64_C(1) << 31)))))));
  fr = vsub_vd_vd_vd(fr, vcast_vd_vi(vtruncate_vi_vd(fr)));
  // Beyond 2^52 every double is an integer: force the fraction to zero.
  fr = vsel_vd_vo_vd_vd(vgt_vo_vd_vd(vabs_vd_vd(x), vcast_vd_d(INT64_C(1) << 52)), vcast_vd_d(0), fr);

  vdouble2 ret;
  ret = vd2setxy_vd2_vd_vd(vcopysign_vd_vd_vd(fr, x), vcopysign_vd_vd_vd(vsub_vd_vd_vd(x, fr), x));

  return ret;
}

#ifdef ENABLE_GNUABI
// GNU-ABI entry points: store the paired results through the given pointers.
EXPORT VECTOR_CC void xsincos(vdouble a, double *ps, double *pc) {
  vdouble2 r = sincosk(a);
  vstoreu_v_p_vd(ps, vd2getx_vd_vd2(r));
  vstoreu_v_p_vd(pc, vd2gety_vd_vd2(r));
}

EXPORT VECTOR_CC void xsincos_u1(vdouble a, double *ps, double *pc) {
  vdouble2 r = sincosk_u1(a);
  vstoreu_v_p_vd(ps, vd2getx_vd_vd2(r));
  vstoreu_v_p_vd(pc, vd2gety_vd_vd2(r));
}

EXPORT VECTOR_CC void xsincospi_u05(vdouble a, double *ps, double *pc) {
  vdouble2 r = sincospik_u05(a);
  vstoreu_v_p_vd(ps, vd2getx_vd_vd2(r));
  vstoreu_v_p_vd(pc, vd2gety_vd_vd2(r));
}

EXPORT VECTOR_CC void xsincospi_u35(vdouble a, double *ps, double *pc) {
  vdouble2 r = sincospik_u35(a);
  vstoreu_v_p_vd(ps, vd2getx_vd_vd2(r));
  vstoreu_v_p_vd(pc, vd2gety_vd_vd2(r));
}

// modf: integral part goes to *iptr, fractional part is returned.
EXPORT CONST VECTOR_CC vdouble xmodf(vdouble a, double *iptr) {
  vdouble2 r = modfk(a);
  vstoreu_v_p_vd(iptr, vd2gety_vd_vd2(r));
  return vd2getx_vd_vd2(r);
}
#endif // #ifdef ENABLE_GNUABI
#endif // #if !defined(DETERMINISTIC)

// Double-double sin(pi*d) kernel shared by xsinpi_u05.  The mask o selects,
// per lane, between the cosine-series and sine-series coefficients so one
// polynomial evaluation serves both halves of the quadrant reduction.
static INLINE CONST VECTOR_CC vdouble2 sinpik(vdouble d) {
  vopmask o;
  vdouble u, s, t;
  vdouble2 x, s2;

  // Quarter-turn reduction: q = 4*d rounded to an even integer.
  u = vmul_vd_vd_vd(d, vcast_vd_d(4.0));
  vint q = vtruncate_vi_vd(u);
  q = vand_vi_vi_vi(vadd_vi_vi_vi(q, vxor_vi_vi_vi(vsrl_vi_vi_i(q, 31), vcast_vi_i(1))), vcast_vi_i(~1));
  o = vcast_vo64_vo32(veq_vo_vi_vi(vand_vi_vi_vi(q, vcast_vi_i(2)), vcast_vi_i(2)));

  s = vsub_vd_vd_vd(u, vcast_vd_vi(q));
  t = s;
  s = vmul_vd_vd_vd(s, s);
  s2 = ddmul_vd2_vd_vd(t, t);

  // Per-lane coefficient selection: first value when o (cosine branch),
  // second otherwise (sine branch).
  u = vsel_vd_vo_d_d(o, 9.94480387626843774090208e-16, -2.02461120785182399295868e-14);
  u = vmla_vd_vd_vd_vd(u, s, vsel_vd_vo_d_d(o, -3.89796226062932799164047e-13, 6.948218305801794613277840e-12));
  u = vmla_vd_vd_vd_vd(u, s, vsel_vd_vo_d_d(o, 1.150115825399960352669010e-10, -1.75724749952853179952664e-09));
  u = vmla_vd_vd_vd_vd(u, s, vsel_vd_vo_d_d(o, -2.46113695010446974953590e-08, 3.133616889668683928784220e-07));
  u = vmla_vd_vd_vd_vd(u, s, vsel_vd_vo_d_d(o, 3.590860448590527540050620e-06, -3.65762041821615519203610e-05));
  u = vmla_vd_vd_vd_vd(u, s, vsel_vd_vo_d_d(o, -0.000325991886927389905997954, 0.0024903945701927185027435600));
  x = ddadd2_vd2_vd_vd2(vmul_vd_vd_vd(u, s), vsel_vd2_vo_d_d_d_d(o, 0.0158543442438155018914259, -1.04693272280631521908845e-18, -0.0807455121882807852484731, 3.61852475067037104849987e-18));
  x = ddadd2_vd2_vd2_vd2(ddmul_vd2_vd2_vd2(s2, x), vsel_vd2_vo_d_d_d_d(o, -0.308425137534042437259529, -1.95698492133633550338345e-17, 0.785398163397448278999491, 3.06287113727155002607105e-17));

  // Cosine branch multiplies by t*t and adds 1; sine branch multiplies by t.
  x = ddmul_vd2_vd2_vd2(x, vsel_vd2_vo_vd2_vd2(o, s2, vcast_vd2_vd_vd(t, vcast_vd_d(0))));
  x = vsel_vd2_vo_vd2_vd2(o, ddadd2_vd2_vd2_vd(x, vcast_vd_d(1)), x);

  // Negate when bit 2 of q is set.
  o = vcast_vo64_vo32(veq_vo_vi_vi(vand_vi_vi_vi(q, vcast_vi_i(4)), vcast_vi_i(4)));
  x = vd2setx_vd2_vd2_vd(x, vreinterpret_vd_vm(vxor_vm_vm_vm(vand_vm_vo64_vm(o, vreinterpret_vm_vd(vcast_vd_d(-0.0))), vreinterpret_vm_vd(vd2getx_vd_vd2(x)))));
  x = vd2sety_vd2_vd2_vd(x, vreinterpret_vd_vm(vxor_vm_vm_vm(vand_vm_vo64_vm(o, vreinterpret_vm_vd(vcast_vd_d(-0.0))), vreinterpret_vm_vd(vd2gety_vd_vd2(x)))));

  return x;
}

// sin(pi*d), _U05 accuracy tier.  Handles -0, the out-of-range clamp and
// infinity explicitly after the double-double kernel.
EXPORT CONST VECTOR_CC vdouble xsinpi_u05(vdouble d) {
  vdouble2 x = sinpik(d);
  vdouble r = vadd_vd_vd_vd(vd2getx_vd_vd2(x), vd2gety_vd_vd2(x));

  r = vsel_vd_vo_vd_vd(visnegzero_vo_vd(d), vcast_vd_d(-0.0), r);
  r = vreinterpret_vd_vm(vandnot_vm_vo64_vm(vgt_vo_vd_vd(vabs_vd_vd(d), vcast_vd_d(TRIGRANGEMAX3/4)), vreinterpret_vm_vd(r)));
  r = vreinterpret_vd_vm(vor_vm_vo64_vm(visinf_vo_vd(d), vreinterpret_vm_vd(r)));

  return r;
}

// Double-double cos(pi*d) kernel; identical structure to sinpik but with the
// branch-selection mask inverted (compares against 0 instead of 2).
static INLINE CONST VECTOR_CC vdouble2 cospik(vdouble d) {
  vopmask o;
  vdouble u, s, t;
  vdouble2 x, s2;

  u = vmul_vd_vd_vd(d, vcast_vd_d(4.0));
  vint q = vtruncate_vi_vd(u);
  q = vand_vi_vi_vi(vadd_vi_vi_vi(q, vxor_vi_vi_vi(vsrl_vi_vi_i(q, 31), vcast_vi_i(1))), vcast_vi_i(~1));
  o = vcast_vo64_vo32(veq_vo_vi_vi(vand_vi_vi_vi(q, vcast_vi_i(2)), vcast_vi_i(0)));

  s = vsub_vd_vd_vd(u, vcast_vd_vi(q));
  t = s;
  s = vmul_vd_vd_vd(s, s);
  s2 = ddmul_vd2_vd_vd(t, t);

  // Same coefficient tables as sinpik, selected per lane by o.
  u = vsel_vd_vo_d_d(o, 9.94480387626843774090208e-16, -2.02461120785182399295868e-14);
  u = vmla_vd_vd_vd_vd(u, s, vsel_vd_vo_d_d(o, -3.89796226062932799164047e-13, 6.948218305801794613277840e-12));
  u = vmla_vd_vd_vd_vd(u, s, vsel_vd_vo_d_d(o, 1.150115825399960352669010e-10, -1.75724749952853179952664e-09));
  u = vmla_vd_vd_vd_vd(u, s, vsel_vd_vo_d_d(o,
-2.46113695010446974953590e-08, 3.133616889668683928784220e-07));
  u = vmla_vd_vd_vd_vd(u, s, vsel_vd_vo_d_d(o, 3.590860448590527540050620e-06, -3.65762041821615519203610e-05));
  u = vmla_vd_vd_vd_vd(u, s, vsel_vd_vo_d_d(o, -0.000325991886927389905997954, 0.0024903945701927185027435600));
  x = ddadd2_vd2_vd_vd2(vmul_vd_vd_vd(u, s), vsel_vd2_vo_d_d_d_d(o, 0.0158543442438155018914259, -1.04693272280631521908845e-18, -0.0807455121882807852484731, 3.61852475067037104849987e-18));
  x = ddadd2_vd2_vd2_vd2(ddmul_vd2_vd2_vd2(s2, x), vsel_vd2_vo_d_d_d_d(o, -0.308425137534042437259529, -1.95698492133633550338345e-17, 0.785398163397448278999491, 3.06287113727155002607105e-17));

  // Cosine branch multiplies by t*t and adds 1; sine branch multiplies by t.
  x = ddmul_vd2_vd2_vd2(x, vsel_vd2_vo_vd2_vd2(o, s2, vcast_vd2_vd_vd(t, vcast_vd_d(0))));
  x = vsel_vd2_vo_vd2_vd2(o, ddadd2_vd2_vd2_vd(x, vcast_vd_d(1)), x);

  // Negate when bit 2 of (q + 2) is set.
  o = vcast_vo64_vo32(veq_vo_vi_vi(vand_vi_vi_vi(vadd_vi_vi_vi(q, vcast_vi_i(2)), vcast_vi_i(4)), vcast_vi_i(4)));
  x = vd2setx_vd2_vd2_vd(x, vreinterpret_vd_vm(vxor_vm_vm_vm(vand_vm_vo64_vm(o, vreinterpret_vm_vd(vcast_vd_d(-0.0))), vreinterpret_vm_vd(vd2getx_vd_vd2(x)))));
  x = vd2sety_vd2_vd2_vd(x, vreinterpret_vd_vm(vxor_vm_vm_vm(vand_vm_vo64_vm(o, vreinterpret_vm_vd(vcast_vd_d(-0.0))), vreinterpret_vm_vd(vd2gety_vd_vd2(x)))));

  return x;
}

// cos(pi*d), _U05 accuracy tier.  Out-of-range inputs clamp to 1,
// infinity yields NaN.
EXPORT CONST VECTOR_CC vdouble xcospi_u05(vdouble d) {
  vdouble2 x = cospik(d);
  vdouble r = vadd_vd_vd_vd(vd2getx_vd_vd2(x), vd2gety_vd_vd2(x));

  r = vsel_vd_vo_vd_vd(vgt_vo_vd_vd(vabs_vd_vd(d), vcast_vd_d(TRIGRANGEMAX3/4)), vcast_vd_d(1), r);
  r = vreinterpret_vd_vm(vor_vm_vo64_vm(visinf_vo_vd(d), vreinterpret_vm_vd(r)));

  return r;
}

// Vector tangent.  Three-tier argument reduction by magnitude (Cody-Waite
// with 2, then 7 split-pi constants, then rempi/Payne-Hanek style fallback),
// followed by a half-angle polynomial and the tan(2a) reconstruction.
EXPORT CONST VECTOR_CC vdouble xtan(vdouble d) {
#if !defined(DETERMINISTIC)
  vdouble u, s, x, y;
  vopmask o;
  vint ql;

  if (LIKELY(vtestallones_i_vo64(vlt_vo_vd_vd(vabs_vd_vd(d), vcast_vd_d(TRIGRANGEMAX2))))) {
    // Small range: single-stage Cody-Waite reduction by pi/2.
    vdouble dql = vrint_vd_vd(vmul_vd_vd_vd(d, vcast_vd_d(2 * M_1_PI)));
    ql = vrint_vi_vd(dql);
    x = vmla_vd_vd_vd_vd(dql, vcast_vd_d(-PI_A2 * 0.5), d);
    x = vmla_vd_vd_vd_vd(dql, vcast_vd_d(-PI_B2 * 0.5), x);
  } else if (LIKELY(vtestallones_i_vo64(vlt_vo_vd_vd(vabs_vd_vd(d), vcast_vd_d(1e+6))))) {
    // Medium range: split the quotient into a high part (multiple of 2^24)
    // and a low part, then subtract pi/2 in four pieces (PI_A..PI_D).
    vdouble dqh = vtruncate_vd_vd(vmul_vd_vd_vd(d, vcast_vd_d(2*M_1_PI / (1 << 24))));
    dqh = vmul_vd_vd_vd(dqh, vcast_vd_d(1 << 24));
    vdouble dql = vrint_vd_vd(vsub_vd_vd_vd(vmul_vd_vd_vd(d, vcast_vd_d(2*M_1_PI)), dqh));
    ql = vrint_vi_vd(dql);

    x = vmla_vd_vd_vd_vd(dqh, vcast_vd_d(-PI_A * 0.5), d);
    x = vmla_vd_vd_vd_vd(dql, vcast_vd_d(-PI_A * 0.5), x);
    x = vmla_vd_vd_vd_vd(dqh, vcast_vd_d(-PI_B * 0.5), x);
    x = vmla_vd_vd_vd_vd(dql, vcast_vd_d(-PI_B * 0.5), x);
    x = vmla_vd_vd_vd_vd(dqh, vcast_vd_d(-PI_C * 0.5), x);
    x = vmla_vd_vd_vd_vd(dql, vcast_vd_d(-PI_C * 0.5), x);
    x = vmla_vd_vd_vd_vd(vadd_vd_vd_vd(dqh, dql), vcast_vd_d(-PI_D * 0.5), x);
  } else {
    // Huge arguments: full-precision reduction via rempi.
    ddi_t ddi = rempi(d);
    ql = ddigeti_vi_ddi(ddi);
    x = vadd_vd_vd_vd(vd2getx_vd_vd2(ddigetdd_vd2_ddi(ddi)), vd2gety_vd_vd2(ddigetdd_vd2_ddi(ddi)));
    // Inf/NaN input -> NaN (the first OR is subsumed by the second; kept
    // as in upstream).
    x = vreinterpret_vd_vm(vor_vm_vo64_vm(visinf_vo_vd(d), vreinterpret_vm_vd(x)));
    x = vreinterpret_vd_vm(vor_vm_vo64_vm(vor_vo_vo_vo(visinf_vo_vd(d), visnan_vo_vd(d)), vreinterpret_vm_vd(x)));
  }

  // Half-angle: evaluate the polynomial at x/2, then rebuild tan via
  // tan(2a) = 2u / (1 - u^2).
  x = vmul_vd_vd_vd(x, vcast_vd_d(0.5));
  s = vmul_vd_vd_vd(x, x);

  vdouble s2 = vmul_vd_vd_vd(s, s), s4 = vmul_vd_vd_vd(s2, s2);
  u = POLY8(s, s2, s4,
            +0.3245098826639276316e-3,
            +0.5619219738114323735e-3,
            +0.1460781502402784494e-2,
            +0.3591611540792499519e-2,
            +0.8863268409563113126e-2,
            +0.2186948728185535498e-1,
            +0.5396825399517272970e-1,
            +0.1333333333330500581e+0);
  u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(+0.3333333333333343695e+0));
  u = vmla_vd_vd_vd_vd(s, vmul_vd_vd_vd(u, x), x);

  y = vmla_vd_vd_vd_vd(u, u, vcast_vd_d(-1));
  x = vmul_vd_vd_vd(u, vcast_vd_d(-2));

  // Odd quadrants take the reciprocal (negated) form.
  o = vcast_vo64_vo32(veq_vo_vi_vi(vand_vi_vi_vi(ql, vcast_vi_i(1)), vcast_vi_i(1)));
  u = vdiv_vd_vd_vd(vsel_vd_vo_vd_vd(o, vneg_vd_vd(y), x), vsel_vd_vo_vd_vd(o, x, y));
  u = vsel_vd_vo_vd_vd(veq_vo_vd_vd(d, vcast_vd_d(0)), d, u); // tan(+-0) = +-0

  return u;
#else // #if !defined(DETERMINISTIC)
  // Deterministic build: branch-free variant -- compute the cheap reduction
  // for all lanes, then patch in the wider-range reductions only where
  // needed, so results never depend on which lanes triggered a branch.
  vdouble u, s, x, y;
  vopmask o;
  vint ql;

  vdouble dql = vrint_vd_vd(vmul_vd_vd_vd(d, vcast_vd_d(2 * M_1_PI)));
  ql = vrint_vi_vd(dql);
  s = vmla_vd_vd_vd_vd(dql, vcast_vd_d(-PI_A2 * 0.5), d);
  s = vmla_vd_vd_vd_vd(dql, vcast_vd_d(-PI_B2 * 0.5), s);
  vopmask g = vlt_vo_vd_vd(vabs_vd_vd(d), vcast_vd_d(TRIGRANGEMAX2));

  if (!LIKELY(vtestallones_i_vo64(g))) {
    vdouble dqh = vtruncate_vd_vd(vmul_vd_vd_vd(d, vcast_vd_d(2*M_1_PI / (1 << 24))));
    dqh = vmul_vd_vd_vd(dqh, vcast_vd_d(1 << 24));
    vdouble dql = vrint_vd_vd(vsub_vd_vd_vd(vmul_vd_vd_vd(d, vcast_vd_d(2*M_1_PI)), dqh));

    u = vmla_vd_vd_vd_vd(dqh, vcast_vd_d(-PI_A * 0.5), d);
    u = vmla_vd_vd_vd_vd(dql, vcast_vd_d(-PI_A * 0.5), u);
    u = vmla_vd_vd_vd_vd(dqh, vcast_vd_d(-PI_B * 0.5), u);
    u = vmla_vd_vd_vd_vd(dql, vcast_vd_d(-PI_B * 0.5), u);
    u = vmla_vd_vd_vd_vd(dqh, vcast_vd_d(-PI_C * 0.5), u);
    u = vmla_vd_vd_vd_vd(dql, vcast_vd_d(-PI_C * 0.5), u);
    u = vmla_vd_vd_vd_vd(vadd_vd_vd_vd(dqh, dql), vcast_vd_d(-PI_D * 0.5), u);

    // Merge: keep the cheap result where g holds, the wide result elsewhere.
    ql = vsel_vi_vo_vi_vi(vcast_vo32_vo64(g), ql, vrint_vi_vd(dql));
    s = vsel_vd_vo_vd_vd(g, s, u);

    g = vlt_vo_vd_vd(vabs_vd_vd(d), vcast_vd_d(1e+6));

    if (!LIKELY(vtestallones_i_vo64(g))) {
      ddi_t ddi = rempi(d);
      vint ql2 = ddigeti_vi_ddi(ddi);
      u = vadd_vd_vd_vd(vd2getx_vd_vd2(ddigetdd_vd2_ddi(ddi)), vd2gety_vd_vd2(ddigetdd_vd2_ddi(ddi)));
      u = vreinterpret_vd_vm(vor_vm_vo64_vm(vor_vo_vo_vo(visinf_vo_vd(d), visnan_vo_vd(d)), vreinterpret_vm_vd(u)));

      ql = vsel_vi_vo_vi_vi(vcast_vo32_vo64(g), ql, ql2);
      s = vsel_vd_vo_vd_vd(g, s, u);
    }
  }

  // Same half-angle evaluation and reconstruction as the branchy path.
  x = vmul_vd_vd_vd(s, vcast_vd_d(0.5));
  s = vmul_vd_vd_vd(x, x);

  vdouble s2 = vmul_vd_vd_vd(s, s), s4 = vmul_vd_vd_vd(s2, s2);
  u = POLY8(s, s2, s4,
            +0.3245098826639276316e-3,
            +0.5619219738114323735e-3,
            +0.1460781502402784494e-2,
            +0.3591611540792499519e-2,
            +0.8863268409563113126e-2,
            +0.2186948728185535498e-1,
            +0.5396825399517272970e-1,
            +0.1333333333330500581e+0);
  u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(+0.3333333333333343695e+0));
  u = vmla_vd_vd_vd_vd(s, vmul_vd_vd_vd(u, x), x);

  y = vmla_vd_vd_vd_vd(u, u, vcast_vd_d(-1));
  x =
vmul_vd_vd_vd(u, vcast_vd_d(-2));

  // Odd quadrants take the reciprocal (negated) form.
  o = vcast_vo64_vo32(veq_vo_vi_vi(vand_vi_vi_vi(ql, vcast_vi_i(1)), vcast_vi_i(1)));
  u = vdiv_vd_vd_vd(vsel_vd_vo_vd_vd(o, vneg_vd_vd(y), x), vsel_vd_vo_vd_vd(o, x, y));
  u = vsel_vd_vo_vd_vd(veq_vo_vd_vd(d, vcast_vd_d(0)), d, u); // tan(+-0) = +-0

  return u;
#endif // #if !defined(DETERMINISTIC)
}

// Higher-accuracy tangent (_u1 tier): same three-tier reduction as xtan but
// the residual and reconstruction are carried in double-double arithmetic.
EXPORT CONST VECTOR_CC vdouble xtan_u1(vdouble d) {
#if !defined(DETERMINISTIC)
  vdouble u;
  vdouble2 s, t, x, y;
  vopmask o;
  vint ql;

  if (LIKELY(vtestallones_i_vo64(vlt_vo_vd_vd(vabs_vd_vd(d), vcast_vd_d(TRIGRANGEMAX2))))) {
    // Small range: Cody-Waite reduction, residual kept as a double-double.
    vdouble dql = vrint_vd_vd(vmul_vd_vd_vd(d, vcast_vd_d(2 * M_1_PI)));
    ql = vrint_vi_vd(dql);
    u = vmla_vd_vd_vd_vd(dql, vcast_vd_d(-PI_A2*0.5), d);
    s = ddadd_vd2_vd_vd (u, vmul_vd_vd_vd(dql, vcast_vd_d(-PI_B2*0.5)));
  } else if (LIKELY(vtestallones_i_vo64(vlt_vo_vd_vd(vabs_vd_vd(d), vcast_vd_d(TRIGRANGEMAX))))) {
    // Medium range: quotient from a double-double 2/pi product, then
    // four-piece pi/2 subtraction.
    vdouble dqh = vtruncate_vd_vd(vmul_vd_vd_vd(d, vcast_vd_d(2*M_1_PI / (1 << 24))));
    dqh = vmul_vd_vd_vd(dqh, vcast_vd_d(1 << 24));
    s = ddadd2_vd2_vd2_vd(ddmul_vd2_vd2_vd(vcast_vd2_d_d(M_2_PI_H, M_2_PI_L), d),
                          vsub_vd_vd_vd(vsel_vd_vo_vd_vd(vlt_vo_vd_vd(d, vcast_vd_d(0)), vcast_vd_d(-0.5), vcast_vd_d(0.5)), dqh));
    const vdouble dql = vtruncate_vd_vd(vadd_vd_vd_vd(vd2getx_vd_vd2(s), vd2gety_vd_vd2(s)));
    ql = vrint_vi_vd(dql);

    u = vmla_vd_vd_vd_vd(dqh, vcast_vd_d(-PI_A * 0.5), d);
    s = ddadd_vd2_vd_vd(u, vmul_vd_vd_vd(dql, vcast_vd_d(-PI_A*0.5 )));
    s = ddadd2_vd2_vd2_vd(s, vmul_vd_vd_vd(dqh, vcast_vd_d(-PI_B*0.5)));
    s = ddadd2_vd2_vd2_vd(s, vmul_vd_vd_vd(dql, vcast_vd_d(-PI_B*0.5 )));
    s = ddadd2_vd2_vd2_vd(s, vmul_vd_vd_vd(dqh, vcast_vd_d(-PI_C*0.5)));
    s = ddadd2_vd2_vd2_vd(s, vmul_vd_vd_vd(dql, vcast_vd_d(-PI_C*0.5 )));
    s = ddadd_vd2_vd2_vd(s, vmul_vd_vd_vd(vadd_vd_vd_vd(dqh, dql), vcast_vd_d(-PI_D*0.5)));
  } else {
    // Huge arguments: full-precision reduction via rempi; inf/NaN -> NaN.
    ddi_t ddi = rempi(d);
    ql = ddigeti_vi_ddi(ddi);
    s = ddigetdd_vd2_ddi(ddi);
    o = vor_vo_vo_vo(visinf_vo_vd(d), visnan_vo_vd(d));
    s = vd2setx_vd2_vd2_vd(s, vreinterpret_vd_vm(vor_vm_vo64_vm(o, vreinterpret_vm_vd(vd2getx_vd_vd2(s)))));
    s = vd2sety_vd2_vd2_vd(s, vreinterpret_vd_vm(vor_vm_vo64_vm(o, vreinterpret_vm_vd(vd2gety_vd_vd2(s)))));
  }

  // Half-angle in double-double, then tan(2a) = -2x / (x^2 - 1).
  t = ddscale_vd2_vd2_vd(s, vcast_vd_d(0.5));
  s = ddsqu_vd2_vd2(t);

  vdouble s2 = vmul_vd_vd_vd(vd2getx_vd_vd2(s), vd2getx_vd_vd2(s)), s4 = vmul_vd_vd_vd(s2, s2);
  u = POLY8(vd2getx_vd_vd2(s), s2, s4,
            +0.3245098826639276316e-3,
            +0.5619219738114323735e-3,
            +0.1460781502402784494e-2,
            +0.3591611540792499519e-2,
            +0.8863268409563113126e-2,
            +0.2186948728185535498e-1,
            +0.5396825399517272970e-1,
            +0.1333333333330500581e+0);
  u = vmla_vd_vd_vd_vd(u, vd2getx_vd_vd2(s), vcast_vd_d(+0.3333333333333343695e+0));
  x = ddadd_vd2_vd2_vd2(t, ddmul_vd2_vd2_vd(ddmul_vd2_vd2_vd2(s, t), u));

  y = ddadd_vd2_vd_vd2(vcast_vd_d(-1), ddsqu_vd2_vd2(x));
  x = ddscale_vd2_vd2_vd(x, vcast_vd_d(-2));

  o = vcast_vo64_vo32(veq_vo_vi_vi(vand_vi_vi_vi(ql, vcast_vi_i(1)), vcast_vi_i(1)));
  x = dddiv_vd2_vd2_vd2(vsel_vd2_vo_vd2_vd2(o, ddneg_vd2_vd2(y), x), vsel_vd2_vo_vd2_vd2(o, x, y));

  u = vadd_vd_vd_vd(vd2getx_vd_vd2(x), vd2gety_vd_vd2(x));
  u = vsel_vd_vo_vd_vd(veq_vo_vd_vd(d, vcast_vd_d(0)), d, u); // tan(+-0) = +-0

  return u;
#else // #if !defined(DETERMINISTIC)
  // Deterministic build: branch-free merge of the three reduction tiers
  // (see xtan's deterministic path for the pattern).
  vdouble u;
  vdouble2 s, t, x, y;
  vopmask o;
  vint ql;

  const vdouble dql = vrint_vd_vd(vmul_vd_vd_vd(d, vcast_vd_d(2 * M_1_PI)));
  ql = vrint_vi_vd(dql);
  u = vmla_vd_vd_vd_vd(dql, vcast_vd_d(-PI_A2*0.5), d);
  s = ddadd_vd2_vd_vd (u, vmul_vd_vd_vd(dql, vcast_vd_d(-PI_B2*0.5)));
  vopmask g = vlt_vo_vd_vd(vabs_vd_vd(d), vcast_vd_d(TRIGRANGEMAX2));

  if (!LIKELY(vtestallones_i_vo64(g))) {
    vdouble dqh = vtruncate_vd_vd(vmul_vd_vd_vd(d, vcast_vd_d(2*M_1_PI / (1 << 24))));
    dqh = vmul_vd_vd_vd(dqh, vcast_vd_d(1 << 24));
    x = ddadd2_vd2_vd2_vd(ddmul_vd2_vd2_vd(vcast_vd2_d_d(M_2_PI_H, M_2_PI_L), d),
                          vsub_vd_vd_vd(vsel_vd_vo_vd_vd(vlt_vo_vd_vd(d, vcast_vd_d(0)), vcast_vd_d(-0.5), vcast_vd_d(0.5)), dqh));
    const vdouble dql = vtruncate_vd_vd(vadd_vd_vd_vd(vd2getx_vd_vd2(x), vd2gety_vd_vd2(x)));

    u = vmla_vd_vd_vd_vd(dqh, vcast_vd_d(-PI_A * 0.5), d);
    x = ddadd_vd2_vd_vd(u, vmul_vd_vd_vd(dql, vcast_vd_d(-PI_A*0.5 )));
    x = ddadd2_vd2_vd2_vd(x, vmul_vd_vd_vd(dqh, vcast_vd_d(-PI_B*0.5)));
    x = ddadd2_vd2_vd2_vd(x, vmul_vd_vd_vd(dql, vcast_vd_d(-PI_B*0.5 )));
    x = ddadd2_vd2_vd2_vd(x, vmul_vd_vd_vd(dqh, vcast_vd_d(-PI_C*0.5)));
    x = ddadd2_vd2_vd2_vd(x, vmul_vd_vd_vd(dql, vcast_vd_d(-PI_C*0.5 )));
    x = ddadd_vd2_vd2_vd(x, vmul_vd_vd_vd(vadd_vd_vd_vd(dqh, dql), vcast_vd_d(-PI_D*0.5)));

    // Merge wide-range result into lanes where the cheap reduction failed.
    ql = vsel_vi_vo_vi_vi(vcast_vo32_vo64(g), ql, vrint_vi_vd(dql));
    s = vsel_vd2_vo_vd2_vd2(g, s, x);

    g = vlt_vo_vd_vd(vabs_vd_vd(d), vcast_vd_d(TRIGRANGEMAX));

    if (!LIKELY(vtestallones_i_vo64(g))) {
      ddi_t ddi = rempi(d);
      x = ddigetdd_vd2_ddi(ddi);
      o = vor_vo_vo_vo(visinf_vo_vd(d), visnan_vo_vd(d));
      x = vd2setx_vd2_vd2_vd(x, vreinterpret_vd_vm(vor_vm_vo64_vm(o, vreinterpret_vm_vd(vd2getx_vd_vd2(x)))));
      x = vd2sety_vd2_vd2_vd(x, vreinterpret_vd_vm(vor_vm_vo64_vm(o, vreinterpret_vm_vd(vd2gety_vd_vd2(x)))));

      ql = vsel_vi_vo_vi_vi(vcast_vo32_vo64(g), ql, ddigeti_vi_ddi(ddi));
      s = vsel_vd2_vo_vd2_vd2(g, s, x);
    }
  }

  t = ddscale_vd2_vd2_vd(s, vcast_vd_d(0.5));
  s = ddsqu_vd2_vd2(t);

  vdouble s2 = vmul_vd_vd_vd(vd2getx_vd_vd2(s), vd2getx_vd_vd2(s)), s4 = vmul_vd_vd_vd(s2, s2);
  u = POLY8(vd2getx_vd_vd2(s), s2, s4,
            +0.3245098826639276316e-3,
            +0.5619219738114323735e-3,
            +0.1460781502402784494e-2,
            +0.3591611540792499519e-2,
            +0.8863268409563113126e-2,
            +0.2186948728185535498e-1,
            +0.5396825399517272970e-1,
            +0.1333333333330500581e+0);
  u = vmla_vd_vd_vd_vd(u, vd2getx_vd_vd2(s), vcast_vd_d(+0.3333333333333343695e+0));
  x = ddadd_vd2_vd2_vd2(t, ddmul_vd2_vd2_vd(ddmul_vd2_vd2_vd2(s, t), u));

  y = ddadd_vd2_vd_vd2(vcast_vd_d(-1), ddsqu_vd2_vd2(x));
  x = ddscale_vd2_vd2_vd(x, vcast_vd_d(-2));

  o = vcast_vo64_vo32(veq_vo_vi_vi(vand_vi_vi_vi(ql, vcast_vi_i(1)), vcast_vi_i(1)));
  x = dddiv_vd2_vd2_vd2(vsel_vd2_vo_vd2_vd2(o, ddneg_vd2_vd2(y), x), vsel_vd2_vo_vd2_vd2(o, x, y));

  u = vadd_vd_vd_vd(vd2getx_vd_vd2(x), vd2gety_vd_vd2(x));
  u = vsel_vd_vo_vd_vd(veq_vo_vd_vd(d,
vcast_vd_d(0)), d, u); // tan(+-0) = +-0

  return u;
#endif // #if !defined(DETERMINISTIC)
}

// Core atan2 kernel (plain double).  Encodes the octant in q (multiples of
// pi/2 to add back at the end) and evaluates atan of the reduced ratio s/t
// with a degree-19 odd polynomial.  NOTE(review): vsel_vi_vd_vi appears to
// select on the sign of its vdouble argument -- confirm against the helper's
// definition elsewhere in this file.
static INLINE CONST VECTOR_CC vdouble atan2k(vdouble y, vdouble x) {
  vdouble s, t, u;
  vint q;
  vopmask p;

  q = vsel_vi_vd_vi(x, vcast_vi_i(-2));
  x = vabs_vd_vd(x);

  q = vsel_vi_vd_vd_vi_vi(x, y, vadd_vi_vi_vi(q, vcast_vi_i(1)), q);
  p = vlt_vo_vd_vd(x, y);
  s = vsel_vd_vo_vd_vd(p, vneg_vd_vd(x), y);
  t = vmax_vd_vd_vd(x, y);

  // Reduce to |s/t| <= 1 so the polynomial converges.
  s = vdiv_vd_vd_vd(s, t);
  t = vmul_vd_vd_vd(s, s);

  vdouble t2 = vmul_vd_vd_vd(t, t), t4 = vmul_vd_vd_vd(t2, t2), t8 = vmul_vd_vd_vd(t4, t4), t16 = vmul_vd_vd_vd(t8, t8);
  u = POLY19(t, t2, t4, t8, t16,
             -1.88796008463073496563746e-05,
             0.000209850076645816976906797,
             -0.00110611831486672482563471,
             0.00370026744188713119232403,
             -0.00889896195887655491740809,
             0.016599329773529201970117,
             -0.0254517624932312641616861,
             0.0337852580001353069993897,
             -0.0407629191276836500001934,
             0.0466667150077840625632675,
             -0.0523674852303482457616113,
             0.0587666392926673580854313,
             -0.0666573579361080525984562,
             0.0769219538311769618355029,
             -0.090908995008245008229153,
             0.111111105648261418443745,
             -0.14285714266771329383765,
             0.199999999996591265594148,
             -0.333333333333311110369124);

  t = vmla_vd_vd_vd_vd(s, vmul_vd_vd_vd(t, u), s);
  t = vmla_vd_vd_vd_vd(vcast_vd_vi(q), vcast_vd_d(M_PI/2), t); // add back q * pi/2

  return t;
}

// Double-double atan2 kernel backing the _u1 entry points.  Same octant
// scheme as atan2k, but the division, squaring and final pi/2 multiples are
// all carried in double-double precision.
static INLINE CONST VECTOR_CC vdouble2 atan2k_u1(vdouble2 y, vdouble2 x) {
  vdouble u;
  vdouble2 s, t;
  vint q;
  vopmask p;

  q = vsel_vi_vd_vi(vd2getx_vd_vd2(x), vcast_vi_i(-2));
  p = vlt_vo_vd_vd(vd2getx_vd_vd2(x), vcast_vd_d(0));
  // b holds the sign bit where x < 0; XOR both halves to take |x|.
  vmask b = vand_vm_vo64_vm(p, vreinterpret_vm_vd(vcast_vd_d(-0.0)));
  x = vd2setx_vd2_vd2_vd(x, vreinterpret_vd_vm(vxor_vm_vm_vm(b, vreinterpret_vm_vd(vd2getx_vd_vd2(x)))));
  x = vd2sety_vd2_vd2_vd(x, vreinterpret_vd_vm(vxor_vm_vm_vm(b, vreinterpret_vm_vd(vd2gety_vd_vd2(x)))));

  q = vsel_vi_vd_vd_vi_vi(vd2getx_vd_vd2(x), vd2getx_vd_vd2(y), vadd_vi_vi_vi(q, vcast_vi_i(1)), q);
  p = vlt_vo_vd_vd(vd2getx_vd_vd2(x), vd2getx_vd_vd2(y));
  s = vsel_vd2_vo_vd2_vd2(p, ddneg_vd2_vd2(x), y);
  t = vsel_vd2_vo_vd2_vd2(p, y, x);

  s = dddiv_vd2_vd2_vd2(s, t);
  t = ddsqu_vd2_vd2(s);
  t = ddnormalize_vd2_vd2(t);

  vdouble t2 = vmul_vd_vd_vd(vd2getx_vd_vd2(t), vd2getx_vd_vd2(t)), t4 = vmul_vd_vd_vd(t2, t2), t8 = vmul_vd_vd_vd(t4, t4), t16 = vmul_vd_vd_vd(t8, t8);
  u = POLY16(vd2getx_vd_vd2(t), t2, t4, t8,
             1.06298484191448746607415e-05,
             -0.000125620649967286867384336,
             0.00070557664296393412389774,
             -0.00251865614498713360352999,
             0.00646262899036991172313504,
             -0.0128281333663399031014274,
             0.0208024799924145797902497,
             -0.0289002344784740315686289,
             0.0359785005035104590853656,
             -0.041848579703592507506027,
             0.0470843011653283988193763,
             -0.0524914210588448421068719,
             0.0587946590969581003860434,
             -0.0666620884778795497194182,
             0.0769225330296203768654095,
             -0.0909090442773387574781907);

  u = vmla_vd_vd_vd_vd(u, vd2getx_vd_vd2(t), vcast_vd_d(0.111111108376896236538123));
  u = vmla_vd_vd_vd_vd(u, vd2getx_vd_vd2(t), vcast_vd_d(-0.142857142756268568062339));
  u = vmla_vd_vd_vd_vd(u, vd2getx_vd_vd2(t), vcast_vd_d(0.199999999997977351284817));
  u = vmla_vd_vd_vd_vd(u, vd2getx_vd_vd2(t), vcast_vd_d(-0.333333333333317605173818));

  t = ddadd_vd2_vd2_vd2(s, ddmul_vd2_vd2_vd(ddmul_vd2_vd2_vd2(s, t), u));
  // Add back q * (pi/2) with pi/2 as an exact double-double constant.
  t = ddadd_vd2_vd2_vd2(ddmul_vd2_vd2_vd(vcast_vd2_d_d(1.570796326794896557998982, 6.12323399573676603586882e-17), vcast_vd_vi(q)), t);

  return t;
}

// Helper for atan2 special cases: returns m masked to lanes where d is
// infinite, with d's sign bit copied onto it; zero elsewhere.
static INLINE CONST VECTOR_CC vdouble visinf2_vd_vd_vd(vdouble d, vdouble m) {
  return vreinterpret_vd_vm(vand_vm_vo64_vm(visinf_vo_vd(d), vor_vm_vm_vm(vand_vm_vm_vm(vreinterpret_vm_vd(d), vreinterpret_vm_vd(vcast_vd_d(-0.0))), vreinterpret_vm_vd(m))));
}

// atan2(y, x): kernel result plus explicit IEEE special-case handling
// (x or y zero/inf/NaN, sign propagation via mulsign).
EXPORT CONST VECTOR_CC vdouble xatan2(vdouble y, vdouble x) {
  vdouble r = atan2k(vabs_vd_vd(y), x);

  r = vmulsign_vd_vd_vd(r, x);
  r = vsel_vd_vo_vd_vd(vor_vo_vo_vo(visinf_vo_vd(x), veq_vo_vd_vd(x, vcast_vd_d(0))), vsub_vd_vd_vd(vcast_vd_d(M_PI/2), visinf2_vd_vd_vd(x, vmulsign_vd_vd_vd(vcast_vd_d(M_PI/2), x))), r);
  r = vsel_vd_vo_vd_vd(visinf_vo_vd(y), vsub_vd_vd_vd(vcast_vd_d(M_PI/2), visinf2_vd_vd_vd(x, vmulsign_vd_vd_vd(vcast_vd_d(M_PI/4), x))), r);
  r = vsel_vd_vo_vd_vd(veq_vo_vd_vd(y, vcast_vd_d(0.0)), vreinterpret_vd_vm(vand_vm_vo64_vm(vsignbit_vo_vd(x), vreinterpret_vm_vd(vcast_vd_d(M_PI)))), r);

  // NaN in either input -> NaN; otherwise give the result y's sign.
  r = vreinterpret_vd_vm(vor_vm_vo64_vm(vor_vo_vo_vo(visnan_vo_vd(x), visnan_vo_vd(y)), vreinterpret_vm_vd(vmulsign_vd_vd_vd(r, y))));
  return r;
}

// Higher-accuracy atan2.  Tiny |x| is pre-scaled by 2^53 (y too, so the
// ratio is unchanged) to avoid losing precision near the underflow bound.
EXPORT CONST VECTOR_CC vdouble xatan2_u1(vdouble y, vdouble x) {
  vopmask o = vlt_vo_vd_vd(vabs_vd_vd(x), vcast_vd_d(5.5626846462680083984e-309)); // nexttoward((1.0 / DBL_MAX), 1)
  x = vsel_vd_vo_vd_vd(o, vmul_vd_vd_vd(x, vcast_vd_d(UINT64_C(1) << 53)), x);
  y = vsel_vd_vo_vd_vd(o, vmul_vd_vd_vd(y, vcast_vd_d(UINT64_C(1) << 53)), y);

  vdouble2 d = atan2k_u1(vcast_vd2_vd_vd(vabs_vd_vd(y), vcast_vd_d(0)), vcast_vd2_vd_vd(x, vcast_vd_d(0)));
  vdouble r = vadd_vd_vd_vd(vd2getx_vd_vd2(d), vd2gety_vd_vd2(d));

  // Same special-case ladder as xatan2.
  r = vmulsign_vd_vd_vd(r, x);
  r = vsel_vd_vo_vd_vd(vor_vo_vo_vo(visinf_vo_vd(x), veq_vo_vd_vd(x, vcast_vd_d(0))), vsub_vd_vd_vd(vcast_vd_d(M_PI/2), visinf2_vd_vd_vd(x, vmulsign_vd_vd_vd(vcast_vd_d(M_PI/2), x))), r);
  r = vsel_vd_vo_vd_vd(visinf_vo_vd(y), vsub_vd_vd_vd(vcast_vd_d(M_PI/2), visinf2_vd_vd_vd(x, vmulsign_vd_vd_vd(vcast_vd_d(M_PI/4), x))), r);
  r = vsel_vd_vo_vd_vd(veq_vo_vd_vd(y, vcast_vd_d(0.0)), vreinterpret_vd_vm(vand_vm_vo64_vm(vsignbit_vo_vd(x), vreinterpret_vm_vd(vcast_vd_d(M_PI)))), r);

  r = vreinterpret_vd_vm(vor_vm_vo64_vm(vor_vo_vo_vo(visnan_vo_vd(x), visnan_vo_vd(y)), vreinterpret_vm_vd(vmulsign_vd_vd_vd(r, y))));
  return r;
}

// arcsine.  |d| < 0.5 uses the direct series in d; otherwise the identity
// asin(d) = pi/2 - 2*asin(sqrt((1-|d|)/2)) keeps the argument small.
EXPORT CONST VECTOR_CC vdouble xasin(vdouble d) {
  vopmask o = vlt_vo_vd_vd(vabs_vd_vd(d), vcast_vd_d(0.5));
  vdouble x2 = vsel_vd_vo_vd_vd(o, vmul_vd_vd_vd(d, d), vmul_vd_vd_vd(vsub_vd_vd_vd(vcast_vd_d(1), vabs_vd_vd(d)), vcast_vd_d(0.5)));
  vdouble x = vsel_vd_vo_vd_vd(o, vabs_vd_vd(d), vsqrt_vd_vd(x2)), u;

  vdouble x4 = vmul_vd_vd_vd(x2, x2), x8 = vmul_vd_vd_vd(x4, x4), x16 = vmul_vd_vd_vd(x8, x8);
  u = POLY12(x2, x4, x8, x16,
             +0.3161587650653934628e-1,
             -0.1581918243329996643e-1,
+0.1929045477267910674e-1,
             +0.6606077476277170610e-2,
             +0.1215360525577377331e-1,
             +0.1388715184501609218e-1,
             +0.1735956991223614604e-1,
             +0.2237176181932048341e-1,
             +0.3038195928038132237e-1,
             +0.4464285681377102438e-1,
             +0.7500000000378581611e-1,
             +0.1666666666666497543e+0);

  u = vmla_vd_vd_vd_vd(u, vmul_vd_vd_vd(x, x2), x);

  // Large-|d| branch: asin(d) = pi/2 - 2u; result takes d's sign.
  vdouble r = vsel_vd_vo_vd_vd(o, u, vmla_vd_vd_vd_vd(u, vcast_vd_d(-2), vcast_vd_d(M_PI/2)));
  return vmulsign_vd_vd_vd(r, d);
}

// Higher-accuracy arcsine: same branch structure as xasin, with the sqrt
// and the final pi/4 combination done in double-double arithmetic.
EXPORT CONST VECTOR_CC vdouble xasin_u1(vdouble d) {
  vopmask o = vlt_vo_vd_vd(vabs_vd_vd(d), vcast_vd_d(0.5));
  vdouble x2 = vsel_vd_vo_vd_vd(o, vmul_vd_vd_vd(d, d), vmul_vd_vd_vd(vsub_vd_vd_vd(vcast_vd_d(1), vabs_vd_vd(d)), vcast_vd_d(0.5))), u;
  vdouble2 x = vsel_vd2_vo_vd2_vd2(o, vcast_vd2_vd_vd(vabs_vd_vd(d), vcast_vd_d(0)), ddsqrt_vd2_vd(x2));
  x = vsel_vd2_vo_vd2_vd2(veq_vo_vd_vd(vabs_vd_vd(d), vcast_vd_d(1.0)), vcast_vd2_d_d(0, 0), x); // exact endpoints

  vdouble x4 = vmul_vd_vd_vd(x2, x2), x8 = vmul_vd_vd_vd(x4, x4), x16 = vmul_vd_vd_vd(x8, x8);
  u = POLY12(x2, x4, x8, x16,
             +0.3161587650653934628e-1,
             -0.1581918243329996643e-1,
             +0.1929045477267910674e-1,
             +0.6606077476277170610e-2,
             +0.1215360525577377331e-1,
             +0.1388715184501609218e-1,
             +0.1735956991223614604e-1,
             +0.2237176181932048341e-1,
             +0.3038195928038132237e-1,
             +0.4464285681377102438e-1,
             +0.7500000000378581611e-1,
             +0.1666666666666497543e+0);

  u = vmul_vd_vd_vd(u, vmul_vd_vd_vd(x2, vd2getx_vd_vd2(x)));

  // pi/4 carried as a double-double constant so the subtraction is accurate.
  vdouble2 y = ddsub_vd2_vd2_vd(ddsub_vd2_vd2_vd2(vcast_vd2_d_d(3.141592653589793116/4, 1.2246467991473532072e-16/4), x), u);
  vdouble r = vsel_vd_vo_vd_vd(o, vadd_vd_vd_vd(u, vd2getx_vd_vd2(x)), vmul_vd_vd_vd(vadd_vd_vd_vd(vd2getx_vd_vd2(y), vd2gety_vd_vd2(y)), vcast_vd_d(2)));
  return vmulsign_vd_vd_vd(r, d);
}

// arccosine.  Same half-angle identity as xasin; the d < 0 case reflects
// the result about pi.
EXPORT CONST VECTOR_CC vdouble xacos(vdouble d) {
  vopmask o = vlt_vo_vd_vd(vabs_vd_vd(d), vcast_vd_d(0.5));
  vdouble x2 = vsel_vd_vo_vd_vd(o, vmul_vd_vd_vd(d, d), vmul_vd_vd_vd(vsub_vd_vd_vd(vcast_vd_d(1), vabs_vd_vd(d)), vcast_vd_d(0.5))), u;
  vdouble x = vsel_vd_vo_vd_vd(o, vabs_vd_vd(d), vsqrt_vd_vd(x2));
  x = vsel_vd_vo_vd_vd(veq_vo_vd_vd(vabs_vd_vd(d), vcast_vd_d(1.0)), vcast_vd_d(0), x); // exact endpoints

  vdouble x4 = vmul_vd_vd_vd(x2, x2), x8 = vmul_vd_vd_vd(x4, x4), x16 = vmul_vd_vd_vd(x8, x8);
  u = POLY12(x2, x4, x8, x16,
             +0.3161587650653934628e-1,
             -0.1581918243329996643e-1,
             +0.1929045477267910674e-1,
             +0.6606077476277170610e-2,
             +0.1215360525577377331e-1,
             +0.1388715184501609218e-1,
             +0.1735956991223614604e-1,
             +0.2237176181932048341e-1,
             +0.3038195928038132237e-1,
             +0.4464285681377102438e-1,
             +0.7500000000378581611e-1,
             +0.1666666666666497543e+0);

  u = vmul_vd_vd_vd(u, vmul_vd_vd_vd(x2, x));

  vdouble y = vsub_vd_vd_vd(vcast_vd_d(M_PI/2), vadd_vd_vd_vd(vmulsign_vd_vd_vd(x, d), vmulsign_vd_vd_vd(u, d)));
  x = vadd_vd_vd_vd(x, u);
  vdouble r = vsel_vd_vo_vd_vd(o, y, vmul_vd_vd_vd(x, vcast_vd_d(2)));

  // d <= -0.5: acos(d) = pi - acos(-d), with pi as a double-double constant.
  return vsel_vd_vo_vd_vd(vandnot_vo_vo_vo(o, vlt_vo_vd_vd(d, vcast_vd_d(0))),
                          vd2getx_vd_vd2(ddadd_vd2_vd2_vd(vcast_vd2_d_d(3.141592653589793116, 1.2246467991473532072e-16), vneg_vd_vd(r))), r);
}

// Higher-accuracy arccosine: double-double throughout the reconstruction.
EXPORT CONST VECTOR_CC vdouble xacos_u1(vdouble d) {
  vopmask o = vlt_vo_vd_vd(vabs_vd_vd(d), vcast_vd_d(0.5));
  vdouble x2 = vsel_vd_vo_vd_vd(o, vmul_vd_vd_vd(d, d), vmul_vd_vd_vd(vsub_vd_vd_vd(vcast_vd_d(1), vabs_vd_vd(d)), vcast_vd_d(0.5))), u;
  vdouble2 x = vsel_vd2_vo_vd2_vd2(o, vcast_vd2_vd_vd(vabs_vd_vd(d), vcast_vd_d(0)), ddsqrt_vd2_vd(x2));
  x = vsel_vd2_vo_vd2_vd2(veq_vo_vd_vd(vabs_vd_vd(d), vcast_vd_d(1.0)), vcast_vd2_d_d(0, 0), x);

  vdouble x4 = vmul_vd_vd_vd(x2, x2), x8 = vmul_vd_vd_vd(x4, x4), x16 = vmul_vd_vd_vd(x8, x8);
  u = POLY12(x2, x4, x8, x16,
             +0.3161587650653934628e-1,
             -0.1581918243329996643e-1,
             +0.1929045477267910674e-1,
             +0.6606077476277170610e-2,
             +0.1215360525577377331e-1,
             +0.1388715184501609218e-1,
             +0.1735956991223614604e-1,
             +0.2237176181932048341e-1,
             +0.3038195928038132237e-1,
             +0.4464285681377102438e-1,
             +0.7500000000378581611e-1,
             +0.1666666666666497543e+0);

  u = vmul_vd_vd_vd(u, vmul_vd_vd_vd(x2, vd2getx_vd_vd2(x)));

  // pi/2 as a double-double constant.
  vdouble2 y = ddsub_vd2_vd2_vd2(vcast_vd2_d_d(3.141592653589793116/2, 1.2246467991473532072e-16/2),
                                 ddadd_vd2_vd_vd(vmulsign_vd_vd_vd(vd2getx_vd_vd2(x), d), vmulsign_vd_vd_vd(u, d)));
  x = ddadd_vd2_vd2_vd(x, u);

  y = vsel_vd2_vo_vd2_vd2(o, y, ddscale_vd2_vd2_vd(x, vcast_vd_d(2)));

  // d <= -0.5: reflect about pi.
  y = vsel_vd2_vo_vd2_vd2(vandnot_vo_vo_vo(o, vlt_vo_vd_vd(d, vcast_vd_d(0))),
                          ddsub_vd2_vd2_vd2(vcast_vd2_d_d(3.141592653589793116, 1.2246467991473532072e-16), y), y);

  return vadd_vd_vd_vd(vd2getx_vd_vd2(y), vd2gety_vd_vd2(y));
}

// Higher-accuracy arctangent, implemented as atan2_u1(|d|, 1) with the sign
// restored at the end; infinity maps to +-pi/2.
EXPORT CONST VECTOR_CC vdouble xatan_u1(vdouble d) {
  vdouble2 d2 = atan2k_u1(vcast_vd2_vd_vd(vabs_vd_vd(d), vcast_vd_d(0)), vcast_vd2_d_d(1, 0));
  vdouble r = vadd_vd_vd_vd(vd2getx_vd_vd2(d2), vd2gety_vd_vd2(d2));
  r = vsel_vd_vo_vd_vd(visinf_vo_vd(d), vcast_vd_d(1.570796326794896557998982), r);
  return vmulsign_vd_vd_vd(r, d);
}

// arctangent (faster tier).  q records the sign (bit 1) and whether the
// argument was reciprocated (bit 0); both are undone after the polynomial.
EXPORT CONST VECTOR_CC vdouble xatan(vdouble s) {
  vdouble t, u;
  vint q;
#if defined(__INTEL_COMPILER) && defined(ENABLE_PURECFMA_SCALAR)
  vdouble w = s; // keep the raw input for the +-0 workaround below
#endif

  q = vsel_vi_vd_vi(s, vcast_vi_i(2));
  s = vabs_vd_vd(s);

  // |s| > 1: evaluate on 1/s and correct with pi/2 afterwards.
  q = vsel_vi_vd_vd_vi_vi(vcast_vd_d(1), s, vadd_vi_vi_vi(q, vcast_vi_i(1)), q);
  s = vsel_vd_vo_vd_vd(vlt_vo_vd_vd(vcast_vd_d(1), s), vrec_vd_vd(s), s);

  t = vmul_vd_vd_vd(s, s);

  vdouble t2 = vmul_vd_vd_vd(t, t), t4 = vmul_vd_vd_vd(t2, t2), t8 = vmul_vd_vd_vd(t4, t4), t16 = vmul_vd_vd_vd(t8, t8);
  u = POLY19(t, t2, t4, t8, t16,
             -1.88796008463073496563746e-05,
             0.000209850076645816976906797,
             -0.00110611831486672482563471,
             0.00370026744188713119232403,
             -0.00889896195887655491740809,
             0.016599329773529201970117,
             -0.0254517624932312641616861,
             0.0337852580001353069993897,
             -0.0407629191276836500001934,
             0.0466667150077840625632675,
             -0.0523674852303482457616113,
             0.0587666392926673580854313,
             -0.0666573579361080525984562,
             0.0769219538311769618355029,
             -0.090908995008245008229153,
             0.111111105648261418443745,
             -0.14285714266771329383765,
             0.199999999996591265594148,
             -0.333333333333311110369124);

  t = vmla_vd_vd_vd_vd(s, vmul_vd_vd_vd(t, u), s);
  t =
vsel_vd_vo_vd_vd(vcast_vo64_vo32(veq_vo_vi_vi(vand_vi_vi_vi(q, vcast_vi_i(1)), vcast_vi_i(1))), vsub_vd_vd_vd(vcast_vd_d(M_PI/2), t), t);

  // Undo the reciprocal (bit 0 of q) and the sign (bit 1 of q).
  t = vreinterpret_vd_vm(vxor_vm_vm_vm(vand_vm_vo64_vm(vcast_vo64_vo32(veq_vo_vi_vi(vand_vi_vi_vi(q, vcast_vi_i(2)), vcast_vi_i(2))), vreinterpret_vm_vd(vcast_vd_d(-0.0))), vreinterpret_vm_vd(t)));

#if defined(__INTEL_COMPILER) && defined(ENABLE_PURECFMA_SCALAR)
  // ICC + pure-C FMA workaround: preserve +-0 exactly.
  t = vsel_vd_vo_vd_vd(veq_vo_vd_vd(w, vcast_vd_d(0)), w, t);
#endif
  return t;
}

#if !defined(DETERMINISTIC)
// Natural logarithm.  Extracts exponent e and mantissa m (either manually
// via ilogb/ldexp, or with AVX-512 getexp/getmant), then evaluates an
// atanh-style series in (m-1)/(m+1).
EXPORT CONST VECTOR_CC vdouble xlog(vdouble d) {
  vdouble x, x2;
  vdouble t, m;

#if !defined(ENABLE_AVX512F) && !defined(ENABLE_AVX512FNOFMA)
  // Subnormal inputs are scaled up by 2^64 first; e is corrected below.
  vopmask o = vlt_vo_vd_vd(d, vcast_vd_d(DBL_MIN));
  d = vsel_vd_vo_vd_vd(o, vmul_vd_vd_vd(d, vcast_vd_d((double)(INT64_C(1) << 32) * (double)(INT64_C(1) << 32))), d);
  vint e = vilogb2k_vi_vd(vmul_vd_vd_vd(d, vcast_vd_d(1.0/0.75)));
  m = vldexp3_vd_vd_vi(d, vneg_vi_vi(e));
  e = vsel_vi_vo_vi_vi(vcast_vo32_vo64(o), vsub_vi_vi_vi(e, vcast_vi_i(64)), e);
#else
  vdouble e = vgetexp_vd_vd(vmul_vd_vd_vd(d, vcast_vd_d(1.0/0.75)));
  e = vsel_vd_vo_vd_vd(vispinf_vo_vd(e), vcast_vd_d(1024.0), e);
  m = vgetmant_vd_vd(d);
#endif

  x = vdiv_vd_vd_vd(vsub_vd_vd_vd(m, vcast_vd_d(1)), vadd_vd_vd_vd(vcast_vd_d(1), m));
  x2 = vmul_vd_vd_vd(x, x);

  vdouble x4 = vmul_vd_vd_vd(x2, x2), x8 = vmul_vd_vd_vd(x4, x4), x3 = vmul_vd_vd_vd(x, x2);
  t = POLY7(x2, x4, x8,
            0.153487338491425068243146,
            0.152519917006351951593857,
            0.181863266251982985677316,
            0.222221366518767365905163,
            0.285714294746548025383248,
            0.399999999950799600689777,
            0.6666666666667778740063);

#if !defined(ENABLE_AVX512F) && !defined(ENABLE_AVX512FNOFMA)
  // log(d) = 2x + e*ln2 + x^3 * poly(x^2)
  x = vmla_vd_vd_vd_vd(x, vcast_vd_d(2), vmul_vd_vd_vd(vcast_vd_d(0.693147180559945286226764), vcast_vd_vi(e)));
  x = vmla_vd_vd_vd_vd(x3, t, x);

  // Special cases: +inf -> +inf; negative or NaN -> NaN; +-0 -> -inf.
  x = vsel_vd_vo_vd_vd(vispinf_vo_vd(d), vcast_vd_d(SLEEF_INFINITY), x);
  x = vsel_vd_vo_vd_vd(vor_vo_vo_vo(vlt_vo_vd_vd(d, vcast_vd_d(0)), visnan_vo_vd(d)), vcast_vd_d(SLEEF_NAN), x);
  x = vsel_vd_vo_vd_vd(veq_vo_vd_vd(d, vcast_vd_d(0)), vcast_vd_d(-SLEEF_INFINITY), x);
#else
  x = vmla_vd_vd_vd_vd(x, vcast_vd_d(2), vmul_vd_vd_vd(vcast_vd_d(0.693147180559945286226764), e));
  x = vmla_vd_vd_vd_vd(x3, t, x);
  // AVX-512 vfixupimm handles the special cases in one instruction.
  x = vfixup_vd_vd_vd_vi2_i(x, d, vcast_vi2_i((5 << (5*4))), 0);
#endif

  return x;
}
#endif // #if !defined(DETERMINISTIC)

// Exponential.  Reduce d = q*ln2 + s (Cody-Waite with L2U/L2L), evaluate a
// degree-10 polynomial of exp(s), then scale by 2^q via ldexp.
EXPORT CONST VECTOR_CC vdouble xexp(vdouble d) {
  vdouble u = vrint_vd_vd(vmul_vd_vd_vd(d, vcast_vd_d(R_LN2))), s;
  vint q = vrint_vi_vd(u);

  s = vmla_vd_vd_vd_vd(u, vcast_vd_d(-L2U), d);
  s = vmla_vd_vd_vd_vd(u, vcast_vd_d(-L2L), s);

#ifdef ENABLE_FMA_DP
  // FMA build: slightly different coefficient set tuned for fused ops.
  vdouble s2 = vmul_vd_vd_vd(s, s), s4 = vmul_vd_vd_vd(s2, s2), s8 = vmul_vd_vd_vd(s4, s4);
  u = POLY10(s, s2, s4, s8,
             +0.2081276378237164457e-8,
             +0.2511210703042288022e-7,
             +0.2755762628169491192e-6,
             +0.2755723402025388239e-5,
             +0.2480158687479686264e-4,
             +0.1984126989855865850e-3,
             +0.1388888888914497797e-2,
             +0.8333333333314938210e-2,
             +0.4166666666666602598e-1,
             +0.1666666666666669072e+0);
  u = vfma_vd_vd_vd_vd(u, s, vcast_vd_d(+0.5000000000000000000e+0));
  u = vfma_vd_vd_vd_vd(u, s, vcast_vd_d(+0.1000000000000000000e+1));
  u = vfma_vd_vd_vd_vd(u, s, vcast_vd_d(+0.1000000000000000000e+1));
#else // #ifdef ENABLE_FMA_DP
  vdouble s2 = vmul_vd_vd_vd(s, s), s4 = vmul_vd_vd_vd(s2, s2), s8 = vmul_vd_vd_vd(s4, s4);
  u = POLY10(s, s2, s4, s8,
             2.08860621107283687536341e-09,
             2.51112930892876518610661e-08,
             2.75573911234900471893338e-07,
             2.75572362911928827629423e-06,
             2.4801587159235472998791e-05,
             0.000198412698960509205564975,
             0.00138888888889774492207962,
             0.00833333333331652721664984,
             0.0416666666666665047591422,
             0.166666666666666851703837);
  u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(+0.5000000000000000000e+0));
  u = vadd_vd_vd_vd(vcast_vd_d(1), vmla_vd_vd_vd_vd(vmul_vd_vd_vd(s, s), u, s));
#endif // #ifdef ENABLE_FMA_DP

  u = vldexp2_vd_vd_vi(u, q);

  // Overflow threshold is the largest d with exp(d) finite in double;
  // d < -1000 flushes to zero.
  u = vsel_vd_vo_vd_vd(vgt_vo_vd_vd(d, vcast_vd_d(709.78271114955742909217217426)), vcast_vd_d(SLEEF_INFINITY), u);
  u = vreinterpret_vd_vm(vandnot_vm_vo64_vm(vlt_vo_vd_vd(d, vcast_vd_d(-1000)), vreinterpret_vm_vd(u)));

  return u;
}

// expm1 kernel: like xexp but returns exp(d) - 1 without cancellation for
// small d (the q == 0 lanes keep the un-scaled polynomial result directly).
static INLINE CONST VECTOR_CC vdouble expm1k(vdouble d) {
  vdouble u = vrint_vd_vd(vmul_vd_vd_vd(d, vcast_vd_d(R_LN2))), s;
  vint q = vrint_vi_vd(u);

  s = vmla_vd_vd_vd_vd(u, vcast_vd_d(-L2U), d);
  s = vmla_vd_vd_vd_vd(u, vcast_vd_d(-L2L), s);

  vdouble s2 = vmul_vd_vd_vd(s, s), s4 = vmul_vd_vd_vd(s2, s2), s8 = vmul_vd_vd_vd(s4, s4);
  u = POLY10(s, s2, s4, s8,
             2.08860621107283687536341e-09,
             2.51112930892876518610661e-08,
             2.75573911234900471893338e-07,
             2.75572362911928827629423e-06,
             2.4801587159235472998791e-05,
             0.000198412698960509205564975,
             0.00138888888889774492207962,
             0.00833333333331652721664984,
             0.0416666666666665047591422,
             0.166666666666666851703837);

  u = vadd_vd_vd_vd(vmla_vd_vd_vd_vd(s2, vcast_vd_d(0.5), vmul_vd_vd_vd(vmul_vd_vd_vd(s2, s), u)), s);

  // q != 0: rebuild exp(d) - 1 as 2^q * (u + 1) - 1.
  u = vsel_vd_vo_vd_vd(vcast_vo64_vo32(veq_vo_vi_vi(q, vcast_vi_i(0))), u,
                       vsub_vd_vd_vd(vldexp2_vd_vd_vi(vadd_vd_vd_vd(u, vcast_vd_d(1)), q), vcast_vd_d(1)));

  return u;
}

// Double-double log kernel used by the higher-accuracy log/pow family.
// (Definition continues beyond this chunk.)
static INLINE CONST VECTOR_CC vdouble2 logk(vdouble d) {
  vdouble2 x, x2, s;
  vdouble t, m;

#if !defined(ENABLE_AVX512F) && !defined(ENABLE_AVX512FNOFMA)
  // Subnormal inputs are scaled up by 2^64 first; e is corrected below.
  vopmask o = vlt_vo_vd_vd(d, vcast_vd_d(DBL_MIN));
  d = vsel_vd_vo_vd_vd(o, vmul_vd_vd_vd(d, vcast_vd_d((double)(INT64_C(1) << 32) * (double)(INT64_C(1) << 32))), d);
  vint e = vilogb2k_vi_vd(vmul_vd_vd_vd(d, vcast_vd_d(1.0/0.75)));
  m = vldexp3_vd_vd_vi(d, vneg_vi_vi(e));
  e = vsel_vi_vo_vi_vi(vcast_vo32_vo64(o), vsub_vi_vi_vi(e, vcast_vi_i(64)), e);
#else
  vdouble e = vgetexp_vd_vd(vmul_vd_vd_vd(d, vcast_vd_d(1.0/0.75)));
  e = vsel_vd_vo_vd_vd(vispinf_vo_vd(e), vcast_vd_d(1024.0), e);
  m = vgetmant_vd_vd(d);
#endif

  // x = (m-1)/(m+1) in double-double.
  x = dddiv_vd2_vd2_vd2(ddadd2_vd2_vd_vd(vcast_vd_d(-1), m), ddadd2_vd2_vd_vd(vcast_vd_d(1), m));
  x2 = ddsqu_vd2_vd2(x);

  vdouble x4 = vmul_vd_vd_vd(vd2getx_vd_vd2(x2), vd2getx_vd_vd2(x2)), x8 = vmul_vd_vd_vd(x4, x4), x16 = vmul_vd_vd_vd(x8, x8);
  t = POLY9(vd2getx_vd_vd2(x2), x4, x8, x16,
0.116255524079935043668677, 0.103239680901072952701192, 0.117754809412463995466069, 0.13332981086846273921509, 0.153846227114512262845736, 0.181818180850050775676507, 0.222222222230083560345903, 0.285714285714249172087875, 0.400000000000000077715612); vdouble2 c = vcast_vd2_d_d(0.666666666666666629659233, 3.80554962542412056336616e-17); #if !defined(ENABLE_AVX512F) && !defined(ENABLE_AVX512FNOFMA) s = ddmul_vd2_vd2_vd(vcast_vd2_d_d(0.693147180559945286226764, 2.319046813846299558417771e-17), vcast_vd_vi(e)); #else s = ddmul_vd2_vd2_vd(vcast_vd2_d_d(0.693147180559945286226764, 2.319046813846299558417771e-17), e); #endif s = ddadd_vd2_vd2_vd2(s, ddscale_vd2_vd2_vd(x, vcast_vd_d(2))); x = ddmul_vd2_vd2_vd2(x2, x); s = ddadd_vd2_vd2_vd2(s, ddmul_vd2_vd2_vd2(x, c)); x = ddmul_vd2_vd2_vd2(x2, x); s = ddadd_vd2_vd2_vd2(s, ddmul_vd2_vd2_vd(x, t)); return s; } #if !defined(DETERMINISTIC) EXPORT CONST VECTOR_CC vdouble xlog_u1(vdouble d) { vdouble2 x; vdouble t, m, x2; #if !defined(ENABLE_AVX512F) && !defined(ENABLE_AVX512FNOFMA) vopmask o = vlt_vo_vd_vd(d, vcast_vd_d(DBL_MIN)); d = vsel_vd_vo_vd_vd(o, vmul_vd_vd_vd(d, vcast_vd_d((double)(INT64_C(1) << 32) * (double)(INT64_C(1) << 32))), d); vint e = vilogb2k_vi_vd(vmul_vd_vd_vd(d, vcast_vd_d(1.0/0.75))); m = vldexp3_vd_vd_vi(d, vneg_vi_vi(e)); e = vsel_vi_vo_vi_vi(vcast_vo32_vo64(o), vsub_vi_vi_vi(e, vcast_vi_i(64)), e); #else vdouble e = vgetexp_vd_vd(vmul_vd_vd_vd(d, vcast_vd_d(1.0/0.75))); e = vsel_vd_vo_vd_vd(vispinf_vo_vd(e), vcast_vd_d(1024.0), e); m = vgetmant_vd_vd(d); #endif x = dddiv_vd2_vd2_vd2(ddadd2_vd2_vd_vd(vcast_vd_d(-1), m), ddadd2_vd2_vd_vd(vcast_vd_d(1), m)); x2 = vmul_vd_vd_vd(vd2getx_vd_vd2(x), vd2getx_vd_vd2(x)); vdouble x4 = vmul_vd_vd_vd(x2, x2), x8 = vmul_vd_vd_vd(x4, x4); t = POLY7(x2, x4, x8, 0.1532076988502701353e+0, 0.1525629051003428716e+0, 0.1818605932937785996e+0, 0.2222214519839380009e+0, 0.2857142932794299317e+0, 0.3999999999635251990e+0, 0.6666666666667333541e+0); #if 
!defined(ENABLE_AVX512F) && !defined(ENABLE_AVX512FNOFMA) vdouble2 s = ddmul_vd2_vd2_vd(vcast_vd2_d_d(0.693147180559945286226764, 2.319046813846299558417771e-17), vcast_vd_vi(e)); #else vdouble2 s = ddmul_vd2_vd2_vd(vcast_vd2_d_d(0.693147180559945286226764, 2.319046813846299558417771e-17), e); #endif s = ddadd_vd2_vd2_vd2(s, ddscale_vd2_vd2_vd(x, vcast_vd_d(2))); s = ddadd_vd2_vd2_vd(s, vmul_vd_vd_vd(vmul_vd_vd_vd(x2, vd2getx_vd_vd2(x)), t)); vdouble r = vadd_vd_vd_vd(vd2getx_vd_vd2(s), vd2gety_vd_vd2(s)); #if !defined(ENABLE_AVX512F) && !defined(ENABLE_AVX512FNOFMA) r = vsel_vd_vo_vd_vd(vispinf_vo_vd(d), vcast_vd_d(SLEEF_INFINITY), r); r = vsel_vd_vo_vd_vd(vor_vo_vo_vo(vlt_vo_vd_vd(d, vcast_vd_d(0)), visnan_vo_vd(d)), vcast_vd_d(SLEEF_NAN), r); r = vsel_vd_vo_vd_vd(veq_vo_vd_vd(d, vcast_vd_d(0)), vcast_vd_d(-SLEEF_INFINITY), r); #else r = vfixup_vd_vd_vd_vi2_i(r, d, vcast_vi2_i((4 << (2*4)) | (3 << (4*4)) | (5 << (5*4)) | (2 << (6*4))), 0); #endif return r; } #endif // #if !defined(DETERMINISTIC) static INLINE CONST VECTOR_CC vdouble expk(vdouble2 d) { vdouble u = vmul_vd_vd_vd(vadd_vd_vd_vd(vd2getx_vd_vd2(d), vd2gety_vd_vd2(d)), vcast_vd_d(R_LN2)); vdouble dq = vrint_vd_vd(u); vint q = vrint_vi_vd(dq); vdouble2 s, t; s = ddadd2_vd2_vd2_vd(d, vmul_vd_vd_vd(dq, vcast_vd_d(-L2U))); s = ddadd2_vd2_vd2_vd(s, vmul_vd_vd_vd(dq, vcast_vd_d(-L2L))); s = ddnormalize_vd2_vd2(s); vdouble s2 = vmul_vd_vd_vd(vd2getx_vd_vd2(s), vd2getx_vd_vd2(s)), s4 = vmul_vd_vd_vd(s2, s2), s8 = vmul_vd_vd_vd(s4, s4); u = POLY10(vd2getx_vd_vd2(s), s2, s4, s8, 2.51069683420950419527139e-08, 2.76286166770270649116855e-07, 2.75572496725023574143864e-06, 2.48014973989819794114153e-05, 0.000198412698809069797676111, 0.0013888888939977128960529, 0.00833333333332371417601081, 0.0416666666665409524128449, 0.166666666666666740681535, 0.500000000000000999200722); t = ddadd_vd2_vd_vd2(vcast_vd_d(1), s); t = ddadd_vd2_vd2_vd2(t, ddmul_vd2_vd2_vd(ddsqu_vd2_vd2(s), u)); u = vadd_vd_vd_vd(vd2getx_vd_vd2(t), 
vd2gety_vd_vd2(t)); u = vldexp2_vd_vd_vi(u, q); u = vreinterpret_vd_vm(vandnot_vm_vo64_vm(vlt_vo_vd_vd(vd2getx_vd_vd2(d), vcast_vd_d(-1000)), vreinterpret_vm_vd(u))); return u; } #if !defined(DETERMINISTIC) EXPORT CONST VECTOR_CC vdouble xpow(vdouble x, vdouble y) { #if 1 vopmask yisint = visint_vo_vd(y); vopmask yisodd = vand_vo_vo_vo(visodd_vo_vd(y), yisint); vdouble2 d = ddmul_vd2_vd2_vd(logk(vabs_vd_vd(x)), y); vdouble result = expk(d); result = vsel_vd_vo_vd_vd(vgt_vo_vd_vd(vd2getx_vd_vd2(d), vcast_vd_d(709.78271114955742909217217426)), vcast_vd_d(SLEEF_INFINITY), result); result = vmul_vd_vd_vd(result, vsel_vd_vo_vd_vd(vgt_vo_vd_vd(x, vcast_vd_d(0)), vcast_vd_d(1), vsel_vd_vo_vd_vd(yisint, vsel_vd_vo_vd_vd(yisodd, vcast_vd_d(-1.0), vcast_vd_d(1)), vcast_vd_d(SLEEF_NAN)))); vdouble efx = vmulsign_vd_vd_vd(vsub_vd_vd_vd(vabs_vd_vd(x), vcast_vd_d(1)), y); result = vsel_vd_vo_vd_vd(visinf_vo_vd(y), vreinterpret_vd_vm(vandnot_vm_vo64_vm(vlt_vo_vd_vd(efx, vcast_vd_d(0.0)), vreinterpret_vm_vd(vsel_vd_vo_vd_vd(veq_vo_vd_vd(efx, vcast_vd_d(0.0)), vcast_vd_d(1.0), vcast_vd_d(SLEEF_INFINITY))))), result); result = vsel_vd_vo_vd_vd(vor_vo_vo_vo(visinf_vo_vd(x), veq_vo_vd_vd(x, vcast_vd_d(0.0))), vmul_vd_vd_vd(vsel_vd_vo_vd_vd(yisodd, vsign_vd_vd(x), vcast_vd_d(1.0)), vreinterpret_vd_vm(vandnot_vm_vo64_vm(vlt_vo_vd_vd(vsel_vd_vo_vd_vd(veq_vo_vd_vd(x, vcast_vd_d(0.0)), vneg_vd_vd(y), y), vcast_vd_d(0.0)), vreinterpret_vm_vd(vcast_vd_d(SLEEF_INFINITY))))), result); result = vreinterpret_vd_vm(vor_vm_vo64_vm(vor_vo_vo_vo(visnan_vo_vd(x), visnan_vo_vd(y)), vreinterpret_vm_vd(result))); result = vsel_vd_vo_vd_vd(vor_vo_vo_vo(veq_vo_vd_vd(y, vcast_vd_d(0)), veq_vo_vd_vd(x, vcast_vd_d(1))), vcast_vd_d(1), result); return result; #else return expk(ddmul_vd2_vd2_vd(logk(x), y)); #endif } #endif // #if !defined(DETERMINISTIC) static INLINE CONST VECTOR_CC vdouble2 expk2(vdouble2 d) { vdouble u = vmul_vd_vd_vd(vadd_vd_vd_vd(vd2getx_vd_vd2(d), vd2gety_vd_vd2(d)), vcast_vd_d(R_LN2)); 
vdouble dq = vrint_vd_vd(u); vint q = vrint_vi_vd(dq); vdouble2 s, t; s = ddadd2_vd2_vd2_vd(d, vmul_vd_vd_vd(dq, vcast_vd_d(-L2U))); s = ddadd2_vd2_vd2_vd(s, vmul_vd_vd_vd(dq, vcast_vd_d(-L2L))); vdouble2 s2 = ddsqu_vd2_vd2(s), s4 = ddsqu_vd2_vd2(s2); vdouble s8 = vmul_vd_vd_vd(vd2getx_vd_vd2(s4), vd2getx_vd_vd2(s4)); u = POLY10(vd2getx_vd_vd2(s), vd2getx_vd_vd2(s2), vd2getx_vd_vd2(s4), s8, +0.1602472219709932072e-9, +0.2092255183563157007e-8, +0.2505230023782644465e-7, +0.2755724800902135303e-6, +0.2755731892386044373e-5, +0.2480158735605815065e-4, +0.1984126984148071858e-3, +0.1388888888886763255e-2, +0.8333333333333347095e-2, +0.4166666666666669905e-1); t = ddadd_vd2_vd_vd2(vcast_vd_d(0.5), ddmul_vd2_vd2_vd(s, vcast_vd_d(+0.1666666666666666574e+0))); t = ddadd_vd2_vd_vd2(vcast_vd_d(1.0), ddmul_vd2_vd2_vd2(t, s)); t = ddadd_vd2_vd_vd2(vcast_vd_d(1.0), ddmul_vd2_vd2_vd2(t, s)); t = ddadd_vd2_vd2_vd2(t, ddmul_vd2_vd2_vd(s4, u)); t = vd2setx_vd2_vd2_vd(t, vldexp2_vd_vd_vi(vd2getx_vd_vd2(t), q)); t = vd2sety_vd2_vd2_vd(t, vldexp2_vd_vd_vi(vd2gety_vd_vd2(t), q)); t = vd2setx_vd2_vd2_vd(t, vreinterpret_vd_vm(vandnot_vm_vo64_vm(vlt_vo_vd_vd(vd2getx_vd_vd2(d), vcast_vd_d(-1000)), vreinterpret_vm_vd(vd2getx_vd_vd2(t))))); t = vd2sety_vd2_vd2_vd(t, vreinterpret_vd_vm(vandnot_vm_vo64_vm(vlt_vo_vd_vd(vd2getx_vd_vd2(d), vcast_vd_d(-1000)), vreinterpret_vm_vd(vd2gety_vd_vd2(t))))); return t; } #if !defined(DETERMINISTIC) EXPORT CONST VECTOR_CC vdouble xsinh(vdouble x) { vdouble y = vabs_vd_vd(x); vdouble2 d = expk2(vcast_vd2_vd_vd(y, vcast_vd_d(0))); d = ddsub_vd2_vd2_vd2(d, ddrec_vd2_vd2(d)); y = vmul_vd_vd_vd(vadd_vd_vd_vd(vd2getx_vd_vd2(d), vd2gety_vd_vd2(d)), vcast_vd_d(0.5)); y = vsel_vd_vo_vd_vd(vor_vo_vo_vo(vgt_vo_vd_vd(vabs_vd_vd(x), vcast_vd_d(710)), visnan_vo_vd(y)), vcast_vd_d(SLEEF_INFINITY), y); y = vmulsign_vd_vd_vd(y, x); y = vreinterpret_vd_vm(vor_vm_vo64_vm(visnan_vo_vd(x), vreinterpret_vm_vd(y))); return y; } EXPORT CONST VECTOR_CC vdouble xcosh(vdouble x) { 
vdouble y = vabs_vd_vd(x); vdouble2 d = expk2(vcast_vd2_vd_vd(y, vcast_vd_d(0))); d = ddadd_vd2_vd2_vd2(d, ddrec_vd2_vd2(d)); y = vmul_vd_vd_vd(vadd_vd_vd_vd(vd2getx_vd_vd2(d), vd2gety_vd_vd2(d)), vcast_vd_d(0.5)); y = vsel_vd_vo_vd_vd(vor_vo_vo_vo(vgt_vo_vd_vd(vabs_vd_vd(x), vcast_vd_d(710)), visnan_vo_vd(y)), vcast_vd_d(SLEEF_INFINITY), y); y = vreinterpret_vd_vm(vor_vm_vo64_vm(visnan_vo_vd(x), vreinterpret_vm_vd(y))); return y; } EXPORT CONST VECTOR_CC vdouble xtanh(vdouble x) { vdouble y = vabs_vd_vd(x); vdouble2 d = expk2(vcast_vd2_vd_vd(y, vcast_vd_d(0))); vdouble2 e = ddrec_vd2_vd2(d); d = dddiv_vd2_vd2_vd2(ddadd2_vd2_vd2_vd2(d, ddneg_vd2_vd2(e)), ddadd2_vd2_vd2_vd2(d, e)); y = vadd_vd_vd_vd(vd2getx_vd_vd2(d), vd2gety_vd_vd2(d)); y = vsel_vd_vo_vd_vd(vor_vo_vo_vo(vgt_vo_vd_vd(vabs_vd_vd(x), vcast_vd_d(18.714973875)), visnan_vo_vd(y)), vcast_vd_d(1.0), y); y = vmulsign_vd_vd_vd(y, x); y = vreinterpret_vd_vm(vor_vm_vo64_vm(visnan_vo_vd(x), vreinterpret_vm_vd(y))); return y; } EXPORT CONST VECTOR_CC vdouble xsinh_u35(vdouble x) { vdouble e = expm1k(vabs_vd_vd(x)); vdouble y = vdiv_vd_vd_vd(vadd_vd_vd_vd(e, vcast_vd_d(2)), vadd_vd_vd_vd(e, vcast_vd_d(1))); y = vmul_vd_vd_vd(y, vmul_vd_vd_vd(vcast_vd_d(0.5), e)); y = vsel_vd_vo_vd_vd(vor_vo_vo_vo(vgt_vo_vd_vd(vabs_vd_vd(x), vcast_vd_d(709)), visnan_vo_vd(y)), vcast_vd_d(SLEEF_INFINITY), y); y = vmulsign_vd_vd_vd(y, x); y = vreinterpret_vd_vm(vor_vm_vo64_vm(visnan_vo_vd(x), vreinterpret_vm_vd(y))); return y; } EXPORT CONST VECTOR_CC vdouble xcosh_u35(vdouble x) { vdouble e = xexp(vabs_vd_vd(x)); vdouble y = vmla_vd_vd_vd_vd(vcast_vd_d(0.5), e, vdiv_vd_vd_vd(vcast_vd_d(0.5), e)); y = vsel_vd_vo_vd_vd(vor_vo_vo_vo(vgt_vo_vd_vd(vabs_vd_vd(x), vcast_vd_d(709)), visnan_vo_vd(y)), vcast_vd_d(SLEEF_INFINITY), y); y = vreinterpret_vd_vm(vor_vm_vo64_vm(visnan_vo_vd(x), vreinterpret_vm_vd(y))); return y; } EXPORT CONST VECTOR_CC vdouble xtanh_u35(vdouble x) { vdouble d = expm1k(vmul_vd_vd_vd(vcast_vd_d(2), vabs_vd_vd(x))); 
vdouble y = vdiv_vd_vd_vd(d, vadd_vd_vd_vd(vcast_vd_d(2), d)); y = vsel_vd_vo_vd_vd(vor_vo_vo_vo(vgt_vo_vd_vd(vabs_vd_vd(x), vcast_vd_d(18.714973875)), visnan_vo_vd(y)), vcast_vd_d(1.0), y); y = vmulsign_vd_vd_vd(y, x); y = vreinterpret_vd_vm(vor_vm_vo64_vm(visnan_vo_vd(x), vreinterpret_vm_vd(y))); return y; } static INLINE CONST VECTOR_CC vdouble2 logk2(vdouble2 d) { vdouble2 x, x2, m, s; vdouble t; vint e; e = vilogbk_vi_vd(vmul_vd_vd_vd(vd2getx_vd_vd2(d), vcast_vd_d(1.0/0.75))); m = vd2setxy_vd2_vd_vd(vldexp2_vd_vd_vi(vd2getx_vd_vd2(d), vneg_vi_vi(e)), vldexp2_vd_vd_vi(vd2gety_vd_vd2(d), vneg_vi_vi(e))); x = dddiv_vd2_vd2_vd2(ddadd2_vd2_vd2_vd(m, vcast_vd_d(-1)), ddadd2_vd2_vd2_vd(m, vcast_vd_d(1))); x2 = ddsqu_vd2_vd2(x); vdouble x4 = vmul_vd_vd_vd(vd2getx_vd_vd2(x2), vd2getx_vd_vd2(x2)), x8 = vmul_vd_vd_vd(x4, x4); t = POLY7(vd2getx_vd_vd2(x2), x4, x8, 0.13860436390467167910856, 0.131699838841615374240845, 0.153914168346271945653214, 0.181816523941564611721589, 0.22222224632662035403996, 0.285714285511134091777308, 0.400000000000914013309483); t = vmla_vd_vd_vd_vd(t, vd2getx_vd_vd2(x2), vcast_vd_d(0.666666666666664853302393)); s = ddmul_vd2_vd2_vd(vcast_vd2_d_d(0.693147180559945286226764, 2.319046813846299558417771e-17), vcast_vd_vi(e)); s = ddadd_vd2_vd2_vd2(s, ddscale_vd2_vd2_vd(x, vcast_vd_d(2))); s = ddadd_vd2_vd2_vd2(s, ddmul_vd2_vd2_vd(ddmul_vd2_vd2_vd2(x2, x), t)); return s; } EXPORT CONST VECTOR_CC vdouble xasinh(vdouble x) { vdouble y = vabs_vd_vd(x); vopmask o = vgt_vo_vd_vd(y, vcast_vd_d(1)); vdouble2 d; d = vsel_vd2_vo_vd2_vd2(o, ddrec_vd2_vd(x), vcast_vd2_vd_vd(y, vcast_vd_d(0))); d = ddsqrt_vd2_vd2(ddadd2_vd2_vd2_vd(ddsqu_vd2_vd2(d), vcast_vd_d(1))); d = vsel_vd2_vo_vd2_vd2(o, ddmul_vd2_vd2_vd(d, y), d); d = logk2(ddnormalize_vd2_vd2(ddadd2_vd2_vd2_vd(d, x))); y = vadd_vd_vd_vd(vd2getx_vd_vd2(d), vd2gety_vd_vd2(d)); y = vsel_vd_vo_vd_vd(vor_vo_vo_vo(vgt_vo_vd_vd(vabs_vd_vd(x), vcast_vd_d(SQRT_DBL_MAX)), visnan_vo_vd(y)), 
vmulsign_vd_vd_vd(vcast_vd_d(SLEEF_INFINITY), x), y); y = vreinterpret_vd_vm(vor_vm_vo64_vm(visnan_vo_vd(x), vreinterpret_vm_vd(y))); y = vsel_vd_vo_vd_vd(visnegzero_vo_vd(x), vcast_vd_d(-0.0), y); return y; } EXPORT CONST VECTOR_CC vdouble xacosh(vdouble x) { vdouble2 d = logk2(ddadd2_vd2_vd2_vd(ddmul_vd2_vd2_vd2(ddsqrt_vd2_vd2(ddadd2_vd2_vd_vd(x, vcast_vd_d(1))), ddsqrt_vd2_vd2(ddadd2_vd2_vd_vd(x, vcast_vd_d(-1)))), x)); vdouble y = vadd_vd_vd_vd(vd2getx_vd_vd2(d), vd2gety_vd_vd2(d)); y = vsel_vd_vo_vd_vd(vor_vo_vo_vo(vgt_vo_vd_vd(vabs_vd_vd(x), vcast_vd_d(SQRT_DBL_MAX)), visnan_vo_vd(y)), vcast_vd_d(SLEEF_INFINITY), y); y = vreinterpret_vd_vm(vandnot_vm_vo64_vm(veq_vo_vd_vd(x, vcast_vd_d(1.0)), vreinterpret_vm_vd(y))); y = vreinterpret_vd_vm(vor_vm_vo64_vm(vlt_vo_vd_vd(x, vcast_vd_d(1.0)), vreinterpret_vm_vd(y))); y = vreinterpret_vd_vm(vor_vm_vo64_vm(visnan_vo_vd(x), vreinterpret_vm_vd(y))); return y; } EXPORT CONST VECTOR_CC vdouble xatanh(vdouble x) { vdouble y = vabs_vd_vd(x); vdouble2 d = logk2(dddiv_vd2_vd2_vd2(ddadd2_vd2_vd_vd(vcast_vd_d(1), y), ddadd2_vd2_vd_vd(vcast_vd_d(1), vneg_vd_vd(y)))); y = vreinterpret_vd_vm(vor_vm_vo64_vm(vgt_vo_vd_vd(y, vcast_vd_d(1.0)), vreinterpret_vm_vd(vsel_vd_vo_vd_vd(veq_vo_vd_vd(y, vcast_vd_d(1.0)), vcast_vd_d(SLEEF_INFINITY), vmul_vd_vd_vd(vadd_vd_vd_vd(vd2getx_vd_vd2(d), vd2gety_vd_vd2(d)), vcast_vd_d(0.5)))))); y = vmulsign_vd_vd_vd(y, x); y = vreinterpret_vd_vm(vor_vm_vo64_vm(vor_vo_vo_vo(visinf_vo_vd(x), visnan_vo_vd(y)), vreinterpret_vm_vd(y))); y = vreinterpret_vd_vm(vor_vm_vo64_vm(visnan_vo_vd(x), vreinterpret_vm_vd(y))); return y; } EXPORT CONST VECTOR_CC vdouble xcbrt(vdouble d) { vdouble x, y, q = vcast_vd_d(1.0); vint e, qu, re; vdouble t; #if defined(ENABLE_AVX512F) || defined(ENABLE_AVX512FNOFMA) vdouble s = d; #endif e = vadd_vi_vi_vi(vilogbk_vi_vd(vabs_vd_vd(d)), vcast_vi_i(1)); d = vldexp2_vd_vd_vi(d, vneg_vi_vi(e)); t = vadd_vd_vd_vd(vcast_vd_vi(e), vcast_vd_d(6144)); qu = 
vtruncate_vi_vd(vmul_vd_vd_vd(t, vcast_vd_d(1.0/3.0))); re = vtruncate_vi_vd(vsub_vd_vd_vd(t, vmul_vd_vd_vd(vcast_vd_vi(qu), vcast_vd_d(3)))); q = vsel_vd_vo_vd_vd(vcast_vo64_vo32(veq_vo_vi_vi(re, vcast_vi_i(1))), vcast_vd_d(1.2599210498948731647672106), q); q = vsel_vd_vo_vd_vd(vcast_vo64_vo32(veq_vo_vi_vi(re, vcast_vi_i(2))), vcast_vd_d(1.5874010519681994747517056), q); q = vldexp2_vd_vd_vi(q, vsub_vi_vi_vi(qu, vcast_vi_i(2048))); q = vmulsign_vd_vd_vd(q, d); d = vabs_vd_vd(d); x = vcast_vd_d(-0.640245898480692909870982); x = vmla_vd_vd_vd_vd(x, d, vcast_vd_d(2.96155103020039511818595)); x = vmla_vd_vd_vd_vd(x, d, vcast_vd_d(-5.73353060922947843636166)); x = vmla_vd_vd_vd_vd(x, d, vcast_vd_d(6.03990368989458747961407)); x = vmla_vd_vd_vd_vd(x, d, vcast_vd_d(-3.85841935510444988821632)); x = vmla_vd_vd_vd_vd(x, d, vcast_vd_d(2.2307275302496609725722)); y = vmul_vd_vd_vd(x, x); y = vmul_vd_vd_vd(y, y); x = vsub_vd_vd_vd(x, vmul_vd_vd_vd(vmlapn_vd_vd_vd_vd(d, y, x), vcast_vd_d(1.0 / 3.0))); y = vmul_vd_vd_vd(vmul_vd_vd_vd(d, x), x); y = vmul_vd_vd_vd(vsub_vd_vd_vd(y, vmul_vd_vd_vd(vmul_vd_vd_vd(vcast_vd_d(2.0 / 3.0), y), vmla_vd_vd_vd_vd(y, x, vcast_vd_d(-1.0)))), q); #if defined(ENABLE_AVX512F) || defined(ENABLE_AVX512FNOFMA) y = vsel_vd_vo_vd_vd(visinf_vo_vd(s), vmulsign_vd_vd_vd(vcast_vd_d(SLEEF_INFINITY), s), y); y = vsel_vd_vo_vd_vd(veq_vo_vd_vd(s, vcast_vd_d(0)), vmulsign_vd_vd_vd(vcast_vd_d(0), s), y); #endif return y; } EXPORT CONST VECTOR_CC vdouble xcbrt_u1(vdouble d) { vdouble x, y, z, t; vdouble2 q2 = vcast_vd2_d_d(1, 0), u, v; vint e, qu, re; #if defined(ENABLE_AVX512F) || defined(ENABLE_AVX512FNOFMA) vdouble s = d; #endif e = vadd_vi_vi_vi(vilogbk_vi_vd(vabs_vd_vd(d)), vcast_vi_i(1)); d = vldexp2_vd_vd_vi(d, vneg_vi_vi(e)); t = vadd_vd_vd_vd(vcast_vd_vi(e), vcast_vd_d(6144)); qu = vtruncate_vi_vd(vmul_vd_vd_vd(t, vcast_vd_d(1.0/3.0))); re = vtruncate_vi_vd(vsub_vd_vd_vd(t, vmul_vd_vd_vd(vcast_vd_vi(qu), vcast_vd_d(3)))); q2 = 
vsel_vd2_vo_vd2_vd2(vcast_vo64_vo32(veq_vo_vi_vi(re, vcast_vi_i(1))), vcast_vd2_d_d(1.2599210498948731907, -2.5899333753005069177e-17), q2); q2 = vsel_vd2_vo_vd2_vd2(vcast_vo64_vo32(veq_vo_vi_vi(re, vcast_vi_i(2))), vcast_vd2_d_d(1.5874010519681995834, -1.0869008194197822986e-16), q2); q2 = vd2setxy_vd2_vd_vd(vmulsign_vd_vd_vd(vd2getx_vd_vd2(q2), d), vmulsign_vd_vd_vd(vd2gety_vd_vd2(q2), d)); d = vabs_vd_vd(d); x = vcast_vd_d(-0.640245898480692909870982); x = vmla_vd_vd_vd_vd(x, d, vcast_vd_d(2.96155103020039511818595)); x = vmla_vd_vd_vd_vd(x, d, vcast_vd_d(-5.73353060922947843636166)); x = vmla_vd_vd_vd_vd(x, d, vcast_vd_d(6.03990368989458747961407)); x = vmla_vd_vd_vd_vd(x, d, vcast_vd_d(-3.85841935510444988821632)); x = vmla_vd_vd_vd_vd(x, d, vcast_vd_d(2.2307275302496609725722)); y = vmul_vd_vd_vd(x, x); y = vmul_vd_vd_vd(y, y); x = vsub_vd_vd_vd(x, vmul_vd_vd_vd(vmlapn_vd_vd_vd_vd(d, y, x), vcast_vd_d(1.0 / 3.0))); z = x; u = ddmul_vd2_vd_vd(x, x); u = ddmul_vd2_vd2_vd2(u, u); u = ddmul_vd2_vd2_vd(u, d); u = ddadd2_vd2_vd2_vd(u, vneg_vd_vd(x)); y = vadd_vd_vd_vd(vd2getx_vd_vd2(u), vd2gety_vd_vd2(u)); y = vmul_vd_vd_vd(vmul_vd_vd_vd(vcast_vd_d(-2.0 / 3.0), y), z); v = ddadd2_vd2_vd2_vd(ddmul_vd2_vd_vd(z, z), y); v = ddmul_vd2_vd2_vd(v, d); v = ddmul_vd2_vd2_vd2(v, q2); z = vldexp2_vd_vd_vi(vadd_vd_vd_vd(vd2getx_vd_vd2(v), vd2gety_vd_vd2(v)), vsub_vi_vi_vi(qu, vcast_vi_i(2048))); #if !defined(ENABLE_AVX512F) && !defined(ENABLE_AVX512FNOFMA) z = vsel_vd_vo_vd_vd(visinf_vo_vd(d), vmulsign_vd_vd_vd(vcast_vd_d(SLEEF_INFINITY), vd2getx_vd_vd2(q2)), z); z = vsel_vd_vo_vd_vd(veq_vo_vd_vd(d, vcast_vd_d(0)), vreinterpret_vd_vm(vsignbit_vm_vd(vd2getx_vd_vd2(q2))), z); #else z = vsel_vd_vo_vd_vd(visinf_vo_vd(s), vmulsign_vd_vd_vd(vcast_vd_d(SLEEF_INFINITY), s), z); z = vsel_vd_vo_vd_vd(veq_vo_vd_vd(s, vcast_vd_d(0)), vmulsign_vd_vd_vd(vcast_vd_d(0), s), z); #endif return z; } #endif // #if !defined(DETERMINISTIC) EXPORT CONST VECTOR_CC vdouble xexp2(vdouble d) { vdouble u 
= vrint_vd_vd(d), s; vint q = vrint_vi_vd(u); s = vsub_vd_vd_vd(d, u); vdouble s2 = vmul_vd_vd_vd(s, s), s4 = vmul_vd_vd_vd(s2, s2), s8 = vmul_vd_vd_vd(s4, s4); u = POLY10(s, s2, s4, s8, +0.4434359082926529454e-9, +0.7073164598085707425e-8, +0.1017819260921760451e-6, +0.1321543872511327615e-5, +0.1525273353517584730e-4, +0.1540353045101147808e-3, +0.1333355814670499073e-2, +0.9618129107597600536e-2, +0.5550410866482046596e-1, +0.2402265069591012214e+0); u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(+0.6931471805599452862e+0)); #ifdef ENABLE_FMA_DP u = vfma_vd_vd_vd_vd(u, s, vcast_vd_d(1)); #else u = vd2getx_vd_vd2(ddnormalize_vd2_vd2(ddadd_vd2_vd_vd2(vcast_vd_d(1), ddmul_vd2_vd_vd(u, s)))); #endif u = vldexp2_vd_vd_vi(u, q); u = vsel_vd_vo_vd_vd(vge_vo_vd_vd(d, vcast_vd_d(1024)), vcast_vd_d(SLEEF_INFINITY), u); u = vreinterpret_vd_vm(vandnot_vm_vo64_vm(vlt_vo_vd_vd(d, vcast_vd_d(-2000)), vreinterpret_vm_vd(u))); return u; } EXPORT CONST VECTOR_CC vdouble xexp2_u35(vdouble d) { vdouble u = vrint_vd_vd(d), s; vint q = vrint_vi_vd(u); s = vsub_vd_vd_vd(d, u); vdouble s2 = vmul_vd_vd_vd(s, s), s4 = vmul_vd_vd_vd(s2, s2), s8 = vmul_vd_vd_vd(s4, s4); u = POLY10(s, s2, s4, s8, +0.4434359082926529454e-9, +0.7073164598085707425e-8, +0.1017819260921760451e-6, +0.1321543872511327615e-5, +0.1525273353517584730e-4, +0.1540353045101147808e-3, +0.1333355814670499073e-2, +0.9618129107597600536e-2, +0.5550410866482046596e-1, +0.2402265069591012214e+0); u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(+0.6931471805599452862e+0)); u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(1)); u = vldexp2_vd_vd_vi(u, q); u = vsel_vd_vo_vd_vd(vge_vo_vd_vd(d, vcast_vd_d(1024)), vcast_vd_d(SLEEF_INFINITY), u); u = vreinterpret_vd_vm(vandnot_vm_vo64_vm(vlt_vo_vd_vd(d, vcast_vd_d(-2000)), vreinterpret_vm_vd(u))); return u; } EXPORT CONST VECTOR_CC vdouble xexp10(vdouble d) { vdouble u = vrint_vd_vd(vmul_vd_vd_vd(d, vcast_vd_d(LOG10_2))), s; vint q = vrint_vi_vd(u); s = vmla_vd_vd_vd_vd(u, vcast_vd_d(-L10U), d); s = 
vmla_vd_vd_vd_vd(u, vcast_vd_d(-L10L), s); u = vcast_vd_d(+0.2411463498334267652e-3); u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(+0.1157488415217187375e-2)); u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(+0.5013975546789733659e-2)); u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(+0.1959762320720533080e-1)); u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(+0.6808936399446784138e-1)); u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(+0.2069958494722676234e+0)); u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(+0.5393829292058536229e+0)); u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(+0.1171255148908541655e+1)); u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(+0.2034678592293432953e+1)); u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(+0.2650949055239205876e+1)); u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(+0.2302585092994045901e+1)); #ifdef ENABLE_FMA_DP u = vfma_vd_vd_vd_vd(u, s, vcast_vd_d(1)); #else u = vd2getx_vd_vd2(ddnormalize_vd2_vd2(ddadd_vd2_vd_vd2(vcast_vd_d(1), ddmul_vd2_vd_vd(u, s)))); #endif u = vldexp2_vd_vd_vi(u, q); u = vsel_vd_vo_vd_vd(vgt_vo_vd_vd(d, vcast_vd_d(308.25471555991671)), vcast_vd_d(SLEEF_INFINITY), u); u = vreinterpret_vd_vm(vandnot_vm_vo64_vm(vlt_vo_vd_vd(d, vcast_vd_d(-350)), vreinterpret_vm_vd(u))); return u; } EXPORT CONST VECTOR_CC vdouble xexp10_u35(vdouble d) { vdouble u = vrint_vd_vd(vmul_vd_vd_vd(d, vcast_vd_d(LOG10_2))), s; vint q = vrint_vi_vd(u); s = vmla_vd_vd_vd_vd(u, vcast_vd_d(-L10U), d); s = vmla_vd_vd_vd_vd(u, vcast_vd_d(-L10L), s); vdouble s2 = vmul_vd_vd_vd(s, s), s4 = vmul_vd_vd_vd(s2, s2), s8 = vmul_vd_vd_vd(s4, s4); u = POLY11(s, s2, s4, s8, +0.2411463498334267652e-3, +0.1157488415217187375e-2, +0.5013975546789733659e-2, +0.1959762320720533080e-1, +0.6808936399446784138e-1, +0.2069958494722676234e+0, +0.5393829292058536229e+0, +0.1171255148908541655e+1, +0.2034678592293432953e+1, +0.2650949055239205876e+1, +0.2302585092994045901e+1); u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(1)); u = vldexp2_vd_vd_vi(u, q); u = vsel_vd_vo_vd_vd(vgt_vo_vd_vd(d, vcast_vd_d(308.25471555991671)), vcast_vd_d(SLEEF_INFINITY), 
u); u = vreinterpret_vd_vm(vandnot_vm_vo64_vm(vlt_vo_vd_vd(d, vcast_vd_d(-350)), vreinterpret_vm_vd(u))); return u; } #if !defined(DETERMINISTIC) EXPORT CONST VECTOR_CC vdouble xexpm1(vdouble a) { vdouble2 d = ddadd2_vd2_vd2_vd(expk2(vcast_vd2_vd_vd(a, vcast_vd_d(0))), vcast_vd_d(-1.0)); vdouble x = vadd_vd_vd_vd(vd2getx_vd_vd2(d), vd2gety_vd_vd2(d)); x = vsel_vd_vo_vd_vd(vgt_vo_vd_vd(a, vcast_vd_d(709.782712893383996732223)), vcast_vd_d(SLEEF_INFINITY), x); x = vsel_vd_vo_vd_vd(vlt_vo_vd_vd(a, vcast_vd_d(-36.736800569677101399113302437)), vcast_vd_d(-1), x); x = vsel_vd_vo_vd_vd(visnegzero_vo_vd(a), vcast_vd_d(-0.0), x); return x; } EXPORT CONST VECTOR_CC vdouble xlog10(vdouble d) { vdouble2 x; vdouble t, m, x2; #if !defined(ENABLE_AVX512F) && !defined(ENABLE_AVX512FNOFMA) vopmask o = vlt_vo_vd_vd(d, vcast_vd_d(DBL_MIN)); d = vsel_vd_vo_vd_vd(o, vmul_vd_vd_vd(d, vcast_vd_d((double)(INT64_C(1) << 32) * (double)(INT64_C(1) << 32))), d); vint e = vilogb2k_vi_vd(vmul_vd_vd_vd(d, vcast_vd_d(1.0/0.75))); m = vldexp3_vd_vd_vi(d, vneg_vi_vi(e)); e = vsel_vi_vo_vi_vi(vcast_vo32_vo64(o), vsub_vi_vi_vi(e, vcast_vi_i(64)), e); #else vdouble e = vgetexp_vd_vd(vmul_vd_vd_vd(d, vcast_vd_d(1.0/0.75))); e = vsel_vd_vo_vd_vd(vispinf_vo_vd(e), vcast_vd_d(1024.0), e); m = vgetmant_vd_vd(d); #endif x = dddiv_vd2_vd2_vd2(ddadd2_vd2_vd_vd(vcast_vd_d(-1), m), ddadd2_vd2_vd_vd(vcast_vd_d(1), m)); x2 = vmul_vd_vd_vd(vd2getx_vd_vd2(x), vd2getx_vd_vd2(x)); vdouble x4 = vmul_vd_vd_vd(x2, x2), x8 = vmul_vd_vd_vd(x4, x4); t = POLY7(x2, x4, x8, +0.6653725819576758460e-1, +0.6625722782820833712e-1, +0.7898105214313944078e-1, +0.9650955035715275132e-1, +0.1240841409721444993e+0, +0.1737177927454605086e+0, +0.2895296546021972617e+0); #if !defined(ENABLE_AVX512F) && !defined(ENABLE_AVX512FNOFMA) vdouble2 s = ddmul_vd2_vd2_vd(vcast_vd2_d_d(0.30102999566398119802, -2.803728127785170339e-18), vcast_vd_vi(e)); #else vdouble2 s = ddmul_vd2_vd2_vd(vcast_vd2_d_d(0.30102999566398119802, 
-2.803728127785170339e-18), e); #endif s = ddadd_vd2_vd2_vd2(s, ddmul_vd2_vd2_vd2(x, vcast_vd2_d_d(0.86858896380650363334, 1.1430059694096389311e-17))); s = ddadd_vd2_vd2_vd(s, vmul_vd_vd_vd(vmul_vd_vd_vd(x2, vd2getx_vd_vd2(x)), t)); vdouble r = vadd_vd_vd_vd(vd2getx_vd_vd2(s), vd2gety_vd_vd2(s)); #if !defined(ENABLE_AVX512F) && !defined(ENABLE_AVX512FNOFMA) r = vsel_vd_vo_vd_vd(vispinf_vo_vd(d), vcast_vd_d(SLEEF_INFINITY), r); r = vsel_vd_vo_vd_vd(vor_vo_vo_vo(vlt_vo_vd_vd(d, vcast_vd_d(0)), visnan_vo_vd(d)), vcast_vd_d(SLEEF_NAN), r); r = vsel_vd_vo_vd_vd(veq_vo_vd_vd(d, vcast_vd_d(0)), vcast_vd_d(-SLEEF_INFINITY), r); #else r = vfixup_vd_vd_vd_vi2_i(r, d, vcast_vi2_i((4 << (2*4)) | (3 << (4*4)) | (5 << (5*4)) | (2 << (6*4))), 0); #endif return r; } EXPORT CONST VECTOR_CC vdouble xlog2(vdouble d) { vdouble2 x; vdouble t, m, x2; #if !defined(ENABLE_AVX512F) && !defined(ENABLE_AVX512FNOFMA) vopmask o = vlt_vo_vd_vd(d, vcast_vd_d(DBL_MIN)); d = vsel_vd_vo_vd_vd(o, vmul_vd_vd_vd(d, vcast_vd_d((double)(INT64_C(1) << 32) * (double)(INT64_C(1) << 32))), d); vint e = vilogb2k_vi_vd(vmul_vd_vd_vd(d, vcast_vd_d(1.0/0.75))); m = vldexp3_vd_vd_vi(d, vneg_vi_vi(e)); e = vsel_vi_vo_vi_vi(vcast_vo32_vo64(o), vsub_vi_vi_vi(e, vcast_vi_i(64)), e); #else vdouble e = vgetexp_vd_vd(vmul_vd_vd_vd(d, vcast_vd_d(1.0/0.75))); e = vsel_vd_vo_vd_vd(vispinf_vo_vd(e), vcast_vd_d(1024.0), e); m = vgetmant_vd_vd(d); #endif x = dddiv_vd2_vd2_vd2(ddadd2_vd2_vd_vd(vcast_vd_d(-1), m), ddadd2_vd2_vd_vd(vcast_vd_d(1), m)); x2 = vmul_vd_vd_vd(vd2getx_vd_vd2(x), vd2getx_vd_vd2(x)); vdouble x4 = vmul_vd_vd_vd(x2, x2), x8 = vmul_vd_vd_vd(x4, x4); t = POLY7(x2, x4, x8, +0.2211941750456081490e+0, +0.2200768693152277689e+0, +0.2623708057488514656e+0, +0.3205977477944495502e+0, +0.4121985945485324709e+0, +0.5770780162997058982e+0, +0.96179669392608091449); #if !defined(ENABLE_AVX512F) && !defined(ENABLE_AVX512FNOFMA) vdouble2 s = ddadd2_vd2_vd_vd2(vcast_vd_vi(e), ddmul_vd2_vd2_vd2(x, 
vcast_vd2_d_d(2.885390081777926774, 6.0561604995516736434e-18))); #else vdouble2 s = ddadd2_vd2_vd_vd2(e, ddmul_vd2_vd2_vd2(x, vcast_vd2_d_d(2.885390081777926774, 6.0561604995516736434e-18))); #endif s = ddadd2_vd2_vd2_vd(s, vmul_vd_vd_vd(vmul_vd_vd_vd(x2, vd2getx_vd_vd2(x)), t)); vdouble r = vadd_vd_vd_vd(vd2getx_vd_vd2(s), vd2gety_vd_vd2(s)); #if !defined(ENABLE_AVX512F) && !defined(ENABLE_AVX512FNOFMA) r = vsel_vd_vo_vd_vd(vispinf_vo_vd(d), vcast_vd_d(SLEEF_INFINITY), r); r = vsel_vd_vo_vd_vd(vor_vo_vo_vo(vlt_vo_vd_vd(d, vcast_vd_d(0)), visnan_vo_vd(d)), vcast_vd_d(SLEEF_NAN), r); r = vsel_vd_vo_vd_vd(veq_vo_vd_vd(d, vcast_vd_d(0)), vcast_vd_d(-SLEEF_INFINITY), r); #else r = vfixup_vd_vd_vd_vi2_i(r, d, vcast_vi2_i((4 << (2*4)) | (3 << (4*4)) | (5 << (5*4)) | (2 << (6*4))), 0); #endif return r; } EXPORT CONST VECTOR_CC vdouble xlog2_u35(vdouble d) { vdouble m, t, x, x2; #if !defined(ENABLE_AVX512F) && !defined(ENABLE_AVX512FNOFMA) vopmask o = vlt_vo_vd_vd(d, vcast_vd_d(DBL_MIN)); d = vsel_vd_vo_vd_vd(o, vmul_vd_vd_vd(d, vcast_vd_d((double)(INT64_C(1) << 32) * (double)(INT64_C(1) << 32))), d); vint e = vilogb2k_vi_vd(vmul_vd_vd_vd(d, vcast_vd_d(1.0/0.75))); m = vldexp3_vd_vd_vi(d, vneg_vi_vi(e)); e = vsel_vi_vo_vi_vi(vcast_vo32_vo64(o), vsub_vi_vi_vi(e, vcast_vi_i(64)), e); #else vdouble e = vgetexp_vd_vd(vmul_vd_vd_vd(d, vcast_vd_d(1.0/0.75))); e = vsel_vd_vo_vd_vd(vispinf_vo_vd(e), vcast_vd_d(1024.0), e); m = vgetmant_vd_vd(d); #endif x = vdiv_vd_vd_vd(vsub_vd_vd_vd(m, vcast_vd_d(1)), vadd_vd_vd_vd(m, vcast_vd_d(1))); x2 = vmul_vd_vd_vd(x, x); t = vcast_vd_d(+0.2211941750456081490e+0); t = vmla_vd_vd_vd_vd(t, x2, vcast_vd_d(+0.2200768693152277689e+0)); t = vmla_vd_vd_vd_vd(t, x2, vcast_vd_d(+0.2623708057488514656e+0)); t = vmla_vd_vd_vd_vd(t, x2, vcast_vd_d(+0.3205977477944495502e+0)); t = vmla_vd_vd_vd_vd(t, x2, vcast_vd_d(+0.4121985945485324709e+0)); t = vmla_vd_vd_vd_vd(t, x2, vcast_vd_d(+0.5770780162997058982e+0)); t = vmla_vd_vd_vd_vd(t, x2, 
/* Tail of a logarithm routine whose head lies before this chunk; the
   2.885390081777926774 (= 2/ln 2) factor suggests a base-2 log family
   function -- confirm against the full file.  The polynomial part t is
   combined with the exponent contribution s, then IEEE special cases
   are patched (the AVX-512 path uses a single vfixup instead). */
vcast_vd_d(+0.96179669392608091449 ));
#if !defined(ENABLE_AVX512F) && !defined(ENABLE_AVX512FNOFMA)
  vdouble2 s = ddadd_vd2_vd_vd2(vcast_vd_vi(e), ddmul_vd2_vd_vd(x, vcast_vd_d(2.885390081777926774)));
#else
  vdouble2 s = ddadd_vd2_vd_vd2(e, ddmul_vd2_vd_vd(x, vcast_vd_d(2.885390081777926774)));
#endif

  vdouble r = vmla_vd_vd_vd_vd(t, vmul_vd_vd_vd(x, x2), vadd_vd_vd_vd(vd2getx_vd_vd2(s), vd2gety_vd_vd2(s)));

#if !defined(ENABLE_AVX512F) && !defined(ENABLE_AVX512FNOFMA)
  // +inf -> +inf; x < 0 or NaN -> NaN; +-0 -> -inf.
  r = vsel_vd_vo_vd_vd(vispinf_vo_vd(d), vcast_vd_d(SLEEF_INFINITY), r);
  r = vsel_vd_vo_vd_vd(vor_vo_vo_vo(vlt_vo_vd_vd(d, vcast_vd_d(0)), visnan_vo_vd(d)), vcast_vd_d(SLEEF_NAN), r);
  r = vsel_vd_vo_vd_vd(veq_vo_vd_vd(d, vcast_vd_d(0)), vcast_vd_d(-SLEEF_INFINITY), r);
#else
  // AVX-512: the vfixup table encodes the same special-case mapping.
  r = vfixup_vd_vd_vd_vi2_i(r, d, vcast_vi2_i((4 << (2*4)) | (3 << (4*4)) | (5 << (5*4)) | (2 << (6*4))), 0);
#endif

  return r;
}

// log1p(d): reduces d+1 to m with exponent e, then evaluates the log via an
// atanh-style series on x = m/(2+m) (coefficients ~2/3, 2/5, 2/7, ...) in
// double-double precision so small d loses no accuracy.
EXPORT CONST VECTOR_CC vdouble xlog1p(vdouble d) {
  vdouble2 x;
  vdouble t, m, x2;

  vdouble dp1 = vadd_vd_vd_vd(d, vcast_vd_d(1));

#if !defined(ENABLE_AVX512F) && !defined(ENABLE_AVX512FNOFMA)
  // Scale a subnormal d+1 up by 2^64 so the ilogb-based reduction is exact;
  // the exponent e is corrected by 64 afterwards.
  vopmask o = vlt_vo_vd_vd(dp1, vcast_vd_d(DBL_MIN));
  dp1 = vsel_vd_vo_vd_vd(o, vmul_vd_vd_vd(dp1, vcast_vd_d((double)(INT64_C(1) << 32) * (double)(INT64_C(1) << 32))), dp1);
  vint e = vilogb2k_vi_vd(vmul_vd_vd_vd(dp1, vcast_vd_d(1.0/0.75)));
  t = vldexp3_vd_vd_vi(vcast_vd_d(1), vneg_vi_vi(e));
  // m = d*2^-e + (2^-e - 1) == (d+1)*2^-e - 1, computed without forming d+1.
  m = vmla_vd_vd_vd_vd(d, t, vsub_vd_vd_vd(t, vcast_vd_d(1)));
  e = vsel_vi_vo_vi_vi(vcast_vo32_vo64(o), vsub_vi_vi_vi(e, vcast_vi_i(64)), e);
  // s = e * ln(2), held as a double-double.
  vdouble2 s = ddmul_vd2_vd2_vd(vcast_vd2_d_d(0.693147180559945286226764, 2.319046813846299558417771e-17), vcast_vd_vi(e));
#else
  // AVX-512 path: vgetexp yields the exponent directly as a double.
  vdouble e = vgetexp_vd_vd(vmul_vd_vd_vd(dp1, vcast_vd_d(1.0/0.75)));
  e = vsel_vd_vo_vd_vd(vispinf_vo_vd(e), vcast_vd_d(1024.0), e);
  t = vldexp3_vd_vd_vi(vcast_vd_d(1), vneg_vi_vi(vrint_vi_vd(e)));
  m = vmla_vd_vd_vd_vd(d, t, vsub_vd_vd_vd(t, vcast_vd_d(1)));
  vdouble2 s = ddmul_vd2_vd2_vd(vcast_vd2_d_d(0.693147180559945286226764, 2.319046813846299558417771e-17), e);
#endif

  // x = m/(2+m); log(1+m) = 2*atanh(x).
  x = dddiv_vd2_vd2_vd2(vcast_vd2_vd_vd(m, vcast_vd_d(0)), ddadd_vd2_vd_vd(vcast_vd_d(2), m));
  x2 = vmul_vd_vd_vd(vd2getx_vd_vd2(x), vd2getx_vd_vd2(x));

  vdouble x4 = vmul_vd_vd_vd(x2, x2), x8 = vmul_vd_vd_vd(x4, x4);
  t = POLY7(x2, x4, x8,
            0.1532076988502701353e+0,
            0.1525629051003428716e+0,
            0.1818605932937785996e+0,
            0.2222214519839380009e+0,
            0.2857142932794299317e+0,
            0.3999999999635251990e+0,
            0.6666666666667333541e+0);

  s = ddadd_vd2_vd2_vd2(s, ddscale_vd2_vd2_vd(x, vcast_vd_d(2)));
  s = ddadd_vd2_vd2_vd(s, vmul_vd_vd_vd(vmul_vd_vd_vd(x2, vd2getx_vd_vd2(x)), t));

  vdouble r = vadd_vd_vd_vd(vd2getx_vd_vd2(s), vd2gety_vd_vd2(s));

  // Special cases: overflow, d < -1 or NaN, d == -1 (log of 0), and -0 input.
  r = vsel_vd_vo_vd_vd(vgt_vo_vd_vd(d, vcast_vd_d(1e+307)), vcast_vd_d(SLEEF_INFINITY), r);
  r = vsel_vd_vo_vd_vd(vor_vo_vo_vo(vlt_vo_vd_vd(d, vcast_vd_d(-1)), visnan_vo_vd(d)), vcast_vd_d(SLEEF_NAN), r);
  r = vsel_vd_vo_vd_vd(veq_vo_vd_vd(d, vcast_vd_d(-1)), vcast_vd_d(-SLEEF_INFINITY), r);
  r = vsel_vd_vo_vd_vd(visnegzero_vo_vd(d), vcast_vd_d(-0.0), r);

  return r;
}

// static INLINE CONST VECTOR_CC vint2 vcast_vi2_i_i(int i0, int i1) { return vcast_vi2_vm(vcast_vm_i_i(i0, i1)); }

// fabs(x): clear the sign bit.
EXPORT CONST VECTOR_CC vdouble xfabs(vdouble x) { return vabs_vd_vd(x); }

// copysign(x, y): magnitude of x with the sign of y.
EXPORT CONST VECTOR_CC vdouble xcopysign(vdouble x, vdouble y) { return vcopysign_vd_vd_vd(x, y); }

// fmax(x, y): a NaN y yields x (C99 fmax semantics).
EXPORT CONST VECTOR_CC vdouble xfmax(vdouble x, vdouble y) {
#if (defined(__x86_64__) || defined(__i386__)) && !defined(ENABLE_VECEXT) && !defined(ENABLE_PUREC)
  return vsel_vd_vo_vd_vd(visnan_vo_vd(y), x, vmax_vd_vd_vd(x, y));
#else
  return vsel_vd_vo_vd_vd(visnan_vo_vd(y), x, vsel_vd_vo_vd_vd(vgt_vo_vd_vd(x, y), x, y));
#endif
}

// fmin(x, y): a NaN y yields x (C99 fmin semantics).
EXPORT CONST VECTOR_CC vdouble xfmin(vdouble x, vdouble y) {
#if (defined(__x86_64__) || defined(__i386__)) && !defined(ENABLE_VECEXT) && !defined(ENABLE_PUREC)
  return vsel_vd_vo_vd_vd(visnan_vo_vd(y), x, vmin_vd_vd_vd(x, y));
#else
  return vsel_vd_vo_vd_vd(visnan_vo_vd(y), x, vsel_vd_vo_vd_vd(vgt_vo_vd_vd(y, x), x, y));
#endif
}

// fdim(x, y): positive difference, 0 when x <= y.
EXPORT CONST VECTOR_CC vdouble xfdim(vdouble x, vdouble y) {
  vdouble ret = vsub_vd_vd_vd(x, y);
  ret = vsel_vd_vo_vd_vd(vor_vo_vo_vo(vlt_vo_vd_vd(ret, vcast_vd_d(0)), veq_vo_vd_vd(x, y)), vcast_vd_d(0), ret);
  return ret;
}

// trunc(x): round toward zero.  Without native rounding the fractional part
// is peeled off in two 2^31-sized chunks so it fits 32-bit integer conversion.
EXPORT CONST VECTOR_CC vdouble xtrunc(vdouble x) {
#ifdef FULL_FP_ROUNDING
  return vtruncate_vd_vd(x);
#else
  vdouble fr = vsub_vd_vd_vd(x, vmul_vd_vd_vd(vcast_vd_d(INT64_C(1) << 31), vcast_vd_vi(vtruncate_vi_vd(vmul_vd_vd_vd(x, vcast_vd_d(1.0 / (INT64_C(1) << 31)))))));
  fr = vsub_vd_vd_vd(fr, vcast_vd_vi(vtruncate_vi_vd(fr)));
  // |x| >= 2^52 has no fractional bits; pass x (and infinities) through.
  return vsel_vd_vo_vd_vd(vor_vo_vo_vo(visinf_vo_vd(x), vge_vo_vd_vd(vabs_vd_vd(x), vcast_vd_d(INT64_C(1) << 52))), x, vcopysign_vd_vd_vd(vsub_vd_vd_vd(x, fr), x));
#endif
}

// floor(x): same chunked fraction extraction as xtrunc; a negative fractional
// part is wrapped into [0, 1) so the subtraction lands on the floor.
EXPORT CONST VECTOR_CC vdouble xfloor(vdouble x) {
  vdouble fr = vsub_vd_vd_vd(x, vmul_vd_vd_vd(vcast_vd_d(INT64_C(1) << 31), vcast_vd_vi(vtruncate_vi_vd(vmul_vd_vd_vd(x, vcast_vd_d(1.0 / (INT64_C(1) << 31)))))));
  fr = vsub_vd_vd_vd(fr, vcast_vd_vi(vtruncate_vi_vd(fr)));
  fr = vsel_vd_vo_vd_vd(vlt_vo_vd_vd(fr, vcast_vd_d(0)), vadd_vd_vd_vd(fr, vcast_vd_d(1.0)), fr);
  return vsel_vd_vo_vd_vd(vor_vo_vo_vo(visinf_vo_vd(x), vge_vo_vd_vd(vabs_vd_vd(x), vcast_vd_d(INT64_C(1) << 52))), x, vcopysign_vd_vd_vd(vsub_vd_vd_vd(x, fr), x));
}

// ceil(x): mirror of xfloor -- a positive fractional part is wrapped so the
// subtraction lands on the ceiling.
EXPORT CONST VECTOR_CC vdouble xceil(vdouble x) {
  vdouble fr = vsub_vd_vd_vd(x, vmul_vd_vd_vd(vcast_vd_d(INT64_C(1) << 31), vcast_vd_vi(vtruncate_vi_vd(vmul_vd_vd_vd(x, vcast_vd_d(1.0 / (INT64_C(1) << 31)))))));
  fr = vsub_vd_vd_vd(fr, vcast_vd_vi(vtruncate_vi_vd(fr)));
  fr = vsel_vd_vo_vd_vd(vle_vo_vd_vd(fr, vcast_vd_d(0)), fr, vsub_vd_vd_vd(fr, vcast_vd_d(1.0)));
  return vsel_vd_vo_vd_vd(vor_vo_vo_vo(visinf_vo_vd(x), vge_vo_vd_vd(vabs_vd_vd(x), vcast_vd_d(INT64_C(1) << 52))), x, vcopysign_vd_vd_vd(vsub_vd_vd_vd(x, fr), x));
}

// round(d): round half away from zero (C99 round), built from trunc of d+0.5
// with corrections for exact integers and for the largest double below 0.5.
EXPORT CONST VECTOR_CC vdouble xround(vdouble d) {
  vdouble x = vadd_vd_vd_vd(d, vcast_vd_d(0.5));
  vdouble fr = vsub_vd_vd_vd(x, vmul_vd_vd_vd(vcast_vd_d(INT64_C(1) << 31), vcast_vd_vi(vtruncate_vi_vd(vmul_vd_vd_vd(x, vcast_vd_d(1.0 / (INT64_C(1) << 31)))))));
  fr
   // (continuation of xround, split across the chunk boundary)
= vsub_vd_vd_vd(fr, vcast_vd_vi(vtruncate_vi_vd(fr)));
  x = vsel_vd_vo_vd_vd(vand_vo_vo_vo(vle_vo_vd_vd(x, vcast_vd_d(0)), veq_vo_vd_vd(fr, vcast_vd_d(0))), vsub_vd_vd_vd(x, vcast_vd_d(1.0)), x);
  fr = vsel_vd_vo_vd_vd(vlt_vo_vd_vd(fr, vcast_vd_d(0)), vadd_vd_vd_vd(fr, vcast_vd_d(1.0)), fr);
  // 0.49999999999999994449 is the largest double < 0.5; d + 0.5 rounds it up
  // to 1, so it must be forced back to 0 explicitly.
  x = vsel_vd_vo_vd_vd(veq_vo_vd_vd(d, vcast_vd_d(0.49999999999999994449)), vcast_vd_d(0), x);
  return vsel_vd_vo_vd_vd(vor_vo_vo_vo(visinf_vo_vd(d), vge_vo_vd_vd(vabs_vd_vd(d), vcast_vd_d(INT64_C(1) << 52))), d, vcopysign_vd_vd_vd(vsub_vd_vd_vd(x, fr), d));
}

// rint(d): round to nearest via the classic add-then-subtract of 2^52 carrying
// d's sign (a no-op select keeps d when |d| already exceeds 2^52).
EXPORT CONST VECTOR_CC vdouble xrint(vdouble d) {
#ifdef FULL_FP_ROUNDING
  return vrint_vd_vd(d);
#else
  vdouble c = vmulsign_vd_vd_vd(vcast_vd_d(INT64_C(1) << 52), d);
  return vsel_vd_vo_vd_vd(vgt_vo_vd_vd(vabs_vd_vd(d), vcast_vd_d(INT64_C(1) << 52)), d, vorsign_vd_vd_vd(vsub_vd_vd_vd(vadd_vd_vd_vd(d, c), c), d));
#endif
}

// nextafter(x, y): step the 64-bit pattern of x by one ulp toward y.  All
// integer work is done on 32-bit lanes (vint2); carries/borrows between the
// low and high halves of each double are propagated manually via vrev21.
EXPORT CONST VECTOR_CC vdouble xnextafter(vdouble x, vdouble y) {
  x = vsel_vd_vo_vd_vd(veq_vo_vd_vd(x, vcast_vd_d(0)), vmulsign_vd_vd_vd(vcast_vd_d(0), y), x);
  vint2 t, xi2 = vreinterpret_vi2_vd(x);
  // c: true when stepping means decreasing the sign-magnitude pattern.
  vopmask c = vxor_vo_vo_vo(vsignbit_vo_vd(x), vge_vo_vd_vd(y, x));

  // Conditionally map the pattern (flip non-sign bits, add one, manual carry)
  // so that a single integer decrement below steps in the right direction.
  t = vadd_vi2_vi2_vi2(vxor_vi2_vi2_vi2(xi2, vcast_vi2_i_i(0x7fffffff, 0xffffffff)), vcast_vi2_i_i(0, 1));
  t = vadd_vi2_vi2_vi2(t, vrev21_vi2_vi2(vand_vi2_vi2_vi2(vcast_vi2_i_i(0, 1), veq_vi2_vi2_vi2(t, vcast_vi2_i_i(-1, 0)))));
  xi2 = vreinterpret_vi2_vd(vsel_vd_vo_vd_vd(c, vreinterpret_vd_vi2(t), vreinterpret_vd_vi2(xi2)));

  // Subtract one ulp when x != y, with manual borrow propagation.
  xi2 = vsub_vi2_vi2_vi2(xi2, vcast_vi2_vm(vand_vm_vo64_vm(vneq_vo_vd_vd(x, y), vcast_vm_i_i(0, 1))));
  xi2 = vreinterpret_vi2_vd(vsel_vd_vo_vd_vd(vneq_vo_vd_vd(x, y), vreinterpret_vd_vi2(vadd_vi2_vi2_vi2(xi2, vrev21_vi2_vi2(vand_vi2_vi2_vi2(vcast_vi2_i_i(0, -1), veq_vi2_vi2_vi2(xi2, vcast_vi2_i_i(0, -1)))))), vreinterpret_vd_vi2(xi2)));

  // Undo the conditional mapping.
  t = vadd_vi2_vi2_vi2(vxor_vi2_vi2_vi2(xi2, vcast_vi2_i_i(0x7fffffff, 0xffffffff)), vcast_vi2_i_i(0, 1));
  t = vadd_vi2_vi2_vi2(t, vrev21_vi2_vi2(vand_vi2_vi2_vi2(vcast_vi2_i_i(0, 1), veq_vi2_vi2_vi2(t, vcast_vi2_i_i(-1, 0)))));
  xi2 = vreinterpret_vi2_vd(vsel_vd_vo_vd_vd(c, vreinterpret_vd_vi2(t), vreinterpret_vd_vi2(xi2)));

  vdouble ret = vreinterpret_vd_vi2(xi2);
  // Stepping across zero must produce a zero carrying x's sign; x==y==0
  // returns y so the result keeps y's signed zero; NaN operands give NaN.
  ret = vsel_vd_vo_vd_vd(vand_vo_vo_vo(veq_vo_vd_vd(ret, vcast_vd_d(0)), vneq_vo_vd_vd(x, vcast_vd_d(0))), vmulsign_vd_vd_vd(vcast_vd_d(0), x), ret);
  ret = vsel_vd_vo_vd_vd(vand_vo_vo_vo(veq_vo_vd_vd(x, vcast_vd_d(0)), veq_vo_vd_vd(y, vcast_vd_d(0))), y, ret);
  ret = vsel_vd_vo_vd_vd(vor_vo_vo_vo(visnan_vo_vd(x), visnan_vo_vd(y)), vcast_vd_d(SLEEF_NAN), ret);
  return ret;
}

// frexp fraction part: x with its exponent field overwritten by 0x3fe, i.e.
// scaled into [0.5, 1); subnormals are first normalized by multiplying 2^63.
EXPORT CONST VECTOR_CC vdouble xfrfrexp(vdouble x) {
  x = vsel_vd_vo_vd_vd(vlt_vo_vd_vd(vabs_vd_vd(x), vcast_vd_d(DBL_MIN)), vmul_vd_vd_vd(x, vcast_vd_d(UINT64_C(1) << 63)), x);

  vmask xm = vreinterpret_vm_vd(x);
  xm = vand_vm_vm_vm(xm, vcast_vm_i_i(~0x7ff00000, ~0));
  xm = vor_vm_vm_vm (xm, vcast_vm_i_i( 0x3fe00000, 0));

  vdouble ret = vreinterpret_vd_vm(xm);
  ret = vsel_vd_vo_vd_vd(visinf_vo_vd(x), vmulsign_vd_vd_vd(vcast_vd_d(SLEEF_INFINITY), x), ret);
  ret = vsel_vd_vo_vd_vd(veq_vo_vd_vd(x, vcast_vd_d(0)), x, ret);
  return ret;
}

// frexp exponent part: unbiased exponent field (shift 20, mask 0x7ff, minus
// 0x3fe); zero, NaN and infinity all report 0.
EXPORT CONST VECTOR_CC vint xexpfrexp(vdouble x) {
  x = vsel_vd_vo_vd_vd(vlt_vo_vd_vd(vabs_vd_vd(x), vcast_vd_d(DBL_MIN)), vmul_vd_vd_vd(x, vcast_vd_d(UINT64_C(1) << 63)), x);

  vint ret = vcastu_vi_vi2(vreinterpret_vi2_vd(x));
  ret = vsub_vi_vi_vi(vand_vi_vi_vi(vsrl_vi_vi_i(ret, 20), vcast_vi_i(0x7ff)), vcast_vi_i(0x3fe));

  ret = vsel_vi_vo_vi_vi(vor_vo_vo_vo(vor_vo_vo_vo(veq_vo_vd_vd(x, vcast_vd_d(0)), visnan_vo_vd(x)), visinf_vo_vd(x)), vcast_vi_i(0), ret);

  return ret;
}

// fma(x, y, z) emulation for targets without hardware FMA: operands in the
// extreme ranges are rescaled by powers of two, x*y+z is evaluated in
// double-double precision, and q undoes the scaling at the end.
EXPORT CONST VECTOR_CC vdouble xfma(vdouble x, vdouble y, vdouble z) {
#ifdef ENABLE_FMA_DP
  return vfma_vd_vd_vd_vd(x, y, z);
#else
  vdouble h2 = vadd_vd_vd_vd(vmul_vd_vd_vd(x, y), z), q = vcast_vd_d(1);
  vopmask o = vlt_vo_vd_vd(vabs_vd_vd(h2), vcast_vd_d(1e-300));
  {
    // Tiny result: scale x and y up by c1 = 2^108 each (z by c2 = 2^216).
    const double c0 = UINT64_C(1) << 54, c1 = c0 * c0, c2 = c1 * c1;
    x = vsel_vd_vo_vd_vd(o, vmul_vd_vd_vd(x, vcast_vd_d(c1)), x);
    y = vsel_vd_vo_vd_vd(o, vmul_vd_vd_vd(y, vcast_vd_d(c1)), y);
    // (continuation of xfma's tiny-result scaling block)
    z = vsel_vd_vo_vd_vd(o, vmul_vd_vd_vd(z, vcast_vd_d(c2)), z);
    q = vsel_vd_vo_vd_vd(o, vcast_vd_d(1.0 / c2), q);
  }
  o = vgt_vo_vd_vd(vabs_vd_vd(h2), vcast_vd_d(1e+300));
  {
    // Huge result: scale the operands down by the same factors.
    const double c0 = UINT64_C(1) << 54, c1 = c0 * c0, c2 = c1 * c1;
    x = vsel_vd_vo_vd_vd(o, vmul_vd_vd_vd(x, vcast_vd_d(1.0 / c1)), x);
    y = vsel_vd_vo_vd_vd(o, vmul_vd_vd_vd(y, vcast_vd_d(1.0 / c1)), y);
    z = vsel_vd_vo_vd_vd(o, vmul_vd_vd_vd(z, vcast_vd_d(1.0 / c2)), z);
    q = vsel_vd_vo_vd_vd(o, vcast_vd_d(c2), q);
  }
  vdouble2 d = ddmul_vd2_vd_vd(x, y);
  d = ddadd2_vd2_vd2_vd(d, z);
  vdouble ret = vsel_vd_vo_vd_vd(vor_vo_vo_vo(veq_vo_vd_vd(x, vcast_vd_d(0)), veq_vo_vd_vd(y, vcast_vd_d(0))), z, vadd_vd_vd_vd(vd2getx_vd_vd2(d), vd2gety_vd_vd2(d)));
  // When z is infinite while x and y are finite, the result is z.
  o = visinf_vo_vd(z);
  o = vandnot_vo_vo_vo(visinf_vo_vd(x), o);
  o = vandnot_vo_vo_vo(visnan_vo_vd(x), o);
  o = vandnot_vo_vo_vo(visinf_vo_vd(y), o);
  o = vandnot_vo_vo_vo(visnan_vo_vd(y), o);
  h2 = vsel_vd_vo_vd_vd(o, z, h2);

  // Infinite/NaN plain results win over the scaled double-double value.
  o = vor_vo_vo_vo(visinf_vo_vd(h2), visnan_vo_vd(h2));

  return vsel_vd_vo_vd_vd(o, h2, vmul_vd_vd_vd(ret, q));
#endif
}

// sqrt with <= 0.5 ulp error.
SQRTU05_FUNCATR VECTOR_CC vdouble xsqrt_u05(vdouble d) {
#if defined(ENABLE_FMA_DP)
  // FMA path: rsqrt seed from the 0x5fe6ec85e7de30da bit trick, refined with
  // fused Newton-Raphson steps on the (sqrt, half-rsqrt) pair (x, w).
  vdouble q, w, x, y, z;

  d = vsel_vd_vo_vd_vd(vlt_vo_vd_vd(d, vcast_vd_d(0)), vcast_vd_d(SLEEF_NAN), d);

  // Very small inputs are scaled up by ~2^256; q (= ~2^-128) undoes this.
  vopmask o = vlt_vo_vd_vd(d, vcast_vd_d(8.636168555094445E-78));
  d = vsel_vd_vo_vd_vd(o, vmul_vd_vd_vd(d, vcast_vd_d(1.157920892373162E77)), d);
  q = vsel_vd_vo_vd_vd(o, vcast_vd_d(2.9387358770557188E-39), vcast_vd_d(1));

  y = vreinterpret_vd_vi2(vsub_vi2_vi2_vi2(vcast_vi2_i_i(0x5fe6ec85, 0xe7de30da), vsrl_vi2_vi2_i(vreinterpret_vi2_vd(d), 1)));

  x = vmul_vd_vd_vd(d, y);
  w = vmul_vd_vd_vd(vcast_vd_d(0.5), y);
  y = vfmanp_vd_vd_vd_vd(x, w, vcast_vd_d(0.5));
  x = vfma_vd_vd_vd_vd(x, y, x);
  w = vfma_vd_vd_vd_vd(w, y, w);
  y = vfmanp_vd_vd_vd_vd(x, w, vcast_vd_d(0.5));
  x = vfma_vd_vd_vd_vd(x, y, x);
  w = vfma_vd_vd_vd_vd(w, y, w);
  y = vfmanp_vd_vd_vd_vd(x, w, vcast_vd_d(0.5));
  x = vfma_vd_vd_vd_vd(x, y, x);
  w = vfma_vd_vd_vd_vd(w, y, w);

  // Final correction step squeezing out the last fraction of an ulp.
  y = vfmanp_vd_vd_vd_vd(x, w, vcast_vd_d(1.5));
  w = vadd_vd_vd_vd(w, w);
  w = vmul_vd_vd_vd(w, y);
  x = vmul_vd_vd_vd(w, d);
  y = vfmapn_vd_vd_vd_vd(w, d, x);
  z = vfmanp_vd_vd_vd_vd(w, x, vcast_vd_d(1));

  z = vfmanp_vd_vd_vd_vd(w, y, z);
  w = vmul_vd_vd_vd(vcast_vd_d(0.5), x);
  w = vfma_vd_vd_vd_vd(w, z, y);
  w = vadd_vd_vd_vd(w, x);

  w = vmul_vd_vd_vd(w, q);

  // 0 and +inf map to themselves; negative input stays NaN.
  w = vsel_vd_vo_vd_vd(vor_vo_vo_vo(veq_vo_vd_vd(d, vcast_vd_d(0)), veq_vo_vd_vd(d, vcast_vd_d(SLEEF_INFINITY))), d, w);
  w = vsel_vd_vo_vd_vd(vlt_vo_vd_vd(d, vcast_vd_d(0)), vcast_vd_d(SLEEF_NAN), w);

  return w;
#else
  // Non-FMA path: three plain Newton iterations x *= (1.5 - 0.5*d*x*x) on a
  // rsqrt seed, then one double-double correction sqrt(d) ~= (d + x^2)/(2x).
  vdouble q;
  vopmask o;

  d = vsel_vd_vo_vd_vd(vlt_vo_vd_vd(d, vcast_vd_d(0)), vcast_vd_d(SLEEF_NAN), d);

  // Range reduction at both extremes; q carries the compensating factor
  // (with the final 0.5 from the correction formula folded in).
  o = vlt_vo_vd_vd(d, vcast_vd_d(8.636168555094445E-78));
  d = vsel_vd_vo_vd_vd(o, vmul_vd_vd_vd(d, vcast_vd_d(1.157920892373162E77)), d);
  q = vsel_vd_vo_vd_vd(o, vcast_vd_d(2.9387358770557188E-39*0.5), vcast_vd_d(0.5));

  o = vgt_vo_vd_vd(d, vcast_vd_d(1.3407807929942597e+154));
  d = vsel_vd_vo_vd_vd(o, vmul_vd_vd_vd(d, vcast_vd_d(7.4583407312002070e-155)), d);
  q = vsel_vd_vo_vd_vd(o, vcast_vd_d(1.1579208923731620e+77*0.5), q);

  // The 1e-320 bias keeps the seed finite for d == 0.
  vdouble x = vreinterpret_vd_vi2(vsub_vi2_vi2_vi2(vcast_vi2_i_i(0x5fe6ec86, 0), vsrl_vi2_vi2_i(vreinterpret_vi2_vd(vadd_vd_vd_vd(d, vcast_vd_d(1e-320))), 1)));

  x = vmul_vd_vd_vd(x, vsub_vd_vd_vd(vcast_vd_d(1.5), vmul_vd_vd_vd(vmul_vd_vd_vd(vmul_vd_vd_vd(vcast_vd_d(0.5), d), x), x)));
  x = vmul_vd_vd_vd(x, vsub_vd_vd_vd(vcast_vd_d(1.5), vmul_vd_vd_vd(vmul_vd_vd_vd(vmul_vd_vd_vd(vcast_vd_d(0.5), d), x), x)));
  x = vmul_vd_vd_vd(x, vsub_vd_vd_vd(vcast_vd_d(1.5), vmul_vd_vd_vd(vmul_vd_vd_vd(vmul_vd_vd_vd(vcast_vd_d(0.5), d), x), x)));
  x = vmul_vd_vd_vd(x, d);

  vdouble2 d2 = ddmul_vd2_vd2_vd2(ddadd2_vd2_vd_vd2(d, ddmul_vd2_vd_vd(x, x)), ddrec_vd2_vd(x));

  x = vmul_vd_vd_vd(vadd_vd_vd_vd(vd2getx_vd_vd2(d2), vd2gety_vd_vd2(d2)), q);

  x = vsel_vd_vo_vd_vd(vispinf_vo_vd(d), vcast_vd_d(SLEEF_INFINITY), x);
  x = vsel_vd_vo_vd_vd(veq_vo_vd_vd(d, vcast_vd_d(0)), d, x);

  return x;
#endif
}

// sqrt dispatch: native instruction when available, else the u05 kernel.
EXPORT CONST VECTOR_CC vdouble xsqrt(vdouble d) {
#if defined(ACCURATE_SQRT)
  return vsqrt_vd_vd(d);
#else
  // fall back to approximation if ACCURATE_SQRT is undefined
  return xsqrt_u05(d);
#endif
}

// sqrt with relaxed (3.5 ulp) contract -- currently just the u05 kernel.
EXPORT CONST VECTOR_CC vdouble xsqrt_u35(vdouble d) { return xsqrt_u05(d); }

// hypot(x, y) with <= 0.5 ulp: max * sqrt(1 + (min/max)^2) evaluated in
// double-double precision; operands are rescaled when max is subnormal.
EXPORT CONST VECTOR_CC vdouble xhypot_u05(vdouble x, vdouble y) {
  x = vabs_vd_vd(x);
  y = vabs_vd_vd(y);
  vdouble min = vmin_vd_vd_vd(x, y), n = min;
  vdouble max = vmax_vd_vd_vd(x, y), d = max;

  vopmask o = vlt_vo_vd_vd(max, vcast_vd_d(DBL_MIN));
  n = vsel_vd_vo_vd_vd(o, vmul_vd_vd_vd(n, vcast_vd_d(UINT64_C(1) << 54)), n);
  d = vsel_vd_vo_vd_vd(o, vmul_vd_vd_vd(d, vcast_vd_d(UINT64_C(1) << 54)), d);

  vdouble2 t = dddiv_vd2_vd2_vd2(vcast_vd2_vd_vd(n, vcast_vd_d(0)), vcast_vd2_vd_vd(d, vcast_vd_d(0)));
  t = ddmul_vd2_vd2_vd(ddsqrt_vd2_vd2(ddadd2_vd2_vd2_vd(ddsqu_vd2_vd2(t), vcast_vd_d(1))), max);
  vdouble ret = vadd_vd_vd_vd(vd2getx_vd_vd2(t), vd2gety_vd_vd2(t));
  // A NaN from the double-double pipeline is treated as overflow; the exact
  // C99 hypot rules (zero operand, NaN operand, infinite operand) follow.
  ret = vsel_vd_vo_vd_vd(visnan_vo_vd(ret), vcast_vd_d(SLEEF_INFINITY), ret);
  ret = vsel_vd_vo_vd_vd(veq_vo_vd_vd(min, vcast_vd_d(0)), max, ret);
  ret = vsel_vd_vo_vd_vd(vor_vo_vo_vo(visnan_vo_vd(x), visnan_vo_vd(y)), vcast_vd_d(SLEEF_NAN), ret);
  ret = vsel_vd_vo_vd_vd(vor_vo_vo_vo(veq_vo_vd_vd(x, vcast_vd_d(SLEEF_INFINITY)), veq_vo_vd_vd(y, vcast_vd_d(SLEEF_INFINITY))), vcast_vd_d(SLEEF_INFINITY), ret);

  return ret;
}

// hypot with relaxed accuracy: single-precision-style max*sqrt(1+t^2).
EXPORT CONST VECTOR_CC vdouble xhypot_u35(vdouble x, vdouble y) {
  x = vabs_vd_vd(x);
  y = vabs_vd_vd(y);
  vdouble min = vmin_vd_vd_vd(x, y);
  vdouble max = vmax_vd_vd_vd(x, y);

  vdouble t = vdiv_vd_vd_vd(min, max);
  vdouble ret = vmul_vd_vd_vd(max, vsqrt_vd_vd(vmla_vd_vd_vd_vd(t, t, vcast_vd_d(1))));
  ret = vsel_vd_vo_vd_vd(veq_vo_vd_vd(min, vcast_vd_d(0)), max, ret);
  ret = vsel_vd_vo_vd_vd(vor_vo_vo_vo(visnan_vo_vd(x), visnan_vo_vd(y)), vcast_vd_d(SLEEF_NAN), ret);
  ret = vsel_vd_vo_vd_vd(vor_vo_vo_vo(veq_vo_vd_vd(x, vcast_vd_d(SLEEF_INFINITY)), veq_vo_vd_vd(y, vcast_vd_d(SLEEF_INFINITY))), vcast_vd_d(SLEEF_INFINITY), ret);

  return ret;
}

static INLINE CONST
VECTOR_CC vdouble vtoward0(vdouble x) { // returns nextafter(x, 0)
  // Subtracting 1 from the 64-bit pattern steps toward zero for any nonzero
  // finite magnitude; zero is passed through unchanged.
  vdouble t = vreinterpret_vd_vm(vadd64_vm_vm_vm(vreinterpret_vm_vd(x), vcast_vm_i_i(-1, -1)));
  return vsel_vd_vo_vd_vd(veq_vo_vd_vd(x, vcast_vd_d(0)), vcast_vd_d(0), t);
}

static INLINE CONST VECTOR_CC vdouble vptrunc(vdouble x) { // round to integer toward 0, positive argument only
#ifdef FULL_FP_ROUNDING
  return vtruncate_vd_vd(x);
#else
  // Same two-chunk fraction extraction as xtrunc, fused into one vmla.
  vdouble fr = vmla_vd_vd_vd_vd(vcast_vd_d(-(double)(INT64_C(1) << 31)), vcast_vd_vi(vtruncate_vi_vd(vmul_vd_vd_vd(x, vcast_vd_d(1.0 / (INT64_C(1) << 31))))), x);
  fr = vsub_vd_vd_vd(fr, vcast_vd_vi(vtruncate_vi_vd(fr)));
  return vsel_vd_vo_vd_vd(vge_vo_vd_vd(vabs_vd_vd(x), vcast_vd_d(INT64_C(1) << 52)), x, vsub_vd_vd_vd(x, fr));
#endif
}

/* TODO AArch64: potential optimization by using `vfmad_lane_f64` */
// fmod(x, y): long-division-style remainder.  Each pass removes up to ~52
// bits of quotient from the double-double residual r; 21 passes cover the
// whole double exponent range.
EXPORT CONST VECTOR_CC vdouble xfmod(vdouble x, vdouble y) {
  vdouble n = vabs_vd_vd(x), d = vabs_vd_vd(y), s = vcast_vd_d(1), q;
  // Rescale when the divisor is subnormal; s undoes the scaling at the end.
  vopmask o = vlt_vo_vd_vd(d, vcast_vd_d(DBL_MIN));
  n = vsel_vd_vo_vd_vd(o, vmul_vd_vd_vd(n, vcast_vd_d(UINT64_C(1) << 54)), n);
  d = vsel_vd_vo_vd_vd(o, vmul_vd_vd_vd(d, vcast_vd_d(UINT64_C(1) << 54)), d);
  s = vsel_vd_vo_vd_vd(o, vmul_vd_vd_vd(s , vcast_vd_d(1.0 / (UINT64_C(1) << 54))), s);
  vdouble2 r = vcast_vd2_vd_vd(n, vcast_vd_d(0));
  // Reciprocal rounded toward zero so the trial quotient never overshoots.
  vdouble rd = vtoward0(vrec_vd_vd(d));

  for(int i=0;i<21;i++) { // ceil(log2(DBL_MAX) / 52)
    q = vptrunc(vmul_vd_vd_vd(vtoward0(vd2getx_vd_vd2(r)), rd));
#ifndef ENABLE_FMA_DP
    // Without FMA, clear the quotient's lowest mantissa bit so q*d is exact.
    q = vreinterpret_vd_vm(vand_vm_vm_vm(vreinterpret_vm_vd(q), vcast_vm_i_i(0xffffffff, 0xfffffffe)));
#endif
    // Near the end (r < 3d) force the exact small quotient 2 or 1.
    q = vsel_vd_vo_vd_vd(vand_vo_vo_vo(vgt_vo_vd_vd(vmul_vd_vd_vd(vcast_vd_d(3), d), vd2getx_vd_vd2(r)), vge_vo_vd_vd(vd2getx_vd_vd2(r), d)), vcast_vd_d(2), q);
    q = vsel_vd_vo_vd_vd(vand_vo_vo_vo(vgt_vo_vd_vd(vadd_vd_vd_vd(d, d), vd2getx_vd_vd2(r)), vge_vo_vd_vd(vd2getx_vd_vd2(r), d)), vcast_vd_d(1), q);
    r = ddnormalize_vd2_vd2(ddadd2_vd2_vd2_vd2(r, ddmul_vd2_vd_vd(q, vneg_vd_vd(d))));
    if (vtestallones_i_vo64(vlt_vo_vd_vd(vd2getx_vd_vd2(r), d))) break;
  }

  vdouble ret = vmul_vd_vd_vd(vd2getx_vd_vd2(r), s);
  ret = vsel_vd_vo_vd_vd(veq_vo_vd_vd(vadd_vd_vd_vd(vd2getx_vd_vd2(r), vd2gety_vd_vd2(r)), d), vcast_vd_d(0), ret);

  // fmod's result takes the sign of x.
  ret = vmulsign_vd_vd_vd(ret, x);

  ret = vsel_vd_vo_vd_vd(vlt_vo_vd_vd(n, d), x, ret);
  ret = vsel_vd_vo_vd_vd(veq_vo_vd_vd(d, vcast_vd_d(0)), vcast_vd_d(SLEEF_NAN), ret);

  return ret;
}

// Round-to-nearest helper for xremainder (same 2^52 trick as xrint).
static INLINE VECTOR_CC vdouble vrintk2_vd_vd(vdouble d) {
#ifdef FULL_FP_ROUNDING
  return vrint_vd_vd(d);
#else
  vdouble c = vmulsign_vd_vd_vd(vcast_vd_d(INT64_C(1) << 52), d);
  return vsel_vd_vo_vd_vd(vgt_vo_vd_vd(vabs_vd_vd(d), vcast_vd_d(INT64_C(1) << 52)), d, vorsign_vd_vd_vd(vsub_vd_vd_vd(vadd_vd_vd_vd(d, c), c), d));
#endif
}

// remainder(x, y): IEEE 754 remainder -- quotient rounded to nearest, result
// in [-|y|/2, +|y|/2]; qisodd tracks quotient parity for the halfway rule.
EXPORT CONST VECTOR_CC vdouble xremainder(vdouble x, vdouble y) {
  vdouble n = vabs_vd_vd(x), d = vabs_vd_vd(y), s = vcast_vd_d(1), q;
  vopmask o = vlt_vo_vd_vd(d, vcast_vd_d(DBL_MIN*2));
  n = vsel_vd_vo_vd_vd(o, vmul_vd_vd_vd(n, vcast_vd_d(UINT64_C(1) << 54)), n);
  d = vsel_vd_vo_vd_vd(o, vmul_vd_vd_vd(d, vcast_vd_d(UINT64_C(1) << 54)), d);
  s = vsel_vd_vo_vd_vd(o, vmul_vd_vd_vd(s , vcast_vd_d(1.0 / (UINT64_C(1) << 54))), s);
  vdouble rd = vrec_vd_vd(d);
  vdouble2 r = vcast_vd2_vd_vd(n, vcast_vd_d(0));
  vopmask qisodd = vneq_vo_vd_vd(vcast_vd_d(0), vcast_vd_d(0)); // all-false mask

  for(int i=0;i<21;i++) { // ceil(log2(DBL_MAX) / 52)
    q = vrintk2_vd_vd(vmul_vd_vd_vd(vd2getx_vd_vd2(r), rd));
#ifndef ENABLE_FMA_DP
    // Without FMA, clear the quotient's lowest mantissa bit so q*d is exact.
    q = vreinterpret_vd_vm(vand_vm_vm_vm(vreinterpret_vm_vd(q), vcast_vm_i_i(0xffffffff, 0xfffffffe)));
#endif
    // |r| < 1.5d: quotient magnitude is exactly 1 (with r's sign); below
    // 0.5d -- or exactly 0.5d with an even quotient so far -- it is 0.
    q = vsel_vd_vo_vd_vd(vlt_vo_vd_vd(vabs_vd_vd(vd2getx_vd_vd2(r)), vmul_vd_vd_vd(d, vcast_vd_d(1.5))), vmulsign_vd_vd_vd(vcast_vd_d(1.0), vd2getx_vd_vd2(r)), q);
    q = vsel_vd_vo_vd_vd(vor_vo_vo_vo(vlt_vo_vd_vd(vabs_vd_vd(vd2getx_vd_vd2(r)), vmul_vd_vd_vd(d, vcast_vd_d(0.5))), vandnot_vo_vo_vo(qisodd, veq_vo_vd_vd(vabs_vd_vd(vd2getx_vd_vd2(r)), vmul_vd_vd_vd(d, vcast_vd_d(0.5))))), vcast_vd_d(0.0), q);
    if (vtestallones_i_vo64(veq_vo_vd_vd(q, vcast_vd_d(0)))) break;
    // Guard against q*(-d) overflowing to infinity.
    q = vsel_vd_vo_vd_vd(visinf_vo_vd(vmul_vd_vd_vd(q, vneg_vd_vd(d))), vadd_vd_vd_vd(q, vmulsign_vd_vd_vd(vcast_vd_d(-1), vd2getx_vd_vd2(r))), q);
    qisodd = vxor_vo_vo_vo(qisodd, visodd_vo_vd(q));
    r = ddnormalize_vd2_vd2(ddadd2_vd2_vd2_vd2(r, ddmul_vd2_vd_vd(q, vneg_vd_vd(d))));
  }

  vdouble ret = vmul_vd_vd_vd(vd2getx_vd_vd2(r), s);
  ret = vmulsign_vd_vd_vd(ret, x);
  // remainder(x, inf) = x for finite x, NaN for infinite x; y == 0 -> NaN.
  ret = vsel_vd_vo_vd_vd(visinf_vo_vd(y), vsel_vd_vo_vd_vd(visinf_vo_vd(x), vcast_vd_d(SLEEF_NAN), x), ret);
  ret = vsel_vd_vo_vd_vd(veq_vo_vd_vd(d, vcast_vd_d(0)), vcast_vd_d(SLEEF_NAN), ret);
  return ret;
}

#if !(defined(ENABLE_SVE) || defined(ENABLE_SVENOFMA))
// Pair of double-doubles returned by gammak.  Guarded out for SVE builds --
// presumably provided elsewhere for those targets (confirm in the full file).
typedef struct {
  vdouble2 a, b;
} dd2;

static dd2 dd2setab_dd2_vd2_vd2(vdouble2 a, vdouble2 b) {
  dd2 r = { a, b };
  return r;
}
static vdouble2 dd2geta_vd2_dd2(dd2 d) { return d.a; }
static vdouble2 dd2getb_vd2_dd2(dd2 d) { return d.b; }
#endif

/* TODO AArch64: potential optimization by using `vfmad_lane_f64` */
// Shared gamma kernel: returns {a, b} such that the callers below form
// tgamma(x) = exp(a) * b and lgamma(x) = a + log|b| (see xtgamma_u1 /
// xlgamma_u1).  Handles reflection (x < 0.5), tiny arguments, and three
// polynomial regimes selected by o0 / o2.
static CONST dd2 gammak(vdouble a) {
  vdouble2 clc = vcast_vd2_d_d(0, 0), clln = vcast_vd2_d_d(1, 0), clld = vcast_vd2_d_d(1, 0);
  vdouble2 v = vcast_vd2_d_d(1, 0), x, y, z; // NOTE(review): v looks unused here
  vdouble t, u;

  vopmask otiny = vlt_vo_vd_vd(vabs_vd_vd(a), vcast_vd_d(1e-306)), oref = vlt_vo_vd_vd(a, vcast_vd_d(0.5));

  // Reflection: for a < 0.5 work with 1 - a instead.
  x = vsel_vd2_vo_vd2_vd2(otiny, vcast_vd2_d_d(0, 0),
                          vsel_vd2_vo_vd2_vd2(oref, ddadd2_vd2_vd_vd(vcast_vd_d(1), vneg_vd_vd(a)),
                                              vcast_vd2_vd_vd(a, vcast_vd_d(0))));

  vopmask o0 = vand_vo_vo_vo(vle_vo_vd_vd(vcast_vd_d(0.5), vd2getx_vd_vd2(x)), vle_vo_vd_vd(vd2getx_vd_vd2(x), vcast_vd_d(1.1)));
  vopmask o2 = vle_vo_vd_vd(vcast_vd_d(2.3), vd2getx_vd_vd2(x));

  // y accumulates x*(x+1)*(x+2)*(x+3)*(x+4), used to shift moderate
  // arguments up by 5 (gamma recurrence) into the asymptotic regime.
  y = ddnormalize_vd2_vd2(ddmul_vd2_vd2_vd2(ddadd2_vd2_vd2_vd(x, vcast_vd_d(1)), x));
  y = ddnormalize_vd2_vd2(ddmul_vd2_vd2_vd2(ddadd2_vd2_vd2_vd(x, vcast_vd_d(2)), y));
  y = ddnormalize_vd2_vd2(ddmul_vd2_vd2_vd2(ddadd2_vd2_vd2_vd(x, vcast_vd_d(3)), y));
  y = ddnormalize_vd2_vd2(ddmul_vd2_vd2_vd2(ddadd2_vd2_vd2_vd(x, vcast_vd_d(4)), y));

  vopmask o = vand_vo_vo_vo(o2,
                            // (continuation of gammak's regime mask)
                            vle_vo_vd_vd(vd2getx_vd_vd2(x), vcast_vd_d(7)));
  clln = vsel_vd2_vo_vd2_vd2(o, y, clln);

  x = vsel_vd2_vo_vd2_vd2(o, ddadd2_vd2_vd2_vd(x, vcast_vd_d(5)), x);

  // t: 1/x for the asymptotic regime (o2), else a shifted x for the series.
  t = vsel_vd_vo_vd_vd(o2, vrec_vd_vd(vd2getx_vd_vd2(x)), vd2getx_vd_vd2(ddnormalize_vd2_vd2(ddadd2_vd2_vd2_vd(x, vsel_vd_vo_d_d(o0, -1, -2)))));

  // One Horner evaluation with per-lane coefficients for the three regimes
  // (asymptotic Stirling series / series near 1 / series near 2).
  u = vsel_vd_vo_vo_d_d_d(o2, o0, -156.801412704022726379848862, +0.2947916772827614196e+2, +0.7074816000864609279e-7);
  u = vmla_vd_vd_vd_vd(u, t, vsel_vd_vo_vo_d_d_d(o2, o0, +1.120804464289911606838558160000, +0.1281459691827820109e+3, +0.4009244333008730443e-6));
  u = vmla_vd_vd_vd_vd(u, t, vsel_vd_vo_vo_d_d_d(o2, o0, +13.39798545514258921833306020000, +0.2617544025784515043e+3, +0.1040114641628246946e-5));
  u = vmla_vd_vd_vd_vd(u, t, vsel_vd_vo_vo_d_d_d(o2, o0, -0.116546276599463200848033357000, +0.3287022855685790432e+3, +0.1508349150733329167e-5));
  u = vmla_vd_vd_vd_vd(u, t, vsel_vd_vo_vo_d_d_d(o2, o0, -1.391801093265337481495562410000, +0.2818145867730348186e+3, +0.1288143074933901020e-5));
  u = vmla_vd_vd_vd_vd(u, t, vsel_vd_vo_vo_d_d_d(o2, o0, +0.015056113040026424412918973400, +0.1728670414673559605e+3, +0.4744167749884993937e-6));
  u = vmla_vd_vd_vd_vd(u, t, vsel_vd_vo_vo_d_d_d(o2, o0, +0.179540117061234856098844714000, +0.7748735764030416817e+2, -0.6554816306542489902e-7));
  u = vmla_vd_vd_vd_vd(u, t, vsel_vd_vo_vo_d_d_d(o2, o0, -0.002481743600264997730942489280, +0.2512856643080930752e+2, -0.3189252471452599844e-6));
  u = vmla_vd_vd_vd_vd(u, t, vsel_vd_vo_vo_d_d_d(o2, o0, -0.029527880945699120504851034100, +0.5766792106140076868e+1, +0.1358883821470355377e-6));
  u = vmla_vd_vd_vd_vd(u, t, vsel_vd_vo_vo_d_d_d(o2, o0, +0.000540164767892604515196325186, +0.7270275473996180571e+0, -0.4343931277157336040e-6));
  u = vmla_vd_vd_vd_vd(u, t, vsel_vd_vo_vo_d_d_d(o2, o0, +0.006403362833808069794787256200, +0.8396709124579147809e-1, +0.9724785897406779555e-6));
  u = vmla_vd_vd_vd_vd(u, t, vsel_vd_vo_vo_d_d_d(o2, o0, -0.000162516262783915816896611252, -0.8211558669746804595e-1, -0.2036886057225966011e-5));
  u = vmla_vd_vd_vd_vd(u, t, vsel_vd_vo_vo_d_d_d(o2, o0, -0.001914438498565477526465972390, +0.6828831828341884458e-1, +0.4373363141819725815e-5));
  u = vmla_vd_vd_vd_vd(u, t, vsel_vd_vo_vo_d_d_d(o2, o0, +7.20489541602001055898311517e-05, -0.7712481339961671511e-1, -0.9439951268304008677e-5));
  u = vmla_vd_vd_vd_vd(u, t, vsel_vd_vo_vo_d_d_d(o2, o0, +0.000839498720672087279971000786, +0.8337492023017314957e-1, +0.2050727030376389804e-4));
  u = vmla_vd_vd_vd_vd(u, t, vsel_vd_vo_vo_d_d_d(o2, o0, -5.17179090826059219329394422e-05, -0.9094964931456242518e-1, -0.4492620183431184018e-4));
  u = vmla_vd_vd_vd_vd(u, t, vsel_vd_vo_vo_d_d_d(o2, o0, -0.000592166437353693882857342347, +0.1000996313575929358e+0, +0.9945751236071875931e-4));
  u = vmla_vd_vd_vd_vd(u, t, vsel_vd_vo_vo_d_d_d(o2, o0, +6.97281375836585777403743539e-05, -0.1113342861544207724e+0, -0.2231547599034983196e-3));
  u = vmla_vd_vd_vd_vd(u, t, vsel_vd_vo_vo_d_d_d(o2, o0, +0.000784039221720066627493314301, +0.1255096673213020875e+0, +0.5096695247101967622e-3));
  u = vmla_vd_vd_vd_vd(u, t, vsel_vd_vo_vo_d_d_d(o2, o0, -0.000229472093621399176949318732, -0.1440498967843054368e+0, -0.1192753911667886971e-2));
  u = vmla_vd_vd_vd_vd(u, t, vsel_vd_vo_vo_d_d_d(o2, o0, -0.002681327160493827160473958490, +0.1695571770041949811e+0, +0.2890510330742210310e-2));
  u = vmla_vd_vd_vd_vd(u, t, vsel_vd_vo_vo_d_d_d(o2, o0, +0.003472222222222222222175164840, -0.2073855510284092762e+0, -0.7385551028674461858e-2));
  u = vmla_vd_vd_vd_vd(u, t, vsel_vd_vo_vo_d_d_d(o2, o0, +0.083333333333333333335592087900, +0.2705808084277815939e+0, +0.2058080842778455335e-1));

  // Asymptotic (Stirling) branch: (x - 0.5)*log(x) - x + 0.5*log(2*pi) + ...
  y = ddmul_vd2_vd2_vd2(ddadd2_vd2_vd2_vd(x, vcast_vd_d(-0.5)), logk2(x));
  y = ddadd2_vd2_vd2_vd2(y, ddneg_vd2_vd2(x));
  y = ddadd2_vd2_vd2_vd2(y, vcast_vd2_d_d(0.91893853320467278056, -3.8782941580672414498e-17)); // 0.5*log(2*M_PI)

  // Series branch around 1 / 2 (note -0.5772156... = -Euler-Mascheroni).
  z = ddadd2_vd2_vd2_vd(ddmul_vd2_vd_vd (u, t), vsel_vd_vo_d_d(o0, -0.4006856343865314862e+0, -0.6735230105319810201e-1));
  z = ddadd2_vd2_vd2_vd(ddmul_vd2_vd2_vd(z, t), vsel_vd_vo_d_d(o0, +0.8224670334241132030e+0, +0.3224670334241132030e+0));
  z = ddadd2_vd2_vd2_vd(ddmul_vd2_vd2_vd(z, t), vsel_vd_vo_d_d(o0, -0.5772156649015328655e+0, +0.4227843350984671345e+0));
  z = ddmul_vd2_vd2_vd(z, t);

  clc = vsel_vd2_vo_vd2_vd2(o2, y, z);

  clld = vsel_vd2_vo_vd2_vd2(o2, ddadd2_vd2_vd2_vd(ddmul_vd2_vd_vd(u, t), vcast_vd_d(1)), clld);

  y = clln;

  // Apply the reflection formula (log(pi) - clc) and the tiny-argument
  // fallback (log(2^120) compensates the 2^120 scale applied to clld below).
  clc = vsel_vd2_vo_vd2_vd2(otiny, vcast_vd2_d_d(83.1776616671934334590333, 3.67103459631568507221878e-15), // log(2^120)
                            vsel_vd2_vo_vd2_vd2(oref, ddadd2_vd2_vd2_vd2(vcast_vd2_d_d(1.1447298858494001639, 1.026595116270782638e-17), ddneg_vd2_vd2(clc)), clc)); // log(M_PI)
  clln = vsel_vd2_vo_vd2_vd2(otiny, vcast_vd2_d_d(1, 0), vsel_vd2_vo_vd2_vd2(oref, clln, clld));

  // Reflection denominator needs sin(pi*a); reduce a modulo 2^28 first.
  if (!vtestallones_i_vo64(vnot_vo64_vo64(oref))) {
    t = vsub_vd_vd_vd(a, vmul_vd_vd_vd(vcast_vd_d(INT64_C(1) << 28), vcast_vd_vi(vtruncate_vi_vd(vmul_vd_vd_vd(a, vcast_vd_d(1.0 / (INT64_C(1) << 28)))))));
    x = ddmul_vd2_vd2_vd2(clld, sinpik(t));
  }

  clld = vsel_vd2_vo_vd2_vd2(otiny, vcast_vd2_vd_vd(vmul_vd_vd_vd(a, vcast_vd_d((INT64_C(1) << 60)*(double)(INT64_C(1) << 60))), vcast_vd_d(0)),
                             vsel_vd2_vo_vd2_vd2(oref, x, y));

  return dd2setab_dd2_vd2_vd2(clc, dddiv_vd2_vd2_vd2(clln, clld));
}

// tgamma(a) ~1 ulp: gamma(a) = exp(clc) * (clln/clld) from gammak, plus the
// C99 special-case handling (negative integers, -inf, overflow sign).
EXPORT CONST VECTOR_CC vdouble xtgamma_u1(vdouble a) {
  dd2 d = gammak(a);
  vdouble2 y = ddmul_vd2_vd2_vd2(expk2(dd2geta_vd2_dd2(d)), dd2getb_vd2_dd2(d));
  vdouble r = vadd_vd_vd_vd(vd2getx_vd_vd2(y), vd2gety_vd_vd2(y));
  vopmask o;

  // NaN for -inf, negative integers, and negative values whose kernel
  // result came out NaN.
  o = vor_vo_vo_vo(vor_vo_vo_vo(veq_vo_vd_vd(a, vcast_vd_d(-SLEEF_INFINITY)),
                                vand_vo_vo_vo(vlt_vo_vd_vd(a, vcast_vd_d(0)), visint_vo_vd(a))),
                   vand_vo_vo_vo(vand_vo_vo_vo(visnumber_vo_vd(a), vlt_vo_vd_vd(a, vcast_vd_d(0))), visnan_vo_vd(r)));
  r = vsel_vd_vo_vd_vd(o, vcast_vd_d(SLEEF_NAN), r);

  // Signed infinity for +inf, overflow (a > 200), and the zero neighborhood.
  o = vand_vo_vo_vo(vand_vo_vo_vo(vor_vo_vo_vo(veq_vo_vd_vd(a, vcast_vd_d(SLEEF_INFINITY)), visnumber_vo_vd(a)),
                                  vge_vo_vd_vd(a, vcast_vd_d(-DBL_MIN))),
                    vor_vo_vo_vo(vor_vo_vo_vo(veq_vo_vd_vd(a, vcast_vd_d(0)),
                                              // (continuation of xtgamma_u1's overflow mask)
                                              vgt_vo_vd_vd(a, vcast_vd_d(200))), visnan_vo_vd(r)));
  r = vsel_vd_vo_vd_vd(o, vmulsign_vd_vd_vd(vcast_vd_d(SLEEF_INFINITY), a), r);

  return r;
}

// lgamma(a) ~1 ulp: log|gamma(a)| = clc + log|clln/clld| from gammak;
// +inf for infinities, non-positive integers, and NaN kernel results.
EXPORT CONST VECTOR_CC vdouble xlgamma_u1(vdouble a) {
  dd2 d = gammak(a);
  vdouble2 y = ddadd2_vd2_vd2_vd2(dd2geta_vd2_dd2(d), logk2(ddabs_vd2_vd2(dd2getb_vd2_dd2(d))));
  vdouble r = vadd_vd_vd_vd(vd2getx_vd_vd2(y), vd2gety_vd_vd2(y));
  vopmask o;

  o = vor_vo_vo_vo(visinf_vo_vd(a),
                   vor_vo_vo_vo(vand_vo_vo_vo(vle_vo_vd_vd(a, vcast_vd_d(0)), visint_vo_vd(a)),
                                vand_vo_vo_vo(visnumber_vo_vd(a), visnan_vo_vd(r))));
  r = vsel_vd_vo_vd_vd(o, vcast_vd_d(SLEEF_INFINITY), r);

  return r;
}

/* TODO AArch64: potential optimization by using `vfmad_lane_f64` */
// erf(a) ~1 ulp.  Three regimes chosen by |a|: below 1 a polynomial in a*a
// (later multiplied by a); 1..3.7 and 3.7..6 use 1 - exp(poly(|a|)); beyond
// 6 erf saturates to +-1.  One Horner pass with per-lane coefficients.
EXPORT CONST VECTOR_CC vdouble xerf_u1(vdouble a) {
  vdouble s = a, t, u;
  vdouble2 d;

  a = vabs_vd_vd(a);
  vopmask o0 = vlt_vo_vd_vd(a, vcast_vd_d(1.0));
  vopmask o1 = vlt_vo_vd_vd(a, vcast_vd_d(3.7));
  vopmask o2 = vlt_vo_vd_vd(a, vcast_vd_d(6.0));
  u = vsel_vd_vo_vd_vd(o0, vmul_vd_vd_vd(a, a), a);

  t = vsel_vd_vo_vo_d_d_d(o0, o1, +0.6801072401395392157e-20, +0.2830954522087717660e-13, -0.5846750404269610493e-17);
  t = vmla_vd_vd_vd_vd(t, u, vsel_vd_vo_vo_d_d_d(o0, o1, -0.2161766247570056391e-18, -0.1509491946179481940e-11, +0.6076691048812607898e-15));
  t = vmla_vd_vd_vd_vd(t, u, vsel_vd_vo_vo_d_d_d(o0, o1, +0.4695919173301598752e-17, +0.3827857177807173152e-10, -0.3007518609604893831e-13));
  t = vmla_vd_vd_vd_vd(t, u, vsel_vd_vo_vo_d_d_d(o0, o1, -0.9049140419888010819e-16, -0.6139733921558987241e-09, +0.9427906260824646063e-12));
  t = vmla_vd_vd_vd_vd(t, u, vsel_vd_vo_vo_d_d_d(o0, o1, +0.1634018903557411517e-14, +0.6985387934608038824e-08, -0.2100110908269393629e-10));
  t = vmla_vd_vd_vd_vd(t, u, vsel_vd_vo_vo_d_d_d(o0, o1, -0.2783485786333455216e-13, -0.5988224513034371474e-07, +0.3534639523461223473e-09));
  t = vmla_vd_vd_vd_vd(t, u, vsel_vd_vo_vo_d_d_d(o0, o1, +0.4463221276786412722e-12, +0.4005716952355346640e-06, -0.4664967728285395926e-08));
  t = vmla_vd_vd_vd_vd(t, u, vsel_vd_vo_vo_d_d_d(o0, o1, -0.6711366622850138987e-11, -0.2132190104575784400e-05, +0.4943823283769000532e-07));
  t = vmla_vd_vd_vd_vd(t, u, vsel_vd_vo_vo_d_d_d(o0, o1, +0.9422759050232658346e-10, +0.9092461304042630325e-05, -0.4271203394761148254e-06));
  t = vmla_vd_vd_vd_vd(t, u, vsel_vd_vo_vo_d_d_d(o0, o1, -0.1229055530100228477e-08, -0.3079188080966205457e-04, +0.3034067677404915895e-05));
  t = vmla_vd_vd_vd_vd(t, u, vsel_vd_vo_vo_d_d_d(o0, o1, +0.1480719281585085023e-07, +0.7971413443082370762e-04, -0.1776295289066871135e-04));
  t = vmla_vd_vd_vd_vd(t, u, vsel_vd_vo_vo_d_d_d(o0, o1, -0.1636584469123402714e-06, -0.1387853215225442864e-03, +0.8524547630559505050e-04));
  t = vmla_vd_vd_vd_vd(t, u, vsel_vd_vo_vo_d_d_d(o0, o1, +0.1646211436588923363e-05, +0.6469678026257590965e-04, -0.3290582944961784398e-03));
  t = vmla_vd_vd_vd_vd(t, u, vsel_vd_vo_vo_d_d_d(o0, o1, -0.1492565035840624866e-04, +0.4996645280372945860e-03, +0.9696966068789101157e-03));
  t = vmla_vd_vd_vd_vd(t, u, vsel_vd_vo_vo_d_d_d(o0, o1, +0.1205533298178966496e-03, -0.1622802482842520535e-02, -0.1812527628046986137e-02));
  t = vmla_vd_vd_vd_vd(t, u, vsel_vd_vo_vo_d_d_d(o0, o1, -0.8548327023450851166e-03, +0.1615320557049377171e-03, -0.4725409828123619017e-03));
  t = vmla_vd_vd_vd_vd(t, u, vsel_vd_vo_vo_d_d_d(o0, o1, +0.5223977625442188799e-02, +0.1915262325574875607e-01, +0.2090315427924229266e-01));
  t = vmla_vd_vd_vd_vd(t, u, vsel_vd_vo_vo_d_d_d(o0, o1, -0.2686617064513125569e-01, -0.1027818298486033455e+00, -0.1052041921842776645e+00));
  t = vmla_vd_vd_vd_vd(t, u, vsel_vd_vo_vo_d_d_d(o0, o1, +0.1128379167095512753e+00, -0.6366172819842503827e+00, -0.6345351808766568347e+00));
  t = vmla_vd_vd_vd_vd(t, u, vsel_vd_vo_vo_d_d_d(o0, o1, -0.3761263890318375380e+00, -0.1128379590648910469e+01, -0.1129442929103524396e+01));

  d = ddmul_vd2_vd_vd(t, u);
  // Leading term added in double-double: 2/sqrt(pi) for the small regime.
  d = ddadd2_vd2_vd2_vd2(d, vcast_vd2_vd_vd(vsel_vd_vo_vo_d_d_d(o0, o1, 1.1283791670955125586, 3.4110644736196137587e-08, 0.00024963035690526438285),
                                            vsel_vd_vo_vo_d_d_d(o0, o1,
                                                                1.5335459613165822674e-17, -2.4875650708323294246e-24, -5.4362665034856259795e-21)));

  // Small regime: erf = d * a.  Large regimes: erf = 1 - exp(d).
  d = vsel_vd2_vo_vd2_vd2(o0, ddmul_vd2_vd2_vd(d, a), ddadd_vd2_vd_vd2(vcast_vd_d(1.0), ddneg_vd2_vd2(expk2(d))));
  u = vmulsign_vd_vd_vd(vsel_vd_vo_vd_vd(o2, vadd_vd_vd_vd(vd2getx_vd_vd2(d), vd2gety_vd_vd2(d)), vcast_vd_d(1)), s);
  u = vsel_vd_vo_vd_vd(visnan_vo_vd(a), vcast_vd_d(SLEEF_NAN), u);

  return u;
}

/* TODO AArch64: potential optimization by using `vfmad_lane_f64` */
// erfc(a) with <= 1.5 ulp.  Four regimes by |a| (<1, <2.2, <4.2, <27.3);
// the argument u is a*a, a, or 1/a depending on the regime, and the large
// regimes finish through expk2.  Beyond 27.3 erfc underflows to 0 (or 2
// for negative input).
EXPORT CONST VECTOR_CC vdouble xerfc_u15(vdouble a) {
  vdouble s = a, r = vcast_vd_d(0), t;
  vdouble2 u, d, x;
  a = vabs_vd_vd(a);
  vopmask o0 = vlt_vo_vd_vd(a, vcast_vd_d(1.0));
  vopmask o1 = vlt_vo_vd_vd(a, vcast_vd_d(2.2));
  vopmask o2 = vlt_vo_vd_vd(a, vcast_vd_d(4.2));
  vopmask o3 = vlt_vo_vd_vd(a, vcast_vd_d(27.3));
  u = vsel_vd2_vo_vd2_vd2(o0, ddmul_vd2_vd_vd(a, a), vsel_vd2_vo_vd2_vd2(o1, vcast_vd2_vd_vd(a, vcast_vd_d(0)), dddiv_vd2_vd2_vd2(vcast_vd2_d_d(1, 0), vcast_vd2_vd_vd(a, vcast_vd_d(0)))));

  t = vsel_vd_vo_vo_vo_d_d_d_d(o0, o1, o2, +0.6801072401395386139e-20, +0.3438010341362585303e-12, -0.5757819536420710449e+2, +0.2334249729638701319e+5);
  t = vmla_vd_vd_vd_vd(t, vd2getx_vd_vd2(u), vsel_vd_vo_vo_vo_d_d_d_d(o0, o1, o2, -0.2161766247570055669e-18, -0.1237021188160598264e-10, +0.4669289654498104483e+3, -0.4695661044933107769e+5));
  t = vmla_vd_vd_vd_vd(t, vd2getx_vd_vd2(u), vsel_vd_vo_vo_vo_d_d_d_d(o0, o1, o2, +0.4695919173301595670e-17, +0.2117985839877627852e-09, -0.1796329879461355858e+4, +0.3173403108748643353e+5));
  t = vmla_vd_vd_vd_vd(t, vd2getx_vd_vd2(u), vsel_vd_vo_vo_vo_d_d_d_d(o0, o1, o2, -0.9049140419888007122e-16, -0.2290560929177369506e-08, +0.4355892193699575728e+4, +0.3242982786959573787e+4));
  t = vmla_vd_vd_vd_vd(t, vd2getx_vd_vd2(u), vsel_vd_vo_vo_vo_d_d_d_d(o0, o1, o2, +0.1634018903557410728e-14, +0.1748931621698149538e-07, -0.7456258884965764992e+4, -0.2014717999760347811e+5));
  t = vmla_vd_vd_vd_vd(t, vd2getx_vd_vd2(u), vsel_vd_vo_vo_vo_d_d_d_d(o0, o1, o2, -0.2783485786333451745e-13,
                                                                      // (continuation of xerfc_u15's Horner pass)
                                                                      -0.9956602606623249195e-07, +0.9553977358167021521e+4, +0.1554006970967118286e+5));
  t = vmla_vd_vd_vd_vd(t, vd2getx_vd_vd2(u), vsel_vd_vo_vo_vo_d_d_d_d(o0, o1, o2, +0.4463221276786415752e-12, +0.4330010240640327080e-06, -0.9470019905444229153e+4, -0.6150874190563554293e+4));
  t = vmla_vd_vd_vd_vd(t, vd2getx_vd_vd2(u), vsel_vd_vo_vo_vo_d_d_d_d(o0, o1, o2, -0.6711366622850136563e-11, -0.1435050600991763331e-05, +0.7387344321849855078e+4, +0.1240047765634815732e+4));
  t = vmla_vd_vd_vd_vd(t, vd2getx_vd_vd2(u), vsel_vd_vo_vo_vo_d_d_d_d(o0, o1, o2, +0.9422759050232662223e-10, +0.3460139479650695662e-05, -0.4557713054166382790e+4, -0.8210325475752699731e+2));
  t = vmla_vd_vd_vd_vd(t, vd2getx_vd_vd2(u), vsel_vd_vo_vo_vo_d_d_d_d(o0, o1, o2, -0.1229055530100229098e-08, -0.4988908180632898173e-05, +0.2207866967354055305e+4, +0.3242443880839930870e+2));
  t = vmla_vd_vd_vd_vd(t, vd2getx_vd_vd2(u), vsel_vd_vo_vo_vo_d_d_d_d(o0, o1, o2, +0.1480719281585086512e-07, -0.1308775976326352012e-05, -0.8217975658621754746e+3, -0.2923418863833160586e+2));
  t = vmla_vd_vd_vd_vd(t, vd2getx_vd_vd2(u), vsel_vd_vo_vo_vo_d_d_d_d(o0, o1, o2, -0.1636584469123399803e-06, +0.2825086540850310103e-04, +0.2268659483507917400e+3, +0.3457461732814383071e+0));
  t = vmla_vd_vd_vd_vd(t, vd2getx_vd_vd2(u), vsel_vd_vo_vo_vo_d_d_d_d(o0, o1, o2, +0.1646211436588923575e-05, -0.6393913713069986071e-04, -0.4633361260318560682e+2, +0.5489730155952392998e+1));
  t = vmla_vd_vd_vd_vd(t, vd2getx_vd_vd2(u), vsel_vd_vo_vo_vo_d_d_d_d(o0, o1, o2, -0.1492565035840623511e-04, -0.2566436514695078926e-04, +0.9557380123733945965e+1, +0.1559934132251294134e-2));
  t = vmla_vd_vd_vd_vd(t, vd2getx_vd_vd2(u), vsel_vd_vo_vo_vo_d_d_d_d(o0, o1, o2, +0.1205533298178967851e-03, +0.5895792375659440364e-03, -0.2958429331939661289e+1, -0.1541741566831520638e+1));
  t = vmla_vd_vd_vd_vd(t, vd2getx_vd_vd2(u), vsel_vd_vo_vo_vo_d_d_d_d(o0, o1, o2, -0.8548327023450850081e-03, -0.1695715579163588598e-02, +0.1670329508092765480e+0, +0.2823152230558364186e-5));
  t = vmla_vd_vd_vd_vd(t, vd2getx_vd_vd2(u), vsel_vd_vo_vo_vo_d_d_d_d(o0, o1, o2, +0.5223977625442187932e-02, +0.2089116434918055149e-03, +0.6096615680115419211e+0, +0.6249999184195342838e+0));
  t = vmla_vd_vd_vd_vd(t, vd2getx_vd_vd2(u), vsel_vd_vo_vo_vo_d_d_d_d(o0, o1, o2, -0.2686617064513125222e-01, +0.1912855949584917753e-01, +0.1059212443193543585e-2, +0.1741749416408701288e-8));

  // Three more double-double Horner steps with regime-dependent constants.
  d = ddmul_vd2_vd2_vd(u, t);
  d = ddadd2_vd2_vd2_vd2(d, vcast_vd2_vd_vd(vsel_vd_vo_vo_vo_d_d_d_d(o0, o1, o2, 0.11283791670955126141, -0.10277263343147646779, -0.50005180473999022439, -0.5000000000258444377),
                                            vsel_vd_vo_vo_vo_d_d_d_d(o0, o1, o2, -4.0175691625932118483e-18, -6.2338714083404900225e-18, 2.6362140569041995803e-17, -4.0074044712386992281e-17)));
  d = ddmul_vd2_vd2_vd2(d, u);
  d = ddadd2_vd2_vd2_vd2(d, vcast_vd2_vd_vd(vsel_vd_vo_vo_vo_d_d_d_d(o0, o1, o2, -0.37612638903183753802, -0.63661976742916359662, 1.601106273924963368e-06, 2.3761973137523364792e-13),
                                            vsel_vd_vo_vo_vo_d_d_d_d(o0, o1, o2, 1.3391897206042552387e-17, 7.6321019159085724662e-18, 1.1974001857764476775e-23, -1.1670076950531026582e-29)));
  d = ddmul_vd2_vd2_vd2(d, u);
  d = ddadd2_vd2_vd2_vd2(d, vcast_vd2_vd_vd(vsel_vd_vo_vo_vo_d_d_d_d(o0, o1, o2, 1.1283791670955125586, -1.1283791674717296161, -0.57236496645145429341, -0.57236494292470108114),
                                            vsel_vd_vo_vo_vo_d_d_d_d(o0, o1, o2, 1.5335459613165822674e-17, 8.0896847755965377194e-17, 3.0704553245872027258e-17, -2.3984352208056898003e-17)));

  // Regime assembly: |a| < 1 uses 1 - d*a; larger regimes go through expk2.
  x = ddmul_vd2_vd2_vd(vsel_vd2_vo_vd2_vd2(o1, d, vcast_vd2_vd_vd(vneg_vd_vd(a), vcast_vd_d(0))), a);
  x = vsel_vd2_vo_vd2_vd2(o1, x, ddadd2_vd2_vd2_vd2(x, d));

  x = vsel_vd2_vo_vd2_vd2(o0, ddsub_vd2_vd2_vd2(vcast_vd2_d_d(1, 0), x), expk2(x));
  x = vsel_vd2_vo_vd2_vd2(o1, x, ddmul_vd2_vd2_vd2(x, u));

  // |a| >= 27.3 underflows to 0; erfc(-a) = 2 - erfc(a); NaN propagates.
  r = vsel_vd_vo_vd_vd(o3, vadd_vd_vd_vd(vd2getx_vd_vd2(x), vd2gety_vd_vd2(x)), vcast_vd_d(0));
  r = vsel_vd_vo_vd_vd(vsignbit_vo_vd(s), vsub_vd_vd_vd(vcast_vd_d(2), r), r);
  r = vsel_vd_vo_vd_vd(visnan_vo_vd(s), vcast_vd_d(SLEEF_NAN), r);
  return r;
}
#endif // #if !defined(DETERMINISTIC)

#if !defined(DETERMINISTIC) && !defined(ENABLE_GNUABI) && !defined(SLEEF_GENHEADER)

// The normal and deterministic versions of implementations are common
// for functions like sincospi_u05.  Aliases are defined by the DALIAS_*
// macros for such functions.  The defined aliases (e.g. ysincospi_u05)
// are renamed (e.g. to Sleef_cinz_sincospid2_u05sse2) by rename*.h.

#ifdef ENABLE_ALIAS
// With compiler alias support, y-prefixed names are true symbol aliases.
#define DALIAS_vd_vd(FUNC) EXPORT CONST VECTOR_CC vdouble y ## FUNC(vdouble) __attribute__((alias( stringify(x ## FUNC) )));
#define DALIAS_vd2_vd(FUNC) EXPORT CONST VECTOR_CC vdouble2 y ## FUNC(vdouble) __attribute__((alias( stringify(x ## FUNC) )));
#define DALIAS_vi_vd(FUNC) EXPORT CONST VECTOR_CC vint y ## FUNC(vdouble) __attribute__((alias( stringify(x ## FUNC) )));
#define DALIAS_vd_vd_vd(FUNC) EXPORT CONST VECTOR_CC vdouble y ## FUNC(vdouble, vdouble) __attribute__((alias( stringify(x ## FUNC) )));
#define DALIAS_vd_vd_vd_vd(FUNC) EXPORT CONST VECTOR_CC vdouble y ## FUNC(vdouble, vdouble, vdouble) __attribute__((alias( stringify(x ## FUNC) )));
#else
// Otherwise, they are thin forwarding wrappers.
#define DALIAS_vd_vd(FUNC) EXPORT CONST VECTOR_CC vdouble y ## FUNC(vdouble d) { return x ## FUNC (d); }
#define DALIAS_vd2_vd(FUNC) EXPORT CONST VECTOR_CC vdouble2 y ## FUNC(vdouble d) { return x ## FUNC (d); }
#define DALIAS_vi_vd(FUNC) EXPORT CONST VECTOR_CC vint y ## FUNC(vdouble d) { return x ## FUNC (d); }
#define DALIAS_vd_vd_vd(FUNC) EXPORT CONST VECTOR_CC vdouble y ## FUNC(vdouble x, vdouble y) { return x ## FUNC (x, y); }
#define DALIAS_vd_vd_vd_vd(FUNC) EXPORT CONST VECTOR_CC vdouble y ## FUNC(vdouble x, vdouble y, vdouble z) { return x ## FUNC (x, y, z); }
#endif

/* DALIAS_vd2_vd(sincospi_u05) */
/* DALIAS_vd2_vd(sincospi_u35) */
/* DALIAS_vd2_vd(modf) */
/* DALIAS_vd_vd(log) */
/* DALIAS_vd_vd(log_u1) */
/* DALIAS_vd_vd_vd(pow) */
/* DALIAS_vd_vd(sinh) */
/* DALIAS_vd_vd(cosh) */
/* DALIAS_vd_vd(tanh) */
/* DALIAS_vd_vd(sinh_u35) */
/*
DALIAS_vd_vd(cosh_u35) */ /* DALIAS_vd_vd(tanh_u35) */ /* DALIAS_vd_vd(asinh) */ /* DALIAS_vd_vd(acosh) */ /* DALIAS_vd_vd(atanh) */ /* DALIAS_vd_vd(cbrt) */ /* DALIAS_vd_vd(cbrt_u1) */ /* DALIAS_vd_vd(expm1) */ /* DALIAS_vd_vd(log10) */ /* DALIAS_vd_vd(log2) */ /* DALIAS_vd_vd(log2_u35) */ /* DALIAS_vd_vd(log1p) */ /* DALIAS_vd_vd(fabs) */ /* DALIAS_vd_vd_vd(copysign) */ /* DALIAS_vd_vd_vd(fmax) */ /* DALIAS_vd_vd_vd(fmin) */ /* DALIAS_vd_vd_vd(fdim) */ /* DALIAS_vd_vd(trunc) */ /* DALIAS_vd_vd(floor) */ /* DALIAS_vd_vd(ceil) */ /* DALIAS_vd_vd(round) */ /* DALIAS_vd_vd(rint) */ /* DALIAS_vd_vd_vd(nextafter) */ /* DALIAS_vd_vd(frfrexp) */ /* DALIAS_vi_vd(expfrexp) */ /* DALIAS_vd_vd_vd_vd(fma) */ /* DALIAS_vd_vd(sqrt_u05) */ /* DALIAS_vd_vd(sqrt_u35) */ /* DALIAS_vd_vd_vd(hypot_u05) */ /* DALIAS_vd_vd_vd(hypot_u35) */ /* DALIAS_vd_vd_vd(fmod) */ /* DALIAS_vd_vd_vd(remainder) */ /* DALIAS_vd_vd(tgamma_u1) */ /* DALIAS_vd_vd(lgamma_u1) */ /* DALIAS_vd_vd(erf_u1) */ /* DALIAS_vd_vd(erfc_u15) */ #endif // #if !defined(DETERMINISTIC) && !defined(ENABLE_GNUABI) && !defined(SLEEF_GENHEADER) #if !defined(ENABLE_GNUABI) && !defined(SLEEF_GENHEADER)
/* Runtime introspection hooks: xgetInt reports ISA availability for query ids 1..10 via vavailability_i; xgetPtr(0) returns the ISA name string (ISANAME), anything else yields a null pointer. */
EXPORT CONST int xgetInt(int name) { if (1 <= name && name <= 10) return vavailability_i(name); return 0; } EXPORT CONST void *xgetPtr(int name) { if (name == 0) return ISANAME; return (void *)0; } #endif #if defined(ALIAS_NO_EXT_SUFFIX) && !defined(DETERMINISTIC) #include ALIAS_NO_EXT_SUFFIX #endif
/* Ad-hoc manual test driver; only built when ENABLE_MAIN is defined (see the gcc command in the comment below). NOTE(review): the three parameterless #include directives inside this section lost their <...> targets during extraction (presumably stdio.h/stdlib.h/math.h -- confirm against upstream SLEEF). */
#ifdef ENABLE_MAIN // gcc -DENABLE_MAIN -Wno-attributes -I../common -I../arch -DENABLE_AVX2 -mavx2 -mfma sleefsimddp.c rempitab.c ../common/common.c -lm #include #include #include int main(int argc, char **argv) { vdouble d1 = vcast_vd_d(atof(argv[1])); vdouble d2 = vcast_vd_d(atof(argv[2])); //vdouble d3 = vcast_vd_d(atof(argv[3])); //vdouble r = xnextafter(d1, d2); //int i; //double fr = frexp(atof(argv[1]), &i); //printf("%.20g\n", xfma(d1, d2, d3)[0]);; //printf("test %.20g\n", xtgamma_u1(d1)[0]); //printf("corr %.20g\n", 
tgamma(d1[0])); //printf("test %.20g\n", xerf_u1(d1)[0]); //printf("corr %.20g\n", erf(d1[0])); //printf("test %.20g\n", xerfc_u15(d1)[0]); //printf("corr %.20g\n", erfc(d1[0])); //printf("%.20g\n", nextafter(d1[0], d2[0]));; //printf("%.20g\n", vcast_d_vd(xhypot_u05(d1, d2))); //printf("%.20g\n", fr); printf("%.20g\n", fmod(atof(argv[1]), atof(argv[2]))); printf("%.20g\n", xfmod(d1, d2)[0]); //vdouble2 r = xsincospi_u35(a); //printf("%g, %g\n", vcast_d_vd(r.x), vcast_d_vd(r.y)); } #endif #ifdef ENABLE_GNUABI /* "finite" aliases for compatibility with GLIBC */ EXPORT CONST VECTOR_CC vdouble __acos_finite (vdouble) __attribute__((weak, alias(str_xacos ))); EXPORT CONST VECTOR_CC vdouble __acosh_finite (vdouble) __attribute__((weak, alias(str_xacosh ))); EXPORT CONST VECTOR_CC vdouble __asin_finite (vdouble) __attribute__((weak, alias(str_xasin_u1 ))); EXPORT CONST VECTOR_CC vdouble __atan2_finite (vdouble, vdouble) __attribute__((weak, alias(str_xatan2_u1 ))); EXPORT CONST VECTOR_CC vdouble __atanh_finite (vdouble) __attribute__((weak, alias(str_xatanh ))); EXPORT CONST VECTOR_CC vdouble __cosh_finite (vdouble) __attribute__((weak, alias(str_xcosh ))); EXPORT CONST VECTOR_CC vdouble __exp10_finite (vdouble) __attribute__((weak, alias(str_xexp10 ))); EXPORT CONST VECTOR_CC vdouble __exp2_finite (vdouble) __attribute__((weak, alias(str_xexp2 ))); EXPORT CONST VECTOR_CC vdouble __exp_finite (vdouble) __attribute__((weak, alias(str_xexp ))); EXPORT CONST VECTOR_CC vdouble __fmod_finite (vdouble, vdouble) __attribute__((weak, alias(str_xfmod ))); EXPORT CONST VECTOR_CC vdouble __remainder_finite(vdouble, vdouble) __attribute__((weak, alias(str_xremainder))); EXPORT CONST VECTOR_CC vdouble __modf_finite (vdouble, vdouble *) __attribute__((weak, alias(str_xmodf ))); EXPORT CONST VECTOR_CC vdouble __hypot_u05_finite(vdouble, vdouble) __attribute__((weak, alias(str_xhypot_u05))); EXPORT CONST VECTOR_CC vdouble __lgamma_u1_finite(vdouble) __attribute__((weak, 
alias(str_xlgamma_u1))); EXPORT CONST VECTOR_CC vdouble __log10_finite (vdouble) __attribute__((weak, alias(str_xlog10 ))); EXPORT CONST VECTOR_CC vdouble __log_finite (vdouble) __attribute__((weak, alias(str_xlog_u1 ))); EXPORT CONST VECTOR_CC vdouble __pow_finite (vdouble, vdouble) __attribute__((weak, alias(str_xpow ))); EXPORT CONST VECTOR_CC vdouble __sinh_finite (vdouble) __attribute__((weak, alias(str_xsinh ))); EXPORT CONST VECTOR_CC vdouble __sqrt_finite (vdouble) __attribute__((weak, alias(str_xsqrt ))); EXPORT CONST VECTOR_CC vdouble __tgamma_u1_finite(vdouble) __attribute__((weak, alias(str_xtgamma_u1))); #ifdef HEADER_MASKED #include HEADER_MASKED #endif #endif /* #ifdef ENABLE_GNUABI */ ================================================ FILE: src/sleefsimddp_emulation.c ================================================ /* Copyright (c) 2021 Agenium Scale Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
*/ /* NOTE(review): the parameterless #include directive below lost its <...> header name during extraction -- restore from upstream (likely nsimd/nsimd.h). */ #include #ifdef ENABLE_NEON32 #include "renameneon32.h" #define nsimd_vec_f64 nsimd_neon128_vf64 #endif #ifdef ENABLE_VSX #include "renamevsx.h" #define nsimd_vec_f64 nsimd_vmx_vf64 #endif
/* Each wrapper below bridges a SLEEF x* double-precision entry point to nsimd's scalar CPU kernel: copy the two f64 lanes (v0/v1) into an nsimd_cpu_vf64, call nsimd_<op>_cpu_f64, and copy the lanes back out. Used on targets (NEON32, VSX) where the native SLEEF f64 kernels are emulated. */
nsimd_vec_f64 xsin(nsimd_vec_f64 a0_) { nsimd_vec_f64 ret_; nsimd_cpu_vf64 a0, ret; a0.v0 = a0_.v0; a0.v1 = a0_.v1; ret = nsimd_sin_u35_cpu_f64(a0); ret_.v0 = ret.v0; ret_.v1 = ret.v1; return ret_; } nsimd_vec_f64 xcos(nsimd_vec_f64 a0_) { nsimd_vec_f64 ret_; nsimd_cpu_vf64 a0, ret; a0.v0 = a0_.v0; a0.v1 = a0_.v1; ret = nsimd_cos_u35_cpu_f64(a0); ret_.v0 = ret.v0; ret_.v1 = ret.v1; return ret_; } nsimd_vec_f64 xtan(nsimd_vec_f64 a0_) { nsimd_vec_f64 ret_; nsimd_cpu_vf64 a0, ret; a0.v0 = a0_.v0; a0.v1 = a0_.v1; ret = nsimd_tan_u35_cpu_f64(a0); ret_.v0 = ret.v0; ret_.v1 = ret.v1; return ret_; } nsimd_vec_f64 xasin(nsimd_vec_f64 a0_) { nsimd_vec_f64 ret_; nsimd_cpu_vf64 a0, ret; a0.v0 = a0_.v0; a0.v1 = a0_.v1; ret = nsimd_asin_u35_cpu_f64(a0); ret_.v0 = ret.v0; ret_.v1 = ret.v1; return ret_; } nsimd_vec_f64 xacos(nsimd_vec_f64 a0_) { nsimd_vec_f64 ret_; nsimd_cpu_vf64 a0, ret; a0.v0 = a0_.v0; a0.v1 = a0_.v1; ret = nsimd_acos_u35_cpu_f64(a0); ret_.v0 = ret.v0; ret_.v1 = ret.v1; return ret_; } nsimd_vec_f64 xatan(nsimd_vec_f64 a0_) { nsimd_vec_f64 ret_; nsimd_cpu_vf64 a0, ret; a0.v0 = a0_.v0; a0.v1 = a0_.v1; ret = nsimd_atan_u35_cpu_f64(a0); ret_.v0 = ret.v0; ret_.v1 = ret.v1; return ret_; } nsimd_vec_f64 xatan2(nsimd_vec_f64 a0_, nsimd_vec_f64 a1_) { nsimd_vec_f64 ret_; nsimd_cpu_vf64 a0, a1, ret; a0.v0 = a0_.v0; a0.v1 = a0_.v1; a1.v0 = a1_.v0; a1.v1 = a1_.v1; ret = nsimd_atan2_u35_cpu_f64(a0, a1); ret_.v0 = ret.v0; ret_.v1 = ret.v1; return ret_; } nsimd_vec_f64 xlog(nsimd_vec_f64 a0_) { nsimd_vec_f64 ret_; nsimd_cpu_vf64 a0, ret; a0.v0 = a0_.v0; a0.v1 = a0_.v1; ret = nsimd_log_u35_cpu_f64(a0); ret_.v0 = ret.v0; ret_.v1 = ret.v1; return ret_; } nsimd_vec_f64 xcbrt(nsimd_vec_f64 a0_) { nsimd_vec_f64 ret_; nsimd_cpu_vf64 a0, ret; a0.v0 = a0_.v0; a0.v1 = a0_.v1; ret = 
nsimd_cbrt_u35_cpu_f64(a0); ret_.v0 = ret.v0; ret_.v1 = ret.v1; return ret_; } nsimd_vec_f64 xsin_u1(nsimd_vec_f64 a0_) { nsimd_vec_f64 ret_; nsimd_cpu_vf64 a0, ret; a0.v0 = a0_.v0; a0.v1 = a0_.v1; ret = nsimd_sin_u10_cpu_f64(a0); ret_.v0 = ret.v0; ret_.v1 = ret.v1; return ret_; } nsimd_vec_f64 xcos_u1(nsimd_vec_f64 a0_) { nsimd_vec_f64 ret_; nsimd_cpu_vf64 a0, ret; a0.v0 = a0_.v0; a0.v1 = a0_.v1; ret = nsimd_cos_u10_cpu_f64(a0); ret_.v0 = ret.v0; ret_.v1 = ret.v1; return ret_; } nsimd_vec_f64 xtan_u1(nsimd_vec_f64 a0_) { nsimd_vec_f64 ret_; nsimd_cpu_vf64 a0, ret; a0.v0 = a0_.v0; a0.v1 = a0_.v1; ret = nsimd_tan_u10_cpu_f64(a0); ret_.v0 = ret.v0; ret_.v1 = ret.v1; return ret_; } nsimd_vec_f64 xasin_u1(nsimd_vec_f64 a0_) { nsimd_vec_f64 ret_; nsimd_cpu_vf64 a0, ret; a0.v0 = a0_.v0; a0.v1 = a0_.v1; ret = nsimd_asin_u10_cpu_f64(a0); ret_.v0 = ret.v0; ret_.v1 = ret.v1; return ret_; } nsimd_vec_f64 xacos_u1(nsimd_vec_f64 a0_) { nsimd_vec_f64 ret_; nsimd_cpu_vf64 a0, ret; a0.v0 = a0_.v0; a0.v1 = a0_.v1; ret = nsimd_acos_u10_cpu_f64(a0); ret_.v0 = ret.v0; ret_.v1 = ret.v1; return ret_; } nsimd_vec_f64 xatan_u1(nsimd_vec_f64 a0_) { nsimd_vec_f64 ret_; nsimd_cpu_vf64 a0, ret; a0.v0 = a0_.v0; a0.v1 = a0_.v1; ret = nsimd_atan_u10_cpu_f64(a0); ret_.v0 = ret.v0; ret_.v1 = ret.v1; return ret_; } nsimd_vec_f64 xatan2_u1(nsimd_vec_f64 a0_, nsimd_vec_f64 a1_) { nsimd_vec_f64 ret_; nsimd_cpu_vf64 a0, a1, ret; a0.v0 = a0_.v0; a0.v1 = a0_.v1; a1.v0 = a1_.v0; a1.v1 = a1_.v1; ret = nsimd_atan2_u10_cpu_f64(a0, a1); ret_.v0 = ret.v0; ret_.v1 = ret.v1; return ret_; } nsimd_vec_f64 xlog_u1(nsimd_vec_f64 a0_) { nsimd_vec_f64 ret_; nsimd_cpu_vf64 a0, ret; a0.v0 = a0_.v0; a0.v1 = a0_.v1; ret = nsimd_log_u10_cpu_f64(a0); ret_.v0 = ret.v0; ret_.v1 = ret.v1; return ret_; } nsimd_vec_f64 xcbrt_u1(nsimd_vec_f64 a0_) { nsimd_vec_f64 ret_; nsimd_cpu_vf64 a0, ret; a0.v0 = a0_.v0; a0.v1 = a0_.v1; ret = nsimd_cbrt_u10_cpu_f64(a0); ret_.v0 = ret.v0; ret_.v1 = ret.v1; return ret_; } nsimd_vec_f64 
xexp(nsimd_vec_f64 a0_) { nsimd_vec_f64 ret_; nsimd_cpu_vf64 a0, ret; a0.v0 = a0_.v0; a0.v1 = a0_.v1; ret = nsimd_exp_u10_cpu_f64(a0); ret_.v0 = ret.v0; ret_.v1 = ret.v1; return ret_; } nsimd_vec_f64 xpow(nsimd_vec_f64 a0_, nsimd_vec_f64 a1_) { nsimd_vec_f64 ret_; nsimd_cpu_vf64 a0, a1, ret; a0.v0 = a0_.v0; a0.v1 = a0_.v1; a1.v0 = a1_.v0; a1.v1 = a1_.v1; ret = nsimd_pow_u10_cpu_f64(a0, a1); ret_.v0 = ret.v0; ret_.v1 = ret.v1; return ret_; } nsimd_vec_f64 xsinh(nsimd_vec_f64 a0_) { nsimd_vec_f64 ret_; nsimd_cpu_vf64 a0, ret; a0.v0 = a0_.v0; a0.v1 = a0_.v1; ret = nsimd_sinh_u10_cpu_f64(a0); ret_.v0 = ret.v0; ret_.v1 = ret.v1; return ret_; } nsimd_vec_f64 xcosh(nsimd_vec_f64 a0_) { nsimd_vec_f64 ret_; nsimd_cpu_vf64 a0, ret; a0.v0 = a0_.v0; a0.v1 = a0_.v1; ret = nsimd_cosh_u10_cpu_f64(a0); ret_.v0 = ret.v0; ret_.v1 = ret.v1; return ret_; } nsimd_vec_f64 xtanh(nsimd_vec_f64 a0_) { nsimd_vec_f64 ret_; nsimd_cpu_vf64 a0, ret; a0.v0 = a0_.v0; a0.v1 = a0_.v1; ret = nsimd_tanh_u10_cpu_f64(a0); ret_.v0 = ret.v0; ret_.v1 = ret.v1; return ret_; } nsimd_vec_f64 xsinh_u35(nsimd_vec_f64 a0_) { nsimd_vec_f64 ret_; nsimd_cpu_vf64 a0, ret; a0.v0 = a0_.v0; a0.v1 = a0_.v1; ret = nsimd_sinh_u35_cpu_f64(a0); ret_.v0 = ret.v0; ret_.v1 = ret.v1; return ret_; } nsimd_vec_f64 xcosh_u35(nsimd_vec_f64 a0_) { nsimd_vec_f64 ret_; nsimd_cpu_vf64 a0, ret; a0.v0 = a0_.v0; a0.v1 = a0_.v1; ret = nsimd_cosh_u35_cpu_f64(a0); ret_.v0 = ret.v0; ret_.v1 = ret.v1; return ret_; } nsimd_vec_f64 xtanh_u35(nsimd_vec_f64 a0_) { nsimd_vec_f64 ret_; nsimd_cpu_vf64 a0, ret; a0.v0 = a0_.v0; a0.v1 = a0_.v1; ret = nsimd_tanh_u35_cpu_f64(a0); ret_.v0 = ret.v0; ret_.v1 = ret.v1; return ret_; } nsimd_vec_f64 xasinh(nsimd_vec_f64 a0_) { nsimd_vec_f64 ret_; nsimd_cpu_vf64 a0, ret; a0.v0 = a0_.v0; a0.v1 = a0_.v1; ret = nsimd_asinh_u10_cpu_f64(a0); ret_.v0 = ret.v0; ret_.v1 = ret.v1; return ret_; } nsimd_vec_f64 xacosh(nsimd_vec_f64 a0_) { nsimd_vec_f64 ret_; nsimd_cpu_vf64 a0, ret; a0.v0 = a0_.v0; a0.v1 = a0_.v1; ret = 
nsimd_acosh_u10_cpu_f64(a0); ret_.v0 = ret.v0; ret_.v1 = ret.v1; return ret_; } nsimd_vec_f64 xatanh(nsimd_vec_f64 a0_) { nsimd_vec_f64 ret_; nsimd_cpu_vf64 a0, ret; a0.v0 = a0_.v0; a0.v1 = a0_.v1; ret = nsimd_atanh_u10_cpu_f64(a0); ret_.v0 = ret.v0; ret_.v1 = ret.v1; return ret_; } nsimd_vec_f64 xexp2(nsimd_vec_f64 a0_) { nsimd_vec_f64 ret_; nsimd_cpu_vf64 a0, ret; a0.v0 = a0_.v0; a0.v1 = a0_.v1; ret = nsimd_exp2_u10_cpu_f64(a0); ret_.v0 = ret.v0; ret_.v1 = ret.v1; return ret_; } nsimd_vec_f64 xexp2_u35(nsimd_vec_f64 a0_) { nsimd_vec_f64 ret_; nsimd_cpu_vf64 a0, ret; a0.v0 = a0_.v0; a0.v1 = a0_.v1; ret = nsimd_exp2_u35_cpu_f64(a0); ret_.v0 = ret.v0; ret_.v1 = ret.v1; return ret_; } nsimd_vec_f64 xexp10(nsimd_vec_f64 a0_) { nsimd_vec_f64 ret_; nsimd_cpu_vf64 a0, ret; a0.v0 = a0_.v0; a0.v1 = a0_.v1; ret = nsimd_exp10_u10_cpu_f64(a0); ret_.v0 = ret.v0; ret_.v1 = ret.v1; return ret_; } nsimd_vec_f64 xexp10_u35(nsimd_vec_f64 a0_) { nsimd_vec_f64 ret_; nsimd_cpu_vf64 a0, ret; a0.v0 = a0_.v0; a0.v1 = a0_.v1; ret = nsimd_exp10_u35_cpu_f64(a0); ret_.v0 = ret.v0; ret_.v1 = ret.v1; return ret_; } nsimd_vec_f64 xexpm1(nsimd_vec_f64 a0_) { nsimd_vec_f64 ret_; nsimd_cpu_vf64 a0, ret; a0.v0 = a0_.v0; a0.v1 = a0_.v1; ret = nsimd_expm1_u10_cpu_f64(a0); ret_.v0 = ret.v0; ret_.v1 = ret.v1; return ret_; } nsimd_vec_f64 xlog10(nsimd_vec_f64 a0_) { nsimd_vec_f64 ret_; nsimd_cpu_vf64 a0, ret; a0.v0 = a0_.v0; a0.v1 = a0_.v1; ret = nsimd_log10_u10_cpu_f64(a0); ret_.v0 = ret.v0; ret_.v1 = ret.v1; return ret_; } nsimd_vec_f64 xlog2(nsimd_vec_f64 a0_) { nsimd_vec_f64 ret_; nsimd_cpu_vf64 a0, ret; a0.v0 = a0_.v0; a0.v1 = a0_.v1; ret = nsimd_log2_u10_cpu_f64(a0); ret_.v0 = ret.v0; ret_.v1 = ret.v1; return ret_; } nsimd_vec_f64 xlog2_u35(nsimd_vec_f64 a0_) { nsimd_vec_f64 ret_; nsimd_cpu_vf64 a0, ret; a0.v0 = a0_.v0; a0.v1 = a0_.v1; ret = nsimd_log2_u35_cpu_f64(a0); ret_.v0 = ret.v0; ret_.v1 = ret.v1; return ret_; } nsimd_vec_f64 xlog1p(nsimd_vec_f64 a0_) { nsimd_vec_f64 ret_; nsimd_cpu_vf64 
a0, ret; a0.v0 = a0_.v0; a0.v1 = a0_.v1; ret = nsimd_log1p_u10_cpu_f64(a0); ret_.v0 = ret.v0; ret_.v1 = ret.v1; return ret_; } nsimd_vec_f64 xsinpi_u05(nsimd_vec_f64 a0_) { nsimd_vec_f64 ret_; nsimd_cpu_vf64 a0, ret; a0.v0 = a0_.v0; a0.v1 = a0_.v1; ret = nsimd_sinpi_u05_cpu_f64(a0); ret_.v0 = ret.v0; ret_.v1 = ret.v1; return ret_; } nsimd_vec_f64 xcospi_u05(nsimd_vec_f64 a0_) { nsimd_vec_f64 ret_; nsimd_cpu_vf64 a0, ret; a0.v0 = a0_.v0; a0.v1 = a0_.v1; ret = nsimd_cospi_u05_cpu_f64(a0); ret_.v0 = ret.v0; ret_.v1 = ret.v1; return ret_; } nsimd_vec_f64 xhypot_u05(nsimd_vec_f64 a0_, nsimd_vec_f64 a1_) { nsimd_vec_f64 ret_; nsimd_cpu_vf64 a0, a1, ret; a0.v0 = a0_.v0; a0.v1 = a0_.v1; a1.v0 = a1_.v0; a1.v1 = a1_.v1; ret = nsimd_hypot_u05_cpu_f64(a0, a1); ret_.v0 = ret.v0; ret_.v1 = ret.v1; return ret_; } nsimd_vec_f64 xhypot_u35(nsimd_vec_f64 a0_, nsimd_vec_f64 a1_) { nsimd_vec_f64 ret_; nsimd_cpu_vf64 a0, a1, ret; a0.v0 = a0_.v0; a0.v1 = a0_.v1; a1.v0 = a1_.v0; a1.v1 = a1_.v1; ret = nsimd_hypot_u35_cpu_f64(a0, a1); ret_.v0 = ret.v0; ret_.v1 = ret.v1; return ret_; } nsimd_vec_f64 xfmod(nsimd_vec_f64 a0_, nsimd_vec_f64 a1_) { nsimd_vec_f64 ret_; nsimd_cpu_vf64 a0, a1, ret; a0.v0 = a0_.v0; a0.v1 = a0_.v1; a1.v0 = a1_.v0; a1.v1 = a1_.v1; ret = nsimd_fmod_cpu_f64(a0, a1); ret_.v0 = ret.v0; ret_.v1 = ret.v1; return ret_; } nsimd_vec_f64 xremainder(nsimd_vec_f64 a0_, nsimd_vec_f64 a1_) { nsimd_vec_f64 ret_; nsimd_cpu_vf64 a0, a1, ret; a0.v0 = a0_.v0; a0.v1 = a0_.v1; a1.v0 = a1_.v0; a1.v1 = a1_.v1; ret = nsimd_remainder_cpu_f64(a0, a1); ret_.v0 = ret.v0; ret_.v1 = ret.v1; return ret_; } nsimd_vec_f64 xlgamma_u1(nsimd_vec_f64 a0_) { nsimd_vec_f64 ret_; nsimd_cpu_vf64 a0, ret; a0.v0 = a0_.v0; a0.v1 = a0_.v1; ret = nsimd_lgamma_u10_cpu_f64(a0); ret_.v0 = ret.v0; ret_.v1 = ret.v1; return ret_; } nsimd_vec_f64 xtgamma_u1(nsimd_vec_f64 a0_) { nsimd_vec_f64 ret_; nsimd_cpu_vf64 a0, ret; a0.v0 = a0_.v0; a0.v1 = a0_.v1; ret = nsimd_tgamma_u10_cpu_f64(a0); ret_.v0 = ret.v0; ret_.v1 = 
ret.v1; return ret_; } nsimd_vec_f64 xerf_u1(nsimd_vec_f64 a0_) { nsimd_vec_f64 ret_; nsimd_cpu_vf64 a0, ret; a0.v0 = a0_.v0; a0.v1 = a0_.v1; ret = nsimd_erf_u10_cpu_f64(a0); ret_.v0 = ret.v0; ret_.v1 = ret.v1; return ret_; } nsimd_vec_f64 xerfc_u15(nsimd_vec_f64 a0_) { nsimd_vec_f64 ret_; nsimd_cpu_vf64 a0, ret; a0.v0 = a0_.v0; a0.v1 = a0_.v1; ret = nsimd_erfc_u15_cpu_f64(a0); ret_.v0 = ret.v0; ret_.v1 = ret.v1; return ret_; } ================================================ FILE: src/sleefsimdsp.c ================================================ // Copyright Naoki Shibata and contributors 2010 - 2020. // Distributed under the Boost Software License, Version 1.0. // (See accompanying file LICENSE.txt or copy at // http://www.boost.org/LICENSE_1_0.txt) // Always use -ffp-contract=off option to compile SLEEF. #if !defined(SLEEF_GENHEADER) #include #include #include #include #endif #include "misc.h"
/* NOTE(review): the four parameterless #include directives above lost their <...> header names during extraction; restore from upstream SLEEF (sleefsimdsp.c's standard headers). */
extern const float Sleef_rempitabsp[]; #define __SLEEFSIMDSP_C__ #if (defined(_MSC_VER)) #pragma fp_contract (off) #endif // Intel #ifdef ENABLE_SSE2 #define CONFIG 2 #if !defined(SLEEF_GENHEADER) #include "helpersse2.h" #else #include "macroonlySSE2.h" #endif #ifdef DORENAME #ifdef ENABLE_GNUABI #include "renamesse2_gnuabi.h" #else #include "renamesse2.h" #endif #endif #endif #ifdef ENABLE_SSE4 #define CONFIG 4 #if !defined(SLEEF_GENHEADER) #include "helpersse2.h" #else #include "macroonlySSE4.h" #endif #ifdef DORENAME #include "renamesse4.h" #endif #endif #ifdef ENABLE_AVX #define CONFIG 1 #if !defined(SLEEF_GENHEADER) #include "helperavx.h" #else #include "macroonlyAVX.h" #endif #ifdef DORENAME #ifdef ENABLE_GNUABI #include "renameavx_gnuabi.h" #else #include "renameavx.h" #endif #endif #endif #ifdef ENABLE_FMA4 #define CONFIG 4 #if !defined(SLEEF_GENHEADER) #include "helperavx.h" #else #include "macroonlyFMA4.h" #endif #ifdef DORENAME #ifdef ENABLE_GNUABI #include "renamefma4_gnuabi.h" #else #include "renamefma4.h" #endif #endif #endif #ifdef ENABLE_AVX2 #define 
CONFIG 1 #if !defined(SLEEF_GENHEADER) #include "helperavx2.h" #else #include "macroonlyAVX2.h" #endif #ifdef DORENAME #ifdef ENABLE_GNUABI #include "renameavx2_gnuabi.h" #else #include "renameavx2.h" #endif #endif #endif
/* Each ENABLE_* branch pins the per-ISA CONFIG value and pulls the matching helper header (normal build) or macroonly*.h header (SLEEF_GENHEADER generation mode), plus the rename*.h symbol-renaming header when DORENAME is set. */
#ifdef ENABLE_AVX2128 #define CONFIG 1 #if !defined(SLEEF_GENHEADER) #include "helperavx2_128.h" #else #include "macroonlyAVX2128.h" #endif #ifdef DORENAME #include "renameavx2128.h" #endif #endif #ifdef ENABLE_AVX512F #define CONFIG 1 #if !defined(SLEEF_GENHEADER) #include "helperavx512f.h" #else #include "macroonlyAVX512F.h" #endif #ifdef DORENAME #ifdef ENABLE_GNUABI #include "renameavx512f_gnuabi.h" #else #include "renameavx512f.h" #endif #endif #endif #ifdef ENABLE_AVX512FNOFMA #define CONFIG 2 #if !defined(SLEEF_GENHEADER) #include "helperavx512f.h" #else #include "macroonlyAVX512FNOFMA.h" #endif #ifdef DORENAME #include "renameavx512fnofma.h" #endif #endif // Arm #ifdef ENABLE_ADVSIMD #define CONFIG 1 #if !defined(SLEEF_GENHEADER) #include "helperadvsimd.h" #else #include "macroonlyADVSIMD.h" #endif #ifdef DORENAME #ifdef ENABLE_GNUABI #include "renameadvsimd_gnuabi.h" #else #include "renameadvsimd.h" #endif #endif #endif #ifdef ENABLE_ADVSIMDNOFMA #define CONFIG 2 #if !defined(SLEEF_GENHEADER) #include "helperadvsimd.h" #else #include "macroonlyADVSIMDNOFMA.h" #endif #ifdef DORENAME #include "renameadvsimdnofma.h" #endif #endif #ifdef ENABLE_NEON32 #define CONFIG 1 #if !defined(SLEEF_GENHEADER) #include "helperneon32.h" #endif #ifdef DORENAME #include "renameneon32.h" #endif #endif #ifdef ENABLE_NEON32VFPV4 #define CONFIG 4 #if !defined(SLEEF_GENHEADER) #include "helperneon32.h" #endif #ifdef DORENAME #include "renameneon32vfpv4.h" #endif #endif #ifdef ENABLE_SVE #define CONFIG 1 #if !defined(SLEEF_GENHEADER) #include "helpersve.h" #else #include "macroonlySVE.h" #endif #ifdef DORENAME #ifdef ENABLE_GNUABI #include "renamesve_gnuabi.h" #else #include "renamesve.h" #endif /* ENABLE_GNUABI */ #endif /* DORENAME */ #endif /* ENABLE_SVE */ 
#ifdef ENABLE_SVENOFMA #define CONFIG 2 #if !defined(SLEEF_GENHEADER) #include "helpersve.h" #else #include "macroonlySVENOFMA.h" #endif #ifdef DORENAME #include "renamesvenofma.h" #endif /* DORENAME */ #endif /* ENABLE_SVE */ /* PowerPC, s390x and generic fallbacks follow the same CONFIG/helper/rename pattern. */ // IBM #ifdef ENABLE_VSX #define CONFIG 1 #if !defined(SLEEF_GENHEADER) #include "helperpower_128.h" #else #include "macroonlyVSX.h" #endif #ifdef DORENAME #include "renamevsx.h" #endif #endif #ifdef ENABLE_VSXNOFMA #define CONFIG 2 #if !defined(SLEEF_GENHEADER) #include "helperpower_128.h" #else #include "macroonlyVSXNOFMA.h" #endif #ifdef DORENAME #include "renamevsxnofma.h" #endif #endif #ifdef ENABLE_ZVECTOR2 #define CONFIG 140 #if !defined(SLEEF_GENHEADER) #include "helpers390x_128.h" #else #include "macroonlyZVECTOR2.h" #endif #ifdef DORENAME #include "renamezvector2.h" #endif #endif #ifdef ENABLE_ZVECTOR2NOFMA #define CONFIG 141 #if !defined(SLEEF_GENHEADER) #include "helpers390x_128.h" #else #include "macroonlyZVECTOR2NOFMA.h" #endif #ifdef DORENAME #include "renamezvector2nofma.h" #endif #endif // Generic #ifdef ENABLE_VECEXT #define CONFIG 1 #if !defined(SLEEF_GENHEADER) #include "helpervecext.h" #endif #ifdef DORENAME #include "renamevecext.h" #endif #endif #ifdef ENABLE_PUREC #define CONFIG 1 #if !defined(SLEEF_GENHEADER) #include "helperpurec.h" #endif #ifdef DORENAME #include "renamepurec.h" #endif #endif #ifdef ENABLE_PUREC_SCALAR #define CONFIG 1 #if !defined(SLEEF_GENHEADER) #include "helperpurec_scalar.h" #else #include "macroonlyPUREC_SCALAR.h" #endif #ifdef DORENAME #include "renamepurec_scalar.h" #endif #endif #ifdef ENABLE_PURECFMA_SCALAR #define CONFIG 2 #if !defined(SLEEF_GENHEADER) #include "helperpurec_scalar.h" #else #include "macroonlyPURECFMA_SCALAR.h" #endif #ifdef DORENAME #include "renamepurecfma_scalar.h" #endif #endif // #define MLA(x, y, z) vmla_vf_vf_vf_vf((x), (y), (z)) #define C2V(c) vcast_vf_f(c) #include "estrin.h" // #include "df.h" static INLINE CONST VECTOR_CC vopmask visnegzero_vo_vf(vfloat d) { 
/* NOTE(review): body of visnegzero_vo_vf -- its signature sits at the end of the previous chunk line. True where the lane's bit pattern equals that of -0.0f. */
return veq_vo_vi2_vi2(vreinterpret_vi2_vf(d), vreinterpret_vi2_vf(vcast_vf_f(-0.0))); } static INLINE VECTOR_CC vopmask vnot_vo32_vo32(vopmask x) { return vxor_vo_vo_vo(x, veq_vo_vi2_vi2(vcast_vi2_i(0), vcast_vi2_i(0))); }
/* Sign-manipulation helpers: all are built from mask ops against the -0.0f bit pattern (isolated sign bit). */
static INLINE CONST VECTOR_CC vmask vsignbit_vm_vf(vfloat f) { return vand_vm_vm_vm(vreinterpret_vm_vf(f), vreinterpret_vm_vf(vcast_vf_f(-0.0f))); } static INLINE CONST VECTOR_CC vfloat vmulsign_vf_vf_vf(vfloat x, vfloat y) { return vreinterpret_vf_vm(vxor_vm_vm_vm(vreinterpret_vm_vf(x), vsignbit_vm_vf(y))); } static INLINE CONST VECTOR_CC vfloat vcopysign_vf_vf_vf(vfloat x, vfloat y) { return vreinterpret_vf_vm(vxor_vm_vm_vm(vandnot_vm_vm_vm(vreinterpret_vm_vf(vcast_vf_f(-0.0f)), vreinterpret_vm_vf(x)), vand_vm_vm_vm (vreinterpret_vm_vf(vcast_vf_f(-0.0f)), vreinterpret_vm_vf(y)))); } static INLINE CONST VECTOR_CC vfloat vsign_vf_vf(vfloat f) { return vreinterpret_vf_vm(vor_vm_vm_vm(vreinterpret_vm_vf(vcast_vf_f(1.0f)), vand_vm_vm_vm(vreinterpret_vm_vf(vcast_vf_f(-0.0f)), vreinterpret_vm_vf(f)))); } static INLINE CONST VECTOR_CC vopmask vsignbit_vo_vf(vfloat d) { return veq_vo_vi2_vi2(vand_vi2_vi2_vi2(vreinterpret_vi2_vf(d), vcast_vi2_i(0x80000000)), vcast_vi2_i(0x80000000)); } static INLINE CONST VECTOR_CC vint2 vsel_vi2_vf_vf_vi2_vi2(vfloat f0, vfloat f1, vint2 x, vint2 y) { return vsel_vi2_vo_vi2_vi2(vlt_vo_vf_vf(f0, f1), x, y); } static INLINE CONST VECTOR_CC vint2 vsel_vi2_vf_vi2(vfloat d, vint2 x) { return vand_vi2_vo_vi2(vsignbit_vo_vf(d), x); } static INLINE CONST VECTOR_CC vopmask visint_vo_vf(vfloat y) { return veq_vo_vf_vf(vtruncate_vf_vf(y), y); } static INLINE CONST VECTOR_CC vopmask visnumber_vo_vf(vfloat x) { return vnot_vo32_vo32(vor_vo_vo_vo(visinf_vo_vf(x), visnan_vo_vf(x))); } #if !defined(ENABLE_AVX512F) && !defined(ENABLE_AVX512FNOFMA)
/* vilogbk/vilogb2k: exponent extraction from the f32 bit layout (shift 23, mask 0xff, unbias 0x7f); vilogbk first rescales subnormals by 2^64 and compensates with -(64 + 0x7f). */
static INLINE CONST VECTOR_CC vint2 vilogbk_vi2_vf(vfloat d) { vopmask o = vlt_vo_vf_vf(d, vcast_vf_f(5.421010862427522E-20f)); d = vsel_vf_vo_vf_vf(o, 
vmul_vf_vf_vf(vcast_vf_f(1.8446744073709552E19f), d), d); vint2 q = vand_vi2_vi2_vi2(vsrl_vi2_vi2_i(vreinterpret_vi2_vf(d), 23), vcast_vi2_i(0xff)); q = vsub_vi2_vi2_vi2(q, vsel_vi2_vo_vi2_vi2(o, vcast_vi2_i(64 + 0x7f), vcast_vi2_i(0x7f))); return q; } static INLINE CONST VECTOR_CC vint2 vilogb2k_vi2_vf(vfloat d) { vint2 q = vreinterpret_vi2_vf(d); q = vsrl_vi2_vi2_i(q, 23); q = vand_vi2_vi2_vi2(q, vcast_vi2_i(0xff)); q = vsub_vi2_vi2_vi2(q, vcast_vi2_i(0x7f)); return q; } #endif // EXPORT CONST VECTOR_CC vint2 xilogbf(vfloat d) { vint2 e = vilogbk_vi2_vf(vabs_vf_vf(d)); e = vsel_vi2_vo_vi2_vi2(veq_vo_vf_vf(d, vcast_vf_f(0.0f)), vcast_vi2_i(SLEEF_FP_ILOGB0), e); e = vsel_vi2_vo_vi2_vi2(visnan_vo_vf(d), vcast_vi2_i(SLEEF_FP_ILOGBNAN), e); e = vsel_vi2_vo_vi2_vi2(visinf_vo_vf(d), vcast_vi2_i(INT_MAX), e); return e; } static INLINE CONST VECTOR_CC vfloat vpow2i_vf_vi2(vint2 q) { return vreinterpret_vf_vi2(vsll_vi2_vi2_i(vadd_vi2_vi2_vi2(q, vcast_vi2_i(0x7f)), 23)); }
/* vldexp*: scale x by 2^q. vldexp_vf_vf_vi2 splits q into staged multiplies to stay in range; vldexp2/vldexp3 are cheaper variants (two pow2 factors / direct exponent-field add). */
static INLINE CONST VECTOR_CC vfloat vldexp_vf_vf_vi2(vfloat x, vint2 q) { vfloat u; vint2 m = vsra_vi2_vi2_i(q, 31); m = vsll_vi2_vi2_i(vsub_vi2_vi2_vi2(vsra_vi2_vi2_i(vadd_vi2_vi2_vi2(m, q), 6), m), 4); q = vsub_vi2_vi2_vi2(q, vsll_vi2_vi2_i(m, 2)); m = vadd_vi2_vi2_vi2(m, vcast_vi2_i(0x7f)); m = vand_vi2_vi2_vi2(vgt_vi2_vi2_vi2(m, vcast_vi2_i(0)), m); vint2 n = vgt_vi2_vi2_vi2(m, vcast_vi2_i(0xff)); m = vor_vi2_vi2_vi2(vandnot_vi2_vi2_vi2(n, m), vand_vi2_vi2_vi2(n, vcast_vi2_i(0xff))); u = vreinterpret_vf_vi2(vsll_vi2_vi2_i(m, 23)); x = vmul_vf_vf_vf(vmul_vf_vf_vf(vmul_vf_vf_vf(vmul_vf_vf_vf(x, u), u), u), u); u = vreinterpret_vf_vi2(vsll_vi2_vi2_i(vadd_vi2_vi2_vi2(q, vcast_vi2_i(0x7f)), 23)); return vmul_vf_vf_vf(x, u); } static INLINE CONST VECTOR_CC vfloat vldexp2_vf_vf_vi2(vfloat d, vint2 e) { return vmul_vf_vf_vf(vmul_vf_vf_vf(d, vpow2i_vf_vi2(vsra_vi2_vi2_i(e, 1))), vpow2i_vf_vi2(vsub_vi2_vi2_vi2(e, vsra_vi2_vi2_i(e, 1)))); } static INLINE CONST VECTOR_CC vfloat vldexp3_vf_vf_vi2(vfloat d, vint2 
q) { return vreinterpret_vf_vi2(vadd_vi2_vi2_vi2(vreinterpret_vi2_vf(d), vsll_vi2_vi2_i(q, 23))); } EXPORT CONST VECTOR_CC vfloat xldexpf(vfloat x, vint2 q) { return vldexp_vf_vf_vi2(x, q); } #if !(defined(ENABLE_SVE) || defined(ENABLE_SVENOFMA)) typedef struct { vfloat d; vint2 i; } fi_t; static vfloat figetd_vf_di(fi_t d) { return d.d; } static vint2 figeti_vi2_di(fi_t d) { return d.i; } static fi_t fisetdi_fi_vf_vi2(vfloat d, vint2 i) { fi_t r = { d, i }; return r; } typedef struct { vfloat2 df; vint2 i; } dfi_t; static vfloat2 dfigetdf_vf2_dfi(dfi_t d) { return d.df; } static vint2 dfigeti_vi2_dfi(dfi_t d) { return d.i; } static dfi_t dfisetdfi_dfi_vf2_vi2(vfloat2 v, vint2 i) { dfi_t r = { v, i }; return r; } static dfi_t dfisetdf_dfi_dfi_vf2(dfi_t dfi, vfloat2 v) { dfi.df = v; return dfi; } #endif static INLINE CONST VECTOR_CC vfloat vorsign_vf_vf_vf(vfloat x, vfloat y) { return vreinterpret_vf_vm(vor_vm_vm_vm(vreinterpret_vm_vf(x), vsignbit_vm_vf(y))); }
/* rempisubf: returns x minus the nearest multiple of 1/4 together with the low quadrant bits. With FULL_FP_ROUNDING it uses vrint directly; otherwise rounding is done via the add/subtract-2^23 trick (vcast_vf_f(1 << 23)) with sign restored through vorsign. */
static INLINE CONST fi_t rempisubf(vfloat x) { #ifdef FULL_FP_ROUNDING vfloat y = vrint_vf_vf(vmul_vf_vf_vf(x, vcast_vf_f(4))); vint2 vi = vtruncate_vi2_vf(vsub_vf_vf_vf(y, vmul_vf_vf_vf(vrint_vf_vf(x), vcast_vf_f(4)))); return fisetdi_fi_vf_vi2(vsub_vf_vf_vf(x, vmul_vf_vf_vf(y, vcast_vf_f(0.25))), vi); #else vfloat c = vmulsign_vf_vf_vf(vcast_vf_f(1 << 23), x); vfloat rint4x = vsel_vf_vo_vf_vf(vgt_vo_vf_vf(vabs_vf_vf(vmul_vf_vf_vf(vcast_vf_f(4), x)), vcast_vf_f(1 << 23)), vmul_vf_vf_vf(vcast_vf_f(4), x), vorsign_vf_vf_vf(vsub_vf_vf_vf(vmla_vf_vf_vf_vf(vcast_vf_f(4), x, c), c), x)); vfloat rintx = vsel_vf_vo_vf_vf(vgt_vo_vf_vf(vabs_vf_vf(x), vcast_vf_f(1 << 23)), x, vorsign_vf_vf_vf(vsub_vf_vf_vf(vadd_vf_vf_vf(x, c), c), x)); return fisetdi_fi_vf_vi2(vmla_vf_vf_vf_vf(vcast_vf_f(-0.25), rint4x, x), vtruncate_vi2_vf(vmla_vf_vf_vf_vf(vcast_vf_f(-4), rintx, rint4x))); #endif }
/* rempif: large-argument trig reduction using the Sleef_rempitabsp table (gathered per-lane by exponent index ex); accumulates double-float partial products, then scales by 2*pi (3.1415927410125732422f*2 plus correction term). Small |a| < 0.7 bypasses the reduction. Returns the reduced double-float value and quadrant in a dfi_t. */
static INLINE CONST dfi_t rempif(vfloat a) { vfloat2 x, y, z; vint2 ex = vilogb2k_vi2_vf(a); #if defined(ENABLE_AVX512F) || 
defined(ENABLE_AVX512FNOFMA) ex = vandnot_vi2_vi2_vi2(vsra_vi2_vi2_i(ex, 31), ex); ex = vand_vi2_vi2_vi2(ex, vcast_vi2_i(127)); #endif ex = vsub_vi2_vi2_vi2(ex, vcast_vi2_i(25)); vint2 q = vand_vi2_vo_vi2(vgt_vo_vi2_vi2(ex, vcast_vi2_i(90-25)), vcast_vi2_i(-64)); a = vldexp3_vf_vf_vi2(a, q); ex = vandnot_vi2_vi2_vi2(vsra_vi2_vi2_i(ex, 31), ex); ex = vsll_vi2_vi2_i(ex, 2); x = dfmul_vf2_vf_vf(a, vgather_vf_p_vi2(Sleef_rempitabsp, ex)); fi_t di = rempisubf(vf2getx_vf_vf2(x)); q = figeti_vi2_di(di); x = vf2setx_vf2_vf2_vf(x, figetd_vf_di(di)); x = dfnormalize_vf2_vf2(x); y = dfmul_vf2_vf_vf(a, vgather_vf_p_vi2(Sleef_rempitabsp+1, ex)); x = dfadd2_vf2_vf2_vf2(x, y); di = rempisubf(vf2getx_vf_vf2(x)); q = vadd_vi2_vi2_vi2(q, figeti_vi2_di(di)); x = vf2setx_vf2_vf2_vf(x, figetd_vf_di(di)); x = dfnormalize_vf2_vf2(x); y = vcast_vf2_vf_vf(vgather_vf_p_vi2(Sleef_rempitabsp+2, ex), vgather_vf_p_vi2(Sleef_rempitabsp+3, ex)); y = dfmul_vf2_vf2_vf(y, a); x = dfadd2_vf2_vf2_vf2(x, y); x = dfnormalize_vf2_vf2(x); x = dfmul_vf2_vf2_vf2(x, vcast_vf2_f_f(3.1415927410125732422f*2, -8.7422776573475857731e-08f*2)); x = vsel_vf2_vo_vf2_vf2(vlt_vo_vf_vf(vabs_vf_vf(a), vcast_vf_f(0.7f)), vcast_vf2_vf_vf(a, vcast_vf_f(0)), x); return dfisetdfi_dfi_vf2_vi2(x, q); } EXPORT CONST VECTOR_CC vfloat xsinf(vfloat d) { #if !defined(DETERMINISTIC) vint2 q; vfloat u, s, r = d; if (LIKELY(vtestallones_i_vo32(vlt_vo_vf_vf(vabs_vf_vf(d), vcast_vf_f(TRIGRANGEMAX2f))))) { q = vrint_vi2_vf(vmul_vf_vf_vf(d, vcast_vf_f((float)M_1_PI))); u = vcast_vf_vi2(q); d = vmla_vf_vf_vf_vf(u, vcast_vf_f(-PI_A2f), d); d = vmla_vf_vf_vf_vf(u, vcast_vf_f(-PI_B2f), d); d = vmla_vf_vf_vf_vf(u, vcast_vf_f(-PI_C2f), d); } else if (LIKELY(vtestallones_i_vo32(vlt_vo_vf_vf(vabs_vf_vf(d), vcast_vf_f(TRIGRANGEMAXf))))) { q = vrint_vi2_vf(vmul_vf_vf_vf(d, vcast_vf_f((float)M_1_PI))); u = vcast_vf_vi2(q); d = vmla_vf_vf_vf_vf(u, vcast_vf_f(-PI_Af), d); d = vmla_vf_vf_vf_vf(u, vcast_vf_f(-PI_Bf), d); d = vmla_vf_vf_vf_vf(u, 
vcast_vf_f(-PI_Cf), d);
    d = vmla_vf_vf_vf_vf(u, vcast_vf_f(-PI_Df), d);
  } else {
    /* Huge arguments: full rempif reduction; the quadrant is rebuilt from
       the reduction's integer bits and the sign of the reduced value. */
    dfi_t dfi = rempif(d);
    q = vand_vi2_vi2_vi2(dfigeti_vi2_dfi(dfi), vcast_vi2_i(3));
    q = vadd_vi2_vi2_vi2(vadd_vi2_vi2_vi2(q, q), vsel_vi2_vo_vi2_vi2(vgt_vo_vf_vf(vf2getx_vf_vf2(dfigetdf_vf2_dfi(dfi)), vcast_vf_f(0)), vcast_vi2_i(2), vcast_vi2_i(1)));
    q = vsra_vi2_vi2_i(q, 2);
    vopmask o = veq_vo_vi2_vi2(vand_vi2_vi2_vi2(dfigeti_vi2_dfi(dfi), vcast_vi2_i(1)), vcast_vi2_i(1));
    vfloat2 x = vcast_vf2_vf_vf(vmulsign_vf_vf_vf(vcast_vf_f(3.1415927410125732422f*-0.5), vf2getx_vf_vf2(dfigetdf_vf2_dfi(dfi))), vmulsign_vf_vf_vf(vcast_vf_f(-8.7422776573475857731e-08f*-0.5), vf2getx_vf_vf2(dfigetdf_vf2_dfi(dfi))));
    x = dfadd2_vf2_vf2_vf2(dfigetdf_vf2_dfi(dfi), x);
    dfi = dfisetdf_dfi_dfi_vf2(dfi, vsel_vf2_vo_vf2_vf2(o, x, dfigetdf_vf2_dfi(dfi)));
    d = vadd_vf_vf_vf(vf2getx_vf_vf2(dfigetdf_vf2_dfi(dfi)), vf2gety_vf_vf2(dfigetdf_vf2_dfi(dfi)));
    /* inf/nan inputs propagate as nan (OR all mask bits into the payload). */
    d = vreinterpret_vf_vm(vor_vm_vo32_vm(vor_vo_vo_vo(visinf_vo_vf(r), visnan_vo_vf(r)), vreinterpret_vm_vf(d)));
  }
  s = vmul_vf_vf_vf(d, d);
  /* Flip the sign of d on odd quadrants (XOR with -0.0f under mask). */
  d = vreinterpret_vf_vm(vxor_vm_vm_vm(vand_vm_vo32_vm(veq_vo_vi2_vi2(vand_vi2_vi2_vi2(q, vcast_vi2_i(1)), vcast_vi2_i(1)), vreinterpret_vm_vf(vcast_vf_f(-0.0f))), vreinterpret_vm_vf(d)));
  u = vcast_vf_f(2.6083159809786593541503e-06f);
  u = vmla_vf_vf_vf_vf(u, s, vcast_vf_f(-0.0001981069071916863322258f));
  u = vmla_vf_vf_vf_vf(u, s, vcast_vf_f(0.00833307858556509017944336f));
  u = vmla_vf_vf_vf_vf(u, s, vcast_vf_f(-0.166666597127914428710938f));
  u = vadd_vf_vf_vf(vmul_vf_vf_vf(s, vmul_vf_vf_vf(u, d)), d);
  /* Preserve -0.0 exactly. */
  u = vsel_vf_vo_vf_vf(visnegzero_vo_vf(r), r, u);
  return u;
#else // #if !defined(DETERMINISTIC)
  /* Deterministic variant: always computes every tier and blends per lane,
     so results do not depend on which lanes take which path. */
  vint2 q;
  vfloat u, s, r = d;
  q = vrint_vi2_vf(vmul_vf_vf_vf(d, vcast_vf_f((float)M_1_PI)));
  u = vcast_vf_vi2(q);
  d = vmla_vf_vf_vf_vf(u, vcast_vf_f(-PI_A2f), d);
  d = vmla_vf_vf_vf_vf(u, vcast_vf_f(-PI_B2f), d);
  d = vmla_vf_vf_vf_vf(u, vcast_vf_f(-PI_C2f), d);
  vopmask g = vlt_vo_vf_vf(vabs_vf_vf(r), vcast_vf_f(TRIGRANGEMAX2f));
  if (!LIKELY(vtestallones_i_vo32(g))) {
    s = vcast_vf_vi2(q);
    u = vmla_vf_vf_vf_vf(s, vcast_vf_f(-PI_Af), r);
    u = vmla_vf_vf_vf_vf(s, vcast_vf_f(-PI_Bf), u);
    u = vmla_vf_vf_vf_vf(s, vcast_vf_f(-PI_Cf), u);
    u = vmla_vf_vf_vf_vf(s, vcast_vf_f(-PI_Df), u);
    d = vsel_vf_vo_vf_vf(g, d, u);
    g = vlt_vo_vf_vf(vabs_vf_vf(r), vcast_vf_f(TRIGRANGEMAXf));
    if (!LIKELY(vtestallones_i_vo32(g))) {
      dfi_t dfi = rempif(r);
      vint2 q2 = vand_vi2_vi2_vi2(dfigeti_vi2_dfi(dfi), vcast_vi2_i(3));
      q2 = vadd_vi2_vi2_vi2(vadd_vi2_vi2_vi2(q2, q2), vsel_vi2_vo_vi2_vi2(vgt_vo_vf_vf(vf2getx_vf_vf2(dfigetdf_vf2_dfi(dfi)), vcast_vf_f(0)), vcast_vi2_i(2), vcast_vi2_i(1)));
      q2 = vsra_vi2_vi2_i(q2, 2);
      vopmask o = veq_vo_vi2_vi2(vand_vi2_vi2_vi2(dfigeti_vi2_dfi(dfi), vcast_vi2_i(1)), vcast_vi2_i(1));
      vfloat2 x = vcast_vf2_vf_vf(vmulsign_vf_vf_vf(vcast_vf_f(3.1415927410125732422f*-0.5), vf2getx_vf_vf2(dfigetdf_vf2_dfi(dfi))), vmulsign_vf_vf_vf(vcast_vf_f(-8.7422776573475857731e-08f*-0.5), vf2getx_vf_vf2(dfigetdf_vf2_dfi(dfi))));
      x = dfadd2_vf2_vf2_vf2(dfigetdf_vf2_dfi(dfi), x);
      dfi = dfisetdf_dfi_dfi_vf2(dfi, vsel_vf2_vo_vf2_vf2(o, x, dfigetdf_vf2_dfi(dfi)));
      u = vadd_vf_vf_vf(vf2getx_vf_vf2(dfigetdf_vf2_dfi(dfi)), vf2gety_vf_vf2(dfigetdf_vf2_dfi(dfi)));
      u = vreinterpret_vf_vm(vor_vm_vo32_vm(vor_vo_vo_vo(visinf_vo_vf(r), visnan_vo_vf(r)), vreinterpret_vm_vf(u)));
      q = vsel_vi2_vo_vi2_vi2(g, q, q2);
      d = vsel_vf_vo_vf_vf(g, d, u);
    }
  }
  s = vmul_vf_vf_vf(d, d);
  d = vreinterpret_vf_vm(vxor_vm_vm_vm(vand_vm_vo32_vm(veq_vo_vi2_vi2(vand_vi2_vi2_vi2(q, vcast_vi2_i(1)), vcast_vi2_i(1)), vreinterpret_vm_vf(vcast_vf_f(-0.0f))), vreinterpret_vm_vf(d)));
  u = vcast_vf_f(2.6083159809786593541503e-06f);
  u = vmla_vf_vf_vf_vf(u, s, vcast_vf_f(-0.0001981069071916863322258f));
  u = vmla_vf_vf_vf_vf(u, s, vcast_vf_f(0.00833307858556509017944336f));
  u = vmla_vf_vf_vf_vf(u, s, vcast_vf_f(-0.166666597127914428710938f));
  u = vadd_vf_vf_vf(vmul_vf_vf_vf(s, vmul_vf_vf_vf(u, d)), d);
  u = vsel_vf_vo_vf_vf(visnegzero_vo_vf(r), r, u);
  return u;
#endif // #if !defined(DETERMINISTIC)
}

/* Vectorized cosf.  Same three-tier structure as xsinf, but the quadrant is
   computed from rint(d/pi - 0.5) with a half-period offset (q = 2k+1), and
   the sign flip keys off bit 1 of q instead of bit 0. */
EXPORT CONST VECTOR_CC vfloat xcosf(vfloat d) {
#if !defined(DETERMINISTIC)
  vint2 q;
  vfloat u, s, r = d;
  if (LIKELY(vtestallones_i_vo32(vlt_vo_vf_vf(vabs_vf_vf(d), vcast_vf_f(TRIGRANGEMAX2f))))) {
    q = vrint_vi2_vf(vsub_vf_vf_vf(vmul_vf_vf_vf(d, vcast_vf_f((float)M_1_PI)), vcast_vf_f(0.5f)));
    q = vadd_vi2_vi2_vi2(vadd_vi2_vi2_vi2(q, q), vcast_vi2_i(1));
    u = vcast_vf_vi2(q);
    d = vmla_vf_vf_vf_vf(u, vcast_vf_f(-PI_A2f*0.5f), d);
    d = vmla_vf_vf_vf_vf(u, vcast_vf_f(-PI_B2f*0.5f), d);
    d = vmla_vf_vf_vf_vf(u, vcast_vf_f(-PI_C2f*0.5f), d);
  } else if (LIKELY(vtestallones_i_vo32(vlt_vo_vf_vf(vabs_vf_vf(d), vcast_vf_f(TRIGRANGEMAXf))))) {
    q = vrint_vi2_vf(vsub_vf_vf_vf(vmul_vf_vf_vf(d, vcast_vf_f((float)M_1_PI)), vcast_vf_f(0.5f)));
    q = vadd_vi2_vi2_vi2(vadd_vi2_vi2_vi2(q, q), vcast_vi2_i(1));
    u = vcast_vf_vi2(q);
    d = vmla_vf_vf_vf_vf(u, vcast_vf_f(-PI_Af*0.5f), d);
    d = vmla_vf_vf_vf_vf(u, vcast_vf_f(-PI_Bf*0.5f), d);
    d = vmla_vf_vf_vf_vf(u, vcast_vf_f(-PI_Cf*0.5f), d);
    d = vmla_vf_vf_vf_vf(u, vcast_vf_f(-PI_Df*0.5f), d);
  } else {
    dfi_t dfi = rempif(d);
    q = vand_vi2_vi2_vi2(dfigeti_vi2_dfi(dfi), vcast_vi2_i(3));
    q = vadd_vi2_vi2_vi2(vadd_vi2_vi2_vi2(q, q), vsel_vi2_vo_vi2_vi2(vgt_vo_vf_vf(vf2getx_vf_vf2(dfigetdf_vf2_dfi(dfi)), vcast_vf_f(0)), vcast_vi2_i(8), vcast_vi2_i(7)));
    q = vsra_vi2_vi2_i(q, 1);
    vopmask o = veq_vo_vi2_vi2(vand_vi2_vi2_vi2(dfigeti_vi2_dfi(dfi), vcast_vi2_i(1)), vcast_vi2_i(0));
    vfloat y = vsel_vf_vo_vf_vf(vgt_vo_vf_vf(vf2getx_vf_vf2(dfigetdf_vf2_dfi(dfi)), vcast_vf_f(0)), vcast_vf_f(0), vcast_vf_f(-1));
    vfloat2 x = vcast_vf2_vf_vf(vmulsign_vf_vf_vf(vcast_vf_f(3.1415927410125732422f*-0.5), y), vmulsign_vf_vf_vf(vcast_vf_f(-8.7422776573475857731e-08f*-0.5), y));
    x = dfadd2_vf2_vf2_vf2(dfigetdf_vf2_dfi(dfi), x);
    dfi = dfisetdf_dfi_dfi_vf2(dfi, vsel_vf2_vo_vf2_vf2(o, x, dfigetdf_vf2_dfi(dfi)));
    d = vadd_vf_vf_vf(vf2getx_vf_vf2(dfigetdf_vf2_dfi(dfi)), vf2gety_vf_vf2(dfigetdf_vf2_dfi(dfi)));
    d =
vreinterpret_vf_vm(vor_vm_vo32_vm(vor_vo_vo_vo(visinf_vo_vf(r), visnan_vo_vf(r)), vreinterpret_vm_vf(d)));
  }
  s = vmul_vf_vf_vf(d, d);
  /* Flip the sign when bit 1 of q is clear (cosine quadrant parity). */
  d = vreinterpret_vf_vm(vxor_vm_vm_vm(vand_vm_vo32_vm(veq_vo_vi2_vi2(vand_vi2_vi2_vi2(q, vcast_vi2_i(2)), vcast_vi2_i(0)), vreinterpret_vm_vf(vcast_vf_f(-0.0f))), vreinterpret_vm_vf(d)));
  /* Same odd polynomial in s = d*d as xsinf. */
  u = vcast_vf_f(2.6083159809786593541503e-06f);
  u = vmla_vf_vf_vf_vf(u, s, vcast_vf_f(-0.0001981069071916863322258f));
  u = vmla_vf_vf_vf_vf(u, s, vcast_vf_f(0.00833307858556509017944336f));
  u = vmla_vf_vf_vf_vf(u, s, vcast_vf_f(-0.166666597127914428710938f));
  u = vadd_vf_vf_vf(vmul_vf_vf_vf(s, vmul_vf_vf_vf(u, d)), d);
  return u;
#else // #if !defined(DETERMINISTIC)
  /* Deterministic (blend-all-tiers) variant of the above. */
  vint2 q;
  vfloat u, s, r = d;
  q = vrint_vi2_vf(vsub_vf_vf_vf(vmul_vf_vf_vf(d, vcast_vf_f((float)M_1_PI)), vcast_vf_f(0.5f)));
  q = vadd_vi2_vi2_vi2(vadd_vi2_vi2_vi2(q, q), vcast_vi2_i(1));
  u = vcast_vf_vi2(q);
  d = vmla_vf_vf_vf_vf(u, vcast_vf_f(-PI_A2f*0.5f), d);
  d = vmla_vf_vf_vf_vf(u, vcast_vf_f(-PI_B2f*0.5f), d);
  d = vmla_vf_vf_vf_vf(u, vcast_vf_f(-PI_C2f*0.5f), d);
  vopmask g = vlt_vo_vf_vf(vabs_vf_vf(r), vcast_vf_f(TRIGRANGEMAX2f));
  if (!LIKELY(vtestallones_i_vo32(g))) {
    s = vcast_vf_vi2(q);
    u = vmla_vf_vf_vf_vf(s, vcast_vf_f(-PI_Af*0.5f), r);
    u = vmla_vf_vf_vf_vf(s, vcast_vf_f(-PI_Bf*0.5f), u);
    u = vmla_vf_vf_vf_vf(s, vcast_vf_f(-PI_Cf*0.5f), u);
    u = vmla_vf_vf_vf_vf(s, vcast_vf_f(-PI_Df*0.5f), u);
    d = vsel_vf_vo_vf_vf(g, d, u);
    g = vlt_vo_vf_vf(vabs_vf_vf(r), vcast_vf_f(TRIGRANGEMAXf));
    if (!LIKELY(vtestallones_i_vo32(g))) {
      dfi_t dfi = rempif(r);
      vint2 q2 = vand_vi2_vi2_vi2(dfigeti_vi2_dfi(dfi), vcast_vi2_i(3));
      q2 = vadd_vi2_vi2_vi2(vadd_vi2_vi2_vi2(q2, q2), vsel_vi2_vo_vi2_vi2(vgt_vo_vf_vf(vf2getx_vf_vf2(dfigetdf_vf2_dfi(dfi)), vcast_vf_f(0)), vcast_vi2_i(8), vcast_vi2_i(7)));
      q2 = vsra_vi2_vi2_i(q2, 1);
      vopmask o = veq_vo_vi2_vi2(vand_vi2_vi2_vi2(dfigeti_vi2_dfi(dfi), vcast_vi2_i(1)), vcast_vi2_i(0));
      vfloat y = vsel_vf_vo_vf_vf(vgt_vo_vf_vf(vf2getx_vf_vf2(dfigetdf_vf2_dfi(dfi)), vcast_vf_f(0)), vcast_vf_f(0), vcast_vf_f(-1));
      vfloat2 x = vcast_vf2_vf_vf(vmulsign_vf_vf_vf(vcast_vf_f(3.1415927410125732422f*-0.5), y), vmulsign_vf_vf_vf(vcast_vf_f(-8.7422776573475857731e-08f*-0.5), y));
      x = dfadd2_vf2_vf2_vf2(dfigetdf_vf2_dfi(dfi), x);
      dfi = dfisetdf_dfi_dfi_vf2(dfi, vsel_vf2_vo_vf2_vf2(o, x, dfigetdf_vf2_dfi(dfi)));
      u = vadd_vf_vf_vf(vf2getx_vf_vf2(dfigetdf_vf2_dfi(dfi)), vf2gety_vf_vf2(dfigetdf_vf2_dfi(dfi)));
      u = vreinterpret_vf_vm(vor_vm_vo32_vm(vor_vo_vo_vo(visinf_vo_vf(r), visnan_vo_vf(r)), vreinterpret_vm_vf(u)));
      q = vsel_vi2_vo_vi2_vi2(g, q, q2);
      d = vsel_vf_vo_vf_vf(g, d, u);
    }
  }
  s = vmul_vf_vf_vf(d, d);
  d = vreinterpret_vf_vm(vxor_vm_vm_vm(vand_vm_vo32_vm(veq_vo_vi2_vi2(vand_vi2_vi2_vi2(q, vcast_vi2_i(2)), vcast_vi2_i(0)), vreinterpret_vm_vf(vcast_vf_f(-0.0f))), vreinterpret_vm_vf(d)));
  u = vcast_vf_f(2.6083159809786593541503e-06f);
  u = vmla_vf_vf_vf_vf(u, s, vcast_vf_f(-0.0001981069071916863322258f));
  u = vmla_vf_vf_vf_vf(u, s, vcast_vf_f(0.00833307858556509017944336f));
  u = vmla_vf_vf_vf_vf(u, s, vcast_vf_f(-0.166666597127914428710938f));
  u = vadd_vf_vf_vf(vmul_vf_vf_vf(s, vmul_vf_vf_vf(u, d)), d);
  return u;
#endif // #if !defined(DETERMINISTIC)
}

/* Vectorized tanf.  Reduces modulo pi/2 (q = rint(2*d/pi)); after the
   polynomial, odd quadrants take the reciprocal (tan = 1/tan shifted). */
EXPORT CONST VECTOR_CC vfloat xtanf(vfloat d) {
#if !defined(DETERMINISTIC)
  vint2 q;
  vopmask o;
  vfloat u, s, x;
  x = d;
  if (LIKELY(vtestallones_i_vo32(vlt_vo_vf_vf(vabs_vf_vf(d), vcast_vf_f(TRIGRANGEMAX2f*0.5f))))) {
    q = vrint_vi2_vf(vmul_vf_vf_vf(d, vcast_vf_f((float)(2 * M_1_PI))));
    u = vcast_vf_vi2(q);
    x = vmla_vf_vf_vf_vf(u, vcast_vf_f(-PI_A2f*0.5f), x);
    x = vmla_vf_vf_vf_vf(u, vcast_vf_f(-PI_B2f*0.5f), x);
    x = vmla_vf_vf_vf_vf(u, vcast_vf_f(-PI_C2f*0.5f), x);
  } else if (LIKELY(vtestallones_i_vo32(vlt_vo_vf_vf(vabs_vf_vf(d), vcast_vf_f(TRIGRANGEMAXf))))) {
    q = vrint_vi2_vf(vmul_vf_vf_vf(d, vcast_vf_f((float)(2 * M_1_PI))));
    u = vcast_vf_vi2(q);
    x = vmla_vf_vf_vf_vf(u, vcast_vf_f(-PI_Af*0.5f), x);
    x = vmla_vf_vf_vf_vf(u, vcast_vf_f(-PI_Bf*0.5f), x);
    x = vmla_vf_vf_vf_vf(u,
vcast_vf_f(-PI_Cf*0.5f), x);
    x = vmla_vf_vf_vf_vf(u, vcast_vf_f(-PI_Df*0.5f), x);
  } else {
    /* Huge arguments: rempif already returns the quadrant directly. */
    dfi_t dfi = rempif(d);
    q = dfigeti_vi2_dfi(dfi);
    x = vadd_vf_vf_vf(vf2getx_vf_vf2(dfigetdf_vf2_dfi(dfi)), vf2gety_vf_vf2(dfigetdf_vf2_dfi(dfi)));
    x = vreinterpret_vf_vm(vor_vm_vo32_vm(vor_vo_vo_vo(visinf_vo_vf(d), visnan_vo_vf(d)), vreinterpret_vm_vf(x)));
    x = vsel_vf_vo_vf_vf(visnegzero_vo_vf(d), d, x);
  }
  s = vmul_vf_vf_vf(x, x);
  o = veq_vo_vi2_vi2(vand_vi2_vi2_vi2(q, vcast_vi2_i(1)), vcast_vi2_i(1));
  x = vreinterpret_vf_vm(vxor_vm_vm_vm(vand_vm_vo32_vm(o, vreinterpret_vm_vf(vcast_vf_f(-0.0f))), vreinterpret_vm_vf(x)));
#if defined(ENABLE_NEON32)
  /* Plain Horner form for NEON32; POLY6 is the Estrin-style form below. */
  u = vcast_vf_f(0.00927245803177356719970703f);
  u = vmla_vf_vf_vf_vf(u, s, vcast_vf_f(0.00331984995864331722259521f));
  u = vmla_vf_vf_vf_vf(u, s, vcast_vf_f(0.0242998078465461730957031f));
  u = vmla_vf_vf_vf_vf(u, s, vcast_vf_f(0.0534495301544666290283203f));
  u = vmla_vf_vf_vf_vf(u, s, vcast_vf_f(0.133383005857467651367188f));
  u = vmla_vf_vf_vf_vf(u, s, vcast_vf_f(0.333331853151321411132812f));
#else
  vfloat s2 = vmul_vf_vf_vf(s, s), s4 = vmul_vf_vf_vf(s2, s2);
  u = POLY6(s, s2, s4, 0.00927245803177356719970703f, 0.00331984995864331722259521f, 0.0242998078465461730957031f, 0.0534495301544666290283203f, 0.133383005857467651367188f, 0.333331853151321411132812f);
#endif
  u = vmla_vf_vf_vf_vf(s, vmul_vf_vf_vf(u, x), x);
  /* Odd quadrant: tan is the negative reciprocal (sign handled above). */
  u = vsel_vf_vo_vf_vf(o, vrec_vf_vf(u), u);
  return u;
#else // #if !defined(DETERMINISTIC)
  /* Deterministic (blend-all-tiers) variant. */
  vint2 q;
  vopmask o;
  vfloat u, s, x;
  q = vrint_vi2_vf(vmul_vf_vf_vf(d, vcast_vf_f((float)(2 * M_1_PI))));
  u = vcast_vf_vi2(q);
  x = vmla_vf_vf_vf_vf(u, vcast_vf_f(-PI_A2f*0.5f), d);
  x = vmla_vf_vf_vf_vf(u, vcast_vf_f(-PI_B2f*0.5f), x);
  x = vmla_vf_vf_vf_vf(u, vcast_vf_f(-PI_C2f*0.5f), x);
  vopmask g = vlt_vo_vf_vf(vabs_vf_vf(d), vcast_vf_f(TRIGRANGEMAX2f*0.5f));
  if (!LIKELY(vtestallones_i_vo32(g))) {
    vint2 q2 = vrint_vi2_vf(vmul_vf_vf_vf(d, vcast_vf_f((float)(2 * M_1_PI))));
    s = vcast_vf_vi2(q);
    u = vmla_vf_vf_vf_vf(s, vcast_vf_f(-PI_Af*0.5f), d);
    u = vmla_vf_vf_vf_vf(s, vcast_vf_f(-PI_Bf*0.5f), u);
    u = vmla_vf_vf_vf_vf(s, vcast_vf_f(-PI_Cf*0.5f), u);
    u = vmla_vf_vf_vf_vf(s, vcast_vf_f(-PI_Df*0.5f), u);
    q = vsel_vi2_vo_vi2_vi2(g, q, q2);
    x = vsel_vf_vo_vf_vf(g, x, u);
    g = vlt_vo_vf_vf(vabs_vf_vf(d), vcast_vf_f(TRIGRANGEMAXf));
    if (!LIKELY(vtestallones_i_vo32(g))) {
      dfi_t dfi = rempif(d);
      u = vadd_vf_vf_vf(vf2getx_vf_vf2(dfigetdf_vf2_dfi(dfi)), vf2gety_vf_vf2(dfigetdf_vf2_dfi(dfi)));
      u = vreinterpret_vf_vm(vor_vm_vo32_vm(vor_vo_vo_vo(visinf_vo_vf(d), visnan_vo_vf(d)), vreinterpret_vm_vf(u)));
      u = vsel_vf_vo_vf_vf(visnegzero_vo_vf(d), d, u);
      q = vsel_vi2_vo_vi2_vi2(g, q, dfigeti_vi2_dfi(dfi));
      x = vsel_vf_vo_vf_vf(g, x, u);
    }
  }
  s = vmul_vf_vf_vf(x, x);
  o = veq_vo_vi2_vi2(vand_vi2_vi2_vi2(q, vcast_vi2_i(1)), vcast_vi2_i(1));
  x = vreinterpret_vf_vm(vxor_vm_vm_vm(vand_vm_vo32_vm(o, vreinterpret_vm_vf(vcast_vf_f(-0.0f))), vreinterpret_vm_vf(x)));
#if defined(ENABLE_NEON32)
  u = vcast_vf_f(0.00927245803177356719970703f);
  u = vmla_vf_vf_vf_vf(u, s, vcast_vf_f(0.00331984995864331722259521f));
  u = vmla_vf_vf_vf_vf(u, s, vcast_vf_f(0.0242998078465461730957031f));
  u = vmla_vf_vf_vf_vf(u, s, vcast_vf_f(0.0534495301544666290283203f));
  u = vmla_vf_vf_vf_vf(u, s, vcast_vf_f(0.133383005857467651367188f));
  u = vmla_vf_vf_vf_vf(u, s, vcast_vf_f(0.333331853151321411132812f));
#else
  vfloat s2 = vmul_vf_vf_vf(s, s), s4 = vmul_vf_vf_vf(s2, s2);
  u = POLY6(s, s2, s4, 0.00927245803177356719970703f, 0.00331984995864331722259521f, 0.0242998078465461730957031f, 0.0534495301544666290283203f, 0.133383005857467651367188f, 0.333331853151321411132812f);
#endif
  u = vmla_vf_vf_vf_vf(s, vmul_vf_vf_vf(u, x), x);
  u = vsel_vf_vo_vf_vf(o, vrec_vf_vf(u), u);
  return u;
#endif // #if !defined(DETERMINISTIC)
}

/* Higher-accuracy sinf: same reduction strategy as xsinf, but the reduced
   argument and the polynomial tail are carried in double-float (vfloat2)
   arithmetic (dfadd2/dfmul/dfsqu) before the final rounding to vfloat. */
EXPORT CONST VECTOR_CC vfloat xsinf_u1(vfloat d) {
#if !defined(DETERMINISTIC)
  vint2 q;
  vfloat u, v;
  vfloat2 s, t, x;
  if (LIKELY(vtestallones_i_vo32(vlt_vo_vf_vf(vabs_vf_vf(d), vcast_vf_f(TRIGRANGEMAX2f))))) {
    u =
vrint_vf_vf(vmul_vf_vf_vf(d, vcast_vf_f(M_1_PI)));
    q = vrint_vi2_vf(u);
    v = vmla_vf_vf_vf_vf(u, vcast_vf_f(-PI_A2f), d);
    /* Accumulate the reduction residue in double-float. */
    s = dfadd2_vf2_vf_vf(v, vmul_vf_vf_vf(u, vcast_vf_f(-PI_B2f)));
    s = dfadd_vf2_vf2_vf(s, vmul_vf_vf_vf(u, vcast_vf_f(-PI_C2f)));
  } else {
    dfi_t dfi = rempif(d);
    q = vand_vi2_vi2_vi2(dfigeti_vi2_dfi(dfi), vcast_vi2_i(3));
    q = vadd_vi2_vi2_vi2(vadd_vi2_vi2_vi2(q, q), vsel_vi2_vo_vi2_vi2(vgt_vo_vf_vf(vf2getx_vf_vf2(dfigetdf_vf2_dfi(dfi)), vcast_vf_f(0)), vcast_vi2_i(2), vcast_vi2_i(1)));
    q = vsra_vi2_vi2_i(q, 2);
    vopmask o = veq_vo_vi2_vi2(vand_vi2_vi2_vi2(dfigeti_vi2_dfi(dfi), vcast_vi2_i(1)), vcast_vi2_i(1));
    vfloat2 x = vcast_vf2_vf_vf(vmulsign_vf_vf_vf(vcast_vf_f(3.1415927410125732422f*-0.5), vf2getx_vf_vf2(dfigetdf_vf2_dfi(dfi))), vmulsign_vf_vf_vf(vcast_vf_f(-8.7422776573475857731e-08f*-0.5), vf2getx_vf_vf2(dfigetdf_vf2_dfi(dfi))));
    x = dfadd2_vf2_vf2_vf2(dfigetdf_vf2_dfi(dfi), x);
    dfi = dfisetdf_dfi_dfi_vf2(dfi, vsel_vf2_vo_vf2_vf2(o, x, dfigetdf_vf2_dfi(dfi)));
    s = dfnormalize_vf2_vf2(dfigetdf_vf2_dfi(dfi));
#if !defined(_MSC_VER)
    s = vf2setx_vf2_vf2_vf(s, vreinterpret_vf_vm(vor_vm_vo32_vm(vor_vo_vo_vo(visinf_vo_vf(d), visnan_vo_vf(d)), vreinterpret_vm_vf(vf2getx_vf_vf2(s)))));
#else
    /* MSVC path writes the member directly (presumably a workaround for the
       setter helper on that compiler -- kept as in upstream SLEEF). */
    s.x = vreinterpret_vf_vm(vor_vm_vo32_vm(vor_vo_vo_vo(visinf_vo_vf(d), visnan_vo_vf(d)), vreinterpret_vm_vf(s.x)));
#endif
  }
  t = s;
  s = dfsqu_vf2_vf2(s);
  u = vcast_vf_f(2.6083159809786593541503e-06f);
  u = vmla_vf_vf_vf_vf(u, vf2getx_vf_vf2(s), vcast_vf_f(-0.0001981069071916863322258f));
  u = vmla_vf_vf_vf_vf(u, vf2getx_vf_vf2(s), vcast_vf_f(0.00833307858556509017944336f));
  x = dfadd_vf2_vf_vf2(vcast_vf_f(1), dfmul_vf2_vf2_vf2(dfadd_vf2_vf_vf(vcast_vf_f(-0.166666597127914428710938f), vmul_vf_vf_vf(u, vf2getx_vf_vf2(s))), s));
  u = dfmul_vf_vf2_vf2(t, x);
  /* Negate on odd quadrants. */
  u = vreinterpret_vf_vm(vxor_vm_vm_vm(vand_vm_vo32_vm(veq_vo_vi2_vi2(vand_vi2_vi2_vi2(q, vcast_vi2_i(1)), vcast_vi2_i(1)), vreinterpret_vm_vf(vcast_vf_f(-0.0))), vreinterpret_vm_vf(u)));
  u = vsel_vf_vo_vf_vf(visnegzero_vo_vf(d), d, u);
  return u;
#else // #if !defined(DETERMINISTIC)
  /* Deterministic variant: blend the rempif path into the fast path. */
  vint2 q;
  vfloat u, v;
  vfloat2 s, t, x;
  u = vrint_vf_vf(vmul_vf_vf_vf(d, vcast_vf_f(M_1_PI)));
  q = vrint_vi2_vf(u);
  v = vmla_vf_vf_vf_vf(u, vcast_vf_f(-PI_A2f), d);
  s = dfadd2_vf2_vf_vf(v, vmul_vf_vf_vf(u, vcast_vf_f(-PI_B2f)));
  s = dfadd_vf2_vf2_vf(s, vmul_vf_vf_vf(u, vcast_vf_f(-PI_C2f)));
  vopmask g = vlt_vo_vf_vf(vabs_vf_vf(d), vcast_vf_f(TRIGRANGEMAX2f));
  if (!LIKELY(vtestallones_i_vo32(g))) {
    dfi_t dfi = rempif(d);
    vint2 q2 = vand_vi2_vi2_vi2(dfigeti_vi2_dfi(dfi), vcast_vi2_i(3));
    q2 = vadd_vi2_vi2_vi2(vadd_vi2_vi2_vi2(q2, q2), vsel_vi2_vo_vi2_vi2(vgt_vo_vf_vf(vf2getx_vf_vf2(dfigetdf_vf2_dfi(dfi)), vcast_vf_f(0)), vcast_vi2_i(2), vcast_vi2_i(1)));
    q2 = vsra_vi2_vi2_i(q2, 2);
    vopmask o = veq_vo_vi2_vi2(vand_vi2_vi2_vi2(dfigeti_vi2_dfi(dfi), vcast_vi2_i(1)), vcast_vi2_i(1));
    vfloat2 x = vcast_vf2_vf_vf(vmulsign_vf_vf_vf(vcast_vf_f(3.1415927410125732422f*-0.5), vf2getx_vf_vf2(dfigetdf_vf2_dfi(dfi))), vmulsign_vf_vf_vf(vcast_vf_f(-8.7422776573475857731e-08f*-0.5), vf2getx_vf_vf2(dfigetdf_vf2_dfi(dfi))));
    x = dfadd2_vf2_vf2_vf2(dfigetdf_vf2_dfi(dfi), x);
    dfi = dfisetdf_dfi_dfi_vf2(dfi, vsel_vf2_vo_vf2_vf2(o, x, dfigetdf_vf2_dfi(dfi)));
    t = dfnormalize_vf2_vf2(dfigetdf_vf2_dfi(dfi));
    t = vf2setx_vf2_vf2_vf(t, vreinterpret_vf_vm(vor_vm_vo32_vm(vor_vo_vo_vo(visinf_vo_vf(d), visnan_vo_vf(d)), vreinterpret_vm_vf(vf2getx_vf_vf2(t)))));
    q = vsel_vi2_vo_vi2_vi2(g, q, q2);
    s = vsel_vf2_vo_vf2_vf2(g, s, t);
  }
  t = s;
  s = dfsqu_vf2_vf2(s);
  u = vcast_vf_f(2.6083159809786593541503e-06f);
  u = vmla_vf_vf_vf_vf(u, vf2getx_vf_vf2(s), vcast_vf_f(-0.0001981069071916863322258f));
  u = vmla_vf_vf_vf_vf(u, vf2getx_vf_vf2(s), vcast_vf_f(0.00833307858556509017944336f));
  x = dfadd_vf2_vf_vf2(vcast_vf_f(1), dfmul_vf2_vf2_vf2(dfadd_vf2_vf_vf(vcast_vf_f(-0.166666597127914428710938f), vmul_vf_vf_vf(u, vf2getx_vf_vf2(s))), s));
  u = dfmul_vf_vf2_vf2(t, x);
  u =
vreinterpret_vf_vm(vxor_vm_vm_vm(vand_vm_vo32_vm(veq_vo_vi2_vi2(vand_vi2_vi2_vi2(q, vcast_vi2_i(1)), vcast_vi2_i(1)), vreinterpret_vm_vf(vcast_vf_f(-0.0))), vreinterpret_vm_vf(u)));
  u = vsel_vf_vo_vf_vf(visnegzero_vo_vf(d), d, u);
  return u;
#endif // #if !defined(DETERMINISTIC)
}

/* Higher-accuracy cosf: double-float reduction like xsinf_u1, with the
   half-period quadrant offset of xcosf (dq = 2*rint(d/pi - 0.5) + 1). */
EXPORT CONST VECTOR_CC vfloat xcosf_u1(vfloat d) {
#if !defined(DETERMINISTIC)
  vint2 q;
  vfloat u;
  vfloat2 s, t, x;
  if (LIKELY(vtestallones_i_vo32(vlt_vo_vf_vf(vabs_vf_vf(d), vcast_vf_f(TRIGRANGEMAX2f))))) {
    vfloat dq = vmla_vf_vf_vf_vf(vrint_vf_vf(vmla_vf_vf_vf_vf(d, vcast_vf_f(M_1_PI), vcast_vf_f(-0.5f))), vcast_vf_f(2), vcast_vf_f(1));
    q = vrint_vi2_vf(dq);
    s = dfadd2_vf2_vf_vf (d, vmul_vf_vf_vf(dq, vcast_vf_f(-PI_A2f*0.5f)));
    s = dfadd2_vf2_vf2_vf(s, vmul_vf_vf_vf(dq, vcast_vf_f(-PI_B2f*0.5f)));
    s = dfadd2_vf2_vf2_vf(s, vmul_vf_vf_vf(dq, vcast_vf_f(-PI_C2f*0.5f)));
  } else {
    dfi_t dfi = rempif(d);
    q = vand_vi2_vi2_vi2(dfigeti_vi2_dfi(dfi), vcast_vi2_i(3));
    q = vadd_vi2_vi2_vi2(vadd_vi2_vi2_vi2(q, q), vsel_vi2_vo_vi2_vi2(vgt_vo_vf_vf(vf2getx_vf_vf2(dfigetdf_vf2_dfi(dfi)), vcast_vf_f(0)), vcast_vi2_i(8), vcast_vi2_i(7)));
    q = vsra_vi2_vi2_i(q, 1);
    vopmask o = veq_vo_vi2_vi2(vand_vi2_vi2_vi2(dfigeti_vi2_dfi(dfi), vcast_vi2_i(1)), vcast_vi2_i(0));
    vfloat y = vsel_vf_vo_vf_vf(vgt_vo_vf_vf(vf2getx_vf_vf2(dfigetdf_vf2_dfi(dfi)), vcast_vf_f(0)), vcast_vf_f(0), vcast_vf_f(-1));
    vfloat2 x = vcast_vf2_vf_vf(vmulsign_vf_vf_vf(vcast_vf_f(3.1415927410125732422f*-0.5), y), vmulsign_vf_vf_vf(vcast_vf_f(-8.7422776573475857731e-08f*-0.5), y));
    x = dfadd2_vf2_vf2_vf2(dfigetdf_vf2_dfi(dfi), x);
    dfi = dfisetdf_dfi_dfi_vf2(dfi, vsel_vf2_vo_vf2_vf2(o, x, dfigetdf_vf2_dfi(dfi)));
    s = dfnormalize_vf2_vf2(dfigetdf_vf2_dfi(dfi));
#if !defined(_MSC_VER)
    s = vf2setx_vf2_vf2_vf(s, vreinterpret_vf_vm(vor_vm_vo32_vm(vor_vo_vo_vo(visinf_vo_vf(d), visnan_vo_vf(d)), vreinterpret_vm_vf(vf2getx_vf_vf2(s)))));
#else
    /* MSVC path writes the member directly (kept as in upstream SLEEF). */
    s.x = vreinterpret_vf_vm(vor_vm_vo32_vm(vor_vo_vo_vo(visinf_vo_vf(d), visnan_vo_vf(d)), vreinterpret_vm_vf(s.x)));
#endif
  }
  t = s;
  s = dfsqu_vf2_vf2(s);
  u = vcast_vf_f(2.6083159809786593541503e-06f);
  u = vmla_vf_vf_vf_vf(u, vf2getx_vf_vf2(s), vcast_vf_f(-0.0001981069071916863322258f));
  u = vmla_vf_vf_vf_vf(u, vf2getx_vf_vf2(s), vcast_vf_f(0.00833307858556509017944336f));
  x = dfadd_vf2_vf_vf2(vcast_vf_f(1), dfmul_vf2_vf2_vf2(dfadd_vf2_vf_vf(vcast_vf_f(-0.166666597127914428710938f), vmul_vf_vf_vf(u, vf2getx_vf_vf2(s))), s));
  u = dfmul_vf_vf2_vf2(t, x);
  /* Cosine sign keys off bit 1 of q. */
  u = vreinterpret_vf_vm(vxor_vm_vm_vm(vand_vm_vo32_vm(veq_vo_vi2_vi2(vand_vi2_vi2_vi2(q, vcast_vi2_i(2)), vcast_vi2_i(0)), vreinterpret_vm_vf(vcast_vf_f(-0.0))), vreinterpret_vm_vf(u)));
  return u;
#else // #if !defined(DETERMINISTIC)
  /* Deterministic variant. */
  vint2 q;
  vfloat u;
  vfloat2 s, t, x;
  vfloat dq = vmla_vf_vf_vf_vf(vrint_vf_vf(vmla_vf_vf_vf_vf(d, vcast_vf_f(M_1_PI), vcast_vf_f(-0.5f))), vcast_vf_f(2), vcast_vf_f(1));
  q = vrint_vi2_vf(dq);
  s = dfadd2_vf2_vf_vf (d, vmul_vf_vf_vf(dq, vcast_vf_f(-PI_A2f*0.5f)));
  s = dfadd2_vf2_vf2_vf(s, vmul_vf_vf_vf(dq, vcast_vf_f(-PI_B2f*0.5f)));
  s = dfadd2_vf2_vf2_vf(s, vmul_vf_vf_vf(dq, vcast_vf_f(-PI_C2f*0.5f)));
  vopmask g = vlt_vo_vf_vf(vabs_vf_vf(d), vcast_vf_f(TRIGRANGEMAX2f));
  if (!LIKELY(vtestallones_i_vo32(g))) {
    dfi_t dfi = rempif(d);
    vint2 q2 = vand_vi2_vi2_vi2(dfigeti_vi2_dfi(dfi), vcast_vi2_i(3));
    q2 = vadd_vi2_vi2_vi2(vadd_vi2_vi2_vi2(q2, q2), vsel_vi2_vo_vi2_vi2(vgt_vo_vf_vf(vf2getx_vf_vf2(dfigetdf_vf2_dfi(dfi)), vcast_vf_f(0)), vcast_vi2_i(8), vcast_vi2_i(7)));
    q2 = vsra_vi2_vi2_i(q2, 1);
    vopmask o = veq_vo_vi2_vi2(vand_vi2_vi2_vi2(dfigeti_vi2_dfi(dfi), vcast_vi2_i(1)), vcast_vi2_i(0));
    vfloat y = vsel_vf_vo_vf_vf(vgt_vo_vf_vf(vf2getx_vf_vf2(dfigetdf_vf2_dfi(dfi)), vcast_vf_f(0)), vcast_vf_f(0), vcast_vf_f(-1));
    vfloat2 x = vcast_vf2_vf_vf(vmulsign_vf_vf_vf(vcast_vf_f(3.1415927410125732422f*-0.5), y), vmulsign_vf_vf_vf(vcast_vf_f(-8.7422776573475857731e-08f*-0.5), y));
    x = dfadd2_vf2_vf2_vf2(dfigetdf_vf2_dfi(dfi), x);
    dfi = dfisetdf_dfi_dfi_vf2(dfi, vsel_vf2_vo_vf2_vf2(o, x,
dfigetdf_vf2_dfi(dfi)));
    t = dfnormalize_vf2_vf2(dfigetdf_vf2_dfi(dfi));
    t = vf2setx_vf2_vf2_vf(t, vreinterpret_vf_vm(vor_vm_vo32_vm(vor_vo_vo_vo(visinf_vo_vf(d), visnan_vo_vf(d)), vreinterpret_vm_vf(vf2getx_vf_vf2(t)))));
    q = vsel_vi2_vo_vi2_vi2(g, q, q2);
    s = vsel_vf2_vo_vf2_vf2(g, s, t);
  }
  t = s;
  s = dfsqu_vf2_vf2(s);
  u = vcast_vf_f(2.6083159809786593541503e-06f);
  u = vmla_vf_vf_vf_vf(u, vf2getx_vf_vf2(s), vcast_vf_f(-0.0001981069071916863322258f));
  u = vmla_vf_vf_vf_vf(u, vf2getx_vf_vf2(s), vcast_vf_f(0.00833307858556509017944336f));
  x = dfadd_vf2_vf_vf2(vcast_vf_f(1), dfmul_vf2_vf2_vf2(dfadd_vf2_vf_vf(vcast_vf_f(-0.166666597127914428710938f), vmul_vf_vf_vf(u, vf2getx_vf_vf2(s))), s));
  u = dfmul_vf_vf2_vf2(t, x);
  u = vreinterpret_vf_vm(vxor_vm_vm_vm(vand_vm_vo32_vm(veq_vo_vi2_vi2(vand_vi2_vi2_vi2(q, vcast_vi2_i(2)), vcast_vi2_i(0)), vreinterpret_vm_vf(vcast_vf_f(-0.0))), vreinterpret_vm_vf(u)));
  return u;
#endif // #if !defined(DETERMINISTIC)
}

/* Low-accuracy fast sinf (u3500 = relaxed error-bound class): single-constant
   reduction by pi and a 3-term polynomial; lanes with |d| >= 30 are recomputed
   with the full-accuracy xsinf and blended in. */
EXPORT CONST VECTOR_CC vfloat xfastsinf_u3500(vfloat d) {
  vint2 q;
  vfloat u, s, t = d;
  s = vmul_vf_vf_vf(d, vcast_vf_f((float)M_1_PI));
  u = vrint_vf_vf(s);
  q = vrint_vi2_vf(s);
  d = vmla_vf_vf_vf_vf(u, vcast_vf_f(-(float)M_PI), d);
  s = vmul_vf_vf_vf(d, d);
  u = vcast_vf_f(-0.1881748176e-3);
  u = vmla_vf_vf_vf_vf(u, s, vcast_vf_f(+0.8323502727e-2));
  u = vmla_vf_vf_vf_vf(u, s, vcast_vf_f(-0.1666651368e+0));
  u = vmla_vf_vf_vf_vf(vmul_vf_vf_vf(s, d), u, d);
  u = vreinterpret_vf_vm(vxor_vm_vm_vm(vand_vm_vo32_vm(veq_vo_vi2_vi2(vand_vi2_vi2_vi2(q, vcast_vi2_i(1)), vcast_vi2_i(1)), vreinterpret_vm_vf(vcast_vf_f(-0.0f))), vreinterpret_vm_vf(u)));
  vopmask g = vlt_vo_vf_vf(vabs_vf_vf(t), vcast_vf_f(30.0f));
  if (!LIKELY(vtestallones_i_vo32(g))) return vsel_vf_vo_vf_vf(g, u, xsinf(t));
  return u;
}

/* Low-accuracy fast cosf; same scheme as xfastsinf_u3500 with a pi/2 phase
   shift, falling back to xcosf for |d| >= 30. */
EXPORT CONST VECTOR_CC vfloat xfastcosf_u3500(vfloat d) {
  vint2 q;
  vfloat u, s, t = d;
  s = vmla_vf_vf_vf_vf(d, vcast_vf_f((float)M_1_PI), vcast_vf_f(-0.5f));
  u = vrint_vf_vf(s);
  q = vrint_vi2_vf(s);
  d = vmla_vf_vf_vf_vf(u,
vmla_vf_vf_vf_vf(u, vcast_vf_f(-PI_Cf*0.5f), s);
    s = vmla_vf_vf_vf_vf(u, vcast_vf_f(-PI_Df*0.5f), s);
  } else {
    dfi_t dfi = rempif(d);
    q = dfigeti_vi2_dfi(dfi);
    s = vadd_vf_vf_vf(vf2getx_vf_vf2(dfigetdf_vf2_dfi(dfi)), vf2gety_vf_vf2(dfigetdf_vf2_dfi(dfi)));
    s = vreinterpret_vf_vm(vor_vm_vo32_vm(vor_vo_vo_vo(visinf_vo_vf(d), visnan_vo_vf(d)), vreinterpret_vm_vf(s)));
  }
  t = s;
  s = vmul_vf_vf_vf(s, s);
  /* Sine polynomial -> rx. */
  u = vcast_vf_f(-0.000195169282960705459117889f);
  u = vmla_vf_vf_vf_vf(u, s, vcast_vf_f(0.00833215750753879547119141f));
  u = vmla_vf_vf_vf_vf(u, s, vcast_vf_f(-0.166666537523269653320312f));
  rx = vmla_vf_vf_vf_vf(vmul_vf_vf_vf(u, s), t, t);
  rx = vsel_vf_vo_vf_vf(visnegzero_vo_vf(d), vcast_vf_f(-0.0f), rx);
  /* Cosine polynomial -> ry. */
  u = vcast_vf_f(-2.71811842367242206819355e-07f);
  u = vmla_vf_vf_vf_vf(u, s, vcast_vf_f(2.47990446951007470488548e-05f));
  u = vmla_vf_vf_vf_vf(u, s, vcast_vf_f(-0.00138888787478208541870117f));
  u = vmla_vf_vf_vf_vf(u, s, vcast_vf_f(0.0416666641831398010253906f));
  u = vmla_vf_vf_vf_vf(u, s, vcast_vf_f(-0.5));
  ry = vmla_vf_vf_vf_vf(s, u, vcast_vf_f(1));
  /* Swap/negate the pair according to quadrant bits of q. */
  o = veq_vo_vi2_vi2(vand_vi2_vi2_vi2(q, vcast_vi2_i(1)), vcast_vi2_i(0));
  r = vf2setxy_vf2_vf_vf(vsel_vf_vo_vf_vf(o, rx, ry), vsel_vf_vo_vf_vf(o, ry, rx));
  o = veq_vo_vi2_vi2(vand_vi2_vi2_vi2(q, vcast_vi2_i(2)), vcast_vi2_i(2));
  r = vf2setx_vf2_vf2_vf(r, vreinterpret_vf_vm(vxor_vm_vm_vm(vand_vm_vo32_vm(o, vreinterpret_vm_vf(vcast_vf_f(-0.0))), vreinterpret_vm_vf(vf2getx_vf_vf2(r)))));
  o = veq_vo_vi2_vi2(vand_vi2_vi2_vi2(vadd_vi2_vi2_vi2(q, vcast_vi2_i(1)), vcast_vi2_i(2)), vcast_vi2_i(2));
  r = vf2sety_vf2_vf2_vf(r, vreinterpret_vf_vm(vxor_vm_vm_vm(vand_vm_vo32_vm(o, vreinterpret_vm_vf(vcast_vf_f(-0.0))), vreinterpret_vm_vf(vf2gety_vf_vf2(r)))));
  return r;
#else // #if !defined(DETERMINISTIC)
  /* Deterministic (blend-all-tiers) variant. */
  vint2 q;
  vopmask o;
  vfloat u, s, t, rx, ry;
  vfloat2 r;
  q = vrint_vi2_vf(vmul_vf_vf_vf(d, vcast_vf_f((float)M_2_PI)));
  u = vcast_vf_vi2(q);
  s = vmla_vf_vf_vf_vf(u, vcast_vf_f(-PI_A2f*0.5f), d);
  s = vmla_vf_vf_vf_vf(u, vcast_vf_f(-PI_B2f*0.5f), s);
  s = vmla_vf_vf_vf_vf(u, vcast_vf_f(-PI_C2f*0.5f), s);
  vopmask g = vlt_vo_vf_vf(vabs_vf_vf(d), vcast_vf_f(TRIGRANGEMAX2f));
  if (!LIKELY(vtestallones_i_vo32(g))) {
    vint2 q2 = vrint_vi2_vf(vmul_vf_vf_vf(d, vcast_vf_f((float)M_2_PI)));
    u = vcast_vf_vi2(q2);
    t = vmla_vf_vf_vf_vf(u, vcast_vf_f(-PI_Af*0.5f), d);
    t = vmla_vf_vf_vf_vf(u, vcast_vf_f(-PI_Bf*0.5f), t);
    t = vmla_vf_vf_vf_vf(u, vcast_vf_f(-PI_Cf*0.5f), t);
    t = vmla_vf_vf_vf_vf(u, vcast_vf_f(-PI_Df*0.5f), t);
    q = vsel_vi2_vo_vi2_vi2(g, q, q2);
    s = vsel_vf_vo_vf_vf(g, s, t);
    g = vlt_vo_vf_vf(vabs_vf_vf(d), vcast_vf_f(TRIGRANGEMAXf));
    if (!LIKELY(vtestallones_i_vo32(g))) {
      dfi_t dfi = rempif(d);
      t = vadd_vf_vf_vf(vf2getx_vf_vf2(dfigetdf_vf2_dfi(dfi)), vf2gety_vf_vf2(dfigetdf_vf2_dfi(dfi)));
      t = vreinterpret_vf_vm(vor_vm_vo32_vm(vor_vo_vo_vo(visinf_vo_vf(d), visnan_vo_vf(d)), vreinterpret_vm_vf(t)));
      q = vsel_vi2_vo_vi2_vi2(g, q, dfigeti_vi2_dfi(dfi));
      s = vsel_vf_vo_vf_vf(g, s, t);
    }
  }
  t = s;
  s = vmul_vf_vf_vf(s, s);
  u = vcast_vf_f(-0.000195169282960705459117889f);
  u = vmla_vf_vf_vf_vf(u, s, vcast_vf_f(0.00833215750753879547119141f));
  u = vmla_vf_vf_vf_vf(u, s, vcast_vf_f(-0.166666537523269653320312f));
  rx = vmla_vf_vf_vf_vf(vmul_vf_vf_vf(u, s), t, t);
  rx = vsel_vf_vo_vf_vf(visnegzero_vo_vf(d), vcast_vf_f(-0.0f), rx);
  u = vcast_vf_f(-2.71811842367242206819355e-07f);
  u = vmla_vf_vf_vf_vf(u, s, vcast_vf_f(2.47990446951007470488548e-05f));
  u = vmla_vf_vf_vf_vf(u, s, vcast_vf_f(-0.00138888787478208541870117f));
  u = vmla_vf_vf_vf_vf(u, s, vcast_vf_f(0.0416666641831398010253906f));
  u = vmla_vf_vf_vf_vf(u, s, vcast_vf_f(-0.5));
  ry = vmla_vf_vf_vf_vf(s, u, vcast_vf_f(1));
  o = veq_vo_vi2_vi2(vand_vi2_vi2_vi2(q, vcast_vi2_i(1)), vcast_vi2_i(0));
  r = vf2setxy_vf2_vf_vf(vsel_vf_vo_vf_vf(o, rx, ry), vsel_vf_vo_vf_vf(o, ry, rx));
  o = veq_vo_vi2_vi2(vand_vi2_vi2_vi2(q, vcast_vi2_i(2)), vcast_vi2_i(2));
  r = vf2setx_vf2_vf2_vf(r, vreinterpret_vf_vm(vxor_vm_vm_vm(vand_vm_vo32_vm(o, vreinterpret_vm_vf(vcast_vf_f(-0.0))), vreinterpret_vm_vf(vf2getx_vf_vf2(r)))));
  o = veq_vo_vi2_vi2(vand_vi2_vi2_vi2(vadd_vi2_vi2_vi2(q, vcast_vi2_i(1)), vcast_vi2_i(2)), vcast_vi2_i(2));
  r = vf2sety_vf2_vf2_vf(r, vreinterpret_vf_vm(vxor_vm_vm_vm(vand_vm_vo32_vm(o, vreinterpret_vm_vf(vcast_vf_f(-0.0))), vreinterpret_vm_vf(vf2gety_vf_vf2(r)))));
  return r;
#endif // #if !defined(DETERMINISTIC)
}

/* Higher-accuracy sincosf: double-float reduction, sine/cosine polynomials
   evaluated once and routed into (x, y) by the quadrant bits. */
TYPE2_FUNCATR VECTOR_CC vfloat2 XSINCOSF_U1(vfloat d) {
#if !defined(DETERMINISTIC)
  vint2 q;
  vopmask o;
  vfloat u, v, rx, ry;
  vfloat2 r, s, t, x;
  if (LIKELY(vtestallones_i_vo32(vlt_vo_vf_vf(vabs_vf_vf(d), vcast_vf_f(TRIGRANGEMAX2f))))) {
    u = vrint_vf_vf(vmul_vf_vf_vf(d, vcast_vf_f(2 * M_1_PI)));
    q = vrint_vi2_vf(u);
    v = vmla_vf_vf_vf_vf(u, vcast_vf_f(-PI_A2f*0.5f), d);
    s = dfadd2_vf2_vf_vf(v, vmul_vf_vf_vf(u, vcast_vf_f(-PI_B2f*0.5f)));
    s = dfadd_vf2_vf2_vf(s, vmul_vf_vf_vf(u, vcast_vf_f(-PI_C2f*0.5f)));
  } else {
    dfi_t dfi = rempif(d);
    q = dfigeti_vi2_dfi(dfi);
    s = dfigetdf_vf2_dfi(dfi);
    o = vor_vo_vo_vo(visinf_vo_vf(d), visnan_vo_vf(d));
    s = vf2setx_vf2_vf2_vf(s, vreinterpret_vf_vm(vor_vm_vo32_vm(o, vreinterpret_vm_vf(vf2getx_vf_vf2(s)))));
  }
  t = s;
  s = vf2setx_vf2_vf2_vf(s, dfsqu_vf_vf2(s));
  u = vcast_vf_f(-0.000195169282960705459117889f);
  u = vmla_vf_vf_vf_vf(u, vf2getx_vf_vf2(s), vcast_vf_f(0.00833215750753879547119141f));
  u = vmla_vf_vf_vf_vf(u, vf2getx_vf_vf2(s), vcast_vf_f(-0.166666537523269653320312f));
  u = vmul_vf_vf_vf(u, vmul_vf_vf_vf(vf2getx_vf_vf2(s), vf2getx_vf_vf2(t)));
  x = dfadd_vf2_vf2_vf(t, u);
  rx = vadd_vf_vf_vf(vf2getx_vf_vf2(x), vf2gety_vf_vf2(x));
  rx = vsel_vf_vo_vf_vf(visnegzero_vo_vf(d), vcast_vf_f(-0.0f), rx);
  u = vcast_vf_f(-2.71811842367242206819355e-07f);
  u = vmla_vf_vf_vf_vf(u, vf2getx_vf_vf2(s), vcast_vf_f(2.47990446951007470488548e-05f));
  u = vmla_vf_vf_vf_vf(u, vf2getx_vf_vf2(s), vcast_vf_f(-0.00138888787478208541870117f));
  u = vmla_vf_vf_vf_vf(u, vf2getx_vf_vf2(s), vcast_vf_f(0.0416666641831398010253906f));
  u = vmla_vf_vf_vf_vf(u,
vf2getx_vf_vf2(s), vcast_vf_f(-0.5));
  x = dfadd_vf2_vf_vf2(vcast_vf_f(1), dfmul_vf2_vf_vf(vf2getx_vf_vf2(s), u));
  ry = vadd_vf_vf_vf(vf2getx_vf_vf2(x), vf2gety_vf_vf2(x));
  /* Route sin/cos into (x, y) and fix the signs from the quadrant bits. */
  o = veq_vo_vi2_vi2(vand_vi2_vi2_vi2(q, vcast_vi2_i(1)), vcast_vi2_i(0));
  r = vf2setxy_vf2_vf_vf(vsel_vf_vo_vf_vf(o, rx, ry), vsel_vf_vo_vf_vf(o, ry, rx));
  o = veq_vo_vi2_vi2(vand_vi2_vi2_vi2(q, vcast_vi2_i(2)), vcast_vi2_i(2));
  r = vf2setx_vf2_vf2_vf(r, vreinterpret_vf_vm(vxor_vm_vm_vm(vand_vm_vo32_vm(o, vreinterpret_vm_vf(vcast_vf_f(-0.0))), vreinterpret_vm_vf(vf2getx_vf_vf2(r)))));
  o = veq_vo_vi2_vi2(vand_vi2_vi2_vi2(vadd_vi2_vi2_vi2(q, vcast_vi2_i(1)), vcast_vi2_i(2)), vcast_vi2_i(2));
  r = vf2sety_vf2_vf2_vf(r, vreinterpret_vf_vm(vxor_vm_vm_vm(vand_vm_vo32_vm(o, vreinterpret_vm_vf(vcast_vf_f(-0.0))), vreinterpret_vm_vf(vf2gety_vf_vf2(r)))));
  return r;
#else // #if !defined(DETERMINISTIC)
  /* Deterministic variant of XSINCOSF_U1. */
  vint2 q;
  vopmask o;
  vfloat u, v, rx, ry;
  vfloat2 r, s, t, x;
  u = vrint_vf_vf(vmul_vf_vf_vf(d, vcast_vf_f(2 * M_1_PI)));
  q = vrint_vi2_vf(u);
  v = vmla_vf_vf_vf_vf(u, vcast_vf_f(-PI_A2f*0.5f), d);
  s = dfadd2_vf2_vf_vf(v, vmul_vf_vf_vf(u, vcast_vf_f(-PI_B2f*0.5f)));
  s = dfadd_vf2_vf2_vf(s, vmul_vf_vf_vf(u, vcast_vf_f(-PI_C2f*0.5f)));
  vopmask g = vlt_vo_vf_vf(vabs_vf_vf(d), vcast_vf_f(TRIGRANGEMAX2f));
  if (!LIKELY(vtestallones_i_vo32(g))) {
    dfi_t dfi = rempif(d);
    t = dfigetdf_vf2_dfi(dfi);
    o = vor_vo_vo_vo(visinf_vo_vf(d), visnan_vo_vf(d));
    t = vf2setx_vf2_vf2_vf(t, vreinterpret_vf_vm(vor_vm_vo32_vm(o, vreinterpret_vm_vf(vf2getx_vf_vf2(t)))));
    q = vsel_vi2_vo_vi2_vi2(g, q, dfigeti_vi2_dfi(dfi));
    s = vsel_vf2_vo_vf2_vf2(g, s, t);
  }
  t = s;
  s = vf2setx_vf2_vf2_vf(s, dfsqu_vf_vf2(s));
  u = vcast_vf_f(-0.000195169282960705459117889f);
  u = vmla_vf_vf_vf_vf(u, vf2getx_vf_vf2(s), vcast_vf_f(0.00833215750753879547119141f));
  u = vmla_vf_vf_vf_vf(u, vf2getx_vf_vf2(s), vcast_vf_f(-0.166666537523269653320312f));
  u = vmul_vf_vf_vf(u, vmul_vf_vf_vf(vf2getx_vf_vf2(s), vf2getx_vf_vf2(t)));
  x = dfadd_vf2_vf2_vf(t, u);
  rx = vadd_vf_vf_vf(vf2getx_vf_vf2(x), vf2gety_vf_vf2(x));
  rx = vsel_vf_vo_vf_vf(visnegzero_vo_vf(d), vcast_vf_f(-0.0f), rx);
  u = vcast_vf_f(-2.71811842367242206819355e-07f);
  u = vmla_vf_vf_vf_vf(u, vf2getx_vf_vf2(s), vcast_vf_f(2.47990446951007470488548e-05f));
  u = vmla_vf_vf_vf_vf(u, vf2getx_vf_vf2(s), vcast_vf_f(-0.00138888787478208541870117f));
  u = vmla_vf_vf_vf_vf(u, vf2getx_vf_vf2(s), vcast_vf_f(0.0416666641831398010253906f));
  u = vmla_vf_vf_vf_vf(u, vf2getx_vf_vf2(s), vcast_vf_f(-0.5));
  x = dfadd_vf2_vf_vf2(vcast_vf_f(1), dfmul_vf2_vf_vf(vf2getx_vf_vf2(s), u));
  ry = vadd_vf_vf_vf(vf2getx_vf_vf2(x), vf2gety_vf_vf2(x));
  o = veq_vo_vi2_vi2(vand_vi2_vi2_vi2(q, vcast_vi2_i(1)), vcast_vi2_i(0));
  r = vf2setxy_vf2_vf_vf(vsel_vf_vo_vf_vf(o, rx, ry), vsel_vf_vo_vf_vf(o, ry, rx));
  o = veq_vo_vi2_vi2(vand_vi2_vi2_vi2(q, vcast_vi2_i(2)), vcast_vi2_i(2));
  r = vf2setx_vf2_vf2_vf(r, vreinterpret_vf_vm(vxor_vm_vm_vm(vand_vm_vo32_vm(o, vreinterpret_vm_vf(vcast_vf_f(-0.0))), vreinterpret_vm_vf(vf2getx_vf_vf2(r)))));
  o = veq_vo_vi2_vi2(vand_vi2_vi2_vi2(vadd_vi2_vi2_vi2(q, vcast_vi2_i(1)), vcast_vi2_i(2)), vcast_vi2_i(2));
  r = vf2sety_vf2_vf2_vf(r, vreinterpret_vf_vm(vxor_vm_vm_vm(vand_vm_vo32_vm(o, vreinterpret_vm_vf(vcast_vf_f(-0.0))), vreinterpret_vm_vf(vf2gety_vf_vf2(r)))));
  return r;
#endif // #if !defined(DETERMINISTIC)
}

#if !defined(DETERMINISTIC)
/* sincos(pi*d), 0.5-ULP class: the quadrant comes from truncating 4*d with
   round-to-even adjustment, and both polynomials carry their leading terms in
   double-float constants.  Results are zeroed for |d| > 1e7 and nan'ed for
   infinite inputs at the end. */
TYPE2_FUNCATR VECTOR_CC vfloat2 XSINCOSPIF_U05(vfloat d) {
  vopmask o;
  vfloat u, s, t, rx, ry;
  vfloat2 r, x, s2;
  u = vmul_vf_vf_vf(d, vcast_vf_f(4));
  vint2 q = vtruncate_vi2_vf(u);
  q = vand_vi2_vi2_vi2(vadd_vi2_vi2_vi2(q, vxor_vi2_vi2_vi2(vsrl_vi2_vi2_i(q, 31), vcast_vi2_i(1))), vcast_vi2_i(~1));
  s = vsub_vf_vf_vf(u, vcast_vf_vi2(q));
  t = s;
  s = vmul_vf_vf_vf(s, s);
  s2 = dfmul_vf2_vf_vf(t, t);
  /* Sine part. */
  //
  u = vcast_vf_f(+0.3093842054e-6);
  u = vmla_vf_vf_vf_vf(u, s, vcast_vf_f(-0.3657307388e-4));
  u = vmla_vf_vf_vf_vf(u, s, vcast_vf_f(+0.2490393585e-2));
  x = dfadd2_vf2_vf_vf2(vmul_vf_vf_vf(u, s), vcast_vf2_f_f(-0.080745510756969451904, -1.3373665339076936258e-09));
  x = dfadd2_vf2_vf2_vf2(dfmul_vf2_vf2_vf2(s2, x), vcast_vf2_f_f(0.78539818525314331055, -2.1857338617566484855e-08));
  x = dfmul_vf2_vf2_vf(x, t);
  rx = vadd_vf_vf_vf(vf2getx_vf_vf2(x), vf2gety_vf_vf2(x));
  rx = vsel_vf_vo_vf_vf(visnegzero_vo_vf(d), vcast_vf_f(-0.0f), rx);
  /* Cosine part. */
  //
  u = vcast_vf_f(-0.2430611801e-7);
  u = vmla_vf_vf_vf_vf(u, s, vcast_vf_f(+0.3590577080e-5));
  u = vmla_vf_vf_vf_vf(u, s, vcast_vf_f(-0.3259917721e-3));
  x = dfadd2_vf2_vf_vf2(vmul_vf_vf_vf(u, s), vcast_vf2_f_f(0.015854343771934509277, 4.4940051354032242811e-10));
  x = dfadd2_vf2_vf2_vf2(dfmul_vf2_vf2_vf2(s2, x), vcast_vf2_f_f(-0.30842512845993041992, -9.0728339030733922277e-09));
  x = dfadd2_vf2_vf2_vf(dfmul_vf2_vf2_vf2(x, s2), vcast_vf_f(1));
  ry = vadd_vf_vf_vf(vf2getx_vf_vf2(x), vf2gety_vf_vf2(x));
  /* Quadrant routing and sign fixes. */
  //
  o = veq_vo_vi2_vi2(vand_vi2_vi2_vi2(q, vcast_vi2_i(2)), vcast_vi2_i(0));
  r = vf2setxy_vf2_vf_vf(vsel_vf_vo_vf_vf(o, rx, ry), vsel_vf_vo_vf_vf(o, ry, rx));
  o = veq_vo_vi2_vi2(vand_vi2_vi2_vi2(q, vcast_vi2_i(4)), vcast_vi2_i(4));
  r = vf2setx_vf2_vf2_vf(r, vreinterpret_vf_vm(vxor_vm_vm_vm(vand_vm_vo32_vm(o, vreinterpret_vm_vf(vcast_vf_f(-0.0))), vreinterpret_vm_vf(vf2getx_vf_vf2(r)))));
  o = veq_vo_vi2_vi2(vand_vi2_vi2_vi2(vadd_vi2_vi2_vi2(q, vcast_vi2_i(2)), vcast_vi2_i(4)), vcast_vi2_i(4));
  r = vf2sety_vf2_vf2_vf(r, vreinterpret_vf_vm(vxor_vm_vm_vm(vand_vm_vo32_vm(o, vreinterpret_vm_vf(vcast_vf_f(-0.0))), vreinterpret_vm_vf(vf2gety_vf_vf2(r)))));
  o = vgt_vo_vf_vf(vabs_vf_vf(d), vcast_vf_f(1e+7f));
  r = vf2setx_vf2_vf2_vf(r, vreinterpret_vf_vm(vandnot_vm_vo32_vm(o, vreinterpret_vm_vf(vf2getx_vf_vf2(r)))));
  r = vf2sety_vf2_vf2_vf(r, vreinterpret_vf_vm(vandnot_vm_vo32_vm(o, vreinterpret_vm_vf(vf2gety_vf_vf2(r)))));
  o = visinf_vo_vf(d);
  r = vf2setx_vf2_vf2_vf(r, vreinterpret_vf_vm(vor_vm_vo32_vm(o, vreinterpret_vm_vf(vf2getx_vf_vf2(r)))));
  r = vf2sety_vf2_vf2_vf(r, vreinterpret_vf_vm(vor_vm_vo32_vm(o, vreinterpret_vm_vf(vf2gety_vf_vf2(r)))));
  return r;
}

/* (next definition continues past this chunk) */
TYPE2_FUNCATR VECTOR_CC
/* sin(pi*d)/cos(pi*d) pair, faster ~3.5 ULP variant (SLEEF "_U35" convention).
   Same octant reduction as the U05 version, but plain single-precision
   polynomials with no double-float correction terms. */
vfloat2 XSINCOSPIF_U35(vfloat d) {
  vopmask o;
  vfloat u, s, t, rx, ry;
  vfloat2 r;

  u = vmul_vf_vf_vf(d, vcast_vf_f(4));
  vint2 q = vtruncate_vi2_vf(u);
  q = vand_vi2_vi2_vi2(vadd_vi2_vi2_vi2(q, vxor_vi2_vi2_vi2(vsrl_vi2_vi2_i(q, 31), vcast_vi2_i(1))), vcast_vi2_i(~1));
  s = vsub_vf_vf_vf(u, vcast_vf_vi2(q));

  t = s;
  s = vmul_vf_vf_vf(s, s);

  /* Sine polynomial in s = t^2, final multiply by t. */
  u = vcast_vf_f(-0.3600925265e-4);
  u = vmla_vf_vf_vf_vf(u, s, vcast_vf_f(+0.2490088111e-2));
  u = vmla_vf_vf_vf_vf(u, s, vcast_vf_f(-0.8074551076e-1));
  u = vmla_vf_vf_vf_vf(u, s, vcast_vf_f(+0.7853981853e+0));
  rx = vmul_vf_vf_vf(u, t);

  /* Cosine polynomial. */
  u = vcast_vf_f(+0.3539815225e-5);
  u = vmla_vf_vf_vf_vf(u, s, vcast_vf_f(-0.3259574005e-3));
  u = vmla_vf_vf_vf_vf(u, s, vcast_vf_f(+0.1585431583e-1));
  u = vmla_vf_vf_vf_vf(u, s, vcast_vf_f(-0.3084251285e+0));
  u = vmla_vf_vf_vf_vf(u, s, vcast_vf_f(1));
  ry = u;

  /* Octant swap and sign handling, then the same special cases as the U05 variant. */
  o = veq_vo_vi2_vi2(vand_vi2_vi2_vi2(q, vcast_vi2_i(2)), vcast_vi2_i(0));
  r = vf2setxy_vf2_vf_vf(vsel_vf_vo_vf_vf(o, rx, ry), vsel_vf_vo_vf_vf(o, ry, rx));
  o = veq_vo_vi2_vi2(vand_vi2_vi2_vi2(q, vcast_vi2_i(4)), vcast_vi2_i(4));
  r = vf2setx_vf2_vf2_vf(r, vreinterpret_vf_vm(vxor_vm_vm_vm(vand_vm_vo32_vm(o, vreinterpret_vm_vf(vcast_vf_f(-0.0))), vreinterpret_vm_vf(vf2getx_vf_vf2(r)))));
  o = veq_vo_vi2_vi2(vand_vi2_vi2_vi2(vadd_vi2_vi2_vi2(q, vcast_vi2_i(2)), vcast_vi2_i(4)), vcast_vi2_i(4));
  r = vf2sety_vf2_vf2_vf(r, vreinterpret_vf_vm(vxor_vm_vm_vm(vand_vm_vo32_vm(o, vreinterpret_vm_vf(vcast_vf_f(-0.0))), vreinterpret_vm_vf(vf2gety_vf_vf2(r)))));
  o = vgt_vo_vf_vf(vabs_vf_vf(d), vcast_vf_f(1e+7f));
  r = vf2setx_vf2_vf2_vf(r, vreinterpret_vf_vm(vandnot_vm_vo32_vm(o, vreinterpret_vm_vf(vf2getx_vf_vf2(r)))));
  r = vf2sety_vf2_vf2_vf(r, vreinterpret_vf_vm(vandnot_vm_vo32_vm(o, vreinterpret_vm_vf(vf2gety_vf_vf2(r)))));
  o = visinf_vo_vf(d);
  r = vf2setx_vf2_vf2_vf(r, vreinterpret_vf_vm(vor_vm_vo32_vm(o, vreinterpret_vm_vf(vf2getx_vf_vf2(r)))));
  r = vf2sety_vf2_vf2_vf(r, vreinterpret_vf_vm(vor_vm_vo32_vm(o, vreinterpret_vm_vf(vf2gety_vf_vf2(r)))));
  return r;
}

/* modf-style split: x lane = fractional part, y lane = integral part, both
   carrying the sign of x.  |x| >= 2^23 has no fractional bits, so fr is 0. */
TYPE6_FUNCATR VECTOR_CC vfloat2 XMODFF(vfloat x) {
  vfloat fr = vsub_vf_vf_vf(x, vcast_vf_vi2(vtruncate_vi2_vf(x)));
  fr = vsel_vf_vo_vf_vf(vgt_vo_vf_vf(vabs_vf_vf(x), vcast_vf_f(INT64_C(1) << 23)), vcast_vf_f(0), fr);
  vfloat2 ret;
  ret = vf2setxy_vf2_vf_vf(vcopysign_vf_vf_vf(fr, x), vcopysign_vf_vf_vf(vsub_vf_vf_vf(x, fr), x));
  return ret;
}

#ifdef ENABLE_GNUABI
/* GNU-ABI entry points: compute the sin/cos pair with the internal kernel and
   store each lane through the caller-provided pointers (unaligned stores). */
EXPORT VECTOR_CC void xsincosf(vfloat a, float *ps, float *pc) {
  vfloat2 r = sincosfk(a);
  vstoreu_v_p_vf(ps, vf2getx_vf_vf2(r));
  vstoreu_v_p_vf(pc, vf2gety_vf_vf2(r));
}

EXPORT VECTOR_CC void xsincosf_u1(vfloat a, float *ps, float *pc) {
  vfloat2 r = sincosfk_u1(a);
  vstoreu_v_p_vf(ps, vf2getx_vf_vf2(r));
  vstoreu_v_p_vf(pc, vf2gety_vf_vf2(r));
}

EXPORT VECTOR_CC void xsincospif_u05(vfloat a, float *ps, float *pc) {
  vfloat2 r = sincospifk_u05(a);
  vstoreu_v_p_vf(ps, vf2getx_vf_vf2(r));
  vstoreu_v_p_vf(pc, vf2gety_vf_vf2(r));
}

EXPORT VECTOR_CC void xsincospif_u35(vfloat a, float *ps, float *pc) {
  vfloat2 r = sincospifk_u35(a);
  vstoreu_v_p_vf(ps, vf2getx_vf_vf2(r));
  vstoreu_v_p_vf(pc, vf2gety_vf_vf2(r));
}

EXPORT CONST VECTOR_CC vfloat xmodff(vfloat a, float *iptr) {
  vfloat2 r = modffk(a);
  vstoreu_v_p_vf(iptr, vf2gety_vf_vf2(r));
  return vf2getx_vf_vf2(r);
}
#endif // #ifdef ENABLE_GNUABI
#endif // #if !defined(DETERMINISTIC)

/* tan(d), ~1 ULP variant.  Reduction modulo pi/2 as in the sincos kernels;
   the df pair s is conditionally negated for odd quadrants, a polynomial in
   s^2 is evaluated, and for odd quadrants the reciprocal is taken at the end. */
EXPORT CONST VECTOR_CC vfloat xtanf_u1(vfloat d) {
#if !defined(DETERMINISTIC)
  vint2 q;
  vfloat u, v;
  vfloat2 s, t, x;
  vopmask o;

  if (LIKELY(vtestallones_i_vo32(vlt_vo_vf_vf(vabs_vf_vf(d), vcast_vf_f(TRIGRANGEMAX2f))))) {
    u = vrint_vf_vf(vmul_vf_vf_vf(d, vcast_vf_f(2 * M_1_PI)));
    q = vrint_vi2_vf(u);
    v = vmla_vf_vf_vf_vf(u, vcast_vf_f(-PI_A2f*0.5f), d);
    s = dfadd2_vf2_vf_vf(v, vmul_vf_vf_vf(u, vcast_vf_f(-PI_B2f*0.5f)));
    s = dfadd_vf2_vf2_vf(s, vmul_vf_vf_vf(u, vcast_vf_f(-PI_C2f*0.5f)));
  } else {
    dfi_t dfi = rempif(d);
    q = dfigeti_vi2_dfi(dfi);
    s = dfigetdf_vf2_dfi(dfi);
    o = vor_vo_vo_vo(visinf_vo_vf(d), visnan_vo_vf(d));
    /* Inf/NaN input: force NaN into both halves of the df pair. */
    s = vf2setx_vf2_vf2_vf(s, vreinterpret_vf_vm(vor_vm_vo32_vm(o, vreinterpret_vm_vf(vf2getx_vf_vf2(s)))));
    s = vf2sety_vf2_vf2_vf(s, vreinterpret_vf_vm(vor_vm_vo32_vm(o, vreinterpret_vm_vf(vf2gety_vf_vf2(s)))));
  }

  /* Negate s in odd quadrants (sign mask n), with an MSVC-specific spelling. */
  o = veq_vo_vi2_vi2(vand_vi2_vi2_vi2(q, vcast_vi2_i(1)), vcast_vi2_i(1));
  vmask n = vand_vm_vo32_vm(o, vreinterpret_vm_vf(vcast_vf_f(-0.0)));
#if !defined(_MSC_VER)
  s = vf2setx_vf2_vf2_vf(s, vreinterpret_vf_vm(vxor_vm_vm_vm(vreinterpret_vm_vf(vf2getx_vf_vf2(s)), n)));
  s = vf2sety_vf2_vf2_vf(s, vreinterpret_vf_vm(vxor_vm_vm_vm(vreinterpret_vm_vf(vf2gety_vf_vf2(s)), n)));
#else
  s.x = vreinterpret_vf_vm(vxor_vm_vm_vm(vreinterpret_vm_vf(s.x), n));
  s.y = vreinterpret_vf_vm(vxor_vm_vm_vm(vreinterpret_vm_vf(s.y), n));
#endif

  t = s;
  s = dfsqu_vf2_vf2(s);
  s = dfnormalize_vf2_vf2(s);

  u = vcast_vf_f(0.00446636462584137916564941f);
  u = vmla_vf_vf_vf_vf(u, vf2getx_vf_vf2(s), vcast_vf_f(-8.3920182078145444393158e-05f));
  u = vmla_vf_vf_vf_vf(u, vf2getx_vf_vf2(s), vcast_vf_f(0.0109639242291450500488281f));
  u = vmla_vf_vf_vf_vf(u, vf2getx_vf_vf2(s), vcast_vf_f(0.0212360303848981857299805f));
  u = vmla_vf_vf_vf_vf(u, vf2getx_vf_vf2(s), vcast_vf_f(0.0540687143802642822265625f));

  x = dfadd_vf2_vf_vf(vcast_vf_f(0.133325666189193725585938f), vmul_vf_vf_vf(u, vf2getx_vf_vf2(s)));
  x = dfadd_vf2_vf_vf2(vcast_vf_f(1), dfmul_vf2_vf2_vf2(dfadd_vf2_vf_vf2(vcast_vf_f(0.33333361148834228515625f), dfmul_vf2_vf2_vf2(s, x)), s));
  x = dfmul_vf2_vf2_vf2(t, x);

  /* Odd quadrant: tan = -1/tan of the reduced argument, via df reciprocal. */
  x = vsel_vf2_vo_vf2_vf2(o, dfrec_vf2_vf2(x), x);

  u = vadd_vf_vf_vf(vf2getx_vf_vf2(x), vf2gety_vf_vf2(x));
  u = vsel_vf_vo_vf_vf(visnegzero_vo_vf(d), d, u); /* tan(-0) = -0 */
  return u;
#else // #if !defined(DETERMINISTIC)
  /* Branch-free variant, as in XSINCOSF_U1 above. */
  vint2 q;
  vfloat u, v;
  vfloat2 s, t, x;
  vopmask o;

  u = vrint_vf_vf(vmul_vf_vf_vf(d, vcast_vf_f(2 * M_1_PI)));
  q = vrint_vi2_vf(u);
  v = vmla_vf_vf_vf_vf(u, vcast_vf_f(-PI_A2f*0.5f), d);
  s = dfadd2_vf2_vf_vf(v, vmul_vf_vf_vf(u, vcast_vf_f(-PI_B2f*0.5f)));
  s = dfadd_vf2_vf2_vf(s, vmul_vf_vf_vf(u, vcast_vf_f(-PI_C2f*0.5f)));
  vopmask g = vlt_vo_vf_vf(vabs_vf_vf(d), vcast_vf_f(TRIGRANGEMAX2f));

  if (!LIKELY(vtestallones_i_vo32(g))) {
    dfi_t dfi = rempif(d);
    t = dfigetdf_vf2_dfi(dfi);
    o = vor_vo_vo_vo(visinf_vo_vf(d), visnan_vo_vf(d));
    t = vf2setx_vf2_vf2_vf(t, vreinterpret_vf_vm(vor_vm_vo32_vm(o, vreinterpret_vm_vf(vf2getx_vf_vf2(t)))));
    t = vf2sety_vf2_vf2_vf(t, vreinterpret_vf_vm(vor_vm_vo32_vm(o, vreinterpret_vm_vf(vf2gety_vf_vf2(t)))));
    q = vsel_vi2_vo_vi2_vi2(g, q, dfigeti_vi2_dfi(dfi));
    s = vsel_vf2_vo_vf2_vf2(g, s, t);
  }

  o = veq_vo_vi2_vi2(vand_vi2_vi2_vi2(q, vcast_vi2_i(1)), vcast_vi2_i(1));
  vmask n = vand_vm_vo32_vm(o, vreinterpret_vm_vf(vcast_vf_f(-0.0)));
  s = vf2setx_vf2_vf2_vf(s, vreinterpret_vf_vm(vxor_vm_vm_vm(vreinterpret_vm_vf(vf2getx_vf_vf2(s)), n)));
  s = vf2sety_vf2_vf2_vf(s, vreinterpret_vf_vm(vxor_vm_vm_vm(vreinterpret_vm_vf(vf2gety_vf_vf2(s)), n)));

  t = s;
  s = dfsqu_vf2_vf2(s);
  s = dfnormalize_vf2_vf2(s);

  u = vcast_vf_f(0.00446636462584137916564941f);
  u = vmla_vf_vf_vf_vf(u, vf2getx_vf_vf2(s), vcast_vf_f(-8.3920182078145444393158e-05f));
  u = vmla_vf_vf_vf_vf(u, vf2getx_vf_vf2(s), vcast_vf_f(0.0109639242291450500488281f));
  u = vmla_vf_vf_vf_vf(u, vf2getx_vf_vf2(s), vcast_vf_f(0.0212360303848981857299805f));
  u = vmla_vf_vf_vf_vf(u, vf2getx_vf_vf2(s), vcast_vf_f(0.0540687143802642822265625f));

  x = dfadd_vf2_vf_vf(vcast_vf_f(0.133325666189193725585938f), vmul_vf_vf_vf(u, vf2getx_vf_vf2(s)));
  x = dfadd_vf2_vf_vf2(vcast_vf_f(1), dfmul_vf2_vf2_vf2(dfadd_vf2_vf_vf2(vcast_vf_f(0.33333361148834228515625f), dfmul_vf2_vf2_vf2(s, x)), s));
  x = dfmul_vf2_vf2_vf2(t, x);

  x = vsel_vf2_vo_vf2_vf2(o, dfrec_vf2_vf2(x), x);

  u = vadd_vf_vf_vf(vf2getx_vf_vf2(x), vf2gety_vf_vf2(x));
  u = vsel_vf_vo_vf_vf(visnegzero_vo_vf(d), d, u);
  return u;
#endif // #if !defined(DETERMINISTIC)
}

#if !defined(DETERMINISTIC)
/* atan(d), 3.5 ULP class.  q records sign (bit 1) and reciprocal use (bit 0);
   the statement below continues on the next source line. */
EXPORT CONST VECTOR_CC vfloat xatanf(vfloat d) {
  vfloat s, t, u;
  vint2 q;

  q = vsel_vi2_vf_vi2(d, vcast_vi2_i(2));
  s = vabs_vf_vf(d);

  q = vsel_vi2_vf_vf_vi2_vi2(vcast_vf_f(1.0f), s, vadd_vi2_vi2_vi2(q, vcast_vi2_i(1)), q);
  s =
/* Continuation of xatanf: reduce |d| > 1 via the reciprocal, evaluate the
   odd polynomial with POLY8, then undo the reduction and restore the sign. */
  vsel_vf_vo_vf_vf(vlt_vo_vf_vf(vcast_vf_f(1.0f), s), vrec_vf_vf(s), s);

  t = vmul_vf_vf_vf(s, s);

  vfloat t2 = vmul_vf_vf_vf(t, t), t4 = vmul_vf_vf_vf(t2, t2);
  u = POLY8(t, t2, t4, 0.00282363896258175373077393f, -0.0159569028764963150024414f, 0.0425049886107444763183594f, -0.0748900920152664184570312f, 0.106347933411598205566406f, -0.142027363181114196777344f, 0.199926957488059997558594f, -0.333331018686294555664062f);

  t = vmla_vf_vf_vf_vf(s, vmul_vf_vf_vf(t, u), s);
  /* q bit 0: the reciprocal path was taken, so atan = pi/2 - atan(1/|d|). */
  t = vsel_vf_vo_vf_vf(veq_vo_vi2_vi2(vand_vi2_vi2_vi2(q, vcast_vi2_i(1)), vcast_vi2_i(1)), vsub_vf_vf_vf(vcast_vf_f((float)(M_PI/2)), t), t);
  /* q bit 1: d was negative, flip the sign of the result. */
  t = vreinterpret_vf_vm(vxor_vm_vm_vm(vand_vm_vo32_vm(veq_vo_vi2_vi2(vand_vi2_vi2_vi2(q, vcast_vi2_i(2)), vcast_vi2_i(2)), vreinterpret_vm_vf(vcast_vf_f(-0.0f))), vreinterpret_vm_vf(t)));

#if defined(ENABLE_NEON32) || defined(ENABLE_NEON32VFPV4)
  t = vsel_vf_vo_vf_vf(visinf_vo_vf(d), vmulsign_vf_vf_vf(vcast_vf_f(1.5874010519681994747517056f), d), t);
#endif

  return t;
}
#endif // #if !defined(DETERMINISTIC)

/* atan2 helper kernel: returns atan(y/x) plus q*(pi/2), where q encodes the
   quadrant derived from the signs/magnitudes of x and y. */
static INLINE CONST VECTOR_CC vfloat atan2kf(vfloat y, vfloat x) {
  vfloat s, t, u;
  vint2 q;
  vopmask p;

  q = vsel_vi2_vf_vi2(x, vcast_vi2_i(-2));
  x = vabs_vf_vf(x);

  q = vsel_vi2_vf_vf_vi2_vi2(x, y, vadd_vi2_vi2_vi2(q, vcast_vi2_i(1)), q);
  p = vlt_vo_vf_vf(x, y);
  s = vsel_vf_vo_vf_vf(p, vneg_vf_vf(x), y);
  t = vmax_vf_vf_vf(x, y);

  s = vdiv_vf_vf_vf(s, t);
  t = vmul_vf_vf_vf(s, s);

  vfloat t2 = vmul_vf_vf_vf(t, t), t4 = vmul_vf_vf_vf(t2, t2);
  u = POLY8(t, t2, t4, 0.00282363896258175373077393f, -0.0159569028764963150024414f, 0.0425049886107444763183594f, -0.0748900920152664184570312f, 0.106347933411598205566406f, -0.142027363181114196777344f, 0.199926957488059997558594f, -0.333331018686294555664062f);

  t = vmla_vf_vf_vf_vf(s, vmul_vf_vf_vf(t, u), s);
  t = vmla_vf_vf_vf_vf(vcast_vf_vi2(q), vcast_vf_f((float)(M_PI/2)), t);

  return t;
}

/* Where d is infinite, return m with d's sign bit OR-ed in; elsewhere 0. */
static INLINE CONST VECTOR_CC vfloat visinf2_vf_vf_vf(vfloat d, vfloat m) {
  return vreinterpret_vf_vm(vand_vm_vo32_vm(visinf_vo_vf(d), vor_vm_vm_vm(vsignbit_vm_vf(d), vreinterpret_vm_vf(m))));
}

#if !defined(DETERMINISTIC)
/* atan2(y, x), 3.5 ULP class, with the usual special cases for x or y being
   zero, infinite, or NaN handled by explicit selects after the kernel. */
EXPORT CONST VECTOR_CC vfloat xatan2f(vfloat y, vfloat x) {
  vfloat r = atan2kf(vabs_vf_vf(y), x);

  r = vmulsign_vf_vf_vf(r, x);
  r = vsel_vf_vo_vf_vf(vor_vo_vo_vo(visinf_vo_vf(x), veq_vo_vf_vf(x, vcast_vf_f(0.0f))), vsub_vf_vf_vf(vcast_vf_f((float)(M_PI/2)), visinf2_vf_vf_vf(x, vmulsign_vf_vf_vf(vcast_vf_f((float)(M_PI/2)), x))), r);
  r = vsel_vf_vo_vf_vf(visinf_vo_vf(y), vsub_vf_vf_vf(vcast_vf_f((float)(M_PI/2)), visinf2_vf_vf_vf(x, vmulsign_vf_vf_vf(vcast_vf_f((float)(M_PI/4)), x))), r);
  r = vsel_vf_vo_vf_vf(veq_vo_vf_vf(y, vcast_vf_f(0.0f)), vreinterpret_vf_vm(vand_vm_vo32_vm(vsignbit_vo_vf(x), vreinterpret_vm_vf(vcast_vf_f((float)M_PI)))), r);

  /* NaN in either input propagates (mask OR), and the result takes y's sign. */
  r = vreinterpret_vf_vm(vor_vm_vo32_vm(vor_vo_vo_vo(visnan_vo_vf(x), visnan_vo_vf(y)), vreinterpret_vm_vf(vmulsign_vf_vf_vf(r, y))));
  return r;
}

/* asin(d), 3.5 ULP class.  For |d| < 0.5 the polynomial is used directly;
   otherwise the identity asin(d) = pi/2 - 2*asin(sqrt((1-|d|)/2)) applies. */
EXPORT CONST VECTOR_CC vfloat xasinf(vfloat d) {
  vopmask o = vlt_vo_vf_vf(vabs_vf_vf(d), vcast_vf_f(0.5f));
  vfloat x2 = vsel_vf_vo_vf_vf(o, vmul_vf_vf_vf(d, d), vmul_vf_vf_vf(vsub_vf_vf_vf(vcast_vf_f(1), vabs_vf_vf(d)), vcast_vf_f(0.5f)));
  vfloat x = vsel_vf_vo_vf_vf(o, vabs_vf_vf(d), vsqrt_vf_vf(x2)), u;

  u = vcast_vf_f(+0.4197454825e-1);
  u = vmla_vf_vf_vf_vf(u, x2, vcast_vf_f(+0.2424046025e-1));
  u = vmla_vf_vf_vf_vf(u, x2, vcast_vf_f(+0.4547423869e-1));
  u = vmla_vf_vf_vf_vf(u, x2, vcast_vf_f(+0.7495029271e-1));
  u = vmla_vf_vf_vf_vf(u, x2, vcast_vf_f(+0.1666677296e+0));
  u = vmla_vf_vf_vf_vf(u, vmul_vf_vf_vf(x, x2), x);

  vfloat r = vsel_vf_vo_vf_vf(o, u, vmla_vf_vf_vf_vf(u, vcast_vf_f(-2), vcast_vf_f(M_PIf/2)));
  return vmulsign_vf_vf_vf(r, d);
}

/* acos(d), 3.5 ULP class; same |d| < 0.5 split as xasinf, with a final
   df-corrected pi adjustment for d < -0.5. */
EXPORT CONST VECTOR_CC vfloat xacosf(vfloat d) {
  vopmask o = vlt_vo_vf_vf(vabs_vf_vf(d), vcast_vf_f(0.5f));
  vfloat x2 = vsel_vf_vo_vf_vf(o, vmul_vf_vf_vf(d, d), vmul_vf_vf_vf(vsub_vf_vf_vf(vcast_vf_f(1), vabs_vf_vf(d)), vcast_vf_f(0.5f))), u;
  vfloat x = vsel_vf_vo_vf_vf(o, vabs_vf_vf(d), vsqrt_vf_vf(x2));
  x = vsel_vf_vo_vf_vf(veq_vo_vf_vf(vabs_vf_vf(d), vcast_vf_f(1.0f)), vcast_vf_f(0), x);

  u = vcast_vf_f(+0.4197454825e-1);
  u = vmla_vf_vf_vf_vf(u, x2, vcast_vf_f(+0.2424046025e-1));
  u = vmla_vf_vf_vf_vf(u, x2, vcast_vf_f(+0.4547423869e-1));
  u = vmla_vf_vf_vf_vf(u, x2, vcast_vf_f(+0.7495029271e-1));
  u = vmla_vf_vf_vf_vf(u, x2, vcast_vf_f(+0.1666677296e+0));
  u = vmul_vf_vf_vf(u, vmul_vf_vf_vf(x2, x));

  vfloat y = vsub_vf_vf_vf(vcast_vf_f(3.1415926535897932f/2), vadd_vf_vf_vf(vmulsign_vf_vf_vf(x, d), vmulsign_vf_vf_vf(u, d)));
  x = vadd_vf_vf_vf(x, u);
  vfloat r = vsel_vf_vo_vf_vf(o, y, vmul_vf_vf_vf(x, vcast_vf_f(2)));
  return vsel_vf_vo_vf_vf(vandnot_vo_vo_vo(o, vlt_vo_vf_vf(d, vcast_vf_f(0))), vf2getx_vf_vf2(dfadd_vf2_vf2_vf(vcast_vf2_f_f(3.1415927410125732422f,-8.7422776573475857731e-08f), vneg_vf_vf(r))), r);
}
#endif // #if !defined(DETERMINISTIC)

//

/* High-accuracy atan2 kernel on double-float inputs; returns atan(y/x) plus
   q*(pi/2) as a df pair.  q is built from x's sign and the x<y comparison. */
static INLINE CONST VECTOR_CC vfloat2 atan2kf_u1(vfloat2 y, vfloat2 x) {
  vfloat u;
  vfloat2 s, t;
  vint2 q;
  vopmask p;
  vmask r;

  q = vsel_vi2_vf_vf_vi2_vi2(vf2getx_vf_vf2(x), vcast_vf_f(0), vcast_vi2_i(-2), vcast_vi2_i(0));
  p = vlt_vo_vf_vf(vf2getx_vf_vf2(x), vcast_vf_f(0));
  r = vand_vm_vo32_vm(p, vreinterpret_vm_vf(vcast_vf_f(-0.0)));
  /* Negate both halves of x where x < 0 (sign-bit XOR on the df pair). */
  x = vf2setx_vf2_vf2_vf(x, vreinterpret_vf_vm(vxor_vm_vm_vm(vreinterpret_vm_vf(vf2getx_vf_vf2(x)), r)));
  x = vf2sety_vf2_vf2_vf(x, vreinterpret_vf_vm(vxor_vm_vm_vm(vreinterpret_vm_vf(vf2gety_vf_vf2(x)), r)));

  q = vsel_vi2_vf_vf_vi2_vi2(vf2getx_vf_vf2(x), vf2getx_vf_vf2(y), vadd_vi2_vi2_vi2(q, vcast_vi2_i(1)), q);
  p = vlt_vo_vf_vf(vf2getx_vf_vf2(x), vf2getx_vf_vf2(y));
  s = vsel_vf2_vo_vf2_vf2(p, dfneg_vf2_vf2(x), y);
  t = vsel_vf2_vo_vf2_vf2(p, y, x);

  s = dfdiv_vf2_vf2_vf2(s, t);
  t = dfsqu_vf2_vf2(s);
  t = dfnormalize_vf2_vf2(t);

  u = vcast_vf_f(-0.00176397908944636583328247f);
  u = vmla_vf_vf_vf_vf(u, vf2getx_vf_vf2(t), vcast_vf_f(0.0107900900766253471374512f));
  u = vmla_vf_vf_vf_vf(u, vf2getx_vf_vf2(t), vcast_vf_f(-0.0309564601629972457885742f));
  u = vmla_vf_vf_vf_vf(u, vf2getx_vf_vf2(t), vcast_vf_f(0.0577365085482597351074219f));
  u = vmla_vf_vf_vf_vf(u, vf2getx_vf_vf2(t), vcast_vf_f(-0.0838950723409652709960938f));
  u = vmla_vf_vf_vf_vf(u, vf2getx_vf_vf2(t), vcast_vf_f(0.109463557600975036621094f));
  u = vmla_vf_vf_vf_vf(u, vf2getx_vf_vf2(t), vcast_vf_f(-0.142626821994781494140625f));
  u = vmla_vf_vf_vf_vf(u, vf2getx_vf_vf2(t), vcast_vf_f(0.199983194470405578613281f));

  t = dfmul_vf2_vf2_vf2(t, dfadd_vf2_vf_vf(vcast_vf_f(-0.333332866430282592773438f), vmul_vf_vf_vf(u, vf2getx_vf_vf2(t))));
  t = dfmul_vf2_vf2_vf2(s, dfadd_vf2_vf_vf2(vcast_vf_f(1), t));
  /* Add q quarter-turns using a double-float pi/2 constant. */
  t = dfadd_vf2_vf2_vf2(dfmul_vf2_vf2_vf(vcast_vf2_f_f(1.5707963705062866211f, -4.3711388286737928865e-08f), vcast_vf_vi2(q)), t);

  return t;
}

#if !defined(DETERMINISTIC)
/* atan2(y, x), ~1 ULP variant.  Tiny |x| is prescaled by 2^24 (same scale on
   both inputs, so the ratio is unchanged) to avoid df underflow in the kernel. */
EXPORT CONST VECTOR_CC vfloat xatan2f_u1(vfloat y, vfloat x) {
  vopmask o = vlt_vo_vf_vf(vabs_vf_vf(x), vcast_vf_f(2.9387372783541830947e-39f)); // nexttowardf((1.0 / FLT_MAX), 1)
  x = vsel_vf_vo_vf_vf(o, vmul_vf_vf_vf(x, vcast_vf_f(1 << 24)), x);
  y = vsel_vf_vo_vf_vf(o, vmul_vf_vf_vf(y, vcast_vf_f(1 << 24)), y);

  vfloat2 d = atan2kf_u1(vcast_vf2_vf_vf(vabs_vf_vf(y), vcast_vf_f(0)), vcast_vf2_vf_vf(x, vcast_vf_f(0)));
  vfloat r = vadd_vf_vf_vf(vf2getx_vf_vf2(d), vf2gety_vf_vf2(d));

  r = vmulsign_vf_vf_vf(r, x);
  r = vsel_vf_vo_vf_vf(vor_vo_vo_vo(visinf_vo_vf(x), veq_vo_vf_vf(x, vcast_vf_f(0))), vsub_vf_vf_vf(vcast_vf_f(M_PI/2), visinf2_vf_vf_vf(x, vmulsign_vf_vf_vf(vcast_vf_f(M_PI/2), x))), r);
  r = vsel_vf_vo_vf_vf(visinf_vo_vf(y), vsub_vf_vf_vf(vcast_vf_f(M_PI/2), visinf2_vf_vf_vf(x, vmulsign_vf_vf_vf(vcast_vf_f(M_PI/4), x))), r);
  r = vsel_vf_vo_vf_vf(veq_vo_vf_vf(y, vcast_vf_f(0.0f)), vreinterpret_vf_vm(vand_vm_vo32_vm(vsignbit_vo_vf(x), vreinterpret_vm_vf(vcast_vf_f((float)M_PI)))), r);

  r = vreinterpret_vf_vm(vor_vm_vo32_vm(vor_vo_vo_vo(visnan_vo_vf(x), visnan_vo_vf(y)), vreinterpret_vm_vf(vmulsign_vf_vf_vf(r, y))));
  return r;
}

/* asin(d), ~1 ULP variant; df sqrt path for |d| >= 0.5.  The statement below
   continues on the next source line. */
EXPORT CONST VECTOR_CC vfloat xasinf_u1(vfloat d) {
  vopmask o = vlt_vo_vf_vf(vabs_vf_vf(d), vcast_vf_f(0.5f));
  vfloat x2 = vsel_vf_vo_vf_vf(o, vmul_vf_vf_vf(d, d),
/* Continuation of xasinf_u1. */
  vmul_vf_vf_vf(vsub_vf_vf_vf(vcast_vf_f(1), vabs_vf_vf(d)), vcast_vf_f(0.5f))), u;
  vfloat2 x = vsel_vf2_vo_vf2_vf2(o, vcast_vf2_vf_vf(vabs_vf_vf(d), vcast_vf_f(0)), dfsqrt_vf2_vf(x2));
  x = vsel_vf2_vo_vf2_vf2(veq_vo_vf_vf(vabs_vf_vf(d), vcast_vf_f(1.0f)), vcast_vf2_f_f(0, 0), x);

  u = vcast_vf_f(+0.4197454825e-1);
  u = vmla_vf_vf_vf_vf(u, x2, vcast_vf_f(+0.2424046025e-1));
  u = vmla_vf_vf_vf_vf(u, x2, vcast_vf_f(+0.4547423869e-1));
  u = vmla_vf_vf_vf_vf(u, x2, vcast_vf_f(+0.7495029271e-1));
  u = vmla_vf_vf_vf_vf(u, x2, vcast_vf_f(+0.1666677296e+0));
  u = vmul_vf_vf_vf(u, vmul_vf_vf_vf(x2, vf2getx_vf_vf2(x)));

  /* pi/4 held as a double-float constant (hi, lo) for the reduction identity. */
  vfloat2 y = dfsub_vf2_vf2_vf(dfsub_vf2_vf2_vf2(vcast_vf2_f_f(3.1415927410125732422f/4,-8.7422776573475857731e-08f/4), x), u);

  vfloat r = vsel_vf_vo_vf_vf(o, vadd_vf_vf_vf(u, vf2getx_vf_vf2(x)), vmul_vf_vf_vf(vadd_vf_vf_vf(vf2getx_vf_vf2(y), vf2gety_vf_vf2(y)), vcast_vf_f(2)));
  return vmulsign_vf_vf_vf(r, d);
}

/* acos(d), ~1 ULP variant; mirrors xasinf_u1's split with a df pi/2 constant
   and a final df pi adjustment for negative d outside (-0.5, 0.5). */
EXPORT CONST VECTOR_CC vfloat xacosf_u1(vfloat d) {
  vopmask o = vlt_vo_vf_vf(vabs_vf_vf(d), vcast_vf_f(0.5f));
  vfloat x2 = vsel_vf_vo_vf_vf(o, vmul_vf_vf_vf(d, d), vmul_vf_vf_vf(vsub_vf_vf_vf(vcast_vf_f(1), vabs_vf_vf(d)), vcast_vf_f(0.5f))), u;
  vfloat2 x = vsel_vf2_vo_vf2_vf2(o, vcast_vf2_vf_vf(vabs_vf_vf(d), vcast_vf_f(0)), dfsqrt_vf2_vf(x2));
  x = vsel_vf2_vo_vf2_vf2(veq_vo_vf_vf(vabs_vf_vf(d), vcast_vf_f(1.0f)), vcast_vf2_f_f(0, 0), x);

  u = vcast_vf_f(+0.4197454825e-1);
  u = vmla_vf_vf_vf_vf(u, x2, vcast_vf_f(+0.2424046025e-1));
  u = vmla_vf_vf_vf_vf(u, x2, vcast_vf_f(+0.4547423869e-1));
  u = vmla_vf_vf_vf_vf(u, x2, vcast_vf_f(+0.7495029271e-1));
  u = vmla_vf_vf_vf_vf(u, x2, vcast_vf_f(+0.1666677296e+0));
  u = vmul_vf_vf_vf(u, vmul_vf_vf_vf(x2, vf2getx_vf_vf2(x)));

  vfloat2 y = dfsub_vf2_vf2_vf2(vcast_vf2_f_f(3.1415927410125732422f/2, -8.7422776573475857731e-08f/2), dfadd_vf2_vf_vf(vmulsign_vf_vf_vf(vf2getx_vf_vf2(x), d), vmulsign_vf_vf_vf(u, d)));
  x = dfadd_vf2_vf2_vf(x, u);

  y = vsel_vf2_vo_vf2_vf2(o, y, dfscale_vf2_vf2_vf(x, vcast_vf_f(2)));

  y = vsel_vf2_vo_vf2_vf2(vandnot_vo_vo_vo(o, vlt_vo_vf_vf(d, vcast_vf_f(0))), dfsub_vf2_vf2_vf2(vcast_vf2_f_f(3.1415927410125732422f, -8.7422776573475857731e-08f), y), y);

  return vadd_vf_vf_vf(vf2getx_vf_vf2(y), vf2gety_vf_vf2(y));
}

/* atan(d), ~1 ULP variant: call the df atan2 kernel with x = 1. */
EXPORT CONST VECTOR_CC vfloat xatanf_u1(vfloat d) {
  vfloat2 d2 = atan2kf_u1(vcast_vf2_vf_vf(vabs_vf_vf(d), vcast_vf_f(0)), vcast_vf2_f_f(1, 0));
  vfloat r = vadd_vf_vf_vf(vf2getx_vf_vf2(d2), vf2gety_vf_vf2(d2));
  r = vsel_vf_vo_vf_vf(visinf_vo_vf(d), vcast_vf_f(1.570796326794896557998982), r); /* atan(+-inf) = +-pi/2 */
  return vmulsign_vf_vf_vf(r, d);
}
#endif // #if !defined(DETERMINISTIC)

//

#if !defined(DETERMINISTIC)
/* log(d), 3.5 ULP class.  Non-AVX512 path: subnormals are prescaled by 2^64
   and the exponent is compensated; the mantissa m is normalized around 1 via
   ilogb/ldexp, and log is computed from x = (m-1)/(m+1) with an odd polynomial.
   AVX512 path uses vgetexp/vgetmant and vfixup for the special cases. */
EXPORT CONST VECTOR_CC vfloat xlogf(vfloat d) {
  vfloat x, x2, t, m;

#if !defined(ENABLE_AVX512F) && !defined(ENABLE_AVX512FNOFMA)
  vopmask o = vlt_vo_vf_vf(d, vcast_vf_f(FLT_MIN));
  d = vsel_vf_vo_vf_vf(o, vmul_vf_vf_vf(d, vcast_vf_f((float)(INT64_C(1) << 32) * (float)(INT64_C(1) << 32))), d);
  vint2 e = vilogb2k_vi2_vf(vmul_vf_vf_vf(d, vcast_vf_f(1.0f/0.75f)));
  m = vldexp3_vf_vf_vi2(d, vneg_vi2_vi2(e));
  e = vsel_vi2_vo_vi2_vi2(o, vsub_vi2_vi2_vi2(e, vcast_vi2_i(64)), e);
#else
  vfloat e = vgetexp_vf_vf(vmul_vf_vf_vf(d, vcast_vf_f(1.0f/0.75f)));
  e = vsel_vf_vo_vf_vf(vispinf_vo_vf(e), vcast_vf_f(128.0f), e);
  m = vgetmant_vf_vf(d);
#endif

  x = vdiv_vf_vf_vf(vsub_vf_vf_vf(m, vcast_vf_f(1.0f)), vadd_vf_vf_vf(vcast_vf_f(1.0f), m));
  x2 = vmul_vf_vf_vf(x, x);

  t = vcast_vf_f(0.2392828464508056640625f);
  t = vmla_vf_vf_vf_vf(t, x2, vcast_vf_f(0.28518211841583251953125f));
  t = vmla_vf_vf_vf_vf(t, x2, vcast_vf_f(0.400005877017974853515625f));
  t = vmla_vf_vf_vf_vf(t, x2, vcast_vf_f(0.666666686534881591796875f));
  t = vmla_vf_vf_vf_vf(t, x2, vcast_vf_f(2.0f));

#if !defined(ENABLE_AVX512F) && !defined(ENABLE_AVX512FNOFMA)
  x = vmla_vf_vf_vf_vf(x, t, vmul_vf_vf_vf(vcast_vf_f(0.693147180559945286226764f), vcast_vf_vi2(e)));
  /* Special cases: log(+inf)=+inf, log(neg/NaN)=NaN, log(0)=-inf. */
  x = vsel_vf_vo_vf_vf(vispinf_vo_vf(d), vcast_vf_f(SLEEF_INFINITYf), x);
  x = vsel_vf_vo_vf_vf(vor_vo_vo_vo(vlt_vo_vf_vf(d, vcast_vf_f(0)), visnan_vo_vf(d)), vcast_vf_f(SLEEF_NANf), x);
  x = vsel_vf_vo_vf_vf(veq_vo_vf_vf(d, vcast_vf_f(0)), vcast_vf_f(-SLEEF_INFINITYf), x);
#else
  x = vmla_vf_vf_vf_vf(x, t, vmul_vf_vf_vf(vcast_vf_f(0.693147180559945286226764f), e));
  x = vfixup_vf_vf_vf_vi2_i(x, d, vcast_vi2_i((5 << (5*4))), 0);
#endif

  return x;
}
#endif // #if !defined(DETERMINISTIC)

#if !defined(DETERMINISTIC)
/* exp(d), with reduction d = q*ln2 + s (split constant L2Uf/L2Lf), a degree-6
   polynomial on s, and reconstruction by scaling with 2^q.  Results clamp to 0
   below -104 and to +inf above 100 (beyond float range). */
EXPORT CONST VECTOR_CC vfloat xexpf(vfloat d) {
  vint2 q = vrint_vi2_vf(vmul_vf_vf_vf(d, vcast_vf_f(R_LN2f)));
  vfloat s, u;

  s = vmla_vf_vf_vf_vf(vcast_vf_vi2(q), vcast_vf_f(-L2Uf), d);
  s = vmla_vf_vf_vf_vf(vcast_vf_vi2(q), vcast_vf_f(-L2Lf), s);

  u = vcast_vf_f(0.000198527617612853646278381);
  u = vmla_vf_vf_vf_vf(u, s, vcast_vf_f(0.00139304355252534151077271));
  u = vmla_vf_vf_vf_vf(u, s, vcast_vf_f(0.00833336077630519866943359));
  u = vmla_vf_vf_vf_vf(u, s, vcast_vf_f(0.0416664853692054748535156));
  u = vmla_vf_vf_vf_vf(u, s, vcast_vf_f(0.166666671633720397949219));
  u = vmla_vf_vf_vf_vf(u, s, vcast_vf_f(0.5));

  u = vadd_vf_vf_vf(vcast_vf_f(1.0f), vmla_vf_vf_vf_vf(vmul_vf_vf_vf(s, s), u, s));
  u = vldexp2_vf_vf_vi2(u, q);

  u = vreinterpret_vf_vm(vandnot_vm_vo32_vm(vlt_vo_vf_vf(d, vcast_vf_f(-104)), vreinterpret_vm_vf(u)));
  u = vsel_vf_vo_vf_vf(vlt_vo_vf_vf(vcast_vf_f(100), d), vcast_vf_f(SLEEF_INFINITYf), u);

  return u;
}
#endif // #if !defined(DETERMINISTIC)

/* expm1 kernel: like xexpf but returns exp(d)-1; for q == 0 the polynomial is
   returned directly to keep precision near zero. */
static INLINE CONST VECTOR_CC vfloat expm1fk(vfloat d) {
  vint2 q = vrint_vi2_vf(vmul_vf_vf_vf(d, vcast_vf_f(R_LN2f)));
  vfloat s, u;

  s = vmla_vf_vf_vf_vf(vcast_vf_vi2(q), vcast_vf_f(-L2Uf), d);
  s = vmla_vf_vf_vf_vf(vcast_vf_vi2(q), vcast_vf_f(-L2Lf), s);

  vfloat s2 = vmul_vf_vf_vf(s, s), s4 = vmul_vf_vf_vf(s2, s2);
  u = POLY6(s, s2, s4, 0.000198527617612853646278381, 0.00139304355252534151077271, 0.00833336077630519866943359, 0.0416664853692054748535156, 0.166666671633720397949219, 0.5);

  u = vmla_vf_vf_vf_vf(vmul_vf_vf_vf(s, s), u, s);

  u = vsel_vf_vo_vf_vf(veq_vo_vi2_vi2(q, vcast_vi2_i(0)), u, vsub_vf_vf_vf(vldexp2_vf_vf_vi2(vadd_vf_vf_vf(u, vcast_vf_f(1)), q), vcast_vf_f(1)));

  return u;
}

#if defined(ENABLE_NEON32) || defined(ENABLE_NEON32VFPV4)
/* sqrt(d), 3.5 ULP class, NEON32: no hardware sqrt, so use the reciprocal
   square-root estimate plus one Newton step, with manual exponent handling. */
EXPORT CONST VECTOR_CC vfloat xsqrtf_u35(vfloat d) {
  vfloat e = vreinterpret_vf_vi2(vadd_vi2_vi2_vi2(vcast_vi2_i(0x20000000), vand_vi2_vi2_vi2(vcast_vi2_i(0x7f000000), vsrl_vi2_vi2_i(vreinterpret_vi2_vf(d), 1))));
  vfloat m = vreinterpret_vf_vi2(vadd_vi2_vi2_vi2(vcast_vi2_i(0x3f000000), vand_vi2_vi2_vi2(vcast_vi2_i(0x01ffffff), vreinterpret_vi2_vf(d))));
  float32x4_t x = vrsqrteq_f32(m);
  x = vmulq_f32(x, vrsqrtsq_f32(m, vmulq_f32(x, x)));
  float32x4_t u = vmulq_f32(x, m);
  u = vmlaq_f32(u, vmlsq_f32(m, u, u), vmulq_f32(x, vdupq_n_f32(0.5)));
  e = vreinterpret_vf_vm(vandnot_vm_vo32_vm(veq_vo_vf_vf(d, vcast_vf_f(0)), vreinterpret_vm_vf(e)));
  u = vmul_vf_vf_vf(e, u);

  u = vsel_vf_vo_vf_vf(visinf_vo_vf(d), vcast_vf_f(SLEEF_INFINITYf), u);
  u = vreinterpret_vf_vm(vor_vm_vo32_vm(vor_vo_vo_vo(visnan_vo_vf(d), vlt_vo_vf_vf(d, vcast_vf_f(0))), vreinterpret_vm_vf(u)));
  u = vmulsign_vf_vf_vf(u, d);

  return u;
}
#elif defined(ENABLE_VECEXT)
/* Vector-extension build: native sqrt plus explicit -0 and +inf handling. */
EXPORT CONST VECTOR_CC vfloat xsqrtf_u35(vfloat d) {
  vfloat q = vsqrt_vf_vf(d);
  q = vsel_vf_vo_vf_vf(visnegzero_vo_vf(d), vcast_vf_f(-0.0), q);
  return vsel_vf_vo_vf_vf(vispinf_vo_vf(d), vcast_vf_f(SLEEF_INFINITYf), q);
}
#else
/* Default: the target's vector sqrt already meets the accuracy bound. */
EXPORT CONST VECTOR_CC vfloat xsqrtf_u35(vfloat d) { return vsqrt_vf_vf(d); }
#endif

#if !defined(DETERMINISTIC)
/* cbrt(d), 3.5 ULP class.  Exponent is split off (biased by 6144 so the
   division by 3 truncates consistently); re = e mod 3 picks a 2^(1/3) or
   4^(1/3) factor; a polynomial estimate of d^(-2/3) is refined and recombined.
   The trailing "q =" continues on the next source line. */
EXPORT CONST VECTOR_CC vfloat xcbrtf(vfloat d) {
  vfloat x, y, q = vcast_vf_f(1.0), t;
  vint2 e, qu, re;

#if defined(ENABLE_AVX512F) || defined(ENABLE_AVX512FNOFMA)
  vfloat s = d;
#endif
  e = vadd_vi2_vi2_vi2(vilogbk_vi2_vf(vabs_vf_vf(d)), vcast_vi2_i(1));
  d = vldexp2_vf_vf_vi2(d, vneg_vi2_vi2(e));

  t = vadd_vf_vf_vf(vcast_vf_vi2(e), vcast_vf_f(6144));
  qu = vtruncate_vi2_vf(vmul_vf_vf_vf(t, vcast_vf_f(1.0f/3.0f)));
  re = vtruncate_vi2_vf(vsub_vf_vf_vf(t, vmul_vf_vf_vf(vcast_vf_vi2(qu), vcast_vf_f(3))));

  q = vsel_vf_vo_vf_vf(veq_vo_vi2_vi2(re, vcast_vi2_i(1)), vcast_vf_f(1.2599210498948731647672106f), q);
  q =
/* Continuation of xcbrtf. */
  vsel_vf_vo_vf_vf(veq_vo_vi2_vi2(re, vcast_vi2_i(2)), vcast_vf_f(1.5874010519681994747517056f), q);
  q = vldexp2_vf_vf_vi2(q, vsub_vi2_vi2_vi2(qu, vcast_vi2_i(2048)));

  q = vmulsign_vf_vf_vf(q, d);
  d = vabs_vf_vf(d);

  /* Polynomial initial guess for the cube root, then one refinement step. */
  x = vcast_vf_f(-0.601564466953277587890625f);
  x = vmla_vf_vf_vf_vf(x, d, vcast_vf_f(2.8208892345428466796875f));
  x = vmla_vf_vf_vf_vf(x, d, vcast_vf_f(-5.532182216644287109375f));
  x = vmla_vf_vf_vf_vf(x, d, vcast_vf_f(5.898262500762939453125f));
  x = vmla_vf_vf_vf_vf(x, d, vcast_vf_f(-3.8095417022705078125f));
  x = vmla_vf_vf_vf_vf(x, d, vcast_vf_f(2.2241256237030029296875f));

  y = vmul_vf_vf_vf(vmul_vf_vf_vf(d, x), x);
  y = vmul_vf_vf_vf(vsub_vf_vf_vf(y, vmul_vf_vf_vf(vmul_vf_vf_vf(vcast_vf_f(2.0f / 3.0f), y), vmla_vf_vf_vf_vf(y, x, vcast_vf_f(-1.0f)))), q);

#if defined(ENABLE_AVX512F) || defined(ENABLE_AVX512FNOFMA)
  y = vsel_vf_vo_vf_vf(visinf_vo_vf(s), vmulsign_vf_vf_vf(vcast_vf_f(SLEEF_INFINITYf), s), y);
  y = vsel_vf_vo_vf_vf(veq_vo_vf_vf(s, vcast_vf_f(0)), vmulsign_vf_vf_vf(vcast_vf_f(0), s), y);
#endif

  return y;
}
#endif // #if !defined(DETERMINISTIC)

#if !defined(DETERMINISTIC)
/* cbrt(d), ~1 ULP variant: same exponent splitting as xcbrtf, but the
   2^(1/3)/4^(1/3) factor and the refinement are carried in double-float. */
EXPORT CONST VECTOR_CC vfloat xcbrtf_u1(vfloat d) {
  vfloat x, y, z, t;
  vfloat2 q2 = vcast_vf2_f_f(1, 0), u, v;
  vint2 e, qu, re;

#if defined(ENABLE_AVX512F) || defined(ENABLE_AVX512FNOFMA)
  vfloat s = d;
#endif
  e = vadd_vi2_vi2_vi2(vilogbk_vi2_vf(vabs_vf_vf(d)), vcast_vi2_i(1));
  d = vldexp2_vf_vf_vi2(d, vneg_vi2_vi2(e));

  t = vadd_vf_vf_vf(vcast_vf_vi2(e), vcast_vf_f(6144));
  qu = vtruncate_vi2_vf(vmul_vf_vf_vf(t, vcast_vf_f(1.0/3.0)));
  re = vtruncate_vi2_vf(vsub_vf_vf_vf(t, vmul_vf_vf_vf(vcast_vf_vi2(qu), vcast_vf_f(3))));

  q2 = vsel_vf2_vo_vf2_vf2(veq_vo_vi2_vi2(re, vcast_vi2_i(1)), vcast_vf2_f_f(1.2599210739135742188f, -2.4018701694217270415e-08), q2);
  q2 = vsel_vf2_vo_vf2_vf2(veq_vo_vi2_vi2(re, vcast_vi2_i(2)), vcast_vf2_f_f(1.5874010324478149414f, 1.9520385308169352356e-08), q2);

  q2 = vf2setx_vf2_vf2_vf(q2, vmulsign_vf_vf_vf(vf2getx_vf_vf2(q2), d));
  q2 = vf2sety_vf2_vf2_vf(q2, vmulsign_vf_vf_vf(vf2gety_vf_vf2(q2), d));
  d = vabs_vf_vf(d);

  x = vcast_vf_f(-0.601564466953277587890625f);
  x = vmla_vf_vf_vf_vf(x, d, vcast_vf_f(2.8208892345428466796875f));
  x = vmla_vf_vf_vf_vf(x, d, vcast_vf_f(-5.532182216644287109375f));
  x = vmla_vf_vf_vf_vf(x, d, vcast_vf_f(5.898262500762939453125f));
  x = vmla_vf_vf_vf_vf(x, d, vcast_vf_f(-3.8095417022705078125f));
  x = vmla_vf_vf_vf_vf(x, d, vcast_vf_f(2.2241256237030029296875f));

  y = vmul_vf_vf_vf(x, x);
  y = vmul_vf_vf_vf(y, y);
  x = vsub_vf_vf_vf(x, vmul_vf_vf_vf(vmlanp_vf_vf_vf_vf(d, y, x), vcast_vf_f(-1.0 / 3.0)));

  z = x;

  /* Double-float refinement of the estimate. */
  u = dfmul_vf2_vf_vf(x, x);
  u = dfmul_vf2_vf2_vf2(u, u);
  u = dfmul_vf2_vf2_vf(u, d);
  u = dfadd2_vf2_vf2_vf(u, vneg_vf_vf(x));
  y = vadd_vf_vf_vf(vf2getx_vf_vf2(u), vf2gety_vf_vf2(u));

  y = vmul_vf_vf_vf(vmul_vf_vf_vf(vcast_vf_f(-2.0 / 3.0), y), z);
  v = dfadd2_vf2_vf2_vf(dfmul_vf2_vf_vf(z, z), y);
  v = dfmul_vf2_vf2_vf(v, d);
  v = dfmul_vf2_vf2_vf2(v, q2);
  z = vldexp2_vf_vf_vi2(vadd_vf_vf_vf(vf2getx_vf_vf2(v), vf2gety_vf_vf2(v)), vsub_vi2_vi2_vi2(qu, vcast_vi2_i(2048)));

  z = vsel_vf_vo_vf_vf(visinf_vo_vf(d), vmulsign_vf_vf_vf(vcast_vf_f(SLEEF_INFINITYf), vf2getx_vf_vf2(q2)), z);
  z = vsel_vf_vo_vf_vf(veq_vo_vf_vf(d, vcast_vf_f(0)), vreinterpret_vf_vm(vsignbit_vm_vf(vf2getx_vf_vf2(q2))), z);

#if defined(ENABLE_AVX512F) || defined(ENABLE_AVX512FNOFMA)
  z = vsel_vf_vo_vf_vf(visinf_vo_vf(s), vmulsign_vf_vf_vf(vcast_vf_f(SLEEF_INFINITYf), s), z);
  z = vsel_vf_vo_vf_vf(veq_vo_vf_vf(s, vcast_vf_f(0)), vmulsign_vf_vf_vf(vcast_vf_f(0), s), z);
#endif

  return z;
}
#endif // #if !defined(DETERMINISTIC)

/* log kernel returning a double-float result; same mantissa/exponent split as
   xlogf but every accumulation is done in df arithmetic. */
static INLINE CONST VECTOR_CC vfloat2 logkf(vfloat d) {
  vfloat2 x, x2;
  vfloat t, m;

#if !defined(ENABLE_AVX512F) && !defined(ENABLE_AVX512FNOFMA)
  vopmask o = vlt_vo_vf_vf(d, vcast_vf_f(FLT_MIN));
  d = vsel_vf_vo_vf_vf(o, vmul_vf_vf_vf(d, vcast_vf_f((float)(INT64_C(1) << 32) * (float)(INT64_C(1) << 32))), d);
  vint2 e = vilogb2k_vi2_vf(vmul_vf_vf_vf(d, vcast_vf_f(1.0f/0.75f)));
  m = vldexp3_vf_vf_vi2(d, vneg_vi2_vi2(e));
  e = vsel_vi2_vo_vi2_vi2(o, vsub_vi2_vi2_vi2(e, vcast_vi2_i(64)), e);
#else
  vfloat e = vgetexp_vf_vf(vmul_vf_vf_vf(d, vcast_vf_f(1.0f/0.75f)));
  e = vsel_vf_vo_vf_vf(vispinf_vo_vf(e), vcast_vf_f(128.0f), e);
  m = vgetmant_vf_vf(d);
#endif

  x = dfdiv_vf2_vf2_vf2(dfadd2_vf2_vf_vf(vcast_vf_f(-1), m), dfadd2_vf2_vf_vf(vcast_vf_f(1), m));
  x2 = dfsqu_vf2_vf2(x);

  t = vcast_vf_f(0.240320354700088500976562);
  t = vmla_vf_vf_vf_vf(t, vf2getx_vf_vf2(x2), vcast_vf_f(0.285112679004669189453125));
  t = vmla_vf_vf_vf_vf(t, vf2getx_vf_vf2(x2), vcast_vf_f(0.400007992982864379882812));
  vfloat2 c = vcast_vf2_f_f(0.66666662693023681640625f, 3.69183861259614332084311e-09f);

#if !defined(ENABLE_AVX512F) && !defined(ENABLE_AVX512FNOFMA)
  vfloat2 s = dfmul_vf2_vf2_vf(vcast_vf2_f_f(0.69314718246459960938f, -1.904654323148236017e-09f), vcast_vf_vi2(e));
#else
  vfloat2 s = dfmul_vf2_vf2_vf(vcast_vf2_f_f(0.69314718246459960938f, -1.904654323148236017e-09f), e);
#endif

  s = dfadd_vf2_vf2_vf2(s, dfscale_vf2_vf2_vf(x, vcast_vf_f(2)));
  s = dfadd_vf2_vf2_vf2(s, dfmul_vf2_vf2_vf2(dfmul_vf2_vf2_vf2(x2, x), dfadd2_vf2_vf2_vf2(dfmul_vf2_vf2_vf(x2, t), c)));
  return s;
}

/* Plain single-precision log kernel without special-case handling; used where
   the caller deals with range/validity itself. */
static INLINE CONST VECTOR_CC vfloat logk3f(vfloat d) {
  vfloat x, x2, t, m;

#if !defined(ENABLE_AVX512F) && !defined(ENABLE_AVX512FNOFMA)
  vopmask o = vlt_vo_vf_vf(d, vcast_vf_f(FLT_MIN));
  d = vsel_vf_vo_vf_vf(o, vmul_vf_vf_vf(d, vcast_vf_f((float)(INT64_C(1) << 32) * (float)(INT64_C(1) << 32))), d);
  vint2 e = vilogb2k_vi2_vf(vmul_vf_vf_vf(d, vcast_vf_f(1.0f/0.75f)));
  m = vldexp3_vf_vf_vi2(d, vneg_vi2_vi2(e));
  e = vsel_vi2_vo_vi2_vi2(o, vsub_vi2_vi2_vi2(e, vcast_vi2_i(64)), e);
#else
  vfloat e = vgetexp_vf_vf(vmul_vf_vf_vf(d, vcast_vf_f(1.0f/0.75f)));
  e = vsel_vf_vo_vf_vf(vispinf_vo_vf(e), vcast_vf_f(128.0f), e);
  m = vgetmant_vf_vf(d);
#endif

  x = vdiv_vf_vf_vf(vsub_vf_vf_vf(m, vcast_vf_f(1.0f)), vadd_vf_vf_vf(vcast_vf_f(1.0f), m));
  x2 = vmul_vf_vf_vf(x, x);

  t = vcast_vf_f(0.2392828464508056640625f);
  t = vmla_vf_vf_vf_vf(t, x2, vcast_vf_f(0.28518211841583251953125f));
  t = vmla_vf_vf_vf_vf(t, x2, vcast_vf_f(0.400005877017974853515625f));
  t = vmla_vf_vf_vf_vf(t, x2, vcast_vf_f(0.666666686534881591796875f));
  t = vmla_vf_vf_vf_vf(t, x2, vcast_vf_f(2.0f));

#if !defined(ENABLE_AVX512F) && !defined(ENABLE_AVX512FNOFMA)
  x = vmla_vf_vf_vf_vf(x, t, vmul_vf_vf_vf(vcast_vf_f(0.693147180559945286226764f), vcast_vf_vi2(e)));
#else
  x = vmla_vf_vf_vf_vf(x, t, vmul_vf_vf_vf(vcast_vf_f(0.693147180559945286226764f), e));
#endif

  return x;
}

#if !defined(DETERMINISTIC)
/* log(d), ~1 ULP variant: df accumulation as in logkf, shorter polynomial,
   and either explicit selects or vfixup (AVX512) for the special cases. */
EXPORT CONST VECTOR_CC vfloat xlogf_u1(vfloat d) {
  vfloat2 x;
  vfloat t, m, x2;

#if !defined(ENABLE_AVX512F) && !defined(ENABLE_AVX512FNOFMA)
  vopmask o = vlt_vo_vf_vf(d, vcast_vf_f(FLT_MIN));
  d = vsel_vf_vo_vf_vf(o, vmul_vf_vf_vf(d, vcast_vf_f((float)(INT64_C(1) << 32) * (float)(INT64_C(1) << 32))), d);
  vint2 e = vilogb2k_vi2_vf(vmul_vf_vf_vf(d, vcast_vf_f(1.0f/0.75f)));
  m = vldexp3_vf_vf_vi2(d, vneg_vi2_vi2(e));
  e = vsel_vi2_vo_vi2_vi2(o, vsub_vi2_vi2_vi2(e, vcast_vi2_i(64)), e);
  vfloat2 s = dfmul_vf2_vf2_vf(vcast_vf2_f_f(0.69314718246459960938f, -1.904654323148236017e-09f), vcast_vf_vi2(e));
#else
  vfloat e = vgetexp_vf_vf(vmul_vf_vf_vf(d, vcast_vf_f(1.0f/0.75f)));
  e = vsel_vf_vo_vf_vf(vispinf_vo_vf(e), vcast_vf_f(128.0f), e);
  m = vgetmant_vf_vf(d);
  vfloat2 s = dfmul_vf2_vf2_vf(vcast_vf2_f_f(0.69314718246459960938f, -1.904654323148236017e-09f), e);
#endif

  x = dfdiv_vf2_vf2_vf2(dfadd2_vf2_vf_vf(vcast_vf_f(-1), m), dfadd2_vf2_vf_vf(vcast_vf_f(1), m));
  x2 = vmul_vf_vf_vf(vf2getx_vf_vf2(x), vf2getx_vf_vf2(x));

  t = vcast_vf_f(+0.3027294874e+0f);
  t = vmla_vf_vf_vf_vf(t, x2, vcast_vf_f(+0.3996108174e+0f));
  t = vmla_vf_vf_vf_vf(t, x2, vcast_vf_f(+0.6666694880e+0f));

  s = dfadd_vf2_vf2_vf2(s, dfscale_vf2_vf2_vf(x, vcast_vf_f(2)));
  s = dfadd_vf2_vf2_vf(s, vmul_vf_vf_vf(vmul_vf_vf_vf(x2, vf2getx_vf_vf2(x)), t));

  vfloat r = vadd_vf_vf_vf(vf2getx_vf_vf2(s), vf2gety_vf_vf2(s));

#if !defined(ENABLE_AVX512F) && !defined(ENABLE_AVX512FNOFMA)
  r = vsel_vf_vo_vf_vf(vispinf_vo_vf(d), vcast_vf_f(SLEEF_INFINITYf), r);
  r = vsel_vf_vo_vf_vf(vor_vo_vo_vo(vlt_vo_vf_vf(d, vcast_vf_f(0)), visnan_vo_vf(d)), vcast_vf_f(SLEEF_NANf), r);
  r = vsel_vf_vo_vf_vf(veq_vo_vf_vf(d, vcast_vf_f(0)), vcast_vf_f(-SLEEF_INFINITYf), r);
#else
  r = vfixup_vf_vf_vf_vi2_i(r, d, vcast_vi2_i((4 << (2*4)) | (3 << (4*4)) | (5 << (5*4)) | (2 << (6*4))), 0);
#endif

  return r;
}
#endif // #if !defined(DETERMINISTIC)

/* exp kernel on a double-float argument; the result underflows to 0 (via mask
   clear) when the hi part is below -104. */
static INLINE CONST VECTOR_CC vfloat expkf(vfloat2 d) {
  vfloat u = vmul_vf_vf_vf(vadd_vf_vf_vf(vf2getx_vf_vf2(d), vf2gety_vf_vf2(d)), vcast_vf_f(R_LN2f));
  vint2 q = vrint_vi2_vf(u);
  vfloat2 s, t;

  s = dfadd2_vf2_vf2_vf(d, vmul_vf_vf_vf(vcast_vf_vi2(q), vcast_vf_f(-L2Uf)));
  s = dfadd2_vf2_vf2_vf(s, vmul_vf_vf_vf(vcast_vf_vi2(q), vcast_vf_f(-L2Lf)));

  s = dfnormalize_vf2_vf2(s);

  u = vcast_vf_f(0.00136324646882712841033936f);
  u = vmla_vf_vf_vf_vf(u, vf2getx_vf_vf2(s), vcast_vf_f(0.00836596917361021041870117f));
  u = vmla_vf_vf_vf_vf(u, vf2getx_vf_vf2(s), vcast_vf_f(0.0416710823774337768554688f));
  u = vmla_vf_vf_vf_vf(u, vf2getx_vf_vf2(s), vcast_vf_f(0.166665524244308471679688f));
  u = vmla_vf_vf_vf_vf(u, vf2getx_vf_vf2(s), vcast_vf_f(0.499999850988388061523438f));

  t = dfadd_vf2_vf2_vf2(s, dfmul_vf2_vf2_vf(dfsqu_vf2_vf2(s), u));

  t = dfadd_vf2_vf_vf2(vcast_vf_f(1), t);
  u = vadd_vf_vf_vf(vf2getx_vf_vf2(t), vf2gety_vf_vf2(t));
  u = vldexp_vf_vf_vi2(u, q);

  u = vreinterpret_vf_vm(vandnot_vm_vo32_vm(vlt_vo_vf_vf(vf2getx_vf_vf2(d), vcast_vf_f(-104)), vreinterpret_vm_vf(u)));

  return u;
}

/* Plain exp kernel without the over/underflow clamps of xexpf; the body is
   cut off at the end of this chunk and continues below it. */
static INLINE CONST VECTOR_CC vfloat expk3f(vfloat d) {
  vint2 q = vrint_vi2_vf(vmul_vf_vf_vf(d, vcast_vf_f(R_LN2f)));
  vfloat s, u;

  s = vmla_vf_vf_vf_vf(vcast_vf_vi2(q), vcast_vf_f(-L2Uf), d);
  s = vmla_vf_vf_vf_vf(vcast_vf_vi2(q), vcast_vf_f(-L2Lf), s);

  u = vcast_vf_f(0.000198527617612853646278381);
  u = vmla_vf_vf_vf_vf(u, s, vcast_vf_f(0.00139304355252534151077271));
  u = vmla_vf_vf_vf_vf(u, s, vcast_vf_f(0.00833336077630519866943359));
  u = vmla_vf_vf_vf_vf(u, s,
vcast_vf_f(0.0416664853692054748535156));
  u = vmla_vf_vf_vf_vf(u, s, vcast_vf_f(0.166666671633720397949219));
  u = vmla_vf_vf_vf_vf(u, s, vcast_vf_f(0.5));
  u = vmla_vf_vf_vf_vf(vmul_vf_vf_vf(s, s), u, vadd_vf_vf_vf(s, vcast_vf_f(1.0f)));
  u = vldexp2_vf_vf_vi2(u, q);
  // Flush to zero for arguments below -104 (underflow).
  u = vreinterpret_vf_vm(vandnot_vm_vo32_vm(vlt_vo_vf_vf(d, vcast_vf_f(-104)), vreinterpret_vm_vf(u)));
  return u;
}

#if !defined(DETERMINISTIC)
// xpowf: vectorized powf, computed as expkf(logkf(|x|) * y) with extensive
// special-case patching: integer/odd y detection (for negative x sign rules),
// y == +-inf, x == 0 or +-inf, NaN propagation, and pow(x,0) == pow(1,y) == 1.
EXPORT CONST VECTOR_CC vfloat xpowf(vfloat x, vfloat y) {
#if 1
  // |y| >= 2^24 is always an even integer in float, hence treated as integer.
  vopmask yisint = vor_vo_vo_vo(veq_vo_vf_vf(vtruncate_vf_vf(y), y), vgt_vo_vf_vf(vabs_vf_vf(y), vcast_vf_f(1 << 24)));
  vopmask yisodd = vand_vo_vo_vo(vand_vo_vo_vo(veq_vo_vi2_vi2(vand_vi2_vi2_vi2(vtruncate_vi2_vf(y), vcast_vi2_i(1)), vcast_vi2_i(1)), yisint), vlt_vo_vf_vf(vabs_vf_vf(y), vcast_vf_f(1 << 24)));
#if defined(ENABLE_NEON32) || defined(ENABLE_NEON32VFPV4)
  // NEON32 float->int truncation of inf is not reliable; clear "odd" for infinite y.
  yisodd = vandnot_vm_vo32_vm(visinf_vo_vf(y), yisodd);
#endif
  vfloat result = expkf(dfmul_vf2_vf2_vf(logkf(vabs_vf_vf(x)), y));
  result = vsel_vf_vo_vf_vf(visnan_vo_vf(result), vcast_vf_f(SLEEF_INFINITYf), result);
  // Sign/NaN selection for x <= 0: odd integer y flips sign; non-integer y gives NaN.
  result = vmul_vf_vf_vf(result, vsel_vf_vo_vf_vf(vgt_vo_vf_vf(x, vcast_vf_f(0)), vcast_vf_f(1), vsel_vf_vo_vf_vf(yisint, vsel_vf_vo_vf_vf(yisodd, vcast_vf_f(-1.0f), vcast_vf_f(1)), vcast_vf_f(SLEEF_NANf))));
  // y == +-inf: result is 0, 1 or inf depending on whether |x| is <1, ==1 or >1.
  vfloat efx = vmulsign_vf_vf_vf(vsub_vf_vf_vf(vabs_vf_vf(x), vcast_vf_f(1)), y);
  result = vsel_vf_vo_vf_vf(visinf_vo_vf(y), vreinterpret_vf_vm(vandnot_vm_vo32_vm(vlt_vo_vf_vf(efx, vcast_vf_f(0.0f)), vreinterpret_vm_vf(vsel_vf_vo_vf_vf(veq_vo_vf_vf(efx, vcast_vf_f(0.0f)), vcast_vf_f(1.0f), vcast_vf_f(SLEEF_INFINITYf))))), result);
  // x == +-inf or x == 0: 0 or inf with the sign dictated by odd-integer y.
  result = vsel_vf_vo_vf_vf(vor_vo_vo_vo(visinf_vo_vf(x), veq_vo_vf_vf(x, vcast_vf_f(0))), vmul_vf_vf_vf(vsel_vf_vo_vf_vf(yisodd, vsign_vf_vf(x), vcast_vf_f(1)), vreinterpret_vf_vm(vandnot_vm_vo32_vm(vlt_vo_vf_vf(vsel_vf_vo_vf_vf(veq_vo_vf_vf(x, vcast_vf_f(0)), vneg_vf_vf(y), y), vcast_vf_f(0)), vreinterpret_vm_vf(vcast_vf_f(SLEEF_INFINITYf))))), result);
  result = vreinterpret_vf_vm(vor_vm_vo32_vm(vor_vo_vo_vo(visnan_vo_vf(x), visnan_vo_vf(y)), vreinterpret_vm_vf(result)));
  result = vsel_vf_vo_vf_vf(vor_vo_vo_vo(veq_vo_vf_vf(y, vcast_vf_f(0)), veq_vo_vf_vf(x, vcast_vf_f(1))), vcast_vf_f(1), result);
  return result;
#else
  return expkf(dfmul_vf2_vf2_vf(logkf(x), y));
#endif
}

// xfastpowf_u3500: fast, lower-accuracy powf built on the single-width
// kernels logk3f/expk3f (the u3500 suffix follows SLEEF's ulp naming).
// Handles only the cheap special cases: sign for odd-integer y, x==0, y==0.
EXPORT CONST VECTOR_CC vfloat xfastpowf_u3500(vfloat x, vfloat y) {
  vfloat result = expk3f(vmul_vf_vf_vf(logk3f(vabs_vf_vf(x)), y));
  vopmask yisint = vor_vo_vo_vo(veq_vo_vf_vf(vtruncate_vf_vf(y), y), vgt_vo_vf_vf(vabs_vf_vf(y), vcast_vf_f(1 << 24)));
  vopmask yisodd = vand_vo_vo_vo(vand_vo_vo_vo(veq_vo_vi2_vi2(vand_vi2_vi2_vi2(vtruncate_vi2_vf(y), vcast_vi2_i(1)), vcast_vi2_i(1)), yisint), vlt_vo_vf_vf(vabs_vf_vf(y), vcast_vf_f(1 << 24)));
  result = vsel_vf_vo_vf_vf(vand_vo_vo_vo(vsignbit_vo_vf(x), yisodd), vneg_vf_vf(result), result);
  result = vsel_vf_vo_vf_vf(veq_vo_vf_vf(x, vcast_vf_f(0)), vcast_vf_f(0), result);
  result = vsel_vf_vo_vf_vf(veq_vo_vf_vf(y, vcast_vf_f(0)), vcast_vf_f(1), result);
  return result;
}
#endif // #if !defined(DETERMINISTIC)

// expk2f: exp kernel taking and returning double-float, used by the
// hyperbolic functions and xexpm1f below. Same q*ln2 reduction as expkf, but
// both components of the result are scaled and underflow-masked.
static INLINE CONST VECTOR_CC vfloat2 expk2f(vfloat2 d) {
  vfloat u = vmul_vf_vf_vf(vadd_vf_vf_vf(vf2getx_vf_vf2(d), vf2gety_vf_vf2(d)), vcast_vf_f(R_LN2f));
  vint2 q = vrint_vi2_vf(u);
  vfloat2 s, t;
  s = dfadd2_vf2_vf2_vf(d, vmul_vf_vf_vf(vcast_vf_vi2(q), vcast_vf_f(-L2Uf)));
  s = dfadd2_vf2_vf2_vf(s, vmul_vf_vf_vf(vcast_vf_vi2(q), vcast_vf_f(-L2Lf)));
  u = vcast_vf_f(+0.1980960224e-3f);
  u = vmla_vf_vf_vf_vf(u, vf2getx_vf_vf2(s), vcast_vf_f(+0.1394256484e-2f));
  u = vmla_vf_vf_vf_vf(u, vf2getx_vf_vf2(s), vcast_vf_f(+0.8333456703e-2f));
  u = vmla_vf_vf_vf_vf(u, vf2getx_vf_vf2(s), vcast_vf_f(+0.4166637361e-1f));
  t = dfadd2_vf2_vf2_vf(dfmul_vf2_vf2_vf(s, u), vcast_vf_f(+0.166666659414234244790680580464e+0f));
  t = dfadd2_vf2_vf2_vf(dfmul_vf2_vf2_vf2(s, t), vcast_vf_f(0.5));
  t = dfadd2_vf2_vf2_vf2(s, dfmul_vf2_vf2_vf2(dfsqu_vf2_vf2(s), t));
  t = dfadd_vf2_vf_vf2(vcast_vf_f(1), t);
  t = vf2setx_vf2_vf2_vf(t, vldexp2_vf_vf_vi2(vf2getx_vf_vf2(t), q));
  t = vf2sety_vf2_vf2_vf(t, vldexp2_vf_vf_vi2(vf2gety_vf_vf2(t), q));
  t = vf2setx_vf2_vf2_vf(t, vreinterpret_vf_vm(vandnot_vm_vo32_vm(vlt_vo_vf_vf(vf2getx_vf_vf2(d), vcast_vf_f(-104)), vreinterpret_vm_vf(vf2getx_vf_vf2(t)))));
  t = vf2sety_vf2_vf2_vf(t, vreinterpret_vf_vm(vandnot_vm_vo32_vm(vlt_vo_vf_vf(vf2getx_vf_vf2(d), vcast_vf_f(-104)), vreinterpret_vm_vf(vf2gety_vf_vf2(t)))));
  return t;
}

#if !defined(DETERMINISTIC)
// xsinhf: sinh via (e - 1/e)/2 with e = exp(|x|) in double-float; overflows
// to +inf past |x| > 89, then the input sign is restored and NaN propagated.
EXPORT CONST VECTOR_CC vfloat xsinhf(vfloat x) {
  vfloat y = vabs_vf_vf(x);
  vfloat2 d = expk2f(vcast_vf2_vf_vf(y, vcast_vf_f(0)));
  d = dfsub_vf2_vf2_vf2(d, dfrec_vf2_vf2(d));
  y = vmul_vf_vf_vf(vadd_vf_vf_vf(vf2getx_vf_vf2(d), vf2gety_vf_vf2(d)), vcast_vf_f(0.5));
  y = vsel_vf_vo_vf_vf(vor_vo_vo_vo(vgt_vo_vf_vf(vabs_vf_vf(x), vcast_vf_f(89)), visnan_vo_vf(y)), vcast_vf_f(SLEEF_INFINITYf), y);
  y = vmulsign_vf_vf_vf(y, x);
  y = vreinterpret_vf_vm(vor_vm_vo32_vm(visnan_vo_vf(x), vreinterpret_vm_vf(y)));
  return y;
}

// xcoshf: cosh via (e + 1/e)/2; same overflow threshold as xsinhf, no sign
// restoration since cosh is even.
EXPORT CONST VECTOR_CC vfloat xcoshf(vfloat x) {
  vfloat y = vabs_vf_vf(x);
  vfloat2 d = expk2f(vcast_vf2_vf_vf(y, vcast_vf_f(0)));
  d = dfadd_vf2_vf2_vf2(d, dfrec_vf2_vf2(d));
  y = vmul_vf_vf_vf(vadd_vf_vf_vf(vf2getx_vf_vf2(d), vf2gety_vf_vf2(d)), vcast_vf_f(0.5));
  y = vsel_vf_vo_vf_vf(vor_vo_vo_vo(vgt_vo_vf_vf(vabs_vf_vf(x), vcast_vf_f(89)), visnan_vo_vf(y)), vcast_vf_f(SLEEF_INFINITYf), y);
  y = vreinterpret_vf_vm(vor_vm_vo32_vm(visnan_vo_vf(x), vreinterpret_vm_vf(y)));
  return y;
}

// xtanhf: tanh via (e - 1/e)/(e + 1/e); saturates to +-1 past |x| > 8.664...
// (where float tanh is exactly 1).
EXPORT CONST VECTOR_CC vfloat xtanhf(vfloat x) {
  vfloat y = vabs_vf_vf(x);
  vfloat2 d = expk2f(vcast_vf2_vf_vf(y, vcast_vf_f(0)));
  vfloat2 e = dfrec_vf2_vf2(d);
  d = dfdiv_vf2_vf2_vf2(dfadd_vf2_vf2_vf2(d, dfneg_vf2_vf2(e)), dfadd_vf2_vf2_vf2(d, e));
  y = vadd_vf_vf_vf(vf2getx_vf_vf2(d), vf2gety_vf_vf2(d));
  y = vsel_vf_vo_vf_vf(vor_vo_vo_vo(vgt_vo_vf_vf(vabs_vf_vf(x), vcast_vf_f(8.664339742f)), visnan_vo_vf(y)), vcast_vf_f(1.0f), y);
  y = vmulsign_vf_vf_vf(y, x);
  y = vreinterpret_vf_vm(vor_vm_vo32_vm(visnan_vo_vf(x), vreinterpret_vm_vf(y)));
  return y;
}

// xsinhf_u35: faster/looser sinh using the expm1fk kernel:
// with e = expm1(|x|), sinh = 0.5 * e * (e+2)/(e+1).
EXPORT CONST VECTOR_CC vfloat xsinhf_u35(vfloat x) {
  vfloat e = expm1fk(vabs_vf_vf(x));
  vfloat y = vdiv_vf_vf_vf(vadd_vf_vf_vf(e, vcast_vf_f(2)), vadd_vf_vf_vf(e, vcast_vf_f(1)));
  y = vmul_vf_vf_vf(y, vmul_vf_vf_vf(vcast_vf_f(0.5f), e));
  y = vsel_vf_vo_vf_vf(vor_vo_vo_vo(vgt_vo_vf_vf(vabs_vf_vf(x), vcast_vf_f(88)), visnan_vo_vf(y)), vcast_vf_f(SLEEF_INFINITYf), y);
  y = vmulsign_vf_vf_vf(y, x);
  y = vreinterpret_vf_vm(vor_vm_vo32_vm(visnan_vo_vf(x), vreinterpret_vm_vf(y)));
  return y;
}

// xcoshf_u35: faster/looser cosh: 0.5*e + 0.5/e with e = exp(|x|).
EXPORT CONST VECTOR_CC vfloat xcoshf_u35(vfloat x) {
  vfloat e = xexpf(vabs_vf_vf(x));
  vfloat y = vmla_vf_vf_vf_vf(vcast_vf_f(0.5f), e, vdiv_vf_vf_vf(vcast_vf_f(0.5), e));
  y = vsel_vf_vo_vf_vf(vor_vo_vo_vo(vgt_vo_vf_vf(vabs_vf_vf(x), vcast_vf_f(88)), visnan_vo_vf(y)), vcast_vf_f(SLEEF_INFINITYf), y);
  y = vreinterpret_vf_vm(vor_vm_vo32_vm(visnan_vo_vf(x), vreinterpret_vm_vf(y)));
  return y;
}

// xtanhf_u35: faster/looser tanh: d/(d+2) with d = expm1(2|x|).
EXPORT CONST VECTOR_CC vfloat xtanhf_u35(vfloat x) {
  vfloat d = expm1fk(vmul_vf_vf_vf(vcast_vf_f(2), vabs_vf_vf(x)));
  vfloat y = vdiv_vf_vf_vf(d, vadd_vf_vf_vf(vcast_vf_f(2), d));
  y = vsel_vf_vo_vf_vf(vor_vo_vo_vo(vgt_vo_vf_vf(vabs_vf_vf(x), vcast_vf_f(8.664339742f)), visnan_vo_vf(y)), vcast_vf_f(1.0f), y);
  y = vmulsign_vf_vf_vf(y, x);
  y = vreinterpret_vf_vm(vor_vm_vo32_vm(visnan_vo_vf(x), vreinterpret_vm_vf(y)));
  return y;
}
#endif // #if !defined(DETERMINISTIC)

// logk2f: log kernel with a double-float argument AND result; used by the
// inverse hyperbolic functions (xasinhf/xacoshf/xatanhf) below.
static INLINE CONST VECTOR_CC vfloat2 logk2f(vfloat2 d) {
  vfloat2 x, x2, m, s;
  vfloat t;
  vint2 e;
#if !defined(ENABLE_AVX512F) && !defined(ENABLE_AVX512FNOFMA)
  e = vilogbk_vi2_vf(vmul_vf_vf_vf(vf2getx_vf_vf2(d), vcast_vf_f(1.0f/0.75f)));
#else
  e = vrint_vi2_vf(vgetexp_vf_vf(vmul_vf_vf_vf(vf2getx_vf_vf2(d), vcast_vf_f(1.0f/0.75f))));
#endif
  m = dfscale_vf2_vf2_vf(d, vpow2i_vf_vi2(vneg_vi2_vi2(e)));
  x = dfdiv_vf2_vf2_vf2(dfadd2_vf2_vf2_vf(m, vcast_vf_f(-1)), dfadd2_vf2_vf2_vf(m, vcast_vf_f(1)));
  x2 = dfsqu_vf2_vf2(x);
  t = vcast_vf_f(0.2392828464508056640625f);
  t = vmla_vf_vf_vf_vf(t, vf2getx_vf_vf2(x2),
vcast_vf_f(0.28518211841583251953125f));
  t = vmla_vf_vf_vf_vf(t, vf2getx_vf_vf2(x2), vcast_vf_f(0.400005877017974853515625f));
  t = vmla_vf_vf_vf_vf(t, vf2getx_vf_vf2(x2), vcast_vf_f(0.666666686534881591796875f));
  // s = e*ln(2) (double-float) + 2x + x^3 * t
  s = dfmul_vf2_vf2_vf(vcast_vf2_vf_vf(vcast_vf_f(0.69314718246459960938f), vcast_vf_f(-1.904654323148236017e-09f)), vcast_vf_vi2(e));
  s = dfadd_vf2_vf2_vf2(s, dfscale_vf2_vf2_vf(x, vcast_vf_f(2)));
  s = dfadd_vf2_vf2_vf2(s, dfmul_vf2_vf2_vf(dfmul_vf2_vf2_vf2(x2, x), t));
  return s;
}

#if !defined(DETERMINISTIC)
// xasinhf: asinh(x) = log(x + sqrt(x^2 + 1)), computed in double-float via
// logk2f. For |x| > 1 the reciprocal formulation is used for accuracy.
// Overflows to +-inf past sqrt(FLT_MAX); preserves -0 and propagates NaN.
EXPORT CONST VECTOR_CC vfloat xasinhf(vfloat x) {
  vfloat y = vabs_vf_vf(x);
  vopmask o = vgt_vo_vf_vf(y, vcast_vf_f(1));
  vfloat2 d;
  d = vsel_vf2_vo_vf2_vf2(o, dfrec_vf2_vf(x), vcast_vf2_vf_vf(y, vcast_vf_f(0)));
  d = dfsqrt_vf2_vf2(dfadd2_vf2_vf2_vf(dfsqu_vf2_vf2(d), vcast_vf_f(1)));
  d = vsel_vf2_vo_vf2_vf2(o, dfmul_vf2_vf2_vf(d, y), d);
  d = logk2f(dfnormalize_vf2_vf2(dfadd2_vf2_vf2_vf(d, x)));
  y = vadd_vf_vf_vf(vf2getx_vf_vf2(d), vf2gety_vf_vf2(d));
  y = vsel_vf_vo_vf_vf(vor_vo_vo_vo(vgt_vo_vf_vf(vabs_vf_vf(x), vcast_vf_f(SQRT_FLT_MAX)), visnan_vo_vf(y)), vmulsign_vf_vf_vf(vcast_vf_f(SLEEF_INFINITYf), x), y);
  y = vreinterpret_vf_vm(vor_vm_vo32_vm(visnan_vo_vf(x), vreinterpret_vm_vf(y)));
  y = vsel_vf_vo_vf_vf(visnegzero_vo_vf(x), vcast_vf_f(-0.0), y);
  return y;
}

// xacoshf: acosh(x) = log(sqrt(x+1)*sqrt(x-1) + x). Returns 0 at x==1 and
// NaN (all-ones mask) for x < 1; overflows to +inf past sqrt(FLT_MAX).
EXPORT CONST VECTOR_CC vfloat xacoshf(vfloat x) {
  vfloat2 d = logk2f(dfadd2_vf2_vf2_vf(dfmul_vf2_vf2_vf2(dfsqrt_vf2_vf2(dfadd2_vf2_vf_vf(x, vcast_vf_f(1))), dfsqrt_vf2_vf2(dfadd2_vf2_vf_vf(x, vcast_vf_f(-1)))), x));
  vfloat y = vadd_vf_vf_vf(vf2getx_vf_vf2(d), vf2gety_vf_vf2(d));
  y = vsel_vf_vo_vf_vf(vor_vo_vo_vo(vgt_vo_vf_vf(vabs_vf_vf(x), vcast_vf_f(SQRT_FLT_MAX)), visnan_vo_vf(y)), vcast_vf_f(SLEEF_INFINITYf), y);
  y = vreinterpret_vf_vm(vandnot_vm_vo32_vm(veq_vo_vf_vf(x, vcast_vf_f(1.0f)), vreinterpret_vm_vf(y)));
  y = vreinterpret_vf_vm(vor_vm_vo32_vm(vlt_vo_vf_vf(x, vcast_vf_f(1.0f)), vreinterpret_vm_vf(y)));
  y = vreinterpret_vf_vm(vor_vm_vo32_vm(visnan_vo_vf(x), vreinterpret_vm_vf(y)));
  return y;
}

// xatanhf: atanh(x) = 0.5 * log((1+|x|)/(1-|x|)); +-inf at |x|==1, NaN for
// |x|>1 and for infinite x; sign of x restored at the end.
EXPORT CONST VECTOR_CC vfloat xatanhf(vfloat x) {
  vfloat y = vabs_vf_vf(x);
  vfloat2 d = logk2f(dfdiv_vf2_vf2_vf2(dfadd2_vf2_vf_vf(vcast_vf_f(1), y), dfadd2_vf2_vf_vf(vcast_vf_f(1), vneg_vf_vf(y))));
  y = vreinterpret_vf_vm(vor_vm_vo32_vm(vgt_vo_vf_vf(y, vcast_vf_f(1.0)), vreinterpret_vm_vf(vsel_vf_vo_vf_vf(veq_vo_vf_vf(y, vcast_vf_f(1.0)), vcast_vf_f(SLEEF_INFINITYf), vmul_vf_vf_vf(vadd_vf_vf_vf(vf2getx_vf_vf2(d), vf2gety_vf_vf2(d)), vcast_vf_f(0.5))))));
  y = vreinterpret_vf_vm(vor_vm_vo32_vm(vor_vo_vo_vo(visinf_vo_vf(x), visnan_vo_vf(y)), vreinterpret_vm_vf(y)));
  y = vmulsign_vf_vf_vf(y, x);
  y = vreinterpret_vf_vm(vor_vm_vo32_vm(visnan_vo_vf(x), vreinterpret_vm_vf(y)));
  return y;
}
#endif // #if !defined(DETERMINISTIC)

#if !defined(DETERMINISTIC)
// xexp2f: 2^d. Reduction d = q + s with q = round(d), |s| <= 0.5, polynomial
// in s for 2^s, then scale by 2^q. Overflows to +inf at d >= 128, flushes to
// zero below d < -150.
EXPORT CONST VECTOR_CC vfloat xexp2f(vfloat d) {
  vfloat u = vrint_vf_vf(d), s;
  vint2 q = vrint_vi2_vf(u);
  s = vsub_vf_vf_vf(d, u);
  u = vcast_vf_f(+0.1535920892e-3);
  u = vmla_vf_vf_vf_vf(u, s, vcast_vf_f(+0.1339262701e-2));
  u = vmla_vf_vf_vf_vf(u, s, vcast_vf_f(+0.9618384764e-2));
  u = vmla_vf_vf_vf_vf(u, s, vcast_vf_f(+0.5550347269e-1));
  u = vmla_vf_vf_vf_vf(u, s, vcast_vf_f(+0.2402264476e+0));
  u = vmla_vf_vf_vf_vf(u, s, vcast_vf_f(+0.6931471825e+0));
#ifdef ENABLE_FMA_SP
  u = vfma_vf_vf_vf_vf(u, s, vcast_vf_f(1));
#else
  // Without FMA, the final 1 + u*s step is done in double-float to keep accuracy.
  u = vf2getx_vf_vf2(dfnormalize_vf2_vf2(dfadd_vf2_vf_vf2(vcast_vf_f(1), dfmul_vf2_vf_vf(u, s))));
#endif
  u = vldexp2_vf_vf_vi2(u, q);
  u = vsel_vf_vo_vf_vf(vge_vo_vf_vf(d, vcast_vf_f(128)), vcast_vf_f(SLEEF_INFINITY), u);
  u = vreinterpret_vf_vm(vandnot_vm_vo32_vm(vlt_vo_vf_vf(d, vcast_vf_f(-150)), vreinterpret_vm_vf(u)));
  return u;
}

// xexp2f_u35: looser-accuracy 2^d — same polynomial but the last step is a
// plain fused multiply-add instead of the double-float correction.
EXPORT CONST VECTOR_CC vfloat xexp2f_u35(vfloat d) {
  vfloat u = vrint_vf_vf(d), s;
  vint2 q = vrint_vi2_vf(u);
  s = vsub_vf_vf_vf(d, u);
  u = vcast_vf_f(+0.1535920892e-3);
  u = vmla_vf_vf_vf_vf(u, s, vcast_vf_f(+0.1339262701e-2));
  u = vmla_vf_vf_vf_vf(u, s, vcast_vf_f(+0.9618384764e-2));
  u = vmla_vf_vf_vf_vf(u, s, vcast_vf_f(+0.5550347269e-1));
  u = vmla_vf_vf_vf_vf(u, s, vcast_vf_f(+0.2402264476e+0));
  u = vmla_vf_vf_vf_vf(u, s, vcast_vf_f(+0.6931471825e+0));
  u = vmla_vf_vf_vf_vf(u, s, vcast_vf_f(+0.1000000000e+1));
  u = vldexp2_vf_vf_vi2(u, q);
  u = vsel_vf_vo_vf_vf(vge_vo_vf_vf(d, vcast_vf_f(128)), vcast_vf_f(SLEEF_INFINITY), u);
  u = vreinterpret_vf_vm(vandnot_vm_vo32_vm(vlt_vo_vf_vf(d, vcast_vf_f(-150)), vreinterpret_vm_vf(u)));
  return u;
}

// xexp10f: 10^d. Reduction d = q*log10(2) + s using the split constants
// L10Uf/L10Lf; overflow threshold is log10(FLT_MAX) ~= 38.53.
EXPORT CONST VECTOR_CC vfloat xexp10f(vfloat d) {
  vfloat u = vrint_vf_vf(vmul_vf_vf_vf(d, vcast_vf_f(LOG10_2))), s;
  vint2 q = vrint_vi2_vf(u);
  s = vmla_vf_vf_vf_vf(u, vcast_vf_f(-L10Uf), d);
  s = vmla_vf_vf_vf_vf(u, vcast_vf_f(-L10Lf), s);
  u = vcast_vf_f(+0.6802555919e-1);
  u = vmla_vf_vf_vf_vf(u, s, vcast_vf_f(+0.2078080326e+0));
  u = vmla_vf_vf_vf_vf(u, s, vcast_vf_f(+0.5393903852e+0));
  u = vmla_vf_vf_vf_vf(u, s, vcast_vf_f(+0.1171245337e+1));
  u = vmla_vf_vf_vf_vf(u, s, vcast_vf_f(+0.2034678698e+1));
  u = vmla_vf_vf_vf_vf(u, s, vcast_vf_f(+0.2650949001e+1));
  // Leading ln(10) coefficient kept in double-float for the final 1 + x*s step.
  vfloat2 x = dfadd_vf2_vf2_vf(vcast_vf2_f_f(2.3025851249694824219, -3.1705172516493593157e-08), vmul_vf_vf_vf(u, s));
  u = vf2getx_vf_vf2(dfnormalize_vf2_vf2(dfadd_vf2_vf_vf2(vcast_vf_f(1), dfmul_vf2_vf2_vf(x, s))));
  u = vldexp2_vf_vf_vi2(u, q);
  u = vsel_vf_vo_vf_vf(vgt_vo_vf_vf(d, vcast_vf_f(38.5318394191036238941387f)), vcast_vf_f(SLEEF_INFINITYf), u);
  u = vreinterpret_vf_vm(vandnot_vm_vo32_vm(vlt_vo_vf_vf(d, vcast_vf_f(-50)), vreinterpret_vm_vf(u)));
  return u;
}

// xexp10f_u35: looser-accuracy 10^d using a single-width polynomial only.
EXPORT CONST VECTOR_CC vfloat xexp10f_u35(vfloat d) {
  vfloat u = vrint_vf_vf(vmul_vf_vf_vf(d, vcast_vf_f(LOG10_2))), s;
  vint2 q = vrint_vi2_vf(u);
  s = vmla_vf_vf_vf_vf(u, vcast_vf_f(-L10Uf), d);
  s = vmla_vf_vf_vf_vf(u, vcast_vf_f(-L10Lf), s);
  u = vcast_vf_f(+0.2064004987e+0);
  u = vmla_vf_vf_vf_vf(u, s, vcast_vf_f(+0.5417877436e+0));
  u = vmla_vf_vf_vf_vf(u, s, vcast_vf_f(+0.1171286821e+1));
  u = vmla_vf_vf_vf_vf(u, s, vcast_vf_f(+0.2034656048e+1));
  u = vmla_vf_vf_vf_vf(u, s, vcast_vf_f(+0.2650948763e+1));
  u = vmla_vf_vf_vf_vf(u, s, vcast_vf_f(+0.2302585125e+1));
  u = vmla_vf_vf_vf_vf(u,
dfmul_vf2_vf2_vf(vcast_vf2_f_f(0.30103001, -1.432098889e-08), e);
#endif
  // s = e*log10(2) + x*log10(e) + x^3*t, accumulated in double-float.
  s = dfadd_vf2_vf2_vf2(s, dfmul_vf2_vf2_vf2(x, vcast_vf2_f_f(0.868588984, -2.170757285e-08)));
  s = dfadd_vf2_vf2_vf(s, vmul_vf_vf_vf(vmul_vf_vf_vf(x2, vf2getx_vf_vf2(x)), t));
  vfloat r = vadd_vf_vf_vf(vf2getx_vf_vf2(s), vf2gety_vf_vf2(s));
#if !defined(ENABLE_AVX512F) && !defined(ENABLE_AVX512FNOFMA)
  r = vsel_vf_vo_vf_vf(vispinf_vo_vf(d), vcast_vf_f(SLEEF_INFINITY), r);
  r = vsel_vf_vo_vf_vf(vor_vo_vo_vo(vlt_vo_vf_vf(d, vcast_vf_f(0)), visnan_vo_vf(d)), vcast_vf_f(SLEEF_NAN), r);
  r = vsel_vf_vo_vf_vf(veq_vo_vf_vf(d, vcast_vf_f(0)), vcast_vf_f(-SLEEF_INFINITY), r);
#else
  // vfixup table: 0 -> -inf, negative -> NaN, +inf -> +inf.
  r = vfixup_vf_vf_vf_vi2_i(r, d, vcast_vi2_i((4 << (2*4)) | (3 << (4*4)) | (5 << (5*4)) | (2 << (6*4))), 0);
#endif
  return r;
}

// xlog2f: base-2 log. Same m*2^e decomposition and atanh-series structure
// as xlogf_u1, with log2-scaled coefficients; e is added directly since
// log2(2^e) == e.
EXPORT CONST VECTOR_CC vfloat xlog2f(vfloat d) {
  vfloat2 x;
  vfloat t, m, x2;
#if !defined(ENABLE_AVX512F) && !defined(ENABLE_AVX512FNOFMA)
  vopmask o = vlt_vo_vf_vf(d, vcast_vf_f(FLT_MIN));
  d = vsel_vf_vo_vf_vf(o, vmul_vf_vf_vf(d, vcast_vf_f((float)(INT64_C(1) << 32) * (float)(INT64_C(1) << 32))), d);
  vint2 e = vilogb2k_vi2_vf(vmul_vf_vf_vf(d, vcast_vf_f(1.0/0.75)));
  m = vldexp3_vf_vf_vi2(d, vneg_vi2_vi2(e));
  e = vsel_vi2_vo_vi2_vi2(o, vsub_vi2_vi2_vi2(e, vcast_vi2_i(64)), e);
#else
  vfloat e = vgetexp_vf_vf(vmul_vf_vf_vf(d, vcast_vf_f(1.0/0.75)));
  e = vsel_vf_vo_vf_vf(vispinf_vo_vf(e), vcast_vf_f(128.0f), e);
  m = vgetmant_vf_vf(d);
#endif
  x = dfdiv_vf2_vf2_vf2(dfadd2_vf2_vf_vf(vcast_vf_f(-1), m), dfadd2_vf2_vf_vf(vcast_vf_f(1), m));
  x2 = vmul_vf_vf_vf(vf2getx_vf_vf2(x), vf2getx_vf_vf2(x));
  t = vcast_vf_f(+0.4374550283e+0f);
  t = vmla_vf_vf_vf_vf(t, x2, vcast_vf_f(+0.5764790177e+0f));
  t = vmla_vf_vf_vf_vf(t, x2, vcast_vf_f(+0.9618012905120f));
#if !defined(ENABLE_AVX512F) && !defined(ENABLE_AVX512FNOFMA)
  // 2.885390... is 2/ln(2), split into a double-float pair.
  vfloat2 s = dfadd2_vf2_vf_vf2(vcast_vf_vi2(e), dfmul_vf2_vf2_vf2(x, vcast_vf2_f_f(2.8853900432586669922, 3.2734474483568488616e-08)));
#else
  vfloat2 s = dfadd2_vf2_vf_vf2(e, dfmul_vf2_vf2_vf2(x, vcast_vf2_f_f(2.8853900432586669922, 3.2734474483568488616e-08)));
#endif
  s = dfadd2_vf2_vf2_vf(s, vmul_vf_vf_vf(vmul_vf_vf_vf(x2, vf2getx_vf_vf2(x)), t));
  vfloat r = vadd_vf_vf_vf(vf2getx_vf_vf2(s), vf2gety_vf_vf2(s));
#if !defined(ENABLE_AVX512F) && !defined(ENABLE_AVX512FNOFMA)
  r = vsel_vf_vo_vf_vf(vispinf_vo_vf(d), vcast_vf_f(SLEEF_INFINITY), r);
  r = vsel_vf_vo_vf_vf(vor_vo_vo_vo(vlt_vo_vf_vf(d, vcast_vf_f(0)), visnan_vo_vf(d)), vcast_vf_f(SLEEF_NAN), r);
  r = vsel_vf_vo_vf_vf(veq_vo_vf_vf(d, vcast_vf_f(0)), vcast_vf_f(-SLEEF_INFINITY), r);
#else
  r = vfixup_vf_vf_vf_vi2_i(r, d, vcast_vi2_i((4 << (2*4)) | (3 << (4*4)) | (5 << (5*4)) | (2 << (6*4))), 0);
#endif
  return r;
}

// xlog2f_u35: looser-accuracy base-2 log; entirely single-width arithmetic
// (no double-float accumulation).
EXPORT CONST VECTOR_CC vfloat xlog2f_u35(vfloat d) {
  vfloat m, t, x, x2;
#if !defined(ENABLE_AVX512F) && !defined(ENABLE_AVX512FNOFMA)
  vopmask o = vlt_vo_vf_vf(d, vcast_vf_f(FLT_MIN));
  d = vsel_vf_vo_vf_vf(o, vmul_vf_vf_vf(d, vcast_vf_f((float)(INT64_C(1) << 32) * (float)(INT64_C(1) << 32))), d);
  vint2 e = vilogb2k_vi2_vf(vmul_vf_vf_vf(d, vcast_vf_f(1.0/0.75)));
  m = vldexp3_vf_vf_vi2(d, vneg_vi2_vi2(e));
  e = vsel_vi2_vo_vi2_vi2(o, vsub_vi2_vi2_vi2(e, vcast_vi2_i(64)), e);
#else
  vfloat e = vgetexp_vf_vf(vmul_vf_vf_vf(d, vcast_vf_f(1.0/0.75)));
  e = vsel_vf_vo_vf_vf(vispinf_vo_vf(e), vcast_vf_f(128.0f), e);
  m = vgetmant_vf_vf(d);
#endif
  x = vdiv_vf_vf_vf(vsub_vf_vf_vf(m, vcast_vf_f(1)), vadd_vf_vf_vf(m, vcast_vf_f(1)));
  x2 = vmul_vf_vf_vf(x, x);
  t = vcast_vf_f(+0.4374088347e+0);
  t = vmla_vf_vf_vf_vf(t, x2, vcast_vf_f(+0.5764843822e+0));
  t = vmla_vf_vf_vf_vf(t, x2, vcast_vf_f(+0.9618024230e+0));
#if !defined(ENABLE_AVX512F) && !defined(ENABLE_AVX512FNOFMA)
  vfloat r = vmla_vf_vf_vf_vf(vmul_vf_vf_vf(x2, x), t, vmla_vf_vf_vf_vf(x, vcast_vf_f(+0.2885390043e+1), vcast_vf_vi2(e)));
  r = vsel_vf_vo_vf_vf(vispinf_vo_vf(d), vcast_vf_f(SLEEF_INFINITY), r);
  r = vsel_vf_vo_vf_vf(vor_vo_vo_vo(vlt_vo_vf_vf(d, vcast_vf_f(0)), visnan_vo_vf(d)), vcast_vf_f(SLEEF_NAN), r);
  r = vsel_vf_vo_vf_vf(veq_vo_vf_vf(d, vcast_vf_f(0)), vcast_vf_f(-SLEEF_INFINITY), r);
#else
  vfloat r = vmla_vf_vf_vf_vf(vmul_vf_vf_vf(x2, x), t, vmla_vf_vf_vf_vf(x, vcast_vf_f(+0.2885390043e+1), e));
  r = vfixup_vf_vf_vf_vi2_i(r, d, vcast_vi2_i((4 << (2*4)) | (3 << (4*4)) | (5 << (5*4)) | (2 << (6*4))), 0);
#endif
  return r;
}

// xlog1pf: log(1+d), accurate near d == 0. The mantissa m is reconstructed
// as d*t + (t-1) (with t = 2^-e) so that no precision is lost forming 1+d;
// special cases: overflow past 1e38, NaN for d < -1, -inf at d == -1, -0 in.
EXPORT CONST VECTOR_CC vfloat xlog1pf(vfloat d) {
  vfloat2 x;
  vfloat t, m, x2;
  vfloat dp1 = vadd_vf_vf_vf(d, vcast_vf_f(1));
#if !defined(ENABLE_AVX512F) && !defined(ENABLE_AVX512FNOFMA)
  vopmask o = vlt_vo_vf_vf(dp1, vcast_vf_f(FLT_MIN));
  dp1 = vsel_vf_vo_vf_vf(o, vmul_vf_vf_vf(dp1, vcast_vf_f((float)(INT64_C(1) << 32) * (float)(INT64_C(1) << 32))), dp1);
  vint2 e = vilogb2k_vi2_vf(vmul_vf_vf_vf(dp1, vcast_vf_f(1.0f/0.75f)));
  t = vldexp3_vf_vf_vi2(vcast_vf_f(1), vneg_vi2_vi2(e));
  m = vmla_vf_vf_vf_vf(d, t, vsub_vf_vf_vf(t, vcast_vf_f(1)));
  e = vsel_vi2_vo_vi2_vi2(o, vsub_vi2_vi2_vi2(e, vcast_vi2_i(64)), e);
  vfloat2 s = dfmul_vf2_vf2_vf(vcast_vf2_f_f(0.69314718246459960938f, -1.904654323148236017e-09f), vcast_vf_vi2(e));
#else
  vfloat e = vgetexp_vf_vf(vmul_vf_vf_vf(dp1, vcast_vf_f(1.0f/0.75f)));
  e = vsel_vf_vo_vf_vf(vispinf_vo_vf(e), vcast_vf_f(128.0f), e);
  t = vldexp3_vf_vf_vi2(vcast_vf_f(1), vneg_vi2_vi2(vrint_vi2_vf(e)));
  m = vmla_vf_vf_vf_vf(d, t, vsub_vf_vf_vf(t, vcast_vf_f(1)));
  vfloat2 s = dfmul_vf2_vf2_vf(vcast_vf2_f_f(0.69314718246459960938f, -1.904654323148236017e-09f), e);
#endif
  x = dfdiv_vf2_vf2_vf2(vcast_vf2_vf_vf(m, vcast_vf_f(0)), dfadd_vf2_vf_vf(vcast_vf_f(2), m));
  x2 = vmul_vf_vf_vf(vf2getx_vf_vf2(x), vf2getx_vf_vf2(x));
  t = vcast_vf_f(+0.3027294874e+0f);
  t = vmla_vf_vf_vf_vf(t, x2, vcast_vf_f(+0.3996108174e+0f));
  t = vmla_vf_vf_vf_vf(t, x2, vcast_vf_f(+0.6666694880e+0f));
  s = dfadd_vf2_vf2_vf2(s, dfscale_vf2_vf2_vf(x, vcast_vf_f(2)));
  s = dfadd_vf2_vf2_vf(s, vmul_vf_vf_vf(vmul_vf_vf_vf(x2, vf2getx_vf_vf2(x)), t));
  vfloat r = vadd_vf_vf_vf(vf2getx_vf_vf2(s), vf2gety_vf_vf2(s));
  r = vsel_vf_vo_vf_vf(vgt_vo_vf_vf(d, vcast_vf_f(1e+38)), vcast_vf_f(SLEEF_INFINITYf), r);
  r = vreinterpret_vf_vm(vor_vm_vo32_vm(vgt_vo_vf_vf(vcast_vf_f(-1), d), vreinterpret_vm_vf(r)));
  r = vsel_vf_vo_vf_vf(veq_vo_vf_vf(d, vcast_vf_f(-1)), vcast_vf_f(-SLEEF_INFINITYf), r);
  r = vsel_vf_vo_vf_vf(visnegzero_vo_vf(d), vcast_vf_f(-0.0f), r);
  return r;
}
#endif // #if !defined(DETERMINISTIC)

// #if !defined(DETERMINISTIC)

// xfabsf: |x|.
EXPORT CONST VECTOR_CC vfloat xfabsf(vfloat x) { return vabs_vf_vf(x); }

// xcopysignf: magnitude of x with the sign of y.
EXPORT CONST VECTOR_CC vfloat xcopysignf(vfloat x, vfloat y) { return vcopysign_vf_vf_vf(x, y); }

// xfmaxf: fmax semantics — if y is NaN, return x (on x86 the hardware max
// already returns the second operand on NaN, so vmax is used directly).
EXPORT CONST VECTOR_CC vfloat xfmaxf(vfloat x, vfloat y) {
#if (defined(__x86_64__) || defined(__i386__)) && !defined(ENABLE_VECEXT) && !defined(ENABLE_PUREC)
  return vsel_vf_vo_vf_vf(visnan_vo_vf(y), x, vmax_vf_vf_vf(x, y));
#else
  return vsel_vf_vo_vf_vf(visnan_vo_vf(y), x, vsel_vf_vo_vf_vf(vgt_vo_vf_vf(x, y), x, y));
#endif
}

// xfminf: fmin semantics, mirror of xfmaxf.
EXPORT CONST VECTOR_CC vfloat xfminf(vfloat x, vfloat y) {
#if (defined(__x86_64__) || defined(__i386__)) && !defined(ENABLE_VECEXT) && !defined(ENABLE_PUREC)
  return vsel_vf_vo_vf_vf(visnan_vo_vf(y), x, vmin_vf_vf_vf(x, y));
#else
  return vsel_vf_vo_vf_vf(visnan_vo_vf(y), x, vsel_vf_vo_vf_vf(vgt_vo_vf_vf(y, x), x, y));
#endif
}

// xfdimf: positive difference — max(x - y, 0).
EXPORT CONST VECTOR_CC vfloat xfdimf(vfloat x, vfloat y) {
  vfloat ret = vsub_vf_vf_vf(x, y);
  ret = vsel_vf_vo_vf_vf(vor_vo_vo_vo(vlt_vo_vf_vf(ret, vcast_vf_f(0)), veq_vo_vf_vf(x, y)), vcast_vf_f(0), ret);
  return ret;
}

// xtruncf: round toward zero. |x| >= 2^23 (or inf) is already integral and
// is passed through; the sign is re-applied via copysign to preserve -0.
EXPORT CONST VECTOR_CC vfloat xtruncf(vfloat x) {
#ifdef FULL_FP_ROUNDING
  return vtruncate_vf_vf(x);
#else
  vfloat fr = vsub_vf_vf_vf(x, vcast_vf_vi2(vtruncate_vi2_vf(x)));
  return vsel_vf_vo_vf_vf(vor_vo_vo_vo(visinf_vo_vf(x), vge_vo_vf_vf(vabs_vf_vf(x), vcast_vf_f(INT64_C(1) << 23))), x, vcopysign_vf_vf_vf(vsub_vf_vf_vf(x, fr), x));
#endif
}

// xfloorf: round toward -inf; negative fractional parts are shifted up by 1.
EXPORT CONST VECTOR_CC vfloat xfloorf(vfloat x) {
  vfloat fr = vsub_vf_vf_vf(x, vcast_vf_vi2(vtruncate_vi2_vf(x)));
  fr = vsel_vf_vo_vf_vf(vlt_vo_vf_vf(fr, vcast_vf_f(0)), vadd_vf_vf_vf(fr, vcast_vf_f(1.0f)), fr);
  return vsel_vf_vo_vf_vf(vor_vo_vo_vo(visinf_vo_vf(x),
vge_vo_vf_vf(vabs_vf_vf(x), vcast_vf_f(INT64_C(1) << 23))), x, vcopysign_vf_vf_vf(vsub_vf_vf_vf(x, fr), x)); } EXPORT CONST VECTOR_CC vfloat xceilf(vfloat x) { vfloat fr = vsub_vf_vf_vf(x, vcast_vf_vi2(vtruncate_vi2_vf(x))); fr = vsel_vf_vo_vf_vf(vle_vo_vf_vf(fr, vcast_vf_f(0)), fr, vsub_vf_vf_vf(fr, vcast_vf_f(1.0f))); return vsel_vf_vo_vf_vf(vor_vo_vo_vo(visinf_vo_vf(x), vge_vo_vf_vf(vabs_vf_vf(x), vcast_vf_f(INT64_C(1) << 23))), x, vcopysign_vf_vf_vf(vsub_vf_vf_vf(x, fr), x)); } EXPORT CONST VECTOR_CC vfloat xroundf(vfloat d) { vfloat x = vadd_vf_vf_vf(d, vcast_vf_f(0.5f)); vfloat fr = vsub_vf_vf_vf(x, vcast_vf_vi2(vtruncate_vi2_vf(x))); x = vsel_vf_vo_vf_vf(vand_vo_vo_vo(vle_vo_vf_vf(x, vcast_vf_f(0)), veq_vo_vf_vf(fr, vcast_vf_f(0))), vsub_vf_vf_vf(x, vcast_vf_f(1.0f)), x); fr = vsel_vf_vo_vf_vf(vlt_vo_vf_vf(fr, vcast_vf_f(0)), vadd_vf_vf_vf(fr, vcast_vf_f(1.0f)), fr); x = vsel_vf_vo_vf_vf(veq_vo_vf_vf(d, vcast_vf_f(0.4999999701976776123f)), vcast_vf_f(0), x); return vsel_vf_vo_vf_vf(vor_vo_vo_vo(visinf_vo_vf(d), vge_vo_vf_vf(vabs_vf_vf(d), vcast_vf_f(INT64_C(1) << 23))), d, vcopysign_vf_vf_vf(vsub_vf_vf_vf(x, fr), d)); } EXPORT CONST VECTOR_CC vfloat xrintf(vfloat d) { #ifdef FULL_FP_ROUNDING return vrint_vf_vf(d); #else vfloat c = vmulsign_vf_vf_vf(vcast_vf_f(1 << 23), d); return vsel_vf_vo_vf_vf(vgt_vo_vf_vf(vabs_vf_vf(d), vcast_vf_f(1 << 23)), d, vorsign_vf_vf_vf(vsub_vf_vf_vf(vadd_vf_vf_vf(d, c), c), d)); #endif } EXPORT CONST VECTOR_CC vfloat xfmaf(vfloat x, vfloat y, vfloat z) { #ifdef ENABLE_FMA_SP return vfma_vf_vf_vf_vf(x, y, z); #else vfloat h2 = vadd_vf_vf_vf(vmul_vf_vf_vf(x, y), z), q = vcast_vf_f(1); vopmask o = vlt_vo_vf_vf(vabs_vf_vf(h2), vcast_vf_f(1e-38f)); { const float c0 = UINT64_C(1) << 25, c1 = c0 * c0, c2 = c1 * c1; x = vsel_vf_vo_vf_vf(o, vmul_vf_vf_vf(x, vcast_vf_f(c1)), x); y = vsel_vf_vo_vf_vf(o, vmul_vf_vf_vf(y, vcast_vf_f(c1)), y); z = vsel_vf_vo_vf_vf(o, vmul_vf_vf_vf(z, vcast_vf_f(c2)), z); q = vsel_vf_vo_vf_vf(o, 
vcast_vf_f(1.0f / c2), q); } o = vgt_vo_vf_vf(vabs_vf_vf(h2), vcast_vf_f(1e+38f)); { const float c0 = UINT64_C(1) << 25, c1 = c0 * c0, c2 = c1 * c1; x = vsel_vf_vo_vf_vf(o, vmul_vf_vf_vf(x, vcast_vf_f(1.0f / c1)), x); y = vsel_vf_vo_vf_vf(o, vmul_vf_vf_vf(y, vcast_vf_f(1.0f / c1)), y); z = vsel_vf_vo_vf_vf(o, vmul_vf_vf_vf(z, vcast_vf_f(1.0f / c2)), z); q = vsel_vf_vo_vf_vf(o, vcast_vf_f(c2), q); } vfloat2 d = dfmul_vf2_vf_vf(x, y); d = dfadd2_vf2_vf2_vf(d, z); vfloat ret = vsel_vf_vo_vf_vf(vor_vo_vo_vo(veq_vo_vf_vf(x, vcast_vf_f(0)), veq_vo_vf_vf(y, vcast_vf_f(0))), z, vadd_vf_vf_vf(vf2getx_vf_vf2(d), vf2gety_vf_vf2(d))); o = visinf_vo_vf(z); o = vandnot_vo_vo_vo(visinf_vo_vf(x), o); o = vandnot_vo_vo_vo(visnan_vo_vf(x), o); o = vandnot_vo_vo_vo(visinf_vo_vf(y), o); o = vandnot_vo_vo_vo(visnan_vo_vf(y), o); h2 = vsel_vf_vo_vf_vf(o, z, h2); o = vor_vo_vo_vo(visinf_vo_vf(h2), visnan_vo_vf(h2)); return vsel_vf_vo_vf_vf(o, h2, vmul_vf_vf_vf(ret, q)); #endif } #endif // #if !defined(DETERMINISTIC) #if !defined(SLEEF_GENHEADER) static INLINE CONST VECTOR_CC vint2 vcast_vi2_i_i(int i0, int i1) { return vcast_vi2_vm(vcast_vm_i_i(i0, i1)); } #endif SQRTFU05_FUNCATR VECTOR_CC vfloat xsqrtf_u05(vfloat d) { #if defined(ENABLE_FMA_SP) vfloat q, w, x, y, z; d = vsel_vf_vo_vf_vf(vlt_vo_vf_vf(d, vcast_vf_f(0)), vcast_vf_f(SLEEF_NANf), d); vopmask o = vlt_vo_vf_vf(d, vcast_vf_f(5.2939559203393770e-23f)); d = vsel_vf_vo_vf_vf(o, vmul_vf_vf_vf(d, vcast_vf_f(1.8889465931478580e+22f)), d); q = vsel_vf_vo_vf_vf(o, vcast_vf_f(7.2759576141834260e-12f), vcast_vf_f(1.0f)); y = vreinterpret_vf_vi2(vsub_vi2_vi2_vi2(vcast_vi2_i(0x5f3759df), vsrl_vi2_vi2_i(vreinterpret_vi2_vf(d), 1))); x = vmul_vf_vf_vf(d, y); w = vmul_vf_vf_vf(vcast_vf_f(0.5), y); y = vfmanp_vf_vf_vf_vf(x, w, vcast_vf_f(0.5)); x = vfma_vf_vf_vf_vf(x, y, x); w = vfma_vf_vf_vf_vf(w, y, w); y = vfmanp_vf_vf_vf_vf(x, w, vcast_vf_f(0.5)); x = vfma_vf_vf_vf_vf(x, y, x); w = vfma_vf_vf_vf_vf(w, y, w); y = vfmanp_vf_vf_vf_vf(x, w, 
vcast_vf_f(1.5)); w = vadd_vf_vf_vf(w, w); w = vmul_vf_vf_vf(w, y); x = vmul_vf_vf_vf(w, d); y = vfmapn_vf_vf_vf_vf(w, d, x); z = vfmanp_vf_vf_vf_vf(w, x, vcast_vf_f(1)); z = vfmanp_vf_vf_vf_vf(w, y, z); w = vmul_vf_vf_vf(vcast_vf_f(0.5), x); w = vfma_vf_vf_vf_vf(w, z, y); w = vadd_vf_vf_vf(w, x); w = vmul_vf_vf_vf(w, q); w = vsel_vf_vo_vf_vf(vor_vo_vo_vo(veq_vo_vf_vf(d, vcast_vf_f(0)), veq_vo_vf_vf(d, vcast_vf_f(SLEEF_INFINITYf))), d, w); w = vsel_vf_vo_vf_vf(vlt_vo_vf_vf(d, vcast_vf_f(0)), vcast_vf_f(SLEEF_NANf), w); return w; #else vfloat q; vopmask o; d = vsel_vf_vo_vf_vf(vlt_vo_vf_vf(d, vcast_vf_f(0)), vcast_vf_f(SLEEF_NANf), d); o = vlt_vo_vf_vf(d, vcast_vf_f(5.2939559203393770e-23f)); d = vsel_vf_vo_vf_vf(o, vmul_vf_vf_vf(d, vcast_vf_f(1.8889465931478580e+22f)), d); q = vsel_vf_vo_vf_vf(o, vcast_vf_f(7.2759576141834260e-12f*0.5f), vcast_vf_f(0.5f)); o = vgt_vo_vf_vf(d, vcast_vf_f(1.8446744073709552e+19f)); d = vsel_vf_vo_vf_vf(o, vmul_vf_vf_vf(d, vcast_vf_f(5.4210108624275220e-20f)), d); q = vsel_vf_vo_vf_vf(o, vcast_vf_f(4294967296.0f * 0.5f), q); vfloat x = vreinterpret_vf_vi2(vsub_vi2_vi2_vi2(vcast_vi2_i(0x5f375a86), vsrl_vi2_vi2_i(vreinterpret_vi2_vf(vadd_vf_vf_vf(d, vcast_vf_f(1e-45f))), 1))); x = vmul_vf_vf_vf(x, vsub_vf_vf_vf(vcast_vf_f(1.5f), vmul_vf_vf_vf(vmul_vf_vf_vf(vmul_vf_vf_vf(vcast_vf_f(0.5f), d), x), x))); x = vmul_vf_vf_vf(x, vsub_vf_vf_vf(vcast_vf_f(1.5f), vmul_vf_vf_vf(vmul_vf_vf_vf(vmul_vf_vf_vf(vcast_vf_f(0.5f), d), x), x))); x = vmul_vf_vf_vf(x, vsub_vf_vf_vf(vcast_vf_f(1.5f), vmul_vf_vf_vf(vmul_vf_vf_vf(vmul_vf_vf_vf(vcast_vf_f(0.5f), d), x), x))); x = vmul_vf_vf_vf(x, d); vfloat2 d2 = dfmul_vf2_vf2_vf2(dfadd2_vf2_vf_vf2(d, dfmul_vf2_vf_vf(x, x)), dfrec_vf2_vf(x)); x = vmul_vf_vf_vf(vadd_vf_vf_vf(vf2getx_vf_vf2(d2), vf2gety_vf_vf2(d2)), q); x = vsel_vf_vo_vf_vf(vispinf_vo_vf(d), vcast_vf_f(SLEEF_INFINITYf), x); x = vsel_vf_vo_vf_vf(veq_vo_vf_vf(d, vcast_vf_f(0)), d, x); return x; #endif } EXPORT CONST VECTOR_CC vfloat xsqrtf(vfloat d) 
/* Body of xsqrtf (signature is on the previous line): dispatch between the
   native vector square root and the software implementation above. */
{
#ifdef ACCURATE_SQRT
  return vsqrt_vf_vf(d);
#else
  // fall back to approximation if ACCURATE_SQRT is undefined
  return xsqrtf_u05(d);
#endif
}

#if !defined(DETERMINISTIC)

/* hypot, high-accuracy variant (SLEEF "_u05" suffix denotes the ULP bound).
   Both operands are scaled up by 2^24 when max < FLT_MIN so the
   double-float ("vfloat2") division/sqrt below does not lose precision on
   subnormals; NaN/Inf/zero special cases are patched with selects at the end. */
EXPORT CONST VECTOR_CC vfloat xhypotf_u05(vfloat x, vfloat y) {
  x = vabs_vf_vf(x); y = vabs_vf_vf(y);
  vfloat min = vmin_vf_vf_vf(x, y), n = min;
  vfloat max = vmax_vf_vf_vf(x, y), d = max;

  vopmask o = vlt_vo_vf_vf(max, vcast_vf_f(FLT_MIN));
  n = vsel_vf_vo_vf_vf(o, vmul_vf_vf_vf(n, vcast_vf_f(UINT64_C(1) << 24)), n);
  d = vsel_vf_vo_vf_vf(o, vmul_vf_vf_vf(d, vcast_vf_f(UINT64_C(1) << 24)), d);

  // max * sqrt((min/max)^2 + 1), computed in double-float precision
  vfloat2 t = dfdiv_vf2_vf2_vf2(vcast_vf2_vf_vf(n, vcast_vf_f(0)), vcast_vf2_vf_vf(d, vcast_vf_f(0)));
  t = dfmul_vf2_vf2_vf(dfsqrt_vf2_vf2(dfadd2_vf2_vf2_vf(dfsqu_vf2_vf2(t), vcast_vf_f(1))), max);
  vfloat ret = vadd_vf_vf_vf(vf2getx_vf_vf2(t), vf2gety_vf_vf2(t));
  ret = vsel_vf_vo_vf_vf(visnan_vo_vf(ret), vcast_vf_f(SLEEF_INFINITYf), ret);
  ret = vsel_vf_vo_vf_vf(veq_vo_vf_vf(min, vcast_vf_f(0)), max, ret);
  ret = vsel_vf_vo_vf_vf(vor_vo_vo_vo(visnan_vo_vf(x), visnan_vo_vf(y)), vcast_vf_f(SLEEF_NANf), ret);
  ret = vsel_vf_vo_vf_vf(vor_vo_vo_vo(veq_vo_vf_vf(x, vcast_vf_f(SLEEF_INFINITYf)), veq_vo_vf_vf(y, vcast_vf_f(SLEEF_INFINITYf))), vcast_vf_f(SLEEF_INFINITYf), ret);

  return ret;
}

/* Faster hypot with a looser accuracy bound ("_u35"): single-precision
   max * sqrt((min/max)^2 + 1), plus the same special-case fixups.
   (n and d are kept for symmetry with the _u05 variant but are unused here.) */
EXPORT CONST VECTOR_CC vfloat xhypotf_u35(vfloat x, vfloat y) {
  x = vabs_vf_vf(x); y = vabs_vf_vf(y);
  vfloat min = vmin_vf_vf_vf(x, y), n = min;
  vfloat max = vmax_vf_vf_vf(x, y), d = max;

  vfloat t = vdiv_vf_vf_vf(min, max);
  vfloat ret = vmul_vf_vf_vf(max, vsqrt_vf_vf(vmla_vf_vf_vf_vf(t, t, vcast_vf_f(1))));
  ret = vsel_vf_vo_vf_vf(veq_vo_vf_vf(min, vcast_vf_f(0)), max, ret);
  ret = vsel_vf_vo_vf_vf(vor_vo_vo_vo(visnan_vo_vf(x), visnan_vo_vf(y)), vcast_vf_f(SLEEF_NANf), ret);
  ret = vsel_vf_vo_vf_vf(vor_vo_vo_vo(veq_vo_vf_vf(x, vcast_vf_f(SLEEF_INFINITYf)), veq_vo_vf_vf(y, vcast_vf_f(SLEEF_INFINITYf))), vcast_vf_f(SLEEF_INFINITYf), ret);

  return ret;
}

/* nextafterf: step x one ULP toward y by integer manipulation of the float
   bit pattern (sign-magnitude mapped to a monotonic two's-complement
   ordering, decremented, mapped back), then restore signed-zero and NaN
   special cases. */
EXPORT CONST VECTOR_CC vfloat xnextafterf(vfloat x, vfloat y) {
  x = vsel_vf_vo_vf_vf(veq_vo_vf_vf(x, vcast_vf_f(0)), vmulsign_vf_vf_vf(vcast_vf_f(0), y), x);
  vint2 t, xi2 = vreinterpret_vi2_vf(x);
  vopmask c = vxor_vo_vo_vo(vsignbit_vo_vf(x), vge_vo_vf_vf(y, x));

  // map into monotonic integer ordering, decrement, map back
  xi2 = vsel_vi2_vo_vi2_vi2(c, vsub_vi2_vi2_vi2(vcast_vi2_i(0), vxor_vi2_vi2_vi2(xi2, vcast_vi2_i(1 << 31))), xi2);
  xi2 = vsel_vi2_vo_vi2_vi2(vneq_vo_vf_vf(x, y), vsub_vi2_vi2_vi2(xi2, vcast_vi2_i(1)), xi2);
  xi2 = vsel_vi2_vo_vi2_vi2(c, vsub_vi2_vi2_vi2(vcast_vi2_i(0), vxor_vi2_vi2_vi2(xi2, vcast_vi2_i(1 << 31))), xi2);

  vfloat ret = vreinterpret_vf_vi2(xi2);

  ret = vsel_vf_vo_vf_vf(vand_vo_vo_vo(veq_vo_vf_vf(ret, vcast_vf_f(0)), vneq_vo_vf_vf(x, vcast_vf_f(0))), vmulsign_vf_vf_vf(vcast_vf_f(0), x), ret);
  ret = vsel_vf_vo_vf_vf(vand_vo_vo_vo(veq_vo_vf_vf(x, vcast_vf_f(0)), veq_vo_vf_vf(y, vcast_vf_f(0))), y, ret);
  ret = vsel_vf_vo_vf_vf(vor_vo_vo_vo(visnan_vo_vf(x), visnan_vo_vf(y)), vcast_vf_f(SLEEF_NANf), ret);

  return ret;
}

/* frfrexp: return the fraction of x normalized into [0.5, 1) by forcing the
   exponent field to 0x3f000000; subnormals are pre-scaled by 2^30 so they
   have a normal representation first.  Inf keeps its sign, 0 stays 0. */
EXPORT CONST VECTOR_CC vfloat xfrfrexpf(vfloat x) {
  x = vsel_vf_vo_vf_vf(vlt_vo_vf_vf(vabs_vf_vf(x), vcast_vf_f(FLT_MIN)), vmul_vf_vf_vf(x, vcast_vf_f(UINT64_C(1) << 30)), x);

  vmask xm = vreinterpret_vm_vf(x);
  xm = vand_vm_vm_vm(xm, vcast_vm_i_i(~0x7f800000U, ~0x7f800000U));
  xm = vor_vm_vm_vm (xm, vcast_vm_i_i( 0x3f000000U, 0x3f000000U));

  vfloat ret = vreinterpret_vf_vm(xm);

  ret = vsel_vf_vo_vf_vf(visinf_vo_vf(x), vmulsign_vf_vf_vf(vcast_vf_f(SLEEF_INFINITYf), x), ret);
  ret = vsel_vf_vo_vf_vf(veq_vo_vf_vf(x, vcast_vf_f(0)), x, ret);

  return ret;
}
#endif // #if !defined(DETERMINISTIC)

/* expfrexp: exponent extraction is not implemented for single precision;
   the double-precision algorithm is kept commented out for reference and
   the function returns 0 for every lane. */
EXPORT CONST VECTOR_CC vint2 xexpfrexpf(vfloat x) {
  /*
  x = vsel_vf_vo_vf_vf(vlt_vo_vf_vf(vabs_vf_vf(x), vcast_vf_f(FLT_MIN)), vmul_vf_vf_vf(x, vcast_vf_f(UINT64_C(1) << 63)), x);

  vint ret = vcastu_vi_vi2(vreinterpret_vi2_vf(x));
  ret = vsub_vi_vi_vi(vand_vi_vi_vi(vsrl_vi_vi_i(ret, 20), vcast_vi_i(0x7ff)), vcast_vi_i(0x3fe));

  ret = vsel_vi_vo_vi_vi(vor_vo_vo_vo(vor_vo_vo_vo(veq_vo_vf_vf(x, vcast_vf_f(0)), visnan_vo_vf(x)), visinf_vo_vf(x)), vcast_vi_i(0), ret);

  return ret;
  */
  return vcast_vi2_i(0);
}

/* Nudge a float one ULP toward zero by decrementing its bit pattern;
   exact zeros are passed through unchanged. */
static INLINE CONST VECTOR_CC vfloat vtoward0f(vfloat x) {
  vfloat t = vreinterpret_vf_vi2(vsub_vi2_vi2_vi2(vreinterpret_vi2_vf(x), vcast_vi2_i(1)));
  return vsel_vf_vo_vf_vf(veq_vo_vf_vf(x, vcast_vf_f(0)), vcast_vf_f(0), t);
}

/* Truncation helper: native truncate when the platform provides full FP
   rounding, otherwise via integer conversion with a bypass for
   |x| >= 2^23 (already integral in binary32). */
static INLINE CONST VECTOR_CC vfloat vptruncf(vfloat x) {
#ifdef FULL_FP_ROUNDING
  return vtruncate_vf_vf(x);
#else
  vfloat fr = vsub_vf_vf_vf(x, vcast_vf_vi2(vtruncate_vi2_vf(x)));
  return vsel_vf_vo_vf_vf(vge_vo_vf_vf(vabs_vf_vf(x), vcast_vf_f(INT64_C(1) << 23)), x, vsub_vf_vf_vf(x, fr));
#endif
}

#if !defined(DETERMINISTIC)
/* fmod by iterative quotient subtraction in double-float precision.
   Operands are pre-scaled by 2^25 when the divisor is subnormal (and the
   result scaled back through s); the reciprocal is biased toward zero so
   the truncated trial quotient never overshoots.  The loop shrinks the
   remainder below |y| within at most 8 iterations. */
EXPORT CONST VECTOR_CC vfloat xfmodf(vfloat x, vfloat y) {
  vfloat nu = vabs_vf_vf(x), de = vabs_vf_vf(y), s = vcast_vf_f(1), q;
  vopmask o = vlt_vo_vf_vf(de, vcast_vf_f(FLT_MIN));
  nu = vsel_vf_vo_vf_vf(o, vmul_vf_vf_vf(nu, vcast_vf_f(UINT64_C(1) << 25)), nu);
  de = vsel_vf_vo_vf_vf(o, vmul_vf_vf_vf(de, vcast_vf_f(UINT64_C(1) << 25)), de);
  s = vsel_vf_vo_vf_vf(o, vmul_vf_vf_vf(s , vcast_vf_f(1.0f / (UINT64_C(1) << 25))), s);

  vfloat rde = vtoward0f(vrec_vf_vf(de));
#if defined(ENABLE_NEON32) || defined(ENABLE_NEON32VFPV4)
  rde = vtoward0f(rde);
#endif
  vfloat2 r = vcast_vf2_vf_vf(nu, vcast_vf_f(0));

  for(int i=0;i<8;i++) { // ceil(log2(FLT_MAX) / 22)+1
    q = vptruncf(vmul_vf_vf_vf(vtoward0f(vf2getx_vf_vf2(r)), rde));
    q = vsel_vf_vo_vf_vf(vand_vo_vo_vo(vgt_vo_vf_vf(vmul_vf_vf_vf(vcast_vf_f(3), de), vf2getx_vf_vf2(r)), vge_vo_vf_vf(vf2getx_vf_vf2(r), de)), vcast_vf_f(2), q);
    q = vsel_vf_vo_vf_vf(vand_vo_vo_vo(vgt_vo_vf_vf(vmul_vf_vf_vf(vcast_vf_f(2), de), vf2getx_vf_vf2(r)), vge_vo_vf_vf(vf2getx_vf_vf2(r), de)), vcast_vf_f(1), q);
    r = dfnormalize_vf2_vf2(dfadd2_vf2_vf2_vf2(r, dfmul_vf2_vf_vf(vptruncf(q), vneg_vf_vf(de))));
    if (vtestallones_i_vo32(vlt_vo_vf_vf(vf2getx_vf_vf2(r), de))) break;
  }

  vfloat ret = vmul_vf_vf_vf(vadd_vf_vf_vf(vf2getx_vf_vf2(r), vf2gety_vf_vf2(r)), s);
  ret = vsel_vf_vo_vf_vf(veq_vo_vf_vf(vadd_vf_vf_vf(vf2getx_vf_vf2(r), vf2gety_vf_vf2(r)), de), vcast_vf_f(0), ret);

  ret = vmulsign_vf_vf_vf(ret, x);

  ret = vsel_vf_vo_vf_vf(vlt_vo_vf_vf(nu, de), x, ret);
  ret = vsel_vf_vo_vf_vf(veq_vo_vf_vf(de, vcast_vf_f(0)), vcast_vf_f(SLEEF_NANf), ret);

  return ret;
}

/* Round-to-nearest helper: native vrint when available, otherwise the
   add-and-subtract-2^23 trick with the sign restored via vorsign. */
static INLINE CONST VECTOR_CC vfloat vrintfk2_vf_vf(vfloat d) {
#ifdef FULL_FP_ROUNDING
  return vrint_vf_vf(d);
#else
  vfloat c = vmulsign_vf_vf_vf(vcast_vf_f(1 << 23), d);
  return vsel_vf_vo_vf_vf(vgt_vo_vf_vf(vabs_vf_vf(d), vcast_vf_f(1 << 23)), d, vorsign_vf_vf_vf(vsub_vf_vf_vf(vadd_vf_vf_vf(d, c), c), d));
#endif
}

/* IEEE remainder (rounded quotient), using the same iterative double-float
   scheme as xfmodf; qisodd tracks the parity of the accumulated quotient
   for the tie-break when |r| == 0.5*|y|. */
EXPORT CONST VECTOR_CC vfloat xremainderf(vfloat x, vfloat y) {
  vfloat n = vabs_vf_vf(x), d = vabs_vf_vf(y), s = vcast_vf_f(1), q;
  vopmask o = vlt_vo_vf_vf(d, vcast_vf_f(FLT_MIN*2));
  n = vsel_vf_vo_vf_vf(o, vmul_vf_vf_vf(n, vcast_vf_f(UINT64_C(1) << 25)), n);
  d = vsel_vf_vo_vf_vf(o, vmul_vf_vf_vf(d, vcast_vf_f(UINT64_C(1) << 25)), d);
  s = vsel_vf_vo_vf_vf(o, vmul_vf_vf_vf(s , vcast_vf_f(1.0f / (UINT64_C(1) << 25))), s);
  vfloat2 r = vcast_vf2_vf_vf(n, vcast_vf_f(0));
  vfloat rd = vrec_vf_vf(d);
  vopmask qisodd = vneq_vo_vf_vf(vcast_vf_f(0), vcast_vf_f(0)); // all-false initial parity

  for(int i=0;i<8;i++) { // ceil(log2(FLT_MAX) / 22)+1
    q = vrintfk2_vf_vf(vmul_vf_vf_vf(vf2getx_vf_vf2(r), rd));
    q = vsel_vf_vo_vf_vf(vlt_vo_vf_vf(vabs_vf_vf(vf2getx_vf_vf2(r)), vmul_vf_vf_vf(d, vcast_vf_f(1.5f))), vmulsign_vf_vf_vf(vcast_vf_f(1.0f), vf2getx_vf_vf2(r)), q);
    q = vsel_vf_vo_vf_vf(vor_vo_vo_vo(vlt_vo_vf_vf(vabs_vf_vf(vf2getx_vf_vf2(r)), vmul_vf_vf_vf(d, vcast_vf_f(0.5f))), vandnot_vo_vo_vo(qisodd, veq_vo_vf_vf(vabs_vf_vf(vf2getx_vf_vf2(r)), vmul_vf_vf_vf(d, vcast_vf_f(0.5f))))), vcast_vf_f(0.0), q);
    if (vtestallones_i_vo32(veq_vo_vf_vf(q, vcast_vf_f(0)))) break;
    q = vsel_vf_vo_vf_vf(visinf_vo_vf(vmul_vf_vf_vf(q, vneg_vf_vf(d))), vadd_vf_vf_vf(q, vmulsign_vf_vf_vf(vcast_vf_f(-1), vf2getx_vf_vf2(r))), q);
    qisodd = vxor_vo_vo_vo(qisodd, vand_vo_vo_vo(veq_vo_vi2_vi2(vand_vi2_vi2_vi2(vtruncate_vi2_vf(q), vcast_vi2_i(1)), vcast_vi2_i(1)), vlt_vo_vf_vf(vabs_vf_vf(q), vcast_vf_f(1 << 24))));
    r = dfnormalize_vf2_vf2(dfadd2_vf2_vf2_vf2(r, dfmul_vf2_vf_vf(q, vneg_vf_vf(d))));
  }

  vfloat ret = vmul_vf_vf_vf(vadd_vf_vf_vf(vf2getx_vf_vf2(r), vf2gety_vf_vf2(r)), s);
  ret = vmulsign_vf_vf_vf(ret, x);
  ret = vsel_vf_vo_vf_vf(visinf_vo_vf(y), vsel_vf_vo_vf_vf(visinf_vo_vf(x), vcast_vf_f(SLEEF_NANf), x), ret);
  ret = vsel_vf_vo_vf_vf(veq_vo_vf_vf(d, vcast_vf_f(0)), vcast_vf_f(SLEEF_NANf), ret);
  return ret;
}
#endif // #if !defined(DETERMINISTIC)

//

/* sinpifk: double-float kernel for sin(pi*d).  q = round-to-even(4*d)
   selects the quadrant; o switches between the two polynomial coefficient
   sets and the final sign-bit XOR negates quadrants where bit 2 of q is set. */
static INLINE CONST VECTOR_CC vfloat2 sinpifk(vfloat d) {
  vopmask o;
  vfloat u, s, t;
  vfloat2 x, s2;

  u = vmul_vf_vf_vf(d, vcast_vf_f(4.0));
  vint2 q = vtruncate_vi2_vf(u);
  q = vand_vi2_vi2_vi2(vadd_vi2_vi2_vi2(q, vxor_vi2_vi2_vi2(vsrl_vi2_vi2_i(q, 31), vcast_vi2_i(1))), vcast_vi2_i(~1));
  o = veq_vo_vi2_vi2(vand_vi2_vi2_vi2(q, vcast_vi2_i(2)), vcast_vi2_i(2));

  s = vsub_vf_vf_vf(u, vcast_vf_vi2(q));
  t = s;
  s = vmul_vf_vf_vf(s, s);
  s2 = dfmul_vf2_vf_vf(t, t);

  //

  u = vsel_vf_vo_f_f(o, -0.2430611801e-7f, +0.3093842054e-6f);
  u = vmla_vf_vf_vf_vf(u, s, vsel_vf_vo_f_f(o, +0.3590577080e-5f, -0.3657307388e-4f));
  u = vmla_vf_vf_vf_vf(u, s, vsel_vf_vo_f_f(o, -0.3259917721e-3f, +0.2490393585e-2f));
  x = dfadd2_vf2_vf_vf2(vmul_vf_vf_vf(u, s), vsel_vf2_vo_f_f_f_f(o, 0.015854343771934509277, 4.4940051354032242811e-10, -0.080745510756969451904, -1.3373665339076936258e-09));
  x = dfadd2_vf2_vf2_vf2(dfmul_vf2_vf2_vf2(s2, x), vsel_vf2_vo_f_f_f_f(o, -0.30842512845993041992, -9.0728339030733922277e-09, 0.78539818525314331055, -2.1857338617566484855e-08));

  x = dfmul_vf2_vf2_vf2(x, vsel_vf2_vo_vf2_vf2(o, s2, vcast_vf2_vf_vf(t, vcast_vf_f(0))));
  x = vsel_vf2_vo_vf2_vf2(o, dfadd2_vf2_vf2_vf(x, vcast_vf_f(1)), x);

  o = veq_vo_vi2_vi2(vand_vi2_vi2_vi2(q, vcast_vi2_i(4)), vcast_vi2_i(4));
  x = vf2setx_vf2_vf2_vf(x, vreinterpret_vf_vm(vxor_vm_vm_vm(vand_vm_vo32_vm(o, vreinterpret_vm_vf(vcast_vf_f(-0.0))), vreinterpret_vm_vf(vf2getx_vf_vf2(x)))));
  x = vf2sety_vf2_vf2_vf(x, vreinterpret_vf_vm(vxor_vm_vm_vm(vand_vm_vo32_vm(o, vreinterpret_vm_vf(vcast_vf_f(-0.0))),
vreinterpret_vm_vf(vf2gety_vf_vf2(x)))));
  return x;
}

#if !defined(DETERMINISTIC)
/* sin(pi*d): sum the double-float kernel result, force -0.0 for a -0.0
   input, zero lanes whose |d| exceeds the reduction range TRIGRANGEMAX4f,
   and turn infinities into NaN via the mask OR. */
EXPORT CONST VECTOR_CC vfloat xsinpif_u05(vfloat d) {
  vfloat2 x = sinpifk(d);
  vfloat r = vadd_vf_vf_vf(vf2getx_vf_vf2(x), vf2gety_vf_vf2(x));

  r = vsel_vf_vo_vf_vf(visnegzero_vo_vf(d), vcast_vf_f(-0.0), r);
  r = vreinterpret_vf_vm(vandnot_vm_vo32_vm(vgt_vo_vf_vf(vabs_vf_vf(d), vcast_vf_f(TRIGRANGEMAX4f)), vreinterpret_vm_vf(r)));
  r = vreinterpret_vf_vm(vor_vm_vo32_vm(visinf_vo_vf(d), vreinterpret_vm_vf(r)));

  return r;
}
#endif // #if !defined(DETERMINISTIC)

/* cospifk: cos(pi*d) kernel.  Same structure and coefficients as sinpifk;
   the quadrant predicate is inverted (compare against 0 instead of 2) and
   the final sign test uses q+2. */
static INLINE CONST VECTOR_CC vfloat2 cospifk(vfloat d) {
  vopmask o;
  vfloat u, s, t;
  vfloat2 x, s2;

  u = vmul_vf_vf_vf(d, vcast_vf_f(4.0));
  vint2 q = vtruncate_vi2_vf(u);
  q = vand_vi2_vi2_vi2(vadd_vi2_vi2_vi2(q, vxor_vi2_vi2_vi2(vsrl_vi2_vi2_i(q, 31), vcast_vi2_i(1))), vcast_vi2_i(~1));
  o = veq_vo_vi2_vi2(vand_vi2_vi2_vi2(q, vcast_vi2_i(2)), vcast_vi2_i(0));

  s = vsub_vf_vf_vf(u, vcast_vf_vi2(q));
  t = s;
  s = vmul_vf_vf_vf(s, s);
  s2 = dfmul_vf2_vf_vf(t, t);

  //

  u = vsel_vf_vo_f_f(o, -0.2430611801e-7f, +0.3093842054e-6f);
  u = vmla_vf_vf_vf_vf(u, s, vsel_vf_vo_f_f(o, +0.3590577080e-5f, -0.3657307388e-4f));
  u = vmla_vf_vf_vf_vf(u, s, vsel_vf_vo_f_f(o, -0.3259917721e-3f, +0.2490393585e-2f));
  x = dfadd2_vf2_vf_vf2(vmul_vf_vf_vf(u, s), vsel_vf2_vo_f_f_f_f(o, 0.015854343771934509277, 4.4940051354032242811e-10, -0.080745510756969451904, -1.3373665339076936258e-09));
  x = dfadd2_vf2_vf2_vf2(dfmul_vf2_vf2_vf2(s2, x), vsel_vf2_vo_f_f_f_f(o, -0.30842512845993041992, -9.0728339030733922277e-09, 0.78539818525314331055, -2.1857338617566484855e-08));

  x = dfmul_vf2_vf2_vf2(x, vsel_vf2_vo_vf2_vf2(o, s2, vcast_vf2_vf_vf(t, vcast_vf_f(0))));
  x = vsel_vf2_vo_vf2_vf2(o, dfadd2_vf2_vf2_vf(x, vcast_vf_f(1)), x);

  o = veq_vo_vi2_vi2(vand_vi2_vi2_vi2(vadd_vi2_vi2_vi2(q, vcast_vi2_i(2)), vcast_vi2_i(4)), vcast_vi2_i(4));
  x = vf2setx_vf2_vf2_vf(x, vreinterpret_vf_vm(vxor_vm_vm_vm(vand_vm_vo32_vm(o, vreinterpret_vm_vf(vcast_vf_f(-0.0))), vreinterpret_vm_vf(vf2getx_vf_vf2(x)))));
  x = vf2sety_vf2_vf2_vf(x, vreinterpret_vf_vm(vxor_vm_vm_vm(vand_vm_vo32_vm(o, vreinterpret_vm_vf(vcast_vf_f(-0.0))), vreinterpret_vm_vf(vf2gety_vf_vf2(x)))));

  return x;
}

#if !defined(DETERMINISTIC)
/* cos(pi*d): out-of-range lanes return 1, infinities become NaN. */
EXPORT CONST VECTOR_CC vfloat xcospif_u05(vfloat d) {
  vfloat2 x = cospifk(d);
  vfloat r = vadd_vf_vf_vf(vf2getx_vf_vf2(x), vf2gety_vf_vf2(x));

  r = vsel_vf_vo_vf_vf(vgt_vo_vf_vf(vabs_vf_vf(d), vcast_vf_f(TRIGRANGEMAX4f)), vcast_vf_f(1), r);
  r = vreinterpret_vf_vm(vor_vm_vo32_vm(visinf_vo_vf(d), vreinterpret_vm_vf(r)));

  return r;
}
#endif // #if !defined(DETERMINISTIC)

#if !(defined(ENABLE_SVE) || defined(ENABLE_SVENOFMA))
/* Pair of double-floats returned by gammafk (guarded out on SVE, where an
   alternative definition is presumably provided elsewhere -- sizeless SVE
   vectors cannot be aggregated in a struct; confirm against upstream). */
typedef struct {
  vfloat2 a, b;
} df2;

static df2 df2setab_df2_vf2_vf2(vfloat2 a, vfloat2 b) { df2 r = { a, b }; return r; }
static vfloat2 df2geta_vf2_df2(df2 d) { return d.a; }
static vfloat2 df2getb_vf2_df2(df2 d) { return d.b; }
#endif

/* TODO AArch64: potential optimization by using `vfmad_lane_f64` */
/* gammafk: shared kernel for tgamma/lgamma.  Returns a = log-magnitude
   term and b = clln/clld ratio term.  otiny handles |a| < 1e-30, oref
   applies the reflection x = 1 - a for a < 0.5, and o0/o2 pick between
   three polynomial regimes on x. */
static CONST df2 gammafk(vfloat a) {
  vfloat2 clc = vcast_vf2_f_f(0, 0), clln = vcast_vf2_f_f(1, 0), clld = vcast_vf2_f_f(1, 0);
  vfloat2 v = vcast_vf2_f_f(1, 0), x, y, z;
  vfloat t, u;

  vopmask otiny = vlt_vo_vf_vf(vabs_vf_vf(a), vcast_vf_f(1e-30f)), oref = vlt_vo_vf_vf(a, vcast_vf_f(0.5));

  x = vsel_vf2_vo_vf2_vf2(otiny, vcast_vf2_f_f(0, 0), vsel_vf2_vo_vf2_vf2(oref, dfadd2_vf2_vf_vf(vcast_vf_f(1), vneg_vf_vf(a)), vcast_vf2_vf_vf(a, vcast_vf_f(0))));

  vopmask o0 = vand_vo_vo_vo(vle_vo_vf_vf(vcast_vf_f(0.5), vf2getx_vf_vf2(x)), vle_vo_vf_vf(vf2getx_vf_vf2(x), vcast_vf_f(1.2)));
  vopmask o2 = vle_vo_vf_vf(vcast_vf_f(2.3), vf2getx_vf_vf2(x));

  // shift x into the polynomial's domain, accumulating the product in clln
  y = dfnormalize_vf2_vf2(dfmul_vf2_vf2_vf2(dfadd2_vf2_vf2_vf(x, vcast_vf_f(1)), x));
  y = dfnormalize_vf2_vf2(dfmul_vf2_vf2_vf2(dfadd2_vf2_vf2_vf(x, vcast_vf_f(2)), y));

  vopmask o = vand_vo_vo_vo(o2, vle_vo_vf_vf(vf2getx_vf_vf2(x), vcast_vf_f(7)));
  clln = vsel_vf2_vo_vf2_vf2(o, y, clln);

  x = vsel_vf2_vo_vf2_vf2(o, dfadd2_vf2_vf2_vf(x, vcast_vf_f(3)), x);
  t = vsel_vf_vo_vf_vf(o2, vrec_vf_vf(vf2getx_vf_vf2(x)), vf2getx_vf_vf2(dfnormalize_vf2_vf2(dfadd2_vf2_vf2_vf(x, vsel_vf_vo_f_f(o0, -1, -2)))));

  // polynomial coefficients: per-lane choice among the o2 / o0 / default regimes
  u = vsel_vf_vo_vo_f_f_f(o2, o0, +0.000839498720672087279971000786, +0.9435157776e+0f, +0.1102489550e-3f);
  u = vmla_vf_vf_vf_vf(u, t, vsel_vf_vo_vo_f_f_f(o2, o0, -5.17179090826059219329394422e-05, +0.8670063615e+0f, +0.8160019934e-4f));
  u = vmla_vf_vf_vf_vf(u, t, vsel_vf_vo_vo_f_f_f(o2, o0, -0.000592166437353693882857342347, +0.4826702476e+0f, +0.1528468856e-3f));
  u = vmla_vf_vf_vf_vf(u, t, vsel_vf_vo_vo_f_f_f(o2, o0, +6.97281375836585777403743539e-05, -0.8855129778e-1f, -0.2355068718e-3f));
  u = vmla_vf_vf_vf_vf(u, t, vsel_vf_vo_vo_f_f_f(o2, o0, +0.000784039221720066627493314301, +0.1013825238e+0f, +0.4962242092e-3f));
  u = vmla_vf_vf_vf_vf(u, t, vsel_vf_vo_vo_f_f_f(o2, o0, -0.000229472093621399176949318732, -0.1493408978e+0f, -0.1193488017e-2f));
  u = vmla_vf_vf_vf_vf(u, t, vsel_vf_vo_vo_f_f_f(o2, o0, -0.002681327160493827160473958490, +0.1697509140e+0f, +0.2891599433e-2f));
  u = vmla_vf_vf_vf_vf(u, t, vsel_vf_vo_vo_f_f_f(o2, o0, +0.003472222222222222222175164840, -0.2072454542e+0f, -0.7385451812e-2f));
  u = vmla_vf_vf_vf_vf(u, t, vsel_vf_vo_vo_f_f_f(o2, o0, +0.083333333333333333335592087900, +0.2705872357e+0f, +0.2058077045e-1f));

  // Stirling-style branch for the o2 regime
  y = dfmul_vf2_vf2_vf2(dfadd2_vf2_vf2_vf(x, vcast_vf_f(-0.5)), logk2f(x));
  y = dfadd2_vf2_vf2_vf2(y, dfneg_vf2_vf2(x));
  y = dfadd2_vf2_vf2_vf2(y, vcast_vf2_d(0.91893853320467278056)); // 0.5*log(2*M_PI)

  z = dfadd2_vf2_vf2_vf(dfmul_vf2_vf_vf (u, t), vsel_vf_vo_f_f(o0, -0.400686534596170958447352690395e+0f, -0.673523028297382446749257758235e-1f));
  z = dfadd2_vf2_vf2_vf(dfmul_vf2_vf2_vf(z, t), vsel_vf_vo_f_f(o0, +0.822466960142643054450325495997e+0f, +0.322467033928981157743538726901e+0f));
  z = dfadd2_vf2_vf2_vf(dfmul_vf2_vf2_vf(z, t), vsel_vf_vo_f_f(o0, -0.577215665946766039837398973297e+0f, +0.422784335087484338986941629852e+0f));
  z = dfmul_vf2_vf2_vf(z, t);

  clc = vsel_vf2_vo_vf2_vf2(o2, y, z);

  clld = vsel_vf2_vo_vf2_vf2(o2,
dfadd2_vf2_vf2_vf(dfmul_vf2_vf_vf(u, t), vcast_vf_f(1)), clld);

  y = clln;

  clc = vsel_vf2_vo_vf2_vf2(otiny, vcast_vf2_d(41.58883083359671856503), // log(2^60)
			    vsel_vf2_vo_vf2_vf2(oref, dfadd2_vf2_vf2_vf2(vcast_vf2_d(1.1447298858494001639), dfneg_vf2_vf2(clc)), clc)); // log(M_PI)

  clln = vsel_vf2_vo_vf2_vf2(otiny, vcast_vf2_f_f(1, 0), vsel_vf2_vo_vf2_vf2(oref, clln, clld));

  // reflection: only evaluated when at least one lane has a < 0.5
  if (!vtestallones_i_vo32(vnot_vo32_vo32(oref))) {
    t = vsub_vf_vf_vf(a, vmul_vf_vf_vf(vcast_vf_f(INT64_C(1) << 12), vcast_vf_vi2(vtruncate_vi2_vf(vmul_vf_vf_vf(a, vcast_vf_f(1.0 / (INT64_C(1) << 12)))))));
    x = dfmul_vf2_vf2_vf2(clld, sinpifk(t));
  }

  clld = vsel_vf2_vo_vf2_vf2(otiny, vcast_vf2_vf_vf(vmul_vf_vf_vf(a, vcast_vf_f((INT64_C(1) << 30)*(float)(INT64_C(1) << 30))), vcast_vf_f(0)), vsel_vf2_vo_vf2_vf2(oref, x, y));

  return df2setab_df2_vf2_vf2(clc, dfdiv_vf2_vf2_vf2(clln, clld));
}

#if !defined(DETERMINISTIC)
/* tgamma with the "_u1" accuracy tier: exp of the gammafk log term times
   the ratio term, then NaN for -Inf / negative integers, and signed Inf
   for a == 0, overflow (a > 36) or a NaN intermediate on the non-negative
   side. */
EXPORT CONST VECTOR_CC vfloat xtgammaf_u1(vfloat a) {
  df2 d = gammafk(a);
  vfloat2 y = dfmul_vf2_vf2_vf2(expk2f(df2geta_vf2_df2(d)), df2getb_vf2_df2(d));
  vfloat r = vadd_vf_vf_vf(vf2getx_vf_vf2(y), vf2gety_vf_vf2(y));
  vopmask o;

  o = vor_vo_vo_vo(vor_vo_vo_vo(veq_vo_vf_vf(a, vcast_vf_f(-SLEEF_INFINITYf)), vand_vo_vo_vo(vlt_vo_vf_vf(a, vcast_vf_f(0)), visint_vo_vf(a))), vand_vo_vo_vo(vand_vo_vo_vo(visnumber_vo_vf(a), vlt_vo_vf_vf(a, vcast_vf_f(0))), visnan_vo_vf(r)));
  r = vsel_vf_vo_vf_vf(o, vcast_vf_f(SLEEF_NANf), r);

  o = vand_vo_vo_vo(vand_vo_vo_vo(vor_vo_vo_vo(veq_vo_vf_vf(a, vcast_vf_f(SLEEF_INFINITYf)), visnumber_vo_vf(a)), vge_vo_vf_vf(a, vcast_vf_f(-FLT_MIN))), vor_vo_vo_vo(vor_vo_vo_vo(veq_vo_vf_vf(a, vcast_vf_f(0)), vgt_vo_vf_vf(a, vcast_vf_f(36))), visnan_vo_vf(r)));
  r = vsel_vf_vo_vf_vf(o, vmulsign_vf_vf_vf(vcast_vf_f(SLEEF_INFINITYf), a), r);

  return r;
}

/* lgamma: log|gamma(a)| = gammafk log term + log|ratio term|;
   +Inf for infinities and non-positive integers. */
EXPORT CONST VECTOR_CC vfloat xlgammaf_u1(vfloat a) {
  df2 d = gammafk(a);
  vfloat2 y = dfadd2_vf2_vf2_vf2(df2geta_vf2_df2(d), logk2f(dfabs_vf2_vf2(df2getb_vf2_df2(d))));
  vfloat r = vadd_vf_vf_vf(vf2getx_vf_vf2(y), vf2gety_vf_vf2(y));
  vopmask o;

  o = vor_vo_vo_vo(visinf_vo_vf(a), vor_vo_vo_vo(vand_vo_vo_vo(vle_vo_vf_vf(a, vcast_vf_f(0)), visint_vo_vf(a)), vand_vo_vo_vo(visnumber_vo_vf(a), visnan_vo_vf(r))));
  r = vsel_vf_vo_vf_vf(o, vcast_vf_f(SLEEF_INFINITYf), r);

  return r;
}

/* TODO AArch64: potential optimization by using `vfmad_lane_f64` */
/* erf: three polynomial regimes selected by |a| thresholds 1.1 / 2.4 / 4.0,
   evaluated in double-float.  The small-argument regime works on a*a
   directly; otherwise erf = 1 - exp(poly(a)) via expk2f; |a| >= 4
   saturates to +-1, with the input's sign restored via vmulsign. */
EXPORT CONST VECTOR_CC vfloat xerff_u1(vfloat a) {
  vfloat s = a, t, u;
  vfloat2 d;

  a = vabs_vf_vf(a);
  vopmask o0 = vlt_vo_vf_vf(a, vcast_vf_f(1.1));
  vopmask o1 = vlt_vo_vf_vf(a, vcast_vf_f(2.4));
  vopmask o2 = vlt_vo_vf_vf(a, vcast_vf_f(4.0));
  u = vsel_vf_vo_vf_vf(o0, vmul_vf_vf_vf(a, a), a);

  t = vsel_vf_vo_vo_f_f_f(o0, o1, +0.7089292194e-4f, -0.1792667899e-4f, -0.9495757695e-5f);
  t = vmla_vf_vf_vf_vf(t, u, vsel_vf_vo_vo_f_f_f(o0, o1, -0.7768311189e-3f, +0.3937633010e-3f, +0.2481465926e-3f));
  t = vmla_vf_vf_vf_vf(t, u, vsel_vf_vo_vo_f_f_f(o0, o1, +0.5159463733e-2f, -0.3949181177e-2f, -0.2918176819e-2f));
  t = vmla_vf_vf_vf_vf(t, u, vsel_vf_vo_vo_f_f_f(o0, o1, -0.2683781274e-1f, +0.2445474640e-1f, +0.2059706673e-1f));
  t = vmla_vf_vf_vf_vf(t, u, vsel_vf_vo_vo_f_f_f(o0, o1, +0.1128318012e+0f, -0.1070996150e+0f, -0.9901899844e-1f));
  d = dfmul_vf2_vf_vf(t, u);
  d = dfadd2_vf2_vf2_vf2(d, vsel_vf2_vo_vo_d_d_d(o0, o1, -0.376125876000657465175213237214e+0, -0.634588905908410389971210809210e+0, -0.643598050547891613081201721633e+0));
  d = dfmul_vf2_vf2_vf(d, u);
  d = dfadd2_vf2_vf2_vf2(d, vsel_vf2_vo_vo_d_d_d(o0, o1, +0.112837916021059138255978217023e+1, -0.112879855826694507209862753992e+1, -0.112461487742845562801052956293e+1));
  d = dfmul_vf2_vf2_vf(d, a);
  d = vsel_vf2_vo_vf2_vf2(o0, d, dfadd_vf2_vf_vf2(vcast_vf_f(1.0), dfneg_vf2_vf2(expk2f(d))));
  u = vmulsign_vf_vf_vf(vsel_vf_vo_vf_vf(o2, vadd_vf_vf_vf(vf2getx_vf_vf2(d), vf2gety_vf_vf2(d)), vcast_vf_f(1)), s);
  u = vsel_vf_vo_vf_vf(visnan_vo_vf(a), vcast_vf_f(SLEEF_NANf), u);

  return u;
}

/* TODO AArch64: potential optimization by using `vfmad_lane_f64` */
EXPORT CONST VECTOR_CC vfloat
/* erfc with the "_u15" accuracy tier: four regimes on |a| (thresholds
   1.0 / 2.2 / 4.3 / 10.1).  Small arguments use a direct polynomial in a,
   larger ones a polynomial in 1/a, both finished through expk2f;
   negative inputs use erfc(-a) = 2 - erfc(a), |a| >= 10.1 gives 0. */
xerfcf_u15(vfloat a) {
  vfloat s = a, r = vcast_vf_f(0), t;
  vfloat2 u, d, x;
  a = vabs_vf_vf(a);
  vopmask o0 = vlt_vo_vf_vf(a, vcast_vf_f(1.0));
  vopmask o1 = vlt_vo_vf_vf(a, vcast_vf_f(2.2));
  vopmask o2 = vlt_vo_vf_vf(a, vcast_vf_f(4.3));
  vopmask o3 = vlt_vo_vf_vf(a, vcast_vf_f(10.1));

  // polynomial variable: a itself below 2.2, otherwise 1/a (double-float)
  u = vsel_vf2_vo_vf2_vf2(o1, vcast_vf2_vf_vf(a, vcast_vf_f(0)), dfdiv_vf2_vf2_vf2(vcast_vf2_f_f(1, 0), vcast_vf2_vf_vf(a, vcast_vf_f(0))));

  t = vsel_vf_vo_vo_vo_f_f_f_f(o0, o1, o2, -0.8638041618e-4f, -0.6236977242e-5f, -0.3869504035e+0f, +0.1115344167e+1f);
  t = vmla_vf_vf_vf_vf(t, vf2getx_vf_vf2(u), vsel_vf_vo_vo_vo_f_f_f_f(o0, o1, o2, +0.6000166177e-3f, +0.5749821503e-4f, +0.1288077235e+1f, -0.9454904199e+0f));
  t = vmla_vf_vf_vf_vf(t, vf2getx_vf_vf2(u), vsel_vf_vo_vo_vo_f_f_f_f(o0, o1, o2, -0.1665703603e-2f, +0.6002851478e-5f, -0.1816803217e+1f, -0.3667259514e+0f));
  t = vmla_vf_vf_vf_vf(t, vf2getx_vf_vf2(u), vsel_vf_vo_vo_vo_f_f_f_f(o0, o1, o2, +0.1795156277e-3f, -0.2851036377e-2f, +0.1249150872e+1f, +0.7155663371e+0f));
  t = vmla_vf_vf_vf_vf(t, vf2getx_vf_vf2(u), vsel_vf_vo_vo_vo_f_f_f_f(o0, o1, o2, +0.1914106123e-1f, +0.2260518074e-1f, -0.1328857988e+0f, -0.1262947265e-1f));

  d = dfmul_vf2_vf2_vf(u, t);
  d = dfadd2_vf2_vf2_vf2(d, vsel_vf2_vo_vo_vo_d_d_d_d(o0, o1, o2, -0.102775359343930288081655368891e+0, -0.105247583459338632253369014063e+0, -0.482365310333045318680618892669e+0, -0.498961546254537647970305302739e+0));
  d = dfmul_vf2_vf2_vf2(d, u);
  d = dfadd2_vf2_vf2_vf2(d, vsel_vf2_vo_vo_vo_d_d_d_d(o0, o1, o2, -0.636619483208481931303752546439e+0, -0.635609463574589034216723775292e+0, -0.134450203224533979217859332703e-2, -0.471199543422848492080722832666e-4));
  d = dfmul_vf2_vf2_vf2(d, u);
  d = dfadd2_vf2_vf2_vf2(d, vsel_vf2_vo_vo_vo_d_d_d_d(o0, o1, o2, -0.112837917790537404939545770596e+1, -0.112855987376668622084547028949e+1, -0.572319781150472949561786101080e+0, -0.572364030327966044425932623525e+0));

  x = dfmul_vf2_vf2_vf(vsel_vf2_vo_vf2_vf2(o1, d, vcast_vf2_vf_vf(vneg_vf_vf(a), vcast_vf_f(0))), a);
  x = vsel_vf2_vo_vf2_vf2(o1, x, dfadd2_vf2_vf2_vf2(x, d));
  x = expk2f(x);
  x = vsel_vf2_vo_vf2_vf2(o1, x, dfmul_vf2_vf2_vf2(x, u));

  r = vsel_vf_vo_vf_vf(o3, vadd_vf_vf_vf(vf2getx_vf_vf2(x), vf2gety_vf_vf2(x)), vcast_vf_f(0));
  r = vsel_vf_vo_vf_vf(vsignbit_vo_vf(s), vsub_vf_vf_vf(vcast_vf_f(2), r), r);
  r = vsel_vf_vo_vf_vf(visnan_vo_vf(s), vcast_vf_f(SLEEF_NANf), r);
  return r;
}
#endif // #if !defined(DETERMINISTIC)

#if !defined(DETERMINISTIC) && !defined(ENABLE_GNUABI) && !defined(SLEEF_GENHEADER)
// See sleefsimddp.c for explanation of these macros
#ifdef ENABLE_ALIAS
#define DALIAS_vf_vf(FUNC) EXPORT CONST VECTOR_CC vfloat y ## FUNC(vfloat) __attribute__((alias( stringify(x ## FUNC) )));
#define DALIAS_vf2_vf(FUNC) EXPORT CONST VECTOR_CC vfloat2 y ## FUNC(vfloat) __attribute__((alias( stringify(x ## FUNC) )));
#define DALIAS_vf_vf_vf(FUNC) EXPORT CONST VECTOR_CC vfloat y ## FUNC(vfloat, vfloat) __attribute__((alias( stringify(x ## FUNC) )));
#define DALIAS_vf_vf_vf_vf(FUNC) EXPORT CONST VECTOR_CC vfloat y ## FUNC(vfloat, vfloat, vfloat) __attribute__((alias( stringify(x ## FUNC) )));
#else
#define DALIAS_vf_vf(FUNC) EXPORT CONST VECTOR_CC vfloat y ## FUNC(vfloat d) { return x ## FUNC (d); }
#define DALIAS_vf2_vf(FUNC) EXPORT CONST VECTOR_CC vfloat2 y ## FUNC(vfloat d) { return x ## FUNC (d); }
#define DALIAS_vf_vf_vf(FUNC) EXPORT CONST VECTOR_CC vfloat y ## FUNC(vfloat x, vfloat y) { return x ## FUNC (x, y); }
#define DALIAS_vf_vf_vf_vf(FUNC) EXPORT CONST VECTOR_CC vfloat y ## FUNC(vfloat x, vfloat y, vfloat z) { return x ## FUNC (x, y, z); }
#endif

/* NOTE(review): every DALIAS instantiation below is commented out --
   presumably the y-prefixed aliases are unused in this build; confirm. */
/* DALIAS_vf2_vf(sincospif_u05) */
/* DALIAS_vf2_vf(sincospif_u35) */
/* DALIAS_vf2_vf(modff) */
/* DALIAS_vf_vf(atanf) */
/* DALIAS_vf_vf_vf(atan2f) */
/* DALIAS_vf_vf(asinf) */
/* DALIAS_vf_vf(acosf) */
/* DALIAS_vf_vf_vf(atan2f_u1) */
/* DALIAS_vf_vf(asinf_u1) */
/* DALIAS_vf_vf(acosf_u1) */
/* DALIAS_vf_vf(atanf_u1) */
/* DALIAS_vf_vf(logf) */
/* DALIAS_vf_vf(expf) */
/* DALIAS_vf_vf(cbrtf) */
/* DALIAS_vf_vf(cbrtf_u1) */
/* DALIAS_vf_vf(logf_u1) */
/* DALIAS_vf_vf_vf(powf) */
/* DALIAS_vf_vf(sinhf) */
/* DALIAS_vf_vf(coshf) */
/* DALIAS_vf_vf(tanhf) */
/* DALIAS_vf_vf(sinhf_u35) */
/* DALIAS_vf_vf(coshf_u35) */
/* DALIAS_vf_vf(tanhf_u35) */
/* DALIAS_vf_vf(asinhf) */
/* DALIAS_vf_vf(acoshf) */
/* DALIAS_vf_vf(atanhf) */
/* DALIAS_vf_vf(exp2f) */
/* DALIAS_vf_vf(exp2f_u35) */
/* DALIAS_vf_vf(exp10f) */
/* DALIAS_vf_vf(exp10f_u35) */
/* DALIAS_vf_vf(expm1f) */
/* DALIAS_vf_vf(log10f) */
/* DALIAS_vf_vf(log2f) */
/* DALIAS_vf_vf(log2f_u35) */
/* DALIAS_vf_vf(log1pf) */
/* DALIAS_vf_vf(fabsf) */
/* DALIAS_vf_vf_vf(copysignf) */
/* DALIAS_vf_vf_vf(fmaxf) */
/* DALIAS_vf_vf_vf(fminf) */
/* DALIAS_vf_vf_vf(fdimf) */
/* DALIAS_vf_vf(truncf) */
/* DALIAS_vf_vf(floorf) */
/* DALIAS_vf_vf(ceilf) */
/* DALIAS_vf_vf(roundf) */
/* DALIAS_vf_vf(rintf) */
/* DALIAS_vf_vf_vf_vf(fmaf) */
/* DALIAS_vf_vf_vf(hypotf_u05) */
/* DALIAS_vf_vf_vf(hypotf_u35) */
/* DALIAS_vf_vf_vf(nextafterf) */
/* DALIAS_vf_vf(frfrexpf) */
/* DALIAS_vf_vf_vf(fmodf) */
/* DALIAS_vf_vf_vf(remainderf) */
/* DALIAS_vf_vf(sinpif_u05) */
/* DALIAS_vf_vf(cospif_u05) */
/* DALIAS_vf_vf(tgammaf_u1) */
/* DALIAS_vf_vf(lgammaf_u1) */
/* DALIAS_vf_vf(erff_u1) */
/* DALIAS_vf_vf(erfcf_u15) */
/* DALIAS_vf_vf_vf(fastpowf_u3500) */
#endif // #if !defined(DETERMINISTIC) && !defined(ENABLE_GNUABI) && !defined(SLEEF_GENHEADER)

#if !defined(ENABLE_GNUABI) && !defined(SLEEF_GENHEADER)
/* Runtime query: availability flag for ids 1..10 (forwarded to
   vavailability_i); any other id yields 0. */
EXPORT CONST int xgetIntf(int name) {
  if (1 <= name && name <= 10) return vavailability_i(name);
  return 0;
}

/* Runtime query for pointer-valued properties; id 0 is the ISA name
   string, anything else is a null pointer. */
EXPORT CONST void *xgetPtrf(int name) {
  if (name == 0) return ISANAME;
  return (void *)0;
}
#endif

#if defined(ALIAS_NO_EXT_SUFFIX) && !defined(DETERMINISTIC)
#include ALIAS_NO_EXT_SUFFIX
#endif

#ifdef ENABLE_GNUABI
/* glibc "__*_finite" ABI entry points, provided as weak aliases onto the
   corresponding SLEEF implementations. */
EXPORT CONST VECTOR_CC vfloat __acosf_finite (vfloat) __attribute__((weak, alias(str_xacosf_u1 )));
EXPORT CONST VECTOR_CC vfloat __acoshf_finite (vfloat) __attribute__((weak, alias(str_xacoshf )));
EXPORT CONST VECTOR_CC vfloat __asinf_finite (vfloat) __attribute__((weak, alias(str_xasinf_u1 )));
EXPORT CONST VECTOR_CC vfloat __atan2f_finite (vfloat, vfloat) __attribute__((weak, alias(str_xatan2f_u1 )));
EXPORT CONST VECTOR_CC vfloat __atanhf_finite (vfloat) __attribute__((weak, alias(str_xatanhf )));
EXPORT CONST VECTOR_CC vfloat __coshf_finite (vfloat) __attribute__((weak, alias(str_xcoshf )));
EXPORT CONST VECTOR_CC vfloat __exp10f_finite (vfloat) __attribute__((weak, alias(str_xexp10f )));
EXPORT CONST VECTOR_CC vfloat __exp2f_finite (vfloat) __attribute__((weak, alias(str_xexp2f )));
EXPORT CONST VECTOR_CC vfloat __expf_finite (vfloat) __attribute__((weak, alias(str_xexpf )));
EXPORT CONST VECTOR_CC vfloat __fmodf_finite (vfloat, vfloat) __attribute__((weak, alias(str_xfmodf )));
EXPORT CONST VECTOR_CC vfloat __remainderf_finite(vfloat, vfloat) __attribute__((weak, alias(str_xremainderf)));
EXPORT CONST VECTOR_CC vfloat __modff_finite (vfloat, vfloat *) __attribute__((weak, alias(str_xmodff )));
EXPORT CONST VECTOR_CC vfloat __hypotf_u05_finite(vfloat, vfloat) __attribute__((weak, alias(str_xhypotf_u05)));
EXPORT CONST VECTOR_CC vfloat __lgammaf_u1_finite(vfloat) __attribute__((weak, alias(str_xlgammaf_u1)));
EXPORT CONST VECTOR_CC vfloat __log10f_finite (vfloat) __attribute__((weak, alias(str_xlog10f )));
EXPORT CONST VECTOR_CC vfloat __logf_finite (vfloat) __attribute__((weak, alias(str_xlogf_u1 )));
EXPORT CONST VECTOR_CC vfloat __powf_finite (vfloat, vfloat) __attribute__((weak, alias(str_xpowf )));
EXPORT CONST VECTOR_CC vfloat __sinhf_finite (vfloat) __attribute__((weak, alias(str_xsinhf )));
EXPORT CONST VECTOR_CC vfloat __sqrtf_finite (vfloat) __attribute__((weak, alias(str_xsqrtf )));
EXPORT CONST VECTOR_CC vfloat __tgammaf_u1_finite(vfloat) __attribute__((weak, alias(str_xtgammaf_u1)));

#ifdef HEADER_MASKED
#include HEADER_MASKED
#endif
#endif /* #ifdef ENABLE_GNUABI */

#ifdef ENABLE_MAIN
// gcc -DENABLE_MAIN -Wno-attributes -I../common -I../arch
-DENABLE_AVX2 -mavx2 -mfma sleefsimdsp.c rempitab.c ../common/common.c -lm #include #include #include int main(int argc, char **argv) { vfloat vf1 = vcast_vf_f(atof(argv[1])); //vfloat vf2 = vcast_vf_f(atof(argv[2])); //vfloat r = xpowf(vf1, vf2); //vfloat r = xsqrtf_u05(vf1); //printf("%g\n", xnextafterf(vf1, vf2)[0]); //printf("%g\n", nextafterf(atof(argv[1]), atof(argv[2]))); printf("t = %.20g\n", xlogf_u1(vf1)[0]); printf("c = %.20g\n", logf(atof(argv[1]))); } #endif ================================================ FILE: src/sleefsimdsp_emulation.c ================================================ /* Copyright (c) 2021 Agenium Scale Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
*/ #include #ifdef ENABLE_VSX #include "renamevsx.h" #define nsimd_vec_f32 nsimd_vmx_vf32 #define get0(a) vec_extract(a, 0) #define get1(a) vec_extract(a, 1) #define get2(a) vec_extract(a, 2) #define get3(a) vec_extract(a, 3) #define set0(a, b) vec_splats(b) #define set1(a, b) vec_insert(b, a, 1) #define set2(a, b) vec_insert(b, a, 2) #define set3(a, b) vec_insert(b, a, 3) #endif nsimd_vec_f32 xsinf(nsimd_vec_f32 a0_) { nsimd_vec_f32 ret_; nsimd_cpu_vf32 a0, ret; a0.v0 = get0(a0_); a0.v1 = get1(a0_); a0.v2 = get2(a0_); a0.v3 = get3(a0_); ret = nsimd_sin_u35_cpu_f32(a0); ret_ = set0(ret_, ret.v0); ret_ = set1(ret_, ret.v1); ret_ = set2(ret_, ret.v2); ret_ = set3(ret_, ret.v3); return ret_; } nsimd_vec_f32 xcosf(nsimd_vec_f32 a0_) { nsimd_vec_f32 ret_; nsimd_cpu_vf32 a0, ret; a0.v0 = get0(a0_); a0.v1 = get1(a0_); a0.v2 = get2(a0_); a0.v3 = get3(a0_); ret = nsimd_cos_u35_cpu_f32(a0); ret_ = set0(ret_, ret.v0); ret_ = set1(ret_, ret.v1); ret_ = set2(ret_, ret.v2); ret_ = set3(ret_, ret.v3); return ret_; } nsimd_vec_f32 xtanf(nsimd_vec_f32 a0_) { nsimd_vec_f32 ret_; nsimd_cpu_vf32 a0, ret; a0.v0 = get0(a0_); a0.v1 = get1(a0_); a0.v2 = get2(a0_); a0.v3 = get3(a0_); ret = nsimd_tan_u35_cpu_f32(a0); ret_ = set0(ret_, ret.v0); ret_ = set1(ret_, ret.v1); ret_ = set2(ret_, ret.v2); ret_ = set3(ret_, ret.v3); return ret_; } nsimd_vec_f32 xasinf(nsimd_vec_f32 a0_) { nsimd_vec_f32 ret_; nsimd_cpu_vf32 a0, ret; a0.v0 = get0(a0_); a0.v1 = get1(a0_); a0.v2 = get2(a0_); a0.v3 = get3(a0_); ret = nsimd_asin_u35_cpu_f32(a0); ret_ = set0(ret_, ret.v0); ret_ = set1(ret_, ret.v1); ret_ = set2(ret_, ret.v2); ret_ = set3(ret_, ret.v3); return ret_; } nsimd_vec_f32 xacosf(nsimd_vec_f32 a0_) { nsimd_vec_f32 ret_; nsimd_cpu_vf32 a0, ret; a0.v0 = get0(a0_); a0.v1 = get1(a0_); a0.v2 = get2(a0_); a0.v3 = get3(a0_); ret = nsimd_acos_u35_cpu_f32(a0); ret_ = set0(ret_, ret.v0); ret_ = set1(ret_, ret.v1); ret_ = set2(ret_, ret.v2); ret_ = set3(ret_, ret.v3); return ret_; } nsimd_vec_f32 
/* Lane-wise emulation wrappers, continued (return type of xatanf is on the
   previous line).  Same pattern throughout: unpack via get0..get3, call the
   scalar-emulation nsimd_*_cpu_f32 routine, repack via set0..set3. */
xatanf(nsimd_vec_f32 a0_) {
  nsimd_vec_f32 ret_;
  nsimd_cpu_vf32 a0, ret;
  a0.v0 = get0(a0_); a0.v1 = get1(a0_); a0.v2 = get2(a0_); a0.v3 = get3(a0_);
  ret = nsimd_atan_u35_cpu_f32(a0);
  ret_ = set0(ret_, ret.v0); ret_ = set1(ret_, ret.v1); ret_ = set2(ret_, ret.v2); ret_ = set3(ret_, ret.v3);
  return ret_;
}

nsimd_vec_f32 xatan2f(nsimd_vec_f32 a0_, nsimd_vec_f32 a1_) {
  nsimd_vec_f32 ret_;
  nsimd_cpu_vf32 a0, a1, ret;
  a0.v0 = get0(a0_); a0.v1 = get1(a0_); a0.v2 = get2(a0_); a0.v3 = get3(a0_);
  a1.v0 = get0(a1_); a1.v1 = get1(a1_); a1.v2 = get2(a1_); a1.v3 = get3(a1_);
  ret = nsimd_atan2_u35_cpu_f32(a0, a1);
  ret_ = set0(ret_, ret.v0); ret_ = set1(ret_, ret.v1); ret_ = set2(ret_, ret.v2); ret_ = set3(ret_, ret.v3);
  return ret_;
}

nsimd_vec_f32 xlogf(nsimd_vec_f32 a0_) {
  nsimd_vec_f32 ret_;
  nsimd_cpu_vf32 a0, ret;
  a0.v0 = get0(a0_); a0.v1 = get1(a0_); a0.v2 = get2(a0_); a0.v3 = get3(a0_);
  ret = nsimd_log_u35_cpu_f32(a0);
  ret_ = set0(ret_, ret.v0); ret_ = set1(ret_, ret.v1); ret_ = set2(ret_, ret.v2); ret_ = set3(ret_, ret.v3);
  return ret_;
}

nsimd_vec_f32 xcbrtf(nsimd_vec_f32 a0_) {
  nsimd_vec_f32 ret_;
  nsimd_cpu_vf32 a0, ret;
  a0.v0 = get0(a0_); a0.v1 = get1(a0_); a0.v2 = get2(a0_); a0.v3 = get3(a0_);
  ret = nsimd_cbrt_u35_cpu_f32(a0);
  ret_ = set0(ret_, ret.v0); ret_ = set1(ret_, ret.v1); ret_ = set2(ret_, ret.v2); ret_ = set3(ret_, ret.v3);
  return ret_;
}

/* "_u1" variants route to the higher-accuracy nsimd_*_u10_cpu_f32 routines. */
nsimd_vec_f32 xsinf_u1(nsimd_vec_f32 a0_) {
  nsimd_vec_f32 ret_;
  nsimd_cpu_vf32 a0, ret;
  a0.v0 = get0(a0_); a0.v1 = get1(a0_); a0.v2 = get2(a0_); a0.v3 = get3(a0_);
  ret = nsimd_sin_u10_cpu_f32(a0);
  ret_ = set0(ret_, ret.v0); ret_ = set1(ret_, ret.v1); ret_ = set2(ret_, ret.v2); ret_ = set3(ret_, ret.v3);
  return ret_;
}

nsimd_vec_f32 xcosf_u1(nsimd_vec_f32 a0_) {
  nsimd_vec_f32 ret_;
  nsimd_cpu_vf32 a0, ret;
  a0.v0 = get0(a0_); a0.v1 = get1(a0_); a0.v2 = get2(a0_); a0.v3 = get3(a0_);
  ret = nsimd_cos_u10_cpu_f32(a0);
  ret_ = set0(ret_, ret.v0); ret_ = set1(ret_, ret.v1); ret_ = set2(ret_, ret.v2); ret_ = set3(ret_, ret.v3);
  return ret_;
}

nsimd_vec_f32 xtanf_u1(nsimd_vec_f32 a0_) {
  nsimd_vec_f32 ret_;
  nsimd_cpu_vf32 a0, ret;
  a0.v0 = get0(a0_); a0.v1 = get1(a0_); a0.v2 = get2(a0_); a0.v3 = get3(a0_);
  ret = nsimd_tan_u10_cpu_f32(a0);
  ret_ = set0(ret_, ret.v0); ret_ = set1(ret_, ret.v1); ret_ = set2(ret_, ret.v2); ret_ = set3(ret_, ret.v3);
  return ret_;
}

nsimd_vec_f32 xasinf_u1(nsimd_vec_f32 a0_) {
  nsimd_vec_f32 ret_;
  nsimd_cpu_vf32 a0, ret;
  a0.v0 = get0(a0_); a0.v1 = get1(a0_); a0.v2 = get2(a0_); a0.v3 = get3(a0_);
  ret = nsimd_asin_u10_cpu_f32(a0);
  ret_ = set0(ret_, ret.v0); ret_ = set1(ret_, ret.v1); ret_ = set2(ret_, ret.v2); ret_ = set3(ret_, ret.v3);
  return ret_;
}

nsimd_vec_f32 xacosf_u1(nsimd_vec_f32 a0_) {
  nsimd_vec_f32 ret_;
  nsimd_cpu_vf32 a0, ret;
  a0.v0 = get0(a0_); a0.v1 = get1(a0_); a0.v2 = get2(a0_); a0.v3 = get3(a0_);
  ret = nsimd_acos_u10_cpu_f32(a0);
  ret_ = set0(ret_, ret.v0); ret_ = set1(ret_, ret.v1); ret_ = set2(ret_, ret.v2); ret_ = set3(ret_, ret.v3);
  return ret_;
}

nsimd_vec_f32 xatanf_u1(nsimd_vec_f32 a0_) {
  nsimd_vec_f32 ret_;
  nsimd_cpu_vf32 a0, ret;
  a0.v0 = get0(a0_); a0.v1 = get1(a0_); a0.v2 = get2(a0_); a0.v3 = get3(a0_);
  ret = nsimd_atan_u10_cpu_f32(a0);
  ret_ = set0(ret_, ret.v0); ret_ = set1(ret_, ret.v1); ret_ = set2(ret_, ret.v2); ret_ = set3(ret_, ret.v3);
  return ret_;
}

nsimd_vec_f32 xatan2f_u1(nsimd_vec_f32 a0_, nsimd_vec_f32 a1_) {
  nsimd_vec_f32 ret_;
  nsimd_cpu_vf32 a0, a1, ret;
  a0.v0 = get0(a0_); a0.v1 = get1(a0_); a0.v2 = get2(a0_); a0.v3 = get3(a0_);
  a1.v0 = get0(a1_); a1.v1 = get1(a1_); a1.v2 = get2(a1_); a1.v3 = get3(a1_);
  ret = nsimd_atan2_u10_cpu_f32(a0, a1);
  ret_ = set0(ret_, ret.v0); ret_ = set1(ret_, ret.v1); ret_ = set2(ret_, ret.v2); ret_ = set3(ret_, ret.v3);
  return ret_;
}

nsimd_vec_f32 xlogf_u1(nsimd_vec_f32 a0_) {
  nsimd_vec_f32 ret_;
  nsimd_cpu_vf32 a0, ret;
  a0.v0 = get0(a0_); a0.v1 = get1(a0_); a0.v2 = get2(a0_); a0.v3 = get3(a0_);
  ret = nsimd_log_u10_cpu_f32(a0);
  ret_ = set0(ret_, ret.v0); ret_ = set1(ret_, ret.v1); ret_ = set2(ret_, ret.v2);
ret_ = set3(ret_, ret.v3); return ret_; } nsimd_vec_f32 xcbrtf_u1(nsimd_vec_f32 a0_) { nsimd_vec_f32 ret_; nsimd_cpu_vf32 a0, ret; a0.v0 = get0(a0_); a0.v1 = get1(a0_); a0.v2 = get2(a0_); a0.v3 = get3(a0_); ret = nsimd_cbrt_u10_cpu_f32(a0); ret_ = set0(ret_, ret.v0); ret_ = set1(ret_, ret.v1); ret_ = set2(ret_, ret.v2); ret_ = set3(ret_, ret.v3); return ret_; } nsimd_vec_f32 xexpf(nsimd_vec_f32 a0_) { nsimd_vec_f32 ret_; nsimd_cpu_vf32 a0, ret; a0.v0 = get0(a0_); a0.v1 = get1(a0_); a0.v2 = get2(a0_); a0.v3 = get3(a0_); ret = nsimd_exp_u10_cpu_f32(a0); ret_ = set0(ret_, ret.v0); ret_ = set1(ret_, ret.v1); ret_ = set2(ret_, ret.v2); ret_ = set3(ret_, ret.v3); return ret_; } nsimd_vec_f32 xpowf(nsimd_vec_f32 a0_, nsimd_vec_f32 a1_) { nsimd_vec_f32 ret_; nsimd_cpu_vf32 a0, a1, ret; a0.v0 = get0(a0_); a0.v1 = get1(a0_); a0.v2 = get2(a0_); a0.v3 = get3(a0_); a1.v0 = get0(a1_); a1.v1 = get1(a1_); a1.v2 = get2(a1_); a1.v3 = get3(a1_); ret = nsimd_pow_u10_cpu_f32(a0, a1); ret_ = set0(ret_, ret.v0); ret_ = set1(ret_, ret.v1); ret_ = set2(ret_, ret.v2); ret_ = set3(ret_, ret.v3); return ret_; } nsimd_vec_f32 xsinhf(nsimd_vec_f32 a0_) { nsimd_vec_f32 ret_; nsimd_cpu_vf32 a0, ret; a0.v0 = get0(a0_); a0.v1 = get1(a0_); a0.v2 = get2(a0_); a0.v3 = get3(a0_); ret = nsimd_sinh_u10_cpu_f32(a0); ret_ = set0(ret_, ret.v0); ret_ = set1(ret_, ret.v1); ret_ = set2(ret_, ret.v2); ret_ = set3(ret_, ret.v3); return ret_; } nsimd_vec_f32 xcoshf(nsimd_vec_f32 a0_) { nsimd_vec_f32 ret_; nsimd_cpu_vf32 a0, ret; a0.v0 = get0(a0_); a0.v1 = get1(a0_); a0.v2 = get2(a0_); a0.v3 = get3(a0_); ret = nsimd_cosh_u10_cpu_f32(a0); ret_ = set0(ret_, ret.v0); ret_ = set1(ret_, ret.v1); ret_ = set2(ret_, ret.v2); ret_ = set3(ret_, ret.v3); return ret_; } nsimd_vec_f32 xtanhf(nsimd_vec_f32 a0_) { nsimd_vec_f32 ret_; nsimd_cpu_vf32 a0, ret; a0.v0 = get0(a0_); a0.v1 = get1(a0_); a0.v2 = get2(a0_); a0.v3 = get3(a0_); ret = nsimd_tanh_u10_cpu_f32(a0); ret_ = set0(ret_, ret.v0); ret_ = set1(ret_, ret.v1); ret_ = 
set2(ret_, ret.v2); ret_ = set3(ret_, ret.v3); return ret_; } nsimd_vec_f32 xsinhf_u35(nsimd_vec_f32 a0_) { nsimd_vec_f32 ret_; nsimd_cpu_vf32 a0, ret; a0.v0 = get0(a0_); a0.v1 = get1(a0_); a0.v2 = get2(a0_); a0.v3 = get3(a0_); ret = nsimd_sinh_u35_cpu_f32(a0); ret_ = set0(ret_, ret.v0); ret_ = set1(ret_, ret.v1); ret_ = set2(ret_, ret.v2); ret_ = set3(ret_, ret.v3); return ret_; } nsimd_vec_f32 xcoshf_u35(nsimd_vec_f32 a0_) { nsimd_vec_f32 ret_; nsimd_cpu_vf32 a0, ret; a0.v0 = get0(a0_); a0.v1 = get1(a0_); a0.v2 = get2(a0_); a0.v3 = get3(a0_); ret = nsimd_cosh_u35_cpu_f32(a0); ret_ = set0(ret_, ret.v0); ret_ = set1(ret_, ret.v1); ret_ = set2(ret_, ret.v2); ret_ = set3(ret_, ret.v3); return ret_; } nsimd_vec_f32 xtanhf_u35(nsimd_vec_f32 a0_) { nsimd_vec_f32 ret_; nsimd_cpu_vf32 a0, ret; a0.v0 = get0(a0_); a0.v1 = get1(a0_); a0.v2 = get2(a0_); a0.v3 = get3(a0_); ret = nsimd_tanh_u35_cpu_f32(a0); ret_ = set0(ret_, ret.v0); ret_ = set1(ret_, ret.v1); ret_ = set2(ret_, ret.v2); ret_ = set3(ret_, ret.v3); return ret_; } nsimd_vec_f32 xasinhf(nsimd_vec_f32 a0_) { nsimd_vec_f32 ret_; nsimd_cpu_vf32 a0, ret; a0.v0 = get0(a0_); a0.v1 = get1(a0_); a0.v2 = get2(a0_); a0.v3 = get3(a0_); ret = nsimd_asinh_u10_cpu_f32(a0); ret_ = set0(ret_, ret.v0); ret_ = set1(ret_, ret.v1); ret_ = set2(ret_, ret.v2); ret_ = set3(ret_, ret.v3); return ret_; } nsimd_vec_f32 xacoshf(nsimd_vec_f32 a0_) { nsimd_vec_f32 ret_; nsimd_cpu_vf32 a0, ret; a0.v0 = get0(a0_); a0.v1 = get1(a0_); a0.v2 = get2(a0_); a0.v3 = get3(a0_); ret = nsimd_acosh_u10_cpu_f32(a0); ret_ = set0(ret_, ret.v0); ret_ = set1(ret_, ret.v1); ret_ = set2(ret_, ret.v2); ret_ = set3(ret_, ret.v3); return ret_; } nsimd_vec_f32 xatanhf(nsimd_vec_f32 a0_) { nsimd_vec_f32 ret_; nsimd_cpu_vf32 a0, ret; a0.v0 = get0(a0_); a0.v1 = get1(a0_); a0.v2 = get2(a0_); a0.v3 = get3(a0_); ret = nsimd_atanh_u10_cpu_f32(a0); ret_ = set0(ret_, ret.v0); ret_ = set1(ret_, ret.v1); ret_ = set2(ret_, ret.v2); ret_ = set3(ret_, ret.v3); return ret_; } 
nsimd_vec_f32 xexp2f(nsimd_vec_f32 a0_) { nsimd_vec_f32 ret_; nsimd_cpu_vf32 a0, ret; a0.v0 = get0(a0_); a0.v1 = get1(a0_); a0.v2 = get2(a0_); a0.v3 = get3(a0_); ret = nsimd_exp2_u10_cpu_f32(a0); ret_ = set0(ret_, ret.v0); ret_ = set1(ret_, ret.v1); ret_ = set2(ret_, ret.v2); ret_ = set3(ret_, ret.v3); return ret_; } nsimd_vec_f32 xexp2f_u35(nsimd_vec_f32 a0_) { nsimd_vec_f32 ret_; nsimd_cpu_vf32 a0, ret; a0.v0 = get0(a0_); a0.v1 = get1(a0_); a0.v2 = get2(a0_); a0.v3 = get3(a0_); ret = nsimd_exp2_u35_cpu_f32(a0); ret_ = set0(ret_, ret.v0); ret_ = set1(ret_, ret.v1); ret_ = set2(ret_, ret.v2); ret_ = set3(ret_, ret.v3); return ret_; } nsimd_vec_f32 xexp10f(nsimd_vec_f32 a0_) { nsimd_vec_f32 ret_; nsimd_cpu_vf32 a0, ret; a0.v0 = get0(a0_); a0.v1 = get1(a0_); a0.v2 = get2(a0_); a0.v3 = get3(a0_); ret = nsimd_exp10_u10_cpu_f32(a0); ret_ = set0(ret_, ret.v0); ret_ = set1(ret_, ret.v1); ret_ = set2(ret_, ret.v2); ret_ = set3(ret_, ret.v3); return ret_; } nsimd_vec_f32 xexp10f_u35(nsimd_vec_f32 a0_) { nsimd_vec_f32 ret_; nsimd_cpu_vf32 a0, ret; a0.v0 = get0(a0_); a0.v1 = get1(a0_); a0.v2 = get2(a0_); a0.v3 = get3(a0_); ret = nsimd_exp10_u35_cpu_f32(a0); ret_ = set0(ret_, ret.v0); ret_ = set1(ret_, ret.v1); ret_ = set2(ret_, ret.v2); ret_ = set3(ret_, ret.v3); return ret_; } nsimd_vec_f32 xexpm1f(nsimd_vec_f32 a0_) { nsimd_vec_f32 ret_; nsimd_cpu_vf32 a0, ret; a0.v0 = get0(a0_); a0.v1 = get1(a0_); a0.v2 = get2(a0_); a0.v3 = get3(a0_); ret = nsimd_expm1_u10_cpu_f32(a0); ret_ = set0(ret_, ret.v0); ret_ = set1(ret_, ret.v1); ret_ = set2(ret_, ret.v2); ret_ = set3(ret_, ret.v3); return ret_; } nsimd_vec_f32 xlog10f(nsimd_vec_f32 a0_) { nsimd_vec_f32 ret_; nsimd_cpu_vf32 a0, ret; a0.v0 = get0(a0_); a0.v1 = get1(a0_); a0.v2 = get2(a0_); a0.v3 = get3(a0_); ret = nsimd_log10_u10_cpu_f32(a0); ret_ = set0(ret_, ret.v0); ret_ = set1(ret_, ret.v1); ret_ = set2(ret_, ret.v2); ret_ = set3(ret_, ret.v3); return ret_; } nsimd_vec_f32 xlog2f(nsimd_vec_f32 a0_) { nsimd_vec_f32 ret_; 
nsimd_cpu_vf32 a0, ret; a0.v0 = get0(a0_); a0.v1 = get1(a0_); a0.v2 = get2(a0_); a0.v3 = get3(a0_); ret = nsimd_log2_u10_cpu_f32(a0); ret_ = set0(ret_, ret.v0); ret_ = set1(ret_, ret.v1); ret_ = set2(ret_, ret.v2); ret_ = set3(ret_, ret.v3); return ret_; } nsimd_vec_f32 xlog2f_u35(nsimd_vec_f32 a0_) { nsimd_vec_f32 ret_; nsimd_cpu_vf32 a0, ret; a0.v0 = get0(a0_); a0.v1 = get1(a0_); a0.v2 = get2(a0_); a0.v3 = get3(a0_); ret = nsimd_log2_u35_cpu_f32(a0); ret_ = set0(ret_, ret.v0); ret_ = set1(ret_, ret.v1); ret_ = set2(ret_, ret.v2); ret_ = set3(ret_, ret.v3); return ret_; } nsimd_vec_f32 xlog1pf(nsimd_vec_f32 a0_) { nsimd_vec_f32 ret_; nsimd_cpu_vf32 a0, ret; a0.v0 = get0(a0_); a0.v1 = get1(a0_); a0.v2 = get2(a0_); a0.v3 = get3(a0_); ret = nsimd_log1p_u10_cpu_f32(a0); ret_ = set0(ret_, ret.v0); ret_ = set1(ret_, ret.v1); ret_ = set2(ret_, ret.v2); ret_ = set3(ret_, ret.v3); return ret_; } nsimd_vec_f32 xsinpif_u05(nsimd_vec_f32 a0_) { nsimd_vec_f32 ret_; nsimd_cpu_vf32 a0, ret; a0.v0 = get0(a0_); a0.v1 = get1(a0_); a0.v2 = get2(a0_); a0.v3 = get3(a0_); ret = nsimd_sinpi_u05_cpu_f32(a0); ret_ = set0(ret_, ret.v0); ret_ = set1(ret_, ret.v1); ret_ = set2(ret_, ret.v2); ret_ = set3(ret_, ret.v3); return ret_; } nsimd_vec_f32 xcospif_u05(nsimd_vec_f32 a0_) { nsimd_vec_f32 ret_; nsimd_cpu_vf32 a0, ret; a0.v0 = get0(a0_); a0.v1 = get1(a0_); a0.v2 = get2(a0_); a0.v3 = get3(a0_); ret = nsimd_cospi_u05_cpu_f32(a0); ret_ = set0(ret_, ret.v0); ret_ = set1(ret_, ret.v1); ret_ = set2(ret_, ret.v2); ret_ = set3(ret_, ret.v3); return ret_; } nsimd_vec_f32 xhypotf_u05(nsimd_vec_f32 a0_, nsimd_vec_f32 a1_) { nsimd_vec_f32 ret_; nsimd_cpu_vf32 a0, a1, ret; a0.v0 = get0(a0_); a0.v1 = get1(a0_); a0.v2 = get2(a0_); a0.v3 = get3(a0_); a1.v0 = get0(a1_); a1.v1 = get1(a1_); a1.v2 = get2(a1_); a1.v3 = get3(a1_); ret = nsimd_hypot_u05_cpu_f32(a0, a1); ret_ = set0(ret_, ret.v0); ret_ = set1(ret_, ret.v1); ret_ = set2(ret_, ret.v2); ret_ = set3(ret_, ret.v3); return ret_; } nsimd_vec_f32 
xhypotf_u35(nsimd_vec_f32 a0_, nsimd_vec_f32 a1_) { nsimd_vec_f32 ret_; nsimd_cpu_vf32 a0, a1, ret; a0.v0 = get0(a0_); a0.v1 = get1(a0_); a0.v2 = get2(a0_); a0.v3 = get3(a0_); a1.v0 = get0(a1_); a1.v1 = get1(a1_); a1.v2 = get2(a1_); a1.v3 = get3(a1_); ret = nsimd_hypot_u35_cpu_f32(a0, a1); ret_ = set0(ret_, ret.v0); ret_ = set1(ret_, ret.v1); ret_ = set2(ret_, ret.v2); ret_ = set3(ret_, ret.v3); return ret_; } nsimd_vec_f32 xfmodf(nsimd_vec_f32 a0_, nsimd_vec_f32 a1_) { nsimd_vec_f32 ret_; nsimd_cpu_vf32 a0, a1, ret; a0.v0 = get0(a0_); a0.v1 = get1(a0_); a0.v2 = get2(a0_); a0.v3 = get3(a0_); a1.v0 = get0(a1_); a1.v1 = get1(a1_); a1.v2 = get2(a1_); a1.v3 = get3(a1_); ret = nsimd_fmod_cpu_f32(a0, a1); ret_ = set0(ret_, ret.v0); ret_ = set1(ret_, ret.v1); ret_ = set2(ret_, ret.v2); ret_ = set3(ret_, ret.v3); return ret_; } nsimd_vec_f32 xremainderf(nsimd_vec_f32 a0_, nsimd_vec_f32 a1_) { nsimd_vec_f32 ret_; nsimd_cpu_vf32 a0, a1, ret; a0.v0 = get0(a0_); a0.v1 = get1(a0_); a0.v2 = get2(a0_); a0.v3 = get3(a0_); a1.v0 = get0(a1_); a1.v1 = get1(a1_); a1.v2 = get2(a1_); a1.v3 = get3(a1_); ret = nsimd_remainder_cpu_f32(a0, a1); ret_ = set0(ret_, ret.v0); ret_ = set1(ret_, ret.v1); ret_ = set2(ret_, ret.v2); ret_ = set3(ret_, ret.v3); return ret_; } nsimd_vec_f32 xlgammaf_u1(nsimd_vec_f32 a0_) { nsimd_vec_f32 ret_; nsimd_cpu_vf32 a0, ret; a0.v0 = get0(a0_); a0.v1 = get1(a0_); a0.v2 = get2(a0_); a0.v3 = get3(a0_); ret = nsimd_lgamma_u10_cpu_f32(a0); ret_ = set0(ret_, ret.v0); ret_ = set1(ret_, ret.v1); ret_ = set2(ret_, ret.v2); ret_ = set3(ret_, ret.v3); return ret_; } nsimd_vec_f32 xtgammaf_u1(nsimd_vec_f32 a0_) { nsimd_vec_f32 ret_; nsimd_cpu_vf32 a0, ret; a0.v0 = get0(a0_); a0.v1 = get1(a0_); a0.v2 = get2(a0_); a0.v3 = get3(a0_); ret = nsimd_tgamma_u10_cpu_f32(a0); ret_ = set0(ret_, ret.v0); ret_ = set1(ret_, ret.v1); ret_ = set2(ret_, ret.v2); ret_ = set3(ret_, ret.v3); return ret_; } nsimd_vec_f32 xerff_u1(nsimd_vec_f32 a0_) { nsimd_vec_f32 ret_; nsimd_cpu_vf32 a0, ret; 
a0.v0 = get0(a0_); a0.v1 = get1(a0_); a0.v2 = get2(a0_); a0.v3 = get3(a0_); ret = nsimd_erf_u10_cpu_f32(a0); ret_ = set0(ret_, ret.v0); ret_ = set1(ret_, ret.v1); ret_ = set2(ret_, ret.v2); ret_ = set3(ret_, ret.v3); return ret_; } nsimd_vec_f32 xerfcf_u15(nsimd_vec_f32 a0_) { nsimd_vec_f32 ret_; nsimd_cpu_vf32 a0, ret; a0.v0 = get0(a0_); a0.v1 = get1(a0_); a0.v2 = get2(a0_); a0.v3 = get3(a0_); ret = nsimd_erfc_u15_cpu_f32(a0); ret_ = set0(ret_, ret.v0); ret_ = set1(ret_, ret.v1); ret_ = set2(ret_, ret.v2); ret_ = set3(ret_, ret.v3); return ret_; } ================================================ FILE: src/sleefsp.c ================================================ // Copyright Naoki Shibata and contributors 2010 - 2020. // Distributed under the Boost Software License, Version 1.0. // (See accompanying file LICENSE.txt or copy at // http://www.boost.org/LICENSE_1_0.txt) // Always use -ffp-contract=off option to compile SLEEF. #include #include #include #include #include #ifndef ENABLE_BUILTIN_MATH #include #define SQRTF sqrtf #else #define SQRTF __builtin_sqrtf #endif #include "misc.h" extern const float Sleef_rempitabsp[]; #ifdef DORENAME #include "rename.h" #endif #if (defined(_MSC_VER)) #pragma fp_contract (off) #endif #define MLA mlaf #define C2V(x) (x) #include "estrin.h" static INLINE CONST int32_t floatToRawIntBits(float d) { union { float f; int32_t i; } tmp; tmp.f = d; return tmp.i; } static INLINE CONST float intBitsToFloat(int32_t i) { union { float f; int32_t i; } tmp; tmp.i = i; return tmp.f; } static INLINE CONST float fabsfk(float x) { return intBitsToFloat(0x7fffffffL & floatToRawIntBits(x)); } static INLINE CONST float mulsignf(float x, float y) { return intBitsToFloat(floatToRawIntBits(x) ^ (floatToRawIntBits(y) & (1 << 31))); } static INLINE CONST float copysignfk(float x, float y) { return intBitsToFloat((floatToRawIntBits(x) & ~(1 << 31)) ^ (floatToRawIntBits(y) & (1 << 31))); } static INLINE CONST float signf(float d) { return mulsignf(1, d); } 
static INLINE CONST float mlaf(float x, float y, float z) { return x * y + z; } static INLINE CONST float rintfk(float x) { return x < 0 ? (int)(x - 0.5f) : (int)(x + 0.5f); } static INLINE CONST int ceilfk(float x) { return (int)x + (x < 0 ? 0 : 1); } static INLINE CONST float fminfk(float x, float y) { return x < y ? x : y; } static INLINE CONST float fmaxfk(float x, float y) { return x > y ? x : y; } static INLINE CONST int xisintf(float x) { return (x == (int)x); } static INLINE CONST int xisnanf(float x) { return x != x; } static INLINE CONST int xisinff(float x) { return x == SLEEF_INFINITYf || x == -SLEEF_INFINITYf; } static INLINE CONST int xisminff(float x) { return x == -SLEEF_INFINITYf; } static INLINE CONST int xispinff(float x) { return x == SLEEF_INFINITYf; } static INLINE CONST int xisnegzerof(float x) { return floatToRawIntBits(x) == floatToRawIntBits(-0.0); } static INLINE CONST int xisnumberf(float x) { return !xisinff(x) && !xisnanf(x); } static INLINE CONST int ilogbkf(float d) { int m = d < 5.421010862427522E-20f; d = m ? 1.8446744073709552E19f * d : d; int q = (floatToRawIntBits(d) >> 23) & 0xff; q = m ? q - (64 + 0x7f) : q - 0x7f; return q; } // vilogb2kf is similar to ilogbkf, but the argument has to be a // normalized FP value. static INLINE CONST int ilogb2kf(float d) { return ((floatToRawIntBits(d) >> 23) & 0xff) - 0x7f; } EXPORT CONST int xilogbf(float d) { int e = ilogbkf(fabsfk(d)); e = d == 0.0f ? SLEEF_FP_ILOGB0 : e; e = xisnanf(d) ? SLEEF_FP_ILOGBNAN : e; e = xisinff(d) ? INT_MAX : e; return e; } static INLINE CONST float pow2if(int q) { return intBitsToFloat(((int32_t)(q + 0x7f)) << 23); } static INLINE CONST float ldexpkf(float x, int q) { float u; int m; m = q >> 31; m = (((m + q) >> 6) - m) << 4; q = q - (m << 2); m += 127; m = m < 0 ? 0 : m; m = m > 255 ? 
255 : m; u = intBitsToFloat(((int32_t)m) << 23); x = x * u * u * u * u; u = intBitsToFloat(((int32_t)(q + 0x7f)) << 23); return x * u; } static INLINE CONST float ldexp2kf(float d, int e) { // faster than ldexpkf, short reach return d * pow2if(e >> 1) * pow2if(e - (e >> 1)); } static INLINE CONST float ldexp3kf(float d, int e) { // very fast, no denormal return intBitsToFloat(floatToRawIntBits(d) + (e << 23)); } // #ifndef NDEBUG static int checkfp(float x) { if (xisinff(x) || xisnanf(x)) return 1; return 0; } #endif static INLINE CONST float upperf(float d) { return intBitsToFloat(floatToRawIntBits(d) & 0xfffff000); } static INLINE CONST Sleef_float2 df(float h, float l) { Sleef_float2 ret; ret.x = h; ret.y = l; return ret; } static INLINE CONST Sleef_float2 dfx(double d) { Sleef_float2 ret; ret.x = d; ret.y = d - ret.x; return ret; } static INLINE CONST Sleef_float2 dfnormalize_f2_f2(Sleef_float2 t) { Sleef_float2 s; s.x = t.x + t.y; s.y = t.x - s.x + t.y; return s; } static INLINE CONST Sleef_float2 dfscale_f2_f2_f(Sleef_float2 d, float s) { Sleef_float2 r; r.x = d.x * s; r.y = d.y * s; return r; } static INLINE CONST Sleef_float2 dfneg_f2_f2(Sleef_float2 d) { Sleef_float2 r; r.x = -d.x; r.y = -d.y; return r; } static INLINE CONST Sleef_float2 dfabs_f2_f2(Sleef_float2 x) { return df(x.x < 0 ? -x.x : x.x, x.x < 0 ? 
-x.y : x.y); } static INLINE CONST Sleef_float2 dfadd_f2_f_f(float x, float y) { // |x| >= |y| Sleef_float2 r; #ifndef NDEBUG if (!(checkfp(x) || checkfp(y) || fabsfk(x) >= fabsfk(y))) fprintf(stderr, "[dfadd_f2_f_f : %g, %g]", x, y); #endif r.x = x + y; r.y = x - r.x + y; return r; } static INLINE CONST Sleef_float2 dfadd2_f2_f_f(float x, float y) { Sleef_float2 r; r.x = x + y; float v = r.x - x; r.y = (x - (r.x - v)) + (y - v); return r; } static INLINE CONST Sleef_float2 dfadd_f2_f2_f(Sleef_float2 x, float y) { // |x| >= |y| Sleef_float2 r; #ifndef NDEBUG if (!(checkfp(x.x) || checkfp(y) || fabsfk(x.x) >= fabsfk(y))) fprintf(stderr, "[dfadd_f2_f2_f : %g %g]", x.x, y); #endif r.x = x.x + y; r.y = x.x - r.x + y + x.y; return r; } static INLINE CONST Sleef_float2 dfadd_f2_f_f2(float x, Sleef_float2 y) { // |x| >= |y| Sleef_float2 r; #ifndef NDEBUG if (!(checkfp(x) || checkfp(y.x) || fabsfk(x) >= fabsfk(y.x))) { fprintf(stderr, "[dfadd_f2_f_f2 : %g %g]\n", x, y.x); fflush(stderr); } #endif r.x = x + y.x; r.y = x - r.x + y.x + y.y; return r; } static INLINE CONST Sleef_float2 dfadd2_f2_f2_f(Sleef_float2 x, float y) { // |x| >= |y| Sleef_float2 r; r.x = x.x + y; float v = r.x - x.x; r.y = (x.x - (r.x - v)) + (y - v); r.y += x.y; return r; } static INLINE CONST Sleef_float2 dfadd2_f2_f_f2(float x, Sleef_float2 y) { Sleef_float2 r; r.x = x + y.x; float v = r.x - x; r.y = (x - (r.x - v)) + (y.x - v) + y.y; return r; } static INLINE CONST Sleef_float2 dfadd_f2_f2_f2(Sleef_float2 x, Sleef_float2 y) { // |x| >= |y| Sleef_float2 r; #ifndef NDEBUG if (!(checkfp(x.x) || checkfp(y.x) || fabsfk(x.x) >= fabsfk(y.x))) fprintf(stderr, "[dfadd_f2_f2_f2 : %g %g]", x.x, y.x); #endif r.x = x.x + y.x; r.y = x.x - r.x + y.x + x.y + y.y; return r; } static INLINE CONST Sleef_float2 dfadd2_f2_f2_f2(Sleef_float2 x, Sleef_float2 y) { Sleef_float2 r; r.x = x.x + y.x; float v = r.x - x.x; r.y = (x.x - (r.x - v)) + (y.x - v); r.y += x.y + y.y; return r; } static INLINE CONST Sleef_float2 
dfsub_f2_f2_f2(Sleef_float2 x, Sleef_float2 y) { // |x| >= |y| Sleef_float2 r; #ifndef NDEBUG if (!(checkfp(x.x) || checkfp(y.x) || fabsfk(x.x) >= fabsfk(y.x))) fprintf(stderr, "[dfsub_f2_f2_f2 : %g %g]", x.x, y.x); #endif r.x = x.x - y.x; r.y = x.x - r.x - y.x + x.y - y.y; return r; } static INLINE CONST Sleef_float2 dfdiv_f2_f2_f2(Sleef_float2 n, Sleef_float2 d) { float t = 1.0f / d.x; float dh = upperf(d.x), dl = d.x - dh; float th = upperf(t ), tl = t - th; float nhh = upperf(n.x), nhl = n.x - nhh; Sleef_float2 q; q.x = n.x * t; float u = -q.x + nhh * th + nhh * tl + nhl * th + nhl * tl + q.x * (1 - dh * th - dh * tl - dl * th - dl * tl); q.y = t * (n.y - q.x * d.y) + u; return q; } static INLINE CONST Sleef_float2 dfmul_f2_f_f(float x, float y) { float xh = upperf(x), xl = x - xh; float yh = upperf(y), yl = y - yh; Sleef_float2 r; r.x = x * y; r.y = xh * yh - r.x + xl * yh + xh * yl + xl * yl; return r; } static INLINE CONST Sleef_float2 dfmul_f2_f2_f(Sleef_float2 x, float y) { float xh = upperf(x.x), xl = x.x - xh; float yh = upperf(y ), yl = y - yh; Sleef_float2 r; r.x = x.x * y; r.y = xh * yh - r.x + xl * yh + xh * yl + xl * yl + x.y * y; return r; } static INLINE CONST Sleef_float2 dfmul_f2_f2_f2(Sleef_float2 x, Sleef_float2 y) { float xh = upperf(x.x), xl = x.x - xh; float yh = upperf(y.x), yl = y.x - yh; Sleef_float2 r; r.x = x.x * y.x; r.y = xh * yh - r.x + xl * yh + xh * yl + xl * yl + x.x * y.y + x.y * y.x; return r; } static INLINE CONST float dfmul_f_f2_f2(Sleef_float2 x, Sleef_float2 y) { float xh = upperf(x.x), xl = x.x - xh; float yh = upperf(y.x), yl = y.x - yh; return x.y * yh + xh * y.y + xl * yl + xh * yl + xl * yh + xh * yh; } static INLINE CONST Sleef_float2 dfsqu_f2_f2(Sleef_float2 x) { float xh = upperf(x.x), xl = x.x - xh; Sleef_float2 r; r.x = x.x * x.x; r.y = xh * xh - r.x + (xh + xh) * xl + xl * xl + x.x * (x.y + x.y); return r; } static INLINE CONST float dfsqu_f_f2(Sleef_float2 x) { float xh = upperf(x.x), xl = x.x - xh; return xh * 
x.y + xh * x.y + xl * xl + (xh * xl + xh * xl) + xh * xh; } static INLINE CONST Sleef_float2 dfrec_f2_f(float d) { float t = 1.0f / d; float dh = upperf(d), dl = d - dh; float th = upperf(t), tl = t - th; Sleef_float2 q; q.x = t; q.y = t * (1 - dh * th - dh * tl - dl * th - dl * tl); return q; } static INLINE CONST Sleef_float2 dfrec_f2_f2(Sleef_float2 d) { float t = 1.0f / d.x; float dh = upperf(d.x), dl = d.x - dh; float th = upperf(t ), tl = t - th; Sleef_float2 q; q.x = t; q.y = t * (1 - dh * th - dh * tl - dl * th - dl * tl - d.y * t); return q; } static INLINE CONST Sleef_float2 dfsqrt_f2_f2(Sleef_float2 d) { float t = SQRTF(d.x + d.y); return dfscale_f2_f2_f(dfmul_f2_f2_f2(dfadd2_f2_f2_f2(d, dfmul_f2_f_f(t, t)), dfrec_f2_f(t)), 0.5f); } static INLINE CONST Sleef_float2 dfsqrt_f2_f(float d) { float t = SQRTF(d); return dfscale_f2_f2_f(dfmul_f2_f2_f2(dfadd2_f2_f_f2(d, dfmul_f2_f_f(t, t)), dfrec_f2_f(t)), 0.5); } // typedef struct { float d; int32_t i; } fi_t; typedef struct { Sleef_float2 df; int32_t i; } dfi_t; static CONST fi_t rempisubf(float x) { fi_t ret; float fr = x - (float)(INT64_C(1) << 10) * (int32_t)(x * (1.0f / (INT64_C(1) << 10))); ret.i = ((7 & ((x > 0 ? 4 : 3) + (int32_t)(fr * 8))) - 3) >> 1; fr = fr - 0.25f * (int32_t)(fr * 4 + mulsignf(0.5f, x)); fr = fabsfk(fr) > 0.125f ? (fr - mulsignf(0.5f, x)) : fr; fr = fabsfk(fr) > 1e+10f ? 0 : fr; if (fabsfk(x) == 0.12499999254941940308f) { fr = x; ret.i = 0; } ret.d = fr; return ret; } static CONST dfi_t rempif(float a) { Sleef_float2 x, y, z; fi_t di; float t; int ex = ilogb2kf(a) - 25, q = ex > (90 - 25) ? 
-64 : 0; a = ldexp3kf(a, q); if (ex < 0) ex = 0; ex *= 4; x = dfmul_f2_f_f(a, Sleef_rempitabsp[ex]); di = rempisubf(x.x); q = di.i; x.x = di.d; x = dfnormalize_f2_f2(x); y = dfmul_f2_f_f(a, Sleef_rempitabsp[ex+1]); x = dfadd2_f2_f2_f2(x, y); di = rempisubf(x.x); q += di.i; x.x = di.d; x = dfnormalize_f2_f2(x); y = dfmul_f2_f2_f(df(Sleef_rempitabsp[ex+2], Sleef_rempitabsp[ex+3]), a); x = dfadd2_f2_f2_f2(x, y); x = dfnormalize_f2_f2(x); x = dfmul_f2_f2_f2(x, df(3.1415927410125732422f*2, -8.7422776573475857731e-08f*2)); dfi_t ret = { fabsfk(a) < 0.7f ? df(a, 0) : x, q }; return ret; } EXPORT CONST float xsinf(float d) { int q; float u, s, t = d; if (fabsfk(d) < TRIGRANGEMAX2f) { q = (int)rintfk(d * (float)M_1_PI); d = mlaf(q, -PI_A2f, d); d = mlaf(q, -PI_B2f, d); d = mlaf(q, -PI_C2f, d); } else if (fabsfk(d) < TRIGRANGEMAXf) { q = (int)rintfk(d * (float)M_1_PI); d = mlaf(q, -PI_Af, d); d = mlaf(q, -PI_Bf, d); d = mlaf(q, -PI_Cf, d); d = mlaf(q, -PI_Df, d); } else { dfi_t dfi = rempif(t); q = ((dfi.i & 3) * 2 + (dfi.df.x > 0) + 1) >> 2; if ((dfi.i & 1) != 0) { dfi.df = dfadd2_f2_f2_f2(dfi.df, df(mulsignf(3.1415927410125732422f*-0.5, dfi.df.x), mulsignf(-8.7422776573475857731e-08f*-0.5, dfi.df.x))); } d = dfi.df.x + dfi.df.y; if (xisinff(t) || xisnanf(t)) d = SLEEF_NANf; } s = d * d; if ((q & 1) != 0) d = -d; u = 2.6083159809786593541503e-06f; u = mlaf(u, s, -0.0001981069071916863322258f); u = mlaf(u, s, 0.00833307858556509017944336f); u = mlaf(u, s, -0.166666597127914428710938f); u = mlaf(s, u * d, d); if (xisnegzerof(t)) u = -0.0f; return u; } EXPORT CONST float xsinf_u1(float d) { int q; float u; Sleef_float2 s, t, x; if (fabsfk(d) < TRIGRANGEMAX2f) { q = (int)rintfk(d * (float)M_1_PI); u = mlaf(q, -PI_A2f, d); s = dfadd2_f2_f_f(u, q * (-PI_B2f)); s = dfadd_f2_f2_f(s, q * (-PI_C2f)); } else { dfi_t dfi = rempif(d); q = ((dfi.i & 3) * 2 + (dfi.df.x > 0) + 1) >> 2; if ((dfi.i & 1) != 0) { dfi.df = dfadd2_f2_f2_f2(dfi.df, df(mulsignf(3.1415927410125732422f*-0.5, 
dfi.df.x), mulsignf(-8.7422776573475857731e-08f*-0.5, dfi.df.x))); } s = dfnormalize_f2_f2(dfi.df); if (xisinff(d) || xisnanf(d)) s.x = SLEEF_NANf; } t = s; s = dfsqu_f2_f2(s); u = 2.6083159809786593541503e-06f; u = mlaf(u, s.x, -0.0001981069071916863322258f); u = mlaf(u, s.x, 0.00833307858556509017944336f); x = dfadd_f2_f_f2(1, dfmul_f2_f2_f2(dfadd_f2_f_f(-0.166666597127914428710938f, u * s.x), s)); u = dfmul_f_f2_f2(t, x); if ((q & 1) != 0) u = -u; if (xisnegzerof(d)) u = d; return u; } EXPORT CONST float xcosf(float d) { int q; float u, s, t = d; if (fabsfk(d) < TRIGRANGEMAX2f) { q = 1 + 2*(int)rintfk(d * (float)M_1_PI - 0.5f); d = mlaf(q, -PI_A2f*0.5f, d); d = mlaf(q, -PI_B2f*0.5f, d); d = mlaf(q, -PI_C2f*0.5f, d); } else if (fabsfk(d) < TRIGRANGEMAXf) { q = 1 + 2*(int)rintfk(d * (float)M_1_PI - 0.5f); d = mlaf(q, -PI_Af*0.5f, d); d = mlaf(q, -PI_Bf*0.5f, d); d = mlaf(q, -PI_Cf*0.5f, d); d = mlaf(q, -PI_Df*0.5f, d); } else { dfi_t dfi = rempif(t); q = ((dfi.i & 3) * 2 + (dfi.df.x > 0) + 7) >> 1; if ((dfi.i & 1) == 0) { dfi.df = dfadd2_f2_f2_f2(dfi.df, df(mulsignf(3.1415927410125732422f*-0.5, dfi.df.x > 0 ? 1 : -1), mulsignf(-8.7422776573475857731e-08f*-0.5, dfi.df.x > 0 ? 
1 : -1))); } d = dfi.df.x + dfi.df.y; if (xisinff(t) || xisnanf(t)) d = SLEEF_NANf; } s = d * d; if ((q & 2) == 0) d = -d; u = 2.6083159809786593541503e-06f; u = mlaf(u, s, -0.0001981069071916863322258f); u = mlaf(u, s, 0.00833307858556509017944336f); u = mlaf(u, s, -0.166666597127914428710938f); u = mlaf(s, u * d, d); return u; } EXPORT CONST float xcosf_u1(float d) { float u; Sleef_float2 s, t, x; int q; if (fabsfk(d) < TRIGRANGEMAX2f) { d = fabsfk(d); float dq = mlaf(rintfk(d * (float)M_1_PI - 0.5f), 2, 1); q = (int)dq; s = dfadd2_f2_f_f (d, dq * (-PI_A2f*0.5f)); s = dfadd2_f2_f2_f(s, dq * (-PI_B2f*0.5f)); s = dfadd2_f2_f2_f(s, dq * (-PI_C2f*0.5f)); } else { dfi_t dfi = rempif(d); q = ((dfi.i & 3) * 2 + (dfi.df.x > 0) + 7) >> 1; if ((dfi.i & 1) == 0) { dfi.df = dfadd2_f2_f2_f2(dfi.df, df(mulsignf(3.1415927410125732422f*-0.5, dfi.df.x > 0 ? 1 : -1), mulsignf(-8.7422776573475857731e-08f*-0.5, dfi.df.x > 0 ? 1 : -1))); } s = dfnormalize_f2_f2(dfi.df); if (xisinff(d) || xisnanf(d)) s.x = SLEEF_NANf; } t = s; s = dfsqu_f2_f2(s); u = 2.6083159809786593541503e-06f; u = mlaf(u, s.x, -0.0001981069071916863322258f); u = mlaf(u, s.x, 0.00833307858556509017944336f); x = dfadd_f2_f_f2(1, dfmul_f2_f2_f2(dfadd_f2_f_f(-0.166666597127914428710938f, u * s.x), s)); u = dfmul_f_f2_f2(t, x); if ((((int)q) & 2) == 0) u = -u; return u; } EXPORT CONST float xfastsinf_u3500(float d) { int q; float u, s, t = d; q = rintfk(d * (float)M_1_PI); d = mlaf(q, -(float)M_PI, d); s = d * d; u = -0.1881748176e-3; u = mlaf(u, s, +0.8323502727e-2); u = mlaf(u, s, -0.1666651368e+0); u = mlaf(s * d, u, d); if ((q & 1) != 0) u = -u; if (UNLIKELY(fabsfk(t) > 30.0f)) return xsinf(t); return u; } EXPORT CONST float xfastcosf_u3500(float d) { int q; float u, s, t = d; q = rintfk(mlaf(d, (float)M_1_PI, -0.5f)); d = mlaf(q, -(float)M_PI, d - (float)M_PI*0.5f); s = d * d; u = -0.1881748176e-3; u = mlaf(u, s, +0.8323502727e-2); u = mlaf(u, s, -0.1666651368e+0); u = mlaf(s * d, u, d); if ((q & 1) == 0) u = -u; 
if (UNLIKELY(fabsfk(t) > 30.0f)) return xcosf(t); return u; } EXPORT CONST Sleef_float2 xsincosf(float d) { int q; float u, s, t; Sleef_float2 r; s = d; if (fabsfk(d) < TRIGRANGEMAX2f) { q = (int)rintfk(d * ((float)(2 * M_1_PI))); s = mlaf(q, -PI_A2f*0.5f, s); s = mlaf(q, -PI_B2f*0.5f, s); s = mlaf(q, -PI_C2f*0.5f, s); } else if (fabsfk(d) < TRIGRANGEMAXf) { q = (int)rintfk(d * ((float)(2 * M_1_PI))); s = mlaf(q, -PI_Af*0.5f, s); s = mlaf(q, -PI_Bf*0.5f, s); s = mlaf(q, -PI_Cf*0.5f, s); s = mlaf(q, -PI_Df*0.5f, s); } else { dfi_t dfi = rempif(d); q = dfi.i; s = dfi.df.x + dfi.df.y; if (xisinff(d) || xisnanf(d)) s = SLEEF_NANf; } t = s; s = s * s; u = -0.000195169282960705459117889f; u = mlaf(u, s, 0.00833215750753879547119141f); u = mlaf(u, s, -0.166666537523269653320312f); u = u * s * t; r.x = t + u; if (xisnegzerof(d)) r.x = -0.0f; u = -2.71811842367242206819355e-07f; u = mlaf(u, s, 2.47990446951007470488548e-05f); u = mlaf(u, s, -0.00138888787478208541870117f); u = mlaf(u, s, 0.0416666641831398010253906f); u = mlaf(u, s, -0.5f); r.y = u * s + 1; if ((q & 1) != 0) { s = r.y; r.y = r.x; r.x = s; } if ((q & 2) != 0) { r.x = -r.x; } if (((q+1) & 2) != 0) { r.y = -r.y; } return r; } EXPORT CONST Sleef_float2 xsincosf_u1(float d) { int q; float u; Sleef_float2 r, s, t, x; if (fabsfk(d) < TRIGRANGEMAX2f) { q = (int)rintfk(d * (float)(2 * M_1_PI)); u = mlaf(q, -PI_A2f*0.5f, d); s = dfadd2_f2_f_f(u, q * (-PI_B2f*0.5f)); s = dfadd_f2_f2_f(s, q * (-PI_C2f*0.5f)); } else { dfi_t dfi = rempif(d); q = dfi.i; s = dfi.df; if (xisinff(d) || xisnanf(d)) s.x = SLEEF_NANf; } t = s; s.x = dfsqu_f_f2(s); u = -0.000195169282960705459117889f; u = mlaf(u, s.x, 0.00833215750753879547119141f); u = mlaf(u, s.x, -0.166666537523269653320312f); u *= s.x * t.x; x = dfadd_f2_f2_f(t, u); r.x = x.x + x.y; if (xisnegzerof(d)) r.x = -0.0f; u = -2.71811842367242206819355e-07f; u = mlaf(u, s.x, 2.47990446951007470488548e-05f); u = mlaf(u, s.x, -0.00138888787478208541870117f); u = mlaf(u, s.x, 
0.0416666641831398010253906f); u = mlaf(u, s.x, -0.5f); x = dfadd_f2_f_f2(1, dfmul_f2_f_f(s.x, u)); r.y = x.x + x.y; if ((q & 1) != 0) { u = r.y; r.y = r.x; r.x = u; } if ((q & 2) != 0) { r.x = -r.x; } if (((q+1) & 2) != 0) { r.y = -r.y; } return r; } EXPORT CONST Sleef_float2 xsincospif_u05(float d) { float u, s, t; Sleef_float2 r, x, s2; u = d * 4; int q = ceilfk(u) & ~(int)1; s = u - (float)q; t = s; s = s * s; s2 = dfmul_f2_f_f(t, t); // u = +0.3093842054e-6; u = mlaf(u, s, -0.3657307388e-4); u = mlaf(u, s, +0.2490393585e-2); x = dfadd2_f2_f_f2(u * s, df(-0.080745510756969451904, -1.3373665339076936258e-09)); x = dfadd2_f2_f2_f2(dfmul_f2_f2_f2(s2, x), df(0.78539818525314331055, -2.1857338617566484855e-08)); x = dfmul_f2_f2_f(x, t); r.x = x.x + x.y; if (xisnegzerof(d)) r.x = -0.0f; u = -0.2430611801e-7; u = mlaf(u, s, +0.3590577080e-5); u = mlaf(u, s, -0.3259917721e-3); x = dfadd2_f2_f_f2(u * s, df(0.015854343771934509277, 4.4940051354032242811e-10)); x = dfadd2_f2_f2_f2(dfmul_f2_f2_f2(s2, x), df(-0.30842512845993041992, -9.0728339030733922277e-09)); x = dfadd2_f2_f2_f(dfmul_f2_f2_f2(x, s2), 1); r.y = x.x + x.y; if ((q & 2) != 0) { s = r.y; r.y = r.x; r.x = s; } if ((q & 4) != 0) { r.x = -r.x; } if (((q+2) & 4) != 0) { r.y = -r.y; } if (fabsfk(d) > 1e+7f) { r.x = 0; r.y = 1; } if (xisinff(d)) { r.x = r.y = SLEEF_NANf; } return r; } EXPORT CONST Sleef_float2 xsincospif_u35(float d) { float u, s, t; Sleef_float2 r; u = d * 4; int q = ceilfk(u) & ~(int)1; s = u - (float)q; t = s; s = s * s; // u = -0.3600925265e-4; u = mlaf(u, s, +0.2490088111e-2); u = mlaf(u, s, -0.8074551076e-1); u = mlaf(u, s, +0.7853981853e+0); r.x = u * t; u = +0.3539815225e-5; u = mlaf(u, s, -0.3259574005e-3); u = mlaf(u, s, +0.1585431583e-1); u = mlaf(u, s, -0.3084251285e+0); u = mlaf(u, s, 1); r.y = u; if ((q & 2) != 0) { s = r.y; r.y = r.x; r.x = s; } if ((q & 4) != 0) { r.x = -r.x; } if (((q+2) & 4) != 0) { r.y = -r.y; } if (fabsfk(d) > 1e+7f) { r.x = 0; r.y = 1; } if (xisinff(d)) { r.x = 
r.y = SLEEF_NANf; } return r; } EXPORT CONST float xtanf(float d) { int q; float u, s, x; x = d; if (fabsfk(d) < TRIGRANGEMAX2f*0.5f) { q = (int)rintfk(d * (float)(2 * M_1_PI)); x = mlaf(q, -PI_A2f*0.5f, x); x = mlaf(q, -PI_B2f*0.5f, x); x = mlaf(q, -PI_C2f*0.5f, x); } else if (fabsfk(d) < TRIGRANGEMAXf) { q = (int)rintfk(d * (float)(2 * M_1_PI)); x = mlaf(q, -PI_Af*0.5f, x); x = mlaf(q, -PI_Bf*0.5f, x); x = mlaf(q, -PI_Cf*0.5f, x); x = mlaf(q, -PI_Df*0.5f, x); } else { dfi_t dfi = rempif(d); q = dfi.i; x = dfi.df.x + dfi.df.y; if (xisinff(d) || xisnanf(d)) x = SLEEF_NANf; } s = x * x; if ((q & 1) != 0) x = -x; float s2 = s * s, s4 = s2 * s2; u = POLY6(s, s2, s4, 0.00927245803177356719970703f, 0.00331984995864331722259521f, 0.0242998078465461730957031f, 0.0534495301544666290283203f, 0.133383005857467651367188f, 0.333331853151321411132812f); u = mlaf(s, u * x, x); if ((q & 1) != 0) u = 1.0f / u; return u; } EXPORT CONST float xtanf_u1(float d) { int q; float u; Sleef_float2 s, t, x; if (fabsfk(d) < TRIGRANGEMAX2f) { q = (int)rintfk(d * (float)(2 * M_1_PI)); u = mlaf(q, -PI_A2f*0.5f, d); s = dfadd2_f2_f_f(u, q * (-PI_B2f*0.5f)); s = dfadd_f2_f2_f(s, q * (-PI_C2f*0.5f)); } else { dfi_t dfi = rempif(d); q = dfi.i; s = dfi.df; if (xisinff(d) || xisnanf(d)) s.x = SLEEF_NANf; } if ((q & 1) != 0) s = dfneg_f2_f2(s); t = s; s = dfsqu_f2_f2(s); s = dfnormalize_f2_f2(s); u = 0.00446636462584137916564941f; u = mlaf(u, s.x, -8.3920182078145444393158e-05f); u = mlaf(u, s.x, 0.0109639242291450500488281f); u = mlaf(u, s.x, 0.0212360303848981857299805f); u = mlaf(u, s.x, 0.0540687143802642822265625f); x = dfadd_f2_f_f(0.133325666189193725585938f, u * s.x); x = dfadd_f2_f_f2(1, dfmul_f2_f2_f2(dfadd_f2_f_f2(0.33333361148834228515625f, dfmul_f2_f2_f2(s, x)), s)); x = dfmul_f2_f2_f2(t, x); if ((q & 1) != 0) x = dfrec_f2_f2(x); u = x.x + x.y; if (xisnegzerof(d)) u = -0.0f; return u; } EXPORT CONST float xatanf(float s) { float t, u; int q = 0; if (signf(s) == -1) { s = -s; q = 2; } if 
(s > 1) { s = 1.0f / s; q |= 1; } // s > 1: use atan(s) = pi/2 - atan(1/s)
  t = s * s;
  float t2 = t * t, t4 = t2 * t2;
  u = POLY8(t, t2, t4,
            0.00282363896258175373077393f,
            -0.0159569028764963150024414f,
            0.0425049886107444763183594f,
            -0.0748900920152664184570312f,
            0.106347933411598205566406f,
            -0.142027363181114196777344f,
            0.199926957488059997558594f,
            -0.333331018686294555664062f);
  t = s + s * (t * u);
  if ((q & 1) != 0) t = 1.570796326794896557998982f - t; // undo the inversion
  if ((q & 2) != 0) t = -t;                              // undo the sign fold
  return t;
}

// Helper for atan2: atan(y/x) folded into the proper region. q counts quarter
// turns accumulated by the folds; the result is q*(pi/2) + atan of the folded
// ratio.
static INLINE CONST float atan2kf(float y, float x) {
  float s, t, u;
  int q = 0;
  if (x < 0) { x = -x; q = -2; }
  if (y > x) { t = x; x = y; y = -t; q += 1; }
  s = y / x;
  t = s * s;
  float t2 = t * t, t4 = t2 * t2;
  u = POLY8(t, t2, t4,
            0.00282363896258175373077393f,
            -0.0159569028764963150024414f,
            0.0425049886107444763183594f,
            -0.0748900920152664184570312f,
            0.106347933411598205566406f,
            -0.142027363181114196777344f,
            0.199926957488059997558594f,
            -0.333331018686294555664062f);
  t = u * t * s + s;
  t = q * (float)(M_PI/2) + t;
  return t;
}

// atan2, fast variant; infinities, zeros and NaN are patched up after the
// polynomial evaluation.
EXPORT CONST float xatan2f(float y, float x) {
  float r = atan2kf(fabsfk(y), x);
  r = mulsignf(r, x);
  if (xisinff(x) || x == 0) r = M_PIf/2 - (xisinff(x) ? (signf(x) * (float)(M_PI /2)) : 0);
  if (xisinff(y) ) r = M_PIf/2 - (xisinff(x) ? (signf(x) * (float)(M_PI*1/4)) : 0);
  if ( y == 0) r = (signf(x) == -1 ? M_PIf : 0);
  return xisnanf(x) || xisnanf(y) ? SLEEF_NANf : mulsignf(r, y);
}

// Arc sine, fast variant. For |d| < 0.5 the series is evaluated directly in d;
// otherwise via asin(d) = pi/2 - 2*asin(sqrt((1-|d|)/2)) so the polynomial
// argument stays small.
EXPORT CONST float xasinf(float d) {
  int o = fabsfk(d) < 0.5f;
  float x2 = o ? (d*d) : ((1-fabsfk(d))*0.5f), x = o ? fabsfk(d) : SQRTF(x2), u;
  u = +0.4197454825e-1;
  u = mlaf(u, x2, +0.2424046025e-1);
  u = mlaf(u, x2, +0.4547423869e-1);
  u = mlaf(u, x2, +0.7495029271e-1);
  u = mlaf(u, x2, +0.1666677296e+0);
  u = mlaf(u, x * x2, x);
  float r = o ? u : (M_PIf/2 - 2*u);
  r = mulsignf(r, d); // asin is odd
  return r;
}

// Arc cosine, fast variant (same |d| < 0.5 range split as xasinf).
EXPORT CONST float xacosf(float d) {
  int o = fabsfk(d) < 0.5f;
  float x2 = o ? (d*d) : ((1-fabsfk(d))*0.5f), u;
  float x = o ? fabsfk(d) : SQRTF(x2);
  x = fabsfk(d) == 1.0 ? // ternary continues on the next source line
0 : x; u = +0.4197454825e-1; u = mlaf(u, x2, +0.2424046025e-1); u = mlaf(u, x2, +0.4547423869e-1); u = mlaf(u, x2, +0.7495029271e-1); u = mlaf(u, x2, +0.1666677296e+0); u *= x * x2; float y = 3.1415926535897932f/2 - (mulsignf(x, d) + mulsignf(u, d)); x += u; float r = o ? y : (x*2); if (!o && d < 0) r = dfadd_f2_f2_f(df(3.1415927410125732422f,-8.7422776573475857731e-08f), -r).x; return r; } static Sleef_float2 atan2kf_u1(Sleef_float2 y, Sleef_float2 x) { float u; Sleef_float2 s, t; int q = 0; if (x.x < 0) { x.x = -x.x; x.y = -x.y; q = -2; } if (y.x > x.x) { t = x; x = y; y.x = -t.x; y.y = -t.y; q += 1; } s = dfdiv_f2_f2_f2(y, x); t = dfsqu_f2_f2(s); t = dfnormalize_f2_f2(t); u = -0.00176397908944636583328247f; u = mlaf(u, t.x, 0.0107900900766253471374512f); u = mlaf(u, t.x, -0.0309564601629972457885742f); u = mlaf(u, t.x, 0.0577365085482597351074219f); u = mlaf(u, t.x, -0.0838950723409652709960938f); u = mlaf(u, t.x, 0.109463557600975036621094f); u = mlaf(u, t.x, -0.142626821994781494140625f); u = mlaf(u, t.x, 0.199983194470405578613281f); t = dfmul_f2_f2_f2(t, dfadd_f2_f_f(-0.333332866430282592773438f, u * t.x)); t = dfmul_f2_f2_f2(s, dfadd_f2_f_f2(1, t)); t = dfadd2_f2_f2_f2(dfmul_f2_f2_f(df(1.5707963705062866211f, -4.3711388286737928865e-08f), q), t); return t; } EXPORT CONST float xatan2f_u1(float y, float x) { if (fabsfk(x) < 2.9387372783541830947e-39f) { y *= (UINT64_C(1) << 24); x *= (UINT64_C(1) << 24); } // nexttowardf((1.0 / FLT_MAX), 1) Sleef_float2 d = atan2kf_u1(df(fabsfk(y), 0), df(x, 0)); float r = d.x + d.y; r = mulsignf(r, x); if (xisinff(x) || x == 0) r = (float)M_PI/2 - (xisinff(x) ? (signf(x) * (float)(M_PI /2)) : 0.0f); if (xisinff(y) ) r = (float)M_PI/2 - (xisinff(x) ? (signf(x) * (float)(M_PI*1/4)) : 0.0f); if ( y == 0) r = (signf(x) == -1 ? (float)M_PI : 0.0f); return xisnanf(x) || xisnanf(y) ? SLEEF_NANf : mulsignf(r, y); } EXPORT CONST float xasinf_u1(float d) { int o = fabsfk(d) < 0.5f; float x2 = o ? 
(d*d) : ((1-fabsfk(d))*0.5f), u; Sleef_float2 x = o ? df(fabsfk(d), 0) : dfsqrt_f2_f(x2); x = fabsfk(d) == 1.0f ? df(0, 0) : x; u = +0.4197454825e-1; u = mlaf(u, x2, +0.2424046025e-1); u = mlaf(u, x2, +0.4547423869e-1); u = mlaf(u, x2, +0.7495029271e-1); u = mlaf(u, x2, +0.1666677296e+0); u *= x2 * x.x; Sleef_float2 y = dfadd_f2_f2_f(dfsub_f2_f2_f2(df(3.1415927410125732422f/4,-8.7422776573475857731e-08f/4), x), -u); float r = o ? (u + x.x) : ((y.x + y.y)*2); r = mulsignf(r, d); return r; } EXPORT CONST float xacosf_u1(float d) { int o = fabsfk(d) < 0.5f; float x2 = o ? (d*d) : ((1-fabsfk(d))*0.5f), u; Sleef_float2 x = o ? df(fabsfk(d), 0) : dfsqrt_f2_f(x2); x = fabsfk(d) == 1.0 ? df(0, 0) : x; u = +0.4197454825e-1; u = mlaf(u, x2, +0.2424046025e-1); u = mlaf(u, x2, +0.4547423869e-1); u = mlaf(u, x2, +0.7495029271e-1); u = mlaf(u, x2, +0.1666677296e+0); u = u * x.x * x2; Sleef_float2 y = dfsub_f2_f2_f2(df(3.1415927410125732422f/2,-8.7422776573475857731e-08f/2), dfadd_f2_f_f(mulsignf(x.x, d), mulsignf(u, d))); x = dfadd_f2_f2_f(x, u); y = o ? 
y : dfscale_f2_f2_f(x, 2); if (!o && d < 0) y = dfsub_f2_f2_f2(df(3.1415927410125732422f,-8.7422776573475857731e-08f), y); return y.x + y.y; } EXPORT CONST float xatanf_u1(float d) { Sleef_float2 d2 = atan2kf_u1(df(fabsfk(d), 0.0f), df(1.0f, 0.0f)); float r = d2.x + d2.y; if (xisinff(d)) r = 1.570796326794896557998982f; return mulsignf(r, d); } EXPORT CONST float xlogf(float d) { float x, x2, t, m; int e; int o = d < FLT_MIN; if (o) d *= (float)(INT64_C(1) << 32) * (float)(INT64_C(1) << 32); e = ilogb2kf(d * (1.0f/0.75f)); m = ldexp3kf(d, -e); if (o) e -= 64; x = (m-1.0f) / (m+1.0f); x2 = x * x; t = 0.2392828464508056640625f; t = mlaf(t, x2, 0.28518211841583251953125f); t = mlaf(t, x2, 0.400005877017974853515625f); t = mlaf(t, x2, 0.666666686534881591796875f); t = mlaf(t, x2, 2.0f); x = x * t + 0.693147180559945286226764f * e; if (xisinff(d)) x = SLEEF_INFINITYf; if (d < 0 || xisnanf(d)) x = SLEEF_NANf; if (d == 0) x = -SLEEF_INFINITYf; return x; } EXPORT CONST float xexpf(float d) { int q = (int)rintfk(d * R_LN2f); float s, u; s = mlaf(q, -L2Uf, d); s = mlaf(q, -L2Lf, s); u = 0.000198527617612853646278381; u = mlaf(u, s, 0.00139304355252534151077271); u = mlaf(u, s, 0.00833336077630519866943359); u = mlaf(u, s, 0.0416664853692054748535156); u = mlaf(u, s, 0.166666671633720397949219); u = mlaf(u, s, 0.5); u = s * s * u + s + 1.0f; u = ldexp2kf(u, q); if (d < -104) u = 0; if (d > 104) u = SLEEF_INFINITYf; return u; } static INLINE CONST float expkf(Sleef_float2 d) { int q = (int)rintfk((d.x + d.y) * R_LN2f); Sleef_float2 s, t; float u; s = dfadd2_f2_f2_f(d, q * -L2Uf); s = dfadd2_f2_f2_f(s, q * -L2Lf); s = dfnormalize_f2_f2(s); u = 0.00136324646882712841033936f; u = mlaf(u, s.x, 0.00836596917361021041870117f); u = mlaf(u, s.x, 0.0416710823774337768554688f); u = mlaf(u, s.x, 0.166665524244308471679688f); u = mlaf(u, s.x, 0.499999850988388061523438f); t = dfadd_f2_f2_f2(s, dfmul_f2_f2_f(dfsqu_f2_f2(s), u)); t = dfadd_f2_f_f2(1, t); u = ldexpkf(t.x + t.y, q); if (d.x < 
-104) u = 0; return u; } static INLINE CONST float expm1kf(float d) { int q = (int)rintfk(d * R_LN2f); float s, u; s = mlaf(q, -L2Uf, d); s = mlaf(q, -L2Lf, s); float s2 = s * s, s4 = s2 * s2; u = POLY6(s, s2, s4, 0.000198527617612853646278381, 0.00139304355252534151077271, 0.00833336077630519866943359, 0.0416664853692054748535156, 0.166666671633720397949219, 0.5); u = s * s * u + s; if (q != 0) u = ldexp2kf(u + 1, q) - 1; return u; } static INLINE CONST Sleef_float2 logkf(float d) { Sleef_float2 x, x2, s; float m, t; int e; int o = d < FLT_MIN; if (o) d *= (float)(INT64_C(1) << 32) * (float)(INT64_C(1) << 32); e = ilogb2kf(d * (1.0f/0.75f)); m = ldexp3kf(d, -e); if (o) e -= 64; x = dfdiv_f2_f2_f2(dfadd2_f2_f_f(-1, m), dfadd2_f2_f_f(1, m)); x2 = dfsqu_f2_f2(x); t = 0.240320354700088500976562; t = mlaf(t, x2.x, 0.285112679004669189453125); t = mlaf(t, x2.x, 0.400007992982864379882812); Sleef_float2 c = df(0.66666662693023681640625f, 3.69183861259614332084311e-09f); s = dfmul_f2_f2_f(df(0.69314718246459960938f, -1.904654323148236017e-09f), e); s = dfadd_f2_f2_f2(s, dfscale_f2_f2_f(x, 2)); s = dfadd_f2_f2_f2(s, dfmul_f2_f2_f2(dfmul_f2_f2_f2(x2, x), dfadd2_f2_f2_f2(dfmul_f2_f2_f(x2, t), c))); return s; } EXPORT CONST float xlogf_u1(float d) { Sleef_float2 x, s; float m, t, x2; int e; int o = d < FLT_MIN; if (o) d *= (float)(INT64_C(1) << 32) * (float)(INT64_C(1) << 32); e = ilogb2kf(d * (1.0f/0.75f)); m = ldexp3kf(d, -e); if (o) e -= 64; x = dfdiv_f2_f2_f2(dfadd2_f2_f_f(-1, m), dfadd2_f2_f_f(1, m)); x2 = x.x * x.x; t = +0.3027294874e+0f; t = mlaf(t, x2, +0.3996108174e+0f); t = mlaf(t, x2, +0.6666694880e+0f); s = dfmul_f2_f2_f(df(0.69314718246459960938f, -1.904654323148236017e-09f), (float)e); s = dfadd_f2_f2_f2(s, dfscale_f2_f2_f(x, 2)); s = dfadd_f2_f2_f(s, x2 * x.x * t); float r = s.x + s.y; if (xisinff(d)) r = SLEEF_INFINITYf; if (d < 0 || xisnanf(d)) r = SLEEF_NANf; if (d == 0) r = -SLEEF_INFINITYf; return r; } static INLINE CONST Sleef_float2 expk2f(Sleef_float2 
d) { int q = (int)rintfk((d.x + d.y) * R_LN2f); Sleef_float2 s, t; float u; s = dfadd2_f2_f2_f(d, q * -L2Uf); s = dfadd2_f2_f2_f(s, q * -L2Lf); u = +0.1980960224e-3f; u = mlaf(u, s.x, +0.1394256484e-2f); u = mlaf(u, s.x, +0.8333456703e-2f); u = mlaf(u, s.x, +0.4166637361e-1f); t = dfadd2_f2_f2_f(dfmul_f2_f2_f(s, u), +0.166666659414234244790680580464e+0f); t = dfadd2_f2_f2_f(dfmul_f2_f2_f2(s, t), 0.5); t = dfadd2_f2_f2_f2(s, dfmul_f2_f2_f2(dfsqu_f2_f2(s), t)); t = dfadd2_f2_f_f2(1, t); t.x = ldexp2kf(t.x, q); t.y = ldexp2kf(t.y, q); return d.x < -104 ? df(0, 0) : t; } EXPORT CONST float xpowf(float x, float y) { int yisint = (y == (int)y) || (fabsfk(y) >= (float)(INT64_C(1) << 24)); int yisodd = (1 & (int)y) != 0 && yisint && fabsfk(y) < (float)(INT64_C(1) << 24); float result = expkf(dfmul_f2_f2_f(logkf(fabsfk(x)), y)); result = xisnanf(result) ? SLEEF_INFINITYf : result; result *= (x >= 0 ? 1 : (!yisint ? SLEEF_NANf : (yisodd ? -1 : 1))); float efx = mulsignf(fabsfk(x) - 1, y); if (xisinff(y)) result = efx < 0 ? 0.0f : (efx == 0 ? 1.0f : SLEEF_INFINITYf); if (xisinff(x) || x == 0) result = (yisodd ? signf(x) : 1) * ((x == 0 ? -y : y) < 0 ? 
0 : SLEEF_INFINITYf); if (xisnanf(x) || xisnanf(y)) result = SLEEF_NANf; if (y == 0 || x == 1) result = 1; return result; } static INLINE CONST float logk3f(float d) { float x, x2, t, m; int e; int o = d < FLT_MIN; if (o) d *= (float)(INT64_C(1) << 32) * (float)(INT64_C(1) << 32); e = ilogb2kf(d * (1.0f/0.75f)); m = ldexp3kf(d, -e); if (o) e -= 64; x = (m-1) / (m+1); x2 = x * x; t = 0.2392828464508056640625f; t = mlaf(t, x2, 0.28518211841583251953125f); t = mlaf(t, x2, 0.400005877017974853515625f); t = mlaf(t, x2, 0.666666686534881591796875f); t = mlaf(t, x2, 2.0f); x = mlaf(x, t, 0.693147180559945286226764f * e); return x; } static INLINE CONST float expk3f(float d) { int q = (int)rintfk(d * R_LN2f); float s, u; s = mlaf(q, -L2Uf, d); s = mlaf(q, -L2Lf, s); u = 0.000198527617612853646278381; u = mlaf(u, s, 0.00139304355252534151077271); u = mlaf(u, s, 0.00833336077630519866943359); u = mlaf(u, s, 0.0416664853692054748535156); u = mlaf(u, s, 0.166666671633720397949219); u = mlaf(u, s, 0.5); u = mlaf(s * s, u, s + 1.0f); u = ldexpkf(u, q); if (d < -104) u = 0; return u; } EXPORT CONST float xfastpowf_u3500(float x, float y) { float result = expk3f(logk3f(fabsfk(x)) * y); int yisint = (y == (int)y) || (fabsfk(y) >= (float)(INT64_C(1) << 24)); int yisodd = (1 & (int)y) != 0 && yisint && fabsfk(y) < (float)(INT64_C(1) << 24); result *= (x < 0 && yisodd) ? -1 : 1; if (x == 0) result = 0; if (y == 0) result = 1; return result; } EXPORT CONST float xsinhf(float x) { float y = fabsfk(x); Sleef_float2 d = expk2f(df(y, 0)); d = dfsub_f2_f2_f2(d, dfrec_f2_f2(d)); y = (d.x + d.y) * 0.5f; y = fabsfk(x) > 89 ? SLEEF_INFINITYf : y; y = xisnanf(y) ? SLEEF_INFINITYf : y; y = mulsignf(y, x); y = xisnanf(x) ? SLEEF_NANf : y; return y; } EXPORT CONST float xcoshf(float x) { float y = fabsfk(x); Sleef_float2 d = expk2f(df(y, 0)); d = dfadd_f2_f2_f2(d, dfrec_f2_f2(d)); y = (d.x + d.y) * 0.5f; y = fabsfk(x) > 89 ? SLEEF_INFINITYf : y; y = xisnanf(y) ? 
SLEEF_INFINITYf : y; // tail of xcoshf's NaN guard (ternary begun on the previous source line)
  y = xisnanf(x) ? SLEEF_NANf : y;
  return y;
}

// Hyperbolic tangent, high-accuracy variant:
// tanh(x) = (e - 1/e) / (e + 1/e) with e = exp(|x|) in double-float.
EXPORT CONST float xtanhf(float x) {
  float y = fabsfk(x);
  Sleef_float2 d = expk2f(df(y, 0));
  Sleef_float2 e = dfrec_f2_f2(d);
  d = dfdiv_f2_f2_f2(dfsub_f2_f2_f2(d, e), dfadd_f2_f2_f2(d, e));
  y = d.x + d.y;
  y = fabsfk(x) > 18.714973875f ? 1.0f : y; // saturates to 1 beyond this threshold
  y = xisnanf(y) ? 1.0f : y;
  y = mulsignf(y, x); // tanh is odd
  y = xisnanf(x) ? SLEEF_NANf : y;
  return y;
}

// Hyperbolic sine, fast variant, via expm1 for accuracy near zero:
// sinh(x) = (e+2)/(e+1) * e/2 with e = expm1(|x|).
EXPORT CONST float xsinhf_u35(float x) {
  float e = expm1kf(fabsfk(x));
  float y = (e + 2) / (e + 1) * (0.5f * e);
  y = fabsfk(x) > 88 ? SLEEF_INFINITYf : y;
  y = xisnanf(y) ? SLEEF_INFINITYf : y;
  y = mulsignf(y, x); // sinh is odd
  y = xisnanf(x) ? SLEEF_NANf : y;
  return y;
}

// Hyperbolic cosine, fast variant: cosh(x) = (e + 1/e)/2 with e = exp(|x|).
EXPORT CONST float xcoshf_u35(float x) {
  float e = xexpf(fabsfk(x));
  float y = 0.5f * e + 0.5f / e;
  y = fabsfk(x) > 88 ? SLEEF_INFINITYf : y;
  y = xisnanf(y) ? SLEEF_INFINITYf : y;
  y = xisnanf(x) ? SLEEF_NANf : y;
  return y;
}

// Hyperbolic tangent, fast variant: tanh(x) = d/(d+2) with d = expm1(2|x|).
EXPORT CONST float xtanhf_u35(float x) {
  float y = fabsfk(x);
  float d = expm1kf(2*y);
  y = d / (d + 2);
  y = fabsfk(x) > 18.714973875f ? 1.0f : y;
  y = xisnanf(y) ? 1.0f : y;
  y = mulsignf(y, x);
  y = xisnanf(x) ? SLEEF_NANf : y;
  return y;
}

// Double-float natural log of a double-float argument; used by the inverse
// hyperbolic functions. log(m) is evaluated via the atanh form
// 2*atanh((m-1)/(m+1)) after scaling m near 1.
static INLINE CONST Sleef_float2 logk2f(Sleef_float2 d) {
  Sleef_float2 x, x2, m, s;
  float t;
  int e;
  e = ilogbkf(d.x * (1.0f/0.75f)); // center the mantissa around 1
  m = dfscale_f2_f2_f(d, pow2if(-e));
  x = dfdiv_f2_f2_f2(dfadd2_f2_f2_f(m, -1), dfadd2_f2_f2_f(m, 1));
  x2 = dfsqu_f2_f2(x);
  t = 0.2392828464508056640625f;
  t = mlaf(t, x2.x, 0.28518211841583251953125f);
  t = mlaf(t, x2.x, 0.400005877017974853515625f);
  t = mlaf(t, x2.x, 0.666666686534881591796875f);
  // e * log(2) carried as a double-float constant.
  s = dfmul_f2_f2_f(df(0.69314718246459960938f, -1.904654323148236017e-09f), e);
  s = dfadd_f2_f2_f2(s, dfscale_f2_f2_f(x, 2));
  s = dfadd_f2_f2_f2(s, dfmul_f2_f2_f(dfmul_f2_f2_f2(x2, x), t));
  return s;
}

// Inverse hyperbolic sine: asinh(x) = log(x + sqrt(x^2 + 1)) in double-float;
// for |x| > 1 the computation works with 1/x to avoid overflow in x^2.
EXPORT CONST float xasinhf(float x) {
  float y = fabsfk(x);
  Sleef_float2 d;
  d = y > 1 ? dfrec_f2_f(x) : df(y, 0);
  d = dfsqrt_f2_f2(dfadd2_f2_f2_f(dfsqu_f2_f2(d), 1));
  d = y > 1 ? // ternary continues on the next source line
dfmul_f2_f2_f(d, y) : d; d = logk2f(dfnormalize_f2_f2(dfadd_f2_f2_f(d, x))); y = d.x + d.y; y = (fabsfk(x) > SQRT_FLT_MAX || xisnanf(y)) ? mulsignf(SLEEF_INFINITYf, x) : y; y = xisnanf(x) ? SLEEF_NANf : y; y = xisnegzerof(x) ? -0.0f : y; return y; } EXPORT CONST float xacoshf(float x) { Sleef_float2 d = logk2f(dfadd2_f2_f2_f(dfmul_f2_f2_f2(dfsqrt_f2_f2(dfadd2_f2_f_f(x, 1)), dfsqrt_f2_f2(dfadd2_f2_f_f(x, -1))), x)); float y = d.x + d.y; y = (x > SQRT_FLT_MAX || xisnanf(y)) ? SLEEF_INFINITYf : y; y = x == 1.0f ? 0.0f : y; y = x < 1.0f ? SLEEF_NANf : y; y = xisnanf(x) ? SLEEF_NANf : y; return y; } EXPORT CONST float xatanhf(float x) { float y = fabsfk(x); Sleef_float2 d = logk2f(dfdiv_f2_f2_f2(dfadd2_f2_f_f(1, y), dfadd2_f2_f_f(1, -y))); y = y > 1.0f ? SLEEF_NANf : (y == 1.0f ? SLEEF_INFINITYf : (d.x + d.y) * 0.5f); y = xisinff(x) || xisnanf(y) ? SLEEF_NANf : y; y = mulsignf(y, x); y = xisnanf(x) ? SLEEF_NANf : y; return y; } EXPORT CONST float xexp2f(float d) { int q = (int)rintfk(d); float s, u; s = d - q; u = +0.1535920892e-3; u = mlaf(u, s, +0.1339262701e-2); u = mlaf(u, s, +0.9618384764e-2); u = mlaf(u, s, +0.5550347269e-1); u = mlaf(u, s, +0.2402264476e+0); u = mlaf(u, s, +0.6931471825e+0); u = dfnormalize_f2_f2(dfadd_f2_f_f2(1, dfmul_f2_f_f(u, s))).x; u = ldexp2kf(u, q); if (d >= 128) u = SLEEF_INFINITYf; if (d < -150) u = 0; return u; } EXPORT CONST float xexp2f_u35(float d) { int q = (int)rintfk(d); float s, u; s = d - q; u = +0.1535920892e-3; u = mlaf(u, s, +0.1339262701e-2); u = mlaf(u, s, +0.9618384764e-2); u = mlaf(u, s, +0.5550347269e-1); u = mlaf(u, s, +0.2402264476e+0); u = mlaf(u, s, +0.6931471825e+0); u = mlaf(u, s, +0.1000000000e+1); u = ldexp2kf(u, q); if (d >= 128) u = SLEEF_INFINITYf; if (d < -150) u = 0; return u; } EXPORT CONST float xexp10f(float d) { int q = (int)rintfk(d * (float)LOG10_2); float s, u; s = mlaf(q, -L10Uf, d); s = mlaf(q, -L10Lf, s); u = +0.6802555919e-1; u = mlaf(u, s, +0.2078080326e+0); u = mlaf(u, s, +0.5393903852e+0); u = 
mlaf(u, s, +0.1171245337e+1); u = mlaf(u, s, +0.2034678698e+1); u = mlaf(u, s, +0.2650949001e+1); Sleef_float2 x = dfadd_f2_f2_f(df(2.3025851249694824219, -3.1705172516493593157e-08), u * s); u = dfnormalize_f2_f2(dfadd_f2_f_f2(1, dfmul_f2_f2_f(x, s))).x; u = ldexp2kf(u, q); if (d > 38.5318394191036238941387f) u = SLEEF_INFINITYf; // log10(FLT_MAX) if (d < -50) u = 0; return u; } EXPORT CONST float xexp10f_u35(float d) { int q = (int)rintfk(d * (float)LOG10_2); float s, u; s = mlaf(q, -L10Uf, d); s = mlaf(q, -L10Lf, s); u = +0.2064004987e+0; u = mlaf(u, s, +0.5417877436e+0); u = mlaf(u, s, +0.1171286821e+1); u = mlaf(u, s, +0.2034656048e+1); u = mlaf(u, s, +0.2650948763e+1); u = mlaf(u, s, +0.2302585125e+1); u = mlaf(u, s, +0.1000000000e+1); u = ldexp2kf(u, q); if (d > 38.5318394191036238941387f) u = SLEEF_INFINITYf; // log10(FLT_MAX) if (d < -50) u = 0; return u; } EXPORT CONST float xexpm1f(float a) { Sleef_float2 d = dfadd2_f2_f2_f(expk2f(df(a, 0)), -1.0f); float x = d.x + d.y; if (a > 88.72283172607421875f) x = SLEEF_INFINITYf; if (a < -16.635532333438687426013570f) x = -1; if (xisnegzerof(a)) x = -0.0f; return x; } EXPORT CONST float xlog10f(float d) { Sleef_float2 x, s; float m, t, x2; int e; int o = d < FLT_MIN; if (o) d *= (float)(INT64_C(1) << 32) * (float)(INT64_C(1) << 32); e = ilogb2kf(d * (1.0f/0.75f)); m = ldexp3kf(d, -e); if (o) e -= 64; x = dfdiv_f2_f2_f2(dfadd2_f2_f_f(-1, m), dfadd2_f2_f_f(1, m)); x2 = x.x * x.x; t = +0.1314289868e+0; t = mlaf(t, x2, +0.1735493541e+0); t = mlaf(t, x2, +0.2895309627e+0); s = dfmul_f2_f2_f(df(0.30103001, -1.432098889e-08), (float)e); s = dfadd_f2_f2_f2(s, dfmul_f2_f2_f2(x, df(0.868588984, -2.170757285e-08))); s = dfadd_f2_f2_f(s, x2 * x.x * t); float r = s.x + s.y; if (xisinff(d)) r = SLEEF_INFINITYf; if (d < 0 || xisnanf(d)) r = SLEEF_NANf; if (d == 0) r = -SLEEF_INFINITYf; return r; } EXPORT CONST float xlog2f(float d) { Sleef_float2 x, s; float m, t, x2; int e; int o = d < FLT_MIN; if (o) d *= (float)(INT64_C(1) 
<< 32) * (float)(INT64_C(1) << 32); e = ilogb2kf(d * (1.0f/0.75f)); m = ldexp3kf(d, -e); if (o) e -= 64; x = dfdiv_f2_f2_f2(dfadd2_f2_f_f(-1, m), dfadd2_f2_f_f(1, m)); x2 = x.x * x.x; t = +0.4374550283e+0f; t = mlaf(t, x2, +0.5764790177e+0f); t = mlaf(t, x2, +0.9618012905120f); s = dfadd2_f2_f_f2(e, dfmul_f2_f2_f2(x, df(2.8853900432586669922, 3.2734474483568488616e-08))); s = dfadd2_f2_f2_f(s, x2 * x.x * t); float r = s.x + s.y; if (xisinff(d)) r = SLEEF_INFINITYf; if (d < 0 || xisnanf(d)) r = SLEEF_NANf; if (d == 0) r = -SLEEF_INFINITYf; return r; } EXPORT CONST float xlog2f_u35(float d) { float m, t, x, x2; int e; int o = d < FLT_MIN; if (o) d *= (float)(INT64_C(1) << 32) * (float)(INT64_C(1) << 32); e = ilogb2kf(d * (1.0f/0.75f)); m = ldexp3kf(d, -e); if (o) e -= 64; x = (m - 1) / (m + 1); x2 = x * x; t = +0.4374088347e+0; t = mlaf(t, x2, +0.5764843822e+0); t = mlaf(t, x2, +0.9618024230e+0); float r = mlaf(x2 * x, t, mlaf(x, +0.2885390043e+1, e)); if (xisinff(d)) r = SLEEF_INFINITYf; if (d < 0 || xisnanf(d)) r = SLEEF_NANf; if (d == 0) r = -SLEEF_INFINITYf; return r; } EXPORT CONST float xlog1pf(float d) { Sleef_float2 x, s; float m, t, x2; int e; float dp1 = d + 1; int o = dp1 < FLT_MIN; if (o) dp1 *= (float)(INT64_C(1) << 32) * (float)(INT64_C(1) << 32); e = ilogb2kf(dp1 * (1.0f/0.75f)); t = ldexp3kf(1, -e); m = mlaf(d, t, t-1); if (o) e -= 64; x = dfdiv_f2_f2_f2(df(m, 0), dfadd_f2_f_f(2, m)); x2 = x.x * x.x; t = +0.3027294874e+0f; t = mlaf(t, x2, +0.3996108174e+0f); t = mlaf(t, x2, +0.6666694880e+0f); s = dfmul_f2_f2_f(df(0.69314718246459960938f, -1.904654323148236017e-09f), (float)e); s = dfadd_f2_f2_f2(s, dfscale_f2_f2_f(x, 2)); s = dfadd_f2_f2_f(s, x2 * x.x * t); float r = s.x + s.y; if (d > 1e+38) r = SLEEF_INFINITYf; if (d < -1) r = SLEEF_NANf; if (d == -1) r = -SLEEF_INFINITYf; if (xisnegzerof(d)) r = -0.0f; return r; } EXPORT CONST float xcbrtf(float d) { float x, y, q = 1.0f; int e, r; e = ilogbkf(fabsfk(d))+1; d = ldexp2kf(d, -e); r = (e + 6144) % 3; 
q = (r == 1) ? 1.2599210498948731647672106f : q; q = (r == 2) ? 1.5874010519681994747517056f : q; q = ldexp2kf(q, (e + 6144) / 3 - 2048); q = mulsignf(q, d); d = fabsfk(d); x = -0.601564466953277587890625f; x = mlaf(x, d, 2.8208892345428466796875f); x = mlaf(x, d, -5.532182216644287109375f); x = mlaf(x, d, 5.898262500762939453125f); x = mlaf(x, d, -3.8095417022705078125f); x = mlaf(x, d, 2.2241256237030029296875f); y = d * x * x; y = (y - (2.0f / 3.0f) * y * (y * x - 1.0f)) * q; return y; } EXPORT CONST float xcbrtf_u1(float d) { float x, y, z; Sleef_float2 q2 = df(1, 0), u, v; int e, r; e = ilogbkf(fabsfk(d))+1; d = ldexp2kf(d, -e); r = (e + 6144) % 3; q2 = (r == 1) ? df(1.2599210739135742188, -2.4018701694217270415e-08) : q2; q2 = (r == 2) ? df(1.5874010324478149414, 1.9520385308169352356e-08) : q2; q2.x = mulsignf(q2.x, d); q2.y = mulsignf(q2.y, d); d = fabsfk(d); x = -0.601564466953277587890625f; x = mlaf(x, d, 2.8208892345428466796875f); x = mlaf(x, d, -5.532182216644287109375f); x = mlaf(x, d, 5.898262500762939453125f); x = mlaf(x, d, -3.8095417022705078125f); x = mlaf(x, d, 2.2241256237030029296875f); y = x * x; y = y * y; x -= (d * y - x) * (1.0 / 3.0f); z = x; u = dfmul_f2_f_f(x, x); u = dfmul_f2_f2_f2(u, u); u = dfmul_f2_f2_f(u, d); u = dfadd2_f2_f2_f(u, -x); y = u.x + u.y; y = -2.0 / 3.0 * y * z; v = dfadd2_f2_f2_f(dfmul_f2_f_f(z, z), y); v = dfmul_f2_f2_f(v, d); v = dfmul_f2_f2_f2(v, q2); z = ldexp2kf(v.x + v.y, (e + 6144) / 3 - 2048); if (xisinff(d)) { z = mulsignf(SLEEF_INFINITYf, q2.x); } if (d == 0) { z = mulsignf(0, q2.x); } return z; } // EXPORT CONST float xfabsf(float x) { return fabsfk(x); } EXPORT CONST float xcopysignf(float x, float y) { return copysignfk(x, y); } EXPORT CONST float xfmaxf(float x, float y) { return y != y ? x : (x > y ? x : y); } EXPORT CONST float xfminf(float x, float y) { return y != y ? x : (x < y ? 
x : y); } // tail of xfminf (definition begins on the previous source line)

// Positive difference: x - y clamped to 0. The explicit x == y test also
// yields 0 for equal infinities, where x - y would be NaN.
EXPORT CONST float xfdimf(float x, float y) {
  float ret = x - y;
  if (ret < 0 || x == y) ret = 0;
  return ret;
}

// Truncate toward zero. Values with |x| >= 2^23 (or inf) have no fractional
// part in binary32 and are returned unchanged.
EXPORT CONST float xtruncf(float x) {
  float fr = x - (int32_t)x;
  return (xisinff(x) || fabsfk(x) >= (float)(INT64_C(1) << 23)) ? x : copysignfk(x - fr, x);
}

// Round toward negative infinity.
EXPORT CONST float xfloorf(float x) {
  float fr = x - (int32_t)x;
  fr = fr < 0 ? fr+1.0f : fr;
  return (xisinff(x) || fabsfk(x) >= (float)(INT64_C(1) << 23)) ? x : copysignfk(x - fr, x);
}

// Round toward positive infinity.
EXPORT CONST float xceilf(float x) {
  float fr = x - (int32_t)x;
  fr = fr <= 0 ? fr : fr-1.0f;
  return (xisinff(x) || fabsfk(x) >= (float)(INT64_C(1) << 23)) ? x : copysignfk(x - fr, x);
}

// Round to nearest, ties away from zero (C round() semantics).
EXPORT CONST float xroundf(float d) {
  float x = d + 0.5f;
  float fr = x - (int32_t)x;
  if (fr == 0 && x <= 0) x--;
  fr = fr < 0 ? fr+1.0f : fr;
  // nextafterf(0.5, 0): the only value where d + 0.5f rounds up to exactly 1.
  x = d == 0.4999999701976776123f ? 0 : x;
  return (xisinff(d) || fabsfk(d) >= (float)(INT64_C(1) << 23)) ? d : copysignfk(x - fr, d);
}

// Round to nearest, ties to even (rint() semantics, independent of the FP
// rounding mode).
EXPORT CONST float xrintf(float d) {
  float x = d + 0.5f;
  int32_t isodd = (1 & (int32_t)x) != 0;
  float fr = x - (int32_t)x;
  fr = (fr < 0 || (fr == 0 && isodd)) ? fr+1.0f : fr;
  x = d == 0.50000005960464477539f ? 0 : x; // nextafterf(0.5, 1)
  return (xisinff(d) || fabsfk(d) >= (float)(INT64_C(1) << 23)) ? d : copysignfk(x - fr, d);
}

// modf(): returns { fractional part, integral part }, both carrying x's sign.
EXPORT CONST Sleef_float2 xmodff(float x) {
  float fr = x - (int32_t)x;
  fr = fabsfk(x) > (float)(INT64_C(1) << 23) ? 0 : fr;
  Sleef_float2 ret = { copysignfk(fr, x), copysignfk(x - fr, x) };
  return ret;
}

// ldexp(): x * 2^exp. The exponent is split into e1 + 4*e0 and applied as
// several partial powers of two (pow2if(e1) and p four times) so that no
// single intermediate factor over/underflows.
EXPORT CONST float xldexpf(float x, int exp) {
  if (exp > 300) exp = 300;
  if (exp < -300) exp = -300;
  int e0 = exp >> 2;
  if (exp < 0) e0++;
  if (-50 < exp && exp < 50) e0 = 0; // small exponents need no splitting
  int e1 = exp - (e0 << 2);
  float p = pow2if(e0);
  float ret = x * pow2if(e1) * p * p * p * p;
  return ret;
}

// nextafter(): step x one ULP toward y by integer manipulation of the bit
// pattern.
EXPORT CONST float xnextafterf(float x, float y) {
  union { float f; int32_t i; } cx;
  cx.f = x == 0 ? // ternary continues on the next source line
mulsignf(0, y) : x; // x == 0: start from a zero carrying y's sign so the step goes the right way
  int c = (cx.i < 0) == (y < x);
  if (c) cx.i = -(cx.i ^ (1 << 31)); // map negative floats onto a monotone integer range
  if (x != y) cx.i--;
  if (c) cx.i = -(cx.i ^ (1 << 31)); // map back
  if (cx.f == 0 && x != 0) cx.f = mulsignf(0, x);
  if (x == 0 && y == 0) cx.f = y;
  if (xisnanf(x) || xisnanf(y)) cx.f = SLEEF_NANf;
  return cx.f;
}

// frexp() fraction part: force the exponent field to that of 0.5 so the
// result lies in [0.5, 1); subnormal inputs are pre-scaled by 2^30 first.
EXPORT CONST float xfrfrexpf(float x) {
  union { float f; int32_t u; } cx;
  if (fabsfk(x) < FLT_MIN) x *= (1 << 30);
  cx.f = x;
  cx.u &= ~0x7f800000U; // clear the exponent bits
  cx.u |= 0x3f000000U;  // install the exponent of 0.5
  if (xisinff(x)) cx.f = mulsignf(SLEEF_INFINITYf, x);
  if (x == 0) cx.f = x;
  return cx.f;
}

// frexp() exponent part, read from the raw exponent field; subnormal inputs
// are pre-scaled by 2^30 with the bias adjusted by -30.
EXPORT CONST int xexpfrexpf(float x) {
  union { float f; uint32_t u; } cx;
  int ret = 0;
  if (fabsfk(x) < FLT_MIN) { x *= (1 << 30); ret = -30; }
  cx.f = x;
  ret += (int32_t)(((cx.u >> 23) & 0xff)) - 0x7e;
  if (x == 0 || xisnanf(x) || xisinff(x)) ret = 0;
  return ret;
}

// hypot, high-accuracy variant: max * sqrt(1 + (min/max)^2) in double-float,
// with tiny operands pre-scaled by 2^24 to avoid a subnormal division.
EXPORT CONST float xhypotf_u05(float x, float y) {
  x = fabsfk(x);
  y = fabsfk(y);
  float min = fminfk(x, y), n = min;
  float max = fmaxfk(x, y), d = max;
  if (max < FLT_MIN) { n *= UINT64_C(1) << 24; d *= UINT64_C(1) << 24; }
  Sleef_float2 t = dfdiv_f2_f2_f2(df(n, 0), df(d, 0));
  t = dfmul_f2_f2_f(dfsqrt_f2_f2(dfadd2_f2_f2_f(dfsqu_f2_f2(t), 1)), max);
  float ret = t.x + t.y;
  if (xisnanf(ret)) ret = SLEEF_INFINITYf;
  if (min == 0) ret = max;
  if (xisnanf(x) || xisnanf(y)) ret = SLEEF_NANf;
  if (x == SLEEF_INFINITYf || y == SLEEF_INFINITYf) ret = SLEEF_INFINITYf;
  return ret;
}

// hypot, fast variant: plain single-precision max * sqrt(1 + t^2).
EXPORT CONST float xhypotf_u35(float x, float y) {
  x = fabsfk(x);
  y = fabsfk(y);
  float min = fminfk(x, y);
  float max = fmaxfk(x, y);
  float t = min / max;
  float ret = max * SQRTF(1 + t*t);
  if (min == 0) ret = max;
  if (xisnanf(x) || xisnanf(y)) ret = SLEEF_NANf;
  if (x == SLEEF_INFINITYf || y == SLEEF_INFINITYf) ret = SLEEF_INFINITYf;
  return ret;
}

// One ULP toward zero (0 maps to 0); used by xfmodf so its quotient estimate
// never exceeds the true quotient.
static INLINE CONST float toward0f(float d) {
  return d == 0 ? 0 : intBitsToFloat(floatToRawIntBits(d)-1);
}

// Truncation helper for xfmodf; |x| >= 2^23 is already integral.
static INLINE CONST float ptruncf(float x) {
  return fabsfk(x) >= (float)(INT64_C(1) << 23) ? // ternary continues on the next source line
x : (x - (x - (int32_t)x)); } // tail of ptruncf (definition begins on the previous source line)

// fmod(): remainder of x/y carrying x's sign, computed by iterated
// conservative quotient subtraction in double-float arithmetic.
EXPORT CONST float xfmodf(float x, float y) {
  float nu = fabsfk(x), de = fabsfk(y), s = 1, q;
  if (de < FLT_MIN) {
    // Scale subnormal divisors up by 2^25; undo via s at the end.
    nu *= UINT64_C(1) << 25; de *= UINT64_C(1) << 25; s = 1.0f / (UINT64_C(1) << 25);
  }
  Sleef_float2 r = df(nu, 0);
  float rde = toward0f(1.0f / de); // reciprocal nudged toward zero: q never overshoots
  for(int i=0;i<8;i++) { // ceil(log2(FLT_MAX) / 22)+1 iterations suffice
    q = ptruncf(toward0f(r.x) * rde);
    q = (3*de > r.x && r.x >= de) ? 2 : q;
    q = (2*de > r.x && r.x >= de) ? 1 : q;
    r = dfnormalize_f2_f2(dfadd2_f2_f2_f2(r, dfmul_f2_f_f(q, -de)));
    if (r.x < de) break;
  }
  float ret = (r.x + r.y) * s;
  if (r.x + r.y == de) ret = 0;
  ret = mulsignf(ret, x);
  if (nu < de) ret = x; // |x| < |y|: the result is x itself
  if (de == 0) ret = SLEEF_NANf;
  return ret;
}

// Round-to-nearest-even helper for xremainderf (like xrintf but without the
// infinity check or the near-0.5 special case).
static INLINE CONST float rintfk2(float d) {
  float x = d + 0.5f;
  int32_t isodd = (1 & (int32_t)x) != 0;
  float fr = x - (int32_t)x;
  fr = (fr < 0 || (fr == 0 && isodd)) ? fr+1.0f : fr;
  return (fabsfk(d) >= (float)(INT64_C(1) << 23)) ? d : copysignfk(x - fr, d);
}

// remainder(): IEEE remainder — quotient rounded to nearest, ties broken to
// even via the qisodd flag.
EXPORT CONST float xremainderf(float x, float y) {
  float n = fabsfk(x), d = fabsfk(y), s = 1, q;
  if (d < FLT_MIN*2) {
    // Scale near-subnormal divisors up by 2^25; undo via s at the end.
    n *= UINT64_C(1) << 25; d *= UINT64_C(1) << 25; s = 1.0f / (UINT64_C(1) << 25);
  }
  float rd = 1.0f / d;
  Sleef_float2 r = df(n, 0);
  int qisodd = 0;
  for(int i=0;i<8;i++) { // ceil(log2(FLT_MAX) / 22)+1
    q = rintfk2(r.x * rd);
    if (fabsfk(r.x) < 1.5f * d) q = r.x < 0 ? -1 : 1;
    if (fabsfk(r.x) < 0.5f * d || (fabsfk(r.x) == 0.5f * d && !qisodd)) q = 0;
    if (q == 0) break;
    if (xisinff(q * -d)) q = q + mulsignf(-1, r.x); // back off if q*d would overflow
    qisodd ^= (1 & (int)q) != 0 && fabsfk(q) < (float)(INT64_C(1) << 24);
    r = dfnormalize_f2_f2(dfadd2_f2_f2_f2(r, dfmul_f2_f_f(q, -d)));
  }
  float ret = r.x * s;
  ret = mulsignf(ret, x);
  if (xisinff(y)) ret = xisinff(x) ? SLEEF_NANf : x;
  if (d == 0) ret = SLEEF_NANf;
  return ret;
}

// Square root, high-accuracy variant: Newton iterations on a fast inverse
// square root estimate, then one double-float correction step.
EXPORT CONST float xsqrtf_u05(float d) {
  float q = 0.5f;
  d = d < 0 ? // ternary continues on the next source line
SLEEF_NANf : d; if (d < 5.2939559203393770e-23f) { d *= 1.8889465931478580e+22f; q = 7.2759576141834260e-12f * 0.5f; } if (d > 1.8446744073709552e+19f) { d *= 5.4210108624275220e-20f; q = 4294967296.0f * 0.5f; } // http://en.wikipedia.org/wiki/Fast_inverse_square_root float x = intBitsToFloat(0x5f375a86 - (floatToRawIntBits(d + 1e-45f) >> 1)); x = x * (1.5f - 0.5f * d * x * x); x = x * (1.5f - 0.5f * d * x * x); x = x * (1.5f - 0.5f * d * x * x) * d; Sleef_float2 d2 = dfmul_f2_f2_f2(dfadd2_f2_f_f2(d, dfmul_f2_f_f(x, x)), dfrec_f2_f(x)); float ret = (d2.x + d2.y) * q; ret = d == SLEEF_INFINITYf ? SLEEF_INFINITYf : ret; ret = d == 0 ? d : ret; return ret; } EXPORT CONST float xsqrtf_u35(float d) { float q = 1.0f; d = d < 0 ? SLEEF_NANf : d; if (d < 5.2939559203393770e-23f) { d *= 1.8889465931478580e+22f; q = 7.2759576141834260e-12f; } if (d > 1.8446744073709552e+19f) { d *= 5.4210108624275220e-20f; q = 4294967296.0f; } // http://en.wikipedia.org/wiki/Fast_inverse_square_root float x = intBitsToFloat(0x5f375a86 - (floatToRawIntBits(d + 1e-45) >> 1)); x = x * (1.5f - 0.5f * d * x * x); x = x * (1.5f - 0.5f * d * x * x); x = x * (1.5f - 0.5f * d * x * x); x = x * (1.5f - 0.5f * d * x * x); return d == SLEEF_INFINITYf ? SLEEF_INFINITYf : (x * d * q); } EXPORT CONST float xsqrtf(float d) { return SQRTF(d); } EXPORT CONST float xfmaf(float x, float y, float z) { float h2 = x * y + z, q = 1; if (fabsfk(h2) < 1e-38f) { const float c0 = 1 << 25, c1 = c0 * c0, c2 = c1 * c1; x *= c1; y *= c1; z *= c2; q = 1.0f / c2; } if (fabsfk(h2) > 1e+38f) { const float c0 = 1 << 25, c1 = c0 * c0, c2 = c1 * c1; x *= 1.0 / c1; y *= 1.0 / c1; z *= 1.0 / c2; q = c2; } Sleef_float2 d = dfmul_f2_f_f(x, y); d = dfadd2_f2_f2_f(d, z); float ret = (x == 0 || y == 0) ? z : (d.x + d.y); if (xisinff(z) && !xisinff(x) && !xisnanf(x) && !xisinff(y) && !xisnanf(y)) h2 = z; return (xisinff(h2) || xisnanf(h2)) ? 
/* NOTE(review): tail of SLEEF's sleefsp.c (single-precision kernels):
   sinpifk/xsinpif_u05, cospifk/xcospif_u05, the double-float gamma kernel
   gammafk with its xtgammaf_u1/xlgammaf_u1 wrappers, xerff_u1, xerfcf_u15,
   and an ENABLE_MAIN ad-hoc test driver.  The extraction collapsed every
   newline into spaces, so the upstream lone "//" separator comments now
   comment out the remainder of each (huge) physical line, and the
   "#include" in the ENABLE_MAIN section lost its header name (presumably
   <stdio.h>/<stdlib.h> — confirm against upstream SLEEF).  Do NOT compile
   this as-is; restore the original line structure from upstream before any
   behavioral change.  Code below is reproduced byte-identically. */
h2 : ret*q; } // static INLINE CONST Sleef_float2 sinpifk(float d) { float u, s, t; Sleef_float2 x, s2; u = d * 4; int q = ceilfk(u) & ~1; int o = (q & 2) != 0; s = u - (float)q; t = s; s = s * s; s2 = dfmul_f2_f_f(t, t); // u = o ? -0.2430611801e-7f : +0.3093842054e-6f; u = mlaf(u, s, o ? +0.3590577080e-5f : -0.3657307388e-4f); u = mlaf(u, s, o ? -0.3259917721e-3f : +0.2490393585e-2f); x = dfadd2_f2_f_f2(u * s, o ? df(0.015854343771934509277, 4.4940051354032242811e-10) : df(-0.080745510756969451904, -1.3373665339076936258e-09)); x = dfadd2_f2_f2_f2(dfmul_f2_f2_f2(s2, x), o ? df(-0.30842512845993041992, -9.0728339030733922277e-09) : df(0.78539818525314331055, -2.1857338617566484855e-08)); x = dfmul_f2_f2_f2(x, o ? s2 : df(t, 0)); x = o ? dfadd2_f2_f2_f(x, 1) : x; // if ((q & 4) != 0) { x.x = -x.x; x.y = -x.y; } return x; } EXPORT CONST float xsinpif_u05(float d) { Sleef_float2 x = sinpifk(d); float r = x.x + x.y; if (xisnegzerof(d)) r = -0.0; if (fabsfk(d) > TRIGRANGEMAX4f) r = 0; if (xisinff(d)) r = SLEEF_NANf; return r; } static INLINE CONST Sleef_float2 cospifk(float d) { float u, s, t; Sleef_float2 x, s2; u = d * 4; int q = ceilfk(u) & ~1; int o = (q & 2) == 0; s = u - (float)q; t = s; s = s * s; s2 = dfmul_f2_f_f(t, t); // u = o ? -0.2430611801e-7f : +0.3093842054e-6f; u = mlaf(u, s, o ? +0.3590577080e-5f : -0.3657307388e-4f); u = mlaf(u, s, o ? -0.3259917721e-3f : +0.2490393585e-2f); x = dfadd2_f2_f_f2(u * s, o ? df(0.015854343771934509277, 4.4940051354032242811e-10) : df(-0.080745510756969451904, -1.3373665339076936258e-09)); x = dfadd2_f2_f2_f2(dfmul_f2_f2_f2(s2, x), o ? df(-0.30842512845993041992, -9.0728339030733922277e-09) : df(0.78539818525314331055, -2.1857338617566484855e-08)); x = dfmul_f2_f2_f2(x, o ? s2 : df(t, 0)); x = o ? 
dfadd2_f2_f2_f(x, 1) : x; // if (((q+2) & 4) != 0) { x.x = -x.x; x.y = -x.y; } return x; } EXPORT CONST float xcospif_u05(float d) { Sleef_float2 x = cospifk(d); float r = x.x + x.y; if (fabsfk(d) > TRIGRANGEMAX4f) r = 1; if (xisinff(d)) r = SLEEF_NANf; return r; } typedef struct { Sleef_float2 a, b; } df2; static CONST df2 gammafk(float a) { Sleef_float2 clc = df(0, 0), clln = df(1, 0), clld = df(1, 0), v = df(1, 0), x, y, z; float t, u; int otiny = fabsfk(a) < 1e-30f, oref = a < 0.5f; x = otiny ? df(0, 0) : (oref ? dfadd2_f2_f_f(1, -a) : df(a, 0)); int o0 = (0.5f <= x.x && x.x <= 1.2), o2 = 2.3 < x.x; y = dfnormalize_f2_f2(dfmul_f2_f2_f2(dfadd2_f2_f2_f(x, 1), x)); y = dfnormalize_f2_f2(dfmul_f2_f2_f2(dfadd2_f2_f2_f(x, 2), y)); clln = (o2 && x.x <= 7) ? y : clln; x = (o2 && x.x <= 7) ? dfadd2_f2_f2_f(x, 3) : x; t = o2 ? (1.0 / x.x) : dfnormalize_f2_f2(dfadd2_f2_f2_f(x, o0 ? -1 : -2)).x; u = o2 ? +0.000839498720672087279971000786 : (o0 ? +0.9435157776e+0f : +0.1102489550e-3f); u = mlaf(u, t, o2 ? -5.17179090826059219329394422e-05 : (o0 ? +0.8670063615e+0f : +0.8160019934e-4f)); u = mlaf(u, t, o2 ? -0.000592166437353693882857342347 : (o0 ? +0.4826702476e+0f : +0.1528468856e-3f)); u = mlaf(u, t, o2 ? +6.97281375836585777403743539e-05 : (o0 ? -0.8855129778e-1f : -0.2355068718e-3f)); u = mlaf(u, t, o2 ? +0.000784039221720066627493314301 : (o0 ? +0.1013825238e+0f : +0.4962242092e-3f)); u = mlaf(u, t, o2 ? -0.000229472093621399176949318732 : (o0 ? -0.1493408978e+0f : -0.1193488017e-2f)); u = mlaf(u, t, o2 ? -0.002681327160493827160473958490 : (o0 ? +0.1697509140e+0f : +0.2891599433e-2f)); u = mlaf(u, t, o2 ? +0.003472222222222222222175164840 : (o0 ? -0.2072454542e+0f : -0.7385451812e-2f)); u = mlaf(u, t, o2 ? +0.083333333333333333335592087900 : (o0 ? 
+0.2705872357e+0f : +0.2058077045e-1f)); y = dfmul_f2_f2_f2(dfadd2_f2_f2_f(x, -0.5), logk2f(x)); y = dfadd2_f2_f2_f2(y, dfneg_f2_f2(x)); y = dfadd2_f2_f2_f2(y, dfx(0.91893853320467278056)); // 0.5*log(2*M_PI) z = dfadd2_f2_f2_f(dfmul_f2_f_f (u, t), o0 ? -0.400686534596170958447352690395e+0f : -0.673523028297382446749257758235e-1f); z = dfadd2_f2_f2_f(dfmul_f2_f2_f(z, t), o0 ? +0.822466960142643054450325495997e+0f : +0.322467033928981157743538726901e+0f); z = dfadd2_f2_f2_f(dfmul_f2_f2_f(z, t), o0 ? -0.577215665946766039837398973297e+0f : +0.422784335087484338986941629852e+0f); z = dfmul_f2_f2_f(z, t); clc = o2 ? y : z; clld = o2 ? dfadd2_f2_f2_f(dfmul_f2_f_f(u, t), 1) : clld; y = clln; clc = otiny ? dfx(41.58883083359671856503) : // log(2^60) (oref ? dfadd2_f2_f2_f2(dfx(1.1447298858494001639), dfneg_f2_f2(clc)) : clc); // log(M_PI) clln = otiny ? df(1, 0) : (oref ? clln : clld); if (oref) x = dfmul_f2_f2_f2(clld, sinpifk(a - (float)(INT64_C(1) << 12) * (int32_t)(a * (1.0 / (INT64_C(1) << 12))))); clld = otiny ? df(a*((INT64_C(1) << 30)*(float)(INT64_C(1) << 30)), 0) : (oref ? x : y); df2 ret = { clc, dfdiv_f2_f2_f2(clln, clld) }; return ret; } EXPORT CONST float xtgammaf_u1(float a) { df2 d = gammafk(a); Sleef_float2 y = dfmul_f2_f2_f2(expk2f(d.a), d.b); float r = y.x + y.y; r = (a == -SLEEF_INFINITYf || (a < 0 && xisintf(a)) || (xisnumberf(a) && a < 0 && xisnanf(r))) ? SLEEF_NANf : r; r = ((a == SLEEF_INFINITYf || xisnumberf(a)) && a >= -FLT_MIN && (a == 0 || a > 36 || xisnanf(r))) ? mulsignf(SLEEF_INFINITYf, a) : r; return r; } EXPORT CONST float xlgammaf_u1(float a) { df2 d = gammafk(a); Sleef_float2 y = dfadd2_f2_f2_f2(d.a, logk2f(dfabs_f2_f2(d.b))); float r = y.x + y.y; r = (xisinff(a) || (a <= 0 && xisintf(a)) || (xisnumberf(a) && xisnanf(r))) ? SLEEF_INFINITYf : r; return r; } EXPORT CONST float xerff_u1(float a) { float s = a, t, u; Sleef_float2 d; a = fabsfk(a); int o0 = a < 1.1f, o1 = a < 2.4f, o2 = a < 4.0f; u = o0 ? (a*a) : a; t = o0 ? 
+0.7089292194e-4f : o1 ? -0.1792667899e-4f : -0.9495757695e-5f; t = mlaf(t, u, o0 ? -0.7768311189e-3f : o1 ? +0.3937633010e-3f : +0.2481465926e-3f); t = mlaf(t, u, o0 ? +0.5159463733e-2f : o1 ? -0.3949181177e-2f : -0.2918176819e-2f); t = mlaf(t, u, o0 ? -0.2683781274e-1f : o1 ? +0.2445474640e-1f : +0.2059706673e-1f); t = mlaf(t, u, o0 ? +0.1128318012e+0f : o1 ? -0.1070996150e+0f : -0.9901899844e-1f); d = dfmul_f2_f_f(t, u); d = dfadd2_f2_f2_f2(d, o0 ? dfx(-0.376125876000657465175213237214e+0) : o1 ? dfx(-0.634588905908410389971210809210e+0) : dfx(-0.643598050547891613081201721633e+0)); d = dfmul_f2_f2_f(d, u); d = dfadd2_f2_f2_f2(d, o0 ? dfx(+0.112837916021059138255978217023e+1) : o1 ? dfx(-0.112879855826694507209862753992e+1) : dfx(-0.112461487742845562801052956293e+1)); d = dfmul_f2_f2_f(d, a); d = o0 ? d : dfadd_f2_f_f2(1.0, dfneg_f2_f2(expk2f(d))); u = mulsignf(o2 ? (d.x + d.y) : 1, s); u = xisnanf(a) ? SLEEF_NANf : u; return u; } EXPORT CONST float xerfcf_u15(float a) { float s = a, r = 0, t; Sleef_float2 u, d, x; a = fabsfk(a); int o0 = a < 1.0f, o1 = a < 2.2f, o2 = a < 4.3f, o3 = a < 10.1f; u = o1 ? df(a, 0) : dfdiv_f2_f2_f2(df(1, 0), df(a, 0)); t = o0 ? -0.8638041618e-4f : o1 ? -0.6236977242e-5f : o2 ? -0.3869504035e+0f : +0.1115344167e+1f; t = mlaf(t, u.x, o0 ? +0.6000166177e-3f : o1 ? +0.5749821503e-4f : o2 ? +0.1288077235e+1f : -0.9454904199e+0f); t = mlaf(t, u.x, o0 ? -0.1665703603e-2f : o1 ? +0.6002851478e-5f : o2 ? -0.1816803217e+1f : -0.3667259514e+0f); t = mlaf(t, u.x, o0 ? +0.1795156277e-3f : o1 ? -0.2851036377e-2f : o2 ? +0.1249150872e+1f : +0.7155663371e+0f); t = mlaf(t, u.x, o0 ? +0.1914106123e-1f : o1 ? +0.2260518074e-1f : o2 ? -0.1328857988e+0f : -0.1262947265e-1f); d = dfmul_f2_f2_f(u, t); d = dfadd2_f2_f2_f2(d, o0 ? dfx(-0.102775359343930288081655368891e+0) : o1 ? dfx(-0.105247583459338632253369014063e+0) : o2 ? 
dfx(-0.482365310333045318680618892669e+0) : dfx(-0.498961546254537647970305302739e+0)); d = dfmul_f2_f2_f2(d, u); d = dfadd2_f2_f2_f2(d, o0 ? dfx(-0.636619483208481931303752546439e+0) : o1 ? dfx(-0.635609463574589034216723775292e+0) : o2 ? dfx(-0.134450203224533979217859332703e-2) : dfx(-0.471199543422848492080722832666e-4)); d = dfmul_f2_f2_f2(d, u); d = dfadd2_f2_f2_f2(d, o0 ? dfx(-0.112837917790537404939545770596e+1) : o1 ? dfx(-0.112855987376668622084547028949e+1) : o2 ? dfx(-0.572319781150472949561786101080e+0) : dfx(-0.572364030327966044425932623525e+0)); x = dfmul_f2_f2_f(o1 ? d : df(-a, 0), a); x = o1 ? x : dfadd2_f2_f2_f2(x, d); x = expk2f(x); x = o1 ? x : dfmul_f2_f2_f2(x, u); r = o3 ? (x.x + x.y) : 0; if (s < 0) r = 2 - r; r = xisnanf(s) ? SLEEF_NANf : r; return r; } // #ifdef ENABLE_MAIN // gcc -w -DENABLE_MAIN -I../common sleefsp.c rempitab.c -lm #include int main(int argc, char **argv) { float d1 = atof(argv[1]); //float d2 = atof(argv[2]); //float d3 = atof(argv[3]); //printf("%.20g, %.20g\n", (double)d1, (double)d2); //float i2 = atoi(argv[2]); //float c = xatan2f_u1(d1, d2); //printf("round %.20g\n", (double)d1); printf("test = %.20g\n", (double)xsqrtf_u05(d1)); //printf("correct = %.20g\n", (double)roundf(d1)); //printf("rint %.20g\n", (double)d1); //printf("test = %.20g\n", (double)xrintf(d1)); //printf("correct = %.20g\n", (double)rintf(d1)); //Sleef_float2 r = xsincospif_u35(d); //printf("%g, %g\n", (double)r.x, (double)r.y); } #endif ================================================ FILE: src/ufp.cpp ================================================ /* Copyright (c) 2021 Agenium Scale Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the 
Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ #include // ---------------------------------------------------------------------------- // Actual implementation namespace nsimd { template int ufp(T a_, T b_) { UnsignedType a = nsimd::scalar_reinterpret(UnsignedType(), a_); UnsignedType b = nsimd::scalar_reinterpret(UnsignedType(), b_); UnsignedType exp_mask = ((UnsignedType)1 << ExponentSize) - 1; i64 ea = (i64)((a >> MantissaSize) & exp_mask); i64 eb = (i64)((b >> MantissaSize) & exp_mask); if (ea - eb > 1 || ea - eb < -1) { return 0; } UnsignedType man_mask = ((UnsignedType)1 << MantissaSize) - 1; i64 ma = (i64)(a & man_mask) | ((i64)1 << MantissaSize); i64 mb = (i64)(b & man_mask) | ((i64)1 << MantissaSize); i64 d = 0; if (ea == eb) { d = ma - mb; } else if (ea > eb) { d = 2 * ma - mb; } else { d = 2 * mb - ma; } d = (d >= 0 ? 
d : -d); int i = 0; for (; i <= MantissaSize + 1 && d >= ((i64)1 << i); i++) ; return (int)(MantissaSize + 1 - i); } } // namespace nsimd // ---------------------------------------------------------------------------- // C ABI extern "C" { NSIMD_DLLSPEC int nsimd_ufp_f16(f16 a, f16 b) { return nsimd::ufp<5, 10, u16>(a, b); } NSIMD_DLLSPEC int nsimd_ufp_f32(f32 a, f32 b) { return nsimd::ufp<8, 23, u32>(a, b); } NSIMD_DLLSPEC int nsimd_ufp_f64(f64 a, f64 b) { return nsimd::ufp<11, 52, u64>(a, b); } } // extern "C" ================================================ FILE: tests/CMakeLists.txt.sh ================================================ # MIT License # # Copyright (c) 2020 Agenium Scale # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal # in the Software without restriction, including without limitation the rights # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell # copies of the Software, and to permit persons to whom the Software is # furnished to do so, subject to the following conditions: # # The above copyright notice and this permission notice shall be included in all # copies or substantial portions of the Software. # # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. set -e set -x BUF="`dirname $0`/.." 
# NOTE(review): remainder of tests/CMakeLists.txt.sh (cross-compiler selection
# per SIMD extension + a cmake/make/install smoke test) followed by the whole
# of tests/FindNSIMD.cmake.sh (four FindNSIMD.cmake scenarios per extension:
# specific-found, auto-found, specific-notfound, auto-notfound).  The
# extraction collapsed all newlines into spaces: shell statement boundaries,
# case-arm layout and, critically, the heredoc bodies (`cat > ... <<-EOF`)
# whose EOF terminators must sit on their own lines, are gone.  These scripts
# are NOT runnable in this flattened form; restore line structure from the
# upstream repository before editing.  Content below is reproduced verbatim.
NSIMD_CMAKE=`realpath ${BUF}` for simd_ext in "$@"; do # Take care of cross compilation here case ${simd_ext} in aarch64 | sve | sve128 | sve256 | sve512 | sve1024 | sve2048) C_COMP="aarch64-linux-gnu-gcc" CXX_COMP="aarch64-linux-gnu-g++" ;; neon128) C_COMP="arm-linux-gnueabi-gcc" CXX_COMP="arm-linux-gnueabi-g++" ;; vmx | vsx) C_COMP="${NSIMD_CMAKE}/scripts/powerpc64le-linux-gnu-clang.sh" CXX_COMP="${NSIMD_CMAKE}/scripts/powerpc64le-linux-gnu-clang++.sh" ;; oneapi) C_COMP="gcc" CXX_COMP="dpcpp" ;; rocm) C_COMP="gcc" CXX_COMP="${NSIMD_CMAKE}/scripts/hipcc.sh" ;; cuda) C_COMP="gcc" CXX_COMP="nvcc" ;; *) C_COMP="gcc" CXX_COMP="g++" ;; esac # First case: find a specific component ROOT_DIR="${PWD}/nsimd_cmake_tests/${simd_ext}" rm -rf ${ROOT_DIR} mkdir -p ${ROOT_DIR} (cd ${ROOT_DIR} && \ cmake ${NSIMD_CMAKE} \ -Dsimd=${simd_ext} \ -DCMAKE_INSTALL_PREFIX=${ROOT_DIR}/root \ -DCMAKE_C_COMPILER="${C_COMP}" \ -DCMAKE_CXX_COMPILER="${CXX_COMP}" && \ make VERBOSE=1 && \ make install) done ================================================ FILE: tests/FindNSIMD.cmake.sh ================================================ #!/bin/bash # # Copyright (c) 2020 Agenium Scale # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal # in the Software without restriction, including without limitation the rights # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell # copies of the Software, and to permit persons to whom the Software is # furnished to do so, subject to the following conditions: # # The above copyright notice and this permission notice shall be included in all # copies or substantial portions of the Software. # # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. set -e set -x FIND_NSIMD_CMAKE="`dirname $0`/../scripts/FindNSIMD.cmake" SIMD_EXTS="sse2 sse42 avx avx2 avx512_knl avx512_skylake neon128 aarch64 \ sve sve128 sve256 sve512 sve1024 sve2048 cuda rocm" for simd_ext in ${SIMD_EXTS}; do # First case: find a specific component ROOT_DIR="${PWD}/find_nsimd_cmake_tests/${simd_ext}" rm -rf ${ROOT_DIR} mkdir -p "${ROOT_DIR}/cmake" cp "${FIND_NSIMD_CMAKE}" "${ROOT_DIR}/cmake" mkdir -p "${ROOT_DIR}/root/include/nsimd" touch "${ROOT_DIR}/root/include/nsimd/nsimd.h" mkdir -p "${ROOT_DIR}/root/lib" touch "${ROOT_DIR}/root/lib/libnsimd_${simd_ext}.so" cat >"${ROOT_DIR}/CMakeLists.txt" <<-EOF cmake_minimum_required(VERSION 3.0.0) project(FIND_NSIMD_CMAKE_TESTS) set(CMAKE_MODULE_PATH "${ROOT_DIR}/cmake") set(CMAKE_PREFIX_PATH "${ROOT_DIR}/root") find_package(NSIMD COMPONENTS ${simd_ext}) message(STATUS "FindNSIMD.cmake test : specific for ${simd_ext}") message(STATUS "NSIMD_FOUND = \${NSIMD_FOUND}") if (\${NSIMD_FOUND}) message(STATUS "NSIMD_INCLUDE_DIRS = \${NSIMD_INCLUDE_DIRS}") message(STATUS "NSIMD_LIBRARY_DIRS = \${NSIMD_LIBRARY_DIRS}") message(STATUS "NSIMD_LIBRARIES = \${NSIMD_LIBRARIES}") else() message(FATAL_ERROR "error NSIMD_FOUND should be TRUE") endif() EOF (cd "${ROOT_DIR}" && mkdir -p build && cd build && cmake ..) 
# Second case: find a automatically a component ROOT_DIR="${PWD}/find_nsimd_cmake_tests/${simd_ext}-auto" rm -rf ${ROOT_DIR} mkdir -p "${ROOT_DIR}/cmake" cp "${FIND_NSIMD_CMAKE}" "${ROOT_DIR}/cmake" mkdir -p "${ROOT_DIR}/root/include/nsimd" touch "${ROOT_DIR}/root/include/nsimd/nsimd.h" mkdir -p "${ROOT_DIR}/root/lib" touch "${ROOT_DIR}/root/lib/libnsimd_${simd_ext}.so" cat >"${ROOT_DIR}/CMakeLists.txt" <<-EOF cmake_minimum_required(VERSION 3.0.0) project(FIND_NSIMD_CMAKE_TESTS) set(CMAKE_MODULE_PATH "${ROOT_DIR}/cmake") set(CMAKE_PREFIX_PATH "${ROOT_DIR}/root") find_package(NSIMD) message(STATUS "FindNSIMD.cmake test : automatic for ${simd_ext}") message(STATUS "NSIMD_FOUND = \${NSIMD_FOUND}") if (\${NSIMD_FOUND}) message(STATUS "NSIMD_INCLUDE_DIRS = \${NSIMD_INCLUDE_DIRS}") message(STATUS "NSIMD_LIBRARY_DIRS = \${NSIMD_LIBRARY_DIRS}") message(STATUS "NSIMD_LIBRARIES = \${NSIMD_LIBRARIES}") else() message(FATAL_ERROR "error NSIMD_FOUND should be TRUE") endif() EOF (cd "${ROOT_DIR}" && mkdir -p build && cd build && cmake ..) 
# Third case: find a specific component ROOT_DIR="${PWD}/find_nsimd_cmake_tests/${simd_ext}-notfound" rm -rf ${ROOT_DIR} mkdir -p "${ROOT_DIR}/cmake" cp "${FIND_NSIMD_CMAKE}" "${ROOT_DIR}/cmake" mkdir -p "${ROOT_DIR}/root/include/nsimd" touch "${ROOT_DIR}/root/include/nsimd/nsimd.h" mkdir -p "${ROOT_DIR}/root/lib" touch "${ROOT_DIR}/root/lib/libnsimd_cpu.so" cat >"${ROOT_DIR}/CMakeLists.txt" <<-EOF cmake_minimum_required(VERSION 3.0.0) project(FIND_NSIMD_CMAKE_TESTS) set(CMAKE_MODULE_PATH "${ROOT_DIR}/cmake") set(CMAKE_PREFIX_PATH "${ROOT_DIR}/root") find_package(NSIMD COMPONENTS ${simd_ext}) message(STATUS "FindNSIMD.cmake test : " "notfound specific for ${simd_ext}") message(STATUS "NSIMD_FOUND = \${NSIMD_FOUND}") if (\${NSIMD_FOUND}) message(STATUS "NSIMD_INCLUDE_DIRS = \${NSIMD_INCLUDE_DIRS}") message(STATUS "NSIMD_LIBRARY_DIRS = \${NSIMD_LIBRARY_DIRS}") message(STATUS "NSIMD_LIBRARIES = \${NSIMD_LIBRARIES}") message(FATAL_ERROR "error NSIMD_FOUND should be FALSE") else() message(STATUS "NSIMD not found") endif() EOF (cd "${ROOT_DIR}" && mkdir -p build && cd build && cmake ..) 
# Fourth case: find a automatically a component ROOT_DIR="${PWD}/find_nsimd_cmake_tests/${simd_ext}-auto-notfound" rm -rf ${ROOT_DIR} mkdir -p "${ROOT_DIR}/cmake" cp "${FIND_NSIMD_CMAKE}" "${ROOT_DIR}/cmake" mkdir -p "${ROOT_DIR}/root/include/nsimd" touch "${ROOT_DIR}/root/include/nsimd/nsimd.h" mkdir -p "${ROOT_DIR}/root/lib" cat >"${ROOT_DIR}/CMakeLists.txt" <<-EOF cmake_minimum_required(VERSION 3.0.0) project(FIND_NSIMD_CMAKE_TESTS) set(CMAKE_MODULE_PATH "${ROOT_DIR}/cmake") set(CMAKE_PREFIX_PATH "${ROOT_DIR}/root") find_package(NSIMD) message(STATUS "FindNSIMD.cmake test : " "notfound automatic for ${simd_ext}") message(STATUS "NSIMD_FOUND = \${NSIMD_FOUND}") if (\${NSIMD_FOUND}) message(STATUS "NSIMD_INCLUDE_DIRS = \${NSIMD_INCLUDE_DIRS}") message(STATUS "NSIMD_LIBRARY_DIRS = \${NSIMD_LIBRARY_DIRS}") message(STATUS "NSIMD_LIBRARIES = \${NSIMD_LIBRARIES}") message(FATAL_ERROR "error NSIMD_FOUND should be FALSE") else() message(STATUS "NSIMD not found") endif() EOF (cd "${ROOT_DIR}" && mkdir -p build && cd build && cmake ..) done ================================================ FILE: tests/allocator.cpp ================================================ /* Copyright (c) 2019 Agenium Scale Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ #include #include #include int main() { std::vector > v; v.clear(); v.resize(100); v.clear(); v.resize(100); v.resize(10000); v.clear(); v.reserve(30); for (int i = 0; i < 1000; i++) { v.push_back(float(i)); } if (v.size() != 1000) { exit(EXIT_FAILURE); } for (int i = 0; i < 500; i++) { v.pop_back(); } if (v.size() != 500) { exit(EXIT_FAILURE); } return 0; } ================================================ FILE: tests/assign_arith.cpp ================================================ /* Copyright (c) 2021 Agenium Scale Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
NOTE(review): tests/assign_arith.cpp — macro-generated tests (HELPER) that
check compound-assignment operators (+=, -=, *=, /=, |=, &=, ^=, <<=, >>=)
against their binary counterparts on nsimd packs.  The extraction stripped
all '<...>' spans: every "template T ..." has lost its parameter list,
"typedef pack pack;" has lost its element-type argument, and each
"test_##name##_T(n)" call in the dispatcher functions has lost the
per-type template argument that distinguished the eleven (resp. eight)
calls — TODO restore from upstream; not compilable as-is.  Code reproduced
verbatim below.
*/ #include #include /* ------------------------------------------------------------------------- */ /* Random number */ template T get_rand() { return (T)((rand() % 10) + 1); } template <> f16 get_rand() { return nsimd_f32_to_f16(get_rand()); } /* ------------------------------------------------------------------------- */ /* Arithmetic operators */ #define HELPER(op1, op2, name) \ template int test_##name##_T(size_t n) { \ std::vector a(n), b(n); \ for (size_t i = 0; i < n; i++) { \ a[i] = get_rand(); \ b[i] = get_rand(); \ } \ \ using namespace nsimd; \ typedef pack pack; \ for (size_t i = 0; i < n; i += size_t(len(pack()))) { \ pack tmp1 = loadu(&a[i]); \ tmp1 op1 loadu(&b[i]); \ pack tmp2 = loadu(&a[i]) op2 loadu(&b[i]); \ if (any(tmp1 != tmp2)) { \ return -1; \ } \ } \ return 0; \ } \ \ int test_##name(size_t n) { \ return test_##name##_T(n) || test_##name##_T(n) || \ test_##name##_T(n) || test_##name##_T(n) || \ test_##name##_T(n) || test_##name##_T(n) || \ test_##name##_T(n) || test_##name##_T(n) || \ test_##name##_T(n) || test_##name##_T(n) || \ test_##name##_T(n); \ } \ \ int test_##name##_int_only(size_t n) { \ return test_##name##_T(n) || test_##name##_T(n) || \ test_##name##_T(n) || test_##name##_T(n) || \ test_##name##_T(n) || test_##name##_T(n) || \ test_##name##_T(n) || test_##name##_T(n); \ } HELPER(+=, +, add) HELPER(-=, -, sub) HELPER(*=, *, mul) HELPER(/=, /, div) HELPER(|=, |, orb) HELPER(&=, &, andb) HELPER(^=, ^, xorb) #undef HELPER /* ------------------------------------------------------------------------- */ /* Shift operators */ #define HELPER(op1, op2, name) \ template int test_##name##_T(size_t n) { \ std::vector a(n); \ for (size_t i = 0; i < n; i++) { \ a[i] = get_rand(); \ } \ \ using namespace nsimd; \ typedef pack pack; \ for (int s = 0; s <= 3; s++) { \ for (size_t i = 0; i < n; i += size_t(len(pack()))) { \ pack tmp = loadu(&a[i]); \ tmp op1 s; \ if (any(tmp != (loadu(&a[i]) op2 s))) { \ return -1; \ } \ } \ } \ return 0; \ } \ \ 
int test_##name(size_t n) { \ return test_##name##_T(n) || test_##name##_T(n) || \ test_##name##_T(n) || test_##name##_T(n) || \ test_##name##_T(n) || test_##name##_T(n) || \ test_##name##_T(n) || test_##name##_T(n); \ } HELPER(<<=, <<, shl) HELPER(>>=, >>, shr) #undef HELPER /* ------------------------------------------------------------------------- */ int main() { const size_t n = 2048; return test_add(n) || test_sub(n) || test_mul(n) || test_div(n) || test_orb_int_only(n) || test_andb_int_only(n) || test_xorb_int_only(n) || test_shl(n) || test_shr(n); } ================================================ FILE: tests/booleans.cpp ================================================ /* Copyright (c) 2020 Agenium Scale Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
*/ #include #include // ---------------------------------------------------------------------------- int main() { using namespace nsimd; packl v = packl(true) || packl(false); if (!all(v)) { return -1; } return 0; } ================================================ FILE: tests/c11_vec.c ================================================ /* Copyright (c) 2021 Agenium Scale Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
*/ #include int main() { #if NSIMD_C >= 2011 float in[NSIMD_MAX_LEN(f32)]; int out[NSIMD_MAX_LEN(i32)]; nsimd_pack(f32) vin = nsimd_load(unaligned, nsimd_pack(f32), in); nsimd_pack(i32) vout = nsimd_reinterpret(nsimd_pack(i32), vin); nsimd_store(unaligned, out, vout); #endif return 0; } ================================================ FILE: tests/cxx_adv_api_aliases.cpp ================================================ /* Copyright (c) 2021 Agenium Scale Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
*/ #include /* ------------------------------------------------------------------------- */ /* Random number */ template T get_rand() { return (T)((rand() % 100) - 50); } template <> f16 get_rand() { return nsimd_f32_to_f16(get_rand()); } /* ------------------------------------------------------------------------- */ template int test_aliases(size_t n) { std::vector a(n), b(n); for (size_t i = 0; i < n; i++) { a[i] = get_rand(); b[i] = get_rand(); } using namespace nsimd; typedef pack pack; size_t step = size_t(len(pack())); for (size_t i = 0; i + step <= n; i += step) { pack tmp1 = loadu(&a[i]); pack tmp2 = loadu(&b[i]); if (any(fabs(tmp1) != abs(tmp1))) { return -1; } if (any(fmin(tmp1, tmp2) != min(tmp1, tmp2))) { return -1; } if (any(fmax(tmp1, tmp2) != max(tmp1, tmp2))) { return -1; } } return 0; } /* ------------------------------------------------------------------------- */ int main() { return test_aliases(2048) || test_aliases(2048) || test_aliases(2048) || test_aliases(2048) || test_aliases(2048) || test_aliases(2048) || test_aliases(2048) || test_aliases(2048) || test_aliases(2048) || test_aliases(2048) || test_aliases(2048); } ================================================ FILE: tests/fp16.prec11.c ================================================ /* Copyright (c) 2019 Agenium Scale Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 
NOTE(review): tests/fp16.prec11.c — precision tests for nsimd's f16<->f32
conversions: via_fp16 round-trips through f16, mk_fp32 builds a float from
mantissa/exponent via ldexp, test_f16_to_f32 / test_f32_to_f16 compare
against expected bit patterns, and main() walks corner cases (signed
zeros, rounding ties, near ±Inf, denormal limit, NaN handling) plus random
round-trip error bounds.  The function bodies look intact, but the
extraction collapsed newlines and stripped the four #include header names
(the code uses fprintf/rand/ldexp and the nsimd f16 API, so presumably
<stdio.h>, <stdlib.h>, <math.h> and <nsimd/nsimd.h> — TODO confirm
against upstream).  Not compilable as-is; code reproduced verbatim below.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ #define _POSIX_C_SOURCE 200112L #include #include #include #include /* ------------------------------------------------------------------------- */ float via_fp16(float a) { return nsimd_f16_to_f32(nsimd_f32_to_f16(a)); } /* ------------------------------------------------------------------------- */ float mk_fp32(int mantissa, int exponent) { return (float)ldexp((double)mantissa, exponent); } /* ------------------------------------------------------------------------- */ int test_f16_to_f32(u16 val, u32 expected) { f32 fexpected = nsimd_scalar_reinterpret_f32_u32(expected); f32 res = nsimd_u16_to_f32(val); u32 ures = nsimd_scalar_reinterpret_u32_f32(res); if ((nsimd_isnan_f32(fexpected) && !nsimd_isnan_f32(res)) || (!nsimd_isnan_f32(fexpected) && ures != expected)) { fprintf(stdout, "Error, nsimd_f16_to_f32: expected %e(0x%x) but got %e(0x%x) \n", (f64)fexpected, expected, (f64)res, ures); fflush(stdout); return 1; } return 0; } /* ------------------------------------------------------------------------- */ int test_f32_to_f16(u32 val, u16 expected) { f16 fres = nsimd_f32_to_f16(nsimd_scalar_reinterpret_f32_u32(val)); u16 ures = nsimd_scalar_reinterpret_u16_f16(fres); if (ures != expected) { fprintf(stdout, "Error, nsimd_f32_to_f16: expected 0x%x but got 0x%x \n", expected, ures); fflush(stdout); return 1; } return 0; } /* ------------------------------------------------------------------------- */ int main(void) { #ifndef NSIMD_NO_IEEE754 const float infty = nsimd_scalar_reinterpret_f32_u32(0x7F800000); 
const float m_infty = nsimd_scalar_reinterpret_f32_u32(0xFF800000); const float nan = nsimd_scalar_reinterpret_f32_u32(0x7FC00000); #endif int i; /* Some corner cases first. */ if (test_f16_to_f32(0x0000, 0x0)) { return EXIT_FAILURE; } if (test_f16_to_f32(0x8000, 0x80000000)) { return EXIT_FAILURE; } if (test_f16_to_f32(0x3C00, 0x3f800000)) { return EXIT_FAILURE; } if (test_f16_to_f32(0x13e, 0x379F0000)) { /* 1.8954277E-5 */ return EXIT_FAILURE; } if (test_f16_to_f32(0x977e, 0xBAEFC000)) { /* -1.8291473E-3 */ return EXIT_FAILURE; } if (test_f32_to_f16(0xC7BDC4FC, 0xFC00)) { /* -97161.97 */ return EXIT_FAILURE; } if (test_f32_to_f16(0x37c3642c, 0x187)) { /* 2.329246e-05 */ return EXIT_FAILURE; } if (test_f32_to_f16(0xb314e840, 0x8001)) { return EXIT_FAILURE; } /* Test rounding when the input f32 is perfectly between 2 f16 */ if (test_f32_to_f16(0xC66AD000, 0xf356)) { return EXIT_FAILURE; } /* Close to ±Inf */ if (test_f32_to_f16(0x477fefff, 0x7bff)) { return EXIT_FAILURE; } if (test_f32_to_f16(0x477ff000, 0x7c00)) { return EXIT_FAILURE; } if (test_f32_to_f16(0xC77fefff, 0xfbff)) { return EXIT_FAILURE; } if (test_f32_to_f16(0xC77ff000, 0xfc00)) { return EXIT_FAILURE; } /* Close to ±0 */ if (test_f32_to_f16(0x33000001, 0x0001)) { return EXIT_FAILURE; } if (test_f32_to_f16(0x33000000, 0x0000)) { return EXIT_FAILURE; } if (test_f32_to_f16(0xB3000001, 0x8001)) { return EXIT_FAILURE; } if (test_f32_to_f16(0xB3000000, 0x8000)) { return EXIT_FAILURE; } /* Close to the denormal limit */ if (test_f32_to_f16(0x38800000, 0x0400)) { return EXIT_FAILURE; } if (test_f32_to_f16(0x387fffff, 0x0400)) { return EXIT_FAILURE; } /* NaN special value (Copy Intel intrinsics which set the MSB of the mantissa * of NaNs to 1 when converting f16 to f32). */ if (test_f16_to_f32(0xfcf8, 0xff9f0000)) { return EXIT_FAILURE; } #ifndef NSIMD_NO_IEEE754 if (via_fp16(mk_fp32(1, 20)) != infty) { fprintf(stdout, "... 
Error, %i \n", __LINE__); fflush(stdout); return EXIT_FAILURE; } if (via_fp16(mk_fp32(-1, 20)) != m_infty) { fprintf(stdout, "... Error, %i \n", __LINE__); fflush(stdout); return EXIT_FAILURE; } if (!nsimd_isnan_f32(via_fp16(nan))) { fprintf(stdout, "... Error, %i \n", __LINE__); fflush(stdout); return EXIT_FAILURE; } #endif /* Some random inputs */ for (i = 0; i < 100; i++) { float a = (float)rand() / (float)RAND_MAX; if (fabsf(a - via_fp16(a)) > ldexpf(1.0, -9)) { return EXIT_FAILURE; } } fprintf(stdout, "... OK\n"); fflush(stdout); return EXIT_SUCCESS; } ================================================ FILE: tests/get_pack.cpp ================================================ /* Copyright (c) 2020 Agenium Scale Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
*/ #define STATUS "test of get_pack over all types" #include "tests_helpers.hpp" // ---------------------------------------------------------------------------- // Little helper for scope memory // ---------------------------------------------------------------------------- template bool get_pack_from_pack_N_1() { LOG_TEST_DEBUG("get_pack_from_pack_N_1", T); nsimd::pack pack_1(42); nsimd::pack v0_get = nsimd::get_pack<0>(pack_1); nsimd::scoped_aligned_mem_for expected(NSIMD_MAX_LEN_BIT / 8); nsimd::scoped_aligned_mem_for computed(NSIMD_MAX_LEN_BIT / 8); return nsimd_tests::check_pack_expected_vs_computed( pack_1, v0_get, "nsimd::pack", "nsimd::pack", expected.get(), computed.get()); } // ---------------------------------------------------------------------------- template bool get_pack_from_packx2_N_3() { LOG_TEST_DEBUG("get_pack_from_packx2_N_3", T); nsimd::pack v0(42); nsimd::pack v1(24); nsimd::packx2 packx2_3; packx2_3.v0 = v0; packx2_3.v1 = v1; nsimd::scoped_aligned_mem_for expected(3 * NSIMD_MAX_REGISTER_SIZE_BYTES); nsimd::scoped_aligned_mem_for computed(3 * NSIMD_MAX_REGISTER_SIZE_BYTES); nsimd::pack v0_get = nsimd::get_pack<0>(packx2_3); if (!nsimd_tests::check_pack_expected_vs_computed( v0, v0_get, "nsimd::packx2.v0", "nsimd::pack", expected.get(), computed.get())) { return false; } nsimd::pack v1_get = nsimd::get_pack<1>(packx2_3); return nsimd_tests::check_pack_expected_vs_computed( v1, v1_get, "nsimd::packx2.v1", "nsimd::pack", expected.get(), computed.get()); } // ---------------------------------------------------------------------------- template bool get_pack_from_packx3_N_3() { LOG_TEST_DEBUG("get_pack_from_packx3_N_3", T); nsimd::pack v0(42); nsimd::pack v1(24); nsimd::pack v2(66); nsimd::packx3 packx3_3; packx3_3.v0 = v0; packx3_3.v1 = v1; packx3_3.v2 = v2; nsimd::scoped_aligned_mem_for expected(3 * NSIMD_MAX_REGISTER_SIZE_BYTES); nsimd::scoped_aligned_mem_for computed(3 * NSIMD_MAX_REGISTER_SIZE_BYTES); nsimd::pack v0_get = 
nsimd::get_pack<0>(packx3_3); if (!nsimd_tests::check_pack_expected_vs_computed( v0, v0_get, "nsimd::packx3.v0", "nsimd::pack", expected.get(), computed.get())) { return false; } nsimd::pack v1_get = nsimd::get_pack<1>(packx3_3); if (!nsimd_tests::check_pack_expected_vs_computed( v1, v1_get, "nsimd::packx3.v1", "nsimd::pack", expected.get(), computed.get())) { return false; } nsimd::pack v2_get = nsimd::get_pack<2>(packx3_3); return nsimd_tests::check_pack_expected_vs_computed( v2, v2_get, "nsimd::packx3.v2", "nsimd::pack", expected.get(), computed.get()); } // ---------------------------------------------------------------------------- template bool get_pack_from_packx4_N_3() { LOG_TEST_DEBUG("get_pack_from_packx4_N_3", T); nsimd::pack v0(42); nsimd::pack v1(24); nsimd::pack v2(66); nsimd::pack v3(90); nsimd::packx4 packx4_3; packx4_3.v0 = v0; packx4_3.v1 = v1; packx4_3.v2 = v2; packx4_3.v3 = v3; nsimd::scoped_aligned_mem_for expected(3 * NSIMD_MAX_REGISTER_SIZE_BYTES); nsimd::scoped_aligned_mem_for computed(3 * NSIMD_MAX_REGISTER_SIZE_BYTES); nsimd::pack v0_get = nsimd::get_pack<0>(packx4_3); if (!nsimd_tests::check_pack_expected_vs_computed( v0, v0_get, "nsimd::packx4.v0", "nsimd::pack", expected.get(), computed.get())) { return false; } nsimd::pack v1_get = nsimd::get_pack<1>(packx4_3); if (!nsimd_tests::check_pack_expected_vs_computed( v1, v1_get, "nsimd::packx4.v1", "nsimd::pack", expected.get(), computed.get())) { return false; } nsimd::pack v2_get = nsimd::get_pack<2>(packx4_3); if (!nsimd_tests::check_pack_expected_vs_computed( v2, v2_get, "nsimd::packx4.v2", "nsimd::pack", expected.get(), computed.get())) { return false; } nsimd::pack v3_get = nsimd::get_pack<3>(packx4_3); return nsimd_tests::check_pack_expected_vs_computed( v3, v3_get, "nsimd::packx4.v3", "nsimd::pack", expected.get(), computed.get()); } // ---------------------------------------------------------------------------- template bool test_all() { if (!get_pack_from_pack_N_1()) { return 0; } 
if (!get_pack_from_packx2_N_3()) { return 0; } if (!get_pack_from_packx3_N_3()) { return 0; } if (!get_pack_from_packx4_N_3()) { return 0; } return 1; } // ---------------------------------------------------------------------------- int main(void) { if (!test_all() || !test_all() || !test_all() || !test_all() || !test_all() || !test_all() || !test_all() || !test_all() || !test_all() || !test_all()) { return -1; } fprintf(stdout, STATUS "... OK\n"); fflush(stdout); return 0; } ================================================ FILE: tests/memory.cpp ================================================ /* Copyright (c) 2019 Agenium Scale Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
*/ #include #include #include int test_aligned_alloc() { void *ptr = nsimd_aligned_alloc(17); if (ptr == NULL || ((size_t)ptr % NSIMD_MAX_ALIGNMENT) != 0) { return EXIT_FAILURE; } nsimd_aligned_free(ptr); return EXIT_SUCCESS; } template int test_aligned_alloc_for() { void *ptr = nsimd::aligned_alloc(17); if (ptr == NULL || ((size_t)ptr % NSIMD_MAX_ALIGNMENT) != 0) { return EXIT_FAILURE; } nsimd::aligned_free(ptr); return EXIT_SUCCESS; } template int test_allocator_for() { std::vector< T, nsimd::allocator > v(17); if (v.size() != 17 || ((size_t)v.data() % NSIMD_MAX_ALIGNMENT) != 0) { return EXIT_FAILURE; } v.resize(17017); if (v.size() != 17017 || ((size_t)v.data() % NSIMD_MAX_ALIGNMENT) != 0) { return EXIT_FAILURE; } v.clear(); if (v.size() != 0) { return EXIT_FAILURE; } return EXIT_SUCCESS; } int main() { return test_aligned_alloc() || test_aligned_alloc_for() || test_aligned_alloc_for() || test_aligned_alloc_for() || test_aligned_alloc_for() || test_aligned_alloc_for() || test_aligned_alloc_for() || test_aligned_alloc_for() || test_aligned_alloc_for() || test_aligned_alloc_for() || test_aligned_alloc_for() || test_allocator_for() || test_allocator_for() || test_allocator_for() || test_allocator_for() || test_allocator_for() || test_allocator_for() || test_allocator_for() || test_allocator_for() || test_allocator_for() || test_allocator_for(); } ================================================ FILE: tests/memory.prec11.c ================================================ /* Copyright (c) 2019 Agenium Scale Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this 
permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ #include #include int main(void) { void *ptr = nsimd_aligned_alloc(17); if (ptr == NULL || ((size_t)ptr % NSIMD_MAX_ALIGNMENT) != 0) { return EXIT_FAILURE; } nsimd_aligned_free(ptr); return EXIT_SUCCESS; } ================================================ FILE: tests/modules/common.hpp ================================================ /* Copyright (c) 2021 Agenium Scale Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
*/ #ifndef NSIMD_MODULES_SPMD_COMMON_HPP #define NSIMD_MODULES_SPMD_COMMON_HPP #include #include #include #include #include #include // ---------------------------------------------------------------------------- // Common code for devices #if defined(NSIMD_CUDA) || defined(NSIMD_ROCM) template __device__ bool cmp_Ts(T a, T b) { return a == b; } __device__ bool cmp_Ts(__half a, __half b) { return __half_as_short(a) == __half_as_short(b); } __device__ bool cmp_Ts(float a, float b) { return __float_as_int(a) == __float_as_int(b); } __device__ bool cmp_Ts(double a, double b) { return __double_as_longlong(a) == __double_as_longlong(b); } #elif defined(NSIMD_ONEAPI) template bool cmp_Ts(const T a, const T b) { return a == b; } bool cmp_Ts(sycl::half a, const sycl::half b) { return nsimd::gpu_reinterpret(u16(), a) == nsimd::gpu_reinterpret(u16(), b); } bool cmp_Ts(sycl::cl_float a, sycl::cl_float b) { return nsimd::gpu_reinterpret(u32(), a) == nsimd::gpu_reinterpret(u32(), b); } bool cmp_Ts(sycl::cl_double a, sycl::cl_double b) { return nsimd::gpu_reinterpret(u64(), a) == nsimd::gpu_reinterpret(u64(), b); } #endif // ---------------------------------------------------------------------------- // CUDA #if defined(NSIMD_CUDA) // perform reduction on blocks first, note that this could be optimized // but to check correctness we don't need it now template __global__ void device_cmp_blocks(T *src1, T *src2, int n) { extern __shared__ char buf_[]; // size of a block T *buf = (T*)buf_; int tid = threadIdx.x; int i = tid + blockIdx.x * blockDim.x; if (i < n) { buf[tid] = T(cmp_Ts(src1[i], src2[i]) ? 
1 : 0); } const int block_start = blockIdx.x * blockDim.x; const int block_end = block_start + blockDim.x; int size; if (block_end < n) { size = blockDim.x; } else { size = n - block_start; } __syncthreads(); for (int s = size / 2; s != 0; s /= 2) { if (tid < s && i < n) { buf[tid] = nsimd::gpu_mul(buf[tid], buf[tid + s]); __syncthreads(); } } if (tid == 0) { src1[i] = buf[0]; } } template __global__ void device_cmp_array(int *dst, T *src1, int n) { // reduction on the whole vector T buf = T(1); for (int i = 0; i < n; i += blockDim.x) { buf = nsimd::gpu_mul(buf, src1[i]); } int i = threadIdx.x + blockIdx.x * blockDim.x; if (i == 0) { dst[0] = int(buf); } } template bool cmp(T *src1, T *src2, unsigned int n) { int host_ret; int *device_ret; if (cudaMalloc((void **)&device_ret, sizeof(int)) != cudaSuccess) { std::cerr << "ERROR: cannot cudaMalloc " << sizeof(int) << " bytes\n"; exit(EXIT_FAILURE); } device_cmp_blocks<<<(n + 127) / 128, 128, 128 * sizeof(T)>>>(src1, src2, int(n)); device_cmp_array<<<(n + 127) / 128, 128>>>(device_ret, src1, int(n)); cudaMemcpy((void *)&host_ret, (void *)device_ret, sizeof(int), cudaMemcpyDeviceToHost); cudaFree((void *)device_ret); return bool(host_ret); } template bool cmp(T *src1, T *src2, unsigned int n, int) { return cmp(src1, src2, n); } template void del(T *ptr) { cudaFree(ptr); } #elif defined(NSIMD_ROCM) // ---------------------------------------------------------------------------- // ROCm // perform reduction on blocks first, note that this could be optimized // but to check correctness we don't need it now template __global__ void device_cmp_blocks(T *src1, T *src2, size_t n) { extern __shared__ char buf_[]; // size of a block T *buf = (T*)buf_; size_t tid = hipThreadIdx_x; size_t i = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x; if (i < n) { buf[tid] = T(cmp_Ts(src1[i], src2[i]) ? 
1 : 0); } const size_t block_start = hipBlockIdx_x * hipBlockDim_x; const size_t block_end = block_start + hipBlockDim_x; size_t size; if (block_end < n) { size = hipBlockDim_x; } else { size = n - block_start; } __syncthreads(); for (size_t s = size / 2; s != 0; s /= 2) { if (tid < s && i < n) { buf[tid] = nsimd::gpu_mul(buf[tid], buf[tid + s]); __syncthreads(); } } if (tid == 0) { src1[i] = buf[0]; } } template __global__ void device_cmp_array(int *dst, T *src1, size_t n) { // reduction on the whole vector T buf = T(1); for (size_t i = 0; i < n; i += blockDim.x) { buf = nsimd::gpu_mul(buf, src1[i]); } size_t i = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x; if (i == 0) { dst[0] = int(buf); } } template bool cmp(T *src1, T *src2, size_t n) { int host_ret; int *device_ret; if (hipMalloc((void **)&device_ret, sizeof(int)) != hipSuccess) { return false; } hipLaunchKernelGGL(device_cmp_blocks, (n + 127) / 128, 128, 128 * sizeof(T), 0, src1, src2, n); hipLaunchKernelGGL(device_cmp_array, (n + 127) / 128, 128, 0, 0, device_ret, src1, n); hipMemcpy((void *)&host_ret, (void *)device_ret, sizeof(int), hipMemcpyDeviceToHost); hipFree((void *)device_ret); return bool(host_ret); } template bool cmp(T *src1, T *src2, size_t n, int) { return cmp(src1, src2, n); } template void del(T *ptr) { hipFree(ptr); } #elif defined(NSIMD_ONEAPI) // ---------------------------------------------------------------------------- // oneAPI // perform reduction on blocks first, note that this could be optimized // but to check correctness we don't need it now template void device_cmp_blocks(T *const src1, const T *const src2, const size_t n, sycl::accessor local_buffer, sycl::nd_item<1> item) { size_t tid = item.get_local_id().get(0); size_t i = item.get_global_id().get(0); if (i < n) { local_buffer[tid] = T(cmp_Ts(src1[i], src2[i]) ? 
1 : 0); } item.barrier(sycl::access::fence_space::local_space); // other approach: see book p 345 if (tid == 0) { sycl::ext::oneapi::sub_group sg = item.get_sub_group(); src1[i] = sycl::ext::oneapi::reduce_over_group( sg, local_buffer[0], sycl::ext::oneapi::multiplies()); } } template void device_cmp_array(int *const dst, const T *const src1, const size_t n, sycl::nd_item<1> item) { // reduction mul on the whole vector T buf = T(1); sycl::nd_range<1> nd_range = item.get_nd_range(); sycl::range<1> range = nd_range.get_local_range(); for (size_t i = 0; i < n; i += range.size()) { buf = nsimd::gpu_mul(buf, src1[i]); } size_t i = item.get_global_id().get(0); if (i == 0) { dst[0] = int(buf); } } template bool cmp(T *const src1, const T *const src2, unsigned int n) { const size_t total_num_threads = (size_t)nsimd_kernel_param(n, 128); sycl::queue q = nsimd::oneapi::default_queue(); sycl::event e1 = q.submit([=](sycl::handler &h) { sycl::accessor local_buffer(128, h); h.parallel_for(sycl::nd_range<1>(sycl::range<1>(total_num_threads), sycl::range<1>(128)), [=](sycl::nd_item<1> item_) { device_cmp_blocks(src1, src2, size_t(n), local_buffer, item_); }); }); e1.wait_and_throw(); int *device_ret = nsimd::device_calloc(n); if (device_ret == NULL) { std::cerr << "ERROR: cannot sycl::malloc_device " << sizeof(int) << " bytes\n"; exit(EXIT_FAILURE); } sycl::event e2 = q.parallel_for(sycl::nd_range<1>(sycl::range<1>(total_num_threads), sycl::range<1>(128)), [=](sycl::nd_item<1> item_) { device_cmp_array(device_ret, src1, size_t(n), item_); }); e2.wait_and_throw(); int host_ret; q.memcpy((void *)&host_ret, (void *)device_ret, sizeof(int)).wait(); nsimd::device_free(device_ret); return bool(host_ret); } template bool cmp(T *src1, T *src2, unsigned int n, double) { return cmp(src1, src2, n); } template void del(T *ptr) { sycl::queue q = nsimd::oneapi::default_queue(); sycl::free(ptr, q); } #else // ---------------------------------------------------------------------------- // SIMD 
template bool cmp(T *src1, T *src2, unsigned int n) { return memcmp(src1, src2, n * sizeof(T)) == 0; } template bool cmp(T *src1, T *src2, unsigned int n, int ufp) { for (unsigned int i = 0; i < n; i++) { if (nsimd::ufp(src1[i], src2[i]) < ufp) { return false; } } return true; } #endif // ---------------------------------------------------------------------------- #endif ================================================ FILE: tests/nsimd-all.cpp ================================================ /* Copyright (c) 2019 Agenium Scale Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
*/ #include #include #include // ---------------------------------------------------------------------------- int main() { using namespace nsimd; const int unroll = 3; typedef pack upack; const int n_max = unroll * NSIMD_MAX_LEN(f32); const int n = len(upack()); float buf[n_max]; for(int i = 0; i < n; i++) { buf[i] = float(i); } upack p = loadu(buf); p = -(p * p) + 1.0f; storeu(buf, p); for (int i = 0; i < n; i++) { fprintf(stdout, "%f vs %f\n", double(buf[i]), double(-(i * i) + 1)); } for (int i = 0; i < n; i++) { if (buf[i] != float(-(i * i) + 1)) { exit(EXIT_FAILURE); } } return 0; } ================================================ FILE: tests/nsimd.cpp ================================================ /* Copyright (c) 2019 Agenium Scale Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
*/ #include #include #include #include // ---------------------------------------------------------------------------- void test_native_register() { nsimd_cpu_vf32 a = nsimd_set1_cpu_f32(1.0f); nsimd::pack p1(a); nsimd::pack p2(1.0f); if (nsimd_any_cpu_f32(nsimd_ne_cpu_f32(a, p1.native_register()))) { exit(EXIT_FAILURE); } if (nsimd_any_cpu_f32(nsimd_ne_cpu_f32(a, nsimd::native_register(p1)))) { exit(EXIT_FAILURE); } if (nsimd_any_cpu_f32(nsimd_ne_cpu_f32(nsimd::native_register(a), nsimd::native_register(p1)))) { exit(EXIT_FAILURE); } if (nsimd_any_cpu_f32( nsimd_ne_cpu_f32(p2.native_register(), p1.native_register()))) { exit(EXIT_FAILURE); } } // ---------------------------------------------------------------------------- void test_output() { nsimd_cpu_vf32 a = nsimd_set1_cpu_f32(1.0f); if (nsimd_put_cpu_f32(stdout, NULL, a) == -1) { exit(EXIT_FAILURE); } if (nsimd_put_cpu_f32(stdout, "%f", a) == -1) { exit(EXIT_FAILURE); } fflush(stdout); nsimd::pack p1(a); nsimd::pack p2(1.0f); std::cout << p1 << std::endl << p2 << std::endl; } // ---------------------------------------------------------------------------- void test_unroll() { using namespace nsimd; const int unroll = 3; typedef pack upack; const int n_max = unroll * NSIMD_MAX_LEN(f32); const int n = len(upack()); float buf[n_max]; for(int i = 0; i < n; i++) { buf[i] = float(i); } upack p = loadu(buf); p = -(p * p); storeu(buf, p); for (int i = 0; i < n; i++) { fprintf(stdout, "%f vs %f\n", double(buf[i]), double(-i * i)); } for (int i = 0; i < n; i++) { if (buf[i] != float(-(i * i))) { exit(EXIT_FAILURE); } } } // ---------------------------------------------------------------------------- int main(void) { test_native_register(); test_output(); test_unroll(); return 0; } ================================================ FILE: tests/nsimd.prec11.c ================================================ /* Copyright (c) 2019 Agenium Scale Permission is hereby granted, free of charge, to any person obtaining a copy of 
this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ #include int main(void) { return 0; } ================================================ FILE: tests/operator_vector_scalar.cpp ================================================ /* Copyright (c) 2019 Agenium Scale Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ #include #include int main() { nsimd::pack a(1.0f); return (nsimd::any(a != 0) != 0 ? 0 : 1); } ================================================ FILE: tests/shifts.cpp ================================================ /* Copyright (c) 2019 Agenium Scale Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
*/ #include #include // ---------------------------------------------------------------------------- int main() { using namespace nsimd; const int unroll = 3; typedef pack upack; const int n_max = unroll * NSIMD_MAX_LEN(f32); const int n = len(upack()); unsigned int buf[n_max]; for(int i = 0; i < n; i++) { buf[i] = (unsigned int)i; } upack v = loadu(buf); if (any(((v << 4) >> 4) != v)) { exit(EXIT_FAILURE); } return 0; } ================================================ FILE: tests/templated_loads_stores.cpp ================================================ /* Copyright (c) 2019 Agenium Scale Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
*/ #include #include #include // ---------------------------------------------------------------------------- float *getmem(nsimd::aligned, int sz) { float *ret = (float *)nsimd::aligned_alloc(sz); if (ret == NULL) { std::cerr << "ERROR: cannot malloc aligned memory" << std::endl; } return ret; } float *getmem(nsimd::unaligned, int sz) { return getmem(nsimd::aligned(), 2 * sz) + 1; } // ---------------------------------------------------------------------------- template int test() { using namespace nsimd; f32 *buf = getmem(Alignment(), NSIMD_MAX_LEN(f32)); memset((void *)buf, 0, NSIMD_MAX_LEN(f32)); pack v = masko_load(packl(false), buf, set1 >(1.0f)); if (any(v != 1.0f)) { std::cerr << "[1]: v != [ 1.0f ... 1.0f ]" << std::endl; return -1; } v = load, Alignment>(buf); if (any(v != 0.0f)) { std::cerr << "[2]: v != [ 0.0f ... 0.0f ]" << std::endl; return -1; } v = set1 >(1.0f); store(buf, v); for (int i = 0; i < len(pack()); i++) { if (buf[i] != 1.0f) { std::cerr << "[3]: buf != [ 1.0f ... 1.0f ]" << std::endl; return -1; } } v = set1 >(2.0f); mask_store(packl(false), buf, v); for (int i = 0; i < len(pack()); i++) { if (buf[i] != 1.0f) { std::cerr << "[4]: buf != [ 1.0f ... 1.0f ]" << std::endl; return -1; } } v = maskz_load(packl(false), buf); if (any(v != 0.0f)) { std::cerr << "[5]: v != [ 0.0f ... 
0.0f ]" << std::endl; return -1; } return 0; } // ---------------------------------------------------------------------------- int main() { return test() || test(); } ================================================ FILE: tests/tests_helpers.hpp ================================================ #ifndef TESTS_HELPERS_HPP #define TESTS_HELPERS_HPP #include #include #include #include #include #include #define NSIMD_LOG_DEBUG 0 #define NSIMD_MAX_REGISTER_SIZE_BYTES NSIMD_MAX_LEN_BIT / 8 #define LOG_TEST_DEBUG(test_name, T) \ do { \ if (NSIMD_LOG_DEBUG) { \ fprintf(stdout, "%s%s%s%s%s", "\n--------- ", \ nsimd_tests::get_type_str(T()), ": ", test_name, \ "---------------\n\n"); \ } \ } while (0) #define LOG_MEMORY_CONTENT_DEBUG(vout, len_, memory_type) \ do { \ if (NSIMD_LOG_DEBUG) { \ nsimd_tests::print(vout, len_, memory_type); \ } \ } while (0) #define CHECK(a) \ { \ errno = 0; \ if (!(a)) { \ fprintf(stderr, "ERROR: " #a ":%d: %s\n", __LINE__, strerror(errno)); \ fflush(stderr); \ exit(EXIT_FAILURE); \ } \ } #define TEST_NSIMD_FALSE 0 #define TEST_NSIMD_TRUE 1 #define TEST_NSIMD_ERROR -1 /* ----------------------------------------------------------------------- */ namespace nsimd_tests { template int expected_not_equal_computed(const T expected, const T computed) { return expected != computed; } namespace fprintf_helper { // silent the warning for implicit conversion from ‘float’ to ‘double’ when // passing argument to fprintf template struct f64_if_f32_else_T { typedef T value_type; }; template <> struct f64_if_f32_else_T { typedef f64 value_type; }; const char *specifier(i8) { return "%hhu"; } const char *specifier(u8) { return "%hhu"; } const char *specifier(i16) { return "%hd"; } const char *specifier(u16) { return "%hu"; } const char *specifier(i32) { return "%d"; } const char *specifier(u32) { return "%u"; } const char *specifier(i64) { return "%ld"; } const char *specifier(u64) { return "%lu"; } const char *specifier(f32) { return "%f"; } const char 
*specifier(f64) { return "%f"; } } // namespace fprintf_helper const char *get_type_str(i8) { return "i8"; } const char *get_type_str(u8) { return "u8"; } const char *get_type_str(i16) { return "i16"; } const char *get_type_str(u16) { return "u16"; } const char *get_type_str(i32) { return "i32"; } const char *get_type_str(u32) { return "u32"; } const char *get_type_str(i64) { return "i64"; } const char *get_type_str(u64) { return "u64"; } const char *get_type_str(f32) { return "f32"; } const char *get_type_str(f64) { return "f64"; } template void print(T *const arr, const nsimd_nat len_, const char *msg) { fprintf(stdout, "%-24s: ", msg); char formatter[12]; strcpy(formatter, "%s"); strcat(formatter, fprintf_helper::specifier(T())); for (nsimd_nat ii = 0; ii < len_; ++ii) { fprintf( stdout, formatter, 0 == ii ? "{" : ", ", (typename fprintf_helper::f64_if_f32_else_T::value_type)arr[ii]); } fprintf(stdout, "%s", "}\n"); fflush(stdout); } template void init_arrays(T *const vout_expected, T *const vout_computed, const nsimd_nat len_) { for (nsimd_nat ii = 0; ii < len_; ++ii) { vout_expected[ii] = (T)-1; vout_computed[ii] = (T)1; } } /* ----------------------------- storea ---------------------------- */ // storea for all packx[Y]<1 .. N> Y in {1, 2, 3, 4} // struct storea_recurs_helper for packx[Y]<1 .. N> y in {2, 3, 4} // General definition template class pack_t, int VIx, bool EndRecurs> struct storea_recurs_helper {}; // Recursive case template class pack_t, int VIx> struct storea_recurs_helper { void operator()(T *const begin, const pack_t &pack_) const { nsimd::storea(begin, nsimd::get_pack(pack_)); storea_recurs_helper::soa_num_packs>()( begin + nsimd::len(nsimd::pack()), pack_); } }; // Base case template class pack_t, int VIx> struct storea_recurs_helper { void operator()(T *const begin, const pack_t &pack_) const { (void)begin; (void)pack_; } }; // storea function for packx[Y]<1 .. 
N> y in {2, 3, 4} template class pack_t> void storea__(T *const begin, const pack_t &pack_) { storea_recurs_helper::soa_num_packs>()(begin, pack_); } // storea for pack<1 .. N> template void storea__(T *const begin, const nsimd::pack &pack_) { nsimd::storea(begin, pack_); } /* ---------------------- check_arrays ------------------------------- */ template bool check_arrays(const T *const vout_expected, const T *const vout_computed, const nsimd_nat len_) { for (nsimd_nat ii = 0; ii < len_; ++ii) { if (expected_not_equal_computed(vout_expected[ii], vout_computed[ii])) { fprintf(stdout, STATUS "... FAIL\n"); fflush(stdout); return 0; } } return 1; } /* ---------------------- check_packs_content ------------------------ */ template class PackFrom, template class PackTo> bool check_pack_expected_vs_computed( const PackFrom &pack_from, const PackTo &pack_to, const char *from_type, const char *to_type, T *const vout_expected, T *const vout_computed) { if (nsimd::len(pack_from) != nsimd::len(pack_to)) { return 0; } const nsimd_nat len_ = (nsimd_nat)(nsimd::len(pack_to)); init_arrays(vout_expected, vout_computed, len_); storea__(vout_expected, pack_from); LOG_MEMORY_CONTENT_DEBUG(vout_expected, nsimd::len(pack_from), from_type); nsimd::storea(vout_computed, pack_to); LOG_MEMORY_CONTENT_DEBUG(vout_computed, nsimd::len(pack_to), to_type); if (!check_arrays(vout_expected, vout_computed, len_)) { return 0; } return 1; } } // namespace nsimd_tests #endif ================================================ FILE: tests/to_pack.cpp ================================================ #define STATUS "test of to_pack over all types" #include "tests_helpers.hpp" template bool to_pack_from_pack_1_N_1() { LOG_TEST_DEBUG("to_pack_from_pack_1_N_1", T); nsimd::pack pack_from(42); nsimd::pack pack_to = nsimd::to_pack(pack_from); nsimd::scoped_aligned_mem_for expected(NSIMD_MAX_REGISTER_SIZE_BYTES); nsimd::scoped_aligned_mem_for computed(NSIMD_MAX_REGISTER_SIZE_BYTES); return 
nsimd_tests::check_pack_expected_vs_computed( pack_from, pack_to, "nsimd::pack", "nsimd::pack", expected.get(), computed.get()); } template bool to_pack_from_packx2_N_1() { LOG_TEST_DEBUG("to_pack_from_packx2_N_1", T); nsimd::pack v0(42); nsimd::pack v1(24); nsimd::packx2 pack_from; pack_from.v0 = v0; pack_from.v1 = v1; nsimd::pack pack_to = nsimd::to_pack(pack_from); nsimd::scoped_aligned_mem_for expected(2 * NSIMD_MAX_REGISTER_SIZE_BYTES); nsimd::scoped_aligned_mem_for computed(2 * NSIMD_MAX_REGISTER_SIZE_BYTES); return nsimd_tests::check_pack_expected_vs_computed( pack_from, pack_to, "nsimd::packx2", "nsimd::pack", expected.get(), computed.get()); } template bool to_pack_from_packx3_N_1() { LOG_TEST_DEBUG("to_pack_from_packx3_N_1", T); nsimd::pack v0(42); nsimd::pack v1(24); nsimd::pack v2(66); nsimd::packx3 pack_from; pack_from.v0 = v0; pack_from.v1 = v1; pack_from.v2 = v2; nsimd::pack pack_to = nsimd::to_pack(pack_from); nsimd::scoped_aligned_mem_for expected(3 * NSIMD_MAX_REGISTER_SIZE_BYTES); nsimd::scoped_aligned_mem_for computed(3 * NSIMD_MAX_REGISTER_SIZE_BYTES); return nsimd_tests::check_pack_expected_vs_computed( pack_from, pack_to, "nsimd::packx3", "nsimd::pack", expected.get(), computed.get()); } template bool to_pack_from_packx2_N_2() { LOG_TEST_DEBUG("to_pack_from_packx2_N_2", T); nsimd::pack v0(42); nsimd::pack v1(24); nsimd::packx2 pack_from; pack_from.v0 = v0; pack_from.v1 = v1; nsimd::pack pack_to = nsimd::to_pack(pack_from); nsimd::scoped_aligned_mem_for expected(4 * NSIMD_MAX_REGISTER_SIZE_BYTES); nsimd::scoped_aligned_mem_for computed(4 * NSIMD_MAX_REGISTER_SIZE_BYTES); return nsimd_tests::check_pack_expected_vs_computed( pack_from, pack_to, "nsimd::packx2", "nsimd::pack", expected.get(), computed.get()); } template bool to_pack_from_packx2_N_3() { LOG_TEST_DEBUG("to_pack_from_packx2_N_3", T); nsimd::pack v0(42); nsimd::pack v1(24); nsimd::packx2 pack_from; pack_from.v0 = v0; pack_from.v1 = v1; nsimd::pack pack_to = nsimd::to_pack(pack_from); 
nsimd::scoped_aligned_mem_for expected(6 * NSIMD_MAX_REGISTER_SIZE_BYTES); nsimd::scoped_aligned_mem_for computed(6 * NSIMD_MAX_REGISTER_SIZE_BYTES); return nsimd_tests::check_pack_expected_vs_computed( pack_from, pack_to, "nsimd::packx2", "nsimd::pack", expected.get(), computed.get()); } template bool to_pack_from_packx3_N_2() { LOG_TEST_DEBUG("to_pack_from_packx3_N_2", T); nsimd::pack v0(42); nsimd::pack v1(24); nsimd::pack v2(66); nsimd::packx3 pack_from; pack_from.v0 = v0; pack_from.v1 = v1; pack_from.v2 = v2; nsimd::pack pack_to = nsimd::to_pack(pack_from); nsimd::scoped_aligned_mem_for expected(6 * NSIMD_MAX_REGISTER_SIZE_BYTES); nsimd::scoped_aligned_mem_for computed(6 * NSIMD_MAX_REGISTER_SIZE_BYTES); return nsimd_tests::check_pack_expected_vs_computed( pack_from, pack_to, "nsimd::packx3", "nsimd::pack", expected.get(), computed.get()); } template bool to_pack_from_packx3_N_3() { LOG_TEST_DEBUG("to_pack_from_packx3_N_3", T); nsimd::pack v0(42); nsimd::pack v1(24); nsimd::pack v2(66); nsimd::packx3 pack_from; pack_from.v0 = v0; pack_from.v1 = v1; pack_from.v2 = v2; nsimd::pack pack_to = nsimd::to_pack(pack_from); nsimd::scoped_aligned_mem_for expected(9 * NSIMD_MAX_REGISTER_SIZE_BYTES); nsimd::scoped_aligned_mem_for computed(9 * NSIMD_MAX_REGISTER_SIZE_BYTES); return nsimd_tests::check_pack_expected_vs_computed( pack_from, pack_to, "nsimd::packx3", "nsimd::pack", expected.get(), computed.get()); } template bool to_pack_from_packx4_N_1() { LOG_TEST_DEBUG("to_pack_from_packx4_N_1", T); nsimd::pack v0(42); nsimd::pack v1(24); nsimd::pack v2(66); nsimd::pack v3(132); nsimd::packx4 pack_from; pack_from.v0 = v0; pack_from.v1 = v1; pack_from.v2 = v2; pack_from.v3 = v3; nsimd::pack pack_to = nsimd::to_pack(pack_from); nsimd::scoped_aligned_mem_for expected(4 * NSIMD_MAX_REGISTER_SIZE_BYTES); nsimd::scoped_aligned_mem_for computed(4 * NSIMD_MAX_REGISTER_SIZE_BYTES); return nsimd_tests::check_pack_expected_vs_computed( pack_from, pack_to, "nsimd::packx4", 
"nsimd::pack", expected.get(), computed.get()); } template bool to_pack_from_packx4_N_2() { LOG_TEST_DEBUG("to_pack_from_packx4_N_2", T); nsimd::pack v0(42); nsimd::pack v1(24); nsimd::pack v2(66); nsimd::pack v3(132); nsimd::packx4 pack_from; pack_from.v0 = v0; pack_from.v1 = v1; pack_from.v2 = v2; pack_from.v3 = v3; nsimd::pack pack_to = nsimd::to_pack(pack_from); nsimd::scoped_aligned_mem_for expected(8 * NSIMD_MAX_REGISTER_SIZE_BYTES); nsimd::scoped_aligned_mem_for computed(8 * NSIMD_MAX_REGISTER_SIZE_BYTES); return nsimd_tests::check_pack_expected_vs_computed( pack_from, pack_to, "nsimd::packx4", "nsimd::pack", expected.get(), computed.get()); } template bool to_pack_from_packx4_N_3() { LOG_TEST_DEBUG("to_pack_from_packx4_N_3", T); nsimd::pack v0(42); nsimd::pack v1(24); nsimd::pack v2(66); nsimd::pack v3(132); nsimd::packx4 pack_from; pack_from.v0 = v0; pack_from.v1 = v1; pack_from.v2 = v2; pack_from.v3 = v3; nsimd::pack pack_to = nsimd::to_pack(pack_from); nsimd::scoped_aligned_mem_for expected(12 * NSIMD_MAX_REGISTER_SIZE_BYTES); nsimd::scoped_aligned_mem_for computed(12 * NSIMD_MAX_REGISTER_SIZE_BYTES); return nsimd_tests::check_pack_expected_vs_computed( pack_from, pack_to, "nsimd::packx4", "nsimd::pack", expected.get(), computed.get()); } template bool test_all() { if (!to_pack_from_pack_1_N_1()) { return 0; } if (!to_pack_from_packx2_N_1()) { return 0; } if (!to_pack_from_packx2_N_2()) { return 0; } if (!to_pack_from_packx2_N_3()) { return 0; } if (!to_pack_from_packx3_N_1()) { return 0; } if (!to_pack_from_packx3_N_2()) { return 0; } if (!to_pack_from_packx3_N_3()) { return 0; } if (!to_pack_from_packx4_N_1()) { return 0; } if (!to_pack_from_packx4_N_2()) { return 0; } if (!to_pack_from_packx4_N_3()) { return 0; } return 1; } int main(void) { if (!test_all() || !test_all() || !test_all() || !test_all() || !test_all() || !test_all() || !test_all() || !test_all() || !test_all() || !test_all()) { return -1; } fprintf(stdout, STATUS "... 
OK\n"); fflush(stdout); return 0; } ================================================ FILE: tests/to_pack_interleave.cpp ================================================ #define STATUS "test of to_pack_interleave over all types" #include "tests_helpers.hpp" template bool to_pack_interleave_from_pack_1_N_1() { LOG_TEST_DEBUG("to_pack_interleave_from_pack_1_N_1", T); nsimd::pack pack_from(42); nsimd::pack pack_to = nsimd::to_pack_interleave(pack_from); nsimd::scoped_aligned_mem_for expected(NSIMD_MAX_REGISTER_SIZE_BYTES); nsimd::scoped_aligned_mem_for computed(NSIMD_MAX_REGISTER_SIZE_BYTES); return nsimd_tests::check_pack_expected_vs_computed( pack_from, pack_to, "nsimd::pack", "nsimd::pack", expected.get(), computed.get()); } template bool to_pack_interleave_from_packx2_N_1() { LOG_TEST_DEBUG("to_pack_interleave_from_packx2_N_1", T); nsimd::pack v0(42); nsimd::pack v1(24); nsimd::packx2 pack_from; pack_from.v0 = v0; pack_from.v1 = v1; nsimd::scoped_aligned_mem_for expected(2 * NSIMD_MAX_REGISTER_SIZE_BYTES); nsimd::scoped_aligned_mem_for computed(2 * NSIMD_MAX_REGISTER_SIZE_BYTES); const int len_ = nsimd::len(nsimd::packx2()); nsimd_tests::init_arrays(expected.get(), computed.get(), len_); T *begin = expected.get(); nsimd::storea(begin, nsimd::pack(pack_from.v0.car)); begin += nsimd::len(nsimd::pack()); nsimd::storea(begin, nsimd::pack(pack_from.v1.car)); LOG_MEMORY_CONTENT_DEBUG(expected.get(), len_, "nsimd::packx2"); nsimd::pack pack_to = nsimd::to_pack_interleave(pack_from); nsimd::storea(computed.get(), pack_to); LOG_MEMORY_CONTENT_DEBUG(computed.get(), len_, "nsimd::pack"); return nsimd_tests::check_arrays(expected.get(), computed.get(), len_); } template bool to_pack_interleave_from_packx2_N_2() { LOG_TEST_DEBUG("to_pack_interleave_from_packx2_N_2", T); nsimd::pack v0(42); nsimd::pack v1(24); nsimd::packx2 pack_from; pack_from.v0 = v0; pack_from.v1 = v1; nsimd::scoped_aligned_mem_for expected(4 * NSIMD_MAX_REGISTER_SIZE_BYTES); nsimd::scoped_aligned_mem_for 
computed(4 * NSIMD_MAX_REGISTER_SIZE_BYTES); const int len_ = nsimd::len(nsimd::packx2()); nsimd_tests::init_arrays(expected.get(), computed.get(), len_); T *begin = expected.get(); nsimd::storea(begin, nsimd::pack(pack_from.v0.car)); begin += nsimd::len(nsimd::pack()); nsimd::storea(begin, nsimd::pack(pack_from.v1.car)); begin += nsimd::len(nsimd::pack()); nsimd::storea(begin, nsimd::pack(pack_from.v0.cdr.car)); begin += nsimd::len(nsimd::pack()); nsimd::storea(begin, nsimd::pack(pack_from.v1.cdr.car)); LOG_MEMORY_CONTENT_DEBUG(expected.get(), len_, "nsimd::packx2"); nsimd::pack pack_to = nsimd::to_pack_interleave(pack_from); nsimd::storea(computed.get(), pack_to); LOG_MEMORY_CONTENT_DEBUG(computed.get(), len_, "nsimd::pack"); return nsimd_tests::check_arrays(expected.get(), computed.get(), len_); } template bool to_pack_interleave_from_packx3_N_2() { LOG_TEST_DEBUG("to_pack_interleave_from_packx3_N_2", T); nsimd::pack v0(42); nsimd::pack v1(24); nsimd::pack v2(66); nsimd::packx3 pack_from; pack_from.v0 = v0; pack_from.v1 = v1; pack_from.v2 = v2; nsimd::scoped_aligned_mem_for expected(6 * NSIMD_MAX_REGISTER_SIZE_BYTES); nsimd::scoped_aligned_mem_for computed(6 * NSIMD_MAX_REGISTER_SIZE_BYTES); const int len_ = nsimd::len(nsimd::packx3()); nsimd_tests::init_arrays(expected.get(), computed.get(), len_); T *begin = expected.get(); nsimd::storea(begin, nsimd::pack(pack_from.v0.car)); begin += nsimd::len(nsimd::pack()); nsimd::storea(begin, nsimd::pack(pack_from.v1.car)); begin += nsimd::len(nsimd::pack()); nsimd::storea(begin, nsimd::pack(pack_from.v2.car)); begin += nsimd::len(nsimd::pack()); nsimd::storea(begin, nsimd::pack(pack_from.v0.cdr.car)); begin += nsimd::len(nsimd::pack()); nsimd::storea(begin, nsimd::pack(pack_from.v1.cdr.car)); begin += nsimd::len(nsimd::pack()); nsimd::storea(begin, nsimd::pack(pack_from.v2.cdr.car)); LOG_MEMORY_CONTENT_DEBUG(expected.get(), len_, "nsimd::packx3"); nsimd::pack pack_to = nsimd::to_pack_interleave(pack_from); 
nsimd::storea(computed.get(), pack_to); LOG_MEMORY_CONTENT_DEBUG(computed.get(), len_, "nsimd::pack"); return nsimd_tests::check_arrays(expected.get(), computed.get(), len_); } template bool to_pack_interleave_from_packx3_N_3() { LOG_TEST_DEBUG("to_pack_interleave_from_packx3_N_3", T); nsimd::pack v0(42); nsimd::pack v1(24); nsimd::pack v2(66); nsimd::packx3 pack_from; pack_from.v0 = v0; pack_from.v1 = v1; pack_from.v2 = v2; nsimd::scoped_aligned_mem_for expected(9 * NSIMD_MAX_REGISTER_SIZE_BYTES); nsimd::scoped_aligned_mem_for computed(9 * NSIMD_MAX_REGISTER_SIZE_BYTES); const int len_ = nsimd::len(nsimd::packx3()); nsimd_tests::init_arrays(expected.get(), computed.get(), len_); T *begin = expected.get(); nsimd::storea(begin, nsimd::pack(pack_from.v0.car)); begin += nsimd::len(nsimd::pack()); nsimd::storea(begin, nsimd::pack(pack_from.v1.car)); begin += nsimd::len(nsimd::pack()); nsimd::storea(begin, nsimd::pack(pack_from.v2.car)); begin += nsimd::len(nsimd::pack()); nsimd::storea(begin, nsimd::pack(pack_from.v0.cdr.car)); begin += nsimd::len(nsimd::pack()); nsimd::storea(begin, nsimd::pack(pack_from.v1.cdr.car)); begin += nsimd::len(nsimd::pack()); nsimd::storea(begin, nsimd::pack(pack_from.v2.cdr.car)); begin += nsimd::len(nsimd::pack()); nsimd::storea(begin, nsimd::pack(pack_from.v0.cdr.cdr.car)); begin += nsimd::len(nsimd::pack()); nsimd::storea(begin, nsimd::pack(pack_from.v1.cdr.cdr.car)); begin += nsimd::len(nsimd::pack()); nsimd::storea(begin, nsimd::pack(pack_from.v2.cdr.cdr.car)); LOG_MEMORY_CONTENT_DEBUG(expected.get(), len_, "nsimd::packx3"); nsimd::pack pack_to = nsimd::to_pack_interleave(pack_from); nsimd::storea(computed.get(), pack_to); LOG_MEMORY_CONTENT_DEBUG(computed.get(), len_, "nsimd::pack"); return nsimd_tests::check_arrays(expected.get(), computed.get(), len_); } template bool to_pack_interleave_from_packx4_N_1() { LOG_TEST_DEBUG("to_pack_interleave_from_packx4_N_1", T); nsimd::pack v0(42); nsimd::pack v1(24); nsimd::pack v2(66); 
nsimd::pack v3(132); nsimd::packx4 pack_from; pack_from.v0 = v0; pack_from.v1 = v1; pack_from.v2 = v2; pack_from.v3 = v3; nsimd::scoped_aligned_mem_for expected(4 * NSIMD_MAX_REGISTER_SIZE_BYTES); nsimd::scoped_aligned_mem_for computed(4 * NSIMD_MAX_REGISTER_SIZE_BYTES); const int len_ = nsimd::len(nsimd::packx4()); nsimd_tests::init_arrays(expected.get(), computed.get(), len_); T *begin = expected.get(); nsimd::storea(begin, nsimd::pack(pack_from.v0.car)); begin += nsimd::len(nsimd::pack()); nsimd::storea(begin, nsimd::pack(pack_from.v1.car)); begin += nsimd::len(nsimd::pack()); nsimd::storea(begin, nsimd::pack(pack_from.v2.car)); begin += nsimd::len(nsimd::pack()); nsimd::storea(begin, nsimd::pack(pack_from.v3.car)); LOG_MEMORY_CONTENT_DEBUG(expected.get(), len_, "nsimd::packx4"); nsimd::pack pack_to = nsimd::to_pack_interleave(pack_from); nsimd::storea(computed.get(), pack_to); LOG_MEMORY_CONTENT_DEBUG(computed.get(), len_, "nsimd::pack"); return nsimd_tests::check_arrays(expected.get(), computed.get(), len_); } template bool to_pack_interleave_from_packx4_N_2() { LOG_TEST_DEBUG("to_pack_interleave_from_packx4_N_2", T); nsimd::pack v0(42); nsimd::pack v1(24); nsimd::pack v2(66); nsimd::pack v3(132); nsimd::packx4 pack_from; pack_from.v0 = v0; pack_from.v1 = v1; pack_from.v2 = v2; pack_from.v3 = v3; nsimd::scoped_aligned_mem_for expected(8 * NSIMD_MAX_REGISTER_SIZE_BYTES); nsimd::scoped_aligned_mem_for computed(8 * NSIMD_MAX_REGISTER_SIZE_BYTES); const int len_ = nsimd::len(nsimd::packx4()); nsimd_tests::init_arrays(expected.get(), computed.get(), len_); T *begin = expected.get(); nsimd::storea(begin, nsimd::pack(pack_from.v0.car)); begin += nsimd::len(nsimd::pack()); nsimd::storea(begin, nsimd::pack(pack_from.v1.car)); begin += nsimd::len(nsimd::pack()); nsimd::storea(begin, nsimd::pack(pack_from.v2.car)); begin += nsimd::len(nsimd::pack()); nsimd::storea(begin, nsimd::pack(pack_from.v3.car)); begin += nsimd::len(nsimd::pack()); nsimd::storea(begin, 
nsimd::pack(pack_from.v0.cdr.car)); begin += nsimd::len(nsimd::pack()); nsimd::storea(begin, nsimd::pack(pack_from.v1.cdr.car)); begin += nsimd::len(nsimd::pack()); nsimd::storea(begin, nsimd::pack(pack_from.v2.cdr.car)); begin += nsimd::len(nsimd::pack()); nsimd::storea(begin, nsimd::pack(pack_from.v3.cdr.car)); LOG_MEMORY_CONTENT_DEBUG(expected.get(), len_, "nsimd::packx4"); nsimd::pack pack_to = nsimd::to_pack_interleave(pack_from); nsimd::storea(computed.get(), pack_to); LOG_MEMORY_CONTENT_DEBUG(computed.get(), len_, "nsimd::pack"); return nsimd_tests::check_arrays(expected.get(), computed.get(), len_); } template bool to_pack_interleave_from_packx4_N_3() { LOG_TEST_DEBUG("to_pack_interleave_from_packx4_N_3", T); nsimd::pack v0(42); nsimd::pack v1(24); nsimd::pack v2(66); nsimd::pack v3(132); nsimd::packx4 pack_from; pack_from.v0 = v0; pack_from.v1 = v1; pack_from.v2 = v2; pack_from.v3 = v3; nsimd::scoped_aligned_mem_for expected(12 * NSIMD_MAX_REGISTER_SIZE_BYTES); nsimd::scoped_aligned_mem_for computed(12 * NSIMD_MAX_REGISTER_SIZE_BYTES); const int len_ = nsimd::len(nsimd::packx4()); nsimd_tests::init_arrays(expected.get(), computed.get(), len_); T *begin = expected.get(); nsimd::storea(begin, nsimd::pack(pack_from.v0.car)); begin += nsimd::len(nsimd::pack()); nsimd::storea(begin, nsimd::pack(pack_from.v1.car)); begin += nsimd::len(nsimd::pack()); nsimd::storea(begin, nsimd::pack(pack_from.v2.car)); begin += nsimd::len(nsimd::pack()); nsimd::storea(begin, nsimd::pack(pack_from.v3.car)); begin += nsimd::len(nsimd::pack()); nsimd::storea(begin, nsimd::pack(pack_from.v0.cdr.car)); begin += nsimd::len(nsimd::pack()); nsimd::storea(begin, nsimd::pack(pack_from.v1.cdr.car)); begin += nsimd::len(nsimd::pack()); nsimd::storea(begin, nsimd::pack(pack_from.v2.cdr.car)); begin += nsimd::len(nsimd::pack()); nsimd::storea(begin, nsimd::pack(pack_from.v3.cdr.car)); begin += nsimd::len(nsimd::pack()); nsimd::storea(begin, nsimd::pack(pack_from.v0.cdr.cdr.car)); begin += 
nsimd::len(nsimd::pack()); nsimd::storea(begin, nsimd::pack(pack_from.v1.cdr.cdr.car)); begin += nsimd::len(nsimd::pack()); nsimd::storea(begin, nsimd::pack(pack_from.v2.cdr.cdr.car)); begin += nsimd::len(nsimd::pack()); nsimd::storea(begin, nsimd::pack(pack_from.v3.cdr.cdr.car)); LOG_MEMORY_CONTENT_DEBUG(expected.get(), len_, "nsimd::packx4"); nsimd::pack pack_to = nsimd::to_pack_interleave(pack_from); nsimd::storea(computed.get(), pack_to); LOG_MEMORY_CONTENT_DEBUG(computed.get(), len_, "nsimd::pack"); return nsimd_tests::check_arrays(expected.get(), computed.get(), len_); } template bool test_all() { if (!to_pack_interleave_from_pack_1_N_1()) { return 0; } if (!to_pack_interleave_from_packx2_N_1()) { return 0; } if (!to_pack_interleave_from_packx2_N_2()) { return 0; } if (!to_pack_interleave_from_packx3_N_2()) { return 0; } if (!to_pack_interleave_from_packx3_N_3()) { return 0; } if (!to_pack_interleave_from_packx4_N_1()) { return 0; } if (!to_pack_interleave_from_packx4_N_2()) { return 0; } if (!to_pack_interleave_from_packx4_N_3()) { return 0; } return 1; } int main(void) { if (!test_all() || !test_all() || !test_all() || !test_all() || !test_all() || !test_all() || !test_all() || !test_all() || !test_all() || !test_all()) { return -1; } fprintf(stdout, STATUS "... 
OK\n"); fflush(stdout); return 0; } ================================================ FILE: tests/ufp.cpp ================================================ /* Copyright (c) 2019 Agenium Scale Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ #include #include // ---------------------------------------------------------------------------- template U randbits() { U ret = 0; U mask = ((U)1 << CHAR_BIT) - 1; for (int i = 0; i < (int)sizeof(U); i++) { ret = (U)(ret | (U)((((U)rand()) & mask) << (CHAR_BIT * i))); } return ret; } // ---------------------------------------------------------------------------- template int log_std_ulp(U a, U b) { U d = (U)(a < b ? 
b - a : a - b); int i = 0; for (; i < 63 && d >= (U)1 << i; i++) ; return i; } // ---------------------------------------------------------------------------- template struct mantissa{}; template <> struct mantissa { static const int size = 53; }; template <> struct mantissa { static const int size = 24; }; template <> struct mantissa { static const int size = 11; }; // ---------------------------------------------------------------------------- template int test_ufp(int n) { T a = nsimd::scalar_cvt(T(), (U)1); U ua = nsimd::scalar_reinterpret(U(), a); T ap1 = nsimd::scalar_reinterpret(T(), (U)(ua + 1)); if (nsimd::ufp(a, ap1) != mantissa::size - 1) { return -1; } T am1 = nsimd::scalar_reinterpret(T(), (U)(ua - 1)); if (nsimd::ufp(a, am1) != mantissa::size - 1) { return -1; } if (nsimd::ufp(a, a) != mantissa::size) { return -1; } if (nsimd::ufp(a, a) != mantissa::size) { return -1; } if (nsimd::ufp(a, a) != mantissa::size) { return -1; } T ax4 = nsimd::scalar_cvt(T(), (U)4); if (nsimd::ufp(a, ax4) != 0) { return -1; } U mask = (U)1 << (mantissa::size - 1); U exponent = (U)((~mask) & ua); for (int i = 0; i < n; i++) { U ub = exponent | (randbits() & mask); T b = nsimd::scalar_reinterpret(T(), ub); U uc = exponent | (randbits() & mask); T c = nsimd::scalar_reinterpret(T(), uc); if (nsimd::ufp(b, c) != mantissa::size - log_std_ulp(ub, uc)) { return -1; } } return 0; } // ---------------------------------------------------------------------------- int main(void) { int n = 10000; return test_ufp(n) || test_ufp(n) || test_ufp(n); }