[
  {
    "path": ".clang-format",
    "content": "Standard: Cpp03\nColumnLimit: 79\n"
  },
  {
    "path": ".gitignore",
    "content": "# Common build dirs\nbuild*/\n\n# Dependencies\nnstools/\n\n# Binaries\n*.o\n*.so\n*.pyc\n*.exe\n*.dll\n*.dylib\n\n# Generated files\n## API\nsrc/api_*.cpp\nsrc/api_*\n\n## Platform specific code\ninclude/nsimd/arm\ninclude/nsimd/cpu\ninclude/nsimd/cxx_adv_api_functions.hpp\ninclude/nsimd/friendly_but_not_optimized.hpp\ninclude/nsimd/functions.h\ninclude/nsimd/ppc\ninclude/nsimd/x86\n\n## Tests\ntests/c_base\ntests/cxx_base\ntests/cxx_adv\ntests/modules/tet1d/\ntests/modules/fixed_point/\ntests/modules/rand/*.cpp\ntests/modules/spmd/\ntests/modules/random/\n\n## Benches\nbenches/cxx_adv\n\n## Modules\ninclude/nsimd/modules/tet1d/\ninclude/nsimd/modules/spmd/\ninclude/nsimd/modules/fixed_point/\ninclude/nsimd/scalar_utilities.h\n\n## Doc\ndoc/html/*\n!doc/html/assets/\ndoc/markdown/overview.md\ndoc/markdown/api.md\ndoc/markdown/api_*.md\ndoc/markdown/module_fixed_point_api*.md\ndoc/markdown/module_fixed_point_overview.md\ndoc/markdown/module_spmd_api*.md\ndoc/markdown/module_spmd_overview.md\ndoc/markdown/module_memory_management_overview.md\ndoc/md2html\ndoc/tmp.html\n\n## Ulps\nulps/\n\n## CI\n_ci/\n"
  },
  {
    "path": "CMakeLists.txt",
    "content": "# MIT License\n#\n# Copyright (c) 2021 Agenium Scale\n#\n# Permission is hereby granted, free of charge, to any person obtaining a copy\n# of this software and associated documentation files (the \"Software\"), to deal\n# in the Software without restriction, including without limitation the rights\n# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell\n# copies of the Software, and to permit persons to whom the Software is\n# furnished to do so, subject to the following conditions:\n#\n# The above copyright notice and this permission notice shall be included in all\n# copies or substantial portions of the Software.\n#\n# THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\n# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\n# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\n# SOFTWARE.\n\ncmake_minimum_required(VERSION 3.0.2)\nproject(NSIMD VERSION 3.0 LANGUAGES C CXX)\n\n# -----------------------------------------------------------------------------\n# First check that NSIMD code has been generated\n\nif (NOT EXISTS \"${CMAKE_CURRENT_SOURCE_DIR}/include/nsimd/functions.h\")\n  if (WIN32)\n    execute_process(COMMAND\n                    python ${CMAKE_CURRENT_SOURCE_DIR}\\\\egg\\\\hatch.py -lf)\n  else()\n    execute_process(COMMAND\n                    python3 ${CMAKE_CURRENT_SOURCE_DIR}/egg/hatch.py -lf)\n  endif()\nendif()\n\n# -----------------------------------------------------------------------------\n# Compilations options\n\noption(NSIMD_ARM32_IS_ARMEL \"Set whether ARM32 is in fact armel or armhf\" ON)\n\nfunction(nsimd_get_compiler_argument simd_ext argument)\n  if (MSVC)\n    if (CMAKE_CL_64)\n      
set(mapping_sse2 \"/DSSE2\")\n      set(mapping_sse42 \"/DSSE42\")\n    else()  \n      set(mapping_sse2 \"/DSSE2;/arch:SSE2\")\n      set(mapping_sse42 \"/DSSE42;/arch:SSE2\")\n    endif()\n    set(mapping_avx \"/DAVX;/arch:AVX\")\n    set(mapping_avx2 \"/DAVX2;/arch:AVX2\")\n    set(mapping_avx512_knl \"/DAVX512_KNL;/arch:AVX512\")\n    set(mapping_avx512_skylake \"/DAVX512_SKYLAKE;/arch:AVX512\")\n    set(mapping_neon128 \"/DNEON128;/arch:VFPv4\")\n    set(mapping_aarch64 \"/DAARCH64\")\n    set(mapping_sve \"/DSVE\")\n    set(mapping_sve128 \"/DSVE128\")\n    set(mapping_sve256 \"/DSVE256\")\n    set(mapping_sve512 \"/DSVE512\")\n    set(mapping_sve1024 \"/DSVE1024\")\n    set(mapping_sve2048 \"/DSVE2048\")\n    set(mapping_vmx \"/DVMX\")\n    set(mapping_vsx \"/DVSX\")\n    set(mapping_cuda \"/DCUDA\")\n    set(mapping_rocm \"/DROCM\")\n    set(mapping_oneapi \"/DONEAPI\")\n  else()\n    set(mapping_sse2 \"-DSSE2;-msse2\" )\n    set(mapping_sse42 \"-DSSE42;-msse4.2\" )\n    set(mapping_avx \"-DAVX;-mavx;-mno-avx256-split-unaligned-load\"\n                    \";-mno-avx256-split-unaligned-store\" )\n    set(mapping_avx2 \"-DAVX2;-mavx2;-mfma;-mno-avx256-split-unaligned-load\"\n                     \";-mno-avx256-split-unaligned-store\" )\n    set(mapping_avx512_knl \"-DAVX512_KNL;-mavx512f;-mavx512pf;-mavx512er\"\n                           \";-mavx512cd\")\n    set(mapping_avx512_skylake \"-DAVX512_SKYLAKE;-mavx512f;-mavx512dq\"\n                               \";-mavx512cd;-mavx512bw;-mavx512vl\")\n    if (NSIMD_ARM32_IS_ARMEL)\n      set(mapping_neon128 \"-DNEON128;-mfloat-abi=softfp;-mfpu=neon\")\n    else()\n      set(mapping_neon128 \"-DNEON128;-mfpu=neon\")\n    endif()\n    set(mapping_aarch64 \"-DAARCH64\")\n    set(mapping_sve \"-DSVE;-march=armv8.2-a+sve\")\n    set(mapping_sve128 \"-DSVE128;-march=armv8.2-a+sve;-msve-vector-bits=128\")\n    set(mapping_sve256 \"-DSVE256;-march=armv8.2-a+sve;-msve-vector-bits=256\")\n    set(mapping_sve512 
\"-DSVE512;-march=armv8.2-a+sve;-msve-vector-bits=512\")\n    set(mapping_sve1024 \"-DSVE1024;-march=armv8.2-a+sve\"\n                        \";-msve-vector-bits=1024\")\n    set(mapping_sve2048 \"-DSVE2048;-march=armv8.2-a+sve\"\n                        \";-msve-vector-bits=2048\")\n    set(mapping_vmx \"-DVMX;-mcpu=powerpc64le;-maltivec\")\n    set(mapping_vsx \"-DVSX;-mcpu=powerpc64le;-mvsx\")\n    set(mapping_cuda \"-DCUDA\")\n    set(mapping_rocm \"-DROCM\")\n    set(mapping_oneapi \"-DONEAPI\")\n  endif()\n  if (DEFINED mapping_${simd_ext})\n    set(${argument} \"${mapping_${simd_ext}}\" PARENT_SCOPE)\n  else()\n    if (MSVC)\n      set(${argument} \"/DCPU\" PARENT_SCOPE)\n    else()\n      set(${argument} \"-DCPU\" PARENT_SCOPE)\n    endif()\n  endif()\nendfunction()\n\nif (NOT DEFINED simd)\n  set(simd \"cpu\")\nendif()\nnsimd_get_compiler_argument(${simd} NSIMD_COMPILATION_OPTIONS)\n\n# -----------------------------------------------------------------------------\n# Object file selection\n\nset(NSIMD_OBJS \"fp16;gpu;memory;api_cpu;rempitab;sleefsp;sleefdp\")\n\nif (\"${simd}\" STREQUAL \"sse2\")\n  set(NSIMD_OBJS \"${NSIMD_OBJS};api_sse2;sleef_sse2_f32;sleef_sse2_f64\")\nelseif (\"${simd}\" STREQUAL \"sse42\")\n  set(NSIMD_OBJS \"${NSIMD_OBJS};api_sse2;api_sse42;\"\n                 \"sleef_sse2_f32;sleef_sse2_f64;\"\n                 \"sleef_sse42_f32;sleef_sse42_f64\")\nelseif (\"${simd}\" STREQUAL \"avx\")\n  set(NSIMD_OBJS \"${NSIMD_OBJS};api_sse2;api_sse42;api_avx;\"\n                 \"sleef_sse2_f32;sleef_sse2_f64;\"\n                 \"sleef_sse42_f32;sleef_sse42_f64;\"\n                 \"sleef_avx_f32;sleef_avx_f64\")\nelseif (\"${simd}\" STREQUAL \"avx2\")\n  set(NSIMD_OBJS \"${NSIMD_OBJS};api_sse2;api_sse42;api_avx;api_avx2;\"\n                 \"sleef_sse2_f32;sleef_sse2_f64;\"\n                 \"sleef_sse42_f32;sleef_sse42_f64;\"\n                 \"sleef_avx_f32;sleef_avx_f64;\"\n                 \"sleef_avx2_f32;sleef_avx2_f64\")\nelseif 
(\"${simd}\" STREQUAL \"avx512_knl\")\n  set(NSIMD_OBJS \"${NSIMD_OBJS};api_sse2;api_sse42;api_avx;api_avx2\"\n                 \"sleef_sse2_f32;sleef_sse2_f64;\"\n                 \"sleef_sse42_f32;sleef_sse42_f64;\"\n                 \"sleef_avx_f32;sleef_avx_f64;\"\n                 \"sleef_avx2_f32;sleef_avx2_f64;\"\n                 \"api_avx512_knl;sleef_avx512_knl_f32;sleef_avx512_knl_f64\")\nelseif (\"${simd}\" STREQUAL \"avx512_skylake\")\n  set(NSIMD_OBJS \"${NSIMD_OBJS};api_sse2;api_sse42;api_avx;api_avx2;\"\n                 \"api_avx512_skylake;sleef_avx512_skylake_f32;\"\n                 \"sleef_sse2_f32;sleef_sse2_f64;\"\n                 \"sleef_sse42_f32;sleef_sse42_f64;\"\n                 \"sleef_avx_f32;sleef_avx_f64;\"\n                 \"sleef_avx2_f32;sleef_avx2_f64;\"\n                 \"sleef_avx512_skylake_f64\")\nelseif (\"${simd}\" STREQUAL \"neon128\")\n  set(NSIMD_OBJS \"${NSIMD_OBJS};api_neon128;\"\n                 \"sleef_neon128_f32;sleef_neon128_f64\")\nelseif (\"${simd}\" STREQUAL \"aarch64\")\n  set(NSIMD_OBJS \"${NSIMD_OBJS};api_aarch64;\"\n                 \"sleef_aarch64_f32;sleef_aarch64_f64\")\nelseif (\"${simd}\" STREQUAL \"sve\")\n  set(NSIMD_OBJS \"${NSIMD_OBJS};api_aarch64;api_sve;\"\n                 \"sleef_aarch64_f32;sleef_aarch64_f64;\"\n                 \"sleef_sve_f32;sleef_sve_f64\")\nelseif (\"${simd}\" STREQUAL \"sve128\")\n  set(NSIMD_OBJS \"${NSIMD_OBJS};api_aarch64;api_sve128;\"\n                 \"sleef_aarch64_f32;sleef_aarch64_f64;\"\n                 \"sleef_sve128_f32;sleef_sve128_f64\")\nelseif (\"${simd}\" STREQUAL \"sve256\")\n  set(NSIMD_OBJS \"${NSIMD_OBJS};api_aarch64;api_sve256;\"\n                 \"sleef_aarch64_f32;sleef_aarch64_f64;\"\n                 \"sleef_sve256_f32;sleef_sve256_f64\")\nelseif (\"${simd}\" STREQUAL \"sve512\")\n  set(NSIMD_OBJS \"${NSIMD_OBJS};api_aarch64;api_sve512;\"\n                 \"sleef_aarch64_f32;sleef_aarch64_f64;\"\n                 
\"sleef_sve512_f32;sleef_sve512_f64\")\nelseif (\"${simd}\" STREQUAL \"sve1024\")\n  set(NSIMD_OBJS \"${NSIMD_OBJS};api_aarch64;api_sve1024;\"\n                 \"sleef_aarch64_f32;sleef_aarch64_f64;\"\n                 \"sleef_sve1024_f32;sleef_sve1024_f64\")\nelseif (\"${simd}\" STREQUAL \"sve2048\")\n  set(NSIMD_OBJS \"${NSIMD_OBJS};api_aarch64;api_sve2048;\"\n                 \"sleef_aarch64_f32;sleef_aarch64_f64;\"\n                 \"sleef_sve2048_f32;sleef_sve2048_f64\")\nelseif (\"${simd}\" STREQUAL \"vmx\")\n  set(NSIMD_OBJS \"${NSIMD_OBJS};api_vmx;sleef_vmx_f32;sleef_vmx_f64\")\nelseif (\"${simd}\" STREQUAL \"vsx\")\n  set(NSIMD_OBJS \"${NSIMD_OBJS};api_vmx;api_vsx;sleef_vmx_f32;sleef_vmx_f64;\"\n                 \"sleef_vsx_f32;sleef_vsx_f64\")\nendif()\n\n# -----------------------------------------------------------------------------\n# Rules for building the library\n\nset(NSIMD_LIB_DEPS \"\")\nforeach(o ${NSIMD_OBJS})\n  if (EXISTS \"${CMAKE_CURRENT_SOURCE_DIR}/src/${o}.cpp\")\n    add_library(${o} OBJECT \"${CMAKE_CURRENT_SOURCE_DIR}/src/${o}.cpp\")\n  elseif(EXISTS \"${CMAKE_CURRENT_SOURCE_DIR}/src/${o}.c\")\n    add_library(${o} OBJECT \"${CMAKE_CURRENT_SOURCE_DIR}/src/${o}.c\")\n  elseif((\"${o}\" STREQUAL \"sleef_neon128_f64\") OR\n         (\"${o}\" STREQUAL \"sleef_vmx_f64\"))\n    add_library(${o} OBJECT\n                \"${CMAKE_CURRENT_SOURCE_DIR}/src/sleefsimddp_emulation.c\")\n  elseif(\"${o}\" STREQUAL \"sleef_vmx_f32\")\n    add_library(${o} OBJECT\n                \"${CMAKE_CURRENT_SOURCE_DIR}/src/sleefsimdsp_emulation.c\")\n  elseif(o MATCHES \"sleef_.*_f32\")\n    add_library(${o} OBJECT \"${CMAKE_CURRENT_SOURCE_DIR}/src/sleefsimdsp.c\")\n  elseif(o MATCHES \"sleef_.*_f64\")\n    add_library(${o} OBJECT \"${CMAKE_CURRENT_SOURCE_DIR}/src/sleefsimddp.c\")\n  endif()\n  if (MSVC)\n    set(sleef_cflags \"/DNDEBUG;/DDORENAME=1\")\n  else()\n    set(sleef_cflags \"-DNDEBUG;-DDORENAME=1\")\n  endif()\n  set_property(TARGET ${o} PROPERTY 
POSITION_INDEPENDENT_CODE ON)\n  target_include_directories(${o} PUBLIC \"${CMAKE_CURRENT_SOURCE_DIR}/include\")\n  if (MSVC)\n    target_compile_definitions(${o} PUBLIC \"/D_CRT_SECURE_NO_WARNINGS\")\n  endif()\n  set(buf \"\")\n  if (\"${o}\" STREQUAL \"api_sse2\")\n    nsimd_get_compiler_argument(\"sse2\" buf)\n  elseif (\"${o}\" STREQUAL \"api_sse42\")\n    nsimd_get_compiler_argument(\"sse42\" buf)\n  elseif (\"${o}\" STREQUAL \"api_avx\")\n    nsimd_get_compiler_argument(\"avx\" buf)\n  elseif (\"${o}\" STREQUAL \"api_avx2\")\n    nsimd_get_compiler_argument(\"avx2\" buf)\n  elseif (\"${o}\" STREQUAL \"api_avx512_knl\")\n    nsimd_get_compiler_argument(\"avx512_knl\" buf)\n  elseif (\"${o}\" STREQUAL \"api_avx512_skylake\")\n    nsimd_get_compiler_argument(\"avx512_skylake\" buf)\n  elseif (\"${o}\" STREQUAL \"api_neon128\")\n    nsimd_get_compiler_argument(\"neon128\" buf)\n  elseif (\"${o}\" STREQUAL \"api_aarch64\")\n    nsimd_get_compiler_argument(\"aarch64\" buf)\n  elseif (\"${o}\" STREQUAL \"api_sve\")\n    nsimd_get_compiler_argument(\"sve\" buf)\n  elseif (\"${o}\" STREQUAL \"api_sve128\")\n    nsimd_get_compiler_argument(\"sve128\" buf)\n  elseif (\"${o}\" STREQUAL \"api_sve256\")\n    nsimd_get_compiler_argument(\"sve256\" buf)\n  elseif (\"${o}\" STREQUAL \"api_sve512\")\n    nsimd_get_compiler_argument(\"sve512\" buf)\n  elseif (\"${o}\" STREQUAL \"api_sve1024\")\n    nsimd_get_compiler_argument(\"sve1024\" buf)\n  elseif (\"${o}\" STREQUAL \"api_sve2048\")\n    nsimd_get_compiler_argument(\"sve2048\" buf)\n  elseif (\"${o}\" STREQUAL \"api_vmx\")\n    nsimd_get_compiler_argument(\"vmx\" buf)\n  elseif (\"${o}\" STREQUAL \"api_vsx\")\n    nsimd_get_compiler_argument(\"vsx\" buf)\n  elseif (\"${o}\" STREQUAL \"api_cuda\")\n    nsimd_get_compiler_argument(\"cuda\" buf)\n  elseif (\"${o}\" STREQUAL \"api_rocm\")\n    nsimd_get_compiler_argument(\"rocm\" buf)\n  elseif (\"${o}\" STREQUAL \"api_cpu\")\n    nsimd_get_compiler_argument(\"cpu\" buf)\n  
elseif (\"${o}\" STREQUAL \"rempitab\")\n    list(APPEND buf \"${sleef_cflags}\")\n  elseif (\"${o}\" STREQUAL \"sleefsp\")\n    list(APPEND buf \"${sleef_cflags}\")\n  elseif (\"${o}\" STREQUAL \"sleefdp\")\n    list(APPEND buf \"${sleef_cflags}\")\n  elseif (\"${o}\" MATCHES \"sleef_sse2_\")\n    nsimd_get_compiler_argument(\"sse2\" buf)\n    list(APPEND buf \"-DNSIMD_SSE2;-DENABLE_SSE2=1;${sleef_cflags}\")\n  elseif (\"${o}\" MATCHES \"sleef_sse42_\")\n    nsimd_get_compiler_argument(\"sse42\" buf)\n    list(APPEND buf \"-DNSIMD_SSE42;-DENABLE_SSE4=1;${sleef_cflags}\")\n  elseif (\"${o}\" MATCHES \"sleef_avx_\")\n    nsimd_get_compiler_argument(\"avx\" buf)\n    list(APPEND buf \"-DNSIMD_AVX;-DENABLE_AVX=1;${sleef_cflags}\")\n  elseif (\"${o}\" MATCHES \"sleef_avx2_\")\n    nsimd_get_compiler_argument(\"avx2\" buf)\n    list(APPEND buf \"-DNSIMD_AVX2;-DENABLE_AVX2=1;${sleef_cflags}\")\n  elseif (\"${o}\" MATCHES \"sleef_avx512_knl_\")\n    nsimd_get_compiler_argument(\"avx512_knl\" buf)\n    list(APPEND buf \"-DNSIMD_AVX512_KNL;-DENABLE_AVX512F=1;${sleef_cflags}\")\n  elseif (\"${o}\" MATCHES \"sleef_avx512_skylake_\")\n    nsimd_get_compiler_argument(\"avx512_skylake\" buf)\n    list(APPEND buf\n         \"-DNSIMD_AVX512_SKYLAKE;-DENABLE_AVX512F=1;${sleef_cflags}\")\n  elseif (\"${o}\" MATCHES \"sleef_neon128_\")\n    nsimd_get_compiler_argument(\"neon128\" buf)\n    list(APPEND buf \"-DNSIMD_NEON128;-DENABLE_NEON32=1;${sleef_cflags}\")\n  elseif (\"${o}\" MATCHES \"sleef_aarch64_\")\n    nsimd_get_compiler_argument(\"aarch64\" buf)\n    list(APPEND buf \"-DNSIMD_AARCH64;-DENABLE_ADVSIMD=1;${sleef_cflags}\")\n  elseif (\"${o}\" MATCHES \"sleef_sve_\")\n    nsimd_get_compiler_argument(\"sve\" buf)\n    list(APPEND buf \"-DNSIMD_SVE;-DENABLE_SVE=1;${sleef_cflags}\")\n  elseif (\"${o}\" MATCHES \"sleef_sve128_\")\n    nsimd_get_compiler_argument(\"sve128\" buf)\n    list(APPEND buf \"-DNSIMD_SVE128;-DENABLE_SVE=1;${sleef_cflags}\")\n  elseif (\"${o}\" MATCHES 
\"sleef_sve256_\")\n    nsimd_get_compiler_argument(\"sve256\" buf)\n    list(APPEND buf \"-DNSIMD_SVE256;-DENABLE_SVE=1;${sleef_cflags}\")\n  elseif (\"${o}\" MATCHES \"sleef_sve512_\")\n    nsimd_get_compiler_argument(\"sve512\" buf)\n    list(APPEND buf \"-DNSIMD_SVE512;-DENABLE_SVE=1;${sleef_cflags}\")\n  elseif (\"${o}\" MATCHES \"sleef_sve1024_\")\n    nsimd_get_compiler_argument(\"sve1024\" buf)\n    list(APPEND buf \"-DNSIMD_SVE1024;-DENABLE_SVE=1;${sleef_cflags}\")\n  elseif (\"${o}\" MATCHES \"sleef_sve2048_\")\n    nsimd_get_compiler_argument(\"sve2048\" buf)\n    list(APPEND buf \"-DNSIMD_SVE2048;-DENABLE_SVE=1;${sleef_cflags}\")\n  elseif (\"${o}\" MATCHES \"sleef_vmx_\")\n    nsimd_get_compiler_argument(\"vmx\" buf)\n    list(APPEND buf \"-DNSIMD_VMX;-DENABLE_VSX=1;${sleef_cflags}\")\n  elseif (\"${o}\" MATCHES \"sleef_vsx_\")\n    nsimd_get_compiler_argument(\"vsx\" buf)\n    list(APPEND buf \"-DNSIMD_VSX;-DENABLE_VSX=1;${sleef_cflags}\")\n  else()\n    set(buf \"\")\n  endif()\n  if (NOT \"${buf}\" STREQUAL \"\")\n    target_compile_options(${o} PUBLIC \"${buf}\")\n  endif()\n  list(APPEND NSIMD_LIB_DEPS \"$<TARGET_OBJECTS:${o}>\")\nendforeach()\n\nset(NSIMD_LIB_TARGET \"nsimd_${simd}\")\nadd_library(${NSIMD_LIB_TARGET} SHARED ${NSIMD_LIB_DEPS})\n\n# -----------------------------------------------------------------------------\n# Installation stuff\n\nif (WIN32)\n  install(TARGETS ${NSIMD_LIB_TARGET} RUNTIME DESTINATION lib\n                                      ARCHIVE DESTINATION lib)\nelse()\n  install(TARGETS ${NSIMD_LIB_TARGET} LIBRARY DESTINATION lib)\nendif()\n\ninstall(DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/include/nsimd\n        DESTINATION include)\n"
  },
  {
    "path": "CONTRIBUTING.md",
    "content": "<!--\n\nCopyright (c) 2019 Agenium Scale\n\nPermission is hereby granted, free of charge, to any person obtaining a copy\nof this software and associated documentation files (the \"Software\"), to deal\nin the Software without restriction, including without limitation the rights\nto use, copy, modify, merge, publish, distribute, sublicense, and/or sell\ncopies of the Software, and to permit persons to whom the Software is\nfurnished to do so, subject to the following conditions:\n\nThe above copyright notice and this permission notice shall be included in all\ncopies or substantial portions of the Software.\n\nTHE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\nIMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\nFITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\nAUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\nLIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\nOUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\nSOFTWARE.\n\n-->\n\n## How to Contribute to `nsimd`?\n\nYou are welcome to contribute to `nsimd`. This document gives some details on\nhow to add/wrap new intrinsics. When you have finished fixing some bugs or\nadding some new features, please make a pull request. One of our repository\nmaintainer will then merge or comment the pull request.\n\n\n##  Prerequisites\n\n- Respect the philosophy of the library (see [index](index.md).)\n- Basic knowledge of Python 3.\n- Good knowledge of C.\n- Good knowledge of C++.\n- Good knowledge of SIMD programming.\n\n## How Do I Add Support for a New Intrinsic?\n\n### Introduction\n\n`nsimd` currently supports the following architectures:\n- `CPU`:\n  + `CPU` called `CPU` in source code. 
This \"extension\" is not really one as it\n    is only present so that code written with `nsimd` can compile and run on\n    targets not supported by `nsimd` or with no SIMD.\n- Intel:\n  + `SSE2` called `SSE2` in source code.\n  + `SSE4.2` called `SSE42` in source code.\n  + `AVX` called `AVX` in source code.\n  + `AVX2` called `AVX2` in source code.\n  + `AVX-512` as found on KNLs called `AVX512_KNL` in source code.\n  + `AVX-512` as found on Xeon Skylake CPUs called `AVX512_SKYLAKE` in source\n    code.\n- Arm\n  + `NEON` 128 bits as found on ARMv7 CPUs called `NEON128` in source code.\n  + `NEON` 128 bits as found on Aarch64 CPUs called `AARCH64` in source code.\n  + `SVE` called `SVE` in source code.\n  + `SVE` 128 bits known at compiled time called `SVE128` in source code.\n  + `SVE` 256 bits known at compiled time called `SVE256` in source code.\n  + `SVE` 512 bits known at compiled time called `SVE512` in source code.\n  + `SVE` 1024 bits known at compiled time called `SVE1024` in source code.\n  + `SVE` 2048 bits known at compiled time called `SVE2048` in source code.\n- IBM POWERPC\n  + `VMX` 128 bits as found on POWER6 CPUs called `VMX` in source code.\n  + `VSX` 128 bits as found on POWER7/8 CPUs called `VSX` in source code.\n- NVIDIA\n  + `CUDA` called `CUDA` in source code\n- AMD\n  + `ROCm` called `ROCM` in source code\n- Intel oneAPI\n  + `oneAPI` called `ONEAPI` in source code\n\n`nsimd` currently supports the following types:\n- `i8`: signed integers over 8 bits (usually `signed char`),\n- `u8`: unsigned integers over 8 bits (usually `unsigned char`),\n- `i16`: signed integers over 16 bits (usually `short`),\n- `u16`: unsigned integers over 16 bits (usually `unsigned short`),\n- `i32`: signed integers over 32 bits (usually `int`),\n- `u32`: unsigned integers over 32 bits (usually `unsigned int`),\n- `i64`: signed integers over 64 bits (usually `long`),\n- `u64`: unsigned integers over 64 bits (usually `unsigned long`),\n- `f16`: floating point 
numbers over 16 bits in IEEE format called `float16`\n  in the rest of this document\n  (<https://en.wikipedia.org/wiki/Half-precision_floating-point_format>),\n- `f32`: floating point numbers over 32 bits (usually `float`)\n- `f64`: floating point numbers over 64 bits (usually `double`),\n\nAs C and C++ do not support `float16`, `nsimd` provides its own types to handle\nthem. Therefore special care has to be taken when implementing\nintrinsics/operators on architectures that do not natively support them.\n\nWe will make the following misuse of language in the rest of this document.\nThe type taken by intrinsics is of course a SIMD vector and more precisely a\nSIMD vector of chars or a SIMD vector of `short`s or a SIMD vector of `int`s…\nTherefore when we will talk about an intrinsic, we will say that it takes\ntype `T` as arguments when it takes in fact a SIMD vector of `T`.\n\n### Our imaginary intrinsic\n\nWe will add support to the library for the following imaginary intrinsic: given\na SIMD vector, suppose that this intrinsic called `foo` takes each element `x`\nof the vector and computes `1 / (1 - x) + 1 / (1 - x)^2`. 
Moreover suppose that\nhardware vendors all propose this intrisic only for floatting point numbers as\nfollows:\n- CPU (no intrinsics is given of course in standard C and C++)\n- Intel (no intrinsics is given for `float16`s)\n  + `SSE2`: no intrinsics is provided.\n  + `SSE42`: `_mm_foo_ps` for `float`s and `_mm_foo_pd` for `double`s.\n  + `AVX`: no intrinsics is provided.\n  + `AVX2`: `_mm256_foo_ps` for `float`s and `_mm256_foo_pd` for `double`s.\n  + `AVX512_KNL`: no intrinsics is provided.\n  + `AVX512_SKYLAKE`: `_mm512_foo_ps` for `float`s and `_mm512_foo_pd` for\n    `double`s.\n- ARM\n  + `NEON128`: `vfooq_f16` for `float16`s, `vfooq_f32` for `float`s and no\n    intrinsics for `double`s.\n  + `AARCH64`: same as `NEON128` but `vfooq_f64` for doubles.\n  + `SVE`: `svfoo_f16`, `svfoo_f32` and `svfoo_f64` for respectively\n    `float16`s, `float`s and `double`s.\n  + `SVE128`: `svfoo_f16`, `svfoo_f32` and `svfoo_f64` for respectively\n    `float16`s, `float`s and `double`s.\n  + `SVE256`: `svfoo_f16`, `svfoo_f32` and `svfoo_f64` for respectively\n    `float16`s, `float`s and `double`s.\n  + `SVE512`: `svfoo_f16`, `svfoo_f32` and `svfoo_f64` for respectively\n    `float16`s, `float`s and `double`s.\n  + `SVE1024`: `svfoo_f16`, `svfoo_f32` and `svfoo_f64` for respectively\n    `float16`s, `float`s and `double`s.\n  + `SVE2048`: `svfoo_f16`, `svfoo_f32` and `svfoo_f64` for respectively\n    `float16`s, `float`s and `double`s.\n- IBM POWERPC\n  + `VMX`: `vec_foo` for `float`s and no intrinsics for `double`s.\n  + `VSX`: `vec_foo` for `float`s and `double`s.\n- NVIDIA\n  + `CUDA`: no intrinsics is provided.\n- AMD\n  + `ROCM`: no intrinsics is provided.\n- Intel oneAPI\n  + `ONEAPI`: no intrinsics is provided.\n\nFirst thing to do is to declare this new intrinsic to the generation system.\nA lot of work is done by the generation system such as generating all functions\nsignatures for C and C++ APIs, tests, benchmarks and documentation. 
Of course\nthe default documentation does not say much but you can add a better\ndescription.\n\n### Registering the intrinsic (or operator)\n\nA function or an intrinsic is called an operator in the generation system.\nGo at the bottom of `egg/operators.py` and add the following just after\nthe `Rsqrt11` class.\n\n```python\nclass Foo(Operator):\n    full_name = 'foo'\n    signature = 'v foo v'\n    types = common.ftypes\n    domain = Domain('R\\{1}')\n    categories = [DocBasicArithmetic]\n```\n\nThis little class will be processed by the generation system so that operator\n`foo` will be available for the end-user of the library in both C and C++ APIs.\nEach member of this class controls how the generation is be done:\n- `full_name` is a string containing the human readable name of the operator.\n  If not given, the class name will be taken for it.\n- `signature` is a string describing what kind of arguments and how many takes\n  the operator. This member is mandatory and must respect the following syntax:\n  `return_type name_of_operator arg1_type arg2_type ...` where `return_type`\n  and the `arg*_type` can be taken from the following list:\n  + `v   ` SIMD vector parameter\n  + `vx2 ` Structure of 2 SIMD vector parameters\n  + `vx3 ` Structure of 3 SIMD vector parameters\n  + `vx4 ` Structure of 4 SIMD vector parameters\n  + `l   ` SIMD vector of logicals parameter\n  + `s   ` Scalar parameter\n  + `*   ` Pointer to scalar parameter\n  + `c*  ` Pointer to const scalar parameter\n  + `_   ` void (only for return type)\n  + `p   ` Parameter (integer)\n\nIn our case `v foo v` means that `foo` takes one SIMD vector as argument and\nreturns a SIMD vector as output. Several signatures will be generated for this\nintrinsic according to the types it can supports. In our case the intrinsic\nonly support floatting point types.\n- `types` is a Python list indicating which types are supported by the\n  intrinsic. 
If not given, the intrinsic is supposed to support all types.\n  Some Python lists are predefined to help the programmer:\n  + `ftypes = ['f64', 'f32', 'f16']       ` All floating point types\n  + `ftypes_no_f16 = ['f64', 'f32']       `\n  + `itypes = ['i64', 'i32', 'i16', 'i8'] ` All signed integer types\n  + `utypes = ['u64', 'u32', 'u16', 'u8'] ` All unsigned integer types\n  + `iutypes = itypes + utypes`\n  + `types = ftypes + iutypes`\n- `domain` is a string indicating the mathematical domain of definition of the\n  operator. This helps for benchmarks and tests for generating random numbers\n  as inputs in the correct interval. In our case `R\\{1}` means all real numbers\n  (of course all floating point numbers) except `1` for which the operator\n  cannot be computed. For examples see how other operators are defined in\n  `egg/operators.py`.\n- `categories` is a list of Python classes that indicates the generation\n  system to which categories `foo` belongs. The list of available categories\n  is as follows:\n  + `DocShuffle          ` for Shuffle functions\n  + `DocTrigo            ` for Trigonometric functions\n  + `DocHyper            ` for Hyperbolic functions\n  + `DocExpLog           ` for Exponential and logarithmic functions\n  + `DocBasicArithmetic  ` for Basic arithmetic operators\n  + `DocBitsOperators    ` for Bits manipulation operators\n  + `DocLogicalOperators ` for Logicals operators\n  + `DocMisc             ` for Miscellaneous\n  + `DocLoadStore        ` for Loads & stores\n  + `DocComparison       ` for Comparison operators\n  + `DocRounding         ` for Rounding functions\n  + `DocConversion       ` for Conversion operators\n  If no category corresponds to the operator you want to add to `nsimd` then feel\n  free to create a new category (see the bottom of this document)\n\nMany other members are supported by the generation system. 
We describe them\nquickly here and will give more details in a later version of this document.\nDefault values are given in square brakets:\n- `cxx_operator [= None]` in case the operator has a corresponding C++ operator.\n- `autogen_cxx_adv [= True]` in case the C++ advanced API signatures for this\n  operator must not be auto-generated.\n- `output_to [= common.OUTPUT_TO_SAME_TYPE]` in case the operator output type\n  differs from its input type. Possible values are:\n  + `OUTPUT_TO_SAME_TYPE`: output is of same type as input.\n  + `OUTPUT_TO_SAME_SIZE_TYPES`: output can be any type of same bit size.\n  + `OUTPUT_TO_UP_TYPES`: output can be any type of bit size twice the bit\n    bit size of the input. In this case the input type will never be a 64-bits\n    type.\n  + `OUTPUT_TO_DOWN_TYPES`: output can be any type of bit size half the bit\n    bit size of the input. In this case the input type will never be a 8-bits\n    type.\n- `src [= False]` in case the code must be compiled in the library.\n- `load_store [= False]` in case the operator loads/store data from/to\n  memory.\n- `do_bench [= True]` in case benchmarks for the operator must not be\n  auto-generated.\n- `desc [= '']` description (in Markdown format) that will appear in the\n  documentation for the operator.\n- `bench_auto_against_cpu [= True]` for auto-generation of benchmark against\n  `nsimd` CPU implementation.\n- `bench_auto_against_mipp [= False]` for auto-generation of benchmark against\n  the MIPP library.\n- `bench_auto_against_sleef [= False]` for auto-generation of benchmark against\n  the Sleef library.\n- `bench_auto_against_std [= False]` for auto-generation of benchmark against\n  the standard library.\n- `tests_mpfr [= False]` in case the operator has an MPFR counterpart for\n  comparison, then test the correctness of the operator against it.\n- `tests_ulps [= False]` in case the auto-generated tests has to compare ULPs\n  (<https://en.wikipedia.org/wiki/Unit_in_the_last_place>).\n- 
`has_scalar_impl [= True]` in case the operator has a CPU scalar and GPU\n  implementation.\n\n### Implementing the operator\n\nNow that the operator is registered, all signatures will be generated but\nthe implemenatations will be missing. Type\n\n```sh\npython3 egg/hatch.py -lf\n```\n\nand the following files (among many other) should appear:\n- `include/nsimd/cpu/cpu/foo.h`\n- `include/nsimd/x86/sse2/foo.h`\n- `include/nsimd/x86/sse42/foo.h`\n- `include/nsimd/x86/avx/foo.h`\n- `include/nsimd/x86/avx2/foo.h`\n- `include/nsimd/x86/avx512_knl/foo.h`\n- `include/nsimd/x86/avx512_skylake/foo.h`\n- `include/nsimd/arm/neon128/foo.h`\n- `include/nsimd/arm/aarch64/foo.h`\n- `include/nsimd/arm/sve/foo.h`\n- `include/nsimd/arm/sve128/foo.h`\n- `include/nsimd/arm/sve256/foo.h`\n- `include/nsimd/arm/sve512/foo.h`\n- `include/nsimd/arm/sve1024/foo.h`\n- `include/nsimd/arm/sve2048/foo.h`\n- `include/nsimd/ppc/vmx/foo.h`\n- `include/nsimd/ppc/vsx/foo.h`\n\nThey each correspond to the implementations of the operator for each supported\narchitectures. When openening one of these files the implementations in plain\nC and then in C++ (falling back to the C function) should be there but all the\nC implementations are reduced to `abort();`. This is the default when none is\nprovided. Note that the \"cpu\" architecture is just a fallback involving no\nSIMD at all. This is used on architectures not supported by `nsimd` or when the\narchitectures does not offer any SIMD.\n\nProviding implementations for `foo` is done by completing the following Python\nfiles:\n\n- `egg/platform_cpu.py`\n- `egg/platform_x86.py`\n- `egg/platform_arm.py`\n- `egg/platform_ppc.py`\n- `egg/scalar.py`\n- `egg/cuda.py`\n- `egg/hip.py`\n- `egg/oneapi.py`\n\nThe idea is to produce plain C (not C++) code using Python string format. Each\nof the Python files provides some helper functions to ease as much as\npossible the programmer's job. 
But every file provides the same \"global\"\nvariables available in every functions and is designed in the same way:\n\n1. At the bottom of the file is the `get_impl` function taking the following\n   arguments:\n   + `func     ` the name of the operator the system is currently\n     auto-generating.\n   + `simd_ext ` the SIMD extension for which the system wants the\n     implemetation.\n   + `from_typ ` the input type of the argument that will be passed to the\n     operator.\n   + `to_typ   ` the output type produced by the operator.\n2. Inside this function lies a Python dictionary that provides functions\n   implementing each operator. The string containing the C code for the\n   implementations can be put here directly but usually the string is\n   returned by a Python function that is written above in the same file.\n3. At the top of the file lies helper functions that helps generating code.\n   This is specific to each architecture. Do not hesitate to look at it.\n\nLet's begin by the `cpu` implementations. It turns out that there is no SIMD\nextension in this case, and by convention, `simd_ext == 'cpu'` and this\nargument can therefore be ignored. So we first add an entry to the `impls`\nPython dictionary of the `get_impl` function:\n\n```python\n    impls = {\n\n        ...\n\n        'reverse': reverse1(from_typ),\n        'addv': addv(from_typ),\n        'foo': foo1(from_typ) # Added at the bottom of the dictionary\n    }\n    if simd_ext != 'cpu':\n        raise ValueError('Unknown SIMD extension \"{}\"'.format(simd_ext))\n\n    ...\n```\n\nThen, above in the file we write the Python function `foo1` that will provide\nthe C implementation of operator `foo`:\n\n```python\ndef foo1(typ):\n    return func_body(\n           '''ret.v{{i}} = ({typ})1 / (({typ})1 - {in0}.v{{i}}) +\n                           ({typ})1 / ((({typ})1 - {in0}.v{{i}}) *\n                                       (({typ})1 - {in0}.v{{i}}));'''. 
\\\n                                       format(**fmtspec), typ)\n```\n\nFirst note that the arguments names passed to the operator in its C\nimplementation are not known in the Python side. Several other parameters\nare not known or are cumbersome to find out. Therefore each function has access\nto the `fmtspec` Python dictionary that hold some of these values:\n- `in0`: name of the first parameter for the C implementation.\n- `in1`: name of the second parameter for the C implementation.\n- `in2`: name of the third parameter for the C implementation.\n- `simd_ext`: name of the SIMD extension (for the cpu architecture, this is\n  equal to `\"cpu\"`).\n- `from_typ`: type of the input.\n- `to_typ`: type of the output.\n- `typ`: equals `from_typ`, shorter to write as usually `from_typ == to_typ`.\n- `utyp`: bitfield type of the same size of `typ`.\n- `typnbits`: number of bits in `typ`.\n\nThe CPU extension can emulate 64-bits or 128-bits wide SIMD vectors. Each type\nis a struct containing as much members as necessary so that `sizeof(T) *\n(number of members) == 64 or 128`. In order to avoid the developper to write\ntwo cases (64-bits wide and 128-bits wide) the `func_body` function is provided\nas a helper. Note that the index `{{i}}` is in double curly brackets to go\nthrough two Python string formats:\n\n1. The first pass is done within the `foo1` Python function and replaces\n   `{typ}` and `{in0}`. In this pass `{{i}}` is formatted into `{i}`.\n2. The second pass is done by the `func_body` function which unrolls the string\n   to the necessary number and replace `{i}` by the corresponding number. The\n   produced C code will look like one would written the same statement for each\n   members of the input struct.\n\nThen note that as plain C (and C++) does not support native 16-bits wide\nfloating point types `nsimd` emulates it with a C struct containing 4 floats\n(32-bits swide floatting point numbers). 
In some cases extra care has to be\ntaken to handle this type.\n\nFor each SIMD extension one can find a `types.h` file (for `cpu` the files can\nbe found in `include/nsimd/cpu/cpu/types.h`) that declares all SIMD types. If\nyou have any doubt on a given type do not hesitate to take a look at this file.\nNote also that this file is auto-generated and is therefore readable only after\na successfull first `python3 egg/hatch -Af`.\n\nNow that the `cpu` implementation is written, you should be able to write the\nimplementation of `foo` for other architectures. Each architecture has its\nparticularities. We will cover them now by providing directly the Python\nimplementations and explaining in less details.\n\nFinally note that `clang-format` is called by the generation system to\nautoformat produced C/C++ code. Therefore prefer indenting C code strings within\nthe Python according to Python indentations, do not write C code beginning at\ncolumn 0 in Python files.\n\n### For Intel\n\n```python\ndef foo1(simd_ext, typ):\n    if typ == 'f16':\n        return '''nsimd_{simd_ext}_vf16 ret;\n                  ret.v1 = {pre}foo_ps({in0}.v1);\n                  ret.v2 = {pre}foo_ps({in0}.v2);\n                  return ret;'''.format(**fmtspec)\n    if simd_ext == 'sse2':\n        return emulate_op1('foo', 'sse2', typ)\n    if simd_ext in ['avx', 'avx512_knl']:\n        return split_opn('foo', simd_ext, typ, 1)\n    return 'return {pre}foo{suf}({in0});'.format(**fmtspec)\n```\n\nHere are some notes concerning the Intel implementation:\n\n1. `float16`s are emulated with two SIMD vectors of `float`s.\n2. When the intrinsic is provided by Intel one can access it easily by\n   constructing it with `{pre}` and `{suf}`. Indeed all Intel intrinsics\n   names follow a pattern with a prefix indicating the SIMD extension and a\n   suffix indicating the type of data. 
As for `{in0}`,  `{pre}` and\n   `{suf}` are provided and contain the correct values with respect to\n   `simd_ext` and `typ`, you do not need to compute them yourself.\n3. When the intrinsic is not provided by Intel then one has to use tricks.\n   + For `SSE2` one can use complete emulation, that is, putting the content of\n     the SIMD vector into a C-array, working on it with a simple for loop and\n     loading back the result into the resulting SIMD vector. As said before a\n     lot of helper functions are provided and the `emulate_op1` Python function\n     avoid writing by hand this for-loop emulation.\n   + For `AVX` and `AVX512_KNL`, one can fallback to the \"lower\" SIMD extension\n     (`SSE42` for `AVX` and `AVX2` for `AVX512_KNL`) by splitting the input\n     vector into two smaller vectors belonging to the \"lower\" SIMD extension. In\n     this case again the tedious and cumbersome work is done by the `split_opn`\n     Python function.\n4. Do not forget to add the `foo` entry to the `impls` dictionary in the `get_impl`\n   Python function.\n\n### For ARM\n\n```python\ndef foo1(simd_ext, typ):\n    ret = f16f64(simd_ext, typ, 'foo', 'foo', 1)\n    if ret != '':\n        return ret\n    if simd_ext in neon:\n        return 'return vfooq_{suf}({in0});'.format(**fmtspec)\n    else:\n        return 'return svfoo_{suf}_z({svtrue}, {in0});'.format(**fmtspec)\n```\n\nHere are some notes concerning the ARM implementation:\n\n1. `float16`s can be natively supported but this is not mandatory.\n2. On 32-bits ARM chips, intrinsics on `double` almost never exist.\n3. The Python helper function `f16f64` hides a lot of details concerning the\n   above two points. If the function returns a non empty string then it means\n   that the returned string contains C code to handle the case given by the\n   pair `(simd_ext, typ)`. We advise you to look at the generated C code. You\n   will see the `nsimd_FP16` macro used. 
When defined it indicates that `nsimd`\n   is compiled with native `float16` support. This also affect SIMD types (see\n   `nsimd/include/arm/*/types.h`.)\n4. Do not forget to add the `foo` entry to the `impls` dictionary in the\n   `get_impl` Python function.\n\n### For IBM POWERPC\n\n```python\ndef foo1(simd_ext, typ):\n    if has_to_be_emulated(simd_ext, typ):\n        return emulation_code(op, simd_ext, typ, ['v', 'v'])\n    else:\n        return 'return vec_foo({in0});'.format(**fmtspec)\n```\n\nHere are some notes concerning the PPC implementation:\n\n1. For VMX, intrinsics on `double` almost never exist.\n2. The Python helper function `has_to_be_emulated` returns `True` when the\n   implementation of `foo` concerns float16 or `double`s for `VMX`. When this\n   function returns True you can then use `emulation_code`.\n3. The `emulation_code` function returns a generic implementation of an\n   operator. However this iplementation is not suitable for any operator\n   and the programmer has to take care of that.\n4. Do not forget to add the `foo` entry to the `impls` dictionary in the\n   `get_impl` Python function.\n\n### The scalar CPU version\n\n```python\ndef foo1(func, typ):\n    normal = \\\n    'return ({typ})(1 / (1 - {in0}) + 1 / ((1 - {in0}) * (1 - {in0})));'. \\\n    if typ == 'f16':\n        return \\\n        '''#ifdef NSIMD_NATIVE_FP16\n             {normal}\n           #else\n             return nsimd_f32_to_f16({normal_fp16});\n           #endif'''. \\\n           format(normal=normal.format(**fmtspec),\n                  normal_fp16=normal.format(in0='nsimd_f16_to_f32({in0})))\n    else:\n        return normal.format(**fmtspec)\n```\n\nThe only caveat for the CPU scalar implementation is to handle float16\ncorrectly. 
The easiest way to do is to have the same implementation as float32\nbut replacing `{in0}`'s by `nsimd_f16_to_f32({in0})`'s and converting back\nthe float32 result to a float16.\n\n### The GPU versions\n\nThe GPU generator Python files `cuda.py`, `rocm.py` and `oneapi.py` are a bit\ndifferent from the other files but it is easy to find where to add the relevant\npieces of code. Note that ROCm syntax is fully compatible with CUDA's one only\nneeds to modify the `cuda.py` file while it easy to understand `oneapi.py`.\n\nThe code to add for float32's is as follows to be added inside the `get_impl`\nPython function.\n\n```python\nreturn '1 / (1 - {in0}) + 1 / ((1 - {in0}) * (1 - {in0}))'.format(**fmtspec)\n```\n\nThe code for CUDA and ROCm to add for float16's is as follows. It has to be\nadded inside the `get_impl_f16` Python function.\n\n```python\narch53_code = '''__half one = __float2half(1.0f);\n                 return __hadd(\n                               __hdiv(one, __hsub(one, {in0})),\n                               __hmul(\n                                      __hdiv(one, __hsub(one, {in0})),\n                                      __hdiv(one, __hsub(one, {in0}))\n                                     )\n                              );'''.format(**fmtspec)\n```\n\nAs Intel oneAPI natively support float16's the code is the same as the one\nfor floats:\n\n```python\nreturn '1 / (1 - {in0}) + 1 / ((1 - {in0}) * (1 - {in0}))'.format(**fmtspec)\n```\n\n### Implementing the test for the operator\n\nNow that we have written the implementations for the `foo` operator we must\nwrite the corresponding tests. For tests all generations are done by\n`egg/gen_tests.py`. Writing tests is more simple. The intrinsic that we just\nimplemented can be tested by an already-written test pattern code, namely by\nthe `gen_test` Python function.\n\nHere is how the `egg/gen_tests.py` is organized:\n\n1. The entry point is the `doit` function located at the bottom of the file.\n2. 
In the `doit` function a dispatching is done according to the operator that\n   is to be tested. All operators cannot be tested by the same C/C++ code. The\n   reading of all different kind of tests is rather easy and we are not going\n   through all the code in this document.\n3. All Python functions generating test code begins with the following:\n   ```python\n       filename = get_filename(opts, op, typ, lang)\n       if filename == None:\n           return\n   ```\n   This must be the case for newly created function. The `get_filename` function\n   ensures that the file must be created with respect to the command line\n   options given to the `egg/hatch.py` script. Then note that to output to a\n   file the Python function `open_utf8` must be used to handle Windows and to\n   automatically put the MIT license at the beginning of generated files.\n4. Tests must be written for C base API, the C++ base API and the C++ advanced\n   API.\n\nIf you need to create a new kind of tests then the best way is to copy-paste\nthe Python function that produces the test that resembles the most to the test\nyou want. Then modify the newly function to suit your needs. Here is a quick\noverview of Python functions present in the `egg/gen_test.py` file:\n- `gen_nbtrue`, `gen_adv`, `gen_all_any` generate tests for reduction operators.\n- `gen_reinterpret_convert` generates tests for non closed operators.\n- `gen_load_store` generates tests for load/store operators.\n- `gen_reverse` generates tests for one type of shuffle but can be extended\n  for other kind of shuffles.\n- `gen_test` generates tests for \"standard\" operators, typically those who do\n  some computations. This is the kind of tests that can handle our `foo`\n  operator and therefore nothing has to be done on our part.\n\n## Not all tests are to be done\n\nAs explained in <how_tests_are_done.md> doing all tests is not recommanded.\nTake for example the `cvt` operator. 
Testing `cvt` from say `f32` to `i32`\nis complicated as the result depends on how NaN, infinities are handled and\non the current round mode. In turn these prameters depends on the vendor, the\nchip, the bugs in the chip, the chosen rounding mode by users or other\nsoftwares...\n\nThe function `should_i_do_the_test` gives an hint on whether to implement the\ntest or not. Its code is really simple and you may need to modify it. The\nlisting below is a possible implementation that takes care of the case\ndescribed in the previous paragraph.\n\n```python\ndef should_i_do_the_test(operator, tt='', t=''):\n    if operator.name == 'cvt' and t in common.ftypes and tt in common.iutypes:\n        # When converting from float to int to float then we may not\n        # get the initial result because of roundings. As tests are usually\n        # done by going back and forth then both directions get tested in the\n        # end\n        return False\n    if operator.name == 'reinterpret' and t in common.iutypes and \\\n       tt in common.ftypes:\n        # When reinterpreting from int to float we may get NaN or infinities\n        # and no ones knows what this will give when going back to ints\n        # especially when float16 are emulated. Again as tests are done by\n        # going back and forth both directions get tested in the end.\n        return False\n    if operator.name in ['notb', 'andb', 'andnotb', 'xorb', 'orb'] and \\\n       t == 'f16':\n        # Bit operations on float16 are hard to check because they are\n        # emulated in most cases. Therefore going back and forth with\n        # reinterprets for doing bitwise operations make the bit in the last\n        # place to wrong. This is normal but makes testing real hard. 
So for\n        # now we do not test them on float16.\n        return False\n    if operator.name in ['len', 'set1', 'set1l', 'mask_for_loop_tail',\n                         'loadu', 'loada', 'storeu', 'storea', 'loadla',\n                         'loadlu', 'storela', 'storelu', 'if_else1']:\n        # These functions are used in almost every test so we consider\n        # that they are extensively tested.\n        return False\n    if operator.name in ['store2a', 'store2u', 'store3a', 'store3u',\n                         'store4a', 'store4u', 'scatter', 'scatter_linear',\n                         'downcvt', 'to_logical']:\n        # These functions are tested along with their load counterparts.\n        # downcvt is tested along with upcvt and to_logical is tested with\n        # to_mask\n        return False\n    return True\n```\n\n### Conclusion\n\nAt first sight the implementation of `foo` seems complicated because intrinsics\nfor all types and all architectures are not provided by vendors. But `nsimd`\nprovides a lot of helper functions and tries to put away details so that\nwrapping intrinsics is quickly done and easy; the goal is that the programmer\nconcentrates on the implementation itself. But be aware that more complicated\ntricks can be implemented. Browse through a `platform_*.py` file to see what\nkind of tricks are used and how they are implemented.\n\n\n## How do I add a new category?\n\nAdding a category is much simpler than an operator. It suffices to add\na class with only one member named `title` as follows:\n```python\nclass DocMyCategoryName(DocCategory):\n    title = 'My category name functions'\n```\n\nThe class must inherit from the `DocCategory` class and its name must begin\nwith `Doc`. 
The system will then take it into account, generate the entry\nin the documentation and so on.\n\n## How do I add a new module?\n\nA module is a set of functionalities that make sense to be provided alongside\nNSIMD but that cannot be part of NSIMD's core. Therefore it is not mandatory\nto provide all C and C++ API versions or to support all operators. For what\nfollows let's call the module we want to implement `mymod`.\n\nInclude files (written by hand or generated by Python) must be placed into\nthe `nsimd/include/nsimd/modules/mymod` directory and a master header file must\nbe placed at `nsimd/include/nsimd/modules/mymod.h`. You are free to organize\nthe `nsimd/include/nsimd/modules/mymod` folder as you see fit.\n\nYour module has to be found by the NSIMD generation system. For this you must\ncreate the `nsimd/egg/modules/mymod` directory and\n`nsimd/egg/modules/mymod/hatch.py` file. The latter must expose the following\nfunctions:\n\n- `def name()`  \n  Return a human readable module name beginning with an uppercase letter.\n\n- `def desc()`  \n  Return a small description of 4-5 lines of text for the module. This text\n  will appear in the `modules.md` file that lists all the available modules.\n\n- `def doc_menu()`  \n  Return a Python dictionary containing the menu for when the generation\n  system produces the HTML pages of documentation for the module. The entry\n  markdown file must be `nsimd/doc/markdown/module_mymod_overview.md` for\n  module documentation. Then, if your module has no other documentation\n  pages this function can simply return `dict()`. 
Otherwise it has to return\n  `{'menu_label': 'filename_suffix', ...}` where `menu_label` is a menu entry\n  to be displayed and pointing to `nsimd/egg/module_mymod_filename_suffix.md`.\n  Several functions in `egg/common.py` (`import common`) have to be used to\n  ease crafting documentation pages filenames:\n  + `def get_markdown_dir(opts)`  \n    Return the folder into which markdown for documentation has to be put.\n  + `def get_markdown_file(opts, name, module='')`  \n    Return the filename to be passed to the `common.open_utf8` function. The\n    `name` argument acts as a suffix as explained above while the `module`\n    argument is the name of the module.\n  \n- `def doit(opts)` \n  Is the real entry point of the module. This function has the responsibility\n  to generate all the code for your module. It can of course import all Python\n  files from NSIMD and take advantage of the `operators.py` file. To\n  respect the switches passed by the user at command line it is recommended to\n  write this function as follows.\n\n  ```python\n  def doit(opts):\n      common.myprint(opts, 'Generating module mymod')\n      if opts.library:\n          gen_module_headers(opts)\n      if opts.tests:\n          gen_tests(opts)\n      if opts.doc:\n          gen_doc(opts)\n  ```\n\nTests for the module have to be put into the `nsimd/tests/mymod` directory.\n\n## How do I add a new platform?\n\nThe list of supported platforms is determined by looking in the `egg`\ndirectory and listing all `platform_*.py` files. Each file must contain all\nSIMD extensions for a given platform. For example the default (no SIMD) is\ngiven by `platform_cpu.py`. 
All the Intel SIMD extensions are given by\n`platform_x86.py`.\n\nEach Python file that implements a platform must be named\n`platform_[name for platform].py` and must export at least the following\nfunctions:\n\n- `def get_simd_exts()`  \n  Return the list of SIMD extensions implemented by this file as a Python\n  list.\n\n- `def get_prev_simd_ext(simd_ext)`  \n  Usually SIMD extensions are added over time by vendors and a chip\n  implementing  a SIMD extension supports previous SIMD extension. This\n  function must return the previous SIMD extension supported by the vendor if\n  it exists otherwise it must return the empty string. Note that `cpu` is the\n  only SIMD extensions that has no previous SIMD extensions. Every other SIMD\n  extension has at least `cpu` as previous SIMD extension.\n\n- `def get_native_typ(simd_ext, typ)`  \n  Return the native SIMD type corresponding of the SIMD extension `simd_ext`\n  whose elements are of type `typ`. If `typ` or `simd_ext` is not known then a\n  ValueError exception must be raised.\n\n- `def get_type(simd_ext, typ)`  \n  Returns the \"intrinsic\" SIMD type corresponding to the given\n  arithmetic type. If `typ` or `simd_ext` is not known then a ValueError\n  exception must be raised.\n\n- `def get_additional_include(func, simd_ext, typ)`  \n  Returns additional include if need be for the implementation of `func` for\n  the given `simd_ext` and `typ`.\n\n- `def get_logical_type(simd_ext, typ)`  \n  Returns the \"intrinsic\" logical SIMD type corresponding to the given\n  arithmetic type. If `typ` or `simd_ext` is not known then a ValueError\n  exception must be raised.\n\n- `def get_nb_registers(simd_ext)`  \n  Returns the number of registers for this SIMD extension.\n\n- `def get_impl(func, simd_ext, from_typ, to_typ)`  \n  Returns the implementation (C code) for `func` on type `typ` for `simd_ext`.\n  If `typ` or `simd_ext` is not known then a ValueError exception must be\n  raised. 
Any `func` given satisfies `S func(T a0, T a1, ... T an)`.\n\n- `def has_compatible_SoA_types(simd_ext)`  \n  Returns True iff the given `simd_ext` has structure of arrays types\n  compatible with NSIMD i.e. whose members are v1, v2, ... Returns False\n  otherwise. If `simd_ext` is not known then a ValueError exception must be\n  raised.\n\n- `def get_SoA_type(simd_ext, typ, deg)`  \n  Returns the structure of arrays types for the given `typ`, `simd_ext` and\n  `deg`. If `simd_ext` is not known or does not name a type whose\n  corresponding SoA types are compatible with NSIMD then a ValueError\n  exception must be raised.\n\n- `def emulate_fp16(simd_ext)`  \n  Returns True iff the given SIMD extension has to emulate FP16's with\n  two FP32's.\n\nThen you are free to implement the SIMD extensions for the platform. See above\non how to add the implementations of operators.\n"
  },
  {
    "path": "LICENSE",
    "content": "Copyright (c) 2019 Agenium Scale\n\nPermission is hereby granted, free of charge, to any person obtaining a copy\nof this software and associated documentation files (the \"Software\"), to deal\nin the Software without restriction, including without limitation the rights\nto use, copy, modify, merge, publish, distribute, sublicense, and/or sell\ncopies of the Software, and to permit persons to whom the Software is\nfurnished to do so, subject to the following conditions:\n\nThe above copyright notice and this permission notice shall be included in all\ncopies or substantial portions of the Software.\n\nTHE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\nIMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\nFITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\nAUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\nLIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\nOUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\nSOFTWARE.\n"
  },
  {
    "path": "README.md",
    "content": "Documentation can be found [here](https://agenium-scale.github.io/nsimd/).\nWe put a lot of effort into\n[testing](https://agenium-scale.github.io/nsimd/how_tests_are_done.html).\n\n# What is NSIMD?\n\nAt its core, NSIMD is a vectorization library that abstracts [SIMD\nprogramming](<https://en.wikipedia.org/wiki/SIMD>). It was designed to exploit\nthe maximum power of processors at a low development cost. NSIMD comes with\nmodules. As of now two of them adds support for GPUs to NSIMD. The\ndirection that NSIMD is taking is to provide several programming paradigms\nto address different problems and to allow a wider support of architectures.\nWith two of its modules NSIMD provides three programming paradigms:\n\n- Imperative programming provided by NSIMD core that supports a lots of\n  CPU/SIMD extensions.\n- Expressions templates provided by the TET1D module that supports all\n  architectures from NSIMD core and adds support for NVIDIA and AMD GPUs.\n- Single Program Multiple Data provided by the SPMD module that supports all\n  architectures from NSIMD core and adds support for NVIDIA and AMD GPUs.\n\n## Supported architectures\n\n| Architecture                          | NSIMD core | TET1D module | SPMD module |\n|:--------------------------------------|:----------:|:------------:|:-----------:|\n| CPU (scalar functions)                |     Y      |      Y       |      Y      |\n| CPU (128-bits SIMD emulation)         |     Y      |      Y       |      Y      |\n| Intel SSE 2                           |     Y      |      Y       |      Y      |\n| Intel SSE 4.2                         |     Y      |      Y       |      Y      |\n| Intel AVX                             |     Y      |      Y       |      Y      |\n| Intel AVX2                            |     Y      |      Y       |      Y      |\n| Intel AVX-512 for KNLs                |     Y      |      Y       |      Y      |\n| Intel AVX-512 for Skylake processors  |     Y      |      Y       | 
     Y      |\n| Arm NEON 128 bits (ARMv7 and earlier) |     Y      |      Y       |      Y      |\n| Arm NEON 128 bits (ARMv8 and later)   |     Y      |      Y       |      Y      |\n| Arm SVE (original sizeless SVE)       |     Y      |      Y       |      Y      |\n| Arm fixed sized SVE                   |     Y      |      Y       |      Y      |\n| IBM POWERPC VMX                       |     Y      |      Y       |      Y      |\n| IBM POWERPC VSX                       |     Y      |      Y       |      Y      |\n| NVIDIA CUDA                           |     N      |      Y       |      Y      |\n| AMD ROCm                              |     N      |      Y       |      Y      |\n| Intel oneAPI                          |     N      |      Y       |      Y      |\n\n## Contributions\n\n| Contributor          | Contribution(s)                                   |\n|:---------------------|:--------------------------------------------------|\n| Guillaume Quintin    | Maintainer + main contributor                     |\n| Alan Kelly           | Arm NEON + mathematical functions                 |\n| Kenny Péou           | Fixed point module                                |\n| Xavier Berault       | PowerPC VMX and VSX                               |\n| Vianney Stricher     | NSIMD core + oneAPI in SPMD and TET1D modules     |\n| Quentin Khan         | Soa/AoS loads and stores                          |\n| Paul Gannay          | PowerPC VMX, VSX + testing system                 |\n| Charly Chevalier     | Benchmarking system + Python internals            |\n| Erik Schnetter       | Fixes + code generation                           |\n| Lénaïc Bagnères      | Fixes + TET1D module                              |\n| Jean-Didier Pailleux | Shuffles operators                                |\n\n## How it works?\n\nTo achieve maximum performance, NSIMD mainly relies on the inline optimization\npass of the compiler. 
Therefore using any mainstream compiler such as GCC,\nClang, MSVC, XL C/C++, ICC and others with NSIMD will give you a zero-cost SIMD\nabstraction library.\n\nTo allow inlining, a lot of code is placed in header files. *Small* functions\nsuch as addition, multiplication, square root, etc, are all present in header\nfiles whereas big functions such as I/O are put in source files that are\ncompiled as a `.so`/`.dll` library.\n\nNSIMD provides C89, C11, C++98, C++11, C++14 and C++20 APIs. All APIs allow\nwriting generic code. For the C API this is achieved through a thin layer of\nmacros and with the `_Generic` keyword for the C advanced API; for the C++ APIs\nit is achieved using templates and function overloading. The C++ APIs are split\ninto two. The first part is a C-like API with only function calls and direct\ntype definitions for SIMD types while the second one provides operator\noverloading, higher level type definitions that allows unrolling.  C++11, C++14\nAPIs add for instance templated type definitions and templated constants while\nthe C++20 API uses concepts for better error reporting.\n\nBinary compatibility is guaranteed by the fact that only a C ABI is exposed.\nThe C++ API only wraps the C calls.\n\n## Supported compilers\n\nNSIMD is tested with GCC, Clang, MSVC, NVCC, HIPCC and ARMClang. As a C89 and a\nC++98 API are provided, other compilers should work fine. Old compiler versions\nshould work as long as they support the targeted SIMD extension. For instance,\nNSIMD can compile SSE 4.2 code with MSVC 2010.\n\n# Build the library\n\n## CMake\n\nAs CMake is widely used as a build system, we have added support for building\nthe library only and the corresponding find module.\n\n```sh\nmkdir build\ncd build\ncmake .. 
-Dsimd=SIMD_EXT\nmake\nmake install\n```\n\nwhere `SIMD_EXT` is one of the following: CPU, SSE2, SSE42, AVX, AVX2,\nAVX512\\_KNL, AVX512\\_SKYLAKE, NEON128, AARCH64, SVE, SVE128, SVE256, SVE512,\nSVE1024, SVE2048, VMX, VSX, CUDA, ROCM.\n\nNote that when compiling for NEON128 on Linux one has to choose the ABI, either\narmel or armhf. Default is armel. As CMake is unable to autodetect this\nparameter one has to tell CMake manually.\n\n```sh\ncmake .. -Dsimd=neon128                               # for armel\ncmake .. -Dsimd=neon128 -DNSIMD_ARM32_IS_ARMEL=OFF    # for armhf\n```\n\nWe provide in the `scripts` directory a CMake find module to find NSIMD on\nyour system. One can let the module find NSIMD on its own, if several\nversions for different SIMD extensions of NSIMD are installed then the\nmodule will find and return one. There is no guaranty on which versions will\nbe chosen by the module.\n\n```cmake\nfind_package(NSIMD)\n```\n\nIf one wants a specific version of the library for a given SIMD extension then\nuse the `COMPONENTS` part of `find_package`. Only one component is supported\nat a time.\n\n```cmake\nfind_package(NSIMD COMPONENTS avx2)         # find only NSIMD for Intel AVX2\nfind_package(NSIMD COMPONENTS sve)          # find only NSIMD for Arm SVE\nfind_package(NSIMD COMPONENTS sse2 sse42)   # unsupported\n```\n\n## Nsconfig\n\nThe support for CMake has been limited to building the library only. If you\nwish to run tests or contribute you need to use nsconfig as CMake has several\nflaws:\n- too slow especially on Windows,\n- inability to use several compilers at once,\n- inability to have a portable build system,\n- very poor support for portable compilation flags,\n- ...\n\n## Dependencies (nsconfig only)\n\nGenerating C/C++ files is done by the Python3 code contained in the `egg`.\nPython should be installed by default on any Linux distro. 
On Windows it comes\nwith the latest versions of Visual Studio on Windows\n(<https://visualstudio.microsoft.com/vs/community/>), you can also download and\ninstall it directly from <https://www.python.org/>.\n\nThe Python code can call `clang-format` to properly format all generated C/C++\nsource. On Linux you can install it via your package manager. On Windows you\ncan use the official binary at <https://llvm.org/builds/>.\n\nCompiling the library requires a C++98 compiler. Any version of GCC, Clang or\nMSVC will do. Note that the produced library and header files for the end-user\nare C89, C++98, C++11 compatible. Note that C/C++ files are generated by a\nbunch of Python scripts and they must be executed first before running building\nthe library.\n\n## Build for Linux\n\n```bash\nbash scripts/build.sh for simd_ext1/.../simd_extN with comp1/.../compN\n```\n\nFor each combination a directory `build-simd_ext-comp` will be created and\nwill contain the library. Supported SIMD extension are:\n\n- sse2\n- sse42\n- avx\n- avx2\n- avx512\\_knl\n- avx512\\_skylake\n- neon128\n- aarch64\n- sve\n- sve128\n- sve256\n- sve512\n- sve1024\n- sve2048\n- vmx\n- vsx\n- cuda\n- rocm\n\nSupported compiler are:\n\n- gcc\n- clang\n- icc\n- armclang\n- xlc\n- dpcpp\n- fcc\n- cl\n- nvcc\n- hipcc\n\nNote that certain combination of SIMD extension/compilers are not supported\nsuch as aarch64 with icc, or avx512\\_skylake with nvcc.\n\n## Build on Windows\n\nMake sure you are typing in a Visual Studio prompt. The command is almost the\nsame as for Linux with the same constraints on the pairs SIMD\nextension/compilers.\n\n```batch\nscripts\\build.bat for simd_ext1/.../simd_extN with comp1/.../compN\n```\n\n## More details on building the library\n\nThe library uses a tool called nsconfig\n(<https://github.com/agenium-scale/nstools>) which is basically a Makefile\ntranslator. 
If you have just built NSIMD following what's described above\nyou should have a `nstools` directory which contains `bin/nsconfig`. If not\nyou can generate it using on Linux\n\n```bash\nbash scripts/setup.sh\n```\n\nand on Windows\n\n```batch\nscripts\\setup.bat\n```\n\nThen you can use `nsconfig` directly; it has a syntax similar to CMake at\ncommand line. Here is a quick tutorial with Linux command line. We first\ngo to the NSIMD directory and generate both NSIMD and nsconfig.\n\n```bash\n$ cd nsimd\n$ python3 egg/hatch.py -ltf\n$ bash scripts/setup.sh\n$ mkdir build\n$ cd build\n```\n\nHelp can be displayed using `--help`.\n\n```bash\n$ ../nstools/bin/nsconfig --help\nusage: nsconfig [OPTIONS]... DIRECTORY\nConfigure project for compilation.\n\n  -v              verbose mode, useful for debugging\n  -nodev          Build system will never call nsconfig\n  -DVAR=VALUE     Set value of variable VAR to VALUE\n  -list-vars      List project specific variables\n  -GBUILD_SYSTEM  Produce files for build system BUILD_SYSTEM\n                  Supported BUILD_SYSTEM:\n                    make       POSIX Makefile\n                    gnumake    GNU Makefile\n                    nmake      Microsoft Visual Studio NMake Makefile\n                    ninja      Ninja build file (this is the default)\n                    list-vars  List project specific variables\n  -oOUTPUT        Output to OUTPUT instead of default\n  -suite=SUITE    Use compilers from SUITE as default ones\n                  Supported SUITE:\n                    gcc       The GNU compiler collection\n                    msvc      Microsoft C and C++ compiler\n                    llvm      The LLVM compiler infrastructure\n                    armclang  Arm suite of compilers based on LLVM\n                    xlc       IBM suite of compilers\n                    fcc_trad_mode\n                              Fujitsu compiler in traditional mode\n                    fcc_clang_mode\n                              Fujitsu compiler in clang mode\n                    emscripten\n                              Emscripten suite for compiling into JS\n                    icc       Intel C and C++ compiler\n                    rocm      Radeon Open Compute compilers\n                    oneapi    Intel oneAPI compilers\n                    cuda, cuda+gcc, cuda+clang, cuda+msvc\n                              Nvidia CUDA C++ compiler\n  -comp=COMMAND,COMPILER[,PATH[,VERSION[,ARCHI]]]\n                  Use COMPILER when COMMAND is invoked for compilation\n                  If VERSION and/or ARCHI are not given, nsconfig will\n                  try to determine those. This is useful for cross\n                  compiling and/or setting the CUDA host compiler.\n                  COMMAND must be in { cc, c++, gcc, g++, cl, icc, nvcc,\n                  hipcc, hcc, clang, clang++, armclang, armclang++,\n                  cuda-host-c++, emcc, em++ } ;\n                  VERSION is compiler dependent. Note that VERSION\n                  can be set to only major number(s) in which case\n                  nsconfig fills missing numbers with zeros.\n                  Supported ARCHI:\n                    x86      Intel 32-bits ISA\n                    x86_64   Intel/AMD 64-bits ISA\n                    armel    ARMv5 and ARMv6 32-bits ISA\n                    armhf    ARMv7 32-bits ISA\n                    aarch64  ARM 64-bits ISA\n                    ppc64el  PowerPC 64-bits little endian\n                    wasm32   WebAssembly with 32-bits memory indexing\n                    wasm64   WebAssembly with 64-bits memory indexing\n                  Supported COMPILER:\n                    gcc, g++              GNU Compiler Collection\n                    clang, clang++        LLVM Compiler Infrastructure\n                    emcc, em++            Emscripten compilers\n                    msvc, cl              Microsoft Visual C++\n                    armclang, armclang++  ARM Compiler\n                    xlc, xlc++            IBM Compiler\n                    icc                   Intel C/C++ Compiler\n                    dpcpp                 Intel DPC++ Compiler\n                    nvcc                  Nvidia CUDA compiler\n                    hipcc                 ROCm HIP compiler\n                    fcc_trad_mode, FCC_trad_mode\n                                          Fujitsu C and C++ traditional\n                                          compiler\n                    fcc_clang_mode, FCC_clang_mode\n                                          Fujitsu C and C++ clang mode\n                                          compiler\n  -prefix=PREFIX  Set path for installation to PREFIX\n  -h, --help      Print the current help\n\nNOTE: Nvidia CUDA compiler (nvcc) needs a host compiler. Usually on\n      Linux systems it is GCC while on Windows systems it is MSVC.\n      If nvcc is chosen as the default C++ compiler via the -suite\n      switch, then its host compiler can be invoked in compilation\n      commands with 'cuda-host-c++'. The latter defaults to GCC on Linux\n      systems and MSVC on Windows systems. The user can of course choose\n      a specific version and path of this host compiler via the\n      '-comp=cuda-host-c++,...' parameters. If nvcc is not chosen as the\n      default C++ compiler but is used for compilation then its default\n      C++ host compiler is 'c++'. The latter can also be customized via\n      the '-comp=c++,...' command line switch.\n```\n\nEach project can define its own set of variables controlling the generation of\nthe ninja file or Makefile.\n\n```bash\n$ ../nstools/bin/nsconfig .. 
-list-vars\nProject variables list:\nname             | description\n-----------------|-----------------------------------\nsimd             | SIMD extension to use\ncuda_arch_flags  | CUDA target arch flag(s) for tests\nstatic_libstdcpp | Compile the libstdc++ statically\ncpp20_tests      | Enable C++20 tests\n```\n\nFinally one can choose what to do and compile NSIMD and its tests.\n\n```bash\n$ ../nstools/bin/nsconfig .. -Dsimd=avx2\n$ ninja\n$ ninja tests\n```\n\nNsconfig comes with nstest, a small tool to execute tests.\n\n```bash\n$ ../nstools/bin/nstest -j20\n```\n\n## Cross compilation\n\nIt is useful to cross-compile for example when you are on an Intel workstation\nand want to compile for a Raspberry Pi. Nsconfig generates some code, compiles\nand runs it to obtain information on the C or C++ compilers. When cross\ncompiling, unless you configured your Linux box with binfmt\_misc to\ntransparently execute aarch64 binaries on a x86\_64 host you need to give\nnsconfig all the information about the compilers so that it does not need to\nrun aarch64 code on x86\_64 host.\n\n```bash\n$ ../nstools/bin/nsconfig .. -Dsimd=aarch64 \\\n      -comp=cc,gcc,aarch64-linux-gnu-gcc,10.0,aarch64 \\\n      -comp=c++,gcc,aarch64-linux-gnu-g++,10.0,aarch64\n```\n\n## Defines that control NSIMD compilation and usage\n\nSeveral defines control NSIMD.\n\n- `FMA` or `NSIMD_FMA` indicate to NSIMD that fma intrinsics can be used\n  when compiling code. This is useful on Intel SSE2, SSE42, AVX and AVX2.\n\n- `FP16` or `NSIMD_FP16` indicate to NSIMD that the targeted architecture\n  natively (and possibly partially) supports IEEE float16's. This is useful\n  when compiling for Intel SSE2, SSE42, AVX and AVX2, Arm NEON128 and AARCH64.\n\n# Philosophy of NSIMD\n\nOriginally the library aimed at providing a portable zero-cost abstraction over\nSIMD vendor intrinsics disregarding the underlying SIMD vector length. 
NSIMD\nwill of course continue to wrap SIMD intrinsics from various vendors but\nmore efforts will be put into writing NSIMD modules and improving the existing\nones especially the SPMD module. \n\n## The SPMD paradigm\n\nIt is our belief that SPMD is a good paradigm for writing vectorized code. It\nhelps both the developer and the compiler writer. It forces the developers to\nbetter arrange their data in memory in a layout suited for vectorization. On\nthe compiler side it is simpler to write a \"SPMD compiler\" than a standard\nC/C++/Fortran compiler that tries to autovectorize some weird loop with data\nscattered all around the place. Our priorities for our SPMD module are the\nfollowing:\n\n- Add oneAPI/SYCL support.\n- Provide a richer API.\n- Provide cross-lane data transfer.\n- Provide a way to abstract shared memory.\n\nOur approach can be roughly compared to ISPC (<https://ispc.github.io/>)\nbut from a library point of view.\n\n## Wrapping intrinsics in NSIMD core\n\nNSIMD was designed following as closely as possible the following guidelines:\n\n- Correctness takes precedence over speed except for corner cases which may\n  include the following:\n  + Buggy intrinsics on rare input values (denormal numbers, infinities,\n    NaNs) in which case a slower but correct alternative may be\n    proposed to bypass the buggy intrinsics.\n  + A buggy intrinsic but for a specific version of a family of chips. It\n    would be unreasonable to penalize the majority of users vs. a few (or\n    even no) users.\n- Emulate with tricks and intrinsic integer arithmetic when not available.\n- Use common names as found in common computation libraries.\n- Do not hide SIMD registers, one variable (of a type such as `nsimd::pack`)\n  matches one register. 
When possible force the user to think differently between\n  SIMD code and scalar code.\n- Make the life of the compiler as easy as possible: keep the code simple to\n  allow the compiler to perform as many optimizations as possible.\n- Favor the advanced C++ API.\n\nYou may wrap intrinsics that require compile time knowledge of the underlying\nvector length but this should be done with caution.\n\nWrapping intrinsics that do not exist for all types is difficult and may\nrequire casting or emulation. For instance, 8 bit integer vector multiplication\nusing SSE2 does not exist. We can either process each pair of integers\nindividually or we can cast the 8 bit vectors to 16 bit vectors, do the\nmultiplication and cast them back to 8 bit vectors. In the second case,\nchaining operations will generate many unwanted casts.\n\nTo avoid hiding important details from the user, overloads of operators involving\nscalars and SIMD vectors are not provided by default. Those can be included\nexplicitly to emphasize the fact that using expressions like `scalar + vector`\nmight incur an optimization penalty.\n\nThe use of `nsimd::pack` may not be portable to ARM SVE and therefore must be\nincluded manually. ARM SVE registers can only be stored in sizeless structs\n(`__sizeless_struct`). This feature (as of 2019/04/05) is only supported by the\nARM compiler. We do not know whether other compilers will use the same keyword\nor paradigm to support SVE intrinsics.\n\n# Contributing to NSIMD\n\nThe wrapping of intrinsics, the writing of test and bench files are tedious and\nrepetitive tasks. 
Most of those are generated using Python scripts that can be\nfound in `egg`.\n\n- Intrinsics that do not require to know the vector length can be wrapped and\n  will be accepted with no problem.\n- Intrinsics that do require the vector length at compile time can be wrapped\n  but it is up to the maintainer to accept it.\n- Use `clang-format` when writing C or C++ code.\n- The `.cpp` files are written in C++98.\n- The header files must be compatible with C89 (when possible otherwise\n  C99), C++98, C++11, C++14 up to and including C++20.\n\nPlease see <doc/markdown/CONTRIBUTE.md> for more details.\n\n# LICENSES\n\nNSIMD contains files from the excellent [Sleef library](https://sleef.org/)\nwhose license is stated below. The corresponding files are all located\nin the `src` folder and have retained their original license notices.\n\n## NSIMD license\n\nCopyright (c) 2021 Agenium Scale\n\nPermission is hereby granted, free of charge, to any person obtaining a copy of\nthis software and associated documentation files (the \"Software\"), to deal in\nthe Software without restriction, including without limitation the rights to\nuse, copy, modify, merge, publish, distribute, sublicense, and/or sell copies\nof the Software, and to permit persons to whom the Software is furnished to do\nso, subject to the following conditions:\n\nThe above copyright notice and this permission notice shall be included in all\ncopies or substantial portions of the Software.\n\nTHE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\nIMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\nFITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE\nAUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\nLIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\nOUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\nSOFTWARE.\n\n## Sleef license ([Boost Software License v1.0](https://www.boost.org/LICENSE_1_0.txt))\n\nBoost Software License - Version 1.0 - August 17th, 2003\n\nPermission is hereby granted, free of charge, to any person or organization\nobtaining a copy of the software and accompanying documentation covered by\nthis license (the \"Software\") to use, reproduce, display, distribute,\nexecute, and transmit the Software, and to prepare derivative works of the\nSoftware, and to permit third-parties to whom the Software is furnished to\ndo so, all subject to the following:\n\nThe copyright notices in the Software and this entire statement, including\nthe above license grant, this restriction and the following disclaimer,\nmust be included in all copies of the Software, in whole or in part, and\nall derivative works of the Software, unless such copies or derivative\nworks are solely in the form of machine-executable object code generated by\na source language processor.\n\nTHE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\nIMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\nFITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT\nSHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE\nFOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE,\nARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER\nDEALINGS IN THE SOFTWARE.\n\n"
  },
  {
    "path": "benches/benches.hpp",
    "content": "#ifndef BENCHES_HPP\n#define BENCHES_HPP\n\n#include <limits>\n#include <cmath>\n#include <climits>\n\nnamespace nsimd {\nnamespace benches {\n\ntemplate <typename T>\ndouble rand_sign() {\n  if (std::is_unsigned<T>::value) {\n    return 1.;\n  } else {\n    return (::rand() % 2) ? 1. : -1.;\n  }\n}\n\ntemplate <typename T>\nT rand_bits(T min, T max = std::numeric_limits<T>::max()) {\n  T r;\n  do {\n    int nbits = sizeof(T) * CHAR_BIT;\n    u64 x = 0;\n    for (int i = 0; i < nbits; ++i) {\n      x |= u64(::rand() % 2) << i;\n    }\n    r = *((T*)&x);\n  } while (r < min || r > max);\n  return r;\n}\n\ntemplate <typename T>\nT rand_from(T min, T max = std::numeric_limits<T>::max()) {\n  // From: http://c-faq.com/lib/randrange.html\n  return T(double(min)\n      + (double(::rand()) / (double(RAND_MAX) / (double(max) - double(min) + 1))));\n}\n\ntemplate <typename T>\nT rand_fp(T min, T max) {\n  T r;\n  if (std::isinf(min) && std::isinf(max)) {\n    // For now, we're not using this method for random number\n    //r = rand_bits<T>(min, max);\n    r = rand_from<T>(-1000000, 1000000);\n  } else {\n    r = rand_from<T>(min, max);\n  }\n  return r;\n}\n\ntemplate <typename T>\nT rand(T min, T max = std::numeric_limits<T>::max()) {\n  return rand_from<T>(min, max);\n}\n\ntemplate <>\nfloat rand<float>(float min, float max) {\n  return rand_fp<float>(min, max);\n}\n\ntemplate <>\ndouble rand<double>(double min, double max) {\n  return rand_fp<double>(min, max);\n}\n\n}\n}\n\n#endif\n"
  },
  {
    "path": "build.nsconfig",
    "content": "# MIT License\n#\n# Copyright (c) 2021 Agenium Scale\n#\n# Permission is hereby granted, free of charge, to any person obtaining a copy\n# of this software and associated documentation files (the \"Software\"), to deal\n# in the Software without restriction, including without limitation the rights\n# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell\n# copies of the Software, and to permit persons to whom the Software is\n# furnished to do so, subject to the following conditions:\n#\n# The above copyright notice and this permission notice shall be included in all\n# copies or substantial portions of the Software.\n#\n# THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\n# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\n# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\n# SOFTWARE.\n\npackage_name nsimd-3.0\n\n## ----------------------------------------------------------------------------\n## Get OS/Compiler specific file extensions\n\nset o       = @obj_ext\nset exe     = @exe_ext\nset s       = @asm_ext\nset so      = @shared_lib_ext\nset lib     = @shared_link_ext\nset root    = @source_dir\nset make    = @make_command\nset build   = @build_dir\nset root    = @source_dir\nset ccomp   = @ccomp_name\nset cppcomp = @cppcomp_name\n\n## ----------------------------------------------------------------------------\n## Some defaults\n\nifnot_set \"SIMD extension to use\" simd = cpu\nifnot_set \"CUDA target arch flag(s) for tests\" cuda_arch_flags = \"\"\nifnot_set \"Compile the libstdc++ statically\" static_libstdcpp = true\nifnot_set \"Enable C++20 tests\" cpp20_tests = \"\"\n\n## 
----------------------------------------------------------------------------\n## Targets for compilation\n\nset o_for_               = fp16$o memory$o ufp$o api_cpu$o rempitab$o \\\n                           sleefsp$o sleefdp$o gpu$o\nset o_for_cpu            = $o_for_\nset o_for_cuda           = $o_for_\nset o_for_rocm           = $o_for_\nset o_for_oneapi         = $o_for_\nset o_for_sse2           = $o_for_cpu api_sse2$o sleef_sse2_f32$o \\\n                           sleef_sse2_f64$o\nset o_for_sse42          = $o_for_sse2 api_sse42$o sleef_sse42_f32$o \\\n                           sleef_sse42_f64$o\nset o_for_avx            = $o_for_sse42 api_avx$o sleef_avx_f32$o \\\n                           sleef_avx_f64$o\nset o_for_avx2           = $o_for_avx api_avx2$o sleef_avx2_f32$o \\\n                           sleef_avx2_f64$o\nset o_for_avx512_knl     = $o_for_avx2 api_avx512_knl$o \\\n                           sleef_avx512_knl_f32$o sleef_avx512_knl_f64$o\nset o_for_avx512_skylake = $o_for_avx2 api_avx512_skylake$o \\\n                           sleef_avx512_skylake_f32$o \\\n                           sleef_avx512_skylake_f64$o\nset o_for_neon128        = $o_for_cpu api_neon128$o sleef_neon128_f32$o \\\n                           sleef_neon128_f64$o\nset o_for_aarch64        = $o_for_cpu api_aarch64$o sleef_aarch64_f32$o \\\n                           sleef_aarch64_f64$o\nset o_for_sve            = $o_for_aarch64 api_sve$o sleef_sve_f32$o \\\n                           sleef_sve_f64$o\nset o_for_sve128         = $o_for_aarch64 api_sve128$o sleef_sve128_f32$o \\\n                           sleef_sve128_f64$o\nset o_for_sve256         = $o_for_aarch64 api_sve256$o sleef_sve256_f32$o \\\n                           sleef_sve256_f64$o\nset o_for_sve512         = $o_for_aarch64 api_sve512$o sleef_sve512_f32$o \\\n                           sleef_sve512_f64$o\nset o_for_sve1024        = $o_for_aarch64 api_sve1024$o sleef_sve1024_f32$o \\\n                           
sleef_sve1024_f64$o\nset o_for_sve2048        = $o_for_aarch64 api_sve2048$o sleef_sve2048_f32$o \\\n                           sleef_sve2048_f64$o\nset o_for_vmx            = $o_for_cpu api_vmx$o sleef_vmx_f32$o sleef_vmx_f64$o\nset o_for_vsx            = $o_for_vmx api_vsx$o sleef_vsx_f32$o sleef_vsx_f64$o\n\n## ----------------------------------------------------------------------------\n## SIMD compiler flags\n\nlambda cflags_for_generic_*      = -DCPU\nset    cflags_for_generic_cuda   = -DCUDA\nset    cflags_for_generic_rocm   = -DROCM\nset    cflags_for_generic_oneapi = -DONEAPI\n\nset cflags_for_               = ${cflags_for_generic_$simd$}\nset cflags_for_cpu            = $cflags_for_\nset cflags_for_cuda           = -DCUDA\nset cflags_for_rocm           = -DROCM\nset cflags_for_oneapi         = -DONEAPI\nset cflags_for_sse2           = -DSSE2 -msse2\nset cflags_for_sse42          = -DSSE42 -msse42\nset cflags_for_avx            = -DAVX -mavx\nset cflags_for_avx2           = -DAVX2 -mavx2 -DFMA -mfma -DFP16 -mfp16\nset cflags_for_avx512_knl     = -DAVX512_KNL -mavx512_knl -mfma -DFP16 -mfp16\nset cflags_for_avx512_skylake = -DAVX512_SKYLAKE -mavx512_skylake -mfma \\\n                                -DFP16 -mfp16\nset cflags_for_neon128        = -DNEON128 -mneon128\nset cflags_for_aarch64        = -DAARCH64 -maarch64\nset cflags_for_sve            = -DSVE -msve\nset cflags_for_sve128         = -DSVE128 -msve128\nset cflags_for_sve256         = -DSVE256 -msve256\nset cflags_for_sve512         = -DSVE512 -msve512\nset cflags_for_sve1024        = -DSVE1024 -msve1024\nset cflags_for_sve2048        = -DSVE2048 -msve2048\nset cflags_for_vmx            = -DVMX -mvmx\nset cflags_for_vsx            = -DVSX -mvsx\n\n## ----------------------------------------------------------------------------\n## std default flag\n\nlambda std_flag_for_*      = -std=c++98\nset    std_flag_for_rocm   = -std=c++11\nset    std_flag_for_oneapi = -std=c++17\n\n## 
----------------------------------------------------------------------------\n## libstdc++ linking mode\n\nset libstdcpp_static_link_true  = -static-libstdc++\nset libstdcpp_static_link_false = \n\n## ----------------------------------------------------------------------------\n## Some defaults\n\nset flags        = -Wall -fPIC -O2 -I$root$/include -DNDEBUG\nset cflags       = ${std_flag_for_$simd$} $flags \\\n                   ${libstdcpp_static_link_$static_libstdcpp$}\nset sleef_cflags = -fPIC -O2 -I$root$/src -DNDEBUG -DDORENAME=1\n\n## ----------------------------------------------------------------------------\n## Default building rules\n\nphony all deps libnsimd_$simd$$so$\n\nbuild_file libnsimd_$simd$$so deps ${o_for_$simd$}\n\tc++ -fPIC -shared @in -o @out\n\nset ldflags = -fPIC -L. -lnsimd_$simd\n\n## ----------------------------------------------------------------------------\n## Generic (emulation) rules for building\n\nbuild_file gpu$o autodeps $root$/src/gpu.cpp\n\tc++ $cflags$ $cflags_for_cpu @in -c -o @out\n\nbuild_file ufp$o autodeps $root$/src/ufp.cpp\n\tc++ $cflags$ $cflags_for_cpu @in -c -o @out\n\nbuild_file fp16$o autodeps $root$/src/fp16.cpp\n\tc++ $cflags$ $cflags_for_cpu @in -c -o @out\n\nbuild_file memory$o autodeps $root$/src/memory.cpp\n\tc++ $cflags$ $cflags_for_cpu @in -c -o @out\n\nbuild_file rempitab$o autodeps $root$/src/rempitab.c\n\tcc $sleef_cflags$ -c @in -o @out\n\nbuild_file sleefsp$o autodeps $root$/src/sleefsp.c\n\tcc $sleef_cflags$ -c @in -o @out\n\nbuild_file sleefdp$o autodeps $root$/src/sleefdp.c\n\tcc $sleef_cflags$ -c @in -o @out\n\nbuild_file api_cpu$o autodeps $root$/src/api_cpu.cpp\n\tc++ $cflags$ $cflags_for_cpu -c @in -o @out\n\n## ----------------------------------------------------------------------------\n## Intel rules for building\n\nbuild_file api_sse2$o autodeps $root$/src/api_sse2.cpp\n\tc++ $cflags$ -c $cflags_for_sse2 @in -o @out\n\nbuild_file sleef_sse2_f32$o autodeps $root$/src/sleefsimdsp.c\n\tcc 
$sleef_cflags$ -c -msse2 -DNSIMD_SSE2 -DENABLE_SSE2=1 @in -o @out\n\nbuild_file sleef_sse2_f64$o autodeps $root$/src/sleefsimddp.c\n\tcc $sleef_cflags$ -c -msse2 -DNSIMD_SSE2 -DENABLE_SSE2=1 @in -o @out\n\nbuild_file api_sse42$o autodeps $root$/src/api_sse42.cpp\n\tc++ $cflags$ -c $cflags_for_sse42 @in -o @out\n\nbuild_file sleef_sse42_f32$o autodeps $root$/src/sleefsimdsp.c\n\tcc $sleef_cflags$ -c -msse42 -DNSIMD_SSE42 -DENABLE_SSE4=1 @in -o @out\n\nbuild_file sleef_sse42_f64$o autodeps $root$/src/sleefsimddp.c\n\tcc $sleef_cflags$ -c -msse42 -DNSIMD_SSE42 -DENABLE_SSE4=1 @in -o @out\n\nbuild_file api_avx$o autodeps $root$/src/api_avx.cpp\n\tc++ $cflags$ -c $cflags_for_avx @in -o @out\n\nbuild_file sleef_avx_f32$o autodeps $root$/src/sleefsimdsp.c\n\tcc $sleef_cflags$ -c -mavx -DNSIMD_AVX -DENABLE_AVX=1 @in -o @out\n\nbuild_file sleef_avx_f64$o autodeps $root$/src/sleefsimddp.c\n\tcc $sleef_cflags$ -c -mavx -DNSIMD_AVX -DENABLE_AVX=1 @in -o @out\n\nbuild_file api_avx2$o autodeps $root$/src/api_avx2.cpp\n\tc++ $cflags$ -c $cflags_for_avx2 @in -o @out\n\nbuild_file sleef_avx2_f32$o autodeps $root$/src/sleefsimdsp.c\n\tcc $sleef_cflags$ -c -mavx2 -mfma -DNSIMD_AVX2 -DENABLE_AVX2=1 \\\n\t   @in -o @out\n\nbuild_file sleef_avx2_f64$o autodeps $root$/src/sleefsimddp.c\n\tcc $sleef_cflags$ -c -mavx2 -mfma -DNSIMD_AVX2 -DENABLE_AVX2=1 \\\n\t   @in -o @out\n\nbuild_file api_avx512_knl$o autodeps $root$/src/api_avx512_knl.cpp\n\tc++ $cflags$ -c $cflags_for_avx512_knl @in -o @out\n\nbuild_file sleef_avx512_knl_f32$o autodeps $root$/src/sleefsimdsp.c\n\tcc $sleef_cflags$ -c -mavx512_knl -DNSIMD_AVX512_KNL \\\n\t   -DENABLE_AVX512F=1 @in -o @out\n\nbuild_file sleef_avx512_knl_f64$o autodeps $root$/src/sleefsimddp.c\n\tcc $sleef_cflags$ -c -mavx512_knl -DNSIMD_AVX512_KNL \\\n\t   -DENABLE_AVX512F=1 @in -o @out\n\nbuild_file api_avx512_skylake$o autodeps $root$/src/api_avx512_skylake.cpp\n\tc++ $cflags$ -c $cflags_for_avx512_skylake @in -o @out\n\nbuild_file 
sleef_avx512_skylake_f32$o autodeps $root$/src/sleefsimdsp.c\n\tcc $sleef_cflags$ -c -mavx512_knl -DNSIMD_AVX512_SKYLAKE \\\n\t   -DENABLE_AVX512F=1 @in -o @out\n\nbuild_file sleef_avx512_skylake_f64$o autodeps $root$/src/sleefsimddp.c\n\tcc $sleef_cflags$ -c -mavx512_knl -DNSIMD_AVX512_SKYLAKE \\\n\t   -DENABLE_AVX512F=1 @in -o @out\n\n## ----------------------------------------------------------------------------\n## ARM 32 bits rules for building\n\nbuild_file api_neon128$o autodeps $root$/src/api_neon128.cpp\n\tc++ $cflags$ -c $cflags_for_neon128 @in -o @out\n\nbuild_file sleef_neon128_f32$o autodeps $root$/src/sleefsimdsp.c\n\tcc $sleef_cflags$ -c -mneon128 -DNSIMD_NEON128 \\\n\t   -DENABLE_NEON32=1 @in -o @out\n\nbuild_file sleef_neon128_f64$o autodeps $root$/src/sleefsimddp_emulation.c\n\tcc $sleef_cflags$ -c -mneon128 -DNSIMD_NEON128 -DENABLE_NEON32=1 \\\n\t   -I$root$/include @in -o @out\n\n## ----------------------------------------------------------------------------\n## ARM 64 bits rules for building\n\nbuild_file api_aarch64$o autodeps $root$/src/api_aarch64.cpp\n\tc++ $cflags$ -c $cflags_for_aarch64 @in -o @out\n\nbuild_file sleef_aarch64_f32$o autodeps $root$/src/sleefsimdsp.c\n\tcc $sleef_cflags$ -c -maarch64 -DNSIMD_AARCH64 \\\n\t   -DENABLE_ADVSIMD=1 @in -o @out\n\nbuild_file sleef_aarch64_f64$o autodeps $root$/src/sleefsimddp.c\n\tcc $sleef_cflags$ -c -maarch64 -DNSIMD_AARCH64 \\\n\t   -DENABLE_ADVSIMD=1 @in -o @out\n\nbuild_file api_sve$o autodeps $root$/src/api_sve.cpp\n\tc++ $cflags$ -c $cflags_for_sve @in -o @out\n\nbuild_file sleef_sve_f32$o autodeps $root$/src/sleefsimdsp.c\n\tcc $sleef_cflags$ -c -msve -DNSIMD_SVE -DENABLE_SVE=1 @in -o @out\n\nbuild_file sleef_sve_f64$o autodeps $root$/src/sleefsimddp.c\n\tcc $sleef_cflags$ -c -msve -DNSIMD_SVE -DENABLE_SVE=1 @in -o @out\n\nbuild_file api_sve128$o autodeps $root$/src/api_sve128.cpp\n\tc++ $cflags$ -c $cflags_for_sve128 @in -o @out\n\nbuild_file sleef_sve128_f32$o autodeps 
$root$/src/sleefsimdsp.c\n\tcc $sleef_cflags$ -c -msve128 -DNSIMD_SVE128 -DENABLE_SVE=1 @in -o @out\n\nbuild_file sleef_sve128_f64$o autodeps $root$/src/sleefsimddp.c\n\tcc $sleef_cflags$ -c -msve128 -DNSIMD_SVE128 -DENABLE_SVE=1 @in -o @out\n\nbuild_file api_sve256$o autodeps $root$/src/api_sve256.cpp\n\tc++ $cflags$ -c $cflags_for_sve256 @in -o @out\n\nbuild_file sleef_sve256_f32$o autodeps $root$/src/sleefsimdsp.c\n\tcc $sleef_cflags$ -c -msve256 -DNSIMD_SVE256 -DENABLE_SVE=1 @in -o @out\n\nbuild_file sleef_sve256_f64$o autodeps $root$/src/sleefsimddp.c\n\tcc $sleef_cflags$ -c -msve256 -DNSIMD_SVE256 -DENABLE_SVE=1 @in -o @out\n\nbuild_file api_sve512$o autodeps $root$/src/api_sve512.cpp\n\tc++ $cflags$ -c $cflags_for_sve512 @in -o @out\n\nbuild_file sleef_sve512_f32$o autodeps $root$/src/sleefsimdsp.c\n\tcc $sleef_cflags$ -c -msve512 -DNSIMD_SVE512 -DENABLE_SVE=1 @in -o @out\n\nbuild_file sleef_sve512_f64$o autodeps $root$/src/sleefsimddp.c\n\tcc $sleef_cflags$ -c -msve512 -DNSIMD_SVE512 -DENABLE_SVE=1 @in -o @out\n\nbuild_file api_sve1024$o autodeps $root$/src/api_sve1024.cpp\n\tc++ $cflags$ -c $cflags_for_sve1024 @in -o @out\n\nbuild_file sleef_sve1024_f32$o autodeps $root$/src/sleefsimdsp.c\n\tcc $sleef_cflags$ -c -msve1024 -DNSIMD_SVE1024 -DENABLE_SVE=1 \\\n\t   @in -o @out\n\nbuild_file sleef_sve1024_f64$o autodeps $root$/src/sleefsimddp.c\n\tcc $sleef_cflags$ -c -msve1024 -DNSIMD_SVE1024 -DENABLE_SVE=1 \\\n\t   @in -o @out\n\nbuild_file api_sve2048$o autodeps $root$/src/api_sve2048.cpp\n\tc++ $cflags$ -c $cflags_for_sve2048 @in -o @out\n\nbuild_file sleef_sve2048_f32$o autodeps $root$/src/sleefsimdsp.c\n\tcc $sleef_cflags$ -c -msve2048 -DNSIMD_SVE2048 -DENABLE_SVE=1 \\\n\t   @in -o @out\n\nbuild_file sleef_sve2048_f64$o autodeps $root$/src/sleefsimddp.c\n\tcc $sleef_cflags$ -c -msve2048 -DNSIMD_SVE2048 -DENABLE_SVE=1 \\\n\t   @in -o @out\n\n## ----------------------------------------------------------------------------\n## POWERPC rules for 
building\n\nbuild_file api_vmx$o autodeps $root$/src/api_vmx.cpp\n\tc++ $cflags$ -c $cflags_for_vmx @in -o @out\n\nbuild_file sleef_vmx_f32$o autodeps $root$/src/sleefsimdsp_emulation.c\n\tcc $sleef_cflags$ -c -mvmx -DNSIMD_VMX -DENABLE_VSX=1 \\\n\t   -I$root$/include @in -o @out\n\nbuild_file sleef_vmx_f64$o autodeps $root$/src/sleefsimddp_emulation.c\n\tcc $sleef_cflags$ -c -mvmx -DNSIMD_VMX -DENABLE_VSX=1 \\\n\t   -I$root$/include @in -o @out\n\nbuild_file api_vsx$o autodeps $root$/src/api_vsx.cpp\n\tc++ $cflags$ -c $cflags_for_vsx @in -o @out\n\nbuild_file sleef_vsx_f32$o autodeps $root$/src/sleefsimdsp.c\n\tcc $sleef_cflags$ -c -mvsx -DNSIMD_VSX -DENABLE_VSX=1 @in -o @out\n\nbuild_file sleef_vsx_f64$o autodeps $root$/src/sleefsimddp.c\n\tcc $sleef_cflags$ -c -mvsx -DNSIMD_VSX -DENABLE_VSX=1 @in -o @out\n\n## ----------------------------------------------------------------------------\n## Installation and packaging\n\ninstall_file libnsimd_${simd}$so lib\n[W] install_file libnsimd_${simd}$lib lib\ninstall_dir $root$/include/nsimd include\ninstall_dir $root$/doc/html doc\n\n## ----------------------------------------------------------------------------\n## Tests\n\n# Lambda arguments: suite, compiler, std, simd_ext\n# By default all tests will be considered\nlambda tests_*_*_* = ok\n\n# Now disable some possibilities on certain compilers\nset    tests_clang_c89_vmx = \"\"\nset    tests_clang_c89_vsx = \"\"\nset    tests_clang_c89_sve = \"\"\nlambda tests_*_c89_cuda    = \"\"\nlambda tests_*_c99_cuda    = \"\"\nlambda tests_*_c11_cuda    = \"\"\nlambda tests_*_cpp17_cuda  = \"\"\nlambda tests_*_c89_rocm    = \"\"\nlambda tests_*_c99_rocm    = \"\"\nlambda tests_*_c11_rocm    = \"\"\nlambda tests_*_cpp98_rocm  = \"\"\nlambda tests_*_cpp17_rocm  = \"\"\nlambda tests_*_c89_oneapi  = \"\"\nlambda tests_*_c99_oneapi  = \"\"\nlambda tests_*_c11_oneapi  = \"\"\nlambda tests_dpcpp_cpp98_* = \"\"\nlambda tests_dpcpp_cpp11_* = \"\"\n\nset c89_enabled   = 
${tests_$ccomp$_c89_$simd$}\nset c89.files     = \"\"\nset c99_enabled   = ${tests_$ccomp$_c99_$simd$}\nset c99.files     = \"\"\nset c11_enabled   = ${tests_$ccomp$_c11_$simd$}\nset c11.files     = \"\"\nset cpp98_enabled = ${tests_$cppcomp$_cpp98_$simd$}\nset cpp98.files   = \"\"\nset cpp11_enabled = ${tests_$cppcomp$_cpp11_$simd$}\nset cpp11.files   = \"\"\nset cpp17_enabled = ${tests_$cppcomp$_cpp17_$simd$}\nset cpp17.files   = \"\"\nset cpp20.files   = \"\"\n\nset tests_flags = $cuda_arch_flags $flags ${cflags_for_$simd$} -lm $ldflags\necho Test compilation flags: $tests_flags$\n\n[$c89_enabled$] build_files c89 foreach glob:$root$/tests/*.prec11.c \\\n                as tests.%r.c89$exe \\\n                autodeps @item libnsimd_$simd$$so$\n\t[$c89_enabled$] cc -std=c89 @item $tests_flags -o @out\n\n[$c89_enabled$] phony tests.c89 deps $c89.files\n\n\n[$c99_enabled$] build_files c99 foreach glob:$root$/tests/*.prec11.c \\\n                as tests.%r.c99$exe \\\n                autodeps @item libnsimd_$simd$$so$\n\t[$c99_enabled$] cc -std=c99 @item $tests_flags -o @out\n\n[$c99_enabled$] phony tests.c99 deps $c99.files\n\n\n[$c11_enabled$] build_files c11 foreach glob:$root$/tests/*.c \\\n                as tests.%r.c11$exe \\\n                autodeps @item libnsimd_$simd$$so$\n\t[$c11_enabled$] cc -std=c11 @item $tests_flags -o @out\n\n[$c11_enabled$] phony tests.c11 deps $c11.files\n\n\n[$cpp98_enabled$] build_files cpp98 foreach glob:$root$/tests/*.cpp \\\n                  as tests.%r.cpp98$exe \\\n                  autodeps @item libnsimd_$simd$$so$\n\t[$cpp98_enabled$] c++ -std=c++98 @item $tests_flags -o @out\n\n[$cpp98_enabled$] phony tests.cpp98 deps $cpp98.files\n\n\n[$cpp11_enabled$] build_files cpp11 foreach glob:$root$/tests/*.cpp \\\n                  as tests.%r.cpp11$exe \\\n                  autodeps @item libnsimd_$simd$$so$\n\t[$cpp11_enabled$] c++ -std=c++11 @item $tests_flags -o @out\n\n[$cpp11_enabled$] phony tests.cpp11 deps 
$cpp11.files\n\n\n[$cpp17_enabled$] build_files cpp17 foreach glob:$root$/tests/*.cpp \\\n                  as tests.%r.cpp17$exe \\\n                  autodeps @item libnsimd_$simd$$so$\n\t[$cpp17_enabled$] c++ -std=c++17 @item $tests_flags -o @out\n\n[$cpp17_enabled$] phony tests.cpp17 deps $cpp17.files\n\n\n[$cpp20_tests$] build_files cpp20 foreach glob:$root$/tests/*.cpp \\\n                  as tests.%r.cpp20$exe \\\n                  autodeps @item libnsimd_$simd$$so$\n\t[$cpp20_tests$] c++ -std=c++20 @item $tests_flags -o @out\n\n[$cpp20_tests$] phony tests.cpp20 deps $cpp20.files\n\n\n# Phony target for tests\nphony tests deps $c89.files $c99.files $c11.files $cpp98.files $cpp11.files \\\n                 $cpp17.files $cpp20.files\n\n## ----------------------------------------------------------------------------\n## Examples\n\nbuild_files examples_cpp98 foreach glob:$root$/examples/*.cpp \\\n                           as examples.%r.cpp98$exe \\\n                           autodeps @item libnsimd_$simd$$so$\n\tc++ -std=c++98 @item $tests_flags -o @out\n\nphony examples.cpp98 deps $examples_cpp98.files\n"
  },
  {
    "path": "doc/Makefile.nix",
    "content": "# Copyright (c) 2020 Agenium Scale\n# \n# Permission is hereby granted, free of charge, to any person obtaining a copy\n# of this software and associated documentation files (the \"Software\"), to deal\n# in the Software without restriction, including without limitation the rights\n# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell\n# copies of the Software, and to permit persons to whom the Software is\n# furnished to do so, subject to the following conditions:\n# \n# The above copyright notice and this permission notice shall be included in all\n# copies or substantial portions of the Software.\n# \n# THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\n# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\n# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\n# SOFTWARE.\n\nNS2_ROOT  = ../nstools/ns2\nCXX       = c++\nCXX_FLAGS = -O2 -Wall -Wextra -pedantic -std=c++11\n\nall: md2html what_is_wrapped\n\nlibns2.a: $(NS2_ROOT)/../.git/logs/HEAD Makefile.nix\n\trm -rf libns2\n\tmkdir -p libns2\n\tcp $(NS2_ROOT)/lib/*.cpp libns2\n\t(cd libns2 && $(CXX) $(CXX_FLAGS) -I../$(NS2_ROOT)/include -c *.cpp)\n\tar rcs $@ libns2/*.o\n\trm -rf libns2\n\nmd2html: libns2.a md2html.cpp Makefile.nix\n\t$(CXX) $(CXX_FLAGS) md2html.cpp -I$(NS2_ROOT)/include -o $@ -L. -lns2\n\nwhat_is_wrapped: libns2.a what_is_wrapped.cpp Makefile.nix\n\t$(CXX) $(CXX_FLAGS) what_is_wrapped.cpp -I$(NS2_ROOT)/include -o $@ \\\n\t       -L. -lns2\n"
  },
  {
    "path": "doc/Makefile.win",
    "content": "# Copyright (c) 2020 Agenium Scale\n# \n# Permission is hereby granted, free of charge, to any person obtaining a copy\n# of this software and associated documentation files (the \"Software\"), to deal\n# in the Software without restriction, including without limitation the rights\n# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell\n# copies of the Software, and to permit persons to whom the Software is\n# furnished to do so, subject to the following conditions:\n# \n# The above copyright notice and this permission notice shall be included in all\n# copies or substantial portions of the Software.\n# \n# THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\n# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\n# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\n# SOFTWARE.\n\nNS2_ROOT  = ..\\nstools\\ns2\nCXX       = cl\nCXX_FLAGS = /nologo /Ox /W3 /EHsc /DNS_NO_DLLSPEC /D_CRT_SECURE_NO_WARNINGS\n\nall: md2html.exe what_is_wrapped.exe\n\nlibns2.lib: $(NS2_ROOT)\\..\\.git\\logs\\HEAD Makefile.win\n\tif exist libns2 rd /Q /S libns2\n\tmd libns2\n\tcopy /Y $(NS2_ROOT)\\lib\\*.cpp libns2\n\t(cd libns2 && $(CXX) $(CXX_FLAGS) -I..\\$(NS2_ROOT)\\include /c *.cpp)\n\tlib /nologo /out:libns2.lib libns2\\*.obj\n\trd /Q /S libns2\n\nmd2html.exe: libns2.lib md2html.cpp Makefile.win\n\t$(CXX) $(CXX_FLAGS) /I$(NS2_ROOT)\\include md2html.cpp libns2.lib \\\n\t       Shlwapi.lib Dbghelp.lib /Fe$@\n\nwhat_is_wrapped.exe: libns2.lib what_is_wrapped.cpp Makefile.win\n\t$(CXX) $(CXX_FLAGS) /I$(NS2_ROOT)\\include what_is_wrapped.cpp \\\n\t       libns2.lib Shlwapi.lib Dbghelp.lib /Fe$@\n"
  },
  {
    "path": "doc/markdown/compilers_and_versions.md",
    "content": "<!--\n\nCopyright (c) 2019 Agenium Scale\n\nPermission is hereby granted, free of charge, to any person obtaining a copy\nof this software and associated documentation files (the \"Software\"), to deal\nin the Software without restriction, including without limitation the rights\nto use, copy, modify, merge, publish, distribute, sublicense, and/or sell\ncopies of the Software, and to permit persons to whom the Software is\nfurnished to do so, subject to the following conditions:\n\nThe above copyright notice and this permission notice shall be included in all\ncopies or substantial portions of the Software.\n\nTHE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\nIMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\nFITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\nAUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\nLIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\nOUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\nSOFTWARE.\n\n-->\n\n`nsimd` is tested with GCC, Clang and MSVC. As a C89 and a C++98 API are\nprovided, other compilers should work fine. Old compiler versions should work as\nlong as they support the targeted SIMD extension. 
For instance, `nsimd` can\ncompile on MSVC 2010 `SSE4.2` code.\n\n`nsimd` requires a C or a C++ compiler and is actually daily tested on the\nfollowing compilers for the following hardware:\n\n**Compiler**            | **Version** | **Architecture** | **Extensions**\n----------------------- | ----------- | ---------------- | --------------\nGCC                     | 8.3.0       | Intel            | `SSE2`, `SSE4.2`, `AVX`, `AVX2`, `AVX-512` (`KNL` and `SKYLAKE`)\nClang                   | 7.0.1       | Intel            | `SSE2`, `SSE4.2`, `AVX`, `AVX2`, `AVX-512` (`KNL` and `SKYLAKE`)\nGCC                     | 8.3.0       | ARM              | `Aarch64`, `NEON` (`ARMv7`), `SVE`\nClang                   | 7.0.1       | ARM              | `Aarch64`, `NEON` (`ARMv7`), `SVE`\nMicrosoft Visual Studio | 2017        | Intel            | `SSE4.2`\nIntel C++ Compiler      | 19.0.4.243  | Intel            | `SSE2`, `SSE4.2`, `AVX`, `AVX2`, `AVX-512` (`SKYLAKE`)\n\n<!-- TODO  -->\n<!--We recommend using a 64-bits compiler as this results in significantly better\nperformance. Also, `nsimd` performances are only provided when compiled in an\noptimized code with assertions disabled.-->\n"
  },
  {
    "path": "doc/markdown/concepts.md",
    "content": "# C++20 concepts\n\nAs of C++20, concepts are available. We quote <en.cppreference.com> to\nintroduce concepts.\n\n*Class templates, function templates, and non-template functions (typically\nmembers of class templates) may be associated with a constraint, which\nspecifies the requirements on template arguments, which can be used to select\nthe most appropriate function overloads and template specializations.*\n\n*Named sets of such requirements are called concepts. Each concept is a\npredicate, evaluated at compile time, and becomes a part of the interface of a\ntemplate where it is used as a constraint*\n\n## Concepts provided by NSIMD\n\nAll concepts provided by NSIMD come in two forms:\n- The native C++20 form in the `nsimd` namespace\n- As a macro for keeping compatibility with older versions of C++\n\nThe following tables list all concepts and are exhaustive. Native concepts are\naccessible through the `nsimd` namespace. They take only one argument. Their\nmacro counterparts take no argument as they are meant to be used as\nconstraint placeholder types. 
When compiling for older C++ versions NSIMD\nconcepts macros are simply read as `typename` by the compiler.\n\nTable for base C and C++ APIs:\n\n| Native concept              | Macro                              | Description                                    |\n|:----------------------------|:-----------------------------------|:-----------------------------------------------|\n| `simd_ext_c`                | `NSIMD_CONCEPT_SIMD_EXT`           | Valid SIMD extension                           |\n| `simd_value_type_c`         | `NSIMD_CONCEPT_VALUE_TYPE`         | Valid NSIMD underlying value type              |\n| `simd_value_type_or_bool_c` | `NSIMD_CONCEPT_VALUE_TYPE_OR_BOOL` | Valid NSIMD underlying value type or `bool`    |\n| `alignment_c`               | `NSIMD_CONCEPT_ALIGNMENT`          | Valid NSIMD alignment `aligned` or `unaligned` |\n\nTable for advanced C++ API:\n\n| Native concept | Macro                    | Description                                    |\n|:---------------|:-------------------------|:----------------------|\n| `is_pack_c`    | `NSIMD_CONCEPT_PACK`     | Valid NSIMD pack      |\n| `is_packl_c`   | `NSIMD_CONCEPT_PACKL`    | Valid NSIMD packl     |\n| `is_packx1_c`  | `NSIMD_CONCEPT_PACKX1`   | Valid NSIMD packx1    |\n| `is_packx2_c`  | `NSIMD_CONCEPT_PACKX2`   | Valid NSIMD packx2    |\n| `is_packx3_c`  | `NSIMD_CONCEPT_PACKX3`   | Valid NSIMD packx3    |\n| `is_packx4_c`  | `NSIMD_CONCEPT_PACKX4`   | Valid NSIMD packx4    |\n| `any_pack_c`   | `NSIMD_CONCEPT_ANY_PACK` | Any of the above pack |\n\n## Expressing C++20 constraints\n\nExpressing constraints can of course be done with the `requires` keyword. 
But\nfor compatibility with older C++ versions NSIMD provides `NSIMD_REQUIRES`\nwhich takes the constraints as its only argument.\n\n```c++\ntemplate <typename T, typename S>\nNSIMD_REQUIRES(sizeof(T) == sizeof(S))\nvoid foo(T, S);\n```\n\nIt is advised to use doubled parentheses as a comma in the constraints\nexpression can be interpreted as an argument separator for the macro itself.\n\n```c++\ntemplate <typename T, typename S>\nNSIMD_REQUIRES((std::is_same<T, S>))\nvoid foo(T, S);\n```\n\nNote that when expressing constraints using `nsimd::sizeof_v`'s prefer the\nNSIMD definition of sizeof for the following reason: when dealing with\nfloat16's one cannot know the underlying representation of such a type as it\nis non-portable and non-standard, but NSIMD provides helper functions to\ntransparently deal with float16's as if they were 16-bits wide. Therefore\nexpressing sizeof equality should be done with `nsimd::sizeof_v`.\n\n```c++\ntemplate <typename T, typename S>\nNSIMD_REQUIRES((nsimd::sizeof_v<T> == nsimd::sizeof_v<S>))\nvoid foo(T, S);\n```\n"
  },
  {
    "path": "doc/markdown/defines.md",
    "content": "# Defines provided by NSIMD\n\nNSIMD uses macros (not function macros) that we call defines to make choices\nin its code at copmile time. Most of them can be of use to the end-user so\nwe list them here.\n\n## Compiler detection\n\nThe compiler detection is automatically done by NSIMD as it is relatively\neasy.\n\n| Define              | Compiler                                          |\n|---------------------|---------------------------------------------------|\n| `NSIMD_IS_MSVC`     | Microsoft Visual C++                              |\n| `NSIMD_IS_HIPCC`    | ROCm HIP compiler (warning, see below)            |\n| `NSIMD_IS_NVCC`     | NVIDIA CUDA Compiler                              |\n| `NSIMD_IS_ICC`      | Intel C++ Compiler                                |\n| `NSIMD_IS_CLANG`    | Clang/LLVM                                        |\n| `NSIMD_IS_GCC`      | GNU Compiler Collection                           |\n| `NSIMD_IS_FCC`      | Fujitsu compiler                                  |\n\n**Warning**: some HIP versions do not declare themselves at all so it\nimpossible to find out that HIP is the compiler. As HIP is based on clang,\nwithout help NSIMD will detect Clang. It is up to the end-user to compile\nwith `-D__HIPCC__` for NSIMD to detect HIP.\n\nNote that we do support the Armclang C and C++ compilers but for NSIMD there\nis no need to have code different from Clang's specific code so we do no\nprovide a macro to detect this compiler in particular.\n\nNote also that two of the above macros can be defined at the same time. This\nhappens typically when compiling for a device. 
For example when compiling for\nNVIDIA CUDA with nvcc both `NSIMD_IS_NVCC` and `NSIMD_IS_GCC` (when the host\ncompiler is GCC).\n\n## Compilation environment and contants\n\n| Define            | Description           | Possible values                 |\n|-------------------|-----------------------|---------------------------------|\n| `NSIMD_C`         | C version             | 1989, 1999, 2011                |\n| `NSIMD_CXX`       | C++ version           | 1998, 2011, 2014, 2017, 2020    |\n| `NSIMD_WORD_SIZE` | Machine word size     | 32, 64                          |\n| `NSIMD_U8_MIN`    | Minimum value for u8  | 0                               |\n| `NSIMD_U8_MAX`    | Maximum value for u8  | 255                             |\n| `NSIMD_I8_MIN`    | Minimum value for i8  | -128                            |\n| `NSIMD_I8_MAX`    | Maximum value for i8  | 127                             |\n| `NSIMD_U16_MIN`   | Minimum value for u16 | 0                               |\n| `NSIMD_U16_MAX`   | Maximum value for u16 | 65535                           |\n| `NSIMD_I16_MIN`   | Minimum value for i16 | -32768                          |\n| `NSIMD_I16_MAX`   | Maximum value for i16 | 32767                           |\n| `NSIMD_U32_MIN`   | Minimum value for u32 | 0                               |\n| `NSIMD_U32_MAX`   | Maximum value for u32 | 4294967295                      |\n| `NSIMD_I32_MIN`   | Minimum value for i32 | -2147483648                     |\n| `NSIMD_I32_MAX`   | Maximum value for i32 | 2147483647                      |\n| `NSIMD_U64_MIN`   | Minimum value for u64 | 0                               |\n| `NSIMD_U64_MAX`   | Maximum value for u64 | 18446744073709551615            |\n| `NSIMD_I64_MIN`   | Minimum value for i64 | -9223372036854775808            |\n| `NSIMD_I64_MAX`   | Maximum value for i64 | 9223372036854775807             |\n| `NSIMD_DLLSPEC`   | (Windows) DLL storage-class information | `__declspec(dllexport)` or `__declspec(dllimport)` |\n| 
`NSIMD_DLLSPEC`   | (Unix) storage-class information        | `extern` or nothing |\n| `NSIMD_C_LINKAGE_FOR_F16` | Indicate whether functions involving f16 have C linkage | defined or not |\n\n## Targeted architecture detection\n\nContrary to the compiler detection, the targeted architecture is not done\nautoamtically by NSIMD as is really hard and some compilers do not provide\nthe necessary informations. So in order to have a consistent way of targeting\nan architecture this is up to the end-user to specify it using one of the\nfollowing defines.\n\n| Define                 | Targeted architecture                             |\n|------------------------|---------------------------------------------------|\n| `NSIMD_CPU`            | Generic, no SIMD, emulation                       |\n| `NSIMD_SSE2`           | Intel SSE2                                        |\n| `NSIMD_SSE42`          | Intel SSE4.2                                      |\n| `NSIMD_AVX`            | Intel AVX                                         |\n| `NSIMD_AVX2`           | Intel AVX2                                        |\n| `NSIMD_AVX512_KNL`     | Intel AVX-512 as found on KNLs                    |\n| `NSIMD_AVX512_SKYLAKE` | Intel AVX-512 as found on Xeon Skylake            |\n| `NSIMD_NEON128`        | Arm NEON 128 bits as found on 32-bits Arm chips   |\n| `NSIMD_AARCH64`        | Arm NEON 128 bits as found on 64-bits Arm chips   |\n| `NSIMD_SVE`            | Arm SVE (length agnostic)                         |\n| `NSIMD_SVE128`         | Arm SVE (size known at compilation to 128 bits)   |\n| `NSIMD_SVE256`         | Arm SVE (size known at compilation to 256 bits)   |\n| `NSIMD_SVE512`         | Arm SVE (size known at compilation to 512 bits)   |\n| `NSIMD_SVE1024`        | Arm SVE (size known at compilation to 1024 bits)  |\n| `NSIMD_SVE2048`        | Arm SVE (size known at compilation to 2048 bits)  |\n| `NSIMD_CUDA`           | Nvidia CUDA                                       |\n| 
`NSIMD_ROCM`           | AMD ROCm architectures                            |\n| `NSIMD_VMX`            | IBM POWERPC VMX (Altivec)                         |\n| `NSIMD_VSX`            | IBM POWERPC VSX (Altivec)                         |\n| `NSIMD_FP16`           | Architecture supports natively IEEE float16       |\n| `NSIMD_FMA`            | Architecture supports natively FMAs               |\n\n## Targeted architecture constants\n\n| Define                | Description                                        |\n|-----------------------|----------------------------------------------------|\n| `NSIMD_NB_REGISTERS`  | Number of SIMD registers                           |\n| `NSIMD_MAX_LEN_BIT`   | Maximum number of bits in a SIMD register          |\n| `NSIMD_MAX_LEN_i8`    | Maximum number of i8's in a SIMD register          |\n| `NSIMD_MAX_LEN_u8`    | Maximum number of u8's in a SIMD register          |\n| `NSIMD_MAX_LEN_i16`   | Maximum number of i16's in a SIMD register         |\n| `NSIMD_MAX_LEN_u16`   | Maximum number of u16's in a SIMD register         |\n| `NSIMD_MAX_LEN_i32`   | Maximum number of i32's in a SIMD register         |\n| `NSIMD_MAX_LEN_u32`   | Maximum number of u32's in a SIMD register         |\n| `NSIMD_MAX_LEN_i64`   | Maximum number of i64's in a SIMD register         |\n| `NSIMD_MAX_LEN_u64`   | Maximum number of u64's in a SIMD register         |\n\nNSIMD provides a mean to write generic code by using the `NSIMD_MAX_LEN` macros\nwhose argument is one of { i8, u8, i16, u16, i32, u32, i64, u64 }.\n\n```c++\n#define T ??? 
// to be defined as a base type\n\nint main(void) {\n  T buf[NSIMD_MAX_LEN(T)]; // an array of T's for loading/storing\n  ...\n  return 0;\n}\n```\n\n## Other useful macros\n\nNSIMD provides macros to concatenate blobs so that generic programming in pure\nC is possible.\n\n- `#define NSIMD_PP_CAT_2(a, b)` concatenates `a` and `b`.\n- `#define NSIMD_PP_CAT_3(a, b, c)` concatenates `a`, `b` and `c`.\n- `#define NSIMD_PP_CAT_4(a, b, c, d)` concatenates `a`, `b`, `c` and `d`.\n- `#define NSIMD_PP_CAT_5(a, b, c, d, e)` concatenates `a`, `b`, `c`, `d` and\n  `e`.\n- `#define NSIMD_PP_CAT_6(a, b, c, d, e, f)` concatenates `a`, `b`, `c`, `d`,\n  `e` and `f`.\n"
  },
  {
    "path": "doc/markdown/faq.md",
    "content": "<!--\n\nCopyright (c) 2020 Agenium Scale\n\nPermission is hereby granted, free of charge, to any person obtaining a copy\nof this software and associated documentation files (the \"Software\"), to deal\nin the Software without restriction, including without limitation the rights\nto use, copy, modify, merge, publish, distribute, sublicense, and/or sell\ncopies of the Software, and to permit persons to whom the Software is\nfurnished to do so, subject to the following conditions:\n\nThe above copyright notice and this permission notice shall be included in all\ncopies or substantial portions of the Software.\n\nTHE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\nIMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\nFITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\nAUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\nLIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\nOUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\nSOFTWARE.\n\n-->\n\n# Frequently Asked Questions\n\n## Is it good practice to use a `nsimd::pack` as a `std::vector`?\n\nNo, these are two very different objects. A `nsimd::pack` represent a SIMD\nregister whereas a `std::vector` represents a chunk of memory. You should\nseparate concerns and use `std::vector` to store data in your structs or\nclasses, `nsimd::pack` should only be used in computation kernels and nowhere\nelse especially not in structs or classes.\n\n## Why is the speed-up of my code not as expected?\n\nThere are several reasons which can reduce the speed-up:\n\n- Have you enabled compiler optimizations? You must enable all compiler\n  optimizations (like `-O3`).\n\n- Have you compiled in 64 bit mode? There is significant performance increase\n  on architectures supporting 64 bit binaries.\n\n- Is your code trivially vectorizable? 
Modern compilers can vectorize trivial\n  code segments automatically. If you benchmark a trivial scalar code versus a\n  vectorized code, the compiler may vectorize the scalar code, thereby giving\n  similar performance to the vectorized version.\n\n- Some architectures do not provides certains functionnalities. For example\n  AVX2 chips do not provide a way to convert long to double. So using\n  `nsimd::cvt<f64>` will produce an emulation for-loop in the resulting\n  binary. To know which intrinsics are used by NSIMD you can consult\n  <wrapped_intrinsics.md>.\n\n## Why did my code segfaulted or crashed?\n\nThe most common cause of segfaults in SIMD codes is accessing non-aligned\nmemory. For best performance, all memory should be aligned. NSIMD includes an\naligned memory allocation function and an aligned memory allocator to help you\nwith this. Please refer to <tutorials.md> for details on how to\nensure that you memory is correctly aligned.\n\nAnother common cause is to read or write data beyond the allocated memory.\nDo not forget that loading data into a SIMD vector will result in loading\n16 bytes (or 4 floats) from memory. If this read occurs at the last 2 elements\nof allocated memory then a segfault will be generated.\n\n## My code compiled for AVX is not twice as fast as for SSE, why?\n\nNot all SSE instructions have an equivalent AVX instruction. As a consequence\nNSIMD uses two SSE operations to emulate the equivalent AVX operation.  Also,\nthe cycles required for certain instructions are not equal on both\narchitectures, for example, `sqrt` on `SSE` requires 13-14 cycles whereas\n`sqrt` on `AVX` requires 21-28 cycles. Please refer\n[here](https://www.agner.org/optimize/instruction_tables.pdf) for more\ninformation.\n\nVery few integer operations are supported on AVX, AVX2 is required for most\ninteger operations. 
If a NSIMD function is called on an integer AVX register,\nthis register will be split into two SSE registers and the equivalent\ninstruction called on both register. In the case, no speed-up will be observed\ncompared with SSE code. This is true also on POWER 7, where double is not\nsupported.\n\n## I disassembled my code, and the generated code is less than optimal, why?\n\n- Have you compiled in release mode, with full optimizations options?\n\n- Have you used a 64 bit compiler?\n\n- There are many SIMD related bugs across all compilers, and some compilers\n  generate less than optimal code in some cases. Is it possible to update your\n  compiler to a more modern compiler?\n\n- We provide workarounds for several compiler bugs, however, we may have\n  missed some. You may also have found a bug in `nsimd`. Please report this\n  through issues on our github with a minimal code example. We responds quickly\n  to bug reports and do our best to patch them as quickly as possible.\n\n## How can I use a certain intrinsic?\n\nIf you require a certain intrinsic, you may search inside of NSIMD for it and\nthen call the relevant function or look at <wrapped_intrinsics.md>.\n\nIn rare cases, the intrinsic may not be included in NSIMD as we map the\nintrinsic wherever it makes sense semantically. If a certain intrinsic does not\nfit inside of this model, if may be excluded. In this case, you may call it\nyourself, however, note this will not be portable. \n\nTo use a particular intrinsic say `_mm_avg_epu8`, you can write the following.\n\n```c++\nnsimd::pack<u8> a, b, result;\nresult = nsimd::pack<u8>(_mm_avg_epu8(a.native_register(),\n                                      b.native_register()));\n```\n\n## How do I convert integers/floats to/from logicals?\n\nUse [`nsimd::to_mask`](api_to-mask.md) and\n[`nsimd::to_logical`](api_to-logical.md).\n\n## How about shuffles?\n\nGeneral shuffles are not provided by NSIMD. 
You can see\n[issue 8 on github](https://github.com/agenium-scale/nsimd/issues/8). For now\nwe provide only some length agnostic shuffles such as zip and unzip, see\n[the shuffle API](api.md) at the Shuffle section.\n\n## Are there C++ STL like algorithms?\n\nNo. You are welcome to [contribute](contribute.md) to NSIMD and add them as\nan NSIMD module. You should use\n[expressions templates](module_tet1d_overview.md) instead. Strictly conformant\nSTL algorithms do not provide means to control for example the unroll factor\nor the number of threads per block when compiling for GPUs.\n\n## Are there masked operators in NSIMD?\n\nYes, we provide masked loads and stores, see [the api](api.md) at the\n\"Loads & stores\" section. We also provide the\n[`nsimd::mask_for_loop_tail`](api_mask-for-loop-tail.md) which computes the\nmask for ending loops. But note that using these is not recommended as on\nmost architectures there are no intrinsics. This will result in slow code. It\nis recommended to finish loops using a scalar implementation.\n\n## Are there gathers and scatter in NSIMD?\n\nYes, we provide gathers and scatters, see [the api](api.md) at the\n\"Loads & stores\" section. Note also that most architectures do not provide\nsuch intrinsics and so this could result in slow code.\n\n## Why does not NSIMD recognize the target architecture automatically?\n\nAutodetecting the SIMD extension is compiler/compiler version/cpu/system\ndependent which means a lot of code for a (most likely buggy) feature which can\nbe an inconvenience sometimes. Plus some compilers do not permit this feature.\nFor example cf.\n<https://www.boost.org/doc/libs/1_71_0/doc/html/predef/reference.html> and\n<https://msdn.microsoft.com/en-us/library/b0084kay.aspx>. 
Thus a \"manual\"\nsystem is always necessary.\n\n## Why some operators have their names ending with an \"1\"?\n\nThis is because of C++ and our will not to use C++-useless-complicated stuff.\nTaking the example with `if_else`, suppose that we have called it \"if\\_else\"\nwithout the \"1\". When working with packs, one wants to be able to use `if_else`\nin this manner:\n\n```c++\nint main() {\n  using namespace nsimd;\n  \n  typedef pack<int> pi;\n  typedef pack<float> pf;\n\n  int n;\n  int *a, *b;      // suppose both points to n ints\n  float *fa, *fb;  // suppose both points to n floats\n\n  for (int i = 0; i < n; i += len()) {\n    packl<int> cond = (loada<pi>(&a[i]) < loada<pi>(&b[i]));\n    storea(&fb[i], if_else(cond, load<pf>(&fb[i]), set1<pf>(0.0f)));\n  }\n\n  return 0;\n}\n```\n\nBut this causes a compiler error, the overload of `if_else` is ambiguous.\nSure one can use many C++-ish techniques to tackle this problem but we chose\nnot to as the goal is to make the life of the compiler as easy as possible.\nSo as we want to favor the C++ advanced API as it is the most human readable,\nusers of the C and C++ base APIs will have to use `if_else1`.\n"
  },
  {
    "path": "doc/markdown/fp16.md",
    "content": "# IEEE float16 related functions\n\nNSIMD natively supports IEEE float16's. This means that NSIMD provides types\nand functions to deal with them. When the targeted architecture supports them\nthen NSIMD will use appropriate intrinsics otherwise emulation with float32's\nwill be used.\n\n- When emulating, as float16's are not natively supported by either C or C++,\n  emulation is done with float32's.\n\n- Intel architectures do not support IEEE float16 arithmetic, they only\n  provide, as an extension, support for conversion to/from float32. When\n  compiling NSIMD for Intel architectures use `-DFP16` to activate the\n  conversion intrinsics if available on your machine. Note that AVX-512\n  has those natively.\n\n- Arm architectures can provide native float16 arithmetic. For 32-bits and\n  64-bits (ARMv7 and Aarch64) chips float16 support is optional. When\n  compiling with `-DFP16`, NSIMD will use float16-related intrinsics. Note\n  that for SVE chips float16's are mandatory hence NSIMD will use appropriate\n  intrinsics with or without `-DFP16`.\n\n- CUDA provides support for converting float16's to/from float32's. These\n  are always used by NSIMD. But it is only since devices of compute\n  capabilities 5.3 and above that float16 arithmetic is provided. NSIMD will\n  always use CUDA float16's functions so there is no need to compile with\n  `-DFP16`.\n\n- ROCm HIP supports float16's except for the first versions. For now NSIMD\n  assumes that it is always the case and uses the HIP float16 API. There is no\n  need for `-DFP16`.\n\n## Float16's related functions and types\n\nNSIMD provides the `f16` type which represents an IEEE float16. Note that\ndepending on the targeted architecture and the presence of `-DFP16` the float16\ntype can typedef many different types. Therefore the two following functions\nare provided and can be used to convert a float16 from/to a float32. These\nfunctions preserve NaN's and infinities. 
When converting from a float32 to\na float16 saturation to infinities is performed when the float32 cannot be\nrepresented as a float16.\n\n| Function signature                                | Availability |\n|---------------------------------------------------|--------------|\n| `f16 nsimd_f32_to_f16(f32 a);`                    | C and C++    |\n| `f32 nsimd_f16_to_f32(f16 a);`                    | C and C++    |\n| `f16 nsimd::f32_to_f16(f32 a);`                   | C++ only     |\n| `f32 nsimd::f16_to_f32(f16 a);`                   | C++ only     |\n\nFor loading/storing float16's NSIMD provides other conversion function to/from\n16-bits unsigned integers. The integers will hold the IEEE binary\nrepresentation of the float16's.\n\n| Function signature                                | Availability |\n|---------------------------------------------------|--------------|\n| `u16 nsimd_f32_to_u16(f32 a);`                    | C and C++    |\n| `f32 nsimd_u16_to_f32(u16 a);`                    | C and C++    |\n| `u16 nsimd::f32_to_u16(f32 a);`                   | C++ only     |\n| `f32 nsimd::u16_to_f32(u16 a);`                   | C++ only     |\n\nThe `nsimd_*` functions listed above do not use the same linkage type depending\non the targeted architecture. When compiling for GPUs the corresponding symbols\nnames are mangled. They use C++ ABI because the float16 type is defined as a\nC++ class and not as a C struct. We therefore inherit from the implementation\nof CUDA and HIP/ROCm. 
Linkage types are listed below.\n\n| Function signature                | CUDA/ROCm   | Other architectures |\n|-----------------------------------|-------------|---------------------|\n| `f16 nsimd_f32_to_f16(f32 a);`    | C++ linkage | C linkage           |\n| `f32 nsimd_f16_to_f32(f16 a);`    | C++ linkage | C linkage           |\n| `f16 nsimd::f32_to_f16(f32 a);`   | C++ linkage | C++ linkage         |\n| `f32 nsimd::f16_to_f32(f16 a);`   | C++ linkage | C++ linkage         |\n| `u16 nsimd_f32_to_u16(f32 a);`    | C++ linkage | C linkage           |\n| `f32 nsimd_u16_to_f32(u16 a);`    | C++ linkage | C linkage           |\n| `u16 nsimd::f32_to_u16(f32 a);`   | C++ linkage | C++ linkage         |\n| `f32 nsimd::u16_to_f32(u16 a);`   | C++ linkage | C++ linkage         |\n\nIt is possible to know at compile time in which situation we are. The\n`NSIMD_C_LINKAGE_FOR_F16` macro if defined means that C linkage is used for\n`nsimd_*` functions.\n"
  },
  {
    "path": "doc/markdown/how_tests_are_done.md",
    "content": "<!--\n\nCopyright (c) 2021 Agenium Scale\n\nPermission is hereby granted, free of charge, to any person obtaining a copy\nof this software and associated documentation files (the \"Software\"), to deal\nin the Software without restriction, including without limitation the rights\nto use, copy, modify, merge, publish, distribute, sublicense, and/or sell\ncopies of the Software, and to permit persons to whom the Software is\nfurnished to do so, subject to the following conditions:\n\nThe above copyright notice and this permission notice shall be included in all\ncopies or substantial portions of the Software.\n\nTHE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\nIMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\nFITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\nAUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\nLIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\nOUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\nSOFTWARE.\n\n-->\n\n# How tests are done?\n\nFirst and foremost note that this is a work in progress and that we are doing\nour best to have serious testing of the library.\n\nWe can also state our conclusion on testing: we are not and never will be\nsatisfied with our tests, there are not enough of them, we want more.\n\nThe current system has on average 15000 tests by SIMD extensions. Thanks to\nour \"Python\" approach we can automatically generate tests for all operators\nand for all types. This has greatly helped us in finding bugs. But, as you\nknow, bugs are always there.\n\n## Why write this?\n\nTesting the library has been taken seriously since its very beginning. Tests\nhave gone through several stages:\n\n- The first one was during the development of the first version of the library.\n  Tests of operators were done with random numbers as input. 
Those random\n  numbers were all powers of 2 to ease the comparisons of basic arithmetic\n  types. NaNs and infinities were not generated as inputs and operators\n  behaviors with those inputs were not tested.\n\n- For the second stage random number generators have been improved to emit\n  NaNs and infinities. It allowed us to detect many errors in operators,\n  mostly in math functions like cos, sin, exp... But we also discovered bugs\n  in hardware when NaNs and infinities are given to intrinsics.\n\n- The third stage, which is the current test system, takes into account the\n  experience we gained with the previous two. As we have abandoned the buggy and\n  slow implementations of math functions coming from Boost.SIMD and now rely on\n  the excellent Sleef (<https://sleef.org/>) we trust that the math functions\n  are correctly tested. In more detail we do not generate NaNs and infinities\n  anymore because we trust functions coming from Sleef and we do not want\n  to write code in our tests to bypass hardware bugs. We only care that our\n  wrappings are correct and that `nsimd::add` correctly calls add, the fact that\n  the add does not work correctly is a hardware bug then and not the\n  problem of the library.\n\nTests on floatting points are done using ULPs. ULP means units in the last\nplace and is commonly used for the comparison of floatting point numbers.\nIt is in general a bad idea to compare floats with the `==` operator as\nit essentially compares bits. Instead we want to check if the results of\ntwo computations are \"not too far away from each other\". When checking an\noperator, let's say, on CPUs and GPUs, we have to take into account that\n- the rounding mode may be different and\n- the precision of the calculation may be different.\n\n## ULPs\n\nThis chapter is dedicated to math proofs concerning ULPs. Indeed people use\nthis notion but proofs are hard to find. 
We give our own definition of distance\nin ULPs, compare it to the usual one and give pros and cons.\nWe assume the reader is familiar with basic mathematics.\n\nFor this entire chapter fix the following:\n- an integer $b > 1$ (will be our radix),\n- an integer $p > 1$ (will be the number of digits in the mantissa)\n- an integer $M > 1$ (will be the minimum exponent allowed for floatting\n  point numbers)\nA floatting point number is an element of $\\mathbb{R}$ of the form\n$m b^e$ with $e \\geq -M$ and $m \\in \\mathbb{Z}$. More precisely we define\nthe set of floatting point numbers $F$ to be the union of the following two\nsets:\n- $\\{ mb^e \\in F \\text{ with } e > -M \\}$ the *normal* numbers.\n- $\\{ mb^{-M} \\in F \\text{ with } m \\in \\mathbb{Z} \\text{ and }\n  0 < |m| < b^p \\}$ the *denormal* or *subnormal* numbers.\n\nThe set $F$ can be viewed as a subset of $\\mathbb{R}$ with the mapping\n$\\phi : (m, e) \\mapsto mb^e$ and we will make this abuse of\nnotation in what follows. Usually the sign of the floatting point number\nis separated from $m$ but we include it \"inside\" $m$ as it does not change\nthe proofs below and simplifies the notations.\n\nLet $a_i \\in F$ for $i = 1,2$ such that $a_i = m_i b^{e_i}$.\n\n**Proposition:** $\\phi$ is injective.\n\n**Proof:** Suppose that $a_1 = a_2$ or $m_1b^{e_1} = m_2b^{e_2}$. If $a_1$\nand $a_2$ are subnormal numbers then $e_1 = e_2 = -M$ and $m_1 = m_2$. 
If\n$a_1$ and $a_2$ are normal numbers suppose that $e_2 > e_1$, then\n$|\\frac{m_2b^{e_2}}{m_1b^{e_1}}| > b^{e_2 + p - 1 - e_1 - p}\n= b^{e_2 - e_1 - 1} \\geq b^{1 - 1} = 1$ therefore\n$m_2b^{e_2} \\neq m_1b^{e_1}$ which is absurd hence $e_1 = e_2$ and as a\nconsequence $m_1 = m_2$.\n\n**Definition:** We define the *distance in ULPs between $a_1$ and $a_2$*\ndenoted by $U(a_1, a_2)$ to be:\n- $|m_1b^{e_1 - e_2} - m_2|$ if $e_1 \\geq e_2$,\n- $|m_1 - m_2b^{e_2 - e_1}|$ otherwise.\n\n**Example:** Take $a_1 = 123456 \\times 10^5$ and $a_2 = 123789 \\times 10^5$\nThen as the exponents of $a_1$ and $a_2$ are the same we have\n$U(123456 \\times 10^5, 123789 \\times 10^5) = |123789 - 123456| = 333$.\n\nThe following proposition confort the name \"units in the last place\".\n\n**Proposition:** Let $f = \\lfloor \\log_b U(a_1, a_2) \\rfloor + 1$ and suppose\nthat $a_1, a_2$ are of same sign and have the same exponents, then either the\nfirst $p - f$ digits of $m_1$ and $m_2$ are identical or their difference is\n$\\pm 1$.\n\n**Proof:** For $i = 1,2$ there exists $q_i \\in \\mathbb{Z}$ and\n$0 \\leq r_i < b^f$ such that $m_i = q_i b^f + r_i$. Then\n$|q_1 - q_2| \\leq \\frac{|m_1 - m_2| + |r_1 - r_2|}{b^f}\n< \\frac{b^{\\log_b(U(a_1, a_2)} + b^f}{b^f} = 2$\n\nSo that either $q_1 = q_2$ or $q_1 - q_2 = \\pm 1$. It is interesting to know\nwhat are the cases when $q_1 - q_2 \\pm 1$. Suppose that $0 \\leq m_1 < m_2$\nand that $q_1 = q_2 + 1$ then $m_1 = q_1 b^f  + r_1 \\geq q_2 b^f + b^f >\nq_2 b^f + r_2 = m_2$ which contradicts the hypothesis hence $q_1 \\leq q_2$.\nFinally $r_1 + U(a_1, a_2) = r_1 + (m_2 - m_1) = q_2 b^f + r_2 - q_1 b^f\n= r_2 + b_f$ so that:\n- $r_1 + U(a_1, a_2) \\geq b^f$ and\n- $r_1 = r_2 + (b_f - U(a_1, a_2)) = r_2 + (b^f - b^{\\log_b(U(a_1, a_2))})\n  > r_2$.\n\n**Example:** Taking back $a_1 = 123456 \\times 10^5$ and\n$a_2 = 123789 \\times 10^5$. 
As $q_1 = q_2$ we have the first 3 digits of $a_1$\nand $a_2$ that are identical and they differ by their last\n$\lfloor \log_{10} U(a_1, a_2) \rfloor + 1\n= \lfloor \log_{10}(333) \rfloor + 1 = 3$ digits.\n\n**Example:** Now take $a_1 = 899900 \times 10^5$ and\n$a_2 = 900100 \times 10^5$. We have $f = 3$ but $q_2 = q_1 + 1$ and\n$r_2 = 900 > 100 = r_1$ and $r_2 + U(a_1, a_2) = 1100 \geq 1000 = 10^3$.\n\nThe propositions above show that our definition of the ULP distance is well\nchosen as we have the following results:\n- (second proposition) it measures the number of different digits at the end\n  of the mantissa.\n- (first proposition) if we write the numbers differently but still in base $b$\n  we only change the number of different digits in the last places by some\n  zeros. The latter number being the exponent of $b$ that represents the\n  difference in scaling of both representations of floatting point numbers.\n\nWe now show how to compute it using the IEEE 754 floatting point numbers\nrepresentation. A floatting point number $(m, e) \in F$ is stored in memory\n(and registers) as the integer $\pm ((e + M)b^p + |m|)$.\n\n**Proposition:** If $e_2 \geq e_1 + 2$ then $U(a_1, a_2) \geq b^p$.\n\n**Proof:** We have $U(a_1, a_2) = |m_2 b^{e_2 - e_1} - m_1|\n\geq ||m_2| b^{e_2 - e_1} - |m_1||$. 
But $m_2$ is a normal number otherwise we\nwould have $e_2 = -M = e_1$ so that $|m_2| \\geq b^{p - 1}$ and we have\n$|m_2| b^{e_2 - e_1} \\geq b^{p - 1 + e_2 - e_1} \\geq b^{p + 1} > |m_1|$,\ntherefore $||m_2| b^{e_2 - e_1} - |m_1|| \\geq |m_2|b^2 - |m_1|\n> b^{p - 1 + 2} - b^p = b^p$.\n\nThe proposition above basically states that if two floatting point numbers\nare two orders of magnitude away then that have no digits in common, and\nthat there are godd chances that comparing them is not interesting at all.\n\nThe usual definition of the distance in ULPs is roughly given as the number\nof floatting point numbers between the two considered floatting point numbers.\nMore precisely we will denote it by $V$ and it is defined as follows:\n- $V(a_1, a_2) = |(e_1 + M)b^p + |m_1| - (e_2 + M)b^p - |m_2||$ if $a_1$ and\n  $a_2$ have the same signs\n- $V(a_1, a_2) = (e_1 + M)b^p + |m_1| + (e_2 + M)b^p + |m_2|$ otherwise.\n\n**Proposition:** If $e_1 = e_2$ and $a_1$, $a_2$ have the same sign then\n$U(a_1, a_2) = V(a_1, a_2)$.\n\n**Proof:** We have $V(a_1, a_2) = |(e_1 + M)b^p + m_1 - (e_2 + M)b^p - m_2|$,\nbut as $e_1 = e_2$, we end up with $V(a_1, a_2) = |m_1 - m_2| = U(a_1, a_2)$.\n\n**Proposition:** $V(a_1, a_2) = 1$ is equivalent to $U(a_1, a_2) = 1$.\n\n**Proof:** The proposition is true if $e_1 = e_2$. Suppose that $e_2 > e_1$.\nNote that $a_2$ is a normal number so that $m_2 \\geq b^{p - 1}$.\n\nWe first suppose that $V(a_1, a_2) = 1$. Then by the definition of $V$, $a_1$\nand $a_2$ have same sign otherwise $V(a_1, a_2) \\geq 2$ and we suppose that\n$a_i \\geq 0$. Moreover we have $e_2 = e_1 + 1$ otherwise we would have that\n$a_1 = m_1b^{e_1} < m_1b^{e_1 + 1} < m_2b^{e_1 + 2} \\leq a_2$. 
Now we have\n$(b^p - 1)b^{e_1} < b^{p - 1}b^{e_1 + 1}$ and let\n$(b^p - 1)b^{e_1} \\leq mb^e \\leq b^{p - 1}b^{e_1 + 1}$.\n\nFirst note that if $a = mb^e$ is a normal number then $m \\geq b^{p - 1}$ and if\n$a$ is a subnormal number then $e = -M$ in which case we also have $e_1 = -M$\nand $m \\geq b^p - 1 \\geq b^{p - 1}$. In any case $m \\geq b^{p - 1}$.\n\nWe have $(b^p - 1)/m b^{e_1} < b^e < b^{p - 1}/m b^{e_1 + 1}$. But\n$1 \\leq (b^p - 1) / m$ and $b^{p - 1} / m \\leq 1$ so that\n$b^{e_1} \\leq b^e \\leq b^{e_1 + 1}$ and $e = e_1$ or $e = e_1 + 1$. In the\nfirst case $(b^p - 1)b^{e_1} \\leq mb^{e_1}$ so that $b^p - 1 \\leq m$ but\n$m < b^p$ and $m = b^p - 1$. In the second case\n$mb^{e_1 + 1} \\leq b^{p - 1}b^{e_1 + 1}$ so that $m \\leq b^{p - 1}$ but\n$b^{p - 1} \\leq m$ and $m = b^{p - 1}$. We have proven that two consecutive\nelements of $F$ with $e_2 = e_1 + 1$ are neessary of the form\n$a_1 = (b^p - 1)b^{e_1}$ and $a_2 = b^{p - 1}b^{e_1 + 1}$. Now we can compute\n$U(a_1, a_2) = |bb^{p - 1} - (b^p - 1)| = 1$.\n\nConversely, suppose that $U(a_1, a_2) = 1$, then\n$|b^{e_2 - e_1}m_2 - m_1| = 1$. Suppose that $b^{e_2 - e_1}m_2 - m_1 = -1$,\nthen $-1 \\geq bb^{p - 1} - b^p = 0$ which is absurd. We then have\n$b^{e_2 - e_1}m_2 - m_1 = 1$. Suppose that $e_2 \\geq e_1 + 2$ then we would\nhave that $b^{e_2 - e_1}m_2 - m_1 \\geq b^2b^{p - 1} - b^p \\geq b^p$ which is\nabsurd so that $e_2 = e_1 + 1$ and $bm_2 - m_1 = 1$. Suppose that\n$m_2 \\geq b^{p - 1} + 1$ then $bm_2 - m_1 \\geq b^p + b - (b^p - 1) \\geq 2$\nwhich is absurd so that $m_2 = b^{p - 1}$ and as a consequence $m_1 = b^p - 1$.\n\nIf $a_1, a_2 < 0$, then $V(a_1, a_2) = 1$ is equivalent by definition to\n$V(-a_1, -a_2) = 1$ which is equivalent to $U(-a_1, -a_2) = 1$ which is\nby definition equivalent to $U(a_1, a_2) = 1$.\n\n**Proposition:** Suppose that $e_1 \\leq e_2 \\leq e_1 + 1$ then\n$V \\leq U \\leq bV$.\n\n**Proof:** The proposition is true if $e_1 = e_2$. Suppose now that\n$e_2 = e_1 + 1$. 
Then we have\n$b^p + m_2 - m_1 \\geq b^p + b^{p - 1} - b^p \\geq 0$\nso that $V(a_1, a_2) = b^p + m_2 - m_1 = b^p + m_2(1 - b) + bm_2 - m_1$. But\n$b^p + m_2(1 - b) \\leq b^p + b^p(1 - b) \\leq 0$ and\n$bm_2 - m_1 \\geq bb^{p - 1} - b^p = 0$ so that $V(a_1, a_2) \\leq bm_2 - m_1\n= U(a_1, a_2)$. On the other hand we have $bm_2 - m_1\n\\leq b(b^p + m_2 - m_1 + m_1 - m_1/b - b^p)$ but\n$m_1 - m_1/b - b^p \\leq b^p - b^{p - 1}/b - b^p \\leq 0$ so that\n$U(a_1, a_2) \\leq b(b^p + m_2 - m_1) = bV(a_1, a_2)$.\n\n**Remark:** The previous propositions shows that the difference between $V$\nand $U$ is only visible when the arguments have differents exponents and\nare non consecutive. Our version of the distance in ULPs puts more weights\nwhen crossing powers of $b$. Also if $e_2 \\geq e_1 + 2$ then we have seen that\n$a_1$ and $a_2$ have nothing in common which is indicated by the fact that\n$U, V \\geq b^p$.\n\n**Definition:** We now define the relative distance $D(a_1, a_2)$ between\n$a_1$ and $a_2$ to be $|a_1 - a_2| / \\min(|a_1|, |a_2|)$.\n\n**Proposition:** As $U$ is defined in a \"mathematical\" way compared to $V$ then\nthe relation between $U$ and $D$ is straightforward and we have\n$D(a_1, a_2) = U(a_1, a_2) / |m_1|$. Moreover we have\n$b^{-q}U \\leq D \\leq b^{1 - q}U$ where $q$ is the greatest integer such that\n$b^{q - 1} \\leq |m_1| < b^q$. 
In particular if $a_1$ is a normal number then\n$p = q$.\n\n**Proof:** Suppose that $|a_1| < |a_2|$, then we have three cases:\n- If $a_2$ is denormal, then so is $a_1$ and $e_1 = -M = e_2$.\n- If $a_2$ is normal, then:\n  + If $a_1$ is denormal then $e_1 < e_2$.\n  + If $a_1$ and $a_2$ are normal numbers then $|m_1/m_2| b^{e_1 - e_2} < 1$\n    but $|m_1/m_2| \\geq b^{p - 1} / b^p = b^{-1}$ and we have\n    $b^{e_1 - e_2 - 1} < 1$ so that $e_1 < e_2 + 1$ or $e_1 \\leq e_2$.\nIn any case we have $e_1 \\leq e_2$, as a consequence we have\n$D(a_1, a_2) = |m_1b^{e_1} - m_2b^{e_2}| / \\min(|m_1|b^{e_1}, |m_2|b^{e_2})\n= |m_1 - m_2b^{e_2 - e_1}| / \\min(|m_1|, |m_2|b^{e_2 - e_1})$. Therefore\n$D(a_1, a_2) = U(a_1, a_2) / \\min(|m_1|, |m_2|b^{e_2 - e_1})$. Now if\n$e_1 = e_2$ then $\\min(|m_1|, |m_2|) = |m_1|$ but if $e_2 > e_1$ then $a_2$ is\na normal number and $|m_1| < b^p = b \\times b^{p - 1} \\leq b^{e_2 - e_1} |m_2|$\nand again $\\min(|m_1|, |m_2|b^{e_2 - e_1}) = |m_1|$.\n\nApplying $b^{q - 1} \\leq |m_1| < b^q$ we get that\n$b^{-q}U \\leq D \\leq b^{1 - q}U$. If moreover $a_1$ is a normal number then\nby definition $p = q$.\n\n**Remark:** Using the inequality of the previous proposition and taking the\nbase-$b$ logarithm we get $-q + \\log U \\leq \\log D \\leq 1 - q + \\log U$ and\nthen $-q + \\lfloor \\log U \\rfloor \\leq \\lfloor \\log D \\rfloor\n\\leq 1 - q + \\lfloor \\log U \\rfloor$ hence two possibilities:\n- $-q + \\lfloor \\log U \\rfloor = \\lfloor \\log D \\rfloor$ in which case\n  $\\lfloor \\log U \\rfloor + (-\\lfloor \\log D \\rfloor) = q$.\n- $1 - q + \\lfloor \\log U \\rfloor = \\lfloor \\log D \\rfloor$ in which case\n  $1 + \\lfloor \\log U \\rfloor + (-\\lfloor \\log D \\rfloor) = q$.\nAccording to a above proposition we know that $f = 1 + \\lfloor \\log U \\rfloor$\ncan be interpreted as the number of differents digits in the last places of the\nmantissa. 
Write $\\mathcal{D} = - \\lfloor \\log D \\rfloor$ then\n$q \\leq f + \\mathcal{D} \\leq q + 1$. The latter inequality shows that\n$\\mathcal{D}$ can be interpreted as the number of digits which are the same in\nthe mantissa near the \"first\" place. Note that for denormal numbers the \"first\"\nplaces are near the bit of most significance. We can conclude this remark with\nthe interpretation that two floatting point numbers have at least\n$\\mathcal{D} - 1$ digits in common in the first place of the mantissa and $f$\ndigits which are different in the last place of the mantissa.\n\n**Algorithm:** We give below the C code for $U$ with a caveat. As seen in a\nprevious proposition when $e_2 \\geq e_1 + 2$ the arguments have no digit in\ncommon and can be considered too far away in which case we return `INT_MAX` (or\n`LONG_MAX`). As a side effect is that the code will be free of multiprecision\nintegers (which would be necessary as soon as $|e_2 - e_1| \\geq 12$) hence\nlesser dependencies, readability, maintainability and performances.\nWhen $|e_2 - e_1| \\leq 1$ we use the formula of the definition.\n\n```c\n/* We suppose that floats are IEEE754 and not NaN nor infinity */\n\nstruct fl_t{\n  int mantissa;\n  int exponent;\n};\n\nfl_t decompose(float a_) {\n  fl_t ret;\n  unsigned int a;\n  memcpy(&a, &a_, sizeof(float)); /* avoid aliasing */\n  ret.exponent = (int)((a >> 23) & 0xff) - 127;\n  if (ret.exponent == -127) {\n    /* denormal number */\n    ret.mantissa = (int)(a & 0x007fffff);\n  } else {\n    ret.mantissa = (int)((1 << 23) | (a & 0x007fffff));\n  }\n  if (a >> 31) {\n    ret.mantissa = -ret.mantissa;\n  }\n  return ret;\n}\n\nint distance_ulps(float a_, float b_) {\n  fl_t a, b;\n  a = decompose(a_);\n  b = decompose(b_);\n\n  if (a.exponent - b.exponent < -1 || a.exponent - b.exponent > 1) {\n    return INT_MAX;\n  }\n  \n  int d;\n  if (a.exponent == b.exponent) {\n    d = a.mantissa = b.mantissa;\n  } else if (a.exponent > b.exponent) {\n    d = 2 * 
a.mantissa - b.mantissa;\n  } else {\n    d = 2 * b.mantissa - a.mantissa;\n  }\n\n  return d > 0 ? d : -d;\n}\n```\n\nThe algorithm for computing $\\mathcal{D} - 1$ follows:\n\n```c\nint d(float a_, float b_) {\n  float absa = fabsf(a_);\n  float absb = fabsf(b_);\n\n  /* ensure that |a_| <= |b_| */\n  if (absb < absa) {\n    float tmp = absa;\n    absa = absb;\n    absb = tmp;\n  }\n\n  fl_t a = decompose(absa);\n  int q = 0;\n  for (q = 0; q <= 23 && (2 << q) <= a.mantissa; q++);\n\n  int ulps = distance_ulps(a_, b_);\n  int lu;\n  for (lu = 0; lu <= 30 && (2 << (lu + 1)) <= a.mantissa; lu++);\n\n  return q - (lu + 1) - 1;\n}\n```\n\n## What we really do in the tests\n\nAs said above buggy intrinsics can be easily found. But the bugs appears for\ncorner cases typically involving NaNs and/or infinities. But according to the\nphilosophy of NSIMD, it is not the job of its standard operators to propose a\nnon buggy alternative to a buggy intrinsics. But we still have the problem of\ntesting. A consequence of the philosophy of NSIMD is that we only have to test\nthat intrinsics are correctly wrapped. We can reasonably assume that testing\nfor floatting point numbers on only normal numbers is more than sufficient.\n\nMoreover, an implementation (buggy or not), may have different parameters set\nthat controls how floatting point arithmetic is done on various components of\nthe chip. An non exhaustive list includes:\n- Rounding modes (which is not controlled by NSIMD as it is a library)\n- FTZ/DAZ (flush to zero) denormal values never appear.\n- FTZ/DAZ on some components (SIMD parts) and not others (scalar parts)\n- Non IEEE behavior (eg. some NVIDIA GPU and ARMv7 chips)\n- A mix of the above\n- A buggy mix of the above\n\nAs a consequence we do not compare floats using the operator `=` nor do we\nuse a weird-buggy formula involving the machine epsilon. Instead we use\nthe algorithm above to make sure that the first bits are correct. 
More\nprecisely we use the following algorithm and its variants for float16 and\ndoubles where `ufp` stands for `units in the first place`.\n\n```c\n/* a_ and b_ must be IEEE754 and normal numbers */\nint ufps(float a_, float b_) {\n  unsigned int a, b;\n  memcpy(&a, &a_, 4);\n  memcpy(&b, &b_, 4);\n  int ea = (int)((a >> 23) & 0xff);\n  int eb = (int)((b >> 23) & 0xff);\n  if (ea - eb > 1 || ea - eb < -1) {\n    return 0;\n  }\n  int ma = (int)(a & 0x007fffff);\n  int mb = (int)(b & 0x007fffff);\n  int d = 0;\n  if (ea == eb) {\n    d = ma - mb;\n  } else if (ea > eb) {\n    d = 2 * ma - mb;\n  } else {\n    d = 2 * mb - ma;\n  }\n  d = (d >= 0 ? d : -d);\n  int i = 0;\n  for (; i < 30 && d >= (1 << i); i++);\n  return 23 - i;\n}\n```\n"
  },
  {
    "path": "doc/markdown/memory.md",
    "content": "# Memory functions\n\nAlthough the purpose of NSIMD is not to provide a full memory container\nlibrary, it provides some helper functions to facilitate the end-user. The\nfunctions below only deal with CPUs. If your needs concern GPUs or memory\ntransfers between CPUs and GPUs see the [memory management\nmodule](module_memory_management_overview.md).\n\n## Memory functions available in C and C++\n\n- `void *nsimd_aligned_alloc(nsimd_nat n);`  \n  Returns a pointer to `n` bytes of aligned memory. It returns NULL if an\n  error occurs.\n\n- `void nsimd_aligned_free(void *ptr);`  \n  Frees the memory pointed to by `ptr`.\n\n## Memory functions available in C++\n\n- `void *nsimd::aligned_alloc(nsimd_nat n);`  \n  Returns a pointer to `n` bytes of aligned memory. It returns NULL if an\n  error occurs.\n\n- `void nsimd::aligned_free(void *ptr);`  \n  Frees the memory pointed to by `ptr`.\n\n- `template <typename T> T *nsimd::aligned_alloc_for(nsimd_nat n);`  \n  Returns a pointer to `n` `T`'s of aligned memory. 
It returns NULL if an\n  error occurs.\n\n- `template <typename T> void nsimd::aligned_free_for(void *ptr);`  \n  Frees the memory pointed to by `ptr`.\n\n## C++ allocators for `std::vector`'s\n\nNSIMD provides C++ allocators so that memory used by C++ containers such as\n`std::vector`'s will be suitably aligned in memory.\n\n- `template <typename T> class nsimd::allocator;`  \n  The class for allocating aligned memory inside C++ containers.\n\nExample:\n\n```c++\n#include <nsimd/nsimd.h>\n\nint main() {\n  int n = // number of float's to allocate\n  std::vector<float, nsimd::allocator<float> > myvector(size_t(n));\n  \n  // In what follows ptr is a pointer suitably aligned for the current SIMD\n  // targeted architecture.\n  float *ptr;\n  \n  // C++98\n  ptr = &myvector[0]; \n\n  // C++11 and above\n  ptr = myvector.data(); \n}\n```\n\nAs there is no portable way of having aligned scoped memory, one can use the\nNSIMD allocators to emulate such memory.\n\n```c++\n#include <nsimd/nsimd.h>\n\ntemplate <typename T, int N> void test() {\n  std::vector<T, nsimd::allocator<T> > mem(size_t(N));\n  T *ptr;\n  \n  // C++98\n  ptr = &mem[0]; // scoped aligned memory\n\n  // C++11 and above\n  ptr = mem.data(); // scoped aligned memory\n}\n\nint main() {\n  test<float, 16>();\n  test<double, 8>();\n}\n```\n\n## C++ scoped memory allocation\n\nNSIMD provides a struct helper for the user to allocate a chunk of memory and\nnot care about its release. It uses C++ RAII.\n\n```c++\nnamespace nsimd {\n\ntemplate <typename T> class scoped_aligned_mem_for {\n\n  template <typename I> scoped_aligned_mem_for(I n);\n  // Construct a struct holding an array of n T's.\n\n  T *get();\n  // Return the pointer to access memory.\n\n};\n\n}\n\nint main() {\n  // Allocates 1024 floats in memory. It will be freed when the function (or\n  // the program) terminates.\n  nsimd::scoped_aligned_mem_for<float> buffer(1024);\n  return 0;\n}\n```\n"
  },
  {
    "path": "doc/markdown/modules/.gitignore",
    "content": "*/api*.md"
  },
  {
    "path": "doc/markdown/modules/fixed_point/overview.md",
    "content": "<!--\n\nCopyright (c) 2019 Agenium Scale\n\nPermission is hereby granted, free of charge, to any person obtaining a copy\nof this software and associated documentation files (the \"Software\"), to deal\nin the Software without restriction, including without limitation the rights\nto use, copy, modify, merge, publish, distribute, sublicense, and/or sell\ncopies of the Software, and to permit persons to whom the Software is\nfurnished to do so, subject to the following conditions:\n\nThe above copyright notice and this permission notice shall be included in all\ncopies or substantial portions of the Software.\n\nTHE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\nIMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\nFITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\nAUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\nLIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\nOUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\nSOFTWARE.\n\n-->\n\n\n# NSIMD fixed point module\n\n## Description\n\nThis module implements a fixed-point numbers support for the `nsimd` library.\nFixed-point numbers are integer types used to represent decimal numbers. A number `lf` \nof bits are used to encode its integer part, and `rt` bits are used to encode its \nfractional part.\n\nThe fixed_point module uses the templated type `nsimd::fixed_point::fp_t<lf, rt>` to \nrepresent a fixed_point number. All the basic floating-point arithmetic operaors have \nbeen defined, therefore fp_t elements can be manipulated as normal numbers.\nThe fixed_point module will use a `int8_t`, `int16_t`, or `int32_t` integer type for \nstorage, depending on the value of `lf + 2 * rt`. 
\n\nAll the functions of the module are under the namespace `nsimd::fixed_point`, \nand match the same interface than `nsimd`.\n\nThe `fp_t` struct type is defined in `fixed.hpp`, and the associated simd `fpsimd_t` \nstruct type is defined in `simd.hpp`.\n\nThe modules redefines the `nsimd` pack type for fixed-point numbers, templated with `lf` \nand `rt` :\n\n```C++\nnamespace nsimd {\nnamespace fixed_point {\ntemplate <uint8_t lf, uint8_t rt>\nstruct pack;\n} // namespace fixed_point\n} // namespace nsimd\n```\n\nThen, the pack can be manipulated as an `nsimd` pack like other scalar types. \n\n## Compatibility\n\nThe fixed point module is a C++ only API, compatible with the C++98 standard.\nIt has the same compilers and hardware support than the main `nsimd` API \n(see the [API index](../../index.md)).\n\n## Example\n\nHere is a minimal example(main.cpp) :\n\n```C++\n#include <ctime>\n#include <cstdlib>\n#include <iostream>\n#include <nsimd/modules/fixed_point.hpp>\n\nfloat rand_float() {\n  return 4.0f * ((float) rand() / (float) RAND_MAX) - 2.0f;        \n}\n\nint main() {\n  // We use fixed point numbers with 8 bits of integer part and 8 bits of \n  // decimal part. 
It will use a 32 bits integer for internal storage.\n  typedef nsimd::fixed_point::fp_t<8, 8> fp_t;\n  typedef nsimd::fixed_point::pack<fp_t> fp_pack_t;\n  \n  const size_t v_size = nsimd::fixed_point::len(fp_t());\n\n  fp_t *input0 = (fp_t*)malloc(v_size * sizeof(fp_t));\n  fp_t *input1 = (fp_t *)malloc(v_size * sizeof(fp_t));\n  fp_t *res = (fp_t *)malloc(v_size * sizeof(fp_t));\n  \n  // Input and output initializations \n  for(size_t i = 0; i < nsimd::fixed_point::len(fp_t()); i++) {\n    input0[i] = fp_t(rand_float());\n    input1[i] = fp_t(rand_float());\n  }\n  \n  fp_pack_t v0 = nsimd::fixed_point::loadu<fp_pack_t>(input0);\n  fp_pack_t v1 = nsimd::fixed_point::loadu<fp_pack_t>(input1);\n  fp_pack_t vres = nsimd::fixed_point::add(v0, v1);\n  nsimd::fixed_point::storeu(res, vres);\n  \n  for(size_t i = 0; i < nsimd::fixed_point::len(fp_t()); i++) {\n    std::cout << float(input0[i]) << \" | \"\n      << float(input1[i]) << \" | \"\n      << float(res[i]) << \"\\n\";\n  }\n  std::cout << std::endl;\n  \n  return EXIT_SUCCESS;\n}\n\n```\n\nTo test with avx2 run : \n```bash\nexport NSIMD_ROOT=<path/to/simd>\ng++ -o main -I$NSIMD_ROOT/include -mavx2 -DNSIMD_AVX2 main.cpp\n./main\n```\n\nThe console output will look like this : \n```console\n$>./main \n1.35938 | -0.421875 | 0.9375\n1.13281 | 1.19531 | 2.32812\n1.64844 | -1.21094 | 0.4375\n-0.660156 | 1.07422 | 0.414062\n-0.890625 | 0.214844 | -0.675781\n-0.0898438 | 0.515625 | 0.425781\n-0.539062 | 0.0546875 | -0.484375\n1.80859 | 1.66406 | 3.47266\n```\n        "
  },
  {
    "path": "doc/markdown/pack.md",
    "content": "# NSIMD pack and related functions\n\nThe advanced C++ API provides types that represents SIMD registers. These\ntypes are struct that allows NSIMD to define infix operators. In this page\nNSIMD concepts are reported in the documentation but you can think of them\nas usual `typename`s.\n\n## The Pack type\n\n```c++\ntemplate <NSIMD_CONCEPT_VALUE_TYPE T, int N, NSIMD_CONCEPT_SIMD_EXT SimdExt>\nstruct pack {\n  // Typedef to retrieve the native SIMD type\n  typedef typename simd_traits<T, SimdExt>::simd_vector simd_vector;\n\n  // Typedef to retrieve T\n  typedef T value_type;\n\n  // Typedef to retrieve SimdExt\n  typedef SimdExt simd_ext;\n\n  // Static member to retrive N\n  static const int unroll = N;\n\n  // Ctor that splats `s`, the resulting vector will be [s, s, s, ...]\n  template <NSIMD_CONCEPT_VALUE_TYPE S> pack(S const &s);\n\n  // Ctor that takes a SIMD vector of native type\n  // ONLY AVAILABLE when N == 1\n  pack(simd_vector v);\n  \n  // Retrieve the underlying native SIMD vector\n  // ONLY AVAILABLE when N == 1\n  simd_vector native_register() const;\n\n};\n```\n\nExample:\n\n```c++\n#include <nsimd/nsimd-all.hpp>\n#include <iostream>\n\nint main() {\n  nsimd::pack<float> v(2.0f);\n  std::cout << v << '\\n';\n\n  vf32 nv = v.native_register();\n  nv = nsimd::add(nv, nv, f32());\n  std::cout << nsimd::pack<f32>(nv) << '\\n';\n\n  return 0;\n}\n```\n\n### Infix operators available for packs\n\n- `pack operator+(pack const &, pack const &);`\n- `pack operator*(pack const &, pack const &);`\n- `pack operator-(pack const &, pack const &);`\n- `pack operator/(pack const &, pack const &);`\n- `pack operator-(pack const &);`\n- `pack operator|(pack const &, pack const &);`\n- `pack operator^(pack const &, pack const &);`\n- `pack operator&(pack const &, pack const &);`\n- `pack operator~(pack const &);`\n- `pack operator<<(pack const &, int);` (only available for integers)\n- `pack operator>>(pack const &, int);` (only available for 
integers)\n\n### Assignment operators available for packs\n\n- `pack operator+=(pack const &);`\n- `pack operator-=(pack const &);`\n- `pack operator*=(pack const &);`\n- `pack operator/=(pack const &);`\n- `pack &operator|=(pack const &other);`\n- `pack &operator&=(pack const &other);`\n- `pack &operator^=(pack const &other);`\n- `pack &operator<<=(int);`\n- `pack &operator>>=(int);`\n\n### Function aliases\n\nThe C++ standard provides functions with different names that does exactly\nthe same thing. This is due to the retro compatibility with C. Take the\n`fmin` C function as an example. In C this function give the minimum between\ndoubles only. The C++ standard provides overloads to this function so that it\ncan work on floats and long doubles. The aliases provided by NSIMD have the\nsame purpose but they are not provided as operator on their own because their\nreal purpose is to write generic code that can work on scalar and SIMD vector\ntypes. As such they are only relevant for the advanced C++ API.\n\n- `pack fmin(pack const &, pack const &);`\n- `pack fmax(pack const &, pack const &);`\n- `pack fabs(pack const &);`\n\nThey are contained in the `nsimd/cxx_adv_api_aliases.hpp` header and not\nprovided by default to respect the philosophy of NSIMD which is force the\nuse to think different between SIMD code and scalar code. 
They are provided\nautomatically when including `nsimd/nsimd-all.hpp`.\n\n## The Packl type\n\n```c++\ntemplate <NSIMD_CONCEPT_VALUE_TYPE T, int N, NSIMD_CONCEPT_SIMD_EXT SimdExt>\nstruct packl {\n  // Typedef to retrieve the native SIMD type\n  typedef typename simd_traits<T, SimdExt>::simd_vectorl simd_vectorl;\n\n  // Typedef to retrieve T\n  typedef T value_type;\n\n  // Typedef to retrieve SimdExt\n  typedef SimdExt simd_ext;\n\n  // Static member to retrive N\n  static const int unroll = N;\n\n  // Ctor that splats `s`, the resulting vector will be [s, s, s, ...]\n  template <NSIMD_CONCEPT_VALUE_TYPE S> packl(S const &s);\n\n  // Ctor that takes a SIMD vector of native type\n  // ONLY AVAILABLE when N == 1\n  packl(simd_vectorl v);\n  \n  // Retrieve the underlying native SIMD vector\n  // ONLY AVAILABLE when N == 1\n  simd_vector native_register() const;\n\n};\n```\n\nExample:\n\n```c++\n#include <nsimd/nsimd-all.hpp>\n#include <iostream>\n\nint main() {\n  nsimd::pack<float> v(2.0f);\n  nsimd::packl<float> mask;\n\n  mask = nsimd::eq(v, v);\n  std::cout << v << '\\n';\n\n  mask = nsimd::neq(v, v);\n  std::cout << v << '\\n';\n\n  return 0;\n}\n```\n\n### Infix operators involving packls\n\n- `packl operator&&(packl const &, packl const &);`\n- `packl operator||(packl const &, packl const &);`\n- `packl operator!(packl const &, packl const &);`\n- `packl operator==(pack const &, pack const &);`\n- `packl operator!=(pack const &, pack const &);`\n- `packl operator<(pack const &, pack const &);`\n- `packl operator<=(pack const &, pack const &);`\n- `packl operator>(pack const &, pack const &);`\n- `packl operator>=(pack const &, pack const &);`\n\n## Packs for SoA/AoS\n\nTypes containing several SIMD vectors are also provided to help the user\nmanipulate arrays of structures. 
When working, let's say, on complex numbers,\nloading them from memory with layout `RIRIRIRIRIRI...` can be done with the\n`load2*` operators that will returns 2 SIMD vectors `RRRR` and `IIII` where\n`R` stands for real part and `I` for imaginary part.\n\nSimilarily loading an RGB image from memory stored following the layout\n`RGBRGBRGBRGB...` can be done with `load3*` to get 3 SIMD vectors `RRRR`,\n`GGGG` and `BBBB`.\n\n### Packx1\n\n```c++\ntemplate <NSIMD_CONCEPT_VALUE_TYPE T, int N, NSIMD_CONCEPT_SIMD_EXT SimdExt>\nNSIMD_STRUCT packx1 {\n\n  // Usual typedefs and static members\n  typedef typename simd_traits<T, SimdExt>::simd_vector simd_vector;\n  typedef T value_type;\n  typedef SimdExt simd_ext;\n  static const int unroll = N;\n  static const int soa_num_packs = 1;\n\n  // Member v0 for reading and writing\n  pack<T, N, SimdExt> v0;\n};\n```\n\n### Packx2\n\n```c++\ntemplate <NSIMD_CONCEPT_VALUE_TYPE T, int N, NSIMD_CONCEPT_SIMD_EXT SimdExt>\nNSIMD_STRUCT packx2 {\n\n  // Usual typedefs and static members\n  typedef typename simd_traits<T, SimdExt>::simd_vector simd_vector;\n  typedef T value_type;\n  typedef SimdExt simd_ext;\n  static const int unroll = N;\n  static const int soa_num_packs = 2;\n\n  // Members for reading and writing\n  pack<T, N, SimdExt> v0;\n  pack<T, N, SimdExt> v1;\n};\n```\n\n### Packx3\n\n```c++\ntemplate <NSIMD_CONCEPT_VALUE_TYPE T, int N, NSIMD_CONCEPT_SIMD_EXT SimdExt>\nNSIMD_STRUCT packx3 {\n\n  // Usual typedefs and static members\n  typedef typename simd_traits<T, SimdExt>::simd_vector simd_vector;\n  typedef T value_type;\n  typedef SimdExt simd_ext;\n  static const int unroll = N;\n  static const int soa_num_packs = 3;\n\n  // Members for reading and writing\n  pack<T, N, SimdExt> v0;\n  pack<T, N, SimdExt> v1;\n  pack<T, N, SimdExt> v2;\n};\n```\n\n### Packx4\n\n```c++\ntemplate <NSIMD_CONCEPT_VALUE_TYPE T, int N, NSIMD_CONCEPT_SIMD_EXT SimdExt>\nNSIMD_STRUCT packx4 {\n\n  // Usual typedefs and static members\n  typedef 
typename simd_traits<T, SimdExt>::simd_vector simd_vector;\n  typedef T value_type;\n  typedef SimdExt simd_ext;\n  static const int unroll = N;\n  static const int soa_num_packs = 4;\n\n  // Members for reading and writing\n  pack<T, N, SimdExt> v0;\n  pack<T, N, SimdExt> v1;\n  pack<T, N, SimdExt> v2;\n  pack<T, N, SimdExt> v3;\n};\n```\n\n### Functions involving packx2, packx3 and packx4\n\nThe following functions convert packxs into unrolled packs. The difference\nbetween the `to_pack` and `to_pack_interleave` families of functions is in\nthe way they flatten (or deinterleave) the structure of SIMD vectors.\n\n```c++\ntemplate <NSIMD_CONCEPT_VALUE_TYPE T, int N, NSIMD_CONCEPT_SIMD_EXT SimdExt>\npack<T, 2 * N, SimdExt> to_pack(const packx2<T, N, SimdExt> &);\n\ntemplate <NSIMD_CONCEPT_VALUE_TYPE T, int N, NSIMD_CONCEPT_SIMD_EXT SimdExt>\npack<T, 3 * N, SimdExt> to_pack(const packx3<T, N, SimdExt> &);\n\ntemplate <NSIMD_CONCEPT_VALUE_TYPE T, int N, NSIMD_CONCEPT_SIMD_EXT SimdExt>\npack<T, 4 * N, SimdExt> to_pack(const packx4<T, N, SimdExt> &);\n\ntemplate <NSIMD_CONCEPT_VALUE_TYPE T, int N, NSIMD_CONCEPT_SIMD_EXT SimdExt>\npack<T, 2 * N, SimdExt> to_pack_interleave(const packx2<T, N, SimdExt> &);\n\ntemplate <NSIMD_CONCEPT_VALUE_TYPE T, int N, NSIMD_CONCEPT_SIMD_EXT SimdExt>\npack<T, 3 * N, SimdExt> to_pack_interleave(const packx3<T, N, SimdExt> &);\n\ntemplate <NSIMD_CONCEPT_VALUE_TYPE T, int N, NSIMD_CONCEPT_SIMD_EXT SimdExt>\npack<T, 4 * N, SimdExt> to_pack_interleave(const packx4<T, N, SimdExt> &);\n```\n\nThe `to_pack` family of functions performs the following operations:\n\n```\npackx2<T, 3> = | v0 = [u0 u1 u2] | ---> [u0 u1 u2 w0 w1 w2] = pack<T, 6>\n               | v1 = [w0 w1 w2] |\n```\n\nwhile the `to_pack_interleave` family of functions does the following:\n\n```\npackx2<T, 3> = | v0 = [u0 u1 u2] | ---> [u0 w0 u1 w1 u2 w2] = pack<T, 6>\n               | v1 = [w0 w1 w2] |\n```\n\n"
  },
  {
    "path": "doc/markdown/tutorial.md",
    "content": "<!--\n\nCopyright (c) 2020 Agenium Scale\n\nPermission is hereby granted, free of charge, to any person obtaining a copy\nof this software and associated documentation files (the \"Software\"), to deal\nin the Software without restriction, including without limitation the rights\nto use, copy, modify, merge, publish, distribute, sublicense, and/or sell\ncopies of the Software, and to permit persons to whom the Software is\nfurnished to do so, subject to the following conditions:\n\nThe above copyright notice and this permission notice shall be included in all\ncopies or substantial portions of the Software.\n\nTHE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\nIMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\nFITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\nAUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\nLIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\nOUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\nSOFTWARE.\n\n-->\n\n# NSIMD tutorial\n\nIn this tutorial we will write and compile a simple SIMD kernel to become\nfamiliar with the basics of NSIMD. We will also see different aspects of SIMD\nprogramming:\n- aligned vs. unaligned data access\n- basic SIMD arithmetic\n- SIMD loops\n- SIMD branching\n- architecture selection at runtime\n\n## SIMD basics\n\nSIMD programming means using the CPU SIMD registers to performs operations\non several data at once. A SIMD vector should be viewed as a set of bits\nwhich are interpreted by the operators that operate on them. 
Taking a 128-bit\nwide SIMD register, it can be interpreted as:\n- 16 signed/unsigned chars\n- 8 signed/unsigned shorts\n- 4 signed/unsigned ints\n- 4 floats\n- 2 signed/unsigned longs\n- 2 doubles\nas shown in the picture below.\n\n![Register layout](img/register.png)\n\n## Computation kernel\n\nWe will explain the rewriting of the following kernel which uppercases ASCII\nletters only.\n\n@[INCLUDE_CODE:L7:L16](../../examples/tutorial.cpp)\n\nHere is the corresponding SIMD version. Explanations to follow.\n\n@[INCLUDE_CODE:L18:L39](../../examples/tutorial.cpp)\n\n## Getting started with NSIMD\n\nAll APIs of NSIMD core are available with this include:\n\n@[INCLUDE_CODE:L1:L1](../../examples/tutorial.cpp)\n\nFor ease of programming we use the NSIMD namespace inside the\n`uppercase_simd` function.\n\n@[INCLUDE_CODE:L20:L20](../../examples/tutorial.cpp)\n\n## SIMD vectors\n\nA `nsimd::pack<T>` can be considered analogous to a SIMD register (on your or\nany other machine). Operations performed on packs - from elementary operations\nsuch as addition to complicated functions such as `nsimd::rsqrt11(x)` - will be\nperformed using SIMD registers and operations if supported by your hardware. As\nshown below, data must be manually loaded into and stored from these registers.\nAgain, for ease of programming we typedef a pack of T's.\n\n@[INCLUDE_CODE:L21:L21](../../examples/tutorial.cpp)\n\nNSIMD provides another type of pack called `nsimd::packl` which handles vectors\nof booleans.\n\n@[INCLUDE_CODE:L22:L22](../../examples/tutorial.cpp)\n\nThis distinction between pack's and packl's is necessary for two reasons:\n- On recent hardware, SIMD vectors of booleans are handled by dedicated\n  registers.\n- Pack and Packl must have different semantics as arithmetic operators on\n  booleans have no sense as well as logical operators on Pack's.\n\n## Loading data from memory\n\nOne way to construct a `nsimd::pack<T>` is to simply declare\n(default-construct) it. 
Such a pack may *not* be zero-initialized and thus may\n*contain arbitrary values*.\n\nAnother way to construct a `nsimd::pack<T>` is to fill it with a single value.\nThis so-called splatting constructor takes one scalar value and replicates it\nin all elements of the pack.\n\nBut the most common way to construct a `nsimd::pack<T>` is by using the copy\nconstructor from loading functions.\n\n@[INCLUDE_CODE:L27:L27](../../examples/tutorial.cpp)\n\n## Aligned vs. unaligned memory\n\nAlignment of a given pointer `ptr` to memory to some value `A` means that\n`ptr % A == 0`. On older hardware loading data from unaligned memory can\nresult in performance penalty. On recent hardware it is hard to exhibit a\ndifference. NSIMD provides two versions of \"load\":\n- `loada` for loading data from aligned memory\n- `loadu` for loading data from unaligned memory\nNote that using `loada` on an unaligned pointer may result in segfaults. As\nrecent hardware has good support for unaligned memory we use `loadu`.\n\n@[INCLUDE_CODE:L27:L27](../../examples/tutorial.cpp)\n\nTo ensure that data allocated by `std::vector` is aligned, NSIMD provides\na C++ allocator.\n\n```c++\nstd::vector<T, nsimd::allocator<T> > data;\n```\n\nWhen loading data from memory you must ensure that there is sufficient data in\nthe block of memory you load from to fill a `nsimd::pack<T>`. For example, on\nan `AVX` capable machine, a SIMD vector of `float` (32 bits) contains 8\nelements. Therefore, there must be at least 8 floats in the memory block you\nload data from otherwise loading may result in segfaults. 
More on this below.\n\n## Operations on pack's and packl's\n\nOnce initialized, `nsimd::pack<T>` instances can be used to perform arithmetic.\nUsual operations are provided by NSIMD such as:\n- addition\n- subtraction\n- multiplication\n- division\n- square root\n- bitwise and/or/xor\n- ...\n\n@[INCLUDE_CODE:L28:L29](../../examples/tutorial.cpp)\n\nC++ operators are also overloaded for pack's and packl's as well as between\npack's and scalars or packl's and booleans.\n\n## SIMD branching\n\nNSIMD provides the `if_else` operator which fills the output, lane by lane,\naccording to the lane value of its first argument:\n- if it is true, the output lane will be filled with the second argument's lane\n- if it is false, the output lane will be filled with the third argument's lane\nTherefore the branching:\n\n@[INCLUDE_CODE:L10:L14](../../examples/tutorial.cpp)\n\nwill be rewritten as\n\n@[INCLUDE_CODE:L28:L30](../../examples/tutorial.cpp)\n\nor as a one liner\n\n@[INCLUDE_CODE:L36:L36](../../examples/tutorial.cpp)\n\n## SIMD loops\n\nA SIMD loop is similar to its scalar counterpart except that instead of\ngoing through data one element at a time it goes 4 by 4 or 8 by 8 elements\nat a time. More precisely SIMD loops generally go in steps equal to the\npack's length. Therefore the scalar loop\n\n@[INCLUDE_CODE:L9:L9](../../examples/tutorial.cpp)\n\nis rewritten as\n\n@[INCLUDE_CODE:L23:L26](../../examples/tutorial.cpp)\n\nNote that going step by step will only cover most of the data except maybe the\ntail of data in case that the number of elements is not a multiple of the\nPack's length. Therefore to perform computations on the tail one has to\nload data from only `n` elements where `n < len<p_t>()`. 
One can use\n`maskz_loadu` which will load data only on lanes that are marked as true by\nanother argument to the function.\n\n@[INCLUDE_CODE:L35:L35](../../examples/tutorial.cpp)\n\nThe mask can be computed manually but NSIMD provides a function for it.\n\n@[INCLUDE_CODE:L34:L34](../../examples/tutorial.cpp)\n\nThen the computation on the tail is exactly the same as within the loop. Put\ntogether it gives for the tail:\n\n@[INCLUDE_CODE:L34:L37](../../examples/tutorial.cpp)\n\nThen the entire loop reads as follows.\n\n@[INCLUDE_CODE:L25:L37](../../examples/tutorial.cpp)\n\n## Compiling the Code\n\nHere is the complete listing of the code.\n\n@[INCLUDE_CODE](../../examples/tutorial.cpp)\n\nThe compilation of a program using `nsimd` is like any other library.\n\n```bash\nc++ -O3 -DAVX2 -mavx2 -L/path/to/lib -lnsimd_avx2 -I/path/to/include tutorial.cpp\n```\n\nWhen compiling with NSIMD, you have to decide at compile time the targeted\nSIMD extensions, AVX2 in the example above. It is therefore necessary to\ngive `-mavx2` to the compiler for it to emit AVX2 instructions. To tell NSIMD\nthat AVX2 has to be used the `-DAVX2` has to be passed to the compiler. For\nan exhaustive list of defines controlling compilation see <defines.md>. There\nis a .so file for each SIMD extension, it is therefore necessary to link\nagainst the proper .so file.\n\n## Runtime selection of SIMD extensions\n\nIt is sometimes necessary to have several versions of a given algorithm for\ndifferent SIMD extensions. This is rather easy to do with NSIMD. Basically the\nidea is to write the algorithm in a generic manner using pack's as shown above.\nIt is then sufficient to compile the same source file for different SIMD\nextensions and then link the resulting object files altogether. 
Suppose that\na file named `uppercase.cpp` contains the following code:\n\n@[INCLUDE_CODE:L18:L38](../../examples/tutorial.cpp)\n\nThis would give the following in a Makefile.\n\n```makefile\nall: uppercase\n\nuppercase_sse2.o: uppercase.cpp\n\tc++ -O3 -DSSE2 -msse2 -c $? -o $@\n\nuppercase_sse42.o: uppercase.cpp\n\tc++ -O3 -DSSE42 -msse4.2 -c $? -o $@\n\nuppercase_avx.o: uppercase.cpp\n\tc++ -O3 -DAVX -mavx -c $? -o $@\n\nuppercase_avx2.o: uppercase.cpp\n\tc++ -O3 -DAVX2 -mavx2 -c $? -o $@\n\nuppercase: uppercase_sse2.o \\\n           uppercase_sse42.o \\\n           uppercase_avx.o \\\n           uppercase_avx2.o \\\n           main.cpp\n\tc++ $? -lnsimd_avx2 -o $@\n```\n\nNote that `libnsimd_avx2` contains all the functions for SSE 2, SSE 4.2, AVX\nand AVX2. This is a consequence of the retrocompatibility of Intel SIMD\nextensions. The situation is the same on ARM where `libnsimd_sve.so` will\ncontain functions for AARCH64.\n\nThere is a small caveat. The symbol name corresponding to the `uppercase_simd`\nfunction will be the same for all the object files which will result in an error\nwhen linking together all objects. To avoid this situation one can use\nfunction overloading as follows:\n\n```c++\ntemplate <typename T>\nvoid uppercase_simd(NSIMD_SIMD, T *dst, const T *src, int n) {\n  // ...\n}\n```\n\nThe macro `NSIMD_SIMD` will be expanded to a type containing the information on\nthe SIMD extension currently requested by the user. This technique is called\ntag dispatching and does not require *any* modification of the algorithm\ninside the function. 
Finally in `main` one has to do dispatching by using\neither `cpuid` or by another means.\n\n```c++\nint main() {\n  // what follows is pseudo-code\n  switch(cpuid()) {\n  case cpuid_sse2:\n    uppercase(nsimd::sse2, dst, src, n);\n    break;\n  case cpuid_sse42:\n    uppercase(nsimd::sse42, dst, src, n);\n    break;\n  case cpuid_avx:\n    uppercase(nsimd::avx, dst, src, n);\n    break;\n  case cpuid_avx2:\n    uppercase(nsimd::avx2, dst, src, n);\n    break;\n  }\n  return 0;\n}\n```\n"
  },
  {
    "path": "doc/md2html.cpp",
    "content": "/*\n\nCopyright (c) 2020 Agenium Scale\n\nPermission is hereby granted, free of charge, to any person obtaining a copy\nof this software and associated documentation files (the \"Software\"), to deal\nin the Software without restriction, including without limitation the rights\nto use, copy, modify, merge, publish, distribute, sublicense, and/or sell\ncopies of the Software, and to permit persons to whom the Software is\nfurnished to do so, subject to the following conditions:\n\nThe above copyright notice and this permission notice shall be included in all\ncopies or substantial portions of the Software.\n\nTHE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\nIMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\nFITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\nAUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\nLIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\nOUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\nSOFTWARE.\n\n*/\n\n#include <ns2.hpp>\n\n#include <stdexcept>\n#include <utility>\n#include <string>\n#include <vector>\n\n// ----------------------------------------------------------------------------\n\n// Extract lines form strings like \":L7:L42\"\n// Returns -1 if fails\nstd::pair<int, int> extract_lines(std::string const &s) {\n  std::pair<int, int> r(-1, -1);\n  std::vector<std::string> lines = ns2::split(s, \":L\");\n  if (lines.size() == 3 && lines[0] == \"\") {\n    try {\n      r.first = std::stoi(lines[1]);\n      r.second = std::stoi(lines[2]);\n    } catch (std::exception const &) {\n      r.first = -1;\n      r.second = -1;\n    }\n  }\n  return r;\n}\n\n// ----------------------------------------------------------------------------\n\nstd::string callback_input_filename = \"\";\n\nstd::string callback_macro(std::string const &label, std::string const &url,\n                      
     ns2::markdown_infos_t const &markdown_infos) {\n  std::string filename;\n  if (ns2::startswith(label, \"INCLUDE\")) {\n    filename = ns2::join_path(ns2::dirname(callback_input_filename), url);\n  }\n\n  std::string lang;\n  if (ns2::startswith(label, \"INCLUDE_CODE\")) {\n    std::string const ext = ns2::splitext(filename).second;\n    if (ext == \"sh\") {\n      lang = \"Bash\";\n    } else if (ext == \"c\" || ext == \"h\") {\n      lang = \"C\";\n    } else if (ext == \"cpp\" || ext == \"hpp\") {\n      lang = \"C++\";\n    } else if (ext == \"py\") {\n      lang = \"Python\";\n    }\n  }\n\n  if (ns2::startswith(label, \"INCLUDE_CODE:\")) {\n    std::string const lines_str = label.substr(label.find(':'));\n    std::pair<int, int> const l_first_last = extract_lines(lines_str);\n    if (l_first_last.first == -1) {\n      throw std::runtime_error(\"cannot extract first line number\");\n    }\n    if (l_first_last.second == -1) {\n      throw std::runtime_error(\"cannot extract last line number\");\n    }\n    std::string out;\n    std::string lines;\n    {\n      ns2::ifile_t in(filename);\n      int num_line = 1;\n      std::string line;\n      while (std::getline(in, line)) {\n        if (num_line == l_first_last.second) {\n          lines += line;\n        } else if (num_line < l_first_last.second) {\n          if (num_line >= l_first_last.first) {\n            lines += line + \"\\n\";\n          }\n        } else {\n          break;\n        }\n        ++num_line;\n      }\n    }\n    ns2::compile_markdown(\"```\" + lang + \"\\n\" + ns2::deindent(lines) +\n                              \"\\n```\\n\",\n                          &out, markdown_infos);\n    return out;\n  }\n\n  if (ns2::startswith(label, \"INCLUDE_CODE\")) {\n    std::string out;\n    ns2::compile_markdown(\"```\" + lang + \"\\n\" + ns2::read_file(filename) +\n                              \"\\n```\\n\",\n                          &out, markdown_infos);\n    return out;\n  }\n\n  if 
(ns2::startswith(label, \"INCLUDE\")) {\n    ns2::ifile_t in(filename);\n    std::ostringstream out;\n    ns2::compile_markdown(&in, &out, markdown_infos);\n    return out.str();\n  }\n\n  return \"\";\n}\n\n// ----------------------------------------------------------------------------\n\nstd::pair<std::string, bool>\ncallback_link(std::string const &label, std::string const &url,\n              ns2::markdown_infos_t const &markdown_infos) {\n  if (markdown_infos.output_format != ns2::HTML) {\n    return std::pair<std::string, bool>(\"\", false);\n  }\n\n  std::pair<std::string, std::string> root_basename_ext = ns2::splitext(url);\n  if (root_basename_ext.second == \"md\") {\n    return std::pair<std::string, bool>(\n        ns2::html_href(root_basename_ext.first + \".html\", label), true);\n  } else {\n    return std::pair<std::string, bool>(\"\", false);\n  }\n}\n\n// ----------------------------------------------------------------------------\n\nint main(int argc, char **argv) {\n  if (argc != 3) {\n    std::cout << \"Usage: \" << argv[0] << \" <input_file> <output_file>\"\n              << std::endl;\n    return 1;\n  }\n\n  std::string const input_filename = argv[1];\n  std::string const output_filename = argv[2];\n\n  ns2::ifile_t input_file(input_filename);\n  ns2::ofile_t output_file(output_filename);\n\n  std::cout << \"Convert \\\"\" << input_filename << \"\\\" to \\\"\" << output_filename\n            << \"\\\"\" << std::endl;\n\n  callback_input_filename = input_filename;\n  ns2::markdown_infos_t markdown_infos(ns2::HTML, callback_macro,\n                                       callback_link, true);\n\n  ns2::compile_markdown(&input_file, &output_file, markdown_infos);\n\n  return 0;\n}\n"
  },
  {
    "path": "doc/what_is_wrapped.cpp",
    "content": "/*\n\nCopyright (c) 2021 Agenium Scale\n\nPermission is hereby granted, free of charge, to any person obtaining a copy\nof this software and associated documentation files (the \"Software\"), to deal\nin the Software without restriction, including without limitation the rights\nto use, copy, modify, merge, publish, distribute, sublicense, and/or sell\ncopies of the Software, and to permit persons to whom the Software is\nfurnished to do so, subject to the following conditions:\n\nThe above copyright notice and this permission notice shall be included in all\ncopies or substantial portions of the Software.\n\nTHE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\nIMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\nFITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\nAUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\nLIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\nOUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\nSOFTWARE.\n\n*/\n\n/*\n\nThis little C++ program reads and parses files from NSIMD wrapping intrinsics\nin order to build a markdown page describing in a table which operators are\njust intrinsics wrapper and which one are more complicated. We only to parse\nC code so no need for complicated stuff. Moreover what we doo is really simple\nand a C parser is not needed.\n\nWe replace all C delimiters by spaces, then split the resulting string into\nwords and we get a vector of strings. 
Then search in it the function that we\nwant (say nsimd_add_sse2_f32) along with its opening curly and closing\nbrakets and finally:\n- if there is only one token then it must be an intrinsic\n- if there is a for then it must use emulation\n- if there are several tokens but no for it must be a trick using other\n  intrinsics\n\nThe produced markdown contains:\n- E for emulation\n- T for trick with other intrinsics\n- NOOP for noop\n- a link to the Intel/Arm documentation about the intrinsic otherwise\n\nWell all that to say that a few hundreds of simple C++ code is more that\nenough for our need and we don't need to depend on some C/C++ parser such\nas Clang. Note that using a real parser will be counter productive as some\nintrinsics are implemented as macros to compiler builtin which then appear\nin the AST instead of the documented intrinsics.\n\nThis code is completely non-optimized and we don't care because it does not\ntake time to execute and it is not our purpose to optimize this code.\n\n*/\n\n// ----------------------------------------------------------------------------\n\n#include <ns2.hpp>\n\n#include <utility>\n#include <string>\n#include <vector>\n\n// ----------------------------------------------------------------------------\n\n#define MAX_LEN (11 * 11)\n\ntypedef std::map<std::string, std::string[MAX_LEN]> table_t;\n\nstd::string type_names_str(\"i8,u8,i16,u16,i32,u32,i64,u64,f16,f32,f64\");\nstd::vector<std::string> types_list(ns2::split(type_names_str, \",\"));\n\nconst size_t not_found = ~((size_t)0);\n\n// ----------------------------------------------------------------------------\n\nint nbits(std::string const &typ) {\n  if (typ == \"i8\" || typ == \"u8\") {\n    return 8;\n  } else {\n    return (10 * (typ[1] - '0')) + (typ[2] - '0');\n  }\n}\n\n// ----------------------------------------------------------------------------\n\nstd::vector<std::string> get_types_names(std::string const &output) {\n  std::vector<std::string> const& list = 
types_list;\n  if (output == \"same\") {\n    return list;\n  }\n  std::vector<std::string> ret;\n  for (size_t i = 0; i < list.size(); i++) {\n    for (size_t j = 0; j < list.size(); j++) {\n      if ((output == \"same_size\" && nbits(list[j]) == nbits(list[i])) ||\n          (output == \"bigger_size\" && nbits(list[j]) == 2 * nbits(list[i])) ||\n          (output == \"lesser_size\" && 2 * nbits(list[j]) == nbits(list[i]))) {\n        ret.push_back(list[j] + \"_\" + list[i]);\n      }\n    }\n  }\n  return ret;\n}\n\n// ----------------------------------------------------------------------------\n\nsize_t find(std::vector<std::string> const &haystack,\n            std::string const &needle, size_t i0 = 0) {\n  for (size_t i = i0; i < haystack.size(); i++) {\n    if (haystack[i] == needle) {\n      return i;\n    }\n  }\n  return not_found;\n}\n\n// ----------------------------------------------------------------------------\n\nsize_t find_by_prefix(std::vector<std::string> const &needles,\n                      std::string const &haystack) {\n  for (size_t i = 0; i < needles.size(); i++) {\n    if (ns2::startswith(haystack, needles[i])) {\n      return i;\n    }\n  }\n  return not_found;\n}\n\n// ----------------------------------------------------------------------------\n\nint is_number(std::string const &s) {\n  for (size_t i = 0; i < s.size(); i++) {\n    if (s[i] != 'x' && s[i] != 'l' && s[i] != 'L' && s[i] != 'u' &&\n        s[i] != 'U' && !(s[i] >= '0' && s[i] <= '9')) {\n      return false;\n    }\n  }\n  return true;\n}\n\n// ----------------------------------------------------------------------------\n\nint is_macro(std::string const &s) {\n  for (size_t i = 0; i < s.size(); i++) {\n    if (s[i] != '_' || !(s[i] >= 'A' && s[i] <= 'Z')) {\n      return false;\n    }\n  }\n  return true;\n}\n\n// ----------------------------------------------------------------------------\n\nvoid parse_file(std::string const &input_vars, std::string const &simd_ext,\n      
          std::vector<std::string> const &types_names,\n                std::string const &op_name, std::string const &filename,\n                table_t *table_) {\n  table_t &table = *table_;\n  std::string content(ns2::read_file(filename));\n\n  // replace all C delimiters by spaces except {}\n  for (size_t i = 0; i < content.size(); i++) {\n    const char delims[] = \"()[];,:+-*/%&|!%\\n\\t\\r\";\n    for (size_t j = 0; j < sizeof(delims); j++) {\n      if (content[i] == delims[j]) {\n        content[i] = ' ';\n        break;\n      }\n    }\n  }\n\n  // replace '{' by ' { ' and same for '}' in case there are some code\n  // just before/after it\n  content = ns2::replace(ns2::replace(content, \"}\", \" } \"), \"{\", \" { \");\n\n  // now split string on spaces and removes some tokens\n  std::vector<std::string> to_be_removed(\n      ns2::split(\"return,signed,unsigned,char,short,int,long,float,double,\"\n                 \"const,void,__vector,__bool,bool,vector\" +\n                     type_names_str + \",\" + input_vars,\n                 ','));\n  std::vector<std::string> to_be_removed_by_prefix(ns2::split(\n      \"_mm_cast,_mm256_cast,_mm512_cast,vreinterpret,svreinterpret,svptrue_\",\n      ','));\n  std::vector<std::string> tokens;\n  { // to free tokens0 afterwards\n    std::vector<std::string> tokens0 = ns2::split(content, ' ');\n    for (size_t i = 0; i < tokens0.size(); i++) {\n      // We also remove svptrue_* as they are everywhere for SVE and all\n      // casts as they incur no opcode and are often used for intrinsics\n      // not supporting certain types\n      if (tokens0[i].size() == 0 || is_number(tokens0[i]) ||\n          is_macro(tokens0[i]) ||\n          find_by_prefix(to_be_removed_by_prefix, tokens0[i]) != not_found ||\n          find(to_be_removed, tokens0[i]) != not_found) {\n        continue;\n      }\n      tokens.push_back(tokens0[i]);\n    }\n  }\n\n  // finally search for intrinsics\n  for (size_t typ = 0; typ < 
types_names.size(); typ++) {\n    std::string func_name(\"nsimd_\" + op_name + \"_\" + simd_ext + \"_\" +\n                          types_names[typ]);\n\n    // find func_name\n    size_t pos = find(tokens, func_name);\n    if (pos == not_found) {\n      table[op_name][typ] = \"NA\";\n      continue;\n    }\n\n    // find opening {\n    size_t i0 = find(tokens, \"{\", pos);\n    if (i0 == not_found) {\n      std::cerr << \"WARNING: cannot find opening '{' for '\" << func_name\n                << \"' in '\" << filename << \"'\\n\";\n      table[op_name][typ] = \"NA\";\n      continue;\n    }\n\n    // find closing }\n    size_t i1 = i0;\n    int nest = 0;\n    for (i1 = i0; i1 < tokens.size(); i1++) {\n      if (tokens[i1] == \"{\") {\n        nest++;\n      } else if (tokens[i1] == \"}\") {\n        nest--;\n      }\n      if (nest == 0) {\n        break;\n      }\n    }\n\n    // if there is no token inside {} then it must be a noop\n    // if there is only one token inside {} then it must be the intrinsic\n    // if there is a for loop then it must be emulation\n    // if there are several tokens but no for then it must be a trick\n    if (i0 + 1 == i1) {\n      table[op_name][typ] = \"NOOP\";\n    } else if (i0 + 2 == i1 && !ns2::startswith(tokens[i0 + 1], \"nsimd_\")) {\n      table[op_name][typ] = \"[`\" + tokens[i0 + 1] + \"`]\";\n      if (simd_ext == \"neon128\" || simd_ext == \"aarch64\") {\n        table[op_name][typ] +=\n            \"(https://developer.arm.com/architectures/instruction-sets/\"\n            \"intrinsics/\" + tokens[i0 + 1] + \")\";\n      } else if (ns2::startswith(simd_ext, \"sve\")) {\n        table[op_name][typ] +=\n            \"(https://developer.arm.com/documentation/100987/0000)\";\n      } else if (simd_ext == \"sse2\" || simd_ext == \"sse42\" ||\n                 simd_ext == \"avx\" || simd_ext == \"avx2\" ||\n                 simd_ext == \"avx512_knl\" || simd_ext == \"avx512_skylake\") {\n        table[op_name][typ] += 
\"(https://software.intel.com/sites/landingpage/\"\n                               \"IntrinsicsGuide/#text=\" +\n                               tokens[i0 + 1] + \")\";\n      } else if (simd_ext == \"vsx\" || simd_ext == \"vmx\") {\n        table[op_name][typ] +=\n            \"(https://www.ibm.com/docs/en/xl-c-aix/13.1.3?topic=functions-\" +\n            ns2::replace(tokens[i0 + 1], \"_\", \"-\") + \")\";\n      }\n    } else {\n      if (find(std::vector<std::string>(tokens.begin() + i0,\n                                        tokens.begin() + (i1 + 1)),\n               \"for\") != not_found) {\n        table[op_name][typ] = \"E\";\n      } else {\n        table[op_name][typ] = \"T\";\n      }\n    }\n  }\n}\n\n// ----------------------------------------------------------------------------\n\nstd::string md_row(int nb_col, std::string const &cell_content) {\n  std::string ret(\"|\");\n  for (int i = 0; i < nb_col; i++) {\n    ret += cell_content + \"|\";\n  }\n  return ret;\n}\n\n// ----------------------------------------------------------------------------\n\nint main(int argc, char **argv) {\n  if ((argc % 2) != 0 || argc <= 5) {\n    std::cout\n        << \"Usage: \" << argv[0]\n        << \" a0,a1,a2 simd_ext output_type operator1 file1 operator2 file2 \"\n           \"...\\n\"\n        << \"where output_type is (same|same_size|bigger_size|lesser_size)\"\n        << std::endl;\n    return 1;\n  }\n\n  std::string input_vars(argv[1]);\n  std::string simd_ext(argv[2]);\n  std::string output_type(argv[3]);\n  std::vector<std::string> types_names = get_types_names(output_type);\n  table_t table;\n\n  for (int i = 4; i < argc; i += 2) {\n    parse_file(input_vars, simd_ext, types_names, argv[i], argv[i + 1],\n               &table);\n  }\n\n  for (table_t::const_iterator it = table.begin(); it != table.end(); it++) {\n    std::cout << \"## \" << it->first << \"\\n\\n\";\n    if (output_type == \"same\") {\n      const std::string(&row)[MAX_LEN] = it->second;\n   
   for (size_t i = 0; i < types_list.size(); i++) {\n        std::cout << \"-  \" << it->first << \" on **\" << types_list[i]\n                  << \"**: \" << row[i] << \"\\n\";\n      }\n      std::cout << \"\\n\\n\";\n    } else {\n      const std::string(&row)[MAX_LEN] = it->second;\n      for (size_t i = 0; i < types_list.size(); i++) {\n        for (size_t j = 0; j < types_list.size(); j++) {\n          std::string cell_content;\n          std::string typ(types_list[j] + \"_\" + types_list[i]);\n          for (size_t k = 0; k < types_names.size(); k++) {\n            if (typ == types_names[k]) {\n              cell_content = row[k];\n              break;\n            }\n          }\n          if (cell_content.size() > 0) {\n            std::cout << \"-  \" << it->first << \" from **\" << types_list[i]\n                      << \"** to **\" << types_list[j] << \"**: \" << cell_content\n                      << \"\\n\";\n          }\n        }\n        std::cout << \"\\n\";\n      }\n      std::cout << \"\\n\";\n    }\n  }\n\n  return 0;\n}\n"
  },
  {
    "path": "egg/__init__.py",
    "content": "# Copyright (c) 2019 Agenium Scale\n#\n# Permission is hereby granted, free of charge, to any person obtaining a copy\n# of this software and associated documentation files (the \"Software\"), to deal\n# in the Software without restriction, including without limitation the rights\n# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell\n# copies of the Software, and to permit persons to whom the Software is\n# furnished to do so, subject to the following conditions:\n#\n# The above copyright notice and this permission notice shall be included in all\n# copies or substantial portions of the Software.\n#\n# THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\n# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\n# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\n# SOFTWARE.\n\nfrom . import operators\n"
  },
  {
    "path": "egg/common.py",
    "content": "# Use utf-8 encoding\n# -*- coding: utf-8 -*-\n\n# Copyright (c) 2020 Agenium Scale\n#\n# permission is hereby granted, free of charge, to any person obtaining a copy\n# of this software and associated documentation files (the \"Software\"), to deal\n# in the Software without restriction, including without limitation the rights\n# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell\n# copies of the Software, and to permit persons to whom the Software is\n# furnished to do so, subject to the following conditions:\n#\n# The above copyright notice and this permission notice shall be included in all\n# copies or substantial portions of the Software.\n#\n# THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\n# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\n# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\n# SOFTWARE.\n\n# -----------------------------------------------------------------------------\n\n# What does this script?\n# ----------------------\n#\n# This is only a python module that holds what is shared by `generate.py`,\n# the `platform_*.py` files and all other python code in `egg`. 
If contains\n# the list of supported types, functions, operators, and some useful helper\n# functions such as the python equivalent of `mkdir -p`.\n\n# -----------------------------------------------------------------------------\n# Import section\n\nimport math\nimport os\nimport sys\nimport io\nimport collections\nimport platform\nimport string\nimport shutil\nimport math\n\n# -----------------------------------------------------------------------------\n# print\n\ndef myprint(opts, obj):\n    if opts.list_files:\n        return\n    print('-- {}'.format(obj))\n\n# -----------------------------------------------------------------------------\n# check if file exists\n\ndef can_create_filename(opts, filename):\n    if opts.list_files:\n        print(filename)\n        return False\n    if opts.verbose:\n        sys.stdout.write('-- {}: '.format(filename))\n    if os.path.isfile(filename) and not opts.force:\n        if opts.verbose:\n            sys.stdout.write('skipping\\n')\n        return False\n    elif opts.force:\n        if opts.verbose:\n            sys.stdout.write('creating (forced)\\n')\n        return True\n    else:\n        if opts.verbose:\n            sys.stdout.write('creating (missing)\\n')\n        return True\n\n# -----------------------------------------------------------------------------\n# open with UTF8 encoding\n\ndef open_utf8(opts, filename):\n    dummy, ext = os.path.splitext(filename)\n    if ext.lower() in ['.c', '.h', '.cpp', '.hpp', '.cc', '.cxx', '.hxx',\n                       '.hpp']:\n        begin_comment = '/*'\n        end_comment = '*/'\n    elif ext.lower() in ['.md', '.htm', '.html']:\n        begin_comment = '<!--'\n        end_comment = '-->'\n    else:\n        begin_comment = None\n    with io.open(filename, mode='w', encoding='utf-8') as fout:\n        if begin_comment is not None:\n            if opts.simple_license:\n                fout.write('''{}\n\nCopyright (c) 2021 Agenium 
Scale\n\n{}\n\n'''.format(begin_comment, end_comment))\n            else:\n                fout.write('''{}\n\nCopyright (c) 2021 Agenium Scale\n\nPermission is hereby granted, free of charge, to any person obtaining a copy\nof this software and associated documentation files (the \"Software\"), to deal\nin the Software without restriction, including without limitation the rights\nto use, copy, modify, merge, publish, distribute, sublicense, and/or sell\ncopies of the Software, and to permit persons to whom the Software is\nfurnished to do so, subject to the following conditions:\n\nThe above copyright notice and this permission notice shall be included in all\ncopies or substantial portions of the Software.\n\nTHE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\nIMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\nFITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\nAUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\nLIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\nOUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\nSOFTWARE.\n\n{}\n\n'''.format(begin_comment, end_comment))\n\n        fout.write('{} This file has been auto-generated {}\\n\\n'.\\\n            format(begin_comment, end_comment))\n\n    return io.open(filename, mode='a', encoding='utf-8')\n\n# -----------------------------------------------------------------------------\n# clang-format\n\ndef clang_format(opts, filename, cuda=False):\n    with io.open(filename, 'a', encoding='utf-8') as fout:\n        fout.write('\\n')\n    if not opts.enable_clang_format:\n        # TODO: not sure if needed to implement a smarter call to clang-format\n        if cuda:\n            os.system('clang-format -style=\"{{ Standard: Cpp11 }}\" -i {}'. 
\\\n                      format(filename))\n        else:\n            os.system('clang-format -style=\"{{ Standard: Cpp03 }}\" -i {}'. \\\n                      format(filename))\n    if cuda:\n        shutil.copyfile(filename, filename[:-4] + '.cu')\n\n# -----------------------------------------------------------------------------\n# Not implemented response\n\nNOT_IMPLEMENTED = 'abort();'\n\n# -----------------------------------------------------------------------------\n# C/C++ comment hbar\n\nhbar = '/* ' + ('-' * 73) + ' */'\n\n# -----------------------------------------------------------------------------\n# Convert constants for operators\n\nOUTPUT_TO_SAME_TYPE       = 0\nOUTPUT_TO_SAME_SIZE_TYPES = 1\nOUTPUT_TO_UP_TYPES        = 2\nOUTPUT_TO_DOWN_TYPES      = 3\n\n# -----------------------------------------------------------------------------\n# SIMD type\n\nx86_simds = [\n    'sse2',\n    'sse42',\n    'avx',\n    'avx2',\n    'avx512_knl',\n    'avx512_skylake',\n]\n\narm_simds = [\n    'neon128',\n    'aarch64',\n    'sve',\n    'sve128',\n    'sve256',\n    'sve512',\n    'sve1024',\n    'sve2048'\n]\n\nppc_simds = [\n    'vmx',\n    'vsx',\n]\n\nsimds = ['cpu'] + x86_simds + arm_simds + ppc_simds\n\nsimds_deps = {\n    'cpu': ['cpu'],\n    'sse2': ['cpu', 'sse2'],\n    'sse42': ['cpu', 'sse2', 'sse42'],\n    'avx': ['cpu', 'sse2', 'sse42', 'avx'],\n    'avx2': ['cpu', 'sse2', 'sse42', 'avx', 'avx2'],\n    'fma4': [],\n    'avx512_knl': ['cpu', 'sse2', 'sse42', 'avx', 'avx2', 'avx512_knl'],\n    'avx512_skylake': ['cpu', 'sse2', 'sse42', 'avx', 'avx2', 'avx512_skylake'],\n    'neon128': ['cpu', 'neon128'],\n    'aarch64': ['cpu', 'aarch64'],\n    'sve': ['cpu', 'aarch64', 'sve'],\n    'sve128': ['cpu', 'aarch64', 'sve128'],\n    'sve256': ['cpu', 'aarch64', 'sve256'],\n    'sve512': ['cpu', 'aarch64', 'sve512'],\n    'sve1024': ['cpu', 'aarch64', 'sve1024'],\n    'sve2048': ['cpu', 'aarch64', 'sve2048'],\n    'vmx': ['cpu', 'vmx'],\n    'vsx': ['cpu', 
'vmx', 'vsx']\n}\n\nftypes = ['f64', 'f32', 'f16']\nftypes_no_f16 = ['f64', 'f32']\nitypes = ['i64', 'i32', 'i16', 'i8']\nutypes = ['u64', 'u32', 'u16', 'u8']\niutypes = itypes + utypes\ntypes = ftypes + iutypes\n\ndef logical(typ):\n    return 'l{}'.format(typ)\n\nsigned_type = {\n    'i8': 'i8',\n    'u8': 'i8',\n    'i16': 'i16',\n    'u16': 'i16',\n    'i32': 'i32',\n    'u32': 'i32',\n    'i64': 'i64',\n    'u64': 'i64',\n    'f16': 'f16',\n    'f32': 'f32',\n    'f64': 'f64'\n}\n\nbitfield_type = {\n    'i8': 'u8',\n    'u8': 'u8',\n    'i16': 'u16',\n    'u16': 'u16',\n    'i32': 'u32',\n    'u32': 'u32',\n    'i64': 'u64',\n    'u64': 'u64',\n    'f16': 'u16',\n    'f32': 'u32',\n    'f64': 'u64'\n}\n\nin0 = 'a0'\nin1 = 'a1'\nin2 = 'a2'\nin3 = 'a3'\nin4 = 'a4'\nin5 = 'a5'\n\nCPU_NBITS = 128\n\nif CPU_NBITS != 128:\n    raise ValueError('CPU_NBITS must be 128')\n\ndef get_arg(i):\n    fmtspec = { 'in0': in0, 'in1': in1, 'in2': in2, 'in3': in3, 'in4': in4,\n                'in5': in5 }\n    return '{{in{}}}'.format(i).format(**fmtspec)\n\ndef get_args(n):\n    fmtspec = { 'in0': in0, 'in1': in1, 'in2': in2, 'in3': in3, 'in4': in4,\n                'in5': in5 }\n    return ', '.join(['{{in{}}}'.format(i).format(**fmtspec) \\\n                      for i in range(0, n)])\n\ndef get_simds_deps_from_opts(opts):\n    simds = set()\n    for simd1 in opts.simd:\n        for simd2 in simds_deps[simd1]:\n            simds.add(simd2)\n    return simds\n\ndef bitsize(typ):\n    if not (typ in types):\n        raise ValueError('Unknown type \"{}\"'.format(typ))\n    return int(typ[1:])\n\ndef sizeof(typ):\n    return bitsize(typ) // 8\n\ndef ilog2(x):\n    if x <= 0:\n        return None\n    for i in range(0, x):\n        if 2 ** (i + 1) > x:\n            return i\n\n#def get_same_size_types(typ):\n#    nbits = typ[1:]\n#    if typ in ['i8' ,'u8']:\n#        return ['i8', 'u8']\n#    else:\n#        return ['i' + nbits, 'u' + nbits, 'f' + nbits]\n\ndef 
get_output_types(from_typ, output_to):\n    if output_to == OUTPUT_TO_SAME_TYPE:\n        return [from_typ]\n    else:\n        nbits = from_typ[1:]\n        if output_to == OUTPUT_TO_SAME_SIZE_TYPES:\n            if from_typ in ['i8' ,'u8']:\n                return ['i8', 'u8']\n            else:\n                return ['i' + nbits, 'u' + nbits, 'f' + nbits]\n        elif output_to == OUTPUT_TO_UP_TYPES:\n            if nbits == '64':\n                raise ValueError('No uptype for ' + from_typ)\n            else:\n                n = str(int(nbits) * 2)\n                return ['i' + n, 'u' + n, 'f' + n]\n        elif output_to == OUTPUT_TO_DOWN_TYPES:\n            n = str(int(nbits) // 2)\n            if nbits == '8':\n                raise ValueError('No downtype for ' + from_typ)\n            elif nbits == '16':\n                return ['i' + n, 'u' + n]\n            else:\n                return ['i' + n, 'u' + n, 'f' + n]\n        else:\n            raise ValueError('Invalid argument for \"output_to\": {}'. 
\\\n                             format(output_to))\n\n# -----------------------------------------------------------------------------\n# mkdir -p (avoid a dependency for just one function)\n\ndef mkdir_p(path):\n    if os.path.isdir(path):\n        return path\n    head, tail = os.path.split(path)\n    if head != '':\n        mkdir_p(head)\n    os.mkdir(path)\n    return path\n\n# -----------------------------------------------------------------------------\n# Replacement of enumerate\n\ndef enum(l):\n    ret = []\n    for i in range(0, len(l)):\n        ret.append([i, l[i]])\n    return ret\n\n# -----------------------------------------------------------------------------\n# List of supported SIMD operators/functions\n\n# v   = SIMD vector parameter\n# vi  = SIMD vector of signed integers parameter\n# vx2 = struct of 2 SIMD vector parameters\n# vx3 = struct of 3 SIMD vector parameters\n# vx4 = struct of 4 SIMD vector parameters\n# l   = SIMD vector of logicals parameter\n# s   = Scalar parameter\n# *   = Pointer to scalar parameter\n# c*  = Pointer to const scalar parameter\n# _   = void (only for return type)\n# p   = Parameter (int)\n\n\n# -----------------------------------------------------------------------------\n# Type generators\n\ndef get_one_type_generic(param, typ):\n    if param == '_':\n        return 'void'\n    elif param == 'p':\n        return 'int'\n    elif param == 's':\n        return typ\n    elif param == '*':\n        return '{}*'.format(typ)\n    elif param == 'c*':\n        return '{} const*'.format(typ)\n    elif param == 'vi':\n        return 'vi{}'.format(typ[1:])\n    elif param == 'v':\n        return 'v{}'.format(typ)\n    elif param == 'vx2':\n        return 'v{}x2'.format(typ)\n    elif param == 'vx3':\n        return 'v{}x3'.format(typ)\n    elif param == 'vx4':\n        return 'v{}x4'.format(typ)\n    elif param == 'l':\n        return 'vl{}'.format(typ)\n    else:\n        raise ValueError(\"Unknown param 
'{}'\".format(param))\n\ndef get_one_type_specific(param, ext, typ):\n    if param == '_':\n        return 'void'\n    elif param == 'p':\n        return 'int'\n    elif param == 's':\n        return typ\n    elif param == '*':\n        return '{}*'.format(typ)\n    elif param == 'c*':\n        return '{} const*'.format(typ)\n    elif param == 'vi':\n        return 'nsimd_{}_vi{}'.format(ext, typ[1:])\n    elif param == 'v':\n        return 'nsimd_{}_v{}'.format(ext, typ)\n    elif param == 'vx2':\n        return 'nsimd_{}_v{}x2'.format(ext, typ)\n    elif param == 'vx3':\n        return 'nsimd_{}_v{}x3'.format(ext, typ)\n    elif param == 'vx4':\n        return 'nsimd_{}_v{}x4'.format(ext, typ)\n    elif param == 'l':\n        return 'nsimd_{}_vl{}'.format(ext, typ)\n    else:\n        raise ValueError(\"Unknown param '{}'\".format(param))\n\ndef get_one_type_pack(param, inout, N):\n    if param == '_':\n        return 'void'\n    if param == 'p':\n        return 'int'\n    if param == '*':\n        return 'T*'\n    if param == 'c*':\n        return 'T const*'\n    if param == 's':\n        return 'T'\n    if param in ['v', 'vx2', 'vx3', 'vx4']:\n        if inout == 0:\n            return 'pack<T, {}, SimdExt> const&'.format(N)\n        else:\n            return 'pack<T, {}, SimdExt>'.format(N)\n    if param == 'vi':\n        if inout == 0:\n            return 'pack<typename traits<T>::itype, {}, SimdExt> const&'. 
\\\n                   format(N)\n        else:\n            return 'pack<typename traits<T>::itype, {}, SimdExt>'.format(N)\n    if param == 'l':\n        if inout == 0:\n            return 'packl<T, {}, SimdExt> const&'.format(N)\n        else:\n            return 'packl<T, {}, SimdExt>'.format(N)\n    raise ValueError(\"Unknown param '{}'\".format(param))\n\ndef get_one_type_generic_adv_cxx(param, T, N):\n    if param == '_':\n        return 'void'\n    elif param == 'p':\n        return 'int'\n    elif param == '*':\n        return '{}*'.format(T)\n    elif param == 'c*':\n        return '{} const*'.format(T)\n    elif param == 's':\n        return T\n    elif param == 'v':\n        return 'pack<{}, {}, SimdExt>'.format(T, N)\n    elif param == 'vi':\n        return 'pack<i{}, {}, SimdExt>'.format(T[1:], N)\n    elif param == 'vx2':\n        return 'packx2<{}, {}, SimdExt>'.format(T, N)\n    elif param == 'vx3':\n        return 'packx3<{}, {}, SimdExt>'.format(T, N)\n    elif param == 'vx4':\n        return 'packx4<{}, {}, SimdExt>'.format(T, N)\n    elif param == 'l':\n        return 'packl<{}, {}, SimdExt>'.format(T, N)\n    else:\n        raise ValueError('Unknown param: \"{}\"'.format(param))\n\ndef get_one_type_scalar(param, t):\n    if param == '_':\n        return 'void'\n    elif param in ['p', 'l']:\n        return 'int'\n    elif param in ['s', 'v']:\n        return t\n    else:\n        raise ValueError('Unknown param: \"{}\"'.format(param))\n\ndef get_first_discriminating_type(params):\n    for i in range(len(params)):\n        if params[i] in ['v', 'l', 'vx2', 'vx3', 'vx4']:\n            return i\n    return -1\n\n# -----------------------------------------------------------------------------\n# Formats\n\ndef pprint_lines(what):\n    return '\\n'.join(what)\n\ndef pprint_commas(what):\n    return ', '.join(what)\n\ndef pprint_includes(what):\n    return pprint_lines('#include {}'.format(i) for i in what)\n\n# 
-----------------------------------------------------------------------------\n# Function parsing signatures\n\ndef parse_signature(signature):\n    l = signature.split(' ');\n    name = l[1]\n    if len(l) > 2:\n        params = [l[0]] + l[2:]\n    else:\n        params = [l[0]]\n\n    return (name, params)\n\n# -----------------------------------------------------------------------------\n# Load platforms\n\ndef get_platforms(opts):\n    if opts.platforms_list != None:\n        return opts.platforms_list\n    ret = dict()\n    path = opts.script_dir\n    myprint(opts, 'Searching platforms in \"{}\"'.format(path))\n    for mod_file in os.listdir(path):\n        if mod_file[-3:] == '.py' and mod_file[0:9] == 'platform_':\n            mod_name = mod_file[:-3]\n            myprint(opts, 'Found new platform: {}'.format(mod_name[9:]))\n            ret[mod_name[9:]] = __import__(mod_name)\n    opts.platforms_list = ret\n    return ret\n\n# -----------------------------------------------------------------------------\n# Find modules\n\ndef get_modules(opts):\n    if opts.modules_list != None:\n        return opts.modules_list\n    ret = dict()\n    # We have one module by directory\n    path = os.path.join(opts.script_dir, 'modules')\n    myprint(opts, 'Searching modules in \"{}\"'.format(path))\n    for module_dir in os.listdir(path):\n        if (not os.path.isdir(os.path.join(path, module_dir))) or \\\n           module_dir == '.' or module_dir == '..' 
or \\\n           (not os.path.exists(os.path.join(path, module_dir, 'hatch.py'))):\n            continue\n        myprint(opts, 'Found new module: {}'.format(module_dir))\n        mod = __import__('modules.{}.hatch'.format(module_dir))\n        ret[module_dir] = mod\n    opts.modules_list = ret\n    return ret\n\n# -----------------------------------------------------------------------------\n# Integer limits per type using macros defined in <limits.h> or <climits>\n\nlimits = {\n    'i8':   {'min': 'NSIMD_I8_MIN',     'max': 'NSIMD_I8_MAX'   },\n    'i16':  {'min': 'NSIMD_I16_MIN',    'max': 'NSIMD_I16_MAX'  },\n    'i32':  {'min': 'NSIMD_I32_MIN',    'max': 'NSIMD_I32_MAX'  },\n    'i64':  {'min': 'NSIMD_I64_MIN',    'max': 'NSIMD_I64_MAX'  },\n    'u8':   {'min': 'NSIMD_U8_MIN',     'max': 'NSIMD_U8_MAX'   },\n    'u16':  {'min': 'NSIMD_U16_MIN',    'max': 'NSIMD_U16_MAX'  },\n    'u32':  {'min': 'NSIMD_U32_MIN',    'max': 'NSIMD_U32_MAX'  },\n    'u64':  {'min': 'NSIMD_U64_MIN',    'max': 'NSIMD_U64_MAX'  }\n  }\n\n# -----------------------------------------------------------------------------\n# Misc\n\ndef ext_from_lang(lang):\n    return 'c' if lang == 'c_base' else 'cpp'\n\ndef nsimd_category(category):\n    return 'nsimd_' + category\n\n# ------------------------------------------------------------------------------\n# Doc common\n\ndef to_filename(op_name):\n    valid = string.ascii_letters + string.digits\n    ret = ''\n    for c in op_name:\n        ret += '-' if c not in valid else c\n    return ret\n\ndef get_markdown_dir(opts):\n    return os.path.join(opts.script_dir, '..', 'doc', 'markdown')\n\ndef get_markdown_api_file(opts, name, module=''):\n    root = get_markdown_dir(opts)\n    op_name = to_filename(name)\n    if module == '':\n        return os.path.join(root, 'api_{}.md'.format(op_name))\n    else:\n        return os.path.join(root, 'module_{}_api_{}.md'.format(module, op_name))\n\ndef get_markdown_file(opts, name, module=''):\n    root =  
get_markdown_dir(opts)\n    op_name = to_filename(name)\n    if module == '':\n        return os.path.join(root, '{}.md'.format(op_name))\n    else:\n        return os.path.join(root, 'module_{}_{}.md'.format(module, op_name))\n\n"
  },
  {
    "path": "egg/cuda.py",
    "content": "# Copyright (c) 2020 Agenium Scale\n#\n# Permission is hereby granted, free of charge, to any person obtaining a copy\n# of this software and associated documentation files (the \"Software\"), to deal\n# in the Software without restriction, including without limitation the rights\n# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell\n# copies of the Software, and to permit persons to whom the Software is\n# furnished to do so, subject to the following conditions:\n#\n# The above copyright notice and this permission notice shall be included in all\n# copies or substantial portions of the Software.\n#\n# THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\n# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\n# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\n# SOFTWARE.\n\nimport common\nimport scalar\n\nfmtspec = dict()\n\n# -----------------------------------------------------------------------------\n# NVIDIA doc on f16 can be found at\n# https://docs.nvidia.com/cuda/cuda-math-api/group__CUDA__MATH__INTRINSIC__HALF.html\n\ndef get_impl_f16(operator, totyp, typ):\n    if operator.name == 'round_to_even':\n        arch53_code = 'return hrint({in0});'.format(**fmtspec)\n    elif operator.name in ['rec', 'rec8', 'rec11']:\n        arch53_code = 'return hrcp({in0});'.format(**fmtspec)\n    elif operator.name in ['rsqrt8', 'rsqrt11']:\n        arch53_code = 'return hrsqrt({in0});'.format(**fmtspec)\n    elif operator.name in ['fma', 'fms', 'fnma', 'fnms']:\n        neg = '-' if operator.name in ['fnma, fnms'] else ''\n        op = '-' if operator.name in ['fnms, fms'] else ''\n        arch53_code = 'return __hfma({neg}{in0}, 
{in1}, {op}{in2});'. \\\n                      format(neg=neg, op=op, **fmtspec)\n    elif operator.name in ['min', 'max']:\n        intr = '__hlt' if operator.name == 'min' else '__hgt'\n        arch53_code = '''if ({intr}) {{\n                           return {in0};\n                         }} else {{\n                           return {in1};\n                         }}'''.format(intr=intr, **fmtspec)\n    elif operator.name in ['adds', 'subs']:\n        arch53_code = 'return __h{op}({in0}, {in1});'. \\\n                      format(op=operator.name[:-1], **fmtspec)\n    else:\n        args = ', '.join(['{{in{}}}'.format(i).format(**fmtspec) \\\n                          for i in range(len(operator.params[1:]))])\n        # Some f16 functions are not prefixed by `__`\n        not_prefixed = ['ceil', 'floor', 'trunc', 'sqrt']\n        if operator.name in not_prefixed:\n            arch53_code = 'return h{}({});'.format(operator.name, args)\n        else:\n            arch53_code = 'return __h{}({});'.format(operator.name, args)\n    args = ', '.join(['__half2float({{in{}}})'.format(i).format(**fmtspec) \\\n                      for i in range(len(operator.params[1:]))])\n    if operator.params[0] == 'l':\n        emul = 'return gpu_{}({});'.format(operator.name, args)\n    else:\n        emul = 'return __float2half(gpu_{}({}));'.format(operator.name, args)\n    return '''#if __CUDA_ARCH__ >= 530\n                {arch53_code}\n              #else\n                {emul}\n              #endif'''.format(arch53_code=arch53_code, emul=emul)\n\n# -----------------------------------------------------------------------------\n# Reinterprets on CUDA have intrinsics\n\ndef reinterpret(totyp, typ):\n    if typ == totyp:\n        return 'return {in0};'.format(**fmtspec)\n    cuda_typ = { 'i16': 'short',\n                 'u16': 'ushort',\n                 'f16': 'half',\n                 'i32': 'int',\n                 'u32': 'uint',\n                 'f32': 'float',\n    
             'f64': 'double',\n                 'i64': 'longlong' }\n    if typ in cuda_typ and totyp in cuda_typ and \\\n       ((typ in common.ftypes and totyp in common.iutypes) or \\\n        (typ in common.iutypes and totyp in common.ftypes)):\n        return 'return __{typ2}_as_{totyp2}({in0});'. \\\n               format(typ2=cuda_typ[typ], totyp2=cuda_typ[totyp], **fmtspec)\n    else:\n        return '''{totyp} ret;\n                  memcpy((void *)&ret, (void *)&{in0}, sizeof({in0}));\n                  return ret;'''.format(**fmtspec)\n\n# -----------------------------------------------------------------------------\n\ndef get_impl(operator, totyp, typ):\n\n    global fmtspec\n\n    fmtspec = {\n      'in0': common.in0,\n      'in1': common.in1,\n      'in2': common.in2,\n      'typ': typ,\n      'totyp': totyp,\n      'typnbits': typ[1:]\n    }\n\n    # src operators\n    if operator.src:\n        cuda_ops = {\n          'sin_u35': 'sin',\n          'cos_u35': 'cos',\n          'tan_u35': 'tan',\n          'asin_u35': 'asin',\n          'acos_u35': 'acos',\n          'atan_u35': 'atan',\n          'atan2_u35': 'atan2',\n          'log_u35': 'log',\n          'cbrt_u35': 'cbrt',\n          'sin_u10': 'sin',\n          'cos_u10': 'cos',\n          'tan_u10': 'tan',\n          'asin_u10': 'asin',\n          'acos_u10': 'acos',\n          'atan_u10': 'atan',\n          'atan2_u10': 'atan2',\n          'log_u10': 'log',\n          'cbrt_u10': 'cbrt',\n          'exp_u10': 'exp',\n          'pow_u10': 'pow',\n          'sinh_u10': 'sinh',\n          'cosh_u10': 'cosh',\n          'tanh_u10': 'tanh',\n          'sinh_u35': 'sinh',\n          'cosh_u35': 'cosh',\n          'tanh_u35': 'tanh',\n          'asinh_u10': 'asinh',\n          'acosh_u10': 'acosh',\n          'atanh_u10': 'atanh',\n          'exp2_u10': 'exp2',\n          'exp2_u35': 'exp2',\n          'exp10_u10': 'exp10',\n          'exp10_u35': 'exp10',\n          'expm1_u10': 'expm1',\n          
'log10_u10': 'log10',\n          'log2_u10': 'log2',\n          'log2_u35': 'log2',\n          'log1p_u10': 'log1p',\n          'sinpi_u05': 'sinpi',\n          'cospi_u05': 'cospi',\n          'hypot_u05': 'hypot',\n          'hypot_u35': 'hypot',\n          'remainder': 'remainder',\n          'fmod': 'fmod',\n          'lgamma_u10': 'lgamma',\n          'tgamma_u10': 'tgamma',\n          'erf_u10': 'erf',\n          'erfc_u15': 'erfc'\n        }\n        args = common.get_args(len(operator.params[1:]))\n        cuda_op = cuda_ops[operator.name]\n        if typ == 'f16':\n            # For f16 CUDA offers only a few operator\n            if cuda_op in ['cos', 'exp', 'exp10', 'exp2', 'log', 'log10',\n                           'log2', 'sin']:\n                return '''#if __CUDA_ARCH__ >= 530\n                            return h{}({});\n                          #else\n                            return __float2half(gpu_{}(__half2float({})));\n                          #endif'''.format(cuda_op, args, operator.name, args)\n            else:\n                args = ', '.join('__half2float({})'.format(common.get_arg(i)) \\\n                                 for i in range(len(operator.params[1:])))\n                return 'return __float2half(gpu_{}({}));'. 
\\\n                       format(operator.name, args)\n        elif typ == 'f32':\n            return 'return {}f({});'.format(cuda_op, args)\n        else:\n            return 'return {}({});'.format(cuda_op, args)\n\n    # bool first, no special treatment for f16's\n    bool_operators = {\n        'andl': 'return {in0} && {in1};',\n        'orl': 'return {in0} || {in1};',\n        'xorl': 'return {in0} ^ {in1};',\n        'andnotl': 'return {in0} && (!{in1});',\n        'notl': 'return !{in0};',\n    }\n    if operator.name in bool_operators:\n        return bool_operators[operator.name].format(**fmtspec)\n    # infix operators that needs type punning, no special treatment for f16's\n    def pun_code(code, arity, typ):\n        if typ in common.utypes:\n            return 'return ' + code.format(**fmtspec) + ';'\n        utyp = common.bitfield_type[typ]\n        to_utyp = '\\n'.join(\n                  ['''{utyp} buf{i};\n                      memcpy(&buf{i}, &{{in{i}}}, sizeof({{in{i}}}));'''. 
\\\n                      format(i=i, utyp=utyp).format(**fmtspec) \\\n                      for i in range(arity)])\n        return '''{to_utyp}\n                  {utyp} tmp = {code};\n                  {typ} ret;\n                  memcpy(&ret, &tmp, sizeof(tmp));\n                  return ret;'''.format(to_utyp=to_utyp, utyp=utyp, typ=typ,\n                                        code=code.format(in0='buf0',\n                                                         in1='buf1'))\n    pun_operators = {\n        'orb': lambda: pun_code('{in0} | {in1}', 2, typ),\n        'andb': lambda: pun_code('{in0} & {in1}', 2, typ),\n        'andnotb': lambda: pun_code('{in0} & (~{in1})', 2, typ),\n        'notb': lambda: pun_code('~{in0}', 1, typ),\n        'xorb': lambda: pun_code('{in0} ^ {in1}', 2, typ),\n    }\n    if operator.name in pun_operators:\n        return pun_operators[operator.name]()\n    # reinterpret\n    if operator.name == 'reinterpret':\n        return reinterpret(totyp, typ)\n    # cvt\n    if operator.name == 'cvt':\n        return 'return ({totyp}){in0};'.format(**fmtspec)\n    # to_mask\n    if operator.name == 'to_mask':\n        if typ in common.utypes:\n            return 'return ({typ})({in0} ? -1 : 0);'.format(**fmtspec)\n        return 'return gpu_reinterpret({typ}(), ({utyp})({in0} ? -1 : 0));'. \\\n               format(utyp=common.bitfield_type[typ], **fmtspec)\n    # to_logical\n    if operator.name == 'to_logical':\n        if typ in common.iutypes:\n            return 'return {in0} == ({typ})0 ? false : true;'.format(**fmtspec)\n        return '''return gpu_reinterpret({utyp}(), {in0}) == ({utyp})0\n                         ? false : true ;'''. 
\\\n                         format(utyp=common.bitfield_type[typ], **fmtspec)\n    # for all other operators, f16 has a special treatment\n    if typ == 'f16':\n        return get_impl_f16(operator, totyp, typ)\n    # then deal with f32's operators\n    # first infix operators\n    c_operators = {\n        'add': 'return ({typ})({in0} + {in1});',\n        'sub': 'return ({typ})({in0} - {in1});',\n        'mul': 'return ({typ})({in0} * {in1});',\n        'div': 'return ({typ})({in0} / {in1});',\n        'neg': 'return ({typ})(-{in0});',\n        'rec': 'return 1.0{f} / {in0};',\n        'rec8': 'return 1.0{f} / {in0};',\n        'rec11': 'return 1.0{f} / {in0};',\n        'lt': 'return {in0} < {in1};',\n        'gt': 'return {in0} > {in1};',\n        'le': 'return {in0} <= {in1};',\n        'ge': 'return {in0} >= {in1};',\n        'ne': 'return {in0} != {in1};',\n        'eq': 'return {in0} == {in1};',\n        'shl': 'return ({typ})({in0} << {in1});',\n    }\n    if operator.name in c_operators:\n        return c_operators[operator.name]. \\\n               format(f='f' if typ == 'f32' else '', **fmtspec)\n    # right shifts\n    if operator.name in ['shr', 'shra']:\n        if typ in common.utypes:\n            return 'return ({typ})({in0} >> {in1});'.format(**fmtspec)\n        if operator.name == 'shr':\n            return \\\n            '''return gpu_reinterpret({typ}(), ({utyp})(\n                          gpu_reinterpret({utyp}(), {in0}) >> {in1}));'''. 
\\\n                          format(utyp=common.bitfield_type[typ], **fmtspec)\n        # getting here means shra on signed types\n        return \\\n        '''if ({in1} == 0) {{\n             return {in0};\n           }}\n           if ({in0} >= 0) {{\n             return gpu_reinterpret({typ}(), ({utyp})(\n                        gpu_reinterpret({utyp}(), {in0}) >> {in1}));\n           }} else {{\n             {utyp} mask = ({utyp})((({utyp})-1) << ({typnbits} - {in1}));\n             return gpu_reinterpret({typ}(), (({utyp})(mask |\n                      ({utyp})(gpu_reinterpret({utyp}(), {in0}) >> {in1}))));\n           }}'''.format(utyp=common.bitfield_type[typ], **fmtspec)\n    # adds\n    if operator.name == 'adds':\n        if typ in common.ftypes:\n            return c_operators['add'].format(**fmtspec)\n        else:\n            return scalar.get_impl(operator, totyp, typ)\n    # subs\n    if operator.name == 'subs':\n        if typ in common.ftypes:\n            return c_operators['sub'].format(**fmtspec)\n        elif typ in common.utypes:\n            return scalar.get_impl(operator, totyp, typ)\n        else:\n            return 'return nsimd::gpu_adds({in0}, ({typ})(-{in1}));'. \\\n                   format(**fmtspec)\n    # fma's\n    if operator.name in ['fma', 'fms', 'fnma', 'fnms']:\n        neg = '-' if operator.name in ['fnma, fnms'] else ''\n        op = '-' if operator.name in ['fnms, fms'] else ''\n        if typ in common.ftypes:\n            return 'return fma{f}({neg}{in0}, {in1}, {op}{in2});'. \\\n                   format(f='f' if typ == 'f32' else '', neg=neg, op=op,\n                          **fmtspec)\n        else:\n            return 'return {neg}{in0} * {in1} + ({op}{in2});'. 
\\\n                   format(neg=neg, op=op, **fmtspec)\n    # other operators\n    if typ in common.iutypes:\n        if operator.name in ['round_to_even', 'ceil', 'floor', 'trunc']:\n            return 'return {in0};'.format(**fmtspec)\n        elif operator.name == 'min':\n            return 'return ({typ})({in0} < {in1} ? {in0} : {in1});'. \\\n                   format(**fmtspec)\n        elif operator.name == 'max':\n            return 'return ({typ})({in0} > {in1} ? {in0} : {in1});'. \\\n                   format(**fmtspec)\n        elif operator.name == 'abs':\n            return 'return ({typ})({in0} > 0 ? {in0} : -{in0});'. \\\n                   format(**fmtspec)\n    else:\n        cuda_name = {\n            'round_to_even': 'rint',\n            'min': 'fmin',\n            'max': 'fmax',\n            'abs': 'fabs',\n            'ceil': 'ceil',\n            'floor': 'floor',\n            'trunc': 'trunc',\n            'rsqrt8': 'rsqrt',\n            'rsqrt11': 'rsqrt'\n        }\n        args = ', '.join(['{{in{}}}'.format(i).format(**fmtspec) \\\n                          for i in range(len(operator.args))])\n        return 'return {name}{f}({args});'. \\\n               format(name=cuda_name[operator.name] \\\n                      if operator.name in cuda_name else operator.name,\n                      f='f' if typ == 'f32' else '', args=args)\n"
  },
  {
    "path": "egg/experiments/gen_sleef_operators.py",
    "content": "# Copyright (c) 2021 Agenium Scale\n#\n# Permission is hereby granted, free of charge, to any person obtaining a copy\n# of this software and associated documentation files (the \"Software\"), to deal\n# in the Software without restriction, including without limitation the rights\n# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell\n# copies of the Software, and to permit persons to whom the Software is\n# furnished to do so, subject to the following conditions:\n#\n# The above copyright notice and this permission notice shall be included in all\n# copies or substantial portions of the Software.\n#\n# THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\n# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\n# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\n# SOFTWARE.\n\nimport os\n\nscript_dir = os.path.dirname(os.path.realpath(__file__))\nsleef_dir = os.path.join(script_dir, '..', '..', '_deps-sleef')\nsleef_version = '3.5.1'\n\nfuncproto = os.path.join(sleef_dir, 'sleef-{}'.format(sleef_version),\n                         'src', 'libm', 'funcproto.h')\n\nulp_suffix = {\n    '0' : '',\n    '1' : '_u1',\n    '2' : '_u05',\n    '3' : '_u35',\n    '4' : '_u15',\n    '5' : '_u3500'\n}\n\nfunc_type = {\n    '0' : 'v {} v',\n    '1' : 'v {} v v',\n    '2' : 'vx2 {} v',\n    '3' : 'v {} v p',\n    '4' : 'v {} v',\n    '5' : 'v {} v v v',\n    '6' : 'vx2 {} v',\n    '7' : 'p {} p',\n    '8' : '* {} p'\n}\n\nprops = {\n    'cos' : ['cosine', 'DocTrigo', 'R'],\n    'sin' : ['sine', 'DocTrigo', 'R'],\n    'fastcos' : ['cosine', 'DocTrigo', 'R'],\n    'fastsin' : ['sine', 'DocTrigo', 'R'],\n    'cospi' : ['cosine of multiple of pi 
argument', 'DocTrigo', 'R'],\n    'sinpi' : ['sine of multiple of pi argument', 'DocTrigo', 'R'],\n    'tan' : ['tangent', 'DocTrigo', 'R\\{(z+0.5)*pi}'],\n    'acos' : ['arc cosine', 'DocTrigo', '(-1,1)'],\n    'asin' : ['arc sine', 'DocTrigo', '(-1,1)'],\n    'atan' : ['arc tangent', 'DocTrigo', 'R'],\n    'atan2' : ['arc tangent', 'DocTrigo', 'RxR'],\n\n    'log' : ['natural logarithmic', 'DocExpLog', '(0,Inf)'],\n    'log2' : ['base-2 logarithmic', 'DocExpLog', '(0,Inf)'],\n    'log10' : ['base-10 logarithmic', 'DocExpLog', '(0,Inf)'],\n    'log1p' : ['logarithm of one plus argument', 'DocExpLog', '(-1,Inf)'],\n    'exp' : ['exponential', 'DocExpLog', 'R'],\n    'exp2' : ['base-2 exponential', 'DocExpLog', 'R'],\n    'exp10' : ['base-10 exponential', 'DocExpLog', 'R'],\n    'expm1' : ['exponential minus 1', 'DocExpLog', 'R'],\n    'pow' : ['power', 'DocExpLog', 'RxR'],\n    'fastpow' : ['power', 'DocExpLog', 'RxR'],\n\n    'cbrt' : ['cubic root', 'DocBasicArithmetic', 'R'],\n    'hypot' : ['hypotenuse', 'DocBasicArithmetic', 'RxR'],\n\n    'sinh': ['hyperbolic sine', 'DocHyper', 'R'],\n    'cosh': ['hyperbolic cosine', 'DocHyper', 'R'],\n    'tanh': ['hyperbolic tangent', 'DocHyper', 'R'],\n    'asinh': ['hyperbolic arc sine', 'DocHyper', 'R'],\n    'acosh': ['hyperbolic arc cosine', 'DocHyper', '(1,Inf)'],\n    'atanh': ['hyperbolic arc tangent', 'DocHyper', '(-1,1)'],\n\n    'lgamma' : ['log gamma', 'DocMisc', 'R\\{-n}'],\n    'tgamma' : ['gamma', 'DocMisc', 'R\\{-n}'],\n    'erf' : ['error function', 'DocMisc', 'R'],\n    'erfc' : ['complementary error function', 'DocMisc', 'R']\n}\n\nwith open(funcproto, 'r') as fin:\n    for line in fin:\n        if not (line.find('{') != -1 and line.find('}') != -1):\n            continue\n        items = [item.strip() for item in line.strip(' \\n\\r{},').split(',')]\n        items[0] = items[0].strip('\"')\n        if items[0] == 'NULL':\n            break\n        if items[0] not in props:\n            continue\n        
name = items[0] + '_u' + items[1]\n        symbol = 'nsimd_sleef_{}'.format(name)\n        prop = props[items[0]]\n        print('class {}{}(SrcOperator):'. \\\n              format(name[0].upper(), name[1:]))\n        print('  full_name = \\'{}\\''.format(prop[0]))\n        print('  signature = \\'{}\\''.format(func_type[items[3]]) \\\n                                    .format(name))\n        print('  sleef_symbol_prefix = \\'{}\\''.format(symbol))\n        print('  domain = Domain(\\'{}\\')'.format(prop[2]))\n        print('  categories = [{}]'.format(prop[1]))\n        print('  desc = \\'Compute the {} of its argument{} with ' \\\n                 'a precision of {} ulps. For more informations visit ' \\\n                 '<https://sleef.org/purec.xhtml>.\\''.format(prop[0],\n                 's' if items[3] in ['1', '3', '5'] else '',\n                 float(items[1]) / 10.0))\n        print('')\n"
  },
  {
    "path": "egg/experiments/round-ppc.c",
    "content": "#include <altivec.h>\n#include <stdio.h>\n\nvoid pp(const char *prefix, FILE *out, float buf[4]) {\n  fputs(prefix, out);\n  fputc('{', out);\n  for (int i = 0; i < 4; i++) {\n    fprintf(out, \" %f\", (double)buf[i]);\n  }\n  fputs(\" }\\n\", out);\n}\n\nint main() {\n  float res[4];\n\n  float buf[4];\n  buf[0] = -1.5f;\n  buf[1] = -0.5f;\n  buf[2] = 0.5f;\n  buf[3] = 1.5f;\n  __vector float v = *(__vector float *)buf;\n\n\n  pp(\"   buf = \", stdout, buf);\n\n\n  *(__vector float *)res = vec_round(v);\n  pp(\" round = \", stdout, res);\n\n  *(__vector float *)res = vec_rint(v);\n  pp(\"  rint = \", stdout, res);\n\n  *(__vector float *)res = vec_roundc(v);\n  pp(\"roundc = \", stdout, res);\n\n  return 0;\n}\n"
  },
  {
    "path": "egg/experiments/upcvt-sve.c",
    "content": "#include <stdio.h>\n#include <arm_sve.h>\n\n// armclang -march=armv8+sve egg/experiments/upcvt-sve.c -o ../build/a.out\n\n// ---\n\nint len32() {\n  return (int)svcntp_b32(svptrue_b32(), svptrue_b32());\n}\n\nvoid print32(FILE *out, const char *var, svfloat32_t a) {\n  float buf[2048];\n  svst1_f32(svptrue_b32(), buf, a);\n  fprintf(out, \"%s = \", var);\n  for (int i = 0; i < len32(); i++) {\n    if (i > 0) {\n      fputs(\", \", out);\n    }\n    fprintf(out, \"%f\", (double)buf[i]);\n  }\n  fputc('\\n', stdout);\n}\n\nsvfloat32_t iota32(float i0) {\n  float buf[2048];\n  for (int i = 0; i < len32(); i++) {\n    buf[i] = i0 + (float)i;\n  }\n  return svld1(svptrue_b32(), buf);\n}\n\n// ---\n\nint len64() {\n  return (int)svcntp_b64(svptrue_b64(), svptrue_b64());\n}\n\nvoid print64(FILE *out, const char *var, svfloat64_t a) {\n  double buf[2048];\n  svst1_f64(svptrue_b64(), buf, a);\n  fprintf(out, \"%s = \", var);\n  for (int i = 0; i < len64(); i++) {\n    if (i > 0) {\n      fputs(\", \", out);\n    }\n    fprintf(out, \"%f\", buf[i]);\n  }\n  fputc('\\n', stdout);\n}\n\n\n// ---\n\nint main() {\n  svfloat32_t a = iota32(0.0f);\n  svfloat32_t b = iota32(8.0f);\n  svfloat64_t c = svcvt_f64_f32_z(svptrue_b32(), svzip1_f32(a, a));\n  print32(stdout, \"a \", a);\n  print32(stdout, \"aa\", svzip1_f32(a, a));\n  print64(stdout, \"c \", c);\n  return 0;\n}\n"
  },
  {
    "path": "egg/gen_adv_c_api.py",
    "content": "# Copyright (c) 2021 Agenium Scale\n#\n# Permission is hereby granted, free of charge, to any person obtaining a copy\n# of this software and associated documentation files (the \"Software\"), to deal\n# in the Software without restriction, including without limitation the rights\n# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell\n# copies of the Software, and to permit persons to whom the Software is\n# furnished to do so, subject to the following conditions:\n#\n# The above copyright notice and this permission notice shall be included in all\n# copies or substantial portions of the Software.\n#\n# THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\n# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\n# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\n# SOFTWARE.\n\nimport common\nimport os\nimport operators\n\n# -----------------------------------------------------------------------------\n# Construct C11 types\n\ndef get_c11_types(simd_ext):\n    ret = ''\n    for se in common.simds_deps[simd_ext]:\n        ret += '\\n\\n'.join([\n               '''typedef NSIMD_STRUCT nsimd_pack_{typ}_{se} {{\n                    nsimd_{se}_v{typ} v;\n                  }} nsimd_pack_{typ}_{se};\n\n                  NSIMD_INLINE nsimd_pack_{typ}_{se}\n                  nsimd_make_pack_{typ}_{se}(nsimd_{se}_v{typ} v) {{\n                    return (nsimd_pack_{typ}_{se}){{ v }};\n                  }}'''.format(typ=typ, se=se) for typ in common.types])\n        ret += '\\n\\n'\n        ret += '\\n\\n'.join([\n               '''typedef NSIMD_STRUCT nsimd_packl_{typ}_{se} {{\n                    nsimd_{se}_vl{typ} v;\n                 
 }} nsimd_packl_{typ}_{se};\n\n                  NSIMD_INLINE nsimd_packl_{typ}_{se}\n                  nsimd_make_packl_{typ}_{se}(nsimd_{se}_vl{typ} v) {{\n                    return (nsimd_packl_{typ}_{se}){{ v }};\n                  }}'''.format(typ=typ, se=se) for typ in common.types])\n        for deg in [2, 3, 4]:\n            vs = ', '.join(['v{}'.format(i) for i in range(deg)])\n            avs = ', '.join(['{{a0.v{}}}'.format(i) for i in range(deg)])\n            ret += '\\n\\n'\n            ret += '\\n\\n'.join([\n                   '''typedef NSIMD_STRUCT nsimd_packx{deg}_{typ}_{se} {{\n                        nsimd_pack_{typ}_{se} {vs};\n                      }} nsimd_packx{deg}_{typ}_{se};\n\n                      NSIMD_INLINE nsimd_packx{deg}_{typ}_{se}\n                      nsimd_make_packx{deg}_{typ}_{se}\n                      (nsimd_{se}_v{typ}x{deg} a0) {{\n                        return (nsimd_packx{deg}_{typ}_{se}){{ {avs} }};\n                      }}                      '''. \\\n                      format(typ=typ, se=se, vs=vs, deg=deg, avs=avs) \\\n                      for typ in common.types])\n\n    ret += '\\n\\n'\n    ret += '#define nsimd_make_pack(var, func) ' \\\n           '_Generic(var, \\\\\\n'\n    ret += '\\n'.join([\n           'nsimd_pack_{typ}_{se}: nsimd_make_pack_{typ}_{se}, \\\\'. \\\n           format(typ=typ, se=se) for typ in common.types \\\n                                  for se in common.simds_deps[simd_ext]])\n    ret += '\\n'\n    ret += '\\n'.join([\n           'nsimd_packl_{typ}_{se}: nsimd_make_packl_{typ}_{se}, \\\\'. \\\n           format(typ=typ, se=se) for typ in common.types \\\n                                  for se in common.simds_deps[simd_ext]])\n    ret += '\\n'\n    ret += '\\n'.join([\n           'nsimd_packx{d}_{typ}_{se}: nsimd_make_packx{d}_{typ}_{se}, \\\\'. 
\\\n           format(typ=typ, se=se, d=d) for typ in common.types \\\n                                       for d in [2, 3, 4] \\\n                                       for se in common.simds_deps[simd_ext]])\n    ret += '\\ndefault: nsimd_c11_type_unsupported)(func)'\n\n    ret += '\\n\\n'\n    ret += '\\n'.join([\n           'typedef nsimd_pack_{typ}_{simd_ext} nsimd_pack_{typ};'. \\\n            format(typ=typ, simd_ext=simd_ext) for typ in common.types])\n    ret += '\\n\\n'\n    ret += '\\n'.join([\n           'typedef nsimd_packl_{typ}_{simd_ext} nsimd_packl_{typ};'. \\\n            format(typ=typ, simd_ext=simd_ext) for typ in common.types])\n    ret += '\\n\\n'\n    ret += '\\n'.join([\n           'typedef nsimd_packx{d}_{typ}_{simd_ext} nsimd_packx{d}_{typ};'. \\\n            format(typ=typ, simd_ext=simd_ext, d=d) \\\n            for typ in common.types for d in [2, 3, 4]])\n\n    ret += '\\n\\n'\n    ret += '#define nsimd_c11_pack(var) _Generic((var), \\\\\\n'\n    ret += '\\n'.join([\n           'nsimd_packl_{typ}_{se}: ' \\\n           '((nsimd_pack_{typ}_{se} (*)())NULL)(), \\\\'. \\\n           format(typ=typ, se=se) for typ in common.types \\\n                                  for se in common.simds_deps[simd_ext]])\n    ret += '\\ndefault: NULL)'\n\n    ret += '\\n\\n'\n    ret += '#define nsimd_c11_packl(var) _Generic((var), \\\\\\n'\n    ret += '\\n'.join([\n           'nsimd_pack_{typ}_{se}: ' \\\n           '((nsimd_packl_{typ}_{se} (*)())NULL)(), \\\\'. \\\n           format(typ=typ, se=se) for typ in common.types \\\n                                  for se in common.simds_deps[simd_ext]])\n    ret += '\\ndefault: NULL)'\n\n    ret += '\\n\\n'\n    ret += '#define nsimd_c11_packx2(var) _Generic((var), \\\\\\n'\n    ret += '\\n'.join([\n           'nsimd_pack_{typ}_{se}: ' \\\n           '((nsimd_packx2_{typ}_{se} (*)())NULL)(), \\\\'. 
\\\n           format(typ=typ, se=se) for typ in common.types \\\n                                  for se in common.simds_deps[simd_ext]])\n    ret += '\\ndefault: NULL)'\n\n    return ret\n\n# -----------------------------------------------------------------------------\n# Construct C11 overloads\n\ndef get_c11_overloads(op, simd_ext):\n    if common.get_first_discriminating_type(op.params) == -1:\n        # Only the len operator should go here\n        assert op.name == 'len'\n        ret = '\\n\\n'.join([\n        '''#define NSIMD_C11_LEN_nsimd_pack_{typ}_{se}() \\\\\n                   nsimd_len_{se}_{typ}()\n\n           #define NSIMD_C11_LEN_nsimd_packl_{typ}_{se}() \\\\\n                   nsimd_len_{se}_{typ}()\n\n           #define NSIMD_C11_LEN_nsimd_packx2_{typ}_{se}() \\\\\n                   (2 * nsimd_len_{se}_{typ}())\n\n           #define NSIMD_C11_LEN_nsimd_packx3_{typ}_{se}() \\\\\n                   (3 * nsimd_len_{se}_{typ}())\n\n           #define NSIMD_C11_LEN_nsimd_packx4_{typ}_{se}() \\\\\n                   (4 * nsimd_len_{se}_{typ}())'''.format(typ=typ, se=se) \\\n                   for typ in op.types for se in common.simds_deps[simd_ext]])\n\n        ret += '\\n\\n'\n        ret += '\\n\\n'.join([\n        '''#define NSIMD_C11_LEN_nsimd_pack_{typ}() \\\\\n                   nsimd_len_{simd_ext}_{typ}()\n\n           #define NSIMD_C11_LEN_nsimd_packl_{typ}() \\\\\n                   nsimd_len_{simd_ext}_{typ}()\n\n           #define NSIMD_C11_LEN_nsimd_packx2_{typ}() \\\\\n                   (2 * nsimd_len_{simd_ext}_{typ}())\n\n           #define NSIMD_C11_LEN_nsimd_packx3_{typ}() \\\\\n                   (3 * nsimd_len_{simd_ext}_{typ}())\n\n           #define NSIMD_C11_LEN_nsimd_packx4_{typ}() \\\\\n                   (4 * nsimd_len_{simd_ext}_{typ}())'''. 
\\\n                   format(typ=typ, simd_ext=simd_ext) for typ in common.types])\n        ret += '\\n\\n'\n        ret += '#define nsimd_len(type) \\\\\\n' \\\n               'NSIMD_PP_CAT_2(NSIMD_C11_LEN_, type)()\\n\\n'\n        return ret\n\n    def get_c11_arg(param, name):\n        if param in ['*', 'c*', 's', 'p']:\n            return name\n        elif param in ['v', 'l', 'vi']:\n            return '({}).v'.format(name)\n\n    args = op.params[1:]\n    i0 = common.get_first_discriminating_type(args)\n    if i0 == -1:\n        if op.params[0] == 'v':\n            pack = 'pack'\n        elif op.params[0] == 'l':\n            pack = 'packl'\n        elif op.params[0] == 'vx2':\n            pack = 'packx2'\n        elif op.params[0] == 'vx3':\n            pack = 'packx3'\n        elif op.params[0] == 'vx4':\n            pack = 'packx4'\n        macro_args = ', '.join(['a{}'.format(i) for i in range(len(args))])\n        ret = '\\n\\n'.join([\n        '''#define NSIMD_C11_{OP_NAME}_nsimd_{pack}_{typ}_{se}({macro_args}) \\\\\n                     nsimd_make_{pack}_{typ}_{se}( \\\\\n                       nsimd_{op_name}_{se}_{typ}({macro_args}))'''. \\\n                       format(OP_NAME=op.name.upper(), se=se,\n                              macro_args=macro_args,\n                              op_name=op.name, typ=typ, pack=pack) \\\n                              for typ in op.types \\\n                              for se in common.simds_deps[simd_ext]])\n        ret += '\\n\\n'\n        ret += '\\n\\n'.join([\n        '''#define NSIMD_C11_{OP_NAME}_nsimd_{pack}_{typ}({macro_args}) \\\\\n                     nsimd_make_{pack}_{typ}_{simd_ext}( \\\\\n                       nsimd_{op_name}_{simd_ext}_{typ}({macro_args}))'''. 
\\\n                       format(OP_NAME=op.name.upper(), simd_ext=simd_ext,\n                              macro_args=macro_args, op_name=op.name, typ=typ,\n                              pack=pack) for typ in op.types])\n        ret += '\\n\\n'\n        type_args = ', '.join(['type'] + \\\n                              ['a{}'.format(i) for i in range(len(args))])\n        call_args = ', '.join([get_c11_arg(args[i], 'a{}'.format(i)) \\\n                               for i in range(len(args))])\n        ret += '\\n\\n#define nsimd_{op_name}({type_args})' \\\n               ' NSIMD_PP_CAT_2(NSIMD_C11_{OP_NAME}_, type)({call_args})'. \\\n               format(op_name=op.name, OP_NAME=op.name.upper(),\n                      call_args=call_args, type_args=type_args)\n        return ret\n\n    # Getting here means that i0 >= 0 i.e. that overloads can be determined\n    # by argument i0 of the operator which is in ['v', 'l', 'vx2', 'vx3',\n    # 'vx4']\n\n    macro_args = ['a{}'.format(i) for i in range(len(args))]\n    call_args = ', '.join([get_c11_arg(args[i], 'a{}'.format(i)) \\\n                           for i in range(len(args))])\n    if not op.closed:\n        macro_args = ['to_type'] + macro_args\n    macro_args = ', '.join(macro_args)\n\n    if op.params[0] in ['v', 'l', 'vx2', 'vx3', 'vx4']:\n        if not op.closed:\n            ret = '#define nsimd_{}({}) ' \\\n                  'nsimd_make_pack((((to_type (*)())NULL)()), ' \\\n                  '_Generic(({}), \\\\\\n'. 
\\\n                  format(op.name, macro_args, 'a{}'.format(i0))\n        else:\n            if op.params[0] != args[i0]:\n                if op.params[0] == 'v':\n                    ctrl_expr = 'nsimd_c11_pack(a{})'.format(i0)\n                elif op.params[0] == 'l':\n                    ctrl_expr = 'nsimd_c11_packl(a{})'.format(i0)\n                elif op.params[0] == 'vx2':\n                    ctrl_expr = 'nsimd_c11_packx2(a{})'.format(i0)\n            else:\n                ctrl_expr = 'a{}'.format(i0)\n            ret = '#define nsimd_{}({}) ' \\\n                  'nsimd_make_pack({}, _Generic(({}), \\\\\\n'. \\\n                  format(op.name, macro_args, ctrl_expr, 'a{}'.format(i0))\n    else:\n        ret = '#define nsimd_{}({}) _Generic(({}), \\\\\\n'. \\\n              format(op.name, macro_args, 'a{}'.format(i0))\n\n    suf = { 'v': '', 'l': 'l', 'vx2': 'x2', 'vx3': 'x3', 'vx4': 'x4'}\n\n    arg = args[i0]\n    typ_fmt = 'nsimd_pack{}_{{}}_{{}}'.format(suf[arg])\n\n    for se in common.simds_deps[simd_ext]:\n        for typ in op.types:\n            ret += typ_fmt.format(typ, se) + ': '\n            if op.closed:\n                ret += 'nsimd_{}_{}_{}, \\\\\\n'.format(op.name, se, typ)\n                continue\n            ret += '_Generic(((to_type (*)())NULL)(), \\\\\\n'\n            for to_typ in common.get_output_types(typ, op.output_to):\n                to_pack = 'nsimd_pack{}_{}_{}'. \\\n                          format(suf[op.params[0]], to_typ, se)\n                ret += '  {}: nsimd_{}_{}_{}_{}, \\\\\\n'. 
\\\n                       format(to_pack, op.name, se, to_typ, typ)\n            ret += '  default: nsimd_c11_type_unsupported), \\\\\\n'\n\n    ret += 'default: nsimd_c11_type_unsupported)({})'.format(call_args)\n    if op.params[0] in ['v', 'l', 'vx2', 'vx3', 'vx4']:\n        ret += ')'\n    return ret\n\n# -----------------------------------------------------------------------------\n\ndef doit(opts):\n    common.myprint(opts, 'Generating advanced C API (requires C11)')\n    filename = os.path.join(opts.include_dir, 'c_adv_api_functions.h')\n    if not common.can_create_filename(opts, filename):\n        return\n    with common.open_utf8(opts, filename) as out:\n        out.write('''#ifndef NSIMD_C_ADV_API_FUNCTIONS_H\n                     #define NSIMD_C_ADV_API_FUNCTIONS_H\n\n                     #include <nsimd/nsimd.h>\n\n                     ''')\n\n        for simd_ext in common.simds:\n            out.write('''{hbar}\n                         {hbar}\n                         {hbar}\n\n                         /* {SIMD_EXT} */\n\n                         {hbar}\n                         {hbar}\n                         {hbar}\n\n                         #ifdef NSIMD_{SIMD_EXT}\n\n                         {types}\n\n                         '''.format(hbar=common.hbar,\n                                    types=get_c11_types(simd_ext),\n                                    SIMD_EXT=simd_ext.upper()))\n\n            for op_name, operator in operators.operators.items():\n                out.write('/* {} */\\n\\n{}\\n\\n'. \\\n                          format(op_name, get_c11_overloads(operator,\n                                                            simd_ext)))\n\n            out.write('\\n\\n#endif')\n\n        out.write('\\n\\n{}\\n\\n#endif\\n'.format(common.hbar))\n\n"
  },
  {
    "path": "egg/gen_adv_cxx_api.py",
    "content": "# Copyright (c) 2019 Agenium Scale\n#\n# Permission is hereby granted, free of charge, to any person obtaining a copy\n# of this software and associated documentation files (the \"Software\"), to deal\n# in the Software without restriction, including without limitation the rights\n# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell\n# copies of the Software, and to permit persons to whom the Software is\n# furnished to do so, subject to the following conditions:\n#\n# The above copyright notice and this permission notice shall be included in all\n# copies or substantial portions of the Software.\n#\n# THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\n# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\n# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\n# SOFTWARE.\n\nimport operators\nimport common\nimport os\nfrom datetime import date\nimport sys\n\n# -----------------------------------------------------------------------------\n# Actual implementation\n\ndef get_cxx_advanced_generic(operator):\n    def get_pack(param):\n        if param in ['v', 'vi']:\n            return 'pack'\n        elif param == 'l':\n            return 'pack'\n        else:\n            return 'pack{}'.format(param[1:])\n    args_list = common.enum(operator.params[1:])\n    inter = [i for i in ['v', 'vi', 'l', 'vx1', 'vx2', 'vx3', 'vx4'] \\\n             if i in operator.params[1:]]\n    need_tmpl_pack = get_pack(operator.params[0]) if inter == [] else None\n\n    # Compute parameters passed to the base C++ API functions\n    def var(arg, N):\n        member = 'car' if N == '1' else 'cdr'\n        if arg[1] in ['vi', 'v', 'l']:\n            
return 'a{}.{}'.format(arg[0], member)\n        elif (arg[1] in ['*', 'c*']) and N != '1':\n            return 'a{} + len_'.format(arg[0])\n        else:\n            return 'a{}'.format(arg[0])\n    vars1 = [var(i, '1') for i in args_list] + ['T()'] + \\\n            (['typename ToPackType::value_type()'] if not operator.closed \\\n             else []) + ['SimdExt()']\n    varsN = [var(i, 'N') for i in args_list]\n    other_varsN = ', '.join(['a{}'.format(i[0]) for i in args_list])\n    if other_varsN != '':\n        other_varsN = ', ' + other_varsN\n    if not operator.closed:\n        varsN = ['typename ToPackType::value_type()'] + varsN\n    if need_tmpl_pack != None:\n        varsN = ['{}<T, N - 1, SimdExt>()'.format(need_tmpl_pack)] + varsN\n    vars1 = ', '.join(vars1)\n    varsN = ', '.join(varsN)\n\n    # Compute return type\n    ret1 = 'ToPackType' if not operator.closed \\\n           else common.get_one_type_generic_adv_cxx(operator.params[0],\n                                                    'T', '1')\n    retN = 'ToPackType' if not operator.closed \\\n           else common.get_one_type_generic_adv_cxx(operator.params[0],\n                                                            'T', 'N')\n\n    # Dump C++\n    if operator.params[0] in ['v', 'vi', 'l']:\n        return_ret = 'return ret;'\n        ret_car = 'ret.car = '\n        ret_cdr = 'ret.cdr = '\n        post_car = ''\n        post_cdr = ''\n        pack1_ret = '{} ret;'.format(ret1)\n        packN_ret = '{} ret;'.format(retN)\n    elif operator.params[0] in ['vx1', 'vx2', 'vx3', 'vx4']:\n        num = operator.params[0][-1:]\n        return_ret = 'return ret;'\n        if operator.closed:\n            ret_car = \\\n                'typename simd_traits<T, SimdExt>::simd_vectorx{} car = '. 
\\\n                format(num)\n        else:\n            ret_car = \\\n                '''typename simd_traits<typename ToPackType::value_type,\n                       SimdExt>::simd_vectorx{} car = '''.format(num)\n        ret_cdr = 'packx{}<T, N - 1, SimdExt> cdr = '.format(num)\n        post_car = '; ret.set_car({})'.format(', '.join( \\\n            ['car.v{}'.format(i) for i in range(0, int(num))]))\n        post_cdr = '; ret.set_cdr({})'.format(', '.join( \\\n            ['cdr.v{}'.format(i) for i in range(0, int(num))]))\n        pack1_ret = '{} ret;'.format(ret1)\n        packN_ret = '{} ret;'.format(retN)\n    else:\n        return_ret = ''\n        ret_car = ''\n        ret_cdr = ''\n        post_car = ''\n        post_cdr = ''\n        pack1_ret = ''\n        packN_ret = ''\n    if '*' in operator.params[1:] or 'c*' in operator.params[1:]:\n        # store*[au] does not contain any packx* argument, therefore the offset\n        # cannot be correctly computed\n        if operator.name in ['store2u', 'store2a']:\n            multiplier = '2 * '\n        elif operator.name in ['store3u', 'store3a']:\n            multiplier = '3 * '\n        elif operator.name in ['store4u', 'store4a']:\n            multiplier = '4 * '\n        else:\n            multiplier = ''\n        int_len = 'int len_ = {}len({}<T, 1, SimdExt>());'. \\\n                  format(multiplier, get_pack(inter[0]) if inter != [] \\\n                                     else need_tmpl_pack)\n    else:\n        int_len = ''\n\n    sig = operator.get_generic_signature('cxx_adv')\n    for k in sig:\n        sig[k] = sig[k][:-1] # remove trailing ';'\n\n    tmpl = '''{{sig1}} {{{{{pack1_ret}\n                {ret_car}{name}({vars1}){post_car};\n              {return_ret}}}}}\n\n              {{sigN}} {{{{{packN_ret}{int_len}\n                {ret_car}{name}({vars1}){post_car};\n                {ret_cdr}{{cxx_name}}({varsN}){post_cdr};\n              {return_ret}}}}}'''. 
\\\n\t      format(pack1_ret=pack1_ret, ret_car=ret_car, name=operator.name,\n\t             vars1=vars1, return_ret=return_ret, retN=retN,\n\t             packN_ret=packN_ret, int_len=int_len, ret_cdr=ret_cdr,\n\t             varsN=varsN, post_car=post_car, post_cdr=post_cdr)\n\n    ret = ''\n    if operator.cxx_operator:\n        ret += tmpl.format(cxx_name='operator'+operator.cxx_operator,\n                           sig1=sig['op1'], sigN=sig['opN']) + '\\n\\n'\n    ret += tmpl.format(cxx_name=operator.name,\n                       sig1=sig['1'], sigN=sig['N']) + '\\n\\n'\n\n    if not operator.closed:\n        return_ins = 'return ' if operator.params[0] != '_' else ''\n        ret += '\\n\\n'\n        ret += '''{sig} {{\n                    {return_ins}{cxx_name}(ToPackType(){other_varsN});\n                  }}'''. \\\n                  format(cxx_name=operator.name, sig=sig['dispatch'],\n                         other_varsN=other_varsN, return_ins=return_ins)\n    if need_tmpl_pack != None:\n        ret += '\\n\\n'\n        ret += '''{sig} {{\n                    return {cxx_name}(SimdVector(){other_varsN});\n                  }}'''. 
\\\n                  format(sig=sig['dispatch'], cxx_name=operator.name,\n                         other_varsN=other_varsN)\n    return ret\n\n# -----------------------------------------------------------------------------\n# Generate assignments operator (+=, *=, &=, ...)\ndef gen_assignment_operators(op):\n    #return '''{sig} {{ }}'''\n    return ''\n\n# -----------------------------------------------------------------------------\n# Generate advanced C++ API\n\ndef doit(opts):\n    common.myprint(opts, 'Generating advanced C++ API')\n    filename = os.path.join(opts.include_dir, 'cxx_adv_api_functions.hpp')\n    if not common.can_create_filename(opts, filename):\n        return\n    with common.open_utf8(opts, filename) as out:\n        out.write('''#ifndef NSIMD_CXX_ADV_API_FUNCTIONS_HPP\n                     #define NSIMD_CXX_ADV_API_FUNCTIONS_HPP\n\n                     namespace nsimd {\n\n                     ''')\n\n        for op_name, operator in operators.operators.items():\n            if not operator.autogen_cxx_adv:\n                continue\n\n            out.write('''{hbar}\n\n                         {code}\n\n                         '''.format(hbar=common.hbar,\n                                    code=get_cxx_advanced_generic(operator)))\n\n            if operator.cxx_operator and \\\n                (operator.args in [['v', 'v'], ['v', 'p']]):\n              out.write('{hbar}\\n{code}'. \\\n                      format(hbar=common.hbar,\n                             code=gen_assignment_operators(operator)))\n\n\n        out.write('''{hbar}\n\n                     }} // namespace nsimd\n\n                     #endif'''.format(hbar=common.hbar))\n    common.clang_format(opts, filename)\n"
  },
  {
    "path": "egg/gen_archis.py",
    "content": "# Copyright (c) 2021 Agenium Scale\n#\n# Permission is hereby granted, free of charge, to any person obtaining a copy\n# of this software and associated documentation files (the \"Software\"), to deal\n# in the Software without restriction, including without limitation the rights\n# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell\n# copies of the Software, and to permit persons to whom the Software is\n# furnished to do so, subject to the following conditions:\n#\n# The above copyright notice and this permission notice shall be included in all\n# copies or substantial portions of the Software.\n#\n# THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\n# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\n# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\n# SOFTWARE.\n\nimport operators\nimport common\nimport gen_adv_c_api\nimport os\nfrom datetime import date\nimport sys\n\n# -----------------------------------------------------------------------------\n# Generate code for output\n\ndef get_simd_implementation_src(operator, simd_ext, from_typ, fmtspec):\n    if simd_ext == 'cpu':\n        vlen = common.CPU_NBITS // int(from_typ[1:])\n        vasi = []\n        params = operator.params[1:]\n        for i in range(len(params)):\n            if params[i] in ['v', 'l', 'vi']:\n                vasi.append('a{}.v{{i}}'.format(i))\n            else:\n                vasi.append('a{}'.format(i))\n        vasi = ', '.join(vasi)\n        typ2 = 'f32' if from_typ == 'f16' else from_typ\n        if operator.params[0] == '_':\n            body = '\\n'.join(\n                        ['nsimd_scalar_{op_name}_{typ2}({vasi});'. 
\\\n                         format(op_name=operator.name, typ2=typ2,\n                                vasi=vasi.format(i=i)) for i in range(vlen)])\n        else:\n            body = 'nsimd_cpu_v{} ret;\\n'.format(from_typ)\n            body += '\\n'.join(\n                    ['ret.v{i} = nsimd_scalar_{op_name}_{typ2}({vasi});'. \\\n                     format(i=i, op_name=operator.name, typ2=typ2,\n                            vasi=vasi.format(i=i)) for i in range(vlen)])\n            body += '\\nreturn ret;\\n'\n        return \\\n        '''{hbar}\n\n           NSIMD_INLINE {return_typ} NSIMD_VECTORCALL\n           nsimd_{name}_{simd_ext}_{suf}({c_args}) {{\n             {body}\n           }}\n\n           #if NSIMD_CXX > 0\n           namespace nsimd {{\n             NSIMD_INLINE {return_typ} NSIMD_VECTORCALL\n             {name}({cxx_args}) {{\n               {body}\n             }}\n           }} // namespace nsimd\n           #endif\n\n           '''.format(body=body, **fmtspec)\n    if from_typ == 'f16':\n        n = len(operator.params[1:])\n        f16_to_f32 = '\\n'.join(\n                    ['nsimd_{simd_ext}_vf32x2 buf{i}' \\\n                     ' = nsimd_upcvt_{simd_ext}_f32_f16({args});'. \\\n                     format(i=i, args=common.get_arg(i), **fmtspec) \\\n                     for i in range(n)])\n        bufsv0 = ', '.join(['buf{}.v0'.format(i) for i in range(n)])\n        bufsv1 = ', '.join(['buf{}.v1'.format(i) for i in range(n)])\n        if operator.params[0] != '_':\n            retv0 = 'nsimd_{simd_ext}_vf32 retv0 = '.format(**fmtspec)\n            retv1 = 'nsimd_{simd_ext}_vf32 retv1 = '.format(**fmtspec)\n            f32_to_f16 = \\\n            'return nsimd_downcvt_{simd_ext}_f16_f32(retv0, retv1);'. \\\n            format(**fmtspec)\n        else:\n            retv0 = ''\n            retv1 = ''\n            f32_to_f16 = ''\n        retv0 += '{sleef_symbol_prefix}_{simd_ext}_f32({bufsv0});'. 
\\\n                 format(bufsv0=bufsv0, **fmtspec)\n        retv1 += '{sleef_symbol_prefix}_{simd_ext}_f32({bufsv1});'. \\\n                 format(bufsv1=bufsv1, **fmtspec)\n        return \\\n        '''{hbar}\n\n           NSIMD_INLINE {return_typ} NSIMD_VECTORCALL\n           nsimd_{name}_{simd_ext}_{suf}({c_args}) {{\n             {f16_to_f32}\n             {retv0}\n             {retv1}\n           {f32_to_f16}}}\n\n           #if NSIMD_CXX > 0\n           namespace nsimd {{\n             NSIMD_INLINE {return_typ} NSIMD_VECTORCALL\n             {name}({cxx_args}) {{\n               {f16_to_f32}\n               {retv0}\n               {retv1}\n             {f32_to_f16}}}\n           }} // namespace nsimd\n           #endif\n\n           '''.format(f16_to_f32=f16_to_f32, retv0=retv0, retv1=retv1,\n                      f32_to_f16=f32_to_f16, **fmtspec)\n    else:\n        return \\\n        '''{hbar}\n\n           #if NSIMD_CXX > 0\n           extern \"C\" {{\n           #endif\n\n           NSIMD_DLLSPEC {return_typ} NSIMD_VECTORCALL\n           {sleef_symbol_prefix}_{simd_ext}_{suf}({c_args});\n\n           #if NSIMD_CXX > 0\n           }} // extern \"C\"\n           #endif\n\n           NSIMD_INLINE {return_typ} NSIMD_VECTORCALL\n           nsimd_{name}_{simd_ext}_{suf}({c_args}) {{\n             {returns}{sleef_symbol_prefix}_{simd_ext}_{suf}({vas});\n           }}\n\n           #if NSIMD_CXX > 0\n           namespace nsimd {{\n             NSIMD_INLINE {return_typ} NSIMD_VECTORCALL\n             {name}({cxx_args}) {{\n               {returns}{sleef_symbol_prefix}_{simd_ext}_{suf}({vas});\n             }}\n           }} // namespace nsimd\n           #endif\n\n           '''.format(**fmtspec)\n\n# -----------------------------------------------------------------------------\n# Generate code for output\n\ndef get_simd_implementation(opts, operator, mod, simd_ext):\n    typ_pairs = []\n    for t in operator.types:\n        return_typs = 
common.get_output_types(t, operator.output_to)\n        for tt in return_typs:\n            typ_pairs.append([t, tt])\n\n    if not operator.closed:\n        tmp = [p for p in typ_pairs if p[0] in common.ftypes and \\\n                                       p[1] in common.ftypes]\n        tmp += [p for p in typ_pairs if p[0] in common.itypes and \\\n                                        p[1] in common.itypes]\n        tmp += [p for p in typ_pairs if p[0] in common.utypes and \\\n                                        p[1] in common.utypes]\n        tmp += [p for p in typ_pairs \\\n                if (p[0] in common.utypes and p[1] in common.itypes) or \\\n                   (p[0] in common.itypes and p[1] in common.utypes)]\n        tmp += [p for p in typ_pairs \\\n                if (p[0] in common.iutypes and p[1] in common.ftypes) or \\\n                   (p[0] in common.ftypes and p[1] in common.iutypes)]\n        typ_pairs = tmp\n\n    ret = ''\n    for pair in typ_pairs:\n        from_typ = pair[0]\n        to_typ = pair[1]\n        fmtspec = operator.get_fmtspec(from_typ, to_typ, simd_ext)\n        if operator.src:\n            ret += get_simd_implementation_src(operator, simd_ext, from_typ,\n                                               fmtspec)\n        else:\n            ret += \\\n            '''{hbar}\n\n               NSIMD_INLINE {return_typ} NSIMD_VECTORCALL\n               nsimd_{name}_{simd_ext}_{suf}({c_args}) {{\n                 {content}\n               }}\n\n               #if NSIMD_CXX > 0\n               namespace nsimd {{\n                 NSIMD_INLINE {return_typ} NSIMD_VECTORCALL\n                 {name}({cxx_args}) {{\n                   {returns}nsimd_{name}_{simd_ext}_{suf}({vas});\n                 }}\n               }} // namespace nsimd\n               #endif\n\n               '''.format(content=mod.get_impl(opts, operator.name,\n                          simd_ext, from_typ, to_typ), **fmtspec)\n    return ret[0:-2]\n\n\n# 
-----------------------------------------------------------------------------\n# Generate code for output\n\ndef gen_archis_write_put(opts, platform, simd_ext, simd_dir):\n    filename = os.path.join(simd_dir, 'put.h')\n    if not common.can_create_filename(opts, filename):\n        return\n    op = None\n    with common.open_utf8(opts, filename) as out:\n        out.write( \\\n        '''#ifndef NSIMD_{PLATFORM}_{SIMD_EXT}_PUT_H\n           #define NSIMD_{PLATFORM}_{SIMD_EXT}_PUT_H\n\n           {include_cpu_put}#include <nsimd/{platform}/{simd_ext}/types.h>\n           #include <stdio.h>\n\n           {hbar}\n\n           '''.format(year=date.today().year, hbar=common.hbar,\n                      simd_ext=simd_ext, platform=platform,\n                      PLATFORM=platform.upper(), SIMD_EXT=simd_ext.upper(),\n                      include_cpu_put='#include <nsimd/cpu/cpu/put.h>\\n' \\\n                      if simd_ext != 'cpu' else ''))\n        for typ in common.types:\n            out.write( \\\n            '''#if NSIMD_CXX > 0\n               extern \"C\" {{\n               #endif\n\n               NSIMD_DLLSPEC int NSIMD_VECTORCALL\n               nsimd_put_{simd_ext}_{typ}(FILE *, const char *,\n                                          nsimd_{simd_ext}_v{typ});\n\n               #if NSIMD_CXX > 0\n               }} // extern \"C\"\n               #endif\n\n               #if NSIMD_CXX > 0\n               namespace nsimd {{\n               NSIMD_INLINE int NSIMD_VECTORCALL\n               put(FILE *out, const char *fmt, nsimd_{simd_ext}_v{typ} a0,\n                   {typ}, {simd_ext}) {{\n                 return nsimd_put_{simd_ext}_{typ}(out, fmt, a0);\n               }}\n               }} // namespace nsimd\n               #endif\n\n               {hbar}\n\n               #if NSIMD_CXX > 0\n               extern \"C\" {{\n               #endif\n\n               NSIMD_DLLSPEC int NSIMD_VECTORCALL\n               nsimd_put_{simd_ext}_l{typ}(FILE *, const 
char *,\n                                           nsimd_{simd_ext}_vl{typ});\n\n               #if NSIMD_CXX > 0\n               }} // extern \"C\"\n               #endif\n\n               #if NSIMD_CXX > 0\n               namespace nsimd {{\n               NSIMD_INLINE int NSIMD_VECTORCALL\n               putl(FILE *out, const char *fmt, nsimd_{simd_ext}_vl{typ} a0,\n                    {typ}, {simd_ext}) {{\n                 return nsimd_put_{simd_ext}_l{typ}(out, fmt, a0);\n               }}\n               }} // namespace nsimd\n               #endif\n\n               {hbar}\n               '''.format(simd_ext=simd_ext, hbar=common.hbar, typ=typ))\n        out.write('#endif')\n    common.clang_format(opts, filename)\n\n# -----------------------------------------------------------------------------\n# Generate code for architectures\n\ndef gen_archis_write_file(opts, op, platform, simd_ext, simd_dir):\n    filename = os.path.join(simd_dir, '{}.h'.format(op.name))\n    if not common.can_create_filename(opts, filename):\n        return\n    mod = opts.platforms[platform]\n    additional_include = mod.get_additional_include(op.name, platform,\n                                                    simd_ext)\n    if op.src:\n        additional_include += \\\n        '''#include <nsimd/{platform}/{simd_ext}/downcvt.h>\n           #include <nsimd/{platform}/{simd_ext}/upcvt.h>\n           '''.format(platform=platform, simd_ext=simd_ext)\n    with common.open_utf8(opts, filename) as out:\n        out.write(\n        '''#ifndef {guard}\n           #define {guard}\n\n           #include <nsimd/{platform}/{simd_ext}/types.h>\n           {additional_include}\n\n           {code}\n\n           {hbar}\n\n           #endif\n           '''.format(additional_include=additional_include,\n                      year=date.today().year,\n                      guard=op.get_header_guard(platform, simd_ext),\n                      platform=platform, simd_ext=simd_ext,\n                  
    func=op.name, hbar=common.hbar,\n                      code=get_simd_implementation(opts, op, mod, simd_ext)))\n    common.clang_format(opts, filename)\n\ndef gen_archis_simd(opts, platform, simd_ext, simd_dir):\n    for op_name, operator in operators.operators.items():\n        gen_archis_write_file(opts, operator, platform, simd_ext, simd_dir)\n    gen_archis_write_put(opts, platform, simd_ext, simd_dir)\n\ndef gen_archis_types(opts, simd_dir, platform, simd_ext):\n    filename = os.path.join(simd_dir, 'types.h')\n    if not common.can_create_filename(opts, filename):\n        return\n    mod = opts.platforms[platform]\n    c_code = '\\n'.join([mod.get_type(opts, simd_ext, t,\n                                     'nsimd_{}_v{}'.format(simd_ext, t)) \\\n                                     for t in common.types])\n    c_code += '\\n\\n'\n    c_code += '\\n'.join([mod.get_logical_type(\n                             opts, simd_ext, t, 'nsimd_{}_vl{}'. \\\n                             format(simd_ext, t)) for t in common.types])\n    if mod.has_compatible_SoA_types(simd_ext):\n        for deg in range(2, 5):\n            c_code += '\\n'.join([mod.get_SoA_type(simd_ext, typ, deg,\n                                'nsimd_{}_v{}x{}'.format(simd_ext, typ, deg)) \\\n                                for typ in common.types])\n    else:\n        c_code += '\\n'.join([\n        '''\n        typedef NSIMD_STRUCT nsimd_{simd_ext}_v{typ}x2 {{\n          nsimd_{simd_ext}_v{typ} v0;\n          nsimd_{simd_ext}_v{typ} v1;\n        }} nsimd_{simd_ext}_v{typ}x2;\n        '''.format(simd_ext=simd_ext, typ=typ) for typ in common.types])\n\n        c_code += '\\n'.join([\n        '''\n        typedef NSIMD_STRUCT nsimd_{simd_ext}_v{typ}x3 {{\n          nsimd_{simd_ext}_v{typ} v0;\n          nsimd_{simd_ext}_v{typ} v1;\n          nsimd_{simd_ext}_v{typ} v2;\n        }} nsimd_{simd_ext}_v{typ}x3;\n        '''.format(simd_ext=simd_ext, typ=typ) for typ in common.types])\n\n        
c_code += '\\n'.join([\n        '''\n        typedef NSIMD_STRUCT nsimd_{simd_ext}_v{typ}x4 {{\n          nsimd_{simd_ext}_v{typ} v0;\n          nsimd_{simd_ext}_v{typ} v1;\n          nsimd_{simd_ext}_v{typ} v2;\n          nsimd_{simd_ext}_v{typ} v3;\n        }} nsimd_{simd_ext}_v{typ}x4;\n        '''.format(simd_ext=simd_ext, typ=typ) for typ in common.types])\n        c_code += '\\n\\n'\n    cxx_code = \\\n        '\\n\\n'.join(['''template <>\n                        struct simd_traits<{typ}, {simd_ext}> {{\n                          typedef nsimd_{simd_ext}_v{typ} simd_vector;\n                          typedef nsimd_{simd_ext}_v{typ}x2 simd_vectorx2;\n                          typedef nsimd_{simd_ext}_v{typ}x3 simd_vectorx3;\n                          typedef nsimd_{simd_ext}_v{typ}x4 simd_vectorx4;\n                          typedef nsimd_{simd_ext}_vl{typ} simd_vectorl;\n                        }};'''.format(typ=t, simd_ext=simd_ext)\n                        for t in common.types])\n    with common.open_utf8(opts, filename) as out:\n        out.write('''#ifndef NSIMD_{platform}_{SIMD_EXT}_TYPES_H\n                     #define NSIMD_{platform}_{SIMD_EXT}_TYPES_H\n\n                     {c_code}\n\n                     #define NSIMD_{simd_ext}_NB_REGISTERS  {nb_registers}\n\n                     #if NSIMD_CXX > 0\n                     namespace nsimd {{\n\n                     // defined in nsimd.h for C++20 concepts\n                     // struct {simd_ext} {{}};\n\n                     {cxx_code}\n\n                     }} // namespace nsimd\n                     #endif\n\n                     #endif\n                     '''. 
\\\n                     format(year=date.today().year, platform=platform.upper(),\n                            SIMD_EXT=simd_ext.upper(), simd_ext=simd_ext,\n                            c_code=c_code, cxx_code=cxx_code,\n                            nb_registers=mod.get_nb_registers(simd_ext)))\n    common.clang_format(opts, filename)\n\ndef gen_archis_platform(opts, platform):\n    include_dir = os.path.join(opts.include_dir, platform);\n    for s in opts.platforms[platform].get_simd_exts():\n        common.myprint(opts, 'Found new SIMD extension: {}'.format(s))\n        if s in opts.simd:\n            simd_dir = os.path.join(include_dir, s)\n            common.mkdir_p(simd_dir)\n            gen_archis_types(opts, simd_dir, platform, s)\n            gen_archis_simd(opts, platform, s, simd_dir)\n        else:\n            common.myprint(opts, '  Extension excluded by command line')\n\ndef doit(opts):\n    common.myprint(opts, 'Generating SIMD implementations')\n    opts.platforms = common.get_platforms(opts)\n    for p in opts.platforms:\n        common.mkdir_p(os.path.join(opts.include_dir, p))\n        gen_archis_platform(opts, p)\n"
  },
  {
    "path": "egg/gen_base_apis.py",
    "content": "# Copyright (c) 2019 Agenium Scale\n#\n# Permission is hereby granted, free of charge, to any person obtaining a copy\n# of this software and associated documentation files (the \"Software\"), to deal\n# in the Software without restriction, including without limitation the rights\n# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell\n# copies of the Software, and to permit persons to whom the Software is\n# furnished to do so, subject to the following conditions:\n#\n# The above copyright notice and this permission notice shall be included in all\n# copies or substantial portions of the Software.\n#\n# THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\n# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\n# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\n# SOFTWARE.\n\nimport operators\nimport common\nimport os\nfrom datetime import date\nimport sys\n\n# -----------------------------------------------------------------------------\n# C base generic implem\n\ndef get_c_base_generic(operator):\n    vas = common.get_args(len(operator.params) - 1)\n    sig = operator.get_generic_signature('c_base')\n    if not operator.closed:\n        return \\\n        '''{sig} NSIMD_PP_CAT_6(nsimd_{name}_, NSIMD_SIMD, _, \\\\\n                                to_type, _, from_type)({vas})\n\n           {sig_e} NSIMD_PP_CAT_6(nsimd_{name}_, simd_ext, _, \\\\\n                                  to_type, _, from_type)({vas})'''. 
\\\n           format(sig=sig[0], sig_e=sig[1], name=operator.name, vas=vas)\n    else:\n        return \\\n        '''{sig} NSIMD_PP_CAT_4(nsimd_{name}_, NSIMD_SIMD, _, type)({vas})\n\n           {sig_e} NSIMD_PP_CAT_4(nsimd_{name}_, simd_ext, _, type)({vas})'''. \\\n           format(sig=sig[0], sig_e=sig[1], name=operator.name, vas=vas)\n\n# -----------------------------------------------------------------------------\n# C++ base generic implem\n\ndef get_cxx_base_generic(operator):\n    returns = '' if operator.params[0] == '_' else 'return'\n    temp = common.get_args(len(operator.params) - 1)\n    temp += ', ' if temp != '' else ''\n    args = temp + 'F(), T()' if not operator.closed else temp + 'T()'\n    return \\\n    '''#if NSIMD_CXX > 0\n       namespace nsimd {{\n       {sig} {{\n         {returns} {name}({args}, NSIMD_SIMD());\n       }}\n       }} // namespace nsimd\n       #endif'''.format(name=operator.name, args=args, returns=returns,\n                        sig=operator.get_generic_signature('cxx_base')[:-1])\n\n# -----------------------------------------------------------------------------\n# Declarations for output\n\ndef get_put_decl():\n    return \\\n    '''#include NSIMD_AUTO_INCLUDE(put.h)\n\n       #define vput(out, fmt, a0, type) \\\n           NSIMD_PP_CAT_4(nsimd_put_, NSIMD_SIMD, _, type)(out, fmt, a0)\n\n       #define vput_e(out, fmt, a0, type, simd_ext) \\\n           NSIMD_PP_CAT_4(nsimd_put_, simd_ext, _, type)(out, fmt, a0)\n\n       #if NSIMD_CXX > 0\n       namespace nsimd {\n       template <typename A0, typename T>\n       int put(FILE *out, const char *fmt, A0 a0, T) {\n         return put(out, fmt, a0, T(), NSIMD_SIMD());\n       }\n       } // namespace nsimd\n       #endif\n       '''\n\n# -----------------------------------------------------------------------------\n# Generate base APIs\n\ndef doit(opts):\n    common.myprint(opts, 'Generating base APIs')\n    common.mkdir_p(opts.include_dir)\n    filename = 
os.path.join(opts.include_dir, 'functions.h')\n    if not common.can_create_filename(opts, filename):\n        return\n    with common.open_utf8(opts, filename) as out:\n        out.write('''#ifndef NSIMD_FUNCTIONS_H\n                     #define NSIMD_FUNCTIONS_H\n\n                     '''.format(year=date.today().year))\n\n        for op_name, operator in operators.operators.items():\n            out.write('''{}\n\n                         #include NSIMD_AUTO_INCLUDE({}.h)\n\n                         {}\n\n                         {}\n\n                         '''.format(common.hbar, operator.name,\n                                    get_c_base_generic(operator),\n                                    get_cxx_base_generic(operator)))\n\n        out.write('''{hbar}\n\n                     {put_decl}\n\n                     {hbar}\n\n                     #endif'''. \\\n                     format(hbar=common.hbar, put_decl=get_put_decl()))\n    common.clang_format(opts, filename)\n"
  },
  {
    "path": "egg/gen_benches.py",
    "content": "# Copyright (c) 2019 Agenium Scale\n#\n# Permission is hereby granted, free of charge, to any person obtaining a copy\n# of this software and associated documentation files (the \"Software\"), to deal\n# in the Software without restriction, including without limitation the rights\n# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell\n# copies of the Software, and to permit persons to whom the Software is\n# furnished to do so, subject to the following conditions:\n#\n# The above copyright notice and this permission notice shall be included in all\n# copies or substantial portions of the Software.\n#\n# THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\n# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\n# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\n# SOFTWARE.\n\nimport os\nimport sys\nimport common\nimport operators\nfrom datetime import date\nfrom collections import OrderedDict\n\n# -----------------------------------------------------------------------------\n# Sig\n\ndef sig_replace_name(sig, name):\n    sig = sig.split(' ')\n    sig[1] = name\n    return ' '.join(sig)\n\ndef sig_translate(sig, translates, name=None):\n    sig = sig.split(' ')\n    ## Translates a given type to another\n    sig[0] = translates.get(sig[0], sig[0])\n    ## Do not use sig[1] (the function name)\n    for i, p in enumerate(sig[2:]):\n        sig[2 + i] = translates.get(p, p)\n    sig = ' '.join(sig)\n    ## Redefine name if available\n    if name:\n        sig = sig_replace_name(sig, name)\n    return sig\n\n# -----------------------------------------------------------------------------\n# Errors\n\nclass 
BenchError(RuntimeError):\n    pass\n\n# -----------------------------------------------------------------------------\n# Markers\n\ndef asm_marker(simd, bench_name):\n    r = ''\n    r += '#ifdef ASM_MARKER'\n    r += '\\n'\n\n    for_intel = '__asm__ __volatile__(\"callq __asm_marker__{bench_name}\");'. \\\n                format(bench_name=bench_name)\n    for_arm = '__asm__ __volatile__(\"bl __asm_marker__{bench_name}\");'. \\\n              format(bench_name=bench_name)\n    if simd in common.x86_simds:\n        r += for_intel\n    elif simd in common.arm_simds:\n        r += for_arm\n    elif simd == 'cpu':\n        r += '''#if defined(NSIMD_X86)\n                  {}\n                #elif defined(NSIMD_ARM)\n                  {}\n                #endif'''.format(for_intel, for_arm)\n    elif simd in common.ppc_simds:\n        #TODO\n        return ''. format(bench_name=bench_name)\n    else:\n        raise BenchError('Unable to write marker for SIMD: {}'.format(simd))\n    r += '\\n'\n    r += '#endif'\n    return r\n\n# -----------------------------------------------------------------------------\n# Metaclass\n\n# Provides __static_init__ hook\nclass StaticInitMetaClass(type):\n    def __new__(cls, name, bases, dct):\n        x = type.__new__(cls, name, bases, dct)\n        x.__static_init__(x)\n        return x\n\n# -----------------------------------------------------------------------------\n# Basic nsimd types\n\n## Will be automatically populated thanks to the metaclass\ntypes = {}\n\n# -----------------------------------------------------------------------------\n\nclass TypeBase(object, metaclass=StaticInitMetaClass):\n\n    @staticmethod\n    def __static_init__(c):\n        ## Skip base class\n        if c.__name__.endswith('Base'):\n            return\n        types[c.name] = c()\n\n    def is_simd(self):\n        return False\n\n    def is_volatile(self):\n        return False\n\nclass TypeVectorBase(TypeBase):\n    def is_simd(self):\n        
return True\n\n# -----------------------------------------------------------------------------\n\nclass TypeVoid(TypeBase):\n    name = '_'\n\n    def as_type(self, typ):\n        return 'void'\n\n# -----------------------------------------------------------------------------\n\nclass TypeScalar(TypeBase):\n    name = 's'\n\n    def as_type(self, typ):\n        return typ\n\n    def code_load(self, simd, typ, ptr):\n        return '*({})'.format(ptr)\n\n    def code_store(self, simd, typ, lhs, rhs):\n        return '*({}) = {}'.format(lhs, rhs)\n\n# -----------------------------------------------------------------------------\n\nclass TypeVolatileScalar(TypeScalar):\n    name = 'volatile-s'\n\n    def is_volatile(self):\n        return True\n\n# -----------------------------------------------------------------------------\n\nclass TypeLogicalScalar(TypeBase):\n    name = 'ls'\n\n    def as_type(self, typ):\n        return {\n            'i8': 'u8',\n            'i16': 'u16',\n            'i32': 'u32',\n            'i64': 'u64',\n            'f32': 'u32',\n            'f64': 'u64',\n            }.get(typ, typ)\n\n    def code_load(self, simd, typ, ptr):\n        return '({})(*({}))'.format(self.as_type(typ), ptr)\n\n    def code_store(self, simd, typ, lhs, rhs):\n        return '*({}) = ({})({})'.format(lhs, typ, rhs)\n\n# -----------------------------------------------------------------------------\n\nclass TypeVolatileLogicalScalar(TypeLogicalScalar):\n    name = 'volatile-ls'\n\n    def is_volatile(self):\n        return True\n\n# -----------------------------------------------------------------------------\n\nclass TypeInt(TypeScalar):\n    name = 'p'\n\n    def as_type(self, typ):\n        return 'int'\n\n# -----------------------------------------------------------------------------\n\nclass TypePtr(TypeBase):\n    name = '*'\n\n    def as_type(self, typ):\n        return typ + '*'\n\n# 
-----------------------------------------------------------------------------\n\nclass TypeConstPtr(TypeBase):\n    name = 'c*'\n\n    def as_type(self, typ):\n        return 'const ' + typ + '*'\n\n# -----------------------------------------------------------------------------\n\nclass TypeVector(TypeVectorBase):\n    name = 'v'\n\n    def as_type(self, typ):\n        return 'v' + typ\n\n    def code_load(self, simd, typ, ptr):\n        return 'nsimd::loada({}, {}())'.format(ptr, typ)\n\n    def code_store(self, simd, typ, ptr, expr):\n        return 'nsimd::storea({}, {}, {}())'.format(ptr, expr, typ)\n\n# -----------------------------------------------------------------------------\n\nclass TypeCPUVector(TypeVector):\n    name = 'vcpu'\n\n    def code_load(self, simd, typ, ptr):\n        return 'nsimd::loada({}, {}(), nsimd::cpu())'.format(ptr, typ)\n\n    def code_store(self, simd, typ, ptr, expr):\n        return 'nsimd::storea({}, {}, {}(), nsimd::cpu())'.format(ptr, expr, typ)\n\n# -----------------------------------------------------------------------------\n\nclass TypeUnrolledVectorBase(TypeVectorBase):\n    def as_type(self, typ):\n        raise NotImplementedError()\n\n    def code_load(self, simd, typ, ptr):\n        return 'nsimd::loada<nsimd::pack<{}, {}>>({})'. 
\\\n               format(typ, self.unroll, ptr)\n\n    def code_store(self, simd, typ, ptr, expr):\n        return 'nsimd::storea({}, {})'.format(ptr, expr)\n\n# -----------------------------------------------------------------------------\n\nclass TypeUnrolledVector1(TypeUnrolledVectorBase):\n    name = 'vu1'\n    unroll = 1\n\nclass TypeUnrolledVector2(TypeUnrolledVectorBase):\n    name = 'vu2'\n    unroll = 2\n\nclass TypeUnrolledVector3(TypeUnrolledVectorBase):\n    name = 'vu3'\n    unroll = 3\n\nclass TypeUnrolledVector4(TypeUnrolledVectorBase):\n    name = 'vu4'\n    unroll = 4\n\nclass TypeUnrolledVector5(TypeUnrolledVectorBase):\n    name = 'vu5'\n    unroll = 5\n\nclass TypeUnrolledVector6(TypeUnrolledVectorBase):\n    name = 'vu6'\n    unroll = 6\n\nclass TypeUnrolledVector7(TypeUnrolledVectorBase):\n    name = 'vu7'\n    unroll = 7\n\nclass TypeUnrolledVector8(TypeUnrolledVectorBase):\n    name = 'vu8'\n    unroll = 8\n\nclass TypeUnrolledVector9(TypeUnrolledVectorBase):\n    name = 'vu9'\n    unroll = 9\n\n# -----------------------------------------------------------------------------\n\nclass TypeVectorX2(TypeVectorBase):\n    name = 'vx2'\n\n    def as_type(self, typ):\n        return 'v' + typ + 'x2'\n\n# -----------------------------------------------------------------------------\n\nclass TypeVectorX3(TypeVectorBase):\n    name = 'vx3'\n\n    def as_type(self, typ):\n        return 'v' + typ + 'x3'\n\n# -----------------------------------------------------------------------------\n\nclass TypeVectorX4(TypeVectorBase):\n    name = 'vx4'\n\n    def as_type(self, typ):\n        return 'v' + typ + 'x4'\n\n# -----------------------------------------------------------------------------\n\nclass TypeLogical(TypeVectorBase):\n    name = 'l'\n\n    def as_type(self, typ):\n        return 'vl' + typ\n\n    def code_load(self, simd, typ, ptr):\n        return 'nsimd::loadla({}, {}())'.format(ptr, typ)\n\n    def code_store(self, simd, typ, ptr, expr):\n     
   return 'nsimd::storela({}, {}, {}())'.format(ptr, expr, typ)\n\n# -----------------------------------------------------------------------------\n\nclass TypeCPULogical(TypeLogical):\n    name = 'lcpu'\n\n    def code_load(self, simd, typ, ptr):\n        return 'nsimd::loadla({}, {}(), nsimd::cpu())'.format(ptr, typ)\n\n    def code_store(self, simd, typ, ptr, expr):\n        return 'nsimd::storela({}, {}, {}(), nsimd::cpu())'.format(ptr, expr, typ)\n\n# -----------------------------------------------------------------------------\n\nclass TypeUnrolledLogicalBase(TypeVectorBase):\n    def as_type(self, typ):\n        raise NotImplementedError()\n\n    def code_load(self, simd, typ, ptr):\n        return 'nsimd::loadla<nsimd::packl<{}, {}>>({})'. \\\n               format(typ, self.unroll, ptr)\n\n    def code_store(self, simd, typ, ptr, expr):\n        return 'nsimd::storela({}, {})'.format(ptr, expr)\n\n# -----------------------------------------------------------------------------\n\nclass TypeUnrolledLogical1(TypeUnrolledLogicalBase):\n    name = 'lu1'\n    unroll = 1\n\nclass TypeUnrolledLogical2(TypeUnrolledLogicalBase):\n    name = 'lu2'\n    unroll = 2\n\nclass TypeUnrolledLogical3(TypeUnrolledLogicalBase):\n    name = 'lu3'\n    unroll = 3\n\nclass TypeUnrolledLogical4(TypeUnrolledLogicalBase):\n    name = 'lu4'\n    unroll = 4\n\nclass TypeUnrolledLogical5(TypeUnrolledLogicalBase):\n    name = 'lu5'\n    unroll = 5\n\nclass TypeUnrolledLogical6(TypeUnrolledLogicalBase):\n    name = 'lu6'\n    unroll = 6\n\nclass TypeUnrolledLogical7(TypeUnrolledLogicalBase):\n    name = 'lu7'\n    unroll = 7\n\nclass TypeUnrolledLogical8(TypeUnrolledLogicalBase):\n    name = 'lu8'\n    unroll = 8\n\nclass TypeUnrolledLogical9(TypeUnrolledLogicalBase):\n    name = 'lu9'\n    unroll = 9\n\n# -----------------------------------------------------------------------------\n\nclass TypeBoostSimdVector(TypeVectorBase):\n    name = 'boost::simd::pack'\n\n    def as_type(self, typ):\n  
      return 'boost::simd::pack<{}>'.format(typ)\n\n    def code_load(self, simd, typ, ptr):\n        return '{}({})'.format(self.as_type(typ), ptr)\n\n    def code_store(self, simd, typ, ptr, expr):\n        return 'nsimd::storea({}, {}, {}())'.format(ptr, expr, typ)\n\n# -----------------------------------------------------------------------------\n\nclass TypeBoostSimdLogicalVector(TypeVectorBase):\n    name = 'boost::simd::lpack'\n\n    def as_type(self, typ):\n        return 'boost::simd::pack<boost::simd::logical<{}>>'.format(typ)\n\n    def code_load(self, simd, typ, ptr):\n        return '{}({})'.format(self.as_type(typ), ptr)\n\n    def code_store(self, simd, typ, ptr, expr):\n        return 'nsimd::storea({}, {}, {}())'.format(ptr, expr, typ)\n\n# -----------------------------------------------------------------------------\n\nclass TypeMIPPReg(TypeVectorBase):\n    name = 'mipp::reg'\n\n    def as_type(self, typ):\n        return 'mipp::Reg<{}>'.format(typ)\n\n    def code_load(self, simd, typ, ptr):\n        return 'mipp::load<{}>({})'.format(typ, ptr)\n\n    def code_store(self, simd, typ, ptr, expr):\n        return 'mipp::store({}, {})'.format(ptr, expr)\n\n# -----------------------------------------------------------------------------\n\nclass TypeMIPPMsk(TypeVectorBase):\n    name = 'mipp::msk'\n\n    def as_type(self, typ):\n        return 'mipp::Msk<{}>'.format(typ)\n\n    def code_load(self, simd, typ, ptr):\n        if simd in ['avx512_knl', 'avx512_skylake']:\n            return '*({})'.format(ptr)\n        else:\n            return 'mipp::load<{}>({})'.format(typ, ptr)\n\n    def code_store(self, simd, typ, ptr, expr):\n        if simd in ['avx512_knl', 'avx512_skylake']:\n            return '*({}) = {}'.format(ptr, expr)\n        else:\n            return 'mipp::store({}, reinterpret_cast<mipp::reg>({}))'.format(ptr, expr)\n\n# -----------------------------------------------------------------------------\n\ndef type_of(param):\n    if param 
in types:\n        return types[param]\n    else:\n        raise BenchError(\"Unable to find corresponding type for: \" + param)\n\ndef as_type(param, typ):\n    return type_of(param).as_type(typ)\n\n# -----------------------------------------------------------------------------\n# Operator class needs to be reinforced for benches\n\nclass BenchOperator(object, metaclass=type):\n    def __init__(self):\n        self.typed_params_ = []\n        for p in self.params:\n            self.typed_params_.append(type_of(p))\n\n    @property\n    def function_name(self):\n        return self.name.split('::')[-1].split('<')[0]\n\n    ## Generates list of includes to be included\n    def gen_includes(self, lang):\n        includes = []\n        includes.append('<nsimd/nsimd.h>')\n        if lang == 'cxx_adv':\n            includes.append('<nsimd/cxx_adv_api.hpp>')\n        if lang == 'c_base':\n            includes += ['<stdlib.h>', '<stdio.h>', '<errno.h>', '<string.h>']\n        else:\n            includes += ['<cstdlib>', '<cstdio>', '<cerrno>', '<cstring>',\n                         '<algorithm>']\n        return includes\n\n    def match_sig(self, signature):\n        (name, params) = common.parse_signature(signature)\n        if len(params) != len(self.params):\n            return False\n        for p1, p2 in zip(params, self.params):\n            if p1 != p2:\n                return False\n        return True\n\n    def bench_code_before(self, typ):\n        return ''\n\n    def bench_against_init(self):\n        bench = {}\n        for simd in ['*'] + common.simds:\n            bench[simd] = OrderedDict()\n            for typ in ['*'] + common.types:\n                bench[simd][typ] = OrderedDict()\n        return bench\n\n    def bench_against_cpu(self):\n        bench = self.bench_against_init()\n        ## Enable bench against nsimd (cpu architecture)\n        if self.bench_auto_against_cpu:\n            bench['*']['*'][common.nsimd_category('cpu')] = \\\n          
          cpu_fun_from_sig(sig_translate(self.signature, {\n                                     's': 'volatile-s',\n                                     'v': 'vcpu',\n                                     'l': 'lcpu',\n                                     }))\n        return bench\n\n    def bench_against_libs(self):\n        bench = self.bench_against_init()\n        ## Enable bench against all other libraries\n        if self.bench_auto_against_mipp:\n            for typ in self.bench_mipp_types():\n                ## MIPP always requires template\n                mipp_name = self.bench_mipp_name(typ)\n                signature = sig_translate(self.signature, {\n                    'v': 'mipp::reg',\n                    'l': 'mipp::msk',\n                    }, name=mipp_name)\n                if signature:\n                    bench['*'][typ]['MIPP'] = signature\n        if self.bench_auto_against_sleef:\n            for simd in common.simds:\n                for typ in self.bench_sleef_types():\n                    if not common.sleef_support_type(simd, typ):\n                        continue\n                    sleef_name = self.bench_sleef_name(simd, typ)\n                    if sleef_name is None:\n                        continue\n                    ## IMPORTANT:\n                    ## If simd is cpu, then make the signature using scalar\n                    if simd == 'cpu':\n                        signature = sig_translate(self.signature, {\n                            's': 'volatile-s',\n                            'v': 'volatile-s',\n                            'l': 'volatile-s',\n                            }, sleef_name)\n                    else:\n                        signature = sig_translate(self.signature, {},\n                                                        sleef_name)\n                    if signature:\n                        bench[simd][typ]['Sleef'] = signature\n        if self.bench_auto_against_std:\n            for simd in 
common.simds:\n                for typ in self.bench_std_types():\n                    std_name = self.bench_std_name(simd, typ)\n                    signature = sig_translate(self.signature, {\n                        's': 'volatile-s',\n                        'v': 'volatile-s',\n                        'l': 'volatile-s',\n                        }, std_name)\n                    if signature:\n                        if self.cxx_operator:\n                            bench[simd][typ]['std'] = std_operator_from_sig(signature,\n                                    self.cxx_operator)\n                        else:\n                            bench[simd][typ]['std'] = std_fun_from_sig(signature)\n        return bench\n\n    def code_call(self, typ, args):\n        return 'nsimd::{}({}, {}())'.format(self.name,\n                                            common.pprint_commas(args), typ)\n\n    def code_ptr_step(self, typ, simd):\n        if any(p.is_simd() for p in self.typed_params_):\n            return 'vlen_e({}, {})'.format(typ, simd)\n        else:\n            return '1'\n\nclass BenchOperatorWithNoMakers(BenchOperator):\n    use_for_parsing = False\n\n    # Classes that inherit from me do not have their name member\n    # which is mandatory so I fill it for them here.\n    def __init__(self):\n        BenchOperator.__init__(self)\n        (self.name, void) = common.parse_signature(self.signature)\n\n# -----------------------------------------------------------------------------\n# Make the list of all operators, they will inherit from the corresponding\n# operators.Operator and then from BenchOperator\n\nfunctions = {}\n\nclass dummy(operators.MAddToOperators):\n    def __new__(cls, name, bases, dct):\n        return type.__new__(cls, name, bases, dct)\n\nfor op_name, operator in operators.operators.items():\n    if operator.load_store: # We do not bench loads/stores\n        continue\n    op_class = dummy(operator.__class__.__name__,\n                     
(operator.__class__, BenchOperator), {})\n    functions[op_name] = op_class()\n\n# -----------------------------------------------------------------------------\n# Function helpers\n\ndef nsimd_unrolled_fun_from_sig(from_sig, unroll):\n    sig = sig_translate(from_sig, {\n        'v': 'vu' + str(unroll),\n        'l': 'lu' + str(unroll),\n        })\n    class InlineNSIMDUnrolledFun(operators.Operator, BenchOperatorWithNoMakers,\n                                 metaclass=dummy):\n        signature = sig\n        def code_call(self, typ, args):\n            return 'nsimd::{}({})'.format(self.name,\n                                          common.pprint_commas(args))\n        def code_ptr_step(self, typ, simd):\n            return 'nsimd::len(nsimd::pack<{}, {}, nsimd::{}>())'.format(typ, unroll, simd)\n    return InlineNSIMDUnrolledFun()\n\ndef fun_from_sig(from_sig):\n    class InlineFun(operators.Operator, BenchOperatorWithNoMakers,\n                    metaclass=dummy):\n        signature = from_sig\n        def code_call(self, typ, args):\n            return '{}({})'.format(self.name, common.pprint_commas(args))\n    return InlineFun()\n\ndef std_fun_from_sig(from_sig):\n    return fun_from_sig(from_sig)\n\ndef std_operator_from_sig(from_sig, op):\n    class InlineStdOperatorFun(operators.Operator, BenchOperatorWithNoMakers,\n                               metaclass=dummy):\n        __metaclass__ = dummy\n        signature = from_sig\n        operator = op\n        def code_call(self, typ, args):\n            if len(args) == 1:\n                return '{}({})'.format(self.operator, args[0])\n            elif len(args) == 2:\n                return '{} {} {}'.format(args[0], self.operator, args[1])\n            else:\n                raise BenchError('std:: operators requires 1 or 2 arguments!')\n    return InlineStdOperatorFun()\n\ndef cpu_fun_from_sig(from_sig):\n    class InlineCPUFun(operators.Operator, BenchOperatorWithNoMakers,\n                       
metaclass=dummy):\n        signature = from_sig\n        def code_call(self, typ, args):\n            return 'nsimd::{}({}, {}(), nsimd::cpu())'. \\\n                   format(self.name, common.pprint_commas(args), typ)\n    return InlineCPUFun()\n\ndef sanitize_fun_name(name):\n    return ''.join(map(lambda c: c if c.isalnum() else '_', name))\n\n# -----------------------------------------------------------------------------\n# Code\n\ndef code_cast(typ, expr):\n    return '({})({})'.format(typ, expr)\n\ndef code_cast_ptr(typ, expr):\n    return code_cast(typ + '*', expr)\n\n# -----------------------------------------------------------------------------\n# Globals\n\n_opts = None\n_lang = 'cxx_adv'\n\n# -----------------------------------------------------------------------------\n# Generates\n\ndef TODO(f):\n    if _opts.verbose:\n        common.myprint(opts, '@@ TODO: ' + f.name)\n\ndef gen_filename(f, simd, typ):\n    ## Retrieve directory from global options\n    benches_dir = common.mkdir_p(os.path.join(_opts.benches_dir, _lang))\n    ## Generate path (composed from: function name + type + extension)\n    return os.path.join(benches_dir, '{}.{}.{}.{}'.format(\n        f.name, simd, typ, common.ext_from_lang(_lang)))\n\ndef gen_bench_name(category, name, unroll=None):\n    bench_name = '{}_{}'.format(category, name)\n    if unroll:\n        bench_name += '_unroll{}'.format(unroll)\n    return bench_name\n\ndef gen_bench_from_code(f, typ, code, bench_with_timestamp):\n    header = ''\n    header += common.pprint_includes(f.gen_includes(_lang))\n    header += \\\n    '''\n\n    // Required for random generation\n    #include \"../benches.hpp\"\n\n    // Google benchmark\n    #ifndef DISABLE_GOOGLE_BENCHMARK\n    #include <benchmark/benchmark.h>\n    #endif\n\n    #include <ctime>\n    double timestamp_ns() {\n      timespec ts;\n      clock_gettime(CLOCK_MONOTONIC, &ts);\n      return double(ts.tv_sec) * 1000000000.0 + double(ts.tv_nsec);\n    }\n\n    // std\n  
  #include <cmath>\n    // #include <map>\n    #include <numeric>\n    // #include <fstream>\n\n    // Sleef\n    #pragma GCC diagnostic push\n    #pragma GCC diagnostic ignored \"-Wignored-qualifiers\"\n    #include <sleef.h>\n    #pragma GCC diagnostic pop\n\n    // MIPP\n    #pragma GCC diagnostic push\n    #pragma GCC diagnostic ignored \"-Wconversion\"\n    #pragma GCC diagnostic ignored \"-Wsign-conversion\"\n    #pragma GCC diagnostic ignored \"-Wdouble-promotion\"\n    #pragma GCC diagnostic ignored \"-Wunused-parameter\"\n    #if defined(__clang__)\n    #pragma GCC diagnostic ignored \"-Wzero-length-array\"\n    #endif\n    #include <mipp.h>\n    #pragma GCC diagnostic pop\n    '''\n    return \\\n    '''{header}\n\n    // -------------------------------------------------------------------------\n\n    static const int sz = 1024;\n\n    template <typename Random>\n    static {type}* make_data(int sz, Random r) {{\n      {type}* data = ({type}*)nsimd_aligned_alloc(sz * {sizeof});\n\n      for (int i = 0; i < sz; ++i) {{\n        data[i] = r();\n      }}\n      return data;\n    }}\n\n    static {type}* make_data(int sz) {{\n      {type}* data = ({type}*)nsimd_aligned_alloc(sz * {sizeof});\n\n      for (int i = 0; i < sz; ++i) {{\n        data[i] = {type}(0);\n      }}\n      return data;\n    }}\n\n    {random_code}\n\n    {code}\n\n    int main(int argc, char** argv)\n    {{\n      std::vector<std::string> args(argv, argv + argc);\n\n      if (std::find(args.begin(), args.end(), \"--use_timestamp_ns\")\n          != args.end()) {{\n        {bench_with_timestamp}\n      }}\n      #ifndef DISABLE_GOOGLE_BENCHMARK\n      else {{\n        ::benchmark::Initialize(&argc, argv);\n        ::benchmark::RunSpecifiedBenchmarks();\n      }}\n      #endif\n\n      return 0;\n    }}\n\n    '''.format(\n            name=f.name,\n            type=typ,\n            year=date.today().year,\n            random_code=f.domain.code('rand_param', typ),\n            code=code,\n  
          bench_with_timestamp=bench_with_timestamp,\n            sizeof=common.sizeof(typ),\n            header=header,\n    )\n\ndef gen_bench_info_from(f, simd, typ):\n    bench_args_init = []\n    bench_args_decl = []\n    bench_args_call = []\n    ## Generate code for parameters\n    for i, arg in enumerate(f.args):\n        p = type_of(arg)\n        qualifiers = ''\n        if p.is_volatile():\n            qualifiers += 'volatile '\n        bench_args_init.append('make_data(sz, &rand_param{n})'.format(n=i))\n        bench_args_decl.append('{} {}* _{}'.format(qualifiers, typ, i))\n        bench_args_call.append(p.code_load(simd, typ, '_{} + i'.format(i)))\n    ## Generate code for bench (using function return type)\n    r = type_of(f.get_return())\n    bench_call = r.code_store(simd, typ, '_r + i',\n                              f.code_call(typ, bench_args_call))\n    return bench_args_init, bench_args_decl, bench_args_call, bench_call\n\ndef gen_bench_asm_function(f, simd, typ, category):\n    bench_args_init, bench_args_decl, \\\n    bench_args_call, bench_call = gen_bench_info_from(f, simd, typ)\n    ## Add function that can easily be parsed to get assembly and plain code\n    return \\\n    '''\n    void {bench_name}__asm__({type}* _r, {bench_args_decl}, int sz) {{\n      __asm__ __volatile__(\"nop\");\n      __asm__ __volatile__(\"nop\");\n      __asm__ __volatile__(\"nop\");\n      __asm__ __volatile__(\"nop\");\n      __asm__ __volatile__(\"nop\");\n      __asm__ __volatile__(\"nop\");\n      // code:{{\n      int n = {step};\n      #if defined(NSIMD_IS_GCC)\n        #pragma GCC unroll 1\n      #elif defined(NSIMD_IS_CLANG)\n        #pragma clang loop unroll(disable)\n      #elif defined(NSIMD_IS_ICC)\n        #pragma unroll(1)\n      #endif\n      for (int i = 0; i < sz; i += n) {{\n        {bench_call};\n      }}\n      // code:}}\n      __asm__ __volatile__(\"nop\");\n      __asm__ __volatile__(\"nop\");\n      __asm__ __volatile__(\"nop\");\n      
__asm__ __volatile__(\"nop\");\n      __asm__ __volatile__(\"nop\");\n      __asm__ __volatile__(\"nop\");\n    }}\n    '''.format(\n        bench_name=gen_bench_name(category, f.function_name),\n        type=typ,\n        step=f.code_ptr_step(typ, simd),\n        bench_call=bench_call,\n        bench_args_decl=common.pprint_commas(bench_args_decl)\n        )\n\ndef gen_bench_from_basic_fun(f, simd, typ, category, unroll=None):\n    bench_args_init, bench_args_decl, bench_args_call, bench_call = \\\n            gen_bench_info_from(f, simd, typ)\n    bench_name = gen_bench_name(category, f.function_name, unroll)\n\n    code_timestamp_ns = \\\n    '''\n    void {bench_name}({type}* _r, {bench_args_decl}, int sz) {{\n      // Normalize size depending on the step so that we're not going out of boundaies\n      // (Required when the size is'nt a multiple of `n`, like for unrolling benches)\n      sz = (sz / {step}) * {step};\n      std::cout << \"{bench_name}({type}), sz = \" << sz << std::endl;\n      {asm_marker}\n      // code: {bench_name}\n      int n = {step};\n      #if defined(NSIMD_IS_GCC)\n        #pragma GCC unroll 1\n      #elif defined(NSIMD_IS_CLANG)\n        #pragma clang loop unroll(disable)\n      #elif defined(NSIMD_IS_ICC)\n        #pragma unroll(1)\n      #endif\n      for (int i = 0; i < sz; i += n) {{\n        {bench_call};\n      }}\n      // code: {bench_name}\n      {asm_marker}\n    }}\n    '''\n\n    return \\\n    '''\n    // -----------------------------------------------------------------------------\n\n    {code_before}\n\n    extern \"C\" {{ void __asm_marker__{bench_name}() {{}} }}\n\n    #ifndef DISABLE_GOOGLE_BENCHMARK\n\n    void {bench_name}(benchmark::State& state, {type}* _r, {bench_args_decl}, int sz) {{\n      // Normalize size depending on the step so that we're not going out of boundaies\n      // (Required when the size is'nt a multiple of `n`, like for unrolling benches)\n      sz = (sz / {step}) * {step};\n      try {{\n     
   for (auto _ : state) {{\n          {asm_marker}\n          // code: {bench_name}\n          int n = {step};\n          #if defined(NSIMD_IS_GCC)\n            #pragma GCC unroll 1\n          #elif defined(NSIMD_IS_CLANG)\n            #pragma clang loop unroll(disable)\n          #elif defined(NSIMD_IS_ICC)\n            #pragma unroll(1)\n          #endif\n          for (int i = 0; i < sz; i += n) {{\n            {bench_call};\n          }}\n          // code: {bench_name}\n          {asm_marker}\n        }}\n      }} catch (std::exception const& e) {{\n        std::string message(\"ERROR: \");\n        message += e.what();\n        state.SkipWithError(message.c_str());\n      }}\n    }}\n\n    BENCHMARK_CAPTURE({bench_name}, {type}, make_data(sz), {bench_args_init}, sz);\n\n    #endif\n    '''.format(\n            bench_name=bench_name,\n            type=typ,\n            step=f.code_ptr_step(typ, simd),\n            bench_call=bench_call,\n            bench_args_init=common.pprint_commas(bench_args_init),\n            bench_args_decl=common.pprint_commas(bench_args_decl),\n            bench_args_call=common.pprint_commas(bench_args_call),\n            code_before=f.bench_code_before(typ),\n            asm_marker=asm_marker(simd, bench_name)\n            )\n\ndef gen_code(f, simd, typ, category):\n    code = None\n    if f.returns_any_type:\n        return TODO(f)\n    ## TODO: We have to refactor this, it's annoying to add every possible signatures...\n    if f.match_sig('v * v v') or f.match_sig('v * v v v') \\\n        or f.match_sig('l * v v') or f.match_sig('l * l l') \\\n        or f.match_sig('l * l') or f.match_sig('v * v') \\\n        or f.match_sig('s * s') \\\n        or f.match_sig('s * s s') \\\n        or f.match_sig('s * s s s') \\\n        or f.match_sig('vcpu * vcpu') \\\n        or f.match_sig('vcpu * vcpu vcpu') \\\n        or f.match_sig('vcpu * vcpu vcpu vcpu') \\\n        or f.match_sig('lcpu * lcpu') \\\n        or f.match_sig('lcpu * lcpu 
lcpu') \\\n        or f.match_sig('lcpu * vcpu vcpu') \\\n        or f.match_sig('vcpu * lcpu vcpu vcpu') \\\n        or f.match_sig('volatile-s * volatile-s') \\\n        or f.match_sig('volatile-s * volatile-s volatile-s') \\\n        or f.match_sig('volatile-s * volatile-s volatile-s volatile-s') \\\n        or f.match_sig('volatile-ls * volatile-s') \\\n        or f.match_sig('volatile-ls * volatile-s volatile-s') \\\n        or f.match_sig('volatile-ls * volatile-ls') \\\n        or f.match_sig('volatile-ls * volatile-ls volatile-ls') \\\n        or f.match_sig('volatile-s * volatile-ls volatile-s volatile-s') \\\n        or f.match_sig('boost::simd::pack * boost::simd::pack') \\\n        or f.match_sig('boost::simd::pack * boost::simd::pack boost::simd::pack') \\\n        or f.match_sig('boost::simd::pack * boost::simd::pack boost::simd::pack boost::simd::pack') \\\n        or f.match_sig('boost::simd::lpack * boost::simd::pack') \\\n        or f.match_sig('boost::simd::lpack * boost::simd::pack boost::simd::pack') \\\n        or f.match_sig('mipp::reg * mipp::reg') \\\n        or f.match_sig('mipp::reg * mipp::reg mipp::reg') \\\n        or f.match_sig('mipp::msk * mipp::reg') \\\n        or f.match_sig('mipp::msk * mipp::reg mipp::reg') \\\n        or f.match_sig('v * l v v'):\n        code = gen_bench_from_basic_fun(f, simd, typ, category=category)\n    if f.match_sig('p * l'):\n        return TODO(f)\n    if f.match_sig('l * p'):\n        return TODO(f)\n    if f.match_sig('v * s'):\n        return TODO(f)\n    if f.match_sig('l * p'):\n        return TODO(f)\n    if f.match_sig('p *'):\n        return TODO(f)\n    if f.match_sig('v * v p'):\n        return TODO(f)\n    if code is None:\n        raise BenchError('Unable to generate bench for signature: ' + \\\n                         f.signature)\n    return code\n\ndef gen_bench_unrolls(f, simd, typ, category):\n    code = ''\n    sig = f.signature\n    for unroll in [2, 3, 4]:\n        f = 
nsimd_unrolled_fun_from_sig(sig, unroll)\n        code += gen_bench_from_basic_fun(f, simd, typ, category=category,\n                                         unroll=unroll)\n    return code\n\ndef gen_bench_against(f, simd, typ, against):\n    code = ''\n    # \"against\" dict looks like: { simd: { type: { name: sig } } }\n    for s in [simd, '*']:\n        if not s in against:\n            continue\n        for t in [typ, '*']:\n            if not t in against[s]:\n                continue\n            for category, f in against[s][t].items():\n                # Allow function to be simple str (you use this most of the\n                # time)\n                if isinstance(f, str):\n                    f = fun_from_sig(f)\n                # Now that we have a `Fun` type, we can generate code\n                code += gen_code(f, simd, typ, category=category)\n    return code\n\ndef gen_bench_with_timestamp(f, simd, typ, category, unroll=None):\n    code = ''\n    bench_args_init, bench_args_decl, bench_args_call, bench_call = \\\n            gen_bench_info_from(f, simd, typ)\n    bench_name = gen_bench_name(category, f.function_name, unroll)\n    bench_args_decl = ''\n    bench_args_call = ''\n    for i, arg in enumerate(f.args):\n        bench_args_decl += typ + ' * data' + str(i) + ' = make_data(sz, &rand_param' + str(i) + ');' + '\\n'\n        if i != 0: bench_args_call += ', '\n        bench_args_call += 'data' + str(i)\n    code += \\\n      '''\n      {{\n        // Bench\n        {typ} * r = make_data(sz);\n        {bench_args_decl}\n        double elapsed_times_ns[nb_runs] = {{ }}; // Must be at least 10000\n        {typ} sum = {{ }};\n        for (size_t run = 0; run < nb_runs; ++run) {{\n          double const t0 = timestamp_ns();\n          {bench_name}(r, {bench_args_call}, 1000);\n          double const t1 = timestamp_ns();\n          elapsed_times_ns[run] = (t1 - t0) / double(sz);\n          // Compute sum\n          if (rand() % 2) {{\n            
sum += std::accumulate(r, r + sz, {typ}());\n          }} else {{\n            sum -= std::accumulate(r, r + sz, {typ}());\n          }}\n        }}\n        // Save sum and elapsed time\n        std::sort(elapsed_times_ns, elapsed_times_ns + nb_runs);\n        size_t const i_start = nb_runs / 2 - 10;\n        size_t const i_end = nb_runs / 2 + 10;\n        sums[\"{bench_name}\"] =\n          std::make_pair(sum, std::accumulate(elapsed_times_ns + i_start, elapsed_times_ns + i_end, 0.0) / double(i_end - i_start));\n        // Number of elapsed times\n        std::map<double, int> nb_per_elapsed_time;\n        for (size_t run = 0; run < nb_runs; ++run) {{\n          ++nb_per_elapsed_time[(i64(elapsed_times_ns[run] * 100)) / 100.0];\n        }}\n        // Draw gnuplot\n        std::system(\"mkdir -p gnuplot\");\n        std::string const dat_filename = \"gnuplot/benches.cxx_adv.{bench_name}.dat\";\n        std::ofstream dat_file(dat_filename);\n        for (auto const & elapsed_time_nb : nb_per_elapsed_time) {{\n          dat_file << elapsed_time_nb.first << \" \" << elapsed_time_nb.second << \"\\\\n\";\n        }}\n        std::string const gnuplot_filename = \"gnuplot/benches.cxx_adv.{bench_name}.gnuplot\";\n        std::ofstream gnuplot_file(gnuplot_filename);\n        gnuplot_file << \"set term svg\" << \"\\\\n\";\n        gnuplot_file << \"set output \\\\\"benches.cxx_adv.{bench_name}.svg\\\\\"\" << \"\\\\n\";\n        gnuplot_file << \"set xlabel \\\\\"Time in nanoseconds (lower is better)\\\\\"\" << \"\\\\n\";\n        gnuplot_file << \"set ylabel \\\\\"Number of runs\\\\\"\" << \"\\\\n\";\n        gnuplot_file << \"\\\\n\";\n        gnuplot_file << \"set style line 1 \\\\\\\\\" << \"\\\\n\";\n        gnuplot_file << \"    linecolor rgb '#db284c' \\\\\\\\\" << \"\\\\n\";\n        gnuplot_file << \"    linetype 1 linewidth 2\" << \"\\\\n\";\n        gnuplot_file << \"\\\\n\";\n        gnuplot_file << \"plot '\" << dat_filename << \"' with linespoints linestyle 
1\" << \"\\\\n\";\n        std::system((\"cd gnuplot && gnuplot \\\\\"\" + gnuplot_filename + \"\\\\\"\").c_str());\n      }}\n      '''.format(bench_name=bench_name,\n                  typ=typ,\n                  bench_args_decl=bench_args_decl,\n                  bench_args_call=bench_args_call,\n                  )\n    return code\n\ndef gen_bench_unrolls_with_timestamp(f, simd, typ, category):\n    code = ''\n    for unroll in [2, 3, 4]:\n        code += gen_bench_with_timestamp(f, simd, typ, category=category,\n                                         unroll=unroll)\n    return code\n\ndef gen_bench_against_with_timestamp(f, simd, typ, against):\n    code = ''\n    # \"against\" dict looks like: { simd: { type: { name: sig } } }\n    for s in [simd, '*']:\n        if not s in against:\n            continue\n        for t in [typ, '*']:\n            if not t in against[s]:\n                continue\n            for category, f in against[s][t].items():\n                # Allow function to be simple str (you use this most of the\n                # time)\n                if isinstance(f, str):\n                    f = fun_from_sig(f)\n                # Now that we have a `Fun` type, we can generate code\n                code += gen_bench_with_timestamp(f, simd, typ, category)\n    return code\n\ndef gen_bench(f, simd, typ):\n    ## TODO\n    path = gen_filename(f, simd, typ)\n    ## Check if we need to create the file\n    if not common.can_create_filename(_opts, path):\n        return\n    ## Generate specific code for the bench\n    category = common.nsimd_category(simd)\n    code = gen_code(f, simd, typ, category=category)\n    if code is None:\n        return\n    ## Now aggregate every parts\n    bench = ''\n    #bench += gen_bench_asm_function(f, typ, category)\n    bench += gen_bench_against(f, 'cpu', typ, f.bench_against_cpu())\n    bench += code\n    bench += gen_bench_unrolls(f, simd, typ, category)\n    bench += gen_bench_against(f, simd, typ, 
f.bench_against_libs())\n    ## bench_with_timestamp\n    bench_with_timestamp = ''\n    bench_with_timestamp += 'std::map<std::string, std::pair<' + typ + ', double>> sums;' + '\\n'\n    bench_with_timestamp += 'size_t const nb_runs = 10 * 1000;' + '\\n'\n    bench_with_timestamp += gen_bench_against_with_timestamp(f, 'cpu', typ, f.bench_against_cpu())\n    bench_with_timestamp += gen_bench_with_timestamp(f, simd, typ, category)\n    bench_with_timestamp += gen_bench_unrolls_with_timestamp(f, simd, typ, category)\n    bench_with_timestamp += gen_bench_against_with_timestamp(f, simd, typ, f.bench_against_libs())\n    bench_with_timestamp += '''\n                            std::string json = \"\";\n                            json += \"{{\\\\n\";\n                            json += \"  \\\\\"benchmarks\\\\\": [\\\\n\";\n\n                            for (auto const & bench_name_sum_time : sums) {{\n                              std::string const & bench_name = bench_name_sum_time.first;\n                              {typ} const & sum = bench_name_sum_time.second.first;\n                              double const & elapsed_time_ns = bench_name_sum_time.second.second;\n\n                              json += \"  {{\" \"\\\\n\";\n                              json += \"    \\\\\"name\\\\\": \\\\\"\" + bench_name + \"/{typ}\\\\\",\" + \"\\\\n\";\n                              json += \"    \\\\\"real_time\\\\\": \" + std::to_string(elapsed_time_ns) + \",\" + \"\\\\n\";\n                              json += \"    \\\\\"sum\\\\\": \" + std::string(std::isfinite(sum) ? \"\" : \"\\\\\"\") + std::to_string(sum) + std::string(std::isfinite(sum) ? 
\"\" : \"\\\\\"\") + \",\" + \"\\\\n\";\n                              json += \"    \\\\\"time_unit\\\\\": \\\\\"ns\\\\\"\\\\n\";\n                              json += \"  }}\";\n                              if (&bench_name_sum_time != &*sums.rbegin()) {{\n                                json += \",\";\n                              }}\n                              json += \"\\\\n\";\n                            }}\n\n                            json += \"  ]\\\\n\";\n                            json += \"}}\\\\n\";\n\n                            std::cout << json << std::flush;\n                            '''.format(typ=typ)\n    ## Finalize code\n    code = gen_bench_from_code(f, typ, bench, '') # bench_with_timestamp\n    ## Write file\n    with common.open_utf8(_opts, path) as f:\n        f.write(code)\n    ## Clang-format it!\n    common.clang_format(_opts, path)\n\n# -----------------------------------------------------------------------------\n# Entry point\n\ndef doit(opts):\n    global _opts\n    _opts = opts\n    common.myprint(opts, 'Generating benches')\n    for f in functions.values():\n        if not f.do_bench:\n            if opts.verbose:\n                common.myprint(opts, 'Skipping bench: {}'.format(f.name))\n            continue\n        # WE MUST GENERATE CODE FOR EACH SIMD EXTENSION AS OTHER LIBRARY\n        # USUALLY DO NOT PROPOSE A GENERIC INTERFACE\n        for simd in _opts.simd:\n            ## FIXME\n            if simd in ['neon128', 'cpu']:\n                continue\n            for typ in f.types:\n                ## FIXME\n                if typ == 'f16':\n                    continue\n                ## Skip non-matching benches\n                if opts.match and not opts.match.match(f.name):\n                    continue\n                ## FIXME\n                if f.name in ['gamma', 'lgamma', 'ziplo', 'ziphi',\n                              'unziphi', 'unziplo']:\n                    continue\n                
gen_bench(f, simd, typ)\n"
  },
  {
    "path": "egg/gen_doc.py",
    "content": "# Use utf-8 encoding\n# -*- coding: utf-8 -*-\n\n# Copyright (c) 2021 Agenium Scale\n#\n# Permission is hereby granted, free of charge, to any person obtaining a copy\n# of this software and associated documentation files (the \"Software\"), to deal\n# in the Software without restriction, including without limitation the rights\n# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell\n# copies of the Software, and to permit persons to whom the Software is\n# furnished to do so, subject to the following conditions:\n#\n# The above copyright notice and this permission notice shall be included in all\n# copies or substantial portions of the Software.\n#\n# THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\n# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\n# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\n# SOFTWARE.\n\nimport os\nimport platform\nimport io\nimport sys\nimport subprocess\nimport common\nimport collections\nimport operators\nimport re\nimport string\n\ncategories = operators.categories\noperators = operators.operators\n\n# -----------------------------------------------------------------------------\n# Get output of command\n\ndef get_command_output(args):\n    p = subprocess.Popen(args, stdout=subprocess.PIPE)\n    lines = p.communicate()[0].split('\\n')[0:-1]\n    return '\\n'.join(['    {}'.format(l) for l in lines])\n\n# -----------------------------------------------------------------------------\n\ndef gen_overview(opts):\n    filename = common.get_markdown_file(opts, 'overview')\n    if not common.can_create_filename(opts, filename):\n        return\n    with common.open_utf8(opts, filename) as 
fout:\n        fout.write('''# Overview\n\n## NSIMD scalar types\n\nTheir names follow the following pattern: `Sxx` where\n\n- `S` is `i` for signed integers, `u` for unsigned integer or `f` for\n  floatting point number.\n- `xx` is the number of bits taken to represent the number.\n\nFull list of scalar types:\n\n''')\n        for t in common.types:\n            fout.write('- `{}`\\n'.format(t))\n        fout.write('''\n## NSIMD generic SIMD vector types\n\nIn NSIMD, we call a platform an architecture e.g. Intel, ARM, POWERPC. We call\nSIMD extension a set of low-level functions and types provided by hardware\nvendors to access SIMD units. Examples include SSE2, SSE42, AVX, ...  When\ncompiling the generic SIMD vector types represents a SIMD register of the\ntarget. Examples are a `__m128` for Intel SSE, `__m512` for Intel AVX-512 or\n`svfloat32_t` for Arm SVE.\n\nTheir names follow the following pattern:\n\n- C base API: `vSCALAR` where `SCALAR` is a one of scalar type listed above.\n- C advanced API: `nsimd_pack_SCALAR` where `SCALAR` is a one of scalar type\n  listed above.\n- C++ advanced API: `nsimd::pack<SCALAR>` where `SCALAR` is a one of scalar\n  type listed above.\n\nFull list of SIMD vector types:\n\n| Base type | C base API | C advanced API | C++ advanced API |\n|-----------|------------|----------------|------------------|\n''')\n\n        fout.write('\\n'.join([\n        '| `{typ}` | `v{typ}` | `nsimd_pack_{typ}` | `nsimd::pack<{typ}>` |'. \\\n        format(typ=typ) for typ in common.types]))\n\n        fout.write('''\n\n## C/C++ base APIs\n\nThese come automatically when you include `nsimd/nsimd.h`. You do *not* need\nto include a header file for having a function. 
Here is a list of supported\nplatforms and their corresponding SIMD extensions.\n\n''')\n        platforms = common.get_platforms(opts)\n        for p in platforms:\n            fout.write('- Platform `{}`\\n'.format(p))\n            for s in platforms[p].get_simd_exts():\n                fout.write('  - `{}`\\n'.format(s))\n        fout.write('''\nEach simd extension has its own set of SIMD types and functions. Types follow\nthe pattern: `nsimd_SIMDEXT_vSCALAR` where\n\n- `SIMDEXT` is the SIMD extensions.\n- `SCALAR` is one of scalar types listed above.\n\nThere are also logical types associated to each SIMD vector type. These types\nare used, for example, to represent the result of a comparison of SIMD vectors.\nThey are usually bit masks. Their name follow the pattern:\n`nsimd_SIMDEXT_vlSCALAR` where\n\n- `SIMDEXT` is the SIMD extensions.\n- `SCALAR` is one of scalar types listed above.\n\nNote 1: Platform `cpu` is a 128 bits SIMD emulation fallback when no SIMD\nextension has been specified or is supported on a given compilation target.\n\nNote 2: as all SIMD extensions of all platforms are different there is no\nneed to put the name of the platform in each identifier.\n\nFunction names follow the pattern: `nsimd_SIMDEXT_FUNCNAME_SCALAR` where\n\n- `SIMDEXT` is the SIMD extensions.\n- `FUNCNAME` is the name of a function e.g. `add` or `sub`.\n- `SCALAR` is one of scalar types listed above.\n\n### Generic identifier\n\nIn the base C API, genericity is achieved using macros.\n\n- `vec(SCALAR)` is a type to represent a SIMD vector containing SCALAR\n  elements.  SCALAR must be one of scalar types listed above.\n- `vecl(SCALAR)` is a type to represent a SIMD vector of logicals for SCALAR\n  elements. SCALAR must be one of scalar types listed above.\n- `vec_a(SCALAR, SIMDEXT)` is a type to represent a SIMD vector containing\n  SCALAR elements for the simd extension SIMDEXT. 
SCALAR must be one of scalar\n  types listed above and SIMDEXT must be a valid SIMD extension.\n- `vecl_a(SCALAR, SIMDEXT)` is a type to represent a SIMD vector of logicals\n  for SCALAR elements for the simd extension SIMDEXT. SCALAR must be one of\n  scalar types listed above and SIMDEXT must be a valid SIMD extension.\n- `vFUNCNAME` takes as input the above types to access the operator FUNCNAME\n  e.g. `vadd`, `vsub`.\n\nIn C++98 and C++03, type traits are available.\n\n- `nsimd::simd_traits<SCALAR, SIMDEXT>::vector` is the SIMD vector type for\n  platform SIMDEXT containing SCALAR elements. SIMDEXT is one of SIMD\n  extension listed above, SCALAR is one of scalar type listed above.\n- `nsimd::simd_traits<SCALAR, SIMDEXT>::vectorl` is the SIMD vector of logicals\n  type for platform SIMDEXT containing SCALAR elements. SIMDEXT is one of\n  SIMD extensions listed above, SCALAR is one of scalar type listed above.\n\nIn C++11 and beyond, type traits are still available but typedefs are also\nprovided.\n\n- `nsimd::vector<SCALAR, SIMDEXT>` is a typedef to\n  `nsimd::simd_traits<SCALAR, SIMDEXT>::vector`.\n- `nsimd::vectorl<SCALAR, SIMDEXT>` is a typedef to\n  `nsimd::simd_traits<SCALAR, SIMDEXT>::vectorl`.\n\nThe C++20 API does not bring different types for SIMD registers nor other\nway to access the other SIMD types. It only brings concepts instead of usual\n`typename`s. For more informations cf. <concepts.md>.\n\nNote that all macro and functions available in plain C are still available in\nC++.\n\n### List of operators provided by the base APIs\n\nIn the documentation we use interchangeably the terms \"function\" and\n\"operator\".  
For each operator FUNCNAME a C function (also available in C++)\nnamed `nsimd_SIMDEXT_FUNCNAME_SCALAR` is available for each SCALAR type unless\nspecified otherwise.\n\nFor each FUNCNAME, a C macro (also available in C++) named `vFUNCNAME` is\navailable and takes as its last argument a SCALAR type.\n\nFor each FUNCNAME, a C macro (also available in C++) named `vFUNCNAME_a` is\navailable and takes as its two last argument a SCALAR type and a SIMDEXT.\n\nFor each FUNCNAME, a C++ function in namespace `nsimd` named `FUNCNAME` is\navailable. It takes as its last argument the SCALAR type and can optionnally\ntake the SIMDEXT as its last last argument.\n\nFor example, for the addition of two SIMD vectors `a` and `b` here are the\npossibilities:\n\n```c++\nc = nsimd_add_avx_f32(a, b); // use AVX\nc = nsimd::add(a, b, f32()); // use detected SIMDEXT\nc = nsimd::add(a, b, f32(), avx()); // force AVX even if detected SIMDEXT is not AVX\nc = vadd(a, b, f32); // use detected SIMDEXT\nc = vadd_e(a, b, f32, avx); // force AVX even if detected SIMDEXT is not AVX\n```\n\nHere is a list of available FUNCNAME.\n\n''')\n        for op_name, operator in operators.items():\n            return_typ = common.get_one_type_generic(operator.params[0],\n                                                     'SCALAR')\n            func = operator.name\n            args = ', '.join([common.get_one_type_generic(p, 'SCALAR') + \\\n                              ' a' + str(count) for count, p in \\\n                              enumerate(operator.params[1:])])\n            fout.write('- `{} {}({});`  \\n'.format(return_typ, func, args))\n            if len(operator.types) < len(common.types):\n                typs = ', '.join(['{}'.format(t) for t in operator.types])\n                fout.write('  Only available for {}\\n'.format(typs))\n\n        fout.write('''\n\n## C advanced API (only available in C11)\n\nThe C advanced API takes advantage of the C11 `_Generic` keyword to provide\nfunction 
overloading. Unlike the base API described above there is no need to\npass as arguments the base type of the SIMD extension. The informations are\ncontained in the types provided by this API.\n\n- `nsimd_pack_SCALAR_SIMDEXT` represents a SIMD vectors containing\n  SCALAR elements of SIMD extension SIMDEXT.\n- `nsimd::packl_SCALAR_SIMDEXT` represents a SIMD vectors of logicals\n  for SCALAR elements of SIMD extension SIMDEXT.\n\nThere are versions of the above type without SIMDEXT for which the targeted\nSIMD extension is automatically chosen.\n\n- `nsimd_pack_SCALAR` represents a SIMD vectors containing SCALAR elements.\n- `nsimd::packl_SCALAR` represents a SIMD vectors of logicals for SCALAR\n  elements.\n\nGeneric types are also available:\n\n- `nsimd_pack(SCALAR)` is a type to represent a SIMD vector containing SCALAR\n  elements.  SCALAR must be one of scalar types listed above.\n- `nsimd_packl(SCALAR)` is a type to represent a SIMD vector of logicals for\n  SCALAR elements. SCALAR must be one of scalar types listed above.\n- `nsimd_pack_a(SCALAR, SIMDEXT)` is a type to represent a SIMD vector\n  containing SCALAR elements for the simd extension SIMDEXT. SCALAR must be one\n  of scalar types listed above and SIMDEXT must be a valid SIMD extension.\n- `nsimd_packl_a(SCALAR, SIMDEXT)` is a type to represent a SIMD vector of\n  logicals for SCALAR elements for the simd extension SIMDEXT. SCALAR must be\n  one of scalar types listed above and SIMDEXT must be a valid SIMD extension.\n\nFinally, operators are follow the naming: `nsimd_FUNCNAME` e.g. `nsimd_add`,\n`nsimd_sub`.\n\n## C++ advanced API\n\nThe C++ advanced API is called advanced not because it requires C++11 or above\nbut because it makes use of the particular implementation of ARM SVE by ARM\nin their compiler. We do not know if GCC (and possibly MSVC in the distant\nfuture) will use the same approach. 
Anyway the current implementation allows\nus to put SVE SIMD vectors inside some kind of structs that behave like\nstandard structs. If you want to be sure to write portable code do *not* use\nthis API. Two new types are available.\n\n- `nsimd::pack<SCALAR, N, SIMDEXT>` represents `N` SIMD vectors containing\n  SCALAR elements of SIMD extension SIMDEXT. You can specify only the first\n  template argument. The second defaults to 1 while the third defaults to the\n  detected SIMDEXT.\n- `nsimd::packl<SCALAR, N, SIMDEXT>` represents `N` SIMD vectors of logical\n  type containing SCALAR elements of SIMD extension SIMDEXT. You can specify\n  only the first template argument. The second defaults to 1 while the third\n  defaults to the detected SIMDEXT.\n\nUse N > 1 when declaring packs to have an unroll of N. This is particularily\nuseful on ARM.\n\nFunctions that takes packs do not take any other argument unless specified\notherwise e.g. the load family of funtions. It is impossible to determine\nthe kind of pack (unroll and SIMDEXT) from the type of a pointer. Therefore\nin this case, the last argument must be a pack and this same type will then\nreturn. Also some functions are available as C++ operators. 
They follow the\nnaming: `nsimd::FUNCNAME`.\n''')\n\n# -----------------------------------------------------------------------------\n\ndef gen_doc(opts):\n    common.myprint(opts, 'Generating doc for each function')\n\n    # Build tree for api.md\n    api = dict()\n    for _, operator in operators.items():\n        for c in operator.categories:\n            if c not in api:\n                api[c] = [operator]\n            else:\n                api[c].append(operator)\n\n    # api.md\n    # filename = os.path.join(opts.script_dir, '..','doc', 'markdown', 'api.md')\n    filename = common.get_markdown_file(opts, 'api')\n    if common.can_create_filename(opts, filename):\n        with common.open_utf8(opts, filename) as fout:\n            fout.write('# General API\\n\\n')\n            fout.write('- [Memory function](memory.md)\\n')\n            fout.write('- [Float16 related functions](fp16.md)\\n')\n            fout.write('- [Defines provided by NSIMD](defines.md)\\n')\n            fout.write('- [NSIMD pack and related functions](pack.md)\\n\\n')\n            fout.write('- [NSIMD C++20 concepts](concepts.md)\\n\\n')\n            fout.write('# SIMD operators\\n')\n            for c, ops in api.items():\n                if len(ops) == 0:\n                    continue\n                fout.write('\\n## {}\\n\\n'.format(c.title))\n                for op in ops:\n                    Full_name = op.full_name[0].upper() + op.full_name[1:]\n                    fout.write('- [{} ({})](api_{}.md)\\n'.format(\n                        Full_name, op.name, common.to_filename(op.name)))\n\n    # helper to get list of function signatures\n    def to_string(var):\n        sigs = [var] if type(var) == str or not hasattr(var, '__iter__') \\\n                     else list(var)\n        for i in range(0, len(sigs)):\n            sigs[i] = re.sub('[ \\n\\t\\r]+', ' ', sigs[i])\n        return '\\n'.join(sigs)\n\n    # Operators (one file per operator)\n    # dirname = 
os.path.join(opts.script_dir, '..','doc', 'markdown')\n    dirname = common.get_markdown_dir(opts)\n    common.mkdir_p(dirname)\n    for op_name, operator in operators.items():\n        # Skip non-matching doc\n        if opts.match and not opts.match.match(op_name):\n            continue\n        # filename = os.path.join(dirname, 'api_{}.md'.format(common.to_filename(\n        #                operator.name)))\n        filename = common.get_markdown_api_file(opts, operator.name)\n        if not common.can_create_filename(opts, filename):\n            continue\n        Full_name = operator.full_name[0].upper() + operator.full_name[1:]\n        with common.open_utf8(opts, filename) as fout:\n            fout.write('# {}\\n\\n'.format(Full_name))\n            fout.write('## Description\\n\\n')\n            fout.write(operator.desc)\n            fout.write('\\n\\n## C base API (generic)\\n\\n')\n            fout.write('```c\\n')\n            fout.write(to_string(operator.get_generic_signature('c_base')))\n            fout.write('\\n```\\n\\n')\n            fout.write('\\n\\n## C advanced API (generic, requires C11)\\n\\n')\n            fout.write('```c\\n')\n            fout.write(to_string(operator.get_generic_signature('c_adv')))\n            fout.write('\\n```\\n\\n')\n            fout.write('## C++ base API (generic)\\n\\n')\n            fout.write('```c++\\n')\n            fout.write(to_string(operator.get_generic_signature('cxx_base')))\n            fout.write('\\n```\\n\\n')\n            fout.write('## C++ advanced API\\n\\n')\n            fout.write('```c++\\n')\n            fout.write(to_string(operator.get_generic_signature('cxx_adv'). 
\\\n                                 values()))\n            fout.write('\\n```\\n\\n')\n            fout.write('## C base API (architecture specifics)')\n            for simd_ext in opts.simd:\n                fout.write('\\n\\n### {}\\n\\n'.format(simd_ext.upper()))\n                fout.write('```c\\n')\n                for typ in operator.types:\n                    fout.write(operator.get_signature(typ, 'c_base', simd_ext))\n                    fout.write(';\\n')\n                fout.write('```')\n            fout.write('\\n\\n## C++ base API (architecture specifics)')\n            for simd_ext in opts.simd:\n                fout.write('\\n\\n### {}\\n\\n'.format(simd_ext.upper()))\n                fout.write('```c\\n')\n                for typ in operator.types:\n                    fout.write(operator.get_signature(typ, 'cxx_base',\n                                                      simd_ext))\n                    fout.write(';\\n')\n                fout.write('```')\n\n# -----------------------------------------------------------------------------\n\ndef gen_modules_md(opts):\n    common.myprint(opts, 'Generating modules.md')\n    mods = common.get_modules(opts)\n    ndms = []\n    for mod in mods:\n        name = eval('mods[mod].{}.hatch.name()'.format(mod))\n        desc = eval('mods[mod].{}.hatch.desc()'.format(mod))\n        ndms.append([name, desc, mod])\n    filename = common.get_markdown_file(opts, 'modules')\n    if not common.can_create_filename(opts, filename):\n        return\n    with common.open_utf8(opts, filename) as fout:\n        fout.write('''# Modules\n\nNSIMD comes with several additional modules. A module provides a set of\nfunctionnalities that are usually not at the same level as SIMD intrinsics\nand/or that do not provide all C and C++ APIs. These functionnalities are\ngiven with the library because they make heavy use of NSIMD core which\nabstract SIMD intrinsics. 
Below is the exhaustive list of modules.\n\n''')\n        for ndm in ndms:\n            fout.write('- [{}](module_{}_overview.md)  \\n'.format(ndm[0],\n                                                                  ndm[2]))\n            fout.write('\\n'.join(['  {}'.format(line.strip()) \\\n                                  for line in ndm[1].split('\\n')]))\n            fout.write('\\n\\n')\n\n# -----------------------------------------------------------------------------\n\ndef build_exe_for_doc(opts):\n    if not opts.list_files:\n        doc_dir = os.path.join(opts.script_dir, '..', 'doc')\n        if platform.system() == 'Windows':\n            code = os.system('cd {} && nmake /F Makefile.win'. \\\n                             format(os.path.normpath(doc_dir)))\n        else:\n            code = os.system('cd {} && make -f Makefile.nix'. \\\n                             format(os.path.normpath(doc_dir)))\n        if code == 0:\n            common.myprint(opts, 'Build successful')\n        else:\n            common.myprint(opts, 'Build failed')\n\n# -----------------------------------------------------------------------------\n\ndef gen_what_is_wrapped(opts):\n    common.myprint(opts, 'Generating \"which intrinsics are wrapped\"')\n    build_exe_for_doc(opts)\n    wrapped = 'what_is_wrapped.exe' if platform.system() == 'Windows' \\\n                                    else 'what_is_wrapped'\n    doc_dir = os.path.join(opts.script_dir, '..', 'doc')\n    full_path_wrapped = os.path.join(doc_dir, wrapped)\n    if not os.path.isfile(full_path_wrapped):\n        common.myprint(opts, '{} not found'.format(wrapped))\n        return\n\n    # Content for indexing files created in this function\n    index = '# Intrinsics that are wrapped\\n'\n\n    # Build command line\n    cmd0 = '{} {},{},{},{},{},{}'.format(full_path_wrapped, common.in0,\n                                         common.in1, common.in2, common.in3,\n                                         
common.in4, common.in5)\n\n    # For now we only list Intel, Arm and POWERPC intrinsics\n    simd_exts = common.x86_simds + common.arm_simds + common.ppc_simds\n    for p in common.get_platforms(opts):\n        index_simds = ''\n        for simd_ext in opts.platforms_list[p].get_simd_exts():\n            if simd_ext not in simd_exts:\n                continue\n            md = os.path.join(common.get_markdown_dir(opts),\n                              'wrapped_intrinsics_for_{}.md'.format(simd_ext))\n            index_simds += '- [{}](wrapped_intrinsics_for_{}.md)\\n'. \\\n                           format(simd_ext.upper(), simd_ext)\n            ops = [[], [], [], []]\n            for op_name, operator in operators.items():\n                if operator.src:\n                    continue\n                c_src = os.path.join(opts.include_dir, p, simd_ext,\n                                     '{}.h'.format(op_name))\n                ops[operator.output_to].append('{} \"{}\"'. \\\n                                               format(op_name, c_src))\n            if not common.can_create_filename(opts, md):\n                continue\n            with common.open_utf8(opts, md) as fout:\n                fout.write('# Intrinsics wrapped for {}\\n\\n'. 
\\\n                           format(simd_ext.upper()))\n                fout.write('Notations are as follows:\\n'\n                           '- `T` for trick usually using other intrinsics\\n'\n                           '- `E` for scalar emulation\\n'\n                           '- `NOOP` for no operation\\n'\n                           '- `NA` means the operator does not exist for '\n                              'the given type\\n'\n                           '- `intrinsic` for the actual wrapped intrinsic\\n'\n                           '\\n')\n            cmd = '{} {} same {} >> \"{}\"'.format(cmd0, simd_ext,\n                    ' '.join(ops[common.OUTPUT_TO_SAME_TYPE]), md)\n            if os.system(cmd) != 0:\n                common.myprint(opts, 'Unable to generate markdown for '\n                                     '\"same\"')\n                continue\n\n            cmd = '{} {} same_size {} >> \"{}\"'.format(cmd0, simd_ext,\n                    ' '.join(ops[common.OUTPUT_TO_SAME_SIZE_TYPES]), md)\n            if os.system(cmd) != 0:\n                common.myprint(opts, 'Unable to generate markdown for '\n                                     '\"same_size\"')\n                continue\n\n            cmd = '{} {} bigger_size {} >> \"{}\"'.format(cmd0, simd_ext,\n                    ' '.join(ops[common.OUTPUT_TO_UP_TYPES]), md)\n            if os.system(cmd) != 0:\n                common.myprint(opts, 'Unable to generate markdown for '\n                                     '\"bigger_size\"')\n                continue\n\n            cmd = '{} {} lesser_size {} >> \"{}\"'.format(cmd0, simd_ext,\n                    ' '.join(ops[common.OUTPUT_TO_DOWN_TYPES]), md)\n            if os.system(cmd) != 0:\n                common.myprint(opts, 'Unable to generate markdown for '\n                                     '\"lesser_size\"')\n                continue\n        if index_simds != '':\n            index += '\\n## Platform {}\\n\\n'.format(p)\n            
index += index_simds\n\n    md = os.path.join(common.get_markdown_dir(opts), 'wrapped_intrinsics.md')\n    if common.can_create_filename(opts, md):\n        with common.open_utf8(opts, md) as fout:\n            fout.write(index)\n\n# -----------------------------------------------------------------------------\n\ndef get_html_dir(opts):\n    return os.path.join(opts.script_dir, '..', 'doc', 'html')\n\ndef get_html_api_file(opts, name, module=''):\n    root = get_html_dir(opts)\n    op_name = to_filename(name)\n    if module == '':\n        return os.path.join(root, 'api_{}.html'.format(op_name))\n    else:\n        return os.path.join(root, 'module_{}_api_{}.html'. \\\n                                  format(module, op_name))\n\ndef get_html_file(opts, name, module=''):\n    root = get_html_dir(opts)\n    op_name = to_filename(name)\n    if module == '':\n        return os.path.join(root, '{}.html'.format(op_name))\n    else:\n        return os.path.join(root, 'module_{}_{}.html'.format(module, op_name))\n\ndoc_header = '''\\\n<!DOCTYPE html>\n\n<html>\n  <head>\n    <meta charset=\\\"utf-8\\\">\n    <meta name=\\\"viewport\\\" content=\\\"width=device-width, initial-scale=1\\\">\n    <title>{}</title>\n    <style type=\\\"text/css\\\">\n      body {{\n        /*margin:40px auto;*/\n        margin:10px auto;\n        /*max-width:650px;*/\n        max-width:800px;\n        /*line-height:1.6;*/\n        line-height:1.4;\n        /*font-size:18px;*/\n        color:#444;\n        padding: 0 10px;\n      }}\n      h1,h2,h3 {{\n        line-height: 1.2;\n      }}\n      table {{\n        border-collapse: collapse;\n        border: 0px solid gray;\n        width: 100%;\n      }}\n      th, td {{\n        border: 2px solid gray;\n        padding: 0px 1em 0px 1em;\n      }}\n    </style>\n    <!-- https://www.mathjax.org/#gettingstarted -->\n    <script src=\\\"assets/polyfill.min.js\\\"></script>\n    <script id=\\\"MathJax-script\\\" async 
src=\\\"assets/tex-svg.js\\\"></script>\n    <!-- Highlight.js -->\n    <link rel=\\\"stylesheet\\\" href= \\\"assets/highlight.js.default.min.css\\\">\n    <script src=\\\"assets/highlight.min.js\\\"></script>\n    <script src=\\\"assets/cpp.min.js\\\"></script>\n    <script>hljs.initHighlightingOnLoad();</script>\n  </head>\n<body>\n\n<div style=\"text-align: center; margin-bottom: 1em;\">\n  <img src=\\\"img/logo.svg\\\">\n  <hr>\n</div>\n<div style=\"text-align: center; margin-bottom: 1em;\">\n  <b>NSIMD documentation</b>\n</div>\n<div style=\"text-align: center; margin-bottom: 1em;\">\n  <a href=\\\"index.html\\\">Index</a> |\n  <a href=\\\"tutorial.html\\\">Tutorial</a> |\n  <a href=\\\"faq.html\\\">FAQ</a> |\n  <a href=\\\"contribute.html\\\">Contribute</a> |\n  <a href=\\\"overview.html\\\">API overview</a> |\n  <a href=\\\"api.html\\\">API reference</a> |\n  <a href=\\\"wrapped_intrinsics.html\\\">Wrapped intrinsics</a> |\n  <a href=\\\"modules.html\\\">Modules</a>\n  <hr>\n</div>\n{}\n'''\n\ndoc_footer = '''\\\n  </body>\n</html>\n'''\n\ndef get_html_header(opts, title, filename):\n    # check if filename is part of a module doc\n    for mod in opts.modules_list:\n        if filename.startswith('module_{}_'.format(mod)):\n            links = eval('opts.modules_list[mod].{}.hatch.doc_menu()'. \\\n                         format(mod))\n            name = eval('opts.modules_list[mod].{}.hatch.name()'.format(mod))\n            html = '<div style=\"text-align: center; margin-bottom: 1em;\">\\n'\n            html += '<b>{} module documentation</b>\\n'.format(name)\n            if len(links) > 0:\n                html += '</div>\\n'\n                html += \\\n                '<div style=\"text-align: center; margin-bottom: 1em;\">\\n'\n                html += ' | '.join(['<a href=\\\"module_{}_{}.html\\\">{}</a>'. 
\\\n                                    format(mod, href, label) \\\n                                    for label, href in links.items()])\n            html += '\\n<hr>\\n</div>\\n'\n            return doc_header.format(title, html)\n    return doc_header.format(title, '')\n\ndef get_html_footer():\n    return doc_footer\n\n# -----------------------------------------------------------------------------\n\ndef gen_doc_html(opts, title):\n    if not opts.list_files:\n        build_exe_for_doc(opts)\n        md2html = 'md2html.exe' if platform.system() == 'Windows' \\\n                                else 'md2html'\n        doc_dir = os.path.join(opts.script_dir, '..', 'doc')\n        full_path_md2html = os.path.join(doc_dir, md2html)\n        if not os.path.isfile(full_path_md2html):\n            common.myprint(opts, '{} not found'.format(md2html))\n            return\n\n    # get all markdown files\n    md_dir = common.get_markdown_dir(opts)\n    html_dir = get_html_dir(opts)\n\n    if not os.path.isdir(html_dir):\n        mkdir_p(html_dir)\n\n    doc_files = []\n    for filename in os.listdir(md_dir):\n        name =  os.path.basename(filename)\n        if name.endswith('.md'):\n            doc_files.append(os.path.splitext(name)[0])\n\n    if opts.list_files:\n        ## list gen files\n        for filename in doc_files:\n            input_name = os.path.join(md_dir, filename + '.md')\n            output_name = os.path.join(html_dir, filename + '.html')\n            print(output_name)\n    else:\n        ## gen html files\n        footer = get_html_footer()\n        tmp_file = os.path.join(doc_dir, 'tmp.html')\n        for filename in doc_files:\n            header = get_html_header(opts, title, filename)\n            input_name = os.path.join(md_dir, filename + '.md')\n            output_name = os.path.join(html_dir, filename + '.html')\n            os.system('{} \"{}\" \"{}\"'.format(full_path_md2html, input_name,\n                                            
tmp_file))\n            with common.open_utf8(opts, output_name) as fout:\n                fout.write(header)\n                with io.open(tmp_file, mode='r', encoding='utf-8') as fin:\n                    fout.write(fin.read())\n                fout.write(footer)\n\ndef gen_html(opts):\n    common.myprint(opts, 'Generating HTML documentation')\n    gen_doc_html(opts, 'NSIMD documentation')\n\n# -----------------------------------------------------------------------------\n\ndef copy_github_file_to_doc(opts, github_filename, doc_filename):\n    common.myprint(opts, 'Copying {} ---> {}'. \\\n                   format(github_filename, doc_filename))\n    if not common.can_create_filename(opts, doc_filename):\n        return\n    with io.open(github_filename, mode='r', encoding='utf-8') as fin:\n        file_content = fin.read()\n    # we replace all links to doc/... by nsimd/...\n    file_content = file_content.replace('doc/markdown/', 'nsimd/')\n    file_content = file_content.replace('doc/', 'nsimd/')\n    # we do not use common.open_utf8 as the copyright is already in content\n    with io.open(doc_filename, mode='w', encoding='utf-8') as fout:\n        fout.write(file_content)\n\n# -----------------------------------------------------------------------------\n\ndef doit(opts):\n    gen_overview(opts)\n    gen_doc(opts)\n    gen_modules_md(opts)\n    gen_what_is_wrapped(opts)\n    root_dir = os.path.join(opts.script_dir, '..')\n    copy_github_file_to_doc(opts,\n                            os.path.join(root_dir, 'README.md'),\n                            common.get_markdown_file(opts, 'index'))\n    copy_github_file_to_doc(opts,\n                            os.path.join(root_dir, 'CONTRIBUTING.md'),\n                            common.get_markdown_file(opts, 'contribute'))\n    gen_html(opts) # This must be last\n"
  },
  {
    "path": "egg/gen_friendly_but_not_optimized.py",
    "content": "# Copyright (c) 2019 Agenium Scale\n#\n# Permission is hereby granted, free of charge, to any person obtaining a copy\n# of this software and associated documentation files (the \"Software\"), to deal\n# in the Software without restriction, including without limitation the rights\n# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell\n# copies of the Software, and to permit persons to whom the Software is\n# furnished to do so, subject to the following conditions:\n#\n# The above copyright notice and this permission notice shall be included in all\n# copies or substantial portions of the Software.\n#\n# THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\n# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\n# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\n# SOFTWARE.\n\nimport common\nimport operators\nimport os\nfrom datetime import date\nimport sys\n\n# -----------------------------------------------------------------------------\n# Generate advanced C++ API\n\ndef get_impl(operator):\n    if operator.params == ['v', 'v', 'v'] or \\\n       operator.params == ['l', 'v', 'v']:\n        return \\\n        '''template <typename T, int N, typename SimdExt, typename S>\n        pack{l}<T, N, SimdExt>\n        operator{cxx_op}(pack<T, N, SimdExt> const &v, S s) {{\n          return {op_name}(v, pack<T, N, SimdExt>(T(s)));\n        }}\n\n        template <typename S, typename T, int N, typename SimdExt>\n        pack{l}<T, N, SimdExt>\n        operator{cxx_op}(S s, pack<T, N, SimdExt> const &v) {{\n          return {op_name}(pack<T, N, SimdExt>(T(s)), v);\n        }}'''.format(l='l' if operator.params[0] == 'l' else '',\n    
                 cxx_op=operator.cxx_operator, op_name=operator.name)\n    if operator.params == ['l', 'l', 'l']:\n        return \\\n        '''template <typename T, int N, typename SimdExt, typename S>\n        packl<T, N, SimdExt>\n        operator{cxx_op}(packl<T, N, SimdExt> const &v, S s) {{\n          return {op_name}(v, packl<T, N, SimdExt>(bool(s)));\n        }}\n\n        template <typename S, typename T, int N, typename SimdExt>\n        packl<T, N, SimdExt>\n        operator{cxx_op}(S s, packl<T, N, SimdExt> const &v) {{\n          return {op_name}(pack<T, N, SimdExt>(bool(s)), v);\n        }}\n\n        template <typename T, typename S, int N, typename SimdExt>\n        packl<T, N, SimdExt> operator{cxx_op}(packl<T, N, SimdExt> const &v,\n                                      packl<S, N, SimdExt> const &w) {{\n          return {op_name}(v, reinterpretl<packl<T, N, SimdExt> >(w));\n        }}'''.format(cxx_op=operator.cxx_operator, op_name=operator.name)\n\n# -----------------------------------------------------------------------------\n# Generate advanced C++ API\n\ndef doit(opts):\n    common.myprint(opts, 'Generating friendly but not optimized advanced '\n                         'C++ API')\n    filename = os.path.join(opts.include_dir, 'friendly_but_not_optimized.hpp')\n    if not common.can_create_filename(opts, filename):\n        return\n    with common.open_utf8(opts, filename) as out:\n        out.write('''#ifndef NSIMD_FRIENDLY_BUT_NOT_OPTIMIZED_HPP\n                     #define NSIMD_FRIENDLY_BUT_NOT_OPTIMIZED_HPP\n\n                     #include <nsimd/nsimd.h>\n                     #include <nsimd/cxx_adv_api.hpp>\n\n                     namespace nsimd {{\n\n                     '''.format(year=date.today().year))\n        for op_name, operator in operators.operators.items():\n            if operator.cxx_operator == None or len(operator.params) != 3 or \\\n               operator.name in ['shl', 'shr']:\n                continue\n          
  out.write('''{hbar}\n\n                         {code}\n\n                         '''.format(hbar=common.hbar, code=get_impl(operator)))\n        out.write('''{hbar}\n\n                     }} // namespace nsimd\n\n                     #endif'''.format(hbar=common.hbar))\n    common.clang_format(opts, filename)\n"
  },
  {
    "path": "egg/gen_modules.py",
    "content": "# Copyright (c) 2019 Agenium Scale\n#\n# Permission is hereby granted, free of charge, to any person obtaining a copy\n# of this software and associated documentation files (the \"Software\"), to deal\n# in the Software without restriction, including without limitation the rights\n# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell\n# copies of the Software, and to permit persons to whom the Software is\n# furnished to do so, subject to the following conditions:\n#\n# The above copyright notice and this permission notice shall be included in all\n# copies or substantial portions of the Software.\n#\n# THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\n# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\n# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\n# SOFTWARE.\n\nimport os\nimport common\n\ndef doit(opts):\n    mods = common.get_modules(opts)\n    for mod in mods:\n        exec('mods[mod].{}.hatch.doit(opts)'.format(mod))\n"
  },
  {
    "path": "egg/gen_scalar_utilities.py",
    "content": "# Copyright (c) 2019 Agenium Scale\n#\n# Permission is hereby granted, free of charge, to any person obtaining a copy\n# of this software and associated documentation files (the \"Software\"), to deal\n# in the Software without restriction, including without limitation the rights\n# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell\n# copies of the Software, and to permit persons to whom the Software is\n# furnished to do so, subject to the following conditions:\n#\n# The above copyright notice and this permission notice shall be included in all\n# copies or substantial portions of the Software.\n#\n# THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\n# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\n# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\n# SOFTWARE.\n\nimport os\nimport common\nimport operators\nimport scalar\nimport cuda\nimport rocm\nimport oneapi\n\n# -----------------------------------------------------------------------------\n\ndef get_gpu_impl(gpu_sig, cuda_impl, rocm_impl, oneapi_sig, oneapi_impl):\n    if cuda_impl == rocm_impl:\n        return '''#if defined(NSIMD_CUDA) || defined(NSIMD_ROCM)\n\n                  inline {gpu_sig} {{\n                    {cuda_impl}\n                  }}\n\n                  #elif defined(NSIMD_ONEAPI)\n\n                  inline {oneapi_sig} {{\n                    {oneapi_impl}\n                  }}\n\n                  #endif'''.format(gpu_sig=gpu_sig, cuda_impl=cuda_impl,\n                                   oneapi_sig=oneapi_sig,\n                                   oneapi_impl=oneapi_impl)\n    else:\n        return '''#if defined(NSIMD_CUDA)\n\n           
       inline {gpu_sig} {{\n                    {cuda_impl}\n                  }}\n\n                  #elif defined(NSIMD_ROCM)\n\n                  inline {gpu_sig} {{\n                    {rocm_impl}\n                  }}\n\n                  #elif defined(NSIMD_ONEAPI)\n\n                  inline {oneapi_sig} {{\n                    {oneapi_impl}\n                  }}\n\n                  #endif'''.format(gpu_sig=gpu_sig, cuda_impl=cuda_impl,\n                                   rocm_impl=rocm_impl, oneapi_sig=oneapi_sig,\n                                   oneapi_impl=oneapi_impl)\n\n# -----------------------------------------------------------------------------\n\ndef doit(opts):\n    common.myprint(opts, 'Generating scalar implementation for CPU and GPU')\n    filename = os.path.join(opts.include_dir, 'scalar_utilities.h')\n    if not common.can_create_filename(opts, filename):\n        return\n    with common.open_utf8(opts, filename) as out:\n        # we declare reinterprets now as we need them\n        scalar_tmp = []\n        gpu_tmp = []\n        oneapi_tmp = []\n        for t in operators.Reinterpret.types:\n            for tt in common.get_output_types(\n                          t, operators.Reinterpret.output_to):\n                scalar_tmp += [operators.Reinterpret(). \\\n                               get_scalar_signature('cpu', t, tt, 'c')]\n                gpu_tmp += [operators.Reinterpret(). \\\n                            get_scalar_signature('gpu', t, tt, 'cxx')]\n                oneapi_tmp += [operators.Reinterpret(). 
\\\n                               get_scalar_signature('oneapi', t, tt, 'cxx')]\n        scalar_reinterpret_decls = '\\n'.join(['NSIMD_INLINE ' + sig + ';' \\\n                                              for sig in scalar_tmp])\n        gpu_reinterpret_decls = '\\n'.join(['inline ' + sig + ';' \\\n                                           for sig in gpu_tmp])\n        oneapi_reinterpret_decls = '\\n'.join(['inline ' + sig + ';' \\\n                                              for sig in oneapi_tmp])\n        sleef_decls = ''\n        for op in operators.operators.values():\n            if 'sleef_symbol_prefix' in op.__class__.__dict__:\n                sleef_decls += 'f32 {}_scalar_f32({});\\n'. \\\n                               format(op.sleef_symbol_prefix,\n                                      ', '.join(['f32'] * len(op.params[1:])))\n                sleef_decls += 'f64 {}_scalar_f64({});\\n'. \\\n                               format(op.sleef_symbol_prefix,\n                                      ', '.join(['f64'] * len(op.params[1:])))\n        out.write(\n        '''#ifndef NSIMD_SCALAR_UTILITIES_H\n           #define NSIMD_SCALAR_UTILITIES_H\n\n           #if NSIMD_CXX > 0\n           #include <cmath>\n           #include <cstring>\n           #else\n           #include <math.h>\n           #include <string.h>\n           #endif\n\n           #ifdef NSIMD_NATIVE_FP16\n             #if defined(NSIMD_IS_GCC)\n               #pragma GCC diagnostic push\n               #pragma GCC diagnostic ignored \"-Wdouble-promotion\"\n             #elif defined(NSIMD_IS_CLANG)\n               #pragma clang diagnostic push\n               #pragma clang diagnostic ignored \"-Wdouble-promotion\"\n             #endif\n           #endif\n\n           {hbar}\n\n           #if NSIMD_CXX > 0\n           extern \"C\" {{\n           #endif\n\n           {sleef_decls}\n\n           #if NSIMD_CXX > 0\n           }} // extern \"C\"\n           #endif\n\n           {hbar}\n\n       
    {scalar_reinterpret_decls}\n\n           #if defined(NSIMD_CUDA) || defined(NSIMD_ROCM) || \\\n               defined(NSIMD_ONEAPI)\n\n           namespace nsimd {{\n\n           #if defined(NSIMD_CUDA) || defined(NSIMD_ROCM)\n\n           {gpu_reinterpret_decls}\n\n           #elif defined(NSIMD_ONEAPI)\n\n           {oneapi_reinterpret_decls}\n\n           #endif\n\n           }} // namespace nsimd\n\n           #endif\n           '''. \\\n           format(hbar=common.hbar, sleef_decls=sleef_decls,\n                  scalar_reinterpret_decls=scalar_reinterpret_decls,\n                  gpu_reinterpret_decls=gpu_reinterpret_decls,\n                  oneapi_reinterpret_decls=oneapi_reinterpret_decls))\n        for op_name, operator in operators.operators.items():\n            if not operator.has_scalar_impl:\n                continue\n            if operator.params == ['l'] * len(operator.params):\n                out.write('\\n\\n' + common.hbar + '\\n\\n')\n                out.write(\n                '''NSIMD_INLINE {c_sig} {{\n                  {scalar_impl}\n                }}\n\n                #if NSIMD_CXX > 0\n\n                namespace nsimd {{\n\n                NSIMD_INLINE {cxx_sig} {{\n                  return nsimd_scalar_{op_name}({c_args});\n                }}\n\n                {gpu_impl}\n\n                }} // namespace nsimd\n\n                #endif'''.format(\n                c_sig=operator.get_scalar_signature('cpu', '', '', 'c'),\n                cxx_sig=operator.get_scalar_signature('cpu', '', '', 'cxx'),\n                op_name=op_name,\n                c_args=', '.join(['a{}'.format(i - 1) \\\n                               for i in range(1, len(operator.params))]),\n                scalar_impl=scalar.get_impl(operator, tt, t),\n                gpu_impl=get_gpu_impl(\n                    operator.get_scalar_signature('gpu', t, tt, 'cxx'),\n                    cuda.get_impl(operator, tt, t),\n                    
rocm.get_impl(operator, tt, t),\n                    operator.get_scalar_signature('oneapi', t, tt, 'cxx'),\n                    oneapi.get_impl(operator, tt, t))))\n                continue\n            for t in operator.types:\n                tts = common.get_output_types(t, operator.output_to)\n                for tt in tts:\n                    out.write('\\n\\n' + common.hbar + '\\n\\n')\n                    out.write(\n                    '''NSIMD_INLINE {c_sig} {{\n                      {scalar_impl}\n                    }}\n\n                    #if NSIMD_CXX > 0\n\n                    namespace nsimd {{\n\n                    NSIMD_INLINE {cxx_sig} {{\n                      return nsimd_scalar_{op_name}_{suffix}({c_args});\n                    }}\n\n                    {gpu_impl}\n\n                    }} // namespace nsimd\n\n                    #endif'''.format(\n                    c_sig=operator.get_scalar_signature('cpu', t, tt, 'c'),\n                    cxx_sig=operator.get_scalar_signature('cpu', t, tt, 'cxx'),\n                    op_name=op_name,\n                    suffix=t if operator.closed else '{}_{}'.format(tt, t),\n                    c_args=', '.join(['a{}'.format(i - 1) \\\n                                   for i in range(1, len(operator.params))]),\n                    scalar_impl=scalar.get_impl(operator, tt, t),\n                    gpu_impl=get_gpu_impl(\n                        operator.get_scalar_signature('gpu', t, tt, 'cxx'),\n                        cuda.get_impl(operator, tt, t),\n                        rocm.get_impl(operator, tt, t),\n                        operator.get_scalar_signature('oneapi', t, tt, 'cxx'),\n                        oneapi.get_impl(operator, tt, t))))\n\n        out.write('''\n\n                  {hbar}\n\n                  #ifdef NSIMD_NATIVE_FP16\n                    #if defined(NSIMD_IS_GCC)\n                      #pragma GCC diagnostic pop\n                    #elif defined(NSIMD_IS_CLANG)\n        
              #pragma clang diagnostic pop\n                    #endif\n                  #endif\n\n                  #endif'''.format(hbar=common.hbar))\n    common.clang_format(opts, filename)\n\n"
  },
  {
    "path": "egg/gen_src.py",
    "content": "# Copyright (c) 2021 Agenium Scale\n#\n# Permission is hereby granted, free of charge, to any person obtaining a copy\n# of this software and associated documentation files (the \"Software\"), to deal\n# in the Software without restriction, including without limitation the rights\n# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell\n# copies of the Software, and to permit persons to whom the Software is\n# furnished to do so, subject to the following conditions:\n#\n# The above copyright notice and this permission notice shall be included in all\n# copies or substantial portions of the Software.\n#\n# THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\n# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\n# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\n# SOFTWARE.\n\nimport common\nimport operators\nimport os\nfrom datetime import date\nimport sys\n\n# -----------------------------------------------------------------------------\n# Implementations for output\n\ndef get_put_impl(simd_ext):\n    args = {\n      'i8' : ['\"%d\"', '(int)buf[i]'],\n      'u8' : ['\"%d\"', '(int)buf[i]'],\n      'i16': ['\"%d\"', '(int)buf[i]'],\n      'u16': ['\"%d\"', '(int)buf[i]'],\n      'i32': ['\"%d\"', 'buf[i]'],\n      'u32': ['\"%u\"', 'buf[i]'],\n      'i64': ['\"%lld\"', '(nsimd_longlong)buf[i]'],\n      'u64': ['\"%llu\"', '(nsimd_ulonglong)buf[i]'],\n      'f16': ['\"%e\"', '(double)nsimd_f16_to_f32(buf[i])'],\n      'f32': ['\"%e\"', '(double)buf[i]'],\n      'f64': ['\"%e\"', 'buf[i]'],\n    }\n    ret = '''#ifdef NSIMD_LONGLONG_IS_EXTENSION\n               #if defined(NSIMD_IS_GCC) || defined(NSIMD_IS_CLANG)\n              
   #pragma GCC diagnostic ignored \"-Wformat\"\n               #endif\n             #endif\n\n             #include <cstdio>\n\n             extern \"C\" {\n\n             '''\n    for typ in common.types:\n\n        fmt = \\\n        '''NSIMD_DLLEXPORT int NSIMD_VECTORCALL\n           nsimd_put_{simd_ext}_{l}{typ}(FILE *out, const char *fmt,\n                                         nsimd_{simd_ext}_v{l}{typ} v) {{\n             using namespace nsimd;\n             {typ} buf[NSIMD_MAX_LEN({typ})];\n\n             int n = len({typ}(), {simd_ext}());\n             store{l}u(buf, v, {typ}(), {simd_ext}());\n             if (fputs(\"{{ \", out) == EOF) {{\n               return -1;\n             }}\n             int ret = 2;\n             for (int i = 0; i < n; i++) {{\n               int code;\n               if (fmt != NULL) {{\n                 code = fprintf(out, fmt, {val});\n               }} else {{\n                 code = fprintf(out, {fmt}, {val});\n               }}\n               if (code < 0) {{\n                 return -1;\n               }}\n               ret += code;\n               if (i < n - 1) {{\n                 if (fputs(\", \", out) == EOF) {{\n                   return -1;\n                 }}\n                 ret += 2;\n               }}\n             }}\n             if (fputs(\" }}\", out) == EOF) {{\n               return -1;\n             }}\n             return ret + 2;\n           }}\n           {hbar}\n           '''\n\n        ret += fmt.format(typ=typ, l='', simd_ext=simd_ext, hbar=common.hbar,\n                          fmt=args[typ][0], val=args[typ][1])\n        ret += fmt.format(typ=typ, l='l', simd_ext=simd_ext, hbar=common.hbar,\n                          fmt=args[typ][0], val=args[typ][1])\n    ret += '} // extern \"C\"\\n'\n    return ret\n\n# -----------------------------------------------------------------------------\n# Generate base APIs\n\ndef write_cpp(opts, simd_ext, emulate_fp16):\n    filename = 
os.path.join(opts.src_dir, 'api_{}.cpp'.format(simd_ext))\n    if not common.can_create_filename(opts, filename):\n        return\n    with common.open_utf8(opts, filename) as out:\n        out.write('''#define NSIMD_INSIDE\n                     #include <nsimd/nsimd.h>\n                     #include <nsimd/cxx_adv_api.hpp>\n\n                     '''.format(year=date.today().year))\n        out.write(get_put_impl(simd_ext))\n    common.clang_format(opts, filename)\n\ndef doit(opts):\n    common.mkdir_p(opts.src_dir)\n    common.myprint(opts, 'Generating source for binary')\n    opts.platforms = common.get_platforms(opts)\n    for platform in opts.platforms:\n        mod = opts.platforms[platform]\n        for simd_ext in mod.get_simd_exts():\n            write_cpp(opts, simd_ext, mod.emulate_fp16(simd_ext))\n"
  },
  {
    "path": "egg/gen_tests.py",
    "content": "# Copyright (c) 2021 Agenium Scale\n#\n# Permission is hereby granted, free of charge, to any person obtaining a copy\n# of this software and associated documentation files (the \"Software\"), to deal\n# in the Software without restriction, including without limitation the rights\n# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell\n# copies of the Software, and to permit persons to whom the Software is\n# furnished to do so, subject to the following conditions:\n#\n# The above copyright notice and this permission notice shall be included in all\n# copies or substantial portions of the Software.\n#\n# THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\n# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n\n# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\n# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\n# SOFTWARE.\n\nimport os\nimport math\nimport sys\nimport common\nimport operators\nfrom datetime import date\n\n# -----------------------------------------------------------------------------\n# Helper functions\n\ndef should_i_do_the_test(operator, tt='', t=''):\n    if operator.name == 'cvt' and t in common.ftypes and tt in common.iutypes:\n        # When converting from float to int to float then we may not\n        # get the initial result because of roundings. 
As tests are usually\n        # done by going back and forth then both directions get tested in the\n        # end\n        return False\n    if operator.name == 'reinterpret' and t in common.iutypes and \\\n       tt in common.ftypes:\n        # When reinterpreting from int to float we may get NaN or infinities\n        # and no ones knows what this will give when going back to ints\n        # especially when float16 are emulated. Again as tests are done by\n        # going back and forth both directions get tested in the end.\n        return False\n    if operator.name in ['notb', 'andb', 'andnotb', 'xorb', 'orb'] and \\\n       t == 'f16':\n        # Bit operations on float16 are hard to check because they are\n        # emulated in most cases. Therefore going back and forth with\n        # reinterprets for doing bitwise operations make the bit in the last\n        # place to wrong. This is normal but makes testing real hard. So for\n        # now we do not test them on float16.\n        return False\n    if operator.name in ['len', 'set1', 'set1l', 'mask_for_loop_tail',\n                         'loadu', 'loada', 'storeu', 'storea', 'loadla',\n                         'loadlu', 'storela', 'storelu', 'if_else1']:\n        # These functions are used in almost every tests so we consider\n        # that they are extensively tested.\n        return False\n    if operator.name in ['store2a', 'store2u', 'store3a', 'store3u',\n                         'store4a', 'store4u', 'scatter', 'scatter_linear',\n                         'downcvt', 'to_logical']:\n        # These functions are tested along with their load counterparts.\n        # downcvt is tested along with upcvt and to_logical is tested with\n        # to_mask\n        return False\n    return True\n\n# -----------------------------------------------------------------------------\n# CBPRNG\n\ndef cbprng_impl(typ, domain_, for_cpu, only_int = False):\n    code = '((((unsigned int)(1 + i) * 69342380u + 414585u) ' 
\\\n           '^ ((unsigned int)(1 + j) * 89375027u + 952905u))' \\\n           '% 1000000u)'\n    def c_code(a0_, a1_):\n        if a1_ < a0_:\n            raise ValueError(\"a0 must be lesser than a1\")\n        if typ in common.utypes and a0_ < 0.0 and a1_ < 0.0:\n            raise ValueError(\"a0 and a1 must be positive\")\n        if typ in common.ftypes:\n            a0 = a0_\n            a1 = a1_\n        else:\n            a0 = 0 if typ in common.utypes and a0_ < 0 else math.ceil(a0_)\n            a1 = math.floor(a1_)\n        if a1 < a0:\n            raise ValueError(\"a0 and a1 must be positive after filtering\")\n\n        if typ in common.iutypes:\n            return 'return ({})({} + (f32)((i32){} % {}));'. \\\n                   format(typ, a0, code, a1 - a0 + 1)\n        elif typ == 'f16':\n            return \\\n            'return {}({}(((f32){} + (f32){} * (f32)({}) / 1000000.0f)));'. \\\n            format('(f16)' if not for_cpu else 'nsimd_f32_to_f16',\n                   '(f32)(i32)' if only_int else '', a0, a1 - a0, code)\n        elif typ in ['f32', 'f64']:\n            return \\\n            'return {}(({}){} + ({}){} * ({}){} / ({})1000000);'. 
 \\\n            format('({})({})'.format(typ, 'i' + typ[1:]) if only_int else '',\n                   typ, a0, typ, a1 - a0, typ, code, typ)\n\n    if typ not in common.utypes:\n        domain = domain_\n    else:\n        domain = []\n        for i in range(len(domain_) // 2):\n            if domain_[2 * i + 1] > 0:\n                domain.append(domain_[2 * i])\n                domain.append(domain_[2 * i + 1])\n    if len(domain) == 0:\n        raise ValueError('domain {} is empty after filtering'.format(domain_))\n\n    nb_intervals = len(domain) // 2\n    if nb_intervals == 1:\n        return '  {}'.format(c_code(domain[0], domain[1]))\n    ret = 'int piece = ((1 + i) * (1 + j)) % {};'.format(nb_intervals)\n    for i in range(nb_intervals - 1):\n        ret += '\\nif (piece == {}) {{\\n'.format(i)\n        ret += '  {}\\n'.format(c_code(domain[2 * i], domain[2 * i + 1]))\n        ret += '} else '\n    ret += '{\\n'\n    ret += '  {}\\n'.format(c_code(domain[-2], domain[-1]))\n    ret += '}'\n    return ret\n\ndef cbprng(typ, operator, target, gpu_params = None):\n    if target not in ['cpu', 'cuda', 'hip', 'oneapi']:\n        raise ValueError('Unsupported target, must be cpu, cuda, hip or '\n                         'oneapi')\n\n    arity = len(operator.params[1:])\n    ret = '{}{} random_impl(int i, int j) {{\\n'. \\\n          format('' if target in ['cpu', 'oneapi'] else '__device__ ', typ)\n    for_cpu = (target == 'cpu')\n\n    if arity == 1:\n        ret += cbprng_impl(typ, operator.domain[0], for_cpu,\n                           operator.tests_on_integers_only)\n    else:\n        for i in range(arity - 1):\n            ret += 'if (j == {}) {{\\n  {}\\n}} else '. \\\n                   format(i, cbprng_impl(typ, operator.domain[i], for_cpu,\n                                         operator.tests_on_integers_only))\n        ret += '{{\\n{}\\n}} '. 
\\\n               format(cbprng_impl(typ, operator.domain[-1],\n                                  for_cpu, operator.tests_on_integers_only))\n    ret += '\\n}\\n\\n'\n\n    if target == 'cpu':\n        ret += '''void random({} *dst, unsigned int n, int j) {{\n                    unsigned int i;\n                    for (i = 0; i < n; i++) {{\n                      dst[i] = random_impl((int)i, j);\n                    }}\n                  }}'''.format(typ)\n    elif target == 'cuda':\n        ret += '''__global__ void random_kernel({typ} *dst, int n, int j) {{\n                    int i = threadIdx.x + blockIdx.x * blockDim.x;\n                    if (i < n) {{\n                      dst[i] = random_impl((int)i, j);\n                    }}\n                  }}\n\n                  void random({typ} *dst, unsigned int n, int j) {{\n                    random_kernel<<<{gpu_params}>>>(dst, (int)n, j);\n                  }}'''.format(typ=typ, gpu_params=gpu_params)\n    elif target == 'hip':\n        ret += '''__global__ void random_kernel({typ} *dst, size_t n, int j) {{\n                    size_t i = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n                    if (i < n) {{\n                      dst[i] = random_impl((int)i, j);\n                    }}\n                  }}\n\n                  void random({typ} *dst, unsigned int n, int j) {{\n                    hipLaunchKernelGGL(random_kernel, {gpu_params}, 0, 0,\n                                       dst, n, j);\n                  }}'''.format(typ=typ, gpu_params=gpu_params)\n    elif target == 'oneapi':\n        ret += '''inline void random_kernel({typ} *dst, unsigned int n, int j,\n                                            sycl::nd_item<1> item) {{\n                    size_t i = item.get_global_id().get(0);\n                    if (i < n) {{\n                      dst[i] = random_impl((int)i, j);\n                    }}\n                  }}\n\n                  void random({typ} *dst, unsigned 
int n, int j) {{\n                    size_t nt = (size_t)nsimd_kernel_param({n}, {tpb});\n                    sycl::queue q_ = nsimd::oneapi::default_queue();\n                    q_.parallel_for(sycl::nd_range<1>(sycl::range<1>(nt),\n                                    sycl::range<1>({tpb})),\n                                    [=](sycl::nd_item<1> item){{\n                                      random_kernel(dst, n, j, item);\n                                    }}).wait_and_throw();\n                  }}'''.format(typ=typ, n=gpu_params[0], tpb=gpu_params[1])\n    return ret\n\n# -----------------------------------------------------------------------------\n\nposix_c_source = \\\n'''#if !defined(_POSIX_C_SOURCE)\n   #define _POSIX_C_SOURCE 200112L\n   #elif _POSIX_C_SOURCE < 200112L\n   #error \"_POSIX_C_SOURCE defined by third-party but must be >= 200112L\"\n   #endif'''\n\nmsvc_c4334_warning = \\\n'''#ifdef NSIMD_IS_MSVC\n     // MSVC wrongly emits warning C4333 on the following pieces of code:\n     //   (i64)(1 << (rand() % 4))\n     //   (u64)(1 << (rand() % 4))\n     // so we deactive it for now\n     #pragma warning( disable : 4334 )\n   #endif'''\n\n# -----------------------------------------------------------------------------\n# Get filename for test\n\ndef get_filename(opts, op, typ, lang, custom_name=''):\n    tests_dir = os.path.join(opts.tests_dir, lang)\n    common.mkdir_p(tests_dir)\n    ext = { 'c_base': 'prec11.c', 'c_adv': 'c' }\n    if not custom_name:\n        filename = os.path.join(tests_dir, '{}.{}.{}'.format(op.name, typ,\n                     ext[lang] if lang in ['c_base', 'c_adv'] else 'cpp'))\n    else:\n        filename = os.path.join(tests_dir, '{}_{}.{}.{}'.format(op.name,\n                     custom_name, typ,\n                     ext[lang] if lang in ['c_base', 'c_adv'] else 'cpp'))\n    if common.can_create_filename(opts, filename):\n        return filename\n    else:\n        return None\n\n# 
-----------------------------------------------------------------------------\n# Get standard includes\n\ndef get_includes(lang):\n    ret = '#include <nsimd/nsimd.h>\\n'\n    if lang == 'cxx_adv':\n        ret += '#include <nsimd/cxx_adv_api.hpp>\\n'\n    if lang == 'c_adv':\n        ret += '#include <nsimd/c_adv_api.h>\\n'\n    if lang in ['c_base', 'c_adv']:\n        ret += '''#include <stdlib.h>\n                  #include <stdio.h>\n                  #include <errno.h>\n                  #include <stdio.h>\n                  #include <assert.h>\n                  #include <string.h>'''\n    else:\n        ret += '''#include <cstdlib>\n                  #include <cstdio>\n                  #include <cerrno>\n                  #include <iostream>\n                  #include <cassert>\n                  #include <cstring>'''\n    return ret\n\n# -----------------------------------------------------------------------------\n# Function to compute number of common bits between two floating point\n# numbers\n\ndistance_int = '''\nint distance({typ} a, {typ} b) {{\n  {typ} d = (a > b ? a - b : b - a);\n  return (int)((u64)d > (u64)INT_MAX ? 
(u64)INT_MAX : (u64)d);\n}}\n'''\n\ndistance_float = '''\nint distance({typ} a, {typ} b) {{\n  if (nsimd_isnan_{typ}(a) && nsimd_isnan_{typ}(b)) {{\n    return 0;\n  }}\n\n  if (nsimd_isnan_{typ}(a) || nsimd_isnan_{typ}(b)) {{\n    return -1;\n  }}\n\n  if (nsimd_isinf_{typ}(a) && nsimd_isinf_{typ}(b)) {{\n    return 0;\n  }}\n\n  if (nsimd_isinf_{typ}(a) || nsimd_isinf_{typ}(b)) {{\n    return -1;\n  }}\n\n  return nsimd_ufp_{typ}(a, b);\n}}\n\n/* ------------------------------------------------------------------------- */\n'''\n\ndistance = {\n  'i8': distance_int.format(typ='i8'),\n  'u8': distance_int.format(typ='u8'),\n  'i16': distance_int.format(typ='i16'),\n  'u16': distance_int.format(typ='u16'),\n  'i32': distance_int.format(typ='i32'),\n  'u32': distance_int.format(typ='u32'),\n  'i64': distance_int.format(typ='i64'),\n  'u64': distance_int.format(typ='u64'),\n  'f16': distance_float.format(typ='f16'),\n  'f32': distance_float.format(typ='f32'),\n  'f64': distance_float.format(typ='f64')\n}\n\n# -----------------------------------------------------------------------------\n# Template for a lot of tests\n\ntemplate = \\\n'''{includes}\n\n#define SIZE (2048 / {sizeof})\n\n#define STATUS \"test of {op_name} over {typ}\"\n\n#define CHECK(a) {{ \\\\\n  errno = 0; \\\\\n  if (!(a)) {{ \\\\\n    fprintf(stderr, \"ERROR: \" #a \":%d: %s\\\\n\", \\\\\n            __LINE__, strerror(errno)); \\\\\n    fflush(stderr); \\\\\n    exit(EXIT_FAILURE); \\\\\n  }} \\\\\n}}\n\n/* ------------------------------------------------------------------------- */\n\n{extra_code}\n\nint comp_function({typ} ref_out, {typ} nsimd_out)\n{{\n   {comp};\n}}\n\nint main(void) {{\n  int vi, i, step;\n  {typ} *vout_ref, *vout_nsimd;\n  {vin_defi}\n\n  CHECK(vout_ref = ({typ}*)nsimd_aligned_alloc(SIZE * {sizeof}));\n  CHECK(vout_nsimd = ({typ}*)nsimd_aligned_alloc(SIZE * {sizeof}));\n\n  step = vlen({typ});\n\n  fprintf(stdout, STATUS \"...\\\\n\");\n  fflush(stdout);\n\n  /* Fill input 
vector(s) with random values */\n  {vin_rand}\n\n  /* We ensure that inputs are normal numbers */\n  for (i = 0; i < SIZE; i++) {{\n    {denormalize_inputs}\n  }}\n\n  /* Fill vout_ref output vector with reference values */\n  for (i = 0; i < SIZE; i += {cpu_step}) {{\n    /* This is a call directly to the cpu API of nsimd\n       to ensure that we call the scalar version of the\n       function */\n    {vout_ref_comp}\n  }}\n\n  /* Fill vout_nsimd output vector with computed values */\n  for (i = 0; i < SIZE; i += step) {{\n    {vout_nsimd_comp}\n  }}\n\n  {dnz_flush_to_zero}\n\n  /* Compare results */\n  for (vi = 0; vi < SIZE; vi += step) {{\n    for (i = vi; i < vi + step; i++) {{\n      if (comp_function(vout_ref[i], vout_nsimd[i])) {{\n        fprintf(stdout, STATUS \"... FAIL\\\\n\");\n        fflush(stdout);\n        return -1;\n      }}\n    }}\n  }}\n\n  fprintf(stdout, STATUS \"... OK\\\\n\");\n  fflush(stdout);\n  return 0;\n}}'''\n\n# -----------------------------------------------------------------------------\n# Common to most of the tests\n\ndef get_content(op, typ, lang):\n    cast = 'f32' if typ in ['f16', 'f32'] else 'f64'\n    zero = 'nsimd_f32_to_f16(0.0f)' if typ == 'f16' else '({})0'.format(typ)\n\n    # By default we use emulation functions (\"cpu\" architecture) for testing\n    # in which case increment is given by nsimd_cpu_len()\n    cpu_step = 'nsimd_len_cpu_{}()'.format(typ)\n\n    nargs = range(1, len(op.params))\n\n    if typ in common.ftypes:\n        code = ['''if (!nsimd_isnormal_{typ}(vin{i}[i])) {{\n                     vin{i}[i] = {zero};\n                   }}'''.format(typ=typ, i=i, zero=zero) for i in nargs]\n        denormalize_inputs = '\\n'.join(code)\n    else:\n        denormalize_inputs = ''\n\n    # Depending on function parameters, generate specific input, ...\n    if all(e == 'v' for e in op.params) or all(e == 'l' for e in op.params):\n        logical = 'l' if op.params[0] == 'l' else ''\n\n        # Make 
vin_defi\n        code = ['{} *vin{};'.format(typ, i) for i in nargs]\n        code += ['CHECK(vin{} = ({}*)nsimd_aligned_alloc(SIZE * {}));'.\n                 format(i, typ, common.sizeof(typ)) for i in nargs]\n        vin_defi = '\\n'.join(code)\n        vin_rand = '\\n'.join(['random(vin{}, SIZE, {});'.format(i, i - 1) \\\n                              for i in nargs])\n\n        # Make vout_ref_comp\n        args = ', '.join(['va{}'.format(i) for i in nargs])\n        code = ['nsimd_cpu_v{}{} {}, vc;'.format(logical, typ, args)]\n        code += ['va{} = nsimd_load{}u_cpu_{}(&vin{}[i]);'.\n                 format(i, logical, typ, i) for i in nargs]\n        code += ['vc = nsimd_{}_cpu_{}({});'.format(op.name, typ, args)]\n        code += ['nsimd_store{}u_cpu_{}(&vout_ref[i], vc);'. \\\n                 format(logical, typ)]\n        vout_ref_comp = '\\n'.join(code)\n\n        # Make vout_nsimd_comp\n        args = ', '.join(['va{}'.format(i) for i in nargs])\n        if lang == 'c_base':\n            code = ['vec{}({}) {}, vc;'.format(logical, typ, args)]\n            code += ['va{} = vload{}u(&vin{}[i], {});'.\n                     format(i, logical, i, typ) for i in nargs]\n            code += ['vc = v{}({}, {});'.format(op.name, args, typ)]\n            code += ['vstore{}u(&vout_nsimd[i], vc, {});'.format(logical, typ)]\n            vout_nsimd_comp = '\\n'.join(code)\n        if lang == 'c_adv':\n            code = ['nsimd_pack{}_{} {}, vc;'.format(logical, typ, args)]\n            code += ['va{} = nsimd_load{}u(nsimd_pack{}_{}, &vin{}[i]);'.\n                     format(i, logical, logical, typ, i) for i in nargs]\n            code += ['vc = nsimd_{}({});'.format(op.name, args)]\n            code += ['nsimd_store{}u(&vout_nsimd[i], vc);'. 
\\\n                     format(logical, typ)]\n            vout_nsimd_comp = '\\n'.join(code)\n        if lang == 'cxx_base':\n            code = ['vec{}({}) {}, vc;'.format(logical, typ, args)]\n            code += ['va{} = nsimd::load{}u(&vin{}[i], {}());'.\n                     format(i, logical, i, typ) for i in nargs]\n            code += ['vc = nsimd::{}({}, {}());'.format(op.name, args, typ)]\n            code += ['nsimd::store{}u(&vout_nsimd[i], vc, {}());'. \\\n                     format(logical, typ)]\n            vout_nsimd_comp = '\\n'.join(code)\n        if lang == 'cxx_adv':\n            code = ['nsimd::pack{}<{}> {}, vc;'.format(logical, typ, args)]\n            code += ['''va{i} = nsimd::load{logical}u<\n                                  nsimd::pack{logical}<{typ}> >(\n                                      &vin{i}[i]);'''.\n                     format(i=i, logical=logical, typ=typ) for i in nargs]\n            if op.cxx_operator:\n                if len(op.params[1:]) == 1:\n                    code += ['vc = {}va1;'.\n                             format(op.cxx_operator)]\n                if len(op.params[1:]) == 2:\n                    code += ['vc = va1 {} va2;'.\n                             format(op.cxx_operator)]\n            else:\n                code += ['vc = nsimd::{}({});'.format(op.name, args)]\n            code += ['nsimd::store{}u(&vout_nsimd[i], vc);'. \\\n                     format(logical, typ)]\n            vout_nsimd_comp = '\\n'.join(code)\n    elif op.params == ['l', 'v', 'v']:\n        vin_defi = \\\n        '''{typ} *vin1, *vin2;\n           CHECK(vin1 = ({typ}*)nsimd_aligned_alloc(SIZE * {sizeof}));\n           CHECK(vin2 = ({typ}*)nsimd_aligned_alloc(SIZE * {sizeof}));'''. 
\\\n           format(typ=typ, sizeof=common.sizeof(typ))\n        code = ['random(vin{}, SIZE, {});'.format(i, i - 1) for i in nargs]\n        vin_rand = '\\n'.join(code)\n\n        vout_ref_comp = '''nsimd_cpu_v{typ} va1, va2;\n                           nsimd_cpu_vl{typ} vc;\n                           va1 = nsimd_loadu_cpu_{typ}(&vin1[i]);\n                           va2 = nsimd_loadu_cpu_{typ}(&vin2[i]);\n                           vc = nsimd_{op_name}_cpu_{typ}(va1, va2);\n                           nsimd_storelu_cpu_{typ}(&vout_ref[i], vc);'''. \\\n                           format(typ=typ, op_name=op.name)\n\n        if lang == 'c_base':\n            vout_nsimd_comp = '''vec({typ}) va1, va2;\n                                 vecl({typ}) vc;\n                                 va1 = vloadu(&vin1[i], {typ});\n                                 va2 = vloadu(&vin2[i], {typ});\n                                 vc = v{op_name}(va1, va2, {typ});\n                                 vstorelu(&vout_nsimd[i], vc, {typ});'''. \\\n                                 format(typ=typ, op_name=op.name)\n        if lang == 'c_adv':\n            vout_nsimd_comp = '''nsimd_pack_{typ} va1, va2;\n                                 nsimd_packl_{typ} vc;\n                                 va1 = nsimd_loadu(nsimd_pack_{typ}, &vin1[i]);\n                                 va2 = nsimd_loadu(nsimd_pack_{typ}, &vin2[i]);\n                                 vc = nsimd_{op_name}(va1, va2);\n                                 nsimd_storelu(&vout_nsimd[i], vc);'''. \\\n                                 format(typ=typ, op_name=op.name)\n        if lang == 'cxx_base':\n            vout_nsimd_comp = \\\n            '''vec({typ}) va1, va2;\n               vecl({typ}) vc;\n               va1 = nsimd::loadu(&vin1[i], {typ}());\n               va2 = nsimd::loadu(&vin2[i], {typ}());\n               vc = nsimd::{op_name}(va1, va2, {typ}());\n               nsimd::storelu(&vout_nsimd[i], vc, {typ}());'''. 
\\\n               format(typ=typ, op_name=op.name)\n        if lang == 'cxx_adv':\n            if op.cxx_operator:\n                do_computation = 'vc = va1 {} va2;'. \\\n                                 format(op.cxx_operator)\n            else:\n                do_computation = 'vc = nsimd::{}(va1, va2, {}());'. \\\n                                 format(op.name, typ)\n            vout_nsimd_comp = \\\n            '''nsimd::pack<{typ}> va1, va2;\n               nsimd::packl<{typ}> vc;\n               va1 = nsimd::loadu<nsimd::pack<{typ}> >(&vin1[i]);\n               va2 = nsimd::loadu<nsimd::pack<{typ}> >(&vin2[i]);\n               {do_computation}\n               nsimd::storelu(&vout_nsimd[i], vc);'''. \\\n               format(typ=typ, op_name=op.name,\n                      do_computation=do_computation)\n\n    elif op.params == ['v', 'v', 'p']:\n        vin_defi = \\\n        '''{typ} *vin1;\n           CHECK(vin1 = ({typ}*)nsimd_aligned_alloc(SIZE * {sizeof}));'''. \\\n           format(typ=typ, sizeof=common.sizeof(typ))\n        vin_rand = 'random(vin1, SIZE, 0);'\n        vout_ref_comp = \\\n        '''nsimd_cpu_v{typ} va1, vc;\n           va1 = nsimd_loadu_cpu_{typ}(&vin1[i]);\n           vc = nsimd_{op_name}_cpu_{typ}(va1, (i / step) % {typnbytes});\n           nsimd_storeu_cpu_{typ}(&vout_ref[i], vc);'''. \\\n           format(typ=typ, op_name=op.name, typnbytes=typ[1:])\n        if lang == 'c_base':\n            vout_nsimd_comp = \\\n            '''vec({typ}) va1, vc;\n               va1 = vloadu(&vin1[i], {typ});\n               vc = v{op_name}(va1, (i / step) % {typnbytes}, {typ});\n               vstoreu(&vout_nsimd[i], vc, {typ});'''. 
\\\n               format(typ=typ, op_name=op.name, typnbytes=typ[1:])\n        if lang == 'c_adv':\n            vout_nsimd_comp = \\\n            '''nsimd_pack_{typ} va1, vc;\n               va1 = nsimd_loadu(nsimd_pack_{typ}, &vin1[i]);\n               vc = nsimd_{op_name}(va1, (i / step) % {typnbytes});\n               nsimd_storeu(&vout_nsimd[i], vc);'''. \\\n               format(typ=typ, op_name=op.name, typnbytes=typ[1:])\n        if lang == 'cxx_base':\n            vout_nsimd_comp = \\\n            '''vec({typ}) va1, vc;\n               va1 = nsimd::loadu(&vin1[i], {typ}());\n               vc = nsimd::{op_name}(va1, (i / step) % {typnbytes}, {typ}());\n               nsimd::storeu(&vout_nsimd[i], vc, {typ}());'''. \\\n                       format(typ=typ, op_name=op.name, typnbytes=typ[1:])\n        if lang == 'cxx_adv':\n            if op.cxx_operator:\n                do_computation = 'vc = va1 {} ((i / step) % {typnbytes});'. \\\n                        format(op.cxx_operator, typnbytes=typ[1:])\n            else:\n                do_computation = \\\n                'vc = nsimd::{}(va1, (i / step) % {typnbytes});'. \\\n                format(op.name, typnbytes=typ[1:])\n            vout_nsimd_comp = \\\n            '''nsimd::pack<{typ}> va1, vc;\n               va1 = nsimd::loadu<nsimd::pack<{typ}> >(&vin1[i]);\n               {do_computation}\n               nsimd::storeu(&vout_nsimd[i], vc);'''. 
\\\n               format(typ=typ, do_computation=do_computation)\n    else:\n        raise ValueError('No test available for operator \"{}\" on type \"{}\"'.\n                         format(op.name, typ))\n    return { 'vin_defi': vin_defi, 'vin_rand': vin_rand, 'cpu_step': cpu_step,\n             'vout_ref_comp': vout_ref_comp,\n             'vout_nsimd_comp': vout_nsimd_comp,\n             'denormalize_inputs': denormalize_inputs }\n\n# -----------------------------------------------------------------------------\n# Generate test in C, C++ (base API) and C++ (advanced API) for almost all\n# tests\n\ndef gen_test(opts, op, typ, lang):\n    filename = get_filename(opts, op, typ, lang)\n    if filename == None:\n        return\n\n    content = get_content(op, typ, lang)\n\n    extra_code = cbprng(typ, op, 'cpu')\n\n    if op.name in ['notb', 'andb', 'orb', 'xorb', 'andnotb']:\n        comp = 'return nsimd_scalar_reinterpret_{uT}_{typ}(ref_out) != ' \\\n                      'nsimd_scalar_reinterpret_{uT}_{typ}(nsimd_out)'. \\\n               format(typ=typ, uT=common.bitfield_type[typ])\n    elif op.name in ['max', 'min'] and typ in common.ftypes:\n        comp = 'return nsimd_scalar_ne_{}(ref_out, nsimd_out);'.format(typ)\n    else:\n        if typ in common.ftypes:\n            comp = 'return distance(ref_out, nsimd_out) < {}'. \\\n                   format(op.ufp[typ])\n            extra_code += distance[typ]\n        else:\n            comp = 'return nsimd_scalar_ne_{}(ref_out, nsimd_out);'. \\\n                   format(typ)\n\n    includes = get_includes(lang)\n\n    if typ in common.ftypes:\n        dnz_flush_to_zero = \\\n        '''/* We flush subnormal numbers to zero because support for it      */\n           /* can be disabled, some intrinsics do not support them,          */\n           /* execution of 32-bits code on 64-bits system may have different */\n           /* ways of handling them. 
*/\n           for (i = 0; i < SIZE; i++) {{\n             if (!nsimd_isnormal_{typ}(vout_ref[i])) {{\n               vout_ref[i] = {zero};\n             }}\n             if (!nsimd_isnormal_{typ}(vout_nsimd[i])) {{\n               vout_nsimd[i] = {zero};\n             }}\n           }}'''.format(typ=typ, zero='({})0'.format(typ) if typ != 'f16' \\\n                        else 'nsimd_f32_to_f16(0.0f)')\n    else:\n        dnz_flush_to_zero = ''\n\n    with common.open_utf8(opts, filename) as out:\n        out.write(template.format(\n            includes=includes, sizeof=common.sizeof(typ), typ=typ,\n            op_name=op.name, year=date.today().year, comp=comp,\n            dnz_flush_to_zero=dnz_flush_to_zero,\n            extra_code=extra_code, **content))\n    common.clang_format(opts, filename)\n\n# -----------------------------------------------------------------------------\n# Tests for addv\n\ndef gen_addv(opts, op, typ, lang):\n    filename = get_filename(opts, op, typ, lang)\n    if filename == None:\n        return\n\n    if typ == 'f16':\n        rand = 'nsimd_f32_to_f16((f32)(rand() % 3) - 1.0f)'\n        zero = 'nsimd_f32_to_f16(0.0f)'\n        comp = 'nsimd_f16_to_f32(vout[i]) != nsimd_f16_to_f32(vref[i])'\n    else:\n        rand = '({})((int)(rand() % 3) - 1)'.format(typ)\n        zero = '({})0'.format(typ)\n        comp = 'vout[i] != vref[i]'\n\n    if lang == 'c_base':\n        nsimd = 'vaddv(vloada(vin + (i * step), {typ}), {typ})'. \\\n                format(typ=typ)\n    elif lang == 'c_adv':\n        nsimd = 'nsimd_addv(nsimd_loada(nsimd_pack_{}, vin + (i * step)))'. \\\n                format(typ)\n    elif lang == 'cxx_base':\n        nsimd = 'nsimd::addv(nsimd::loada(vin + (i * step), {}()), {}())'. 
\\\n                format(typ, typ)\n    elif lang == 'cxx_adv':\n        nsimd = 'nsimd::addv(nsimd::loada<nsimd::pack<{}> >' \\\n                             '(vin + (i * step)))'.format(typ)\n\n    with common.open_utf8(opts, filename) as out:\n        out.write(\n        '''{posix_c_source}\n           {includes}\n\n           #define CHECK(a) {{ \\\\\n             errno = 0; \\\\\n             if (!(a)) {{ \\\\\n               fprintf(stderr, \"ERROR: \" #a \":%d: %s\\\\n\", \\\\\n                       __LINE__, strerror(errno)); \\\\\n               fflush(stderr); \\\\\n               exit(EXIT_FAILURE); \\\\\n             }} \\\\\n           }}\n\n           #define STATUS \"test of addv over {typ}\"\n\n           int main() {{\n             int step = vlen({typ});\n             int size = 2048;\n             int i;\n             {typ} *vin, *vref, *vout;\n\n             CHECK(vin = ({typ} *)nsimd_aligned_alloc(size * {sizeof} * step));\n             CHECK(vref = ({typ} *)nsimd_aligned_alloc(size * {sizeof}));\n             CHECK(vout = ({typ} *)nsimd_aligned_alloc(size * {sizeof}));\n\n             fprintf(stdout, STATUS \"...\\\\n\");\n             fflush(stdout);\n\n             for (i = 0; i < step * size; i++) {{\n               vin[i] = {rand};\n             }}\n\n             for (i = 0; i < size; i++) {{\n               int j;\n               {typ} acc = {zero};\n               for (j = step * i; j < step * i + step; j++) {{\n                   acc = nsimd_scalar_add_{typ}(acc, vin[j]);\n               }}\n               vref[i] = acc;\n             }}\n\n             for (i = 0; i < size; i++) {{\n               vout[i] = {nsimd};\n             }}\n\n             for (i = 0; i < size; i++) {{\n               if ({comp}) {{\n                 fprintf(stdout, STATUS \"... FAIL\\\\n\");\n                 fflush(stdout);\n                 return -1;\n               }}\n             }}\n\n             fprintf(stdout, STATUS \"... 
OK\\\\n\");\n             fflush(stdout);\n             return 0;\n           }}\n           '''.format(typ=typ, sizeof=common.sizeof(typ), zero=zero, rand=rand,\n                      comp=comp, nsimd=nsimd, posix_c_source=posix_c_source,\n                      includes=get_includes(lang)))\n    common.clang_format(opts, filename)\n\n# -----------------------------------------------------------------------------\n# General tests helpers for adds/subs\n\ndef aligned_alloc_error():\n      return '''\n      #define CHECK(a) \\\\\n      {{ \\\\\n        errno = 0; \\\\\n        if (!(a)) \\\\\n        {{ \\\\\n          fprintf(stderr, \\\"ERROR: \\\" #a \\\":%d: %s\\\\n\\\", \\\\\n                __LINE__, strerror(errno)); \\\\\n          fflush(stderr); \\\\\n          exit(EXIT_FAILURE); \\\\\n        }} \\\\\n      }}\n      '''\n\ndef equal(typ):\n      return '''\n      int equal({typ} expected_result, {typ} computed_result)\n      {{\n        return expected_result == computed_result;\n      }}\n      '''.format(typ=typ)\n\ndef adds_subs_check_case():\n      return '''\n      #define CHECK_CASE(test_output, which_test) \\\\\n      {{ \\\\\n        if(0 == (test_output)) \\\\\n        {{ \\\\\n          fprintf(stdout, STATUS \\\" ... 
\\\" which_test \\\" check FAIL\\\\n\\\"); \\\\\n          fflush(stdout); \\\\\n          return -1; \\\\\n        }} \\\\\n      }}\n      '''\n\ndef random_sign_flip():\n      return '''\n      int random_sign_flip(void)\n      {{\n          return 2 * (rand() % 2) - 1;\n      }}\n      '''\n\ndef zero_out_arrays(typ):\n      return '''\n      void zero_out_arrays({typ} vin1[], {typ} vin2[], {typ} vout_expected[],\n                           {typ} vout_computed[])\n      {{\n        int ii = 0;\n        for(ii = 0; ii < SIZE; ++ii)\n        {{\n           vin1[ii] = ({typ})0;\n           vin2[ii] = ({typ})0;\n           vout_expected[ii] = ({typ})0;\n           vout_computed[ii] = ({typ})0;\n        }}\n      }}\n      '''.format(typ=typ)\n\ndef compute_op_given_language(typ, op, language):\n      if 'c_base' == language:\n            return \\\n            '''vec({typ}) va1, va2, vc;\n               va1 = vloadu(&vin1[outer], {typ});\n               va2 = vloadu(&vin2[outer], {typ});\n               vc = v{op}(va1, va2, {typ});\n               vstoreu(&vout_computed[outer], vc, {typ});'''. \\\n               format(typ=typ, op=op)\n      elif 'c_adv' == language:\n            return \\\n            '''nsimd_pack_{typ} va1, va2, vc;\n               va1 = nsimd_loadu(nsimd_pack_{typ}, &vin1[outer]);\n               va2 = nsimd_loadu(nsimd_pack_{typ}, &vin2[outer]);\n               vc = nsimd_{op}(va1, va2);\n               nsimd_storeu(&vout_computed[outer], vc);'''. \\\n               format(typ=typ, op=op)\n      elif 'cxx_base' == language:\n            return \\\n            '''vec({typ}) va1, va2, vc;\n               va1 = nsimd::loadu(&vin1[outer], {typ}());\n               va2 = nsimd::loadu(&vin2[outer], {typ}());\n               vc = nsimd::{op}(va1, va2, {typ}());\n               nsimd::storeu(&vout_computed[outer], vc, {typ}());'''. 
\\\n               format(typ=typ, op=op)\n      else:\n            return \\\n            '''nsimd::pack<{typ}> va1, va2, vc;\n               va1 = nsimd::loadu<nsimd::pack<{typ}> >(&vin1[outer]);\n               va2 = nsimd::loadu<nsimd::pack<{typ}> >(&vin2[outer]);\n               vc = nsimd::{op}(va1, va2);\n               nsimd::storeu(&vout_computed[outer], vc);'''. \\\n               format(typ=typ, op=op)\n\ndef compare_expected_vs_computed(typ, op, language):\n      values_computation = compute_op_given_language(typ, op, language)\n      return '''\n      int compare_expected_vs_computed(const {typ}* vin1, const {typ}* vin2,\n                                       const {typ}* vout_expected,\n                                       {typ} vout_computed[])\n      {{\n          const int step = vlen({typ});\n          int outer = 0;\n          int inner = 0;\n\n          for (outer = 0; outer < SIZE; outer += step) {{\n          /* Fill vout_computed with computed values */\n          {values_computation}\n          /* Compare results */\n          for (inner = outer; inner < outer + step; ++inner) {{\n              if (! 
equal(vout_expected[inner], vout_computed[inner])) {{\n                return 0;\n              }}\n            }}\n          }}\n\n          return 1;\n      }}\n      '''.format(typ=typ, values_computation=values_computation)\n\ndef test_signed_neither_overflow_nor_underflow(typ, min_, max_, operator,\n                                               check):\n      return '''\n      int test_neither_overflow_nor_underflow({typ} vin1[], {typ} vin2[],\n                                              {typ} vout_expected[],\n                                              {typ} vout_computed[])\n      {{\n        int ii = 0;\n        while(ii < SIZE)\n        {{\n          {typ} a = ({typ})((random_sign_flip() * rand()) % {max_} % {min_});\n          {typ} b = ({typ})((random_sign_flip() * rand()) % {max_} % {min_});\n          if({check}(a, b))\n          {{\n            vin1[ii] = a;\n            vin2[ii] = b;\n            vout_expected[ii] = ({typ})(a {operator} b);\n            ++ ii;\n          }}\n        }}\n        assert(ii == SIZE);\n        /*\n        Test:\n        if (neither overflow nor underflow) {{\n          vout_expected[ii] == a {operator} b;\n        }}\n        */\n        return compare_expected_vs_computed(vin1, vin2, vout_expected,\n                                            vout_computed);\n      }}\n      '''.format(typ=typ, min_=min_, max_=max_, operator=operator, check=check)\n\ndef test_signed_all_cases(typ, min_, max_, oper, oper_is_overflow,\n                          oper_is_underflow):\n      return '''\n      int test_all_cases({typ} vin1[], {typ} vin2[], {typ} vout_expected[],\n                         {typ} vout_computed[])\n      {{\n        int ii = 0;\n        for(ii = 0; ii < SIZE; ++ii)\n        {{\n          vin1[ii] = ({typ})((random_sign_flip() * rand()) % {max_} % {min_});\n          vin2[ii] = ({typ})((random_sign_flip() * rand()) % {max_} % {min_});\n          if({oper_is_overflow}(vin1[ii], vin2[ii]))\n          {{\n       
     vout_expected[ii] = {max_};\n          }}\n          else if({oper_is_underflow}(vin1[ii], vin2[ii]))\n          {{\n            vout_expected[ii] = {min_};\n          }}\n          else\n          {{\n            vout_expected[ii] = ({typ})(vin1[ii] {oper} vin2[ii]);\n          }}\n        }}\n        /* Test all cases */\n        return compare_expected_vs_computed(vin1, vin2, vout_expected,\n                                            vout_computed);\n      }}\n      ''' .format(typ=typ, min_=min_, max_=max_,\n                  oper=oper, oper_is_overflow=oper_is_overflow,\n                  oper_is_underflow=oper_is_underflow)\n\n# -----------------------------------------------------------------------------\n# Tests helpers for adds - is overflow/underflow/neither overflow nor underflow\n\ndef adds_is_overflow(typ, max_):\n      return '''\n      int adds_is_overflow(const {typ} a, const {typ} b)\n      {{\n        return (a > 0) && (b > {max_} - a);\n      }}\n      '''.format(typ=typ, max_=max_)\n\ndef adds_signed_is_underflow(typ, min_):\n      return '''\n      int adds_signed_is_underflow(const {typ} a, const {typ} b)\n      {{\n        return (a < 0) && (b < {min_} - a);\n      }}\n      '''.format(typ=typ, min_=min_)\n\ndef adds_signed_is_neither_overflow_nor_underflow(typ):\n      return '''\n      int adds_signed_is_neither_overflow_nor_underflow(const {typ} a,\n                                                        const {typ} b)\n      {{\n        return ! adds_is_overflow(a, b) && ! 
adds_signed_is_underflow(a, b);\n      }}\n      '''.format(typ=typ)\n\n# -----------------------------------------------------------------------------\n# Tests helpers for adds with integer types\n\n# test integer overflow\ndef test_adds_overflow(typ, max_):\n      rand_ = '({typ})rand()'.format(typ=typ) \\\n              if typ in common.utypes else 'rand()'\n      return '''\n      int test_overflow({typ} vin1[], {typ} vin2[], {typ} vout_expected[],\n                        {typ} vout_computed[])\n      {{\n        /* if ((vin1[ii] > 0) && (vin2[ii] > {max_} - vin1[ii])) {{\n             overflow\n           }} */\n        int ii = 0;\n\n        /* vin1[ii] > 0 */\n        for(ii = 0; ii < SIZE; ++ii)\n        {{\n          {typ} rand_val = ({typ})({rand_} % {max_});\n          vin1[ii] = (rand_val == 0 ? 1 : rand_val);\n        }}\n\n        /*\n        vin2[ii] > {max_} - vin1[ii]\n        vin2[ii] = {max_} - vin1[ii] + rand_val\n        s.t.: 0 < rand_val <= vin1[ii]\n        */\n        for(ii = 0; ii < SIZE; ++ii)\n        {{\n            {typ} rand_val = ({typ})({rand_} % (vin1[ii] + 1));\n            rand_val = (rand_val == 0 ? 
1 : rand_val);\n            vin2[ii] = ({typ})({max_} - vin1[ii] + rand_val);\n            vout_expected[ii] = {max_};\n        }}\n\n        /*\n        Test:\n        if ((vin1[ii] > 0) && (vin2[ii] > {max_} - vin1[ii])) {{\n          vout_expected[ii] == {max_};\n        }}\n        */\n        return compare_expected_vs_computed(vin1, vin2, vout_expected,\n                                            vout_computed);\n     }}\n      '''.format(typ=typ, max_=max_, rand_=rand_)\n\n# -----------------------------------------------------------------------------\n# Tests helpers for adds with signed integer types\n\n# test signed underflow\ndef test_adds_signed_underflow(typ, min_):\n      return '''\n      int test_underflow({typ} vin1[], {typ} vin2[], {typ} vout_expected[],\n                         {typ} vout_computed[])\n      {{\n        /* if ((vin1[ii] < 0) && (vin2[ii] < {min_} - vin1[ii])) {{\n             underflow\n           }} */\n        int ii = 0;\n\n        /* vin1[ii] < 0 */\n        for(ii = 0; ii < SIZE; ++ii)\n        {{\n            {typ} rand_val = ({typ})((- rand()) % {min_});\n            vin1[ii] = (rand_val == 0 ? - 1 : rand_val);\n        }}\n\n        /*\n        vin1[ii] < 0\n        vin2[ii] < {min_} - vin1[ii]\n        vin2[ii] = {min_} - vin1[ii] - rand_val\n        s.t.: 0 < rand_val < - vin1[ii]\n        */\n\n        for(ii = 0; ii < SIZE; ++ii)\n        {{\n            {typ} rand_val = ({typ})((rand()) % (- vin1[ii]));\n            rand_val = (rand_val == 0 ? 
1 : rand_val);\n            vin2[ii] = ({typ})({min_} - vin1[ii] - rand_val);\n            vout_expected[ii] = {min_};\n        }}\n\n        /*\n        Test:\n        if ((vin1[ii] < 0) && (vin2[ii] < {min_} - vin1[ii])) {{\n          vout_expected[ii] == {min_};\n        }}\n        */\n        return compare_expected_vs_computed(vin1, vin2, vout_expected,\n                                            vout_computed);\n      }}\n      '''.format(typ=typ, min_=min_)\n\n# test signed neither overflow nor underflow\ndef test_adds_signed_neither_overflow_nor_underflow(typ, min_, max_):\n      return \\\n        test_signed_neither_overflow_nor_underflow(typ, min_, max_,\n         '+', 'adds_signed_is_neither_overflow_nor_underflow')\n\n# test signed all cases\ndef test_adds_signed_all_cases(typ, min_, max_):\n      return test_signed_all_cases(typ, min_, max_, '+', 'adds_is_overflow',\n                                   'adds_signed_is_underflow')\n\n# all signed tests\ndef tests_adds_signed():\n      return'''\n      zero_out_arrays(vin1, vin2, vout_expected, vout_computed);\n      CHECK_CASE(test_overflow(vin1, vin2, vout_expected,\n                 vout_computed), \"overflow\");\n\n      zero_out_arrays(vin1, vin2, vout_expected, vout_computed);\n      CHECK_CASE(test_underflow(vin1, vin2, vout_expected,\n                 vout_computed), \"underflow\");\n\n      zero_out_arrays(vin1, vin2, vout_expected, vout_computed);\n      CHECK_CASE(test_neither_overflow_nor_underflow(vin1, vin2,\n                 vout_expected, vout_computed),\n                 \"neither underflow nor overflow\");\n\n      zero_out_arrays(vin1, vin2, vout_expected, vout_computed);\n      CHECK_CASE(test_all_cases(vin1, vin2, vout_expected,\n                 vout_computed), \"all cases\");\n      '''\n\n# -----------------------------------------------------------------------------\n# Tests helper for adds with unsigned types\n\n# test signed neither overflow nor underflow\ndef 
test_adds_unsigned_no_overflow(typ, max_):\n      return '''\n      int test_no_overflow({typ} vin1[], {typ} vin2[], {typ} vout_expected[], {typ} vout_computed[])\n      {{\n        int ii = 0;\n        while(ii < SIZE)\n        {{\n          {typ} a = ({typ})(({typ})rand() % {max_});\n          {typ} b = ({typ})(({typ})rand() % {max_});\n          if(! adds_is_overflow(a, b))\n          {{\n            vin1[ii] = a;\n            vin2[ii] = b;\n            vout_expected[ii] = ({typ})(a + b);\n            ++ ii;\n          }}\n        }}\n        assert(ii == SIZE);\n        /*\n        Test:\n        if (not adds is overflow) {{ vout_expected[ii] == a + b; }}\n        */\n        return compare_expected_vs_computed(vin1, vin2, vout_expected, vout_computed);\n      }}\n      '''.format(typ=typ, max_=max_)\n\n# test unsigned all cases\ndef test_adds_unsigned_all_cases(typ, max_):\n      return '''\n      int test_all_cases({typ} vin1[], {typ} vin2[], {typ} vout_expected[], {typ} vout_computed[])\n      {{\n        int ii = 0;\n        for(ii = 0; ii < SIZE; ++ii)\n        {{\n          vin1[ii] = ({typ})(({typ})rand() % {max_});\n          vin2[ii] = ({typ})(({typ})rand() % {max_});\n          if(adds_is_overflow(vin1[ii], vin2[ii]))\n          {{\n            vout_expected[ii] = {max_};\n          }}\n          else {{ vout_expected[ii] = ({typ})(vin1[ii] + vin2[ii]); }}\n        }}\n        /* Test all cases: */\n        return compare_expected_vs_computed(vin1, vin2, vout_expected, vout_computed);\n      }}\n      '''.format(typ=typ, max_=max_)\n\n# all unsigned tests\ndef tests_adds_unsigned():\n      return'''\n      zero_out_arrays(vin1, vin2, vout_expected, vout_computed);\n      CHECK_CASE(test_overflow(vin1, vin2, vout_expected,\n                 vout_computed), \"overflow\");\n\n      zero_out_arrays(vin1, vin2, vout_expected, vout_computed);\n      CHECK_CASE(test_no_overflow(vin1, vin2, vout_expected,\n                 vout_computed), \"no 
overflow\");\n\n      zero_out_arrays(vin1, vin2, vout_expected, vout_computed);\n      CHECK_CASE(test_all_cases(vin1, vin2, vout_expected,\n                 vout_computed), \"all cases\");\n      '''\n\n# ------------------------------------------------------------------------------\n# Get adds tests given type\n\ndef get_adds_tests_cases_for_signed_types(typ, min_, max_):\n      helpers = '''\n            {test_adds_overflow}\n\n            {test_adds_signed_underflow}\n\n            {adds_is_overflow}\n\n            {adds_signed_is_underflow}\n\n            {adds_signed_is_neither_overflow_nor_underflow}\n\n            {test_adds_signed_neither_overflow_nor_underflow}\n\n            {test_adds_signed_all_cases}\n          ''' .format(test_adds_overflow=test_adds_overflow(typ, max_),\n                      test_adds_signed_underflow=test_adds_signed_underflow(\n                          typ, min_),\n                      adds_is_overflow=adds_is_overflow(typ, max_),\n                      adds_signed_is_underflow=adds_signed_is_underflow(\n                          typ, min_),\n                      adds_signed_is_neither_overflow_nor_underflow=adds_signed_is_neither_overflow_nor_underflow(\n                          typ),\n                      test_adds_signed_neither_overflow_nor_underflow=test_adds_signed_neither_overflow_nor_underflow(\n                          typ, min_=min_, max_=max_),\n                      test_adds_signed_all_cases=test_adds_signed_all_cases(\n                          typ, min_=min_, max_=max_)\n                      )\n      return {'helpers': helpers, 'tests': tests_adds_signed()}\n\ndef get_adds_tests_cases_for_unsigned_types(typ, max_):\n      helpers = '''\n          {test_adds_overflow}\n\n          {adds_is_overflow}\n\n          {test_adds_unsigned_no_overflow}\n\n          {test_adds_unsigned_all_cases}\n          ''' .format(test_adds_overflow=test_adds_overflow(typ, max_),\n                      
adds_is_overflow=adds_is_overflow(typ, max_),\n                      test_adds_unsigned_no_overflow=test_adds_unsigned_no_overflow(\n                          typ, max_),\n                      test_adds_unsigned_all_cases=test_adds_unsigned_all_cases(typ, max_)\n                      )\n      return {'helpers': helpers, 'tests': tests_adds_unsigned()}\n\ndef get_adds_tests_cases_given_type(typ):\n      if typ in common.iutypes:\n            type_limits = common.limits[typ]\n            min_ = type_limits['min']\n            max_ = type_limits['max']\n\n            if typ in common.itypes:\n                  return get_adds_tests_cases_for_signed_types(typ=typ, min_=min_, max_=max_)\n\n            if typ in common.utypes:\n                  return get_adds_tests_cases_for_unsigned_types(typ=typ, max_=max_)\n      else:\n            msg = '{typ} not implemented'.format(typ=typ)\n            raise TypeError(msg)\n\n# -----------------------------------------------------------------------------\n# gen_adds\n\ndef gen_adds(opts, op, typ, lang):\n\n    # Do not test for floats since adds(floats) == add(floats)\n    if typ in common.ftypes:\n        return\n\n    filename = get_filename(opts, op, typ, lang)\n\n    if filename == None:\n        return\n\n    sizeof = common.sizeof(typ)\n\n    head = '''\n              {includes}\n              #include <assert.h>\n\n              #define SIZE (2048 / {sizeof})\n\n              #define STATUS \"test of {op_name} over {typ}\"\n\n              {aligned_alloc_error}\n\n              {adds_subs_check_case}\n            ''' .format(includes=get_includes(lang),\n                        op_name=op.name,\n                        typ=typ,\n                        sizeof=sizeof,\n                        aligned_alloc_error=aligned_alloc_error(),\n                        adds_subs_check_case=adds_subs_check_case())\n\n    with common.open_utf8(opts, filename) as out:\n        out.write(\n            ''' \\\n            {head}\n       
     /* ------------------------------------------------------------------------- */\n\n            {random_sign_flip}\n\n            {zero_out_arrays}\n\n            {equal}\n\n            {compare_expected_vs_computed}\n\n            {tests_helpers}\n\n            int main(void)\n            {{\n              const int mem_aligned_size = SIZE * {sizeof};\n\n              {typ} *vin1;\n              {typ} *vin2;\n\n              {typ} *vout_expected;\n              {typ} *vout_computed;\n\n              CHECK(vin1 = ({typ} *)nsimd_aligned_alloc(mem_aligned_size));\n              CHECK(vin2 = ({typ} *)nsimd_aligned_alloc(mem_aligned_size));\n\n              CHECK(vout_expected = ({typ} *)nsimd_aligned_alloc(mem_aligned_size));\n              CHECK(vout_computed = ({typ} *)nsimd_aligned_alloc(mem_aligned_size));\n\n              {tests}\n\n              fprintf(stdout, STATUS \"... OK\\\\n\");\n              fflush(stdout);\n              return EXIT_SUCCESS;\n            }}\n        ''' .format(head=head,\n                    compare_expected_vs_computed=\\\n                      compare_expected_vs_computed(typ, op.name, lang),\n                    random_sign_flip='' if typ in common.utypes \\\n                                        else random_sign_flip(),\n                    zero_out_arrays=zero_out_arrays(typ),\n                    equal=equal(typ),\n                    tests_helpers=\\\n                      get_adds_tests_cases_given_type(typ)['helpers'],\n                    tests=get_adds_tests_cases_given_type(typ)['tests'],\n                    op_name = op.name,\n                    typ=typ,\n                    sizeof = sizeof)\n        )\n\n    common.clang_format(opts, filename)\n\n# -----------------------------------------------------------------------------\n# Tests helpers for subs - is overflow/underflow/neither overflow nor underflow\n\n# subs signed\n\ndef subs_signed_is_overflow(typ, max_):\n      return '''\n      int 
subs_signed_is_overflow(const {typ} a, const {typ} b)
      {{
        return (b < 0) && (a > {max_} + b);
      }}
      '''.format(typ=typ, max_=max_)

def subs_signed_is_underflow(typ, min_):
      """Return C source for a predicate: does a - b underflow below min_
      for the signed type typ?"""
      return '''
      int subs_signed_is_underflow(const {typ} a, const {typ} b)
      {{
        return (b > 0) && (a < {min_} + b);
      }}
      '''.format(typ=typ, min_=min_)

def subs_signed_is_neither_overflow_nor_underflow(typ):
      """Return C source for a predicate combining (the negation of) the two
      helpers above: a - b neither overflows nor underflows."""
      return '''
      int subs_signed_is_neither_overflow_nor_underflow(const {typ} a,
                                                        const {typ} b) {{
        return !subs_signed_is_overflow(a, b) &&
               !subs_signed_is_underflow(a, b);
      }}
      '''.format(typ=typ)

# subs unsigned

def subs_unsigned_is_underflow(typ):
      """Return C source for a predicate: does the unsigned subtraction
      a - b wrap, i.e. a < b?"""
      return '''
      int subs_unsigned_is_underflow(const {typ} a, const {typ} b)
      {{
        return a < b;
      }}
      '''.format(typ=typ)

# -----------------------------------------------------------------------------
# Tests helpers for subs with signed types

# test signed integer overflow
def test_subs_signed_overflow(typ, min_, max_):
      """Return C source for test_overflow(): fills vin1/vin2 so every lane
      overflows on subs and expects the saturated result max_ in each lane."""
      return '''
      int test_overflow({typ} vin1[], {typ} vin2[], {typ} vout_expected[],
                        {typ} vout_computed[])
      {{
        /*
        if ((vin2[ii] < 0) && (vin1[ii] > {max_} + vin2[ii])) {{
          overflow
        }}
        */
        int ii = 0;

        /* vin2[ii] < 0 */
        for(ii = 0; ii < SIZE; ++ii)
        {{
          {typ} rand_val = ({typ})((- rand()) % {min_});
          vin2[ii] = (rand_val == 0 ? 
- 1 : rand_val);
        }}

        /*
        vin1[ii] - vin2[ii] > {max_}
        vin1[ii] > {max_} + vin2[ii]
        vin1[ii] = {max_} + vin2[ii] + rand_val
        s.t.: 0 < rand_val <= - vin2[ii]

        (- TYPE_MIN) overflows
        if vin2[ii] == -1 -->  rand() % -(vin2[ii] + 1) --> rand() % 0
        Therefore check if vin2[ii] == -1 --> if True --> set rand_val == 1
        */

        for(ii = 0; ii < SIZE; ++ii)
        {{
          {typ} rand_val = 0;
          if(-1 == vin2[ii]){{ rand_val = 1; }}
          else{{
            rand_val = ({typ})(rand() % -(vin2[ii] + 1));
            rand_val = (rand_val == 0 ? 1 : rand_val);
          }}
            vin1[ii] = ({typ})({max_} + vin2[ii] + rand_val);
            vout_expected[ii] = {max_};
        }}

        /*
        Test:
        if ((vin2[ii] < 0) && (vin1[ii] > {max_} + vin2[ii])) {{
          vout_expected[ii] == {max_};
        }}
        */
        return compare_expected_vs_computed(vin1, vin2, vout_expected,
                                            vout_computed);
     }}
      '''.format(typ=typ, min_=min_, max_=max_)

# test signed underflow
def test_subs_signed_underflow(typ, min_, max_):
      """Return C source for test_underflow(): fills vin1/vin2 so every lane
      underflows on subs and expects the saturated result min_ in each lane."""
      return '''
      int test_underflow({typ} vin1[], {typ} vin2[], {typ} vout_expected[],
                         {typ} vout_computed[]) {{
        /*
        if ((vin2[ii] > 0) && (vin1[ii] < {min_} + vin2[ii])) {{
          underflow
        }}
        */
        int ii = 0;

        /* vin2[ii] > 0 */
        for(ii = 0; ii < SIZE; ++ii)
        {{
            {typ} rand_val = ({typ})(rand() % {max_});
            vin2[ii] = (rand_val == 0 ? 
1 : rand_val);
        }}

        /*
        vin1[ii] < {min_} + vin2[ii]
        vin1[ii] = {min_} + vin2[ii] - rand_val
        s.t.: 0 < rand_val < vin2[ii]
        */
        for(ii = 0; ii < SIZE; ++ii)
        {{
            {typ} rand_val = ({typ})(rand() % vin2[ii]);
            rand_val = (rand_val == 0 ? 1 : rand_val);
            vin1[ii] = ({typ})({min_} + vin2[ii] - rand_val);
            vout_expected[ii] = {min_};
        }}

        /*
        Test:
        if ((vin2[ii] > 0) && (vin1[ii] < {min_} + vin2[ii])) {{
          vout_expected[ii] == {min_};
        }}
        */
        return compare_expected_vs_computed(vin1, vin2, vout_expected,
                                            vout_computed);
      }}
      '''.format(typ=typ, min_=min_, max_=max_)

# test signed neither overflow nor underflow
def test_subs_signed_neither_overflow_nor_underflow(typ, min_, max_):
      """Return C source for the no-saturation case, delegating to the shared
      generic helper with '-' as the operator and the subs predicate name."""
      return \
        test_signed_neither_overflow_nor_underflow(typ, min_, max_,
         '-', 'subs_signed_is_neither_overflow_nor_underflow')

# test signed all cases
def test_subs_signed_all_cases(typ, min_, max_):
      """Return C source for test_all_cases(), delegating to the shared
      generic helper with '-' and the subs overflow/underflow predicates."""
      return test_signed_all_cases(typ, min_, max_, '-',
                                   'subs_signed_is_overflow',
                                   'subs_signed_is_underflow')

# all signed tests
def tests_subs_signed():
      """Return the C statements run by main() for signed subs: each case
      zeroes the four arrays then runs one CHECK_CASE'd test function."""
      return '''
      zero_out_arrays(vin1, vin2, vout_expected, vout_computed);
      CHECK_CASE(test_overflow(vin1, vin2, vout_expected,
                 vout_computed), "overflow");

      zero_out_arrays(vin1, vin2, vout_expected, vout_computed);
      CHECK_CASE(test_underflow(vin1, vin2, vout_expected,
                 vout_computed), "underflow");

      zero_out_arrays(vin1, vin2, vout_expected, vout_computed);
      CHECK_CASE(test_neither_overflow_nor_underflow(vin1, vin2, vout_expected,
                 vout_computed), "neither underflow nor overflow");

      
zero_out_arrays(vin1, vin2, vout_expected, vout_computed);
      CHECK_CASE(test_all_cases(vin1, vin2, vout_expected,
                 vout_computed), "all cases");
      '''

# -----------------------------------------------------------------------------
# Tests helpers for subs with unsigned types

# test unsigned underflow
def test_subs_unsigned_underflow(typ, min_, max_):
      """Return C source for test_underflow(): builds vin2 > vin1 in every
      lane so unsigned subs saturates, and expects min_ in each lane."""
      return '''
      int test_underflow({typ} vin1[], {typ} vin2[], {typ} vout_expected[],
                         {typ} vout_computed[]) {{
        /* if (vin1[ii] < vin2[ii]) {{ underflow }} */
        int ii = 0;

        /* vin1[ii] */
        for(ii = 0; ii < SIZE; ++ii) {{
          vin1[ii] = ({typ})(({typ})rand() % {max_});
        }}

        /*
        vin1[ii] < vin2[ii]
        vin2[ii] = vin1[ii] + rand_val
        s.t.: 0 < rand_val < {max_} - vin1[ii]
        */
        for(ii = 0; ii < SIZE; ++ii)
        {{
            {typ} rand_val = ({typ})(({typ})rand() % ({max_} - vin1[ii]));
            rand_val = (rand_val == 0 ? 
1 : rand_val);
            vin2[ii] = ({typ})(vin1[ii] + rand_val);
            vout_expected[ii] = ({typ}){min_};
        }}

        /*
        Test:
        if (vin1[ii] < vin2[ii]) {{ vout_expected[ii] == {min_}; }}
        */
        return compare_expected_vs_computed(vin1, vin2, vout_expected,
                                            vout_computed);
      }}
      '''.format(typ=typ, min_=min_, max_=max_)

# test unsigned no underflow
def test_subs_unsigned_no_underflow(typ, max_):
      """Return C source for test_no_underflow(): builds vin2 <= vin1 in every
      lane so unsigned subs does not wrap; expects the exact difference."""
      return '''
      int test_no_underflow({typ} vin1[], {typ} vin2[], {typ} vout_expected[],
                            {typ} vout_computed[]) {{
        /* if (vin1[ii] >= vin2[ii]) {{ no underflow }} */
        int ii = 0;

        /* vin1[ii] */
        for(ii = 0; ii < SIZE; ++ii) {{
          vin1[ii] = ({typ})(({typ})rand() % {max_});
        }}

        /*
        vin1[ii] >= vin2[ii]
        vin2 = vin1 - rand_val
        s.t. 0 <= rand_val <= vin1
        */

        for(ii = 0; ii < SIZE; ++ii)
        {{
            {typ} rand_val = ({typ})(({typ})rand() % (vin1[ii] + 1));
            vin2[ii] = ({typ})(vin1[ii] - rand_val);
            vout_expected[ii] = ({typ})(vin1[ii] - vin2[ii]);
        }}

        /*
        Test:
        if (vin1[ii] >= vin2[ii]) {{
          vout_expected[ii] == vin1[ii] - vin2[ii];
        }}
        */
        return compare_expected_vs_computed(vin1, vin2, vout_expected,
                                            vout_computed);
      }}
      '''.format(typ=typ, max_=max_)

# test unsigned all cases
def test_subs_unsigned_all_cases(typ, min_, max_):
      """Return C source for test_all_cases(): random vin1/vin2, expected
      value chosen per-lane with the subs_unsigned_is_underflow predicate."""
      return '''
      int test_all_cases({typ} vin1[], {typ} vin2[], {typ} vout_expected[],
                         {typ} vout_computed[]) {{
        int ii = 0;
        for(ii = 0; ii < SIZE; ++ii)
        {{
          vin1[ii] = ({typ})(({typ})rand() % {max_});
          vin2[ii] = ({typ})(({typ})rand() % {max_});
          
if(subs_unsigned_is_underflow(vin1[ii], vin2[ii]))
          {{
            vout_expected[ii] = ({typ}){min_};
          }}
          else {{ vout_expected[ii] = ({typ})(vin1[ii] - vin2[ii]); }}
        }}
        /* Test all cases: */
        return compare_expected_vs_computed(vin1, vin2, vout_expected,
                                            vout_computed);
      }}
      '''.format(typ=typ, min_=min_, max_=max_)

# all unsigned tests
def tests_subs_unsigned():
      """Return the C statements run by main() for unsigned subs: each case
      zeroes the four arrays then runs one CHECK_CASE'd test function."""
      return'''
      zero_out_arrays(vin1, vin2, vout_expected, vout_computed);
      CHECK_CASE(test_underflow(vin1, vin2, vout_expected,
                 vout_computed), "underflow");

      zero_out_arrays(vin1, vin2, vout_expected, vout_computed);
      CHECK_CASE(test_no_underflow(vin1, vin2, vout_expected, vout_computed),
      "no underflow");

      zero_out_arrays(vin1, vin2, vout_expected, vout_computed);
      CHECK_CASE(test_all_cases(vin1, vin2, vout_expected,
                 vout_computed), "all cases");
      '''

# ------------------------------------------------------------------------------
# Get subs tests given type

def get_subs_tests_cases_for_signed_types(typ, min_, max_):
      """Assemble the signed-subs test material for typ: returns a dict with
      'helpers' (concatenated C helper/test functions) and 'tests' (the
      statements main() runs)."""
      helpers = '''
            {test_subs_signed_overflow}

            {test_subs_signed_underflow}

            {subs_signed_is_overflow}

            {subs_signed_is_underflow}

            {subs_signed_is_neither_overflow_nor_underflow}

            {test_subs_signed_neither_overflow_nor_underflow}

            {test_subs_signed_all_cases}
          ''' .format(test_subs_signed_overflow=\
                        test_subs_signed_overflow(typ, min_, max_),
                      test_subs_signed_underflow=\
                        test_subs_signed_underflow(typ, min_, max_),
                      subs_signed_is_overflow=\
                        subs_signed_is_overflow(typ, max_),
                      subs_signed_is_underflow=\
               
         subs_signed_is_underflow(typ, min_),
                      subs_signed_is_neither_overflow_nor_underflow=\
                        subs_signed_is_neither_overflow_nor_underflow(typ),
                      test_subs_signed_neither_overflow_nor_underflow=\
                        test_subs_signed_neither_overflow_nor_underflow(
                          typ, min_=min_, max_=max_),
                      test_subs_signed_all_cases=\
                        test_subs_signed_all_cases(typ, min_=min_, max_=max_))
      return {'helpers': helpers, 'tests': tests_subs_signed()}

def get_subs_tests_cases_for_unsigned_types(typ, min_, max_):
      """Assemble the unsigned-subs test material for typ: returns a dict with
      'helpers' (concatenated C helper/test functions) and 'tests' (the
      statements main() runs)."""
      helpers = '''
          {test_subs_unsigned_underflow}

          {test_subs_unsigned_no_underflow}

          {subs_unsigned_is_underflow}

          {test_subs_unsigned_all_cases}
          ''' .format(test_subs_unsigned_underflow=\
                        test_subs_unsigned_underflow(typ, min_, max_),
                      test_subs_unsigned_no_underflow=\
                        test_subs_unsigned_no_underflow(typ, max_),
                      subs_unsigned_is_underflow=\
                        subs_unsigned_is_underflow(typ),
                      test_subs_unsigned_all_cases=\
                        test_subs_unsigned_all_cases(typ, min_, max_))
      return {'helpers': helpers, 'tests': tests_subs_unsigned()}

def get_subs_tests_cases_given_type(typ):
      """Dispatch on typ: signed integer types get the signed test set,
      unsigned types the unsigned one; raises TypeError for any other typ
      (note: floats never reach here, gen_subs filters them out)."""
      if typ in common.iutypes:
            type_limits = common.limits[typ]
            min_ = type_limits['min']
            max_ = type_limits['max']

            if typ in common.itypes:
                  return get_subs_tests_cases_for_signed_types(
                             typ=typ, min_=min_, max_=max_)

            if typ in common.utypes:
                  return get_subs_tests_cases_for_unsigned_types(
                             typ=typ, min_=min_, max_=max_)
      else:
            msg = '{typ} not 
implemented'.format(typ=typ)\n            raise TypeError(msg)\n\n# -----------------------------------------------------------------------------\n# gen_subs\n\ndef gen_subs(opts, op, typ, lang):\n\n    # Do not test for floats since subs(floats) == sub(floats)\n    if typ in common.ftypes:\n          return\n\n    filename = get_filename(opts, op, typ, lang)\n\n    if filename == None:\n        return\n\n    sizeof = common.sizeof(typ)\n\n    head = \\\n    '''{includes}\n       #include <assert.h>\n\n       #define SIZE (2048 / {sizeof})\n\n       #define STATUS \"test of {op_name} over {typ}\"\n\n       {aligned_alloc_error}\n\n       {adds_subs_check_case}'''. \\\n       format(includes=get_includes(lang), op_name=op.name, typ=typ,\n              sizeof=sizeof, aligned_alloc_error=aligned_alloc_error(),\n              adds_subs_check_case=adds_subs_check_case())\n\n    with common.open_utf8(opts, filename) as out:\n        out.write('''\n        {head}\n\n        {hbar}\n\n        {random_sign_flip}\n\n        {zero_out_arrays}\n\n        {equal}\n\n        {compare_expected_vs_computed}\n\n        {tests_helpers}\n\n        int main(void)\n        {{\n          const int mem_aligned_size = SIZE * {sizeof};\n\n          {typ} *vin1;\n          {typ} *vin2;\n\n          {typ} *vout_expected;\n          {typ} *vout_computed;\n\n          CHECK(vin1 = ({typ} *)nsimd_aligned_alloc(mem_aligned_size));\n          CHECK(vin2 = ({typ} *)nsimd_aligned_alloc(mem_aligned_size));\n\n          CHECK(vout_expected = ({typ} *)nsimd_aligned_alloc(mem_aligned_size));\n          CHECK(vout_computed = ({typ} *)nsimd_aligned_alloc(mem_aligned_size));\n\n          {tests}\n\n          fprintf(stdout, STATUS \"... 
OK\\\\n\");\n          fflush(stdout);\n          return EXIT_SUCCESS;\n        }}\n        '''.format(head=head,\n                   compare_expected_vs_computed=\\\n                     compare_expected_vs_computed(typ, op.name, lang),\n                   random_sign_flip='' if typ in common.utypes \\\n                                       else random_sign_flip(),\n                   zero_out_arrays=zero_out_arrays(typ),\n                   equal=equal(typ),\n                   tests_helpers=\\\n                     get_subs_tests_cases_given_type(typ)['helpers'],\n                   tests=get_subs_tests_cases_given_type(typ)['tests'],\n                   op_name=op.name, typ=typ, hbar=common.hbar, sizeof=sizeof))\n\n    common.clang_format(opts, filename)\n\n# -----------------------------------------------------------------------------\n# Tests for all and any\n\ndef gen_all_any(opts, op, typ, lang):\n    filename = get_filename(opts, op, typ, lang)\n    if filename == None:\n        return\n    if lang == 'c_base':\n        op_test = 'v{}(vloadla(buf, {}), {})'.format(op.name, typ, typ)\n    elif lang == 'c_adv':\n        op_test = 'nsimd_{}(nsimd_loadla(nsimd_packl_{}, buf))'. \\\n                  format(op.name, typ)\n    elif lang == 'cxx_base':\n        op_test = 'nsimd::{}(nsimd::loadla(buf, {}()), {}())'. \\\n                  format(op.name, typ, typ)\n    else:\n        op_test = 'nsimd::{}(nsimd::loadla<nsimd::packl<{}> >(buf))'. 
\\\n                  format(op.name, typ)\n    if typ == 'f16':\n        scalar0 = 'nsimd_f32_to_f16(0)'\n        scalar1 = 'nsimd_f32_to_f16(1)'\n    else:\n        scalar0 = '({})0'.format(typ)\n        scalar1 = '({})1'.format(typ)\n    with common.open_utf8(opts, filename) as out:\n        out.write(\n            '''{includes}\n\n           #define CHECK(a) {{ \\\\\n             errno = 0; \\\\\n             if (!(a)) {{ \\\\\n               fprintf(stderr, \"ERROR: \" #a \":%d: %s\\\\n\", \\\\\n                       __LINE__, strerror(errno)); \\\\\n               fflush(stderr); \\\\\n               exit(EXIT_FAILURE); \\\\\n             }} \\\\\n           }}\n\n           int main(void) {{\n             int i;\n             {typ} *buf;\n             int len = vlen({typ});\n\n             fprintf(stdout, \"test of {op_name} over {typ}...\\\\n\");\n             CHECK(buf = ({typ}*)nsimd_aligned_alloc(len * {sizeof}));\n\n             /* Test with all elements to true */\n             for (i = 0; i < len; i++) {{\n               buf[i] = {scalar1};\n             }}\n             if (!{op_test}) {{\n               exit(EXIT_FAILURE);\n             }}\n\n             /* Test with all elements set to false */\n             for (i = 0; i < len; i++) {{\n               buf[i] = {scalar0};\n             }}\n             if ({op_test}) {{\n               exit(EXIT_FAILURE);\n             }}\n\n             /* Test with only one element set to true */\n             if (len > 1) {{\n               buf[0] = {scalar1};\n               if ({notl}{op_test}) {{\n                 exit(EXIT_FAILURE);\n               }}\n             }}\n\n             fprintf(stdout, \"test of {op_name} over {typ}... OK\\\\n\");\n             return EXIT_SUCCESS;\n           }}'''.format(includes=get_includes(lang), op_name=op.name,\n                        typ=typ, op_test=op_test, year=date.today().year,\n                        notl='!' 
if op.name == 'any' else '', scalar0=scalar0,\n                        scalar1=scalar1, sizeof=common.sizeof(typ)))\n    common.clang_format(opts, filename)\n\n# -----------------------------------------------------------------------------\n# Tests for load/store of degrees 2, 3 and 4\n\ndef gen_load_store(opts, op, typ, lang):\n    filename = get_filename(opts, op, typ, lang)\n    if filename == None:\n        return\n    if op.name.startswith('load'):\n        deg = op.name[4]\n        align = op.name[5]\n    elif op.name.startswith('store'):\n        deg = op.name[5]\n        align = op.name[6]\n    variables = ', '.join(['v.v{}'.format(i) for i in range(0, int(deg))])\n    if lang == 'c_base':\n        load_store = \\\n            '''vecx{deg}({typ}) v = vload{deg}{align}(&vin[i], {typ});\n               vstore{deg}{align}(&vout[i], {variables}, {typ});'''. \\\n               format(deg=deg, typ=typ, align=align, variables=variables)\n    elif lang == 'c_adv':\n        load_store = \\\n            '''nsimd_packx{deg}_{typ} v =\n                   nsimd_load{deg}{align}(nsimd_packx{deg}_{typ}, &vin[i]);\n               nsimd_store{deg}{align}(&vout[i], {variables});'''. \\\n               format(deg=deg, typ=typ, align=align, variables=variables)\n    elif lang == 'cxx_base':\n        load_store = \\\n            '''vecx{deg}({typ}) v = nsimd::load{deg}{align}(&vin[i], {typ}());\n               nsimd::store{deg}{align}(&vout[i], {variables}, {typ}());'''. \\\n               format(deg=deg, typ=typ, align=align, variables=variables)\n    else:\n        load_store = \\\n            '''nsimd::packx{deg}<{typ}> v = nsimd::load{deg}{align}<\n                                          nsimd::packx{deg}<{typ}> >(&vin[i]);\n               nsimd::store{deg}{align}(&vout[i], {variables});'''. 
\\\n               format(deg=deg, typ=typ, align=align, variables=variables)\n    if typ == 'f16':\n        rand = '*((u16*)vin + i) = nsimd_f32_to_u16((float)(rand() % 10));'\n        comp = '*((u16*)vin + i) != *((u16 *)vout + i)'\n    else:\n        rand = 'vin[i] = ({})(rand() % 10);'.format(typ)\n        comp = 'vin[i] != vout[i]'\n\n    if align=='u':\n        unalign = '+1'\n    else:\n        unalign = ''\n\n    with common.open_utf8(opts, filename) as out:\n        out.write('''{includes}\n\n        #define SIZE (2048 / {sizeof})\n\n        #define STATUS \"test of {op_name} over {typ}\"\n\n        #define CHECK(a) {{ \\\\\n          errno = 0; \\\\\n          if (!(a)) {{ \\\\\n            fprintf(stderr, \"ERROR: \" #a \":%d: %s\\\\n\", \\\\\n                    __LINE__, strerror(errno)); \\\\\n            fflush(stderr); \\\\\n            exit(EXIT_FAILURE); \\\\\n          }} \\\\\n        }}\n\n        int main(void) {{\n          int i, vi;\n          {typ} *vin, *vout;\n          int len = vlen({typ});\n          int n = SIZE * {deg} * len;\n\n          fprintf(stdout, \"test of {op_name} over {typ}...\\\\n\");\n          CHECK(vin = ({typ}*)nsimd_aligned_alloc(\n                                n * {sizeof} {unalign}) {unalign});\n          CHECK(vout = ({typ}*)nsimd_aligned_alloc(\n                                   n * {sizeof} {unalign}) {unalign});\n\n          /* Fill with random data */\n          for (i = 0; i < n; i++) {{\n            {rand}\n          }}\n\n          /* Load and put back data into vout */\n          for (i = 0; i < n; i += {deg} * len) {{\n            {load_store}\n          }}\n\n          /* Compare results */\n          for (vi = 0; vi < SIZE; vi += len) {{\n            for (i = vi; i < vi + len; i++) {{\n              if ({comp}) {{\n                fprintf(stdout, STATUS \"... 
FAIL\\\\n\");\n                fflush(stdout);\n                return -1;\n              }}\n            }}\n          }}\n\n          fprintf(stdout, \"test of {op_name} over {typ}... OK\\\\n\");\n          return EXIT_SUCCESS;\n        }}'''.format(includes=get_includes(lang), op_name=op.name,\n                     typ=typ, rand=rand, year=date.today().year, deg=deg,\n                     sizeof=common.sizeof(typ), load_store=load_store,\n                     comp=comp, unalign=unalign))\n    common.clang_format(opts, filename)\n\n# -----------------------------------------------------------------------------\n# Tests for gather/scatter\n\ndef gen_gather_scatter(opts, op, typ, lang):\n    filename = get_filename(opts, op, typ, lang)\n    if filename == None:\n        return\n\n    ityp = 'i' + typ[1:]\n\n    if lang == 'c_base':\n        if op.name == 'gather_linear':\n            gather_scatter = '''vscatter_linear(vout + 1, 2, vgather_linear(\n                                    vin, 2, {typ}), {typ});'''.format(typ=typ)\n        else:\n            gather_scatter = \\\n                '''vec({ityp}) offsets = vmul(viota({ityp}), vset1(({ityp})2,\n                                              {ityp}), {ityp});\n                   vec({typ}) v = vgather(vin, offsets, {typ});\n                   offsets = vadd(offsets, vset1(({ityp})1, {ityp}), {ityp});\n                   vscatter(vout, offsets, v, {typ});'''. 
\\\n                   format(typ=typ, ityp=ityp)\n    elif lang == 'c_adv':\n        if op.name == 'gather_linear':\n            gather_scatter = \\\n            '''nsimd_scatter_linear(\n                   vout + 1, 2, nsimd_gather_linear(\n                     nsimd_pack_{}, vin, 2));'''.format(typ)\n        else:\n            gather_scatter = \\\n                '''nsimd_pack_{ityp} offsets = nsimd_mul(nsimd_iota(\n                       nsimd_pack_{ityp}), nsimd_set1(\n                         nsimd_pack_{ityp}, ({ityp})2));\n                   nsimd_pack_{typ} v = nsimd_gather(\n                       nsimd_pack_{typ}, vin, offsets);\n                   offsets = nsimd_add(offsets, nsimd_set1(nsimd_pack_{ityp},\n                                                           ({ityp})1));\n                   nsimd_scatter(vout, offsets, v);'''. \\\n                   format(typ=typ, ityp=ityp)\n    elif lang == 'cxx_base':\n        if op.name == 'gather_linear':\n            gather_scatter = '''nsimd::scatter_linear(vout + 1, 2,\n                                  nsimd::gather_linear(\n                                    vin, 2, {typ}()), {typ}());'''. \\\n                                    format(typ=typ)\n        else:\n            gather_scatter = \\\n            '''vec({ityp}) offsets = nsimd::mul(nsimd::iota({ityp}()),\n                                     nsimd::set1(({ityp})2, {ityp}()),\n                                     {ityp}());\n               vec({typ}) v = nsimd::gather(vin, offsets, {typ}());\n               offsets = nsimd::add(offsets, nsimd::set1(({ityp})1, {ityp}()),\n                                    {ityp}());\n               nsimd::scatter(vout, offsets, v, {typ}());'''. 
\\\n               format(typ=typ, ityp=ityp)\n    else:\n        if op.name == 'gather_linear':\n            gather_scatter = '''nsimd::scatter_linear(vout + 1, 2,\n                                  nsimd::gather_linear<nsimd::pack<{typ}> >(\n                                      vin, 2));'''.format(typ=typ)\n        else:\n            gather_scatter = \\\n            '''typedef nsimd::pack<{typ}> pack;\n               typedef nsimd::pack<{ityp}> ipack;\n               ipack offsets = nsimd::mul(nsimd::iota<ipack>(),\n                               nsimd::set1<ipack>(({ityp})2));\n               pack v = nsimd::gather(vin, offsets);\n               offsets = nsimd::add(offsets, nsimd::set1<ipack>(({ityp})1));\n               nsimd::scatter(vout, offsets, v);'''. \\\n               format(typ=typ, ityp=ityp)\n\n    if typ == 'f16':\n        one = 'nsimd_f32_to_f16(1.0f)'\n        zero = 'nsimd_f32_to_f16(0.0f)'\n        comp = 'nsimd_f16_to_f32(vout[i]) != 0.0f'\n    else:\n        one = '({typ})1'.format(typ=typ)\n        zero = '({typ})0'.format(typ=typ)\n        comp = 'vout[i] != ({typ})0'.format(typ=typ)\n\n    with common.open_utf8(opts, filename) as out:\n        out.write(\n           '''{includes}\n\n           #define STATUS \"test of {op_name} over {typ}\"\n\n           int main(void) {{\n             int n = 2 * vlen({typ});\n             int i;\n             {typ} vin[2 * NSIMD_MAX_LEN({typ})];\n             {typ} vout[2 * NSIMD_MAX_LEN({typ})];\n\n             fprintf(stdout, \"test of {op_name} over {typ}...\\\\n\");\n\n             /* Fill input and output with 0 1 0 1 0 1 ... 
*/\n             for (i = 0; i < n; i++) {{\n               if ((i % 2) == 1) {{\n                 vin[i] = {one};\n                 vout[i] = {one};\n               }} else {{\n                 vin[i] = {zero};\n                 vout[i] = {zero};\n               }}\n             }}\n\n             /* We gather odd offsets elements from vin and put then at even */\n             /* offsets. */\n             {{\n               {gather_scatter}\n             }}\n\n             /* Compare results */\n             for (i = 0; i < n; i++) {{\n               if ({comp}) {{\n                 fprintf(stdout, STATUS \"... FAIL\\\\n\");\n                 fflush(stdout);\n                 return -1;\n               }}\n             }}\n\n             fprintf(stdout, \"test of {op_name} over {typ}... OK\\\\n\");\n             return EXIT_SUCCESS;\n           }}'''.format(includes=get_includes(lang), ityp=ityp, comp=comp,\n                        typ=typ, year=date.today().year, op_name=op.name,\n                        gather_scatter=gather_scatter, zero=zero, one=one))\n    common.clang_format(opts, filename)\n\n# -----------------------------------------------------------------------------\n# Tests for masked scatter\n\ndef gen_mask_scatter(opts, op, typ, lang):\n    filename = get_filename(opts, op, typ, lang)\n    if filename == None:\n        return\n\n    ityp = 'i' + typ[1:]\n\n    if typ == 'f16':\n        two = 'nsimd_f32_to_f16(2.0f)'\n        one = 'nsimd_f32_to_f16(1.0f)'\n        zero = 'nsimd_f32_to_f16(0.0f)'\n        comp_with_0 = 'nsimd_f16_to_f32(vout[2 * k]) != 0.0f'\n        comp_with_1 = 'nsimd_f16_to_f32(vout[2 * k + 1]) != 1.0f'\n        comp_with_2 = 'nsimd_f16_to_f32(vout[2 * k]) != 2.0f'\n    else:\n        two = '({typ})2'.format(typ=typ)\n        one = '({typ})1'.format(typ=typ)\n        zero = '({typ})0'.format(typ=typ)\n        comp_with_0 = 'vout[2 * k] != ({typ})0'.format(typ=typ)\n        comp_with_1 = 'vout[2 * k + 1] != 
({typ})1'.format(typ=typ)\n        comp_with_2 = 'vout[2 * k] != ({typ})2'.format(typ=typ)\n\n    if lang == 'c_base':\n        mask_scatter = \\\n            '''vec({ityp}) offsets = vmul(viota({ityp}), vset1(({ityp})2,\n                                          {ityp}), {ityp});\n               vecl({typ}) mask = vmask_for_loop_tail(0, i, {typ});\n               vmask_scatter(mask, vout, offsets, vset1({two}, {typ}),\n                             {typ});'''.format(two=two, typ=typ, ityp=ityp)\n    if lang == 'c_adv':\n        mask_scatter = \\\n            '''nsimd_pack_{ityp} offsets = nsimd_mul(nsimd_iota(\n                   nsimd_pack_{ityp}), nsimd_set1(\n                     nsimd_pack_{ityp}, ({ityp})2));\n               nsimd_packl_{typ} mask = nsimd_mask_for_loop_tail(\n                   nsimd_pack_{typ}, 0, i);\n               nsimd_mask_scatter(mask, vout, offsets, nsimd_set1(\n                   nsimd_pack_{typ}, {two}));'''. \\\n                   format(two=two, typ=typ, ityp=ityp)\n    elif lang == 'cxx_base':\n        mask_scatter = \\\n            '''vec({ityp}) offsets = nsimd::mul(nsimd::iota({ityp}()),\n                                     nsimd::set1(({ityp})2, {ityp}()),\n                                     {ityp}());\n               vecl({typ}) mask = nsimd::mask_for_loop_tail(0, i, {typ}());\n               nsimd::mask_scatter(mask, vout, offsets, nsimd::set1(\n                                   {two}, {typ}()), {typ}());'''. 
\\\n                                   format(two=two, typ=typ, ityp=ityp)\n    else:\n        mask_scatter = \\\n            '''typedef nsimd::pack<{typ}> pack;\n               typedef nsimd::pack<{ityp}> ipack;\n               typedef nsimd::packl<{typ}> packl;\n               ipack offsets = nsimd::mul(nsimd::iota<ipack>(),\n                               nsimd::set1<ipack>(({ityp})2));\n               packl mask = nsimd::mask_for_loop_tail<packl>(0, i);\n               nsimd::mask_scatter(mask, vout, offsets,\n                                   nsimd::set1<pack>({two}));'''. \\\n                                   format(two=two, typ=typ, ityp=ityp)\n\n    with common.open_utf8(opts, filename) as out:\n        out.write(\n           '''{includes}\n\n           #define STATUS \"test of {op_name} over {typ}\"\n\n           int main(void) {{\n             int n = 2 * vlen({typ});\n             int i, j, k;\n             {typ} vout[2 * NSIMD_MAX_LEN({typ})];\n\n             fprintf(stdout, \"test of {op_name} over {typ}...\\\\n\");\n\n             for (i = 0; i < n / 2; i++) {{\n               /* Fill output with 0 1 0 1 0 1 ... */\n               for (j = 0; j < n; j++) {{\n                 vout[j] = (j % 2 == 0 ? {zero} : {one});\n               }}\n\n               {{\n                 {mask_scatter}\n               }}\n\n               /* Check results */\n               for (k = 0; k < n / 2; k++) {{\n                 if ({comp_with_1}) {{\n                   goto error;\n                 }}\n               }}\n               for (k = 0; k < i; k++) {{\n                 if ({comp_with_2}) {{\n                   goto error;\n                 }}\n               }}\n               for (; k < n / 2; k++) {{\n                 if ({comp_with_0}) {{\n                   goto error;\n                 }}\n               }}\n             }}\n\n             fprintf(stdout, \"test of {op_name} over {typ}... 
OK\\\\n\");\n             fflush(stdout);\n             return EXIT_SUCCESS;\n\n           error:\n             fprintf(stdout, STATUS \"... FAIL\\\\n\");\n             fflush(stdout);\n             return EXIT_FAILURE;\n           }}'''.format(includes=get_includes(lang), ityp=ityp, two=two,\n                        typ=typ, year=date.today().year, op_name=op.name,\n                        mask_scatter=mask_scatter, zero=zero, one=one,\n                        comp_with_0=comp_with_0, comp_with_2=comp_with_2,\n                        comp_with_1=comp_with_1))\n    common.clang_format(opts, filename)\n\n# -----------------------------------------------------------------------------\n# Tests for masked gather\n\ndef gen_maskoz_gather(opts, op, typ, lang):\n    filename = get_filename(opts, op, typ, lang)\n    if filename == None:\n        return\n\n    ityp = 'i' + typ[1:]\n\n    if typ == 'f16':\n        three = 'nsimd_f32_to_f16(3.0f)'\n        two = 'nsimd_f32_to_f16(2.0f)'\n        one = 'nsimd_f32_to_f16(1.0f)'\n        zero = 'nsimd_f32_to_f16(0.0f)'\n        comp_with_1 = 'nsimd_f16_to_f32(vout[k]) != 1.0f'\n        if op.name == 'maskz_gather':\n            comp_with_0_or_3 = 'nsimd_f16_to_f32(vout[k]) != 0.0f'\n        else:\n            comp_with_0_or_3 = 'nsimd_f16_to_f32(vout[k]) != 3.0f'\n    else:\n        three = '({typ})3'.format(typ=typ)\n        two = '({typ})2'.format(typ=typ)\n        one = '({typ})1'.format(typ=typ)\n        zero = '({typ})0'.format(typ=typ)\n        comp_with_1 = 'vout[k] != ({typ})1'.format(typ=typ)\n        if op.name == 'maskz_gather':\n            comp_with_0_or_3 = 'vout[k] != ({typ})0'.format(typ=typ)\n        else:\n            comp_with_0_or_3 = 'vout[k] != ({typ})3'.format(typ=typ)\n\n    oz = 'o' if op.name == 'masko_gather' else 'z'\n\n    if lang == 'c_base':\n        ta = ', vset1({three}, {typ})'.format(three=three, typ=typ) \\\n             if op.name == 'masko_gather' else ''\n        maskoz_gather = \\\n        
    '''vec({ityp}) offsets = vmul(viota({ityp}), vset1(({ityp})2,\n                                          {ityp}), {ityp});\n               vecl({typ}) mask = vmask_for_loop_tail(0, i, {typ});\n               vstoreu(vout, vmask{oz}_gather(mask, vin, offsets{ta},\n                       {typ}), {typ});'''. \\\n                       format(typ=typ, ityp=ityp, ta=ta, oz=oz)\n    if lang == 'c_adv':\n        ta = ', nsimd_set1(nsimd_pack_{typ}, {three})'. \\\n             format(three=three, typ=typ) if op.name == 'masko_gather' else ''\n        maskoz_gather = \\\n            '''nsimd_pack_{ityp} offsets = nsimd_mul(nsimd_iota(\n                   nsimd_pack_{ityp}), nsimd_set1(\n                       nsimd_pack_{ityp}, ({ityp})2));\n               nsimd_packl_{typ} mask = nsimd_mask_for_loop_tail(\n                                            nsimd_pack_{typ}, 0, i);\n               nsimd_storeu(vout, nsimd_mask{oz}_gather(\n                   mask, vin, offsets{ta}));'''. \\\n                   format(typ=typ, ityp=ityp, ta=ta, oz=oz)\n    elif lang == 'cxx_base':\n        ta = ', nsimd::set1({three}, {typ}())'.format(three=three, typ=typ) \\\n             if op.name == 'masko_gather' else ''\n        maskoz_gather = \\\n            '''vec({ityp}) offsets = nsimd::mul(nsimd::iota({ityp}()),\n                                     nsimd::set1(({ityp})2, {ityp}()),\n                                     {ityp}());\n               vecl({typ}) mask = nsimd::mask_for_loop_tail(0, i, {typ}());\n               nsimd::storeu(vout, nsimd::mask{oz}_gather(\n                   mask, vin, offsets{ta}, {typ}()), {typ}());'''. \\\n                   format(typ=typ, ityp=ityp, ta=ta, oz=oz)\n    else:\n        ta = ', nsimd::set1<nsimd::pack<{typ}> >({three})'. 
\\\n             format(three=three, typ=typ) if op.name == 'masko_gather' else ''\n        maskoz_gather = \\\n            '''typedef nsimd::pack<{ityp}> ipack;\n               typedef nsimd::packl<{typ}> packl;\n               ipack offsets = nsimd::mul(nsimd::iota<ipack>(),\n                               nsimd::set1<ipack>(({ityp})2));\n               packl mask = nsimd::mask_for_loop_tail<packl>(0, i);\n               nsimd::storeu(vout, nsimd::mask{oz}_gather(\n                   mask, vin, offsets{ta}));'''. \\\n                   format(ta=ta, oz=oz, typ=typ, ityp=ityp)\n\n    with common.open_utf8(opts, filename) as out:\n        out.write(\n           '''{includes}\n\n           #define STATUS \"test of {op_name} over {typ}\"\n\n           int main(void) {{\n             int n = 2 * vlen({typ});\n             int i, j, k;\n             {typ} vin[2 * NSIMD_MAX_LEN({typ})];\n             {typ} vout[NSIMD_MAX_LEN({typ})];\n\n             fprintf(stdout, \"test of {op_name} over {typ}...\\\\n\");\n\n             for (i = 0; i < n / 2; i++) {{\n               /* Fill input with 1 0 1 0 1 0 ... */\n               for (j = 0; j < n; j++) {{\n                 vin[j] = (j % 2 == 1 ? {zero} : {one});\n               }}\n\n               /* Fill output with 2's ... */\n               for (j = 0; j < n / 2; j++) {{\n                 vout[j] = {two};\n               }}\n\n               {{\n                 {maskoz_gather}\n               }}\n\n               /* Check results */\n               for (k = 0; k < i; k++) {{\n                 if ({comp_with_1}) {{\n                   goto error;\n                 }}\n               }}\n               for (; k < n / 2; k++) {{\n                 if ({comp_with_0_or_3}) {{\n                   goto error;\n                 }}\n               }}\n             }}\n\n             fprintf(stdout, \"test of {op_name} over {typ}... 
OK\\\\n\");\n             fflush(stdout);\n             return EXIT_SUCCESS;\n\n           error:\n             fprintf(stdout, STATUS \"... FAIL\\\\n\");\n             fflush(stdout);\n             return EXIT_FAILURE;\n           }}'''.format(includes=get_includes(lang), ityp=ityp, two=two,\n                        typ=typ, year=date.today().year, op_name=op.name,\n                        maskoz_gather=maskoz_gather, zero=zero, one=one,\n                        comp_with_0_or_3=comp_with_0_or_3, three=three,\n                        comp_with_1=comp_with_1))\n    common.clang_format(opts, filename)\n\n# -----------------------------------------------------------------------------\n# Tests for masked loads\n\ndef gen_mask_load(opts, op, typ, lang):\n    filename = get_filename(opts, op, typ, lang)\n    if filename == None:\n        return\n\n    if typ == 'f16':\n        fill_vin = 'vin[i] = nsimd_f32_to_f16((f32)i);'\n        m1 = 'nsimd_f32_to_f16(-1.0f)'\n        comp1 = 'nsimd_f16_to_f32(vout[j]) != (f32)j'\n    else:\n        fill_vin = 'vin[i] = ({typ})i;'.format(typ=typ)\n        m1 = '({typ})-1'.format(typ=typ)\n        comp1 = 'vout[j] != ({typ})j'.format(typ=typ)\n\n    if op.name in ['masko_loada1', 'masko_loadu1']:\n        if lang == 'c_base':\n            test = \\\n            '''vecl({typ}) mask = vmask_for_loop_tail(0, i, {typ});\n               vec({typ}) other = vset1({m1}, {typ});\n               vstoreu(vout, v{op_name}(mask, vin, other, {typ}), {typ});'''. \\\n               format(typ=typ, op_name=op.name, m1=m1)\n        elif lang == 'c_adv':\n            test = \\\n            '''nsimd_packl_{typ} mask = nsimd_mask_for_loop_tail(\n                                            nsimd_packl_{typ}, 0, i);\n               nsimd_pack_{typ} other = nsimd_set1(nsimd_pack_{typ}, {m1});\n               nsimd_storeu(vout, nsimd_{op_name}(mask, vin, other));'''. 
\\\n               format(typ=typ, op_name=op.name, m1=m1)\n        elif lang == 'cxx_base':\n            test = \\\n            '''vecl({typ}) mask = nsimd::mask_for_loop_tail(0, i, {typ}());\n               vec({typ}) other = nsimd::set1({m1}, {typ}());\n               nsimd::storeu(vout, nsimd::{op_name}(\n                   mask, vin, other, {typ}()), {typ}());'''. \\\n                   format(typ=typ, op_name=op.name, m1=m1)\n        elif lang == 'cxx_adv':\n            test = \\\n            '''nsimd::packl<{typ}> mask =\n                   nsimd::mask_for_loop_tail<nsimd::packl<{typ}> >(0, i);\n               nsimd::pack<{typ}> other = nsimd::set1<nsimd::pack<{typ}> >(\n                                              {m1});\n               nsimd::storeu(vout, nsimd::{op_name}(mask, vin, other));'''. \\\n               format(typ=typ, op_name=op.name, m1=m1)\n        comp2 = 'vout[j] != ({typ})-1'.format(typ=typ) if typ != 'f16' else \\\n                'nsimd_f16_to_f32(vout[j]) != -1.0f'\n    else:\n        if lang == 'c_base':\n            test = \\\n            '''vecl({typ}) mask = vmask_for_loop_tail(0, i, {typ});\n               vstoreu(vout, v{op_name}(mask, vin, {typ}), {typ});'''. \\\n               format(typ=typ, op_name=op.name, m1=m1)\n        elif lang == 'c_adv':\n            test = \\\n            '''nsimd_packl_{typ} mask = nsimd_mask_for_loop_tail(\n                                            nsimd_packl_{typ}, 0, i);\n               nsimd_storeu(vout, nsimd_{op_name}(mask, vin));'''. \\\n               format(typ=typ, op_name=op.name, m1=m1)\n        elif lang == 'cxx_base':\n            test = \\\n            '''vecl({typ}) mask = nsimd::mask_for_loop_tail(0, i, {typ}());\n               nsimd::storeu(vout, nsimd::{op_name}(\n                   mask, vin, {typ}()), {typ}());'''. 
\\\n                   format(typ=typ, op_name=op.name, m1=m1)\n        elif lang == 'cxx_adv':\n            test = \\\n            '''nsimd::packl<{typ}> mask =\n                   nsimd::mask_for_loop_tail<nsimd::packl<{typ}> >(0, i);\n               nsimd::storeu(vout, nsimd::{op_name}(mask, vin));'''. \\\n               format(typ=typ, op_name=op.name, m1=m1)\n        comp2 = 'vout[j] != ({typ})0'.format(typ=typ) if typ != 'f16' else \\\n                'nsimd_f16_to_f32(vout[j]) != -0.0f'\n\n    if op.name in ['masko_loadu1', 'maskz_loadu1']:\n        unalign = '\\nvin += 1;'\n    else:\n        unalign = ''\n\n    with common.open_utf8(opts, filename) as out:\n        out.write(\n           '''{includes}\n\n           #define STATUS \"test of {op_name} over {typ}\"\n\n           #define CHECK(a) {{ \\\\\n             errno = 0; \\\\\n             if (!(a)) {{ \\\\\n               fprintf(stderr, \"ERROR: \" #a \":%d: %s\\\\n\", \\\\\n                       __LINE__, strerror(errno)); \\\\\n               fflush(stderr); \\\\\n               exit(EXIT_FAILURE); \\\\\n             }} \\\\\n           }}\n\n           int main(void) {{\n             int i, j;\n             {typ} *vin;\n             {typ} vout[NSIMD_MAX_LEN({typ})];\n             int len = vlen({typ});\n\n             fprintf(stdout, \"test of {op_name} over {typ}...\\\\n\");\n\n             CHECK(vin = ({typ}*)nsimd_aligned_alloc(2 * len));{unalign}\n\n             /* Fill with data */\n             for (i = 0; i < len; i++) {{\n               {fill_vin}\n             }}\n\n             /* Load and put back data into vout */\n             for (i = 0; i < len; i++) {{\n               {test}\n\n               for (j = 0; j < i; j++) {{\n                 if ({comp1}) {{\n                   fprintf(stdout, STATUS \"... 
FAIL\\\\n\");\n                   fflush(stdout);\n                   return -1;\n                 }}\n               }}\n               for (; j < len; j++) {{\n                 if ({comp2}) {{\n                   fprintf(stdout, STATUS \"... FAIL\\\\n\");\n                   fflush(stdout);\n                   return -1;\n                 }}\n               }}\n             }}\n\n             fprintf(stdout, \"test of {op_name} over {typ}... OK\\\\n\");\n             return EXIT_SUCCESS;\n           }}'''.format(includes=get_includes(lang), op_name=op.name,\n                        typ=typ, year=date.today().year, test=test,\n                        comp1=comp1, comp2=comp2, unalign=unalign,\n                        fill_vin=fill_vin))\n    common.clang_format(opts, filename)\n\n# -----------------------------------------------------------------------------\n# Tests for masked stores\n\ndef gen_mask_store(opts, op, typ, lang):\n    filename = get_filename(opts, op, typ, lang)\n    if filename == None:\n        return\n\n    if typ == 'f16':\n        fill_vout = 'vout[i] = nsimd_f32_to_f16((f32)0);'\n        one = 'nsimd_f32_to_f16(1.0f)'\n        comp1 = 'nsimd_f16_to_f32(vout[j]) != (f32)1'\n        comp2 = 'nsimd_f16_to_f32(vout[j]) != (f32)0'\n    else:\n        fill_vout = 'vout[i] = ({typ})0;'.format(typ=typ)\n        one = '({typ})1'.format(typ=typ)\n        comp1 = 'vout[j] != ({typ})1'.format(typ=typ)\n        comp2 = 'vout[j] != ({typ})0'.format(typ=typ)\n\n    if lang == 'c_base':\n        test = \\\n        '''vecl({typ}) mask = vmask_for_loop_tail(0, i, {typ});\n           v{op_name}(mask, vout, vset1({one}, {typ}), {typ});'''. \\\n           format(typ=typ, op_name=op.name, one=one)\n    elif lang == 'c_adv':\n        test = \\\n        '''nsimd_packl_{typ} mask = nsimd_mask_for_loop_tail(\n               nsimd_packl_{typ}, 0, i);\n           nsimd_{op_name}(mask, vout, nsimd_set1(\n               nsimd_pack_{typ}, {one}));'''. 
\\\n               format(typ=typ, op_name=op.name, one=one)\n    elif lang == 'cxx_base':\n        test = \\\n        '''vecl({typ}) mask = nsimd::mask_for_loop_tail(0, i, {typ}());\n           nsimd::{op_name}(mask, vout, nsimd::set1({one}, {typ}()),\n                            {typ}());'''.format(typ=typ, op_name=op.name,\n                                                one=one)\n    elif lang == 'cxx_adv':\n        test = \\\n        '''nsimd::packl<{typ}> mask =\n               nsimd::mask_for_loop_tail<nsimd::packl<{typ}> >(0, i);\n           nsimd::{op_name}(mask, vout,\n                            nsimd::set1<nsimd::pack<{typ}> >({one}));'''. \\\n                            format(typ=typ, op_name=op.name, one=one)\n\n    if op.name == 'mask_storeu1':\n        unalign = '\\nvout += 1;'\n    else:\n        unalign = ''\n\n    with common.open_utf8(opts, filename) as out:\n        out.write(\n           '''{includes}\n\n           #define STATUS \"test of {op_name} over {typ}\"\n\n           #define CHECK(a) {{ \\\\\n             errno = 0; \\\\\n             if (!(a)) {{ \\\\\n               fprintf(stderr, \"ERROR: \" #a \":%d: %s\\\\n\", \\\\\n                       __LINE__, strerror(errno)); \\\\\n               fflush(stderr); \\\\\n               exit(EXIT_FAILURE); \\\\\n             }} \\\\\n           }}\n\n           int main(void) {{\n             int i, j;\n             {typ} *vout;\n             int len = vlen({typ});\n\n             fprintf(stdout, \"test of {op_name} over {typ}...\\\\n\");\n\n             CHECK(vout = ({typ}*)nsimd_aligned_alloc({sizeof} * len));{unalign}\n\n             /* Fill vout with zeors */\n             for (i = 0; i < len; i++) {{\n               {fill_vout}\n             }}\n\n             /* Store data into vout */\n             for (i = 0; i < len; i++) {{\n               {test}\n\n               for (j = 0; j < i; j++) {{\n                 if ({comp1}) {{\n                   fprintf(stdout, STATUS \"... 
FAIL\\\\n\");\n                   fflush(stdout);\n                   return -1;\n                 }}\n               }}\n               for (; j < len; j++) {{\n                 if ({comp2}) {{\n                   fprintf(stdout, STATUS \"... FAIL\\\\n\");\n                   fflush(stdout);\n                   return -1;\n                 }}\n               }}\n             }}\n\n             fprintf(stdout, \"test of {op_name} over {typ}... OK\\\\n\");\n             return EXIT_SUCCESS;\n           }}'''.format(includes=get_includes(lang), op_name=op.name,\n                        typ=typ, year=date.today().year, test=test,\n                        comp1=comp1, comp2=comp2, unalign=unalign,\n                        fill_vout=fill_vout, sizeof=common.sizeof(typ)))\n    common.clang_format(opts, filename)\n\n# -----------------------------------------------------------------------------\n# Tests that load/store of degrees 2, 3 and 4 ravels vectors correctly\n\ndef gen_load_store_ravel(opts, op, typ, lang):\n    # This test only the libs internal, not the API, so we only generate test\n    # for c\n    filename = get_filename(opts, op, typ, lang, 'ravel')\n    if filename == None:\n        return\n\n    deg = op.name[4]\n    align = op.name[5]\n\n    if typ=='f16':\n        convert_to='nsimd_f32_to_f16((f32)'\n    else:\n        convert_to='({typ})('.format(typ=typ)\n\n    check = '\\n'.join(['''\n      comp = vset1({convert_to}{i}+1), {typ});\n      err = err || vany(vne(v.v{i}, comp, {typ}), {typ});\n      '''.format(typ=typ, i=i, convert_to=convert_to) \\\n      for i in range (0, int(deg))])\n\n    with common.open_utf8(opts, filename) as out:\n        out.write(\n        '''{includes}\n\n           #define SIZE (2048 / {sizeof})\n\n           #define STATUS \"test raveling of {op_name} over {typ}\"\n\n           #define CHECK(a) {{ \\\\\n             errno = 0; \\\\\n             if (!(a)) {{ \\\\\n               fprintf(stderr, \"ERROR: \" #a \":%d: 
%s\\\\n\", \\\\\n                       __LINE__, strerror(errno)); \\\\\n               fflush(stderr); \\\\\n               exit(EXIT_FAILURE); \\\\\n             }} \\\\\n           }}\n\n           int main(void) {{\n             {typ}* vin;\n             {typ}* vout;\n             int i;\n             int len = vlen({typ});\n             int n = {deg} * len;\n             int err=0;\n             vec({typ}) comp;\n             vecx{deg}({typ}) v;\n\n             fprintf(stdout, \"test raveling of {op_name} over {typ}...\\\\n\");\n\n             CHECK(vin = ({typ}*)nsimd_aligned_alloc(n * {sizeof}));\n             CHECK(vout = ({typ}*)nsimd_aligned_alloc(n * {sizeof}));\n\n             /* Fill in the vectors */\n             for (i=0; i<n; ++i) {{\n                 vin[i] = {convert_to}(i%{deg}) + 1);\n             }}\n\n             /* Load data and check that each vector is correctly filled */\n             v = v{op_name}(vin, {typ});\n\n             {check}\n\n             if (err) {{\n               fprintf(stdout, STATUS \"... FAIL\\\\n\");\n               fflush(stdout);\n               return -1;\n             }}\n\n             fprintf(stdout, \"Raveling of {op_name} over {typ}... OK\\\\n\");\n             return EXIT_SUCCESS;\n           }}'''.format(includes=get_includes(lang), op_name=op.name,\n                        typ=typ, year=date.today().year, deg=deg,\n                        convert_to=convert_to,\n                        sizeof=common.sizeof(typ), check=check))\n    common.clang_format(opts, filename)\n\n# -----------------------------------------------------------------------------\n# Tests for iota\n\ndef gen_iota(opts, op, typ, lang):\n    filename = get_filename(opts, op, typ, lang)\n    if filename == None:\n        return\n    if lang == 'c_base':\n        do_iota = 'vstoreu(buf, viota({typ}), {typ});'.format(typ=typ)\n    elif lang == 'c_adv':\n        do_iota = 'nsimd_storeu(buf, nsimd_iota(nsimd_pack_{typ}));'. 
\\\n                  format(typ=typ)\n    elif lang == 'cxx_base':\n        do_iota = 'nsimd::storeu(buf, nsimd::iota({typ}()), {typ}());'. \\\n                  format(typ=typ)\n    else:\n        do_iota = 'nsimd::storeu(buf, nsimd::iota<nsimd::pack<{typ}> >());'. \\\n                  format(typ=typ)\n\n    if typ == 'f16':\n        comp_i = 'nsimd_f16_to_f32(buf[i]) != (f32)i'\n    else:\n        comp_i = 'buf[i] != ({typ})i'.format(typ=typ)\n\n    with common.open_utf8(opts, filename) as out:\n        out.write(\n            '''{includes}\n\n           int main(void) {{\n             int i;\n             {typ} buf[NSIMD_MAX_LEN({typ})];\n             int len = vlen({typ});\n\n             fprintf(stdout, \"test of {op_name} over {typ}...\\\\n\");\n\n             {do_iota}\n\n             for (i = 0; i < len; i++) {{\n               if ({comp_i}) {{\n                 exit(EXIT_FAILURE);\n               }}\n             }}\n\n             fprintf(stdout, \"test of {op_name} over {typ}... OK\\\\n\");\n             return EXIT_SUCCESS;\n           }}'''.format(includes=get_includes(lang), op_name=op.name,\n                        typ=typ, do_iota=do_iota, year=date.today().year,\n                        comp_i=comp_i))\n    common.clang_format(opts, filename)\n\n# -----------------------------------------------------------------------------\n# Tests for nbtrue\n\ndef gen_nbtrue(opts, op, typ, lang):\n    filename = get_filename(opts, op, typ, lang)\n    if filename == None:\n        return\n    if lang == 'c_base':\n        nbtrue = 'vnbtrue(vloadla(buf, {}), {})'.format(typ, typ)\n    elif lang == 'c_adv':\n        nbtrue = 'nsimd_nbtrue(nsimd_loadla(nsimd_packl_{}, buf))'.format(typ)\n    elif lang == 'cxx_base':\n        nbtrue = 'nsimd::nbtrue(nsimd::loadla(buf, {}()), {}())'. \\\n                 format(typ, typ)\n    else:\n        nbtrue = 'nsimd::nbtrue(nsimd::loadla<nsimd::packl<{}> >(buf))'. 
\\\n                 format(typ)\n    if typ == 'f16':\n        scalar0 = 'nsimd_f32_to_f16(0)'\n        scalar1 = 'nsimd_f32_to_f16(1)'\n    else:\n        scalar0 = '({})0'.format(typ)\n        scalar1 = '({})1'.format(typ)\n    with common.open_utf8(opts, filename) as out:\n        out.write(\n            '''{includes}\n\n           #define CHECK(a) {{ \\\\\n             errno = 0; \\\\\n             if (!(a)) {{ \\\\\n               fprintf(stderr, \"ERROR: \" #a \":%d: %s\\\\n\", \\\\\n                       __LINE__, strerror(errno)); \\\\\n               fflush(stderr); \\\\\n               exit(EXIT_FAILURE); \\\\\n             }} \\\\\n           }}\n\n           int main(void) {{\n             int i;\n             {typ} *buf;\n             int len = vlen({typ});\n\n             fprintf(stdout, \"test of {op_name} over {typ}...\\\\n\");\n             CHECK(buf = ({typ}*)nsimd_aligned_alloc(len * {sizeof}));\n\n             /* Test with all elements to true */\n             for (i = 0; i < len; i++) {{\n               buf[i] = {scalar1};\n             }}\n             if ({nbtrue} != len) {{\n               exit(EXIT_FAILURE);\n             }}\n\n             /* Test with all elements to false */\n             for (i = 0; i < len; i++) {{\n               buf[i] = {scalar0};\n             }}\n             if ({nbtrue} != 0) {{\n               exit(EXIT_FAILURE);\n             }}\n\n             /* Test with only one element to true */\n             buf[0] = {scalar1};\n             if ({nbtrue} != 1) {{\n               exit(EXIT_FAILURE);\n             }}\n\n             fprintf(stdout, \"test of {op_name} over {typ}... OK\\\\n\");\n             return EXIT_SUCCESS;\n           }}'''.format(includes=get_includes(lang), op_name=op.name,\n                        typ=typ, nbtrue=nbtrue, year=date.today().year,\n                        notl='!' 
if op.name == 'any' else '', scalar0=scalar0,\n                        scalar1=scalar1, sizeof=common.sizeof(typ)))\n    common.clang_format(opts, filename)\n\n# -----------------------------------------------------------------------------\n# Tests for reinterprets and converts\n\n\ndef gen_reinterpret_convert(opts, op, from_typ, to_typ, lang):\n    filename = get_filename(opts, op, '{}_to_{}'.format(from_typ, to_typ),\n                            lang)\n    if filename == None:\n        return\n    logical = 'l' if op.name == 'reinterpretl' or op.name == 'to_mask' else ''\n    if lang == 'c_base':\n        if op.name == 'upcvt':\n            comp = '''{{\n                        vecx2({to_typ}) tmp =\n                          vupcvt(vload{logical}a(in, {from_typ}),\n                                                 {from_typ}, {to_typ});\n                        vstore{logical}a(out, vdowncvt(\n                            tmp.v0, tmp.v1, {to_typ}, {from_typ}),\n                            {from_typ});\n                      }}'''.format(op_name=op.name, from_typ=from_typ,\n                                   to_typ=to_typ, logical=logical)\n        elif op.name == 'to_mask':\n            comp = '''vstorela(out, vto_logical(vto_mask(vloadla(in, {typ}),\n                               {typ}), {typ}), {typ});'''.format(typ=from_typ)\n        else:\n            comp = '''vstore{logical}a(out, v{op_name}(v{op_name}(\n                        vload{logical}a(in, {from_typ}), {from_typ}, {to_typ}),\n                          {to_typ}, {from_typ}), {from_typ});'''. 
\\\n                          format(op_name=op.name, from_typ=from_typ,\n                                 to_typ=to_typ, logical=logical)\n    elif lang == 'c_adv':\n        if op.name == 'upcvt':\n            comp = '''{{\n                        nsimd_packx2_{to_typ} tmp =\n                            nsimd_upcvt(nsimd_packx2_{to_typ},\n                                nsimd_loada(nsimd_pack_{from_typ}, in));\n                        nsimd_storea(out, nsimd_downcvt(\n                            nsimd_pack_{from_typ}, tmp.v0, tmp.v1));\n                      }}'''.format(op_name=op.name, from_typ=from_typ,\n                                   to_typ=to_typ, logical=logical)\n        elif op.name == 'to_mask':\n            comp = '''nsimd_storela(out, nsimd_to_logical(nsimd_to_mask(\n                          nsimd_loadla(nsimd_packl_{typ}, in))));'''. \\\n                          format(typ=from_typ)\n        else:\n            comp = \\\n            '''nsimd_store{logical}a(out, nsimd_{op_name}(\n                 nsimd_pack{logical}_{from_typ},\n                   nsimd_{op_name}(nsimd_pack{logical}_{to_typ},\n                     nsimd_load{logical}a(nsimd_pack{logical}_{from_typ},\n                       in))));'''. \\\n                     format(op_name=op.name, from_typ=from_typ,\n                            to_typ=to_typ, logical=logical)\n    elif lang == 'cxx_base':\n        if op.name == 'upcvt':\n            comp = '''vecx2({to_typ}) tmp =\n                        nsimd::upcvt(nsimd::load{logical}a(\n                            in, {from_typ}()), {from_typ}(), {to_typ}());\n                        nsimd::store{logical}a(out, nsimd::downcvt(\n                            tmp.v0, tmp.v1, {to_typ}(), {from_typ}()),\n                            {from_typ}());'''. 
\\\n                            format(op_name=op.name, from_typ=from_typ,\n                            to_typ=to_typ, logical=logical)\n        elif op.name == 'to_mask':\n            comp = '''nsimd::storela(out, nsimd::to_logical(nsimd::to_mask(\n                        nsimd::loadla(in, {typ}()), {typ}()), {typ}()),\n                          {typ}());'''.format(typ=from_typ)\n        else:\n            comp = '''nsimd::store{logical}a(out, nsimd::{op_name}(\n                        nsimd::{op_name}(nsimd::load{logical}a(\n                          in, {from_typ}()), {from_typ}(), {to_typ}()),\n                            {to_typ}(), {from_typ}()), {from_typ}());'''. \\\n                            format(op_name=op.name, from_typ=from_typ,\n                                   to_typ=to_typ, logical=logical)\n    else:\n        if op.name == 'upcvt':\n            comp = \\\n                '''nsimd::packx2<{to_typ}> tmp = nsimd::upcvt<\n                 nsimd::pack{logical}x2<{to_typ}> >(nsimd::load{logical}a<\n                   nsimd::pack{logical}<{from_typ}> >(in));\n               nsimd::store{logical}a(out, nsimd::downcvt<\n                 nsimd::pack{logical}<{from_typ}> >(tmp.v0, tmp.v1));'''. \\\n                 format(op_name=op.name, from_typ=from_typ,\n                        to_typ=to_typ, logical=logical)\n        elif op.name == 'to_mask':\n            comp = '''nsimd::storela(out, nsimd::to_logical(nsimd::to_mask(\n                        nsimd::loadla<nsimd::packl<{}> >(in))));'''. \\\n                        format(from_typ)\n        else:\n            comp = \\\n                '''nsimd::store{logical}a(out, nsimd::{op_name}<\n                 nsimd::pack{logical}<{from_typ}> >(nsimd::{op_name}<\n                   nsimd::pack{logical}<{to_typ}> >(nsimd::load{logical}a<\n                     nsimd::pack{logical}<{from_typ}> >(in))));'''. 
\\\n                format(op_name=op.name, from_typ=from_typ,\n                       to_typ=to_typ, logical=logical)\n    if logical == 'l':\n        rand = '(rand() % 2)'\n    else:\n        if op.name == 'reinterpret' and to_typ == 'f16' and \\\n           from_typ in ['i16', 'u16']:\n            rand = '(15360 /* no denormal */ | (1 << (rand() % 4)))'\n        else:\n            if to_typ in common.utypes or from_typ in common.utypes:\n                rand = '(1 << (rand() % 4))'\n            else:\n                rand = '((2 * (rand() % 2) - 1) * (1 << (rand() % 4)))'\n    if from_typ == 'f16':\n        rand = 'nsimd_f32_to_f16((f32){});'.format(rand)\n        neq_test = '(*(u16*)&in[j]) != (*(u16*)&out[j])'\n    else:\n        rand = '({}){}'.format(from_typ, rand)\n        neq_test = 'in[j] != out[j]'\n    with common.open_utf8(opts, filename) as out:\n        out.write(\n        '''{includes}\n\n           {msvc_c4334_warning}\n\n           #define CHECK(a) {{ \\\\\n             errno = 0; \\\\\n             if (!(a)) {{ \\\\\n               fprintf(stderr, \"ERROR: \" #a \":%d: %s\\\\n\", \\\\\n                       __LINE__, strerror(errno)); \\\\\n               fflush(stderr); \\\\\n               exit(EXIT_FAILURE); \\\\\n             }} \\\\\n           }}\n\n           int main(void) {{\n             int i, j;\n             {from_typ} *in, *out;\n             int len = vlen({from_typ});\n\n             fprintf(stdout,\n                     \"test of {op_name} from {from_typ} to {to_typ}...\\\\n\");\n             CHECK(in = ({from_typ}*)nsimd_aligned_alloc(len * {sizeof}));\n             CHECK(out = ({from_typ}*)nsimd_aligned_alloc(len * {sizeof}));\n\n             for (i = 0; i < 100; i++) {{\n               for (j = 0; j < len; j++) {{\n                 in[j] = {rand};\n               }}\n\n               {comp}\n\n               for (j = 0; j < len; j++) {{\n                 if ({neq_test}) {{\n                   exit(EXIT_FAILURE);\n            
     }}\n               }}\n             }}\n\n             fprintf(stdout,\n                     \"test of {op_name} from {from_typ} to {to_typ}... OK\\\\n\");\n             return EXIT_SUCCESS;\n           }}'''.format(includes=get_includes(lang), op_name=op.name,\n                        to_typ=to_typ, from_typ=from_typ, comp=comp,\n                        year=date.today().year, rand=rand, neq_test=neq_test,\n                        sizeof=common.sizeof(from_typ),\n                        msvc_c4334_warning=msvc_c4334_warning \\\n                        if from_typ in ['i64', 'u64'] else ''))\n    common.clang_format(opts, filename)\n\n# -----------------------------------------------------------------------------\n# Shuffle\n\n\ndef gen_reverse(opts, op, typ, lang):\n    filename = get_filename(opts, op, typ, lang)\n    if filename == None:\n        return\n    if lang == 'c_base':\n        test_code = \\\n        'vstorea(out, vreverse(vloada(in, {typ}), {typ}), {typ});'. \\\n        format(typ=typ)\n    elif lang == 'c_adv':\n        test_code = '''nsimd_storea(out, nsimd_reverse(nsimd_loada(\n                         nsimd_pack_{typ}, in)));'''.format(typ=typ)\n    elif lang == 'cxx_base':\n        test_code = \\\n        'nsimd::storea(out, nsimd::reverse(nsimd::loada(in, {typ}()), ' \\\n        '{typ}()), {typ}());'.format(typ=typ)\n    elif lang == 'cxx_adv':\n        test_code = \\\n        'nsimd::storea(out, nsimd::reverse(' \\\n        'nsimd::loada<nsimd::pack<{typ}> >(in)));'.format(typ=typ)\n    if typ == 'f16':\n        init = 'in[ i ] = nsimd_f32_to_f16((float)(i + 1));'\n        comp = 'ok &= nsimd_f16_to_f32(out[len - 1 - i]) == ' \\\n               'nsimd_f16_to_f32(in[i]);'\n    else:\n        init = 'in[ i ] = ({typ})(i + 1);'.format(typ=typ)\n        comp = 'ok &= out[len - 1 - i] == in[i];'\n\n    with common.open_utf8(opts, filename) as out:\n        out.write(\n            '''{includes}\n\n           #define CHECK(a) {{ \\\\\n           
  errno = 0; \\\\\n             if (!(a)) {{ \\\\\n               fprintf(stderr, \"ERROR: \" #a \":%d: %s\\\\n\", \\\\\n                       __LINE__, strerror(errno)); \\\\\n               fflush(stderr); \\\\\n               exit(EXIT_FAILURE); \\\\\n             }} \\\\\n           }}\n\n           int main(void) {{\n             unsigned char i;\n             int ok;\n             {typ} * in;\n             {typ} * out;\n\n             int len = vlen({typ});\n\n             fprintf(stdout, \"test of {op_name} over {typ}...\\\\n\");\n             CHECK(in = ({typ}*)nsimd_aligned_alloc(len * {sizeof}));\n             CHECK(out = ({typ}*)nsimd_aligned_alloc(len * {sizeof}));\n\n             for( i = 0 ; i < len ; ++i )\n             {{\n                 {init}\n             }}\n\n             {test_code}\n\n             ok = 1;\n\n             for( i = 0 ; i < len ; ++i )\n             {{\n               {comp}\n             }}\n\n             if( ok )\n             {{\n               fprintf(stdout, \"test of {op_name} over {typ}... OK\\\\n\");\n             }}\n             else\n             {{\n               fprintf(stderr, \"test of {op_name} over {typ}... 
FAIL\\\\n\");\n               exit(EXIT_FAILURE);\n             }}\n\n             nsimd_aligned_free( in );\n             nsimd_aligned_free( out );\n\n             return EXIT_SUCCESS;\n           }}'''.format(includes=get_includes(lang), op_name=op.name,\n                        typ=typ, test_code=test_code, year=date.today().year,\n                        sizeof=common.sizeof(typ), init=init, comp=comp))\n\n    common.clang_format(opts, filename)\n\n# -----------------------------------------------------------------------------\n# Unpack half\n\ndef gen_unpack_half(opts, op, typ, lang):\n    filename = get_filename(opts, op, typ, lang)\n    if filename == None:\n        return\n    if typ == 'f16':\n        left = '(double)nsimd_f16_to_f32(ref_out)'\n        right = '(double)nsimd_f16_to_f32(nsimd_out)'\n    elif typ == 'f32':\n        left = '(double)ref_out'\n        right = '(double)nsimd_out'\n    else:\n        left = 'ref_out'\n        right = 'nsimd_out'\n\n    if lang == 'c_base':\n        typ_nsimd = 'vec({typ})'.format(typ=typ)\n        vout1_comp = '''vec({typ}) va1, va2, vc;\n                        va1 = vloadu(&vin1[i], {typ});\n                        va2 = vloadu(&vin2[i], {typ});\n                        vc = v{op_name}(va1, va2, {typ});\n                        vstoreu(&vout[i], vc, {typ});'''. \\\n                        format(typ=typ, op_name=op.name)\n    if lang == 'c_adv':\n        typ_nsimd = 'nsimd_pack_{typ}'.format(typ=typ)\n        vout1_comp = '''nsimd_pack_{typ} va1, va2, vc;\n                        va1 = nsimd_loadu(nsimd_pack_{typ}, &vin1[i]);\n                        va2 = nsimd_loadu(nsimd_pack_{typ}, &vin2[i]);\n                        vc = nsimd_{op_name}(va1, va2);\n                        nsimd_storeu(&vout[i], vc);'''. 
\\\n                        format(typ=typ, op_name=op.name)\n    if lang == 'cxx_base':\n        typ_nsimd = 'vec({typ})'.format(typ=typ)\n        vout1_comp = '''vec({typ}) va1, va2, vc;\n                        va1 = nsimd::loadu(&vin1[i], {typ}());\n                        va2 = nsimd::loadu(&vin2[i], {typ}());\n                        vc = nsimd::{op_name}(va1, va2, {typ}());\n                        nsimd::storeu(&vout[i], vc, {typ}());'''. \\\n                        format(typ=typ, op_name=op.name)\n    if lang == 'cxx_adv':\n        typ_nsimd = 'nsimd::pack<{typ}>'.format(typ=typ)\n        vout1_comp = '''nsimd::pack<{typ}> va1, va2, vc;\n                        va1 = nsimd::loadu<nsimd::pack<{typ}> >(&vin1[i]);\n                        va2 = nsimd::loadu<nsimd::pack<{typ}> >(&vin2[i]);\n                        vc = nsimd::{op_name}(va1, va2);\n                        nsimd::storeu(&vout[i], vc);'''. \\\n                        format(typ=typ, op_name=op.name)\n\n    op_test =  'step/(2 * nb_lane)'\n    if op.name in['ziphi', 'ziplo']:\n        offset = 'int offset = {val};'.format(val= '0' \\\n                 if op.name == 'ziplo' else 'vlen({typ}) / 2'.format(typ=typ))\n    else:\n        offset = ''\n\n    if op.name in ['unziplo', 'unziphi']:\n        if typ == 'f16':\n            comp_unpack = '''\n            (nsimd_f16_to_f32(vout[i]) != nsimd_f16_to_f32(vin1[vi + 2 * j + {i}]))\n            || (nsimd_f16_to_f32(vout[i + step / 2]) != nsimd_f16_to_f32(vin2[vi + 2 * j + {i}]))\n            '''.format(i = '0' if op.name == 'unziplo' else '1')\n        else:\n            comp_unpack =  '''\\\n            (vout[i] != vin1[vi + 2 * j + {i}])\n            || (vout[i + step / 2] != vin2[vi + 2 * j + {i}])\n            '''.format(i = '0' if op.name == 'unziplo' else '1')\n    else:\n        if typ == 'f16':\n            comp_unpack ='''(nsimd_f16_to_f32(vout[i]) != nsimd_f16_to_f32(vin1[j])) ||\n                (nsimd_f16_to_f32(vout[i + 1]) != 
nsimd_f16_to_f32(vin2[j]))'''\n        else:\n            comp_unpack ='''(vout[i] != vin1[j]) ||\n            (vout[i + 1] != vin2[j])'''\n\n    nbits = {'f16': '10', 'f32': '21', 'f64': '48'}\n    head = '''{posix_c_source}\n\n              {includes}\n              #include <float.h>\n              #include <math.h>\n\n              {msvc_c4334_warning}\n\n              #define SIZE (2048 / {sizeof})\n\n              #define CHECK(a) {{ \\\\\n                errno = 0; \\\\\n                if (!(a)) {{ \\\\\n                fprintf(stderr, \"ERROR: \" #a \":%d: %s\\\\n\", \\\\\n                        __LINE__, strerror(errno)); \\\\\n                fflush(stderr); \\\\\n                exit(EXIT_FAILURE); \\\\\n                }} \\\\\n              }}\n\n              /* {simd} */\n\n              ''' .format(year=date.today().year, typ=typ,\n                          posix_c_source=posix_c_source,\n                          includes=get_includes(lang),\n                          comp_unpack=comp_unpack,\n                          sizeof=common.sizeof(typ), simd=opts.simd,\n                          msvc_c4334_warning=msvc_c4334_warning \\\n                          if typ in ['i64', 'u64'] else '')\n    if typ == 'f16':\n        rand = '''nsimd_f32_to_f16((f32)(2 * (rand() % 2) - 1) *\n        (f32)(1 << (rand() % 4)) /\n        (f32)(1 << (rand() % 4)))'''\n    else:\n        rand = '''({typ})(({typ})(2 * (rand() % 2) - 1) * ({typ})(1 << (rand() % 4))\n        / ({typ})(1 << (rand() % 4)))'''.format(typ=typ)\n\n    with common.open_utf8(opts, filename) as out:\n        out.write(\n        '''{head}\n\n           int main(void) {{\n              int vi, i, j, step;\n              {typ} *vin1, *vin2;\n              {typ} *vout;\n\n              CHECK(vin1 = ({typ} *)nsimd_aligned_alloc(SIZE * {sizeof}));\n              CHECK(vin2 = ({typ} *)nsimd_aligned_alloc(SIZE * {sizeof}));\n              CHECK(vout = ({typ} *)nsimd_aligned_alloc(SIZE * {sizeof}));\n\n  
            step = vlen({typ});\n\n              fprintf(stdout, \"test of {op_name} over {typ}...\\\\n\");\n\n              /* Fill input vector(s) with random */\n              for (i = 0; i < SIZE; i++)\n              {{\n                vin1[i] = {rand};\n                vin2[i] = {rand};\n              }}\n\n              /* Fill output vector with computed values */\n              for (i = 0; i < SIZE; i += step)\n              {{\n                {vout1_comp}\n              }}\n\n              /* Compare results */\n              if (step != 1) {{\n                {offset}\n                for (vi = 0; vi < SIZE; vi += step){{\n                 j = {init_j};\n                 for (i = vi; i < {cond}; {inc}) {{\n                   if({comp_unpack}) {{\n                     fprintf(stderr, \"test of {op_name} over {typ}... FAIL\\\\n\");\n                     exit(EXIT_FAILURE);\n                   }}\n                   j++;\n                  }}\n                }}\n              }}\n\n              fprintf(stdout, \"test of {op_name} over {typ}... 
OK\\\\n\");\n              fflush(stdout);\n              return EXIT_SUCCESS;\n            }}\n        '''.format(includes=get_includes(lang), op_name=op.name,\n            typ=typ, year=date.today().year,sizeof=common.sizeof(typ),\n            rand=rand, head=head, comp_unpack=comp_unpack,\n            vout1_comp= vout1_comp, op_test=op_test, typ_nsimd=typ_nsimd,\n            offset=offset,\n            cond='vi + step' if op.name in['ziplo', 'ziphi'] else 'vi + step / 2',\n            init_j='vi + offset' if op.name in['ziplo', 'ziphi'] else '0',\n            inc='i += 2' if op.name in['ziphi', 'ziplo'] else 'i++',\n            pos='0' if op.name in ['ziplo', 'unziplo', 'unziphi'] else op_test))\n\n    common.clang_format(opts, filename)\n\n# ------------------------------------------------------------------------------\n# Unpack\n\ndef gen_unpack(opts, op, typ, lang):\n    filename = get_filename(opts, op, typ, lang)\n    if filename == None:\n        return\n    if typ == 'f16':\n        left = '(double)nsimd_f16_to_f32(ref_out)'\n        right = '(double)nsimd_f16_to_f32(nsimd_out)'\n    elif typ == 'f32':\n        left = '(double)ref_out'\n        right = '(double)nsimd_out'\n    else:\n        left = 'ref_out'\n        right = 'nsimd_out'\n\n    if lang == 'c_base':\n        typ_nsimd = 'vec({typ})'.format(typ=typ)\n        vout1_comp = \\\n        '''vec({typ}) va1, va2;\n           vecx2({typ}) vc;\n           va1 = vloadu(&vin1[i], {typ});\n           va2 = vloadu(&vin2[i], {typ});\n           vc = v{op_name}(va1, va2, {typ});\n           vstoreu(&vout[2 * i], vc.v0, {typ});\n           vstoreu(&vout[2 * i + vlen({typ})], vc.v1, {typ});'''. 
\\\n           format(typ=typ, op_name=op.name)\n    if lang == 'c_adv':\n        typ_nsimd = 'nsimd_pack_{typ}'.format(typ=typ)\n        vout1_comp = \\\n        '''nsimd_pack_{typ} va1, va2;\n           nsimd_packx2_{typ} vc;\n           va1 = nsimd_loadu(nsimd_pack_{typ}, &vin1[i]);\n           va2 = nsimd_loadu(nsimd_pack_{typ}, &vin2[i]);\n           vc = nsimd_{op_name}(va1, va2);\n           nsimd_storeu(&vout[2 * i], vc.v0);\n           nsimd_storeu(&vout[2 * i + nsimd_len(nsimd_pack_{typ})],\n                        vc.v1);'''.format(typ=typ, op_name=op.name)\n    if lang == 'cxx_base':\n        typ_nsimd = 'vec({typ})'.format(typ=typ)\n        vout1_comp = \\\n        '''vec({typ}) va1, va2;\n           vecx2({typ}) vc;\n           va1 = nsimd::loadu(&vin1[i], {typ}());\n           va2 = nsimd::loadu(&vin2[i], {typ}());\n           vc = nsimd::{op_name}(va1, va2, {typ}());\n           nsimd::storeu(&vout[2 * i], vc.v0, {typ}());\n           nsimd::storeu(&vout[2 * i + vlen({typ})], vc.v1, {typ}());'''. \\\n           format(typ=typ, op_name=op.name)\n    if lang == 'cxx_adv':\n        typ_nsimd = 'nsimd::pack<{typ}>'.format(typ=typ)\n        vout1_comp = \\\n        '''nsimd::pack<{typ}> va1, va2;\n           nsimd::packx2<{typ}> vc;\n           va1 = nsimd::loadu<nsimd::pack<{typ}> >(&vin1[i]);\n           va2 = nsimd::loadu<nsimd::pack<{typ}> >(&vin2[i]);\n           vc = nsimd::{op_name}(va1, va2);\n           nsimd::storeu(&vout[2 * i], vc.v0);\n           nsimd::storeu(&vout[2 * i + nsimd::len({typ}())], vc.v1);'''. 
\\\n           format(typ=typ, op_name=op.name)\n\n    head = '''{posix_c_source}\n\n              {includes}\n              #include <float.h>\n              #include <math.h>\n\n              {msvc_c4334_warning}\n\n              #define SIZE (2048 / {sizeof})\n\n              #define CHECK(a) {{ \\\\\n                errno = 0; \\\\\n                if (!(a)) {{ \\\\\n                  fprintf(stderr, \"ERROR: \" #a \":%d: %s\\\\n\", \\\\\n                          __LINE__, strerror(errno)); \\\\\n                  fflush(stderr); \\\\\n                  exit(EXIT_FAILURE); \\\\\n                }} \\\\\n              }}\n\n              /* {simd} */\n              ''' .format(year=date.today().year, typ=typ,\n                          posix_c_source=posix_c_source,\n                          includes=get_includes(lang),\n                          sizeof=common.sizeof(typ), simd= opts.simd,\n                          msvc_c4334_warning=msvc_c4334_warning \\\n                          if typ in ['i64', 'u64'] else '')\n\n    if typ == 'f16':\n        rand = 'nsimd_f32_to_f16((f32)(2 * (rand() % 2) - 1) * ' \\\n               '(f32)(1 << (rand() % 4)) / (f32)(1 << (rand() % 4)))'\n    else:\n        rand = '({typ})(({typ})(2 * (rand() % 2) - 1) * ' \\\n               '({typ})(1 << (rand() % 4)) / ({typ})(1 << (rand() % 4)))'. 
\\\n               format(typ=typ)\n\n    if op.name == 'zip':\n        scalar_code = '''for(i = 0; i < step; i ++) {{\n                           out_ptr[2 * i] = vin1_ptr[i];\n                           out_ptr[2 * i + 1] = vin2_ptr[i];\n                         }}\n                         '''\n    else:\n        scalar_code = \\\n        '''for(i = 0; i < step / 2; i++) {{\n             out_ptr[i] = vin1_ptr[2 * i];\n             out_ptr[step / 2 + i] = vin2_ptr[2 * i];\n             out_ptr[step + i] = vin1_ptr[2 * i + 1];\n             out_ptr[step + step / 2 + i] = vin2_ptr[2 * i + 1];\n           }}\n           '''\n\n    if typ == 'f16':\n        comp = 'nsimd_f16_to_f32(vout[vi]) !=  nsimd_f16_to_f32(vout_ref[vi])'\n    else:\n        comp = 'vout[vi] != vout_ref[vi]'\n\n    with common.open_utf8(opts, filename) as out:\n        out.write(\n        '''{head}\n\n        int main(void){{\n          int i, vi, step;\n          {typ} *vin1, *vin2;\n          {typ} *vout;\n          {typ} *vout_ref;\n\n          CHECK(vin1 = ({typ} *)nsimd_aligned_alloc(SIZE * {sizeof}));\n          CHECK(vin2 = ({typ} *)nsimd_aligned_alloc(SIZE * {sizeof}));\n          CHECK(vout = ({typ} *)nsimd_aligned_alloc(2 * SIZE * {sizeof}));\n          CHECK(vout_ref = ({typ} *)nsimd_aligned_alloc(2 * SIZE * {sizeof}));\n\n          step = vlen({typ});\n\n          fprintf(stdout, \"test of {op_name} over {typ}...\\\\n\");\n\n          /* Fill input vector(s) with random */\n          for (i = 0; i < SIZE; i++)\n          {{\n            vin1[i] = {rand};\n            vin2[i] = {rand};\n          }}\n\n          /* Compute a scalar reference version */\n          for(vi = 0; vi < SIZE; vi += step)\n          {{\n            {typ} *out_ptr = vout_ref + 2 * vi;\n            {typ} *vin1_ptr = vin1 + vi;\n            {typ} *vin2_ptr = vin2 + vi;\n\n            {scalar_code}\n          }}\n\n          /* Fill output vector with computed values */\n          for (i = 0; i < SIZE; i += 
step)\n          {{\n            {vout1_comp}\n          }}\n\n          /* Compare results */\n          for(vi = 0; vi < SIZE; vi++) {{\n            if({comp}) {{\n              fprintf(stderr, \"test of {op_name} over {typ}... FAIL\\\\n\");\n              exit(EXIT_FAILURE);\n            }}\n          }}\n\n          fprintf(stdout, \"test of {op_name} over {typ}... OK\\\\n\");\n          fflush(stdout);\n          return EXIT_SUCCESS;\n        }}\n        '''.format(includes=get_includes(lang), op_name=op.name,\n                   typ=typ, year=date.today().year,sizeof=common.sizeof(typ),\n                   rand=rand, head=head, scalar_code=scalar_code, comp=comp,\n                   vout1_comp= vout1_comp, typ_nsimd=typ_nsimd))\n    common.clang_format(opts, filename)\n\n# -----------------------------------------------------------------------------\n# Entry point\n\ndef doit(opts):\n    common.myprint(opts, 'Generating tests')\n    for op_name, operator in operators.operators.items():\n        # Skip non-matching tests\n        if opts.match and not opts.match.match(op_name):\n            continue\n        for typ in operator.types:\n            if not should_i_do_the_test(operator, '', typ):\n                continue\n            elif operator.name == 'nbtrue':\n                gen_nbtrue(opts, operator, typ, 'c_base')\n                gen_nbtrue(opts, operator, typ, 'c_adv')\n                gen_nbtrue(opts, operator, typ, 'cxx_base')\n                gen_nbtrue(opts, operator, typ, 'cxx_adv')\n            elif operator.name == 'addv':\n                if typ in common.ftypes:\n                    gen_addv(opts, operator, typ, 'c_base')\n                    gen_addv(opts, operator, typ, 'c_adv')\n                    gen_addv(opts, operator, typ, 'cxx_base')\n                    gen_addv(opts, operator, typ, 'cxx_adv')\n            elif operator.name == 'adds':\n                gen_adds(opts, operator, typ, 'c_base')\n                gen_adds(opts, 
operator, typ, 'c_adv')\n                gen_adds(opts, operator, typ, 'cxx_base')\n                gen_adds(opts, operator, typ, 'cxx_adv')\n            elif operator.name == 'subs':\n                gen_subs(opts, operator, typ, 'c_base')\n                gen_subs(opts, operator, typ, 'c_adv')\n                gen_subs(opts, operator, typ, 'cxx_base')\n                gen_subs(opts, operator, typ, 'cxx_adv')\n            elif operator.name in ['all', 'any']:\n                gen_all_any(opts, operator, typ, 'c_base')\n                gen_all_any(opts, operator, typ, 'c_adv')\n                gen_all_any(opts, operator, typ, 'cxx_base')\n                gen_all_any(opts, operator, typ, 'cxx_adv')\n            elif operator.name == 'iota':\n                gen_iota(opts, operator, typ, 'c_base')\n                gen_iota(opts, operator, typ, 'c_adv')\n                gen_iota(opts, operator, typ, 'cxx_base')\n                gen_iota(opts, operator, typ, 'cxx_adv')\n            elif operator.name in ['reinterpret', 'reinterpretl', 'cvt',\n                                   'upcvt', 'to_mask']:\n                for to_typ in common.get_output_types(typ, operator.output_to):\n                    if not should_i_do_the_test(operator, to_typ, typ):\n                        continue\n                    gen_reinterpret_convert(opts, operator, typ, to_typ,\n                                            'c_base')\n                    gen_reinterpret_convert(opts, operator, typ, to_typ,\n                                            'c_adv')\n                    gen_reinterpret_convert(opts, operator, typ, to_typ,\n                                            'cxx_base')\n                    gen_reinterpret_convert(opts, operator, typ, to_typ,\n                                            'cxx_adv')\n            elif operator.name in ['load2a', 'load2u', 'load3a', 'load3u',\n                                   'load4a', 'load4u']:\n                gen_load_store(opts, operator, 
typ, 'c_base')\n                gen_load_store(opts, operator, typ, 'c_adv')\n                gen_load_store(opts, operator, typ, 'cxx_base')\n                gen_load_store(opts, operator, typ, 'cxx_adv')\n                gen_load_store_ravel(opts, operator, typ, 'c_base')\n            elif operator.name in ['gather', 'gather_linear']:\n                gen_gather_scatter(opts, operator, typ, 'c_base')\n                gen_gather_scatter(opts, operator, typ, 'c_adv')\n                gen_gather_scatter(opts, operator, typ, 'cxx_base')\n                gen_gather_scatter(opts, operator, typ, 'cxx_adv')\n            elif operator.name == 'mask_scatter':\n                gen_mask_scatter(opts, operator, typ, 'c_base')\n                gen_mask_scatter(opts, operator, typ, 'c_adv')\n                gen_mask_scatter(opts, operator, typ, 'cxx_base')\n                gen_mask_scatter(opts, operator, typ, 'cxx_adv')\n            elif operator.name in ['maskz_gather', 'masko_gather']:\n                gen_maskoz_gather(opts, operator, typ, 'c_base')\n                gen_maskoz_gather(opts, operator, typ, 'c_adv')\n                gen_maskoz_gather(opts, operator, typ, 'cxx_base')\n                gen_maskoz_gather(opts, operator, typ, 'cxx_adv')\n            elif operator.name in ['masko_loada1', 'masko_loadu1',\n                                   'maskz_loada1', 'maskz_loadu1']:\n                gen_mask_load(opts, operator, typ, 'c_base')\n                gen_mask_load(opts, operator, typ, 'c_adv')\n                gen_mask_load(opts, operator, typ, 'cxx_base')\n                gen_mask_load(opts, operator, typ, 'cxx_adv')\n            elif operator.name in ['mask_storea1', 'mask_storeu1']:\n                gen_mask_store(opts, operator, typ, 'c_base')\n                gen_mask_store(opts, operator, typ, 'c_adv')\n                gen_mask_store(opts, operator, typ, 'cxx_base')\n                gen_mask_store(opts, operator, typ, 'cxx_adv')\n            elif operator.name 
== 'reverse':\n                gen_reverse(opts, operator, typ, 'c_base');\n                gen_reverse(opts, operator, typ, 'c_adv');\n                gen_reverse(opts, operator, typ, 'cxx_base');\n                gen_reverse(opts, operator, typ, 'cxx_adv');\n            elif operator.name in ['ziplo', 'ziphi',\n                                   'unziplo', 'unziphi']:\n                gen_unpack_half(opts, operator, typ, 'c_base')\n                gen_unpack_half(opts, operator, typ, 'c_adv')\n                gen_unpack_half(opts, operator, typ, 'cxx_base')\n                gen_unpack_half(opts, operator, typ, 'cxx_adv')\n            elif operator.name in ['zip', 'unzip']:\n                gen_unpack(opts, operator, typ, 'c_base')\n                gen_unpack(opts, operator, typ, 'c_adv')\n                gen_unpack(opts, operator, typ, 'cxx_base')\n                gen_unpack(opts, operator, typ, 'cxx_adv')\n            else:\n                gen_test(opts, operator, typ, 'c_base')\n                gen_test(opts, operator, typ, 'c_adv')\n                gen_test(opts, operator, typ, 'cxx_base')\n                gen_test(opts, operator, typ, 'cxx_adv')\n"
  },
  {
    "path": "egg/get_sleef_code.py",
    "content": "# Copyright (c) 2021 Agenium Scale\n#\n# Permission is hereby granted, free of charge, to any person obtaining a copy\n# of this software and associated documentation files (the \"Software\"), to deal\n# in the Software without restriction, including without limitation the rights\n# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell\n# copies of the Software, and to permit persons to whom the Software is\n# furnished to do so, subject to the following conditions:\n#\n# The above copyright notice and this permission notice shall be included in all\n# copies or substantial portions of the Software.\n#\n# THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\n# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\n# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\n# SOFTWARE.\n\nimport common\nimport shutil\nimport requests\nimport zipfile\nimport os\n\n# -----------------------------------------------------------------------------\n\ndef doit(opts):\n    common.myprint(opts, 'Copy native Sleef version {}'. \\\n                         format(opts.sleef_version))\n\n    # First download Sleef\n    sleef_dir = os.path.join(opts.script_dir, '..', '_deps-sleef')\n    common.mkdir_p(sleef_dir)\n    url = 'https://github.com/shibatch/sleef/archive/refs/tags/{}.zip'. 
\\\n          format(opts.sleef_version)\n    r = requests.get(url, allow_redirects=True)\n    sleef_zip = os.path.join(sleef_dir, 'sleef.zip')\n    with open(sleef_zip, 'wb') as fout:\n        fout.write(r.content)\n\n    # Unzip sleef\n    with zipfile.ZipFile(sleef_zip, 'r') as fin:\n        fin.extractall(path=sleef_dir)\n\n    # Copy helper function\n    def copy(filename):\n        dst_filename = os.path.basename(filename)\n        shutil.copyfile(os.path.join(sleef_dir,\n                                     'sleef-{}'.format(opts.sleef_version),\n                                     filename), os.path.join(opts.src_dir,\n                                                             dst_filename))\n\n    # Copy files\n    copy('src/libm/sleefsimddp.c')\n    copy('src/libm/sleefsimdsp.c')\n    copy('src/libm/sleefdp.c')\n    copy('src/libm/sleefsp.c')\n    copy('src/common/misc.h')\n    copy('src/libm/estrin.h')\n    copy('src/libm/dd.h')\n    copy('src/libm/df.h')\n    copy('src/libm/rempitab.c')\n    copy('src/arch/helpersse2.h')\n    copy('src/arch/helperavx.h')\n    copy('src/arch/helperavx2.h')\n    copy('src/arch/helperavx512f.h')\n    copy('src/arch/helperneon32.h')\n    copy('src/arch/helperadvsimd.h')\n    copy('src/arch/helperpower_128.h')\n    copy('src/arch/helpersve.h')\n\n    # Sleef uses aliases but we don't need those so we comment them\n    def comment_DALIAS_lines(filename):\n        src = os.path.join(opts.src_dir, filename)\n        dst = os.path.join(opts.src_dir, 'tmp.c')\n        with open(src, 'r') as fin, open(dst, 'w') as fout:\n            for line in fin:\n                if line.startswith('DALIAS_'):\n                    fout.write('/* {} */\\n'.format(line.strip()))\n                else:\n                    fout.write(line)\n        shutil.copyfile(dst, src)\n        os.remove(dst)\n    comment_DALIAS_lines('sleefsimdsp.c')\n    comment_DALIAS_lines('sleefsimddp.c')\n\n    # Sleef provides runtime SIMD detection via cpuid but we 
don't need it\n    def replace_x86_cpuid(filename):\n        src = os.path.join(opts.src_dir, filename)\n        dst = os.path.join(opts.src_dir, 'tmp.c')\n        with open(src, 'r') as fin, open(dst, 'w') as fout:\n            for line in fin:\n                if line.startswith('void Sleef_x86CpuID'):\n                    fout.write(\n                    '''static inline\n                       void Sleef_x86CpuID(int32_t out[4], uint32_t eax,\n                                           uint32_t ecx) {\n                         /* We don't care for cpuid detection */\n                         out[0] = 0xFFFFFFFF;\n                         out[1] = 0xFFFFFFFF;\n                         out[2] = 0xFFFFFFFF;\n                         out[3] = 0xFFFFFFFF;\n                       }\n                       ''')\n                else:\n                    fout.write(line)\n        shutil.copyfile(dst, src)\n        os.remove(dst)\n    replace_x86_cpuid('helpersse2.h')\n    replace_x86_cpuid('helperavx.h')\n    replace_x86_cpuid('helperavx2.h')\n    replace_x86_cpuid('helperavx512f.h')\n\n    # Sleef uses force inline through its INLINE macro defined in misc.h\n    # We modify it to avoid warnings and because force inline has been a pain\n    # in the past. We also rename some exported symbols.\n    with open(os.path.join(opts.src_dir, 'misc.h'), 'a') as fout:\n        fout.write(\n        '''\n\n        /* NSIMD specific */\n        #ifndef NSIMD_SLEEF_MISC_H\n        #define NSIMD_SLEEF_MISC_H\n\n        #ifdef INLINE\n        #undef INLINE\n        #endif\n        #define INLINE inline\n\n        #define Sleef_rempitabdp nsimd_sleef_rempitab_f64\n        #define Sleef_rempitabsp nsimd_sleef_rempitab_f32\n\n        #endif\n\n        ''')\n\n    # Sleef functions must be renamed properly for each SIMD extensions.\n    # Moreover their name must contain their precision (in ULPs). 
This\n    # precision is not the same for all functions and some functions can have\n    # differents flavours (or precisions). The \"database\" is contained within\n    # src/libm/funcproto.h. So we parse it and produce names\n    # in headers \"rename[SIMD ext].h\" to avoid modifying Sleef C files.\n    funcproto = os.path.join(sleef_dir, 'sleef-{}'.format(opts.sleef_version),\n                             'src', 'libm', 'funcproto.h')\n    defines = []\n    ulp_suffix = {\n        '0' : '',\n        '1' : '_u1',\n        '2' : '_u05',\n        '3' : '_u35',\n        '4' : '_u15',\n        '5' : '_u3500'\n    }\n    with open(funcproto, 'r') as fin:\n        for line in fin:\n            if (line.find('{') != -1 and line.find('}') != -1):\n                items = [item.strip() \\\n                         for item in line.strip(' \\n\\r{},').split(',')]\n                items[0] = items[0].strip('\"')\n                if items[0] == 'NULL':\n                    break\n                sleef_name_f64 = items[0] + ulp_suffix[items[2]]\n                sleef_name_f32 = items[0] + 'f' + ulp_suffix[items[2]]\n                items[1] = items[1] if items[1] != '5' else '05'\n                if items[1] == '-1':\n                    nsimd_name_f64 = 'nsimd_sleef_{}_{{nsimd_ext}}_f64'. \\\n                                     format(items[0])\n                    nsimd_name_f32 = 'nsimd_sleef_{}_{{nsimd_ext}}_f32'. \\\n                                     format(items[0])\n                else:\n                    nsimd_name_f64 = \\\n                    'nsimd_sleef_{}_u{}{{det}}_{{nsimd_ext}}_f64'. \\\n                    format(items[0], items[1])\n                    nsimd_name_f32 = \\\n                    'nsimd_sleef_{}_u{}{{det}}_{{nsimd_ext}}_f32'. 
\\\n                    format(items[0], items[1])\n                defines.append('#define x{} {}'.format(sleef_name_f64,\n                                                       nsimd_name_f64))\n                defines.append('#define x{} {}'.format(sleef_name_f32,\n                                                       nsimd_name_f32))\n    defines = '\\n'.join(defines)\n\n    sleef_to_nsimd = {\n        '':        ['scalar'],\n        'sse2':    ['sse2'],\n        'sse4':    ['sse42'],\n        'avx':     ['avx'],\n        'avx2':    ['avx2'],\n        'avx512f': ['avx512_knl', 'avx512_skylake'],\n        'neon32':  ['neon128'],\n        'advsimd': ['aarch64'],\n        'sve':     ['sve128', 'sve256', 'sve512', 'sve1024', 'sve2048'],\n        'vsx':     ['vmx', 'vsx']\n    }\n\n    for simd_ext in ['', 'sse2', 'sse4', 'avx', 'avx2', 'avx512f', 'neon32',\n                     'advsimd', 'sve', 'vsx']:\n        renameheader = os.path.join(opts.src_dir,\n                                    'rename{}.h'.format(simd_ext))\n        se = simd_ext if simd_ext != '' else 'scalar'\n        with open(renameheader, 'w') as fout:\n            fout.write(\n            '''#ifndef RENAME{SIMD_EXT}_H\n               #define RENAME{SIMD_EXT}_H\n\n               '''.format(SIMD_EXT=se.upper()))\n            for nse in sleef_to_nsimd[simd_ext]:\n                ifdef = '' if simd_ext == '' \\\n                           else '#ifdef NSIMD_{}'.format(nse.upper())\n                endif = '' if simd_ext == '' else '#endif'\n                fout.write(\n                '''{hbar}\n                   /* Naming of functions {nsimd_ext} */\n\n                   {ifdef}\n\n                   #ifdef DETERMINISTIC\n\n                   {defines_det_f32}\n\n                   #else\n\n                   {defines_nondet_f32}\n\n                   #endif\n\n                   #define rempi nsimd_sleef_rempi_{nsimd_ext}\n                   #define rempif nsimd_sleef_rempif_{nsimd_ext}\n         
          #define rempisub nsimd_sleef_rempisub_{nsimd_ext}\n                   #define rempisubf nsimd_sleef_rempisubf_{nsimd_ext}\n                   #define gammak nsimd_gammak_{nsimd_ext}\n                   #define gammafk nsimd_gammafk_{nsimd_ext}\n\n                   {endif}\n\n                   '''.format(NSIMD_EXT=nse.upper(), nsimd_ext=nse,\n                   hbar=common.hbar, ifdef=ifdef, endif=endif,\n                   defines_det_f32=defines.format(det='d', nsimd_ext=nse),\n                   defines_nondet_f32=defines.format(det='', nsimd_ext=nse),\n                   defines_det_f64=defines.format(det='d', nsimd_ext=nse),\n                   defines_nondet_f64=defines.format(det='', nsimd_ext=nse)))\n\n            fout.write('\\n\\n#endif\\n\\n')\n\n            common.clang_format(opts, renameheader)\n"
  },
  {
    "path": "egg/hatch.py",
    "content": "# Copyright (c) 2021 Agenium Scale\n#\n# Permission is hereby granted, free of charge, to any person obtaining a copy\n# of this software and associated documentation files (the \"Software\"), to deal\n# in the Software without restriction, including without limitation the rights\n# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell\n# copies of the Software, and to permit persons to whom the Software is\n# furnished to do so, subject to the following conditions:\n#\n# The above copyright notice and this permission notice shall be included in all\n# copies or substantial portions of the Software.\n#\n# THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\n# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\n# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\n# SOFTWARE.\n\n# What does this script?\n# ----------------------\n#\n# This script generates code for each architecture, the base C/C++ APIs and\n# the advanced C++ API. Each part to be generated is handled by a\n# `gen_*.py` file. This script simply calls the `doit` function of each\n# `gen_*.py` module. 
Names are self-explanatory.\n#\n# -----------------------------------------------------------------------------\n# First thing we do is check whether python3 is used\n\nimport sys\nif sys.version_info[0] < 3:\n    print('Only Python 3 is supported')\n    sys.exit(1)\n\n# -----------------------------------------------------------------------------\n# Imports\n\nimport argparse\nimport os\nimport re\nimport common\nimport gen_archis\nimport gen_base_apis\nimport gen_adv_cxx_api\nimport gen_adv_c_api\nimport gen_tests\nimport gen_src\nimport gen_doc\nimport gen_friendly_but_not_optimized\nimport gen_modules\nimport gen_scalar_utilities\nimport get_sleef_code\n\n# Dir of this script\nscript_dir = os.path.dirname(__file__)\nif script_dir == '':\n    script_dir = '.'\n\n# -----------------------------------------------------------------------------\n# Arguments parsing\n\ndef parse_args(args):\n    def parse_simd(value):\n        ## Split .simd now\n        values = {\n            'x86': common.x86_simds,\n            'arm': common.arm_simds,\n            'ppc': common.ppc_simds,\n            'all': common.simds,\n        }.get(value, value.split(','))\n        ## Check that all simd are valid\n        ret = []\n        for simd in values:\n            if simd not in common.simds:\n                raise argparse.ArgumentTypeError(\n                        \"SIMD '{}' not found in {}\".format(simd, common.simds))\n            ret += common.simds_deps[simd]\n        return list(set(ret))\n    def parse_match(value):\n        if value is None:\n            return None\n        else:\n            return re.compile(value)\n    # In pratice, we either generate all or all except tests and we never\n    # change default directories for code generation. 
So we remove unused\n    # options and regroup some into --library.\n    parser = argparse.ArgumentParser(\n                 description='This is NSIMD generation script.')\n    parser.add_argument('--force', '-f', action='store_true',\n        help='Generate all files even if they already exist')\n    parser.add_argument('--list-files', '-L', action='store_true',\n        default=False,\n        help='List files that will be created by hatch.py')\n    parser.add_argument('--all', '-A', action='store_true',\n        help='Generate code for the library and its tests')\n    parser.add_argument('--library', '-l', action='store_true',\n        help='Generate code of the library (C and C++ APIs)')\n    parser.add_argument('--sleef', '-s', action='store_true', default=False,\n        help='Compile Sleef')\n    parser.add_argument('--tests', '-t', action='store_true',\n        help='Generate tests in C and C++')\n    parser.add_argument('--doc', '-d', action='store_true',\n        help='Generate all documentation')\n    parser.add_argument('--enable-clang-format', '-F', action='store_false',\n        default=True,\n        help='Disable Clang Format (mainly for speed on Windows)')\n    parser.add_argument('--sve-emulate-bool', action='store_true',\n        default=False,\n        help='Use normal SVE vector to emulate predicates.')\n    parser.add_argument('--simd', '-D', type=parse_simd, default='all',\n        help='List of SIMD extensions (separated by a comma)')\n    parser.add_argument('--match', '-m', type=parse_match, default=None,\n        help='Regex used to filter generation on operator names')\n    parser.add_argument('--verbose', '-v', action = 'store_true', default=None,\n        help='Enable verbose mode')\n    parser.add_argument('--simple-license', action='store_true', default=False,\n        help='Put a simple copyright statement instead of the whole license')\n    opts = parser.parse_args(args)\n    # When -L has been chosen, we want to list all files 
and so we have to\n    # turn to True other parameters\n    if opts.list_files:\n        opts.library = True\n        opts.tests = True\n        opts.force = True\n        opts.doc = True\n    # We set variables here because all the code depends on them + we do want\n    # to keep the possibility to change them in the future\n    opts.archis = opts.library\n    opts.base_apis = opts.library\n    opts.adv_cxx_api = opts.library\n    opts.adv_c_api = opts.library\n    opts.friendly_but_not_optimized = opts.library\n    opts.src = opts.library\n    opts.scalar_utilities = opts.library\n    opts.sleef_version = '3.5.1'\n    opts.include_dir = os.path.join(script_dir, '..', 'include', 'nsimd')\n    opts.tests_dir = os.path.join(script_dir, '..', 'tests')\n    opts.src_dir = os.path.join(script_dir, '..', 'src')\n    return opts\n\n# -----------------------------------------------------------------------------\n# Entry point\n\ndef main():\n    opts = parse_args(sys.argv[1:])\n    opts.script_dir = script_dir\n    opts.modules_list = None\n    opts.platforms_list = None\n\n    ## Gather all SIMD dependencies\n    opts.simd = common.get_simds_deps_from_opts(opts)\n    common.myprint(opts, 'List of SIMD: {}'.format(', '.join(opts.simd)))\n    if opts.archis == True or opts.all == True:\n        gen_archis.doit(opts)\n    if opts.base_apis == True or opts.all == True:\n        gen_base_apis.doit(opts)\n    if opts.adv_cxx_api == True or opts.all == True:\n        gen_adv_cxx_api.doit(opts)\n    if opts.adv_c_api == True or opts.all == True:\n        gen_adv_c_api.doit(opts)\n    if opts.tests == True or opts.all == True:\n        gen_tests.doit(opts)\n    if opts.src == True or opts.all == True:\n        gen_src.doit(opts)\n    if opts.sleef == True or opts.all == True:\n        get_sleef_code.doit(opts)\n    if opts.scalar_utilities == True or opts.all == True:\n        gen_scalar_utilities.doit(opts)\n    if opts.friendly_but_not_optimized == True or opts.all == True:\n   
     gen_friendly_but_not_optimized.doit(opts)\n    gen_modules.doit(opts) # this must be here after all NSIMD\n    if opts.doc == True or opts.all == True:\n        gen_doc.doit(opts)\n\nif __name__ == '__main__':\n    main()\n"
  },
  {
    "path": "egg/modules/fixed_point/gen_doc.py",
    "content": "# Use utf-8 encoding\n# -*- coding: utf-8 -*-\n\n# Copyright (c) 2019 Agenium Scale\n#\n# Permission is hereby granted, free of charge, to any person obtaining a copy\n# of this software and associated documentation files (the \"Software\"), to deal\n# in the Software without restriction, including without limitation the rights\n# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell\n# copies of the Software, and to permit persons to whom the Software is\n# furnished to do so, subject to the following conditions:\n#\n# The above copyright notice and this permission notice shall be included in all\n# copies or substantial portions of the Software.\n#\n# THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\n# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\n# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\n# SOFTWARE.\n\nimport os\nimport platform\nimport io\nimport sys\nimport subprocess\nimport collections\nimport re\nimport string\n\nimport common\nimport operators\n\n# ------------------------------------------------------------------------------\n\ndef gen_overview(opts):\n    filename = common.get_markdown_file(opts, 'overview', 'fixed_point')\n    with common.open_utf8(opts, filename) as fout:\n        fout.write('''\n# NSIMD fixed point module\n\n## Description\n\nThis module implements a fixed-point numbers support for the `nsimd` library.\nFixed-point numbers are integer types used to represent decimal numbers. 
A\nnumber `lf` of bits are used to encode its integer part, and `rt` bits are used\nto encode its fractional part.\n\nThe fixed_point module uses the templated type `nsimd::fixed_point::fp_t<lf,\nrt>` to represent a fixed_point number. All the basic floating-point arithmetic\noperators have been defined, therefore fp_t elements can be manipulated as\nnormal numbers.  The fixed_point module will use a `i8`, `i16`, or\n`i32` integer type for storage, depending on the value of `lf + 2 * rt`.\n\nAll the functions of the module are under the namespace `nsimd::fixed_point`,\nand match the same interface as the `nsimd` C++ API.\n\nThe `fp_t` struct type is defined in `fixed.hpp`, and the associated simd\n`fpsimd_t` struct type is defined in `simd.hpp`.\n\nThe module redefines the `nsimd` pack type for fixed-point numbers, templated\nwith `lf` and `rt`:\n\n```C++\nnamespace nsimd {\nnamespace fixed_point {\ntemplate <u8 lf, u8 rt>\nstruct pack;\n} // namespace fixed_point\n} // namespace nsimd\n```\n\nThen, the pack can be manipulated as an `nsimd` pack like other scalar types.\n\n## Compatibility\n\nThe fixed point module is a C++ only API, compatible with the C++98 standard.\nIt has the same compiler and hardware support as the main `nsimd` API\n(see the [API index](index.md)).\n\n## Example\n\nHere is a minimal example ([main.cpp](../../examples/module_fixed_point.cpp)):\n\n@[INCLUDE_CODE:L21:L61](../../examples/module_fixed_point.cpp)\n\nTo test with avx2 run:\n```bash\nexport NSIMD_ROOT=<path/to/nsimd>\ng++ -o main -I$NSIMD_ROOT/include -mavx2 -DNSIMD_AVX2 main.cpp\n./main\n```\n\nThe console output will look like this:\n```console\n$>./main\n1.35938 | -0.421875 | 0.9375\n1.13281 | 1.19531 | 2.32812\n1.64844 | -1.21094 | 0.4375\n-0.660156 | 1.07422 | 0.414062\n-0.890625 | 0.214844 | -0.675781\n-0.0898438 | 0.515625 | 0.425781\n-0.539062 | 0.0546875 | -0.484375\n1.80859 | 1.66406 | 3.47266\n```\n        ''')\n\napi_template = '''\\\n# {full_name}\n\n{desc}\n\n## 
Template parameter type for T:\n\nWhen using the following typedef :\n```c++\ntypedef nsimd::fixed_point::fp_t<lf, rt> fp_t\n```\n\nThe T template parameter is one of the following types depending on the operator:\n\n- `set1`, `loadu` and `loada`:\n```c++\nnsimd::fixed_point::pack<fp_t>\n```\n- `loadlu`, `loadla`:\n```c++\nnsimd::fixed_point::packl<fp_t>\n```\n- Other operators:\n```c++\nnsimd::fixed_point::fp_t<lf, rt>\n```\n\n## C++ API\n\n```c++\n{decl}\n```\n'''\n\ndecl_template = '''\\\ntemplate <typename T>\n{ret}{op}({args});\\n\\n'''\n\n# -----------------------------------------------------------------------------\n\ndef get_type(param, return_typ=False):\n    if param == '_':\n        return 'void'\n    elif param == '*':\n        return 'typename T::value_type *'\n    elif param == 'c*':\n        return 'const typename T::value_type *'\n    elif param == 's':\n        return 'typename T::value_type'\n    elif param in 'v':\n        return 'pack<T>' if return_typ else 'const pack<T> &'\n    elif param == 'l':\n        return 'packl<T>' if return_typ else 'const packl<T> &'\n    elif param == 'p':\n        return 'int '\n    else:\n        return None\n\n# -----------------------------------------------------------------------------\n\ndef gen_decl(op):\n    sig = '{}{} {{}}({});'.format(\n            'template <typename T> ' \\\n                if 'v' not in op.params[1:] and \\\n                   'l' not in op.params[1:] else '',\n            get_type(op.params[0], True),\n            ', '.join(['{} {}'.format(\n                               get_type(op.params[i + 1]),\n                                        common.get_arg(i)) \\\n                                        for i in range(len(op.params[1:]))])\n          )\n    ret = 'namespace nsimd {\\n' \\\n          'namespace fixed_point {\\n\\n' + sig.format(op.name) + '\\n\\n'\n    if op.cxx_operator != None:\n        ret += sig.format('operator' + op.cxx_operator) + '\\n\\n'\n    ret += '} // 
namespace fixed_point\\n' \\\n           '} // namespace nsimd'\n    return ret\n\n# -----------------------------------------------------------------------------\n\ndef gen_api(opts, op_list):\n    api = dict()\n    for _, operator in operators.operators.items():\n        if operator.name not in op_list:\n            continue\n        for c in operator.categories:\n            if c not in api:\n                api[c] = [operator]\n            else:\n                api[c].append(operator)\n\n    filename = common.get_markdown_file(opts, 'api', 'fixed_point')\n    with common.open_utf8(opts, filename) as fout:\n        fout.write('''# NSIMD fixed point API\\n''')\n        for c, ops in api.items():\n            if len(ops) == 0:\n                continue\n            fout.write('\\n## {}\\n\\n'.format(c.title))\n            for op in ops:\n                fout.write('- [{} ({})](module_fixed_point_api_{}.md)\\n'. \\\n                           format(op.full_name, op.name,\n                                  common.to_filename(op.name)))\n\n# -----------------------------------------------------------------------------\n\ndef gen_doc(opts, op_list):\n    for _, op in operators.operators.items():\n        if op.name not in op_list:\n            continue\n        filename = common.get_markdown_api_file(opts, op.name, 'fixed_point')\n        with common.open_utf8(opts, filename) as fout:\n            fout.write(api_template.format(full_name=op.full_name,\n                                           desc=op.desc, decl=gen_decl(op)))\n\n# -----------------------------------------------------------------------------\n\ndef doit(opts, op_list):\n    common.myprint(opts, 'Generating doc for module fixed_point')\n    gen_overview(opts)\n    gen_api(opts, op_list)\n    gen_doc(opts, op_list)\n\n"
  },
  {
    "path": "egg/modules/fixed_point/gen_tests.py",
    "content": "# Copyright (c) 2019 Agenium Scale\n#\n# Permission is hereby granted, free of charge, to any person obtaining a copy\n# of this software and associated documentation files (the \"Software\"), to deal\n# in the Software without restriction, including without limitation the rights\n# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell\n# copies of the Software, and to permit persons to whom the Software is\n# furnished to do so, subject to the following conditions:\n#\n# The above copyright notice and this permission notice shall be included in all\n# copies or substantial portions of the Software.\n#\n# THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\n# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\n# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\n# SOFTWARE.\n\nimport os\nimport sys\nimport common\n\n# -------------------------------------------------------------------------------\n\ndef get_filename(opts, op, lf, rt):\n    tests_dir = os.path.join(opts.tests_dir, \"modules/fixed_point\")\n    common.mkdir_p(tests_dir)\n    filename = os.path.join(tests_dir, '{}.fp_{}_{}.cpp'.format(op, lf, rt))\n    if os.path.exists(filename):\n        os.remove(filename)\n    if common.can_create_filename(opts, filename):\n        return filename\n    else:\n        return None\n\nincludes = \"\"\"\n#include <string.h>\n#include <stdio.h>\n#include <stdlib.h>\n#include <stdint.h>\n#include <math.h>\n#include <time.h>\n\n#include <nsimd/nsimd.h>\n#include <nsimd/modules/fixed_point.hpp>\n\"\"\"\n\narithmetic_aliases = \"\"\"\ntypedef nsimd::fixed_point::fp_t<{lf}, {rt}> fp_t;\ntypedef nsimd::fixed_point::pack<fp_t> 
vec_t;\ntypedef nsimd::fixed_point::packl<fp_t> vecl_t;\ntypedef nsimd::fixed_point::pack<fp_t>::value_type raw_t;\ntypedef nsimd::fixed_point::packl<fp_t>::value_type log_t;\nconst size_t v_size = (size_t) nsimd::fixed_point::len(fp_t());\n\"\"\"\n\n# ------------------------------------------------------------------------------\n# Utility functions\n\ncheck = \"\"\"\n#define CHECK(a) {{ \\\\\n  if (!(a)) {{ \\\\\n    fprintf(stderr, \"ERROR: \" #a \":%s: %d\\\\n\", __FILE__, __LINE__); \\\\\n    fflush(stderr); \\\\\n    exit(EXIT_FAILURE); \\\\\n  }} \\\\\n}}\n\n\"\"\"\n\nlimits = \"\"\"\ntemplate <u8 lf, u8 rt>\nstatic double __get_numeric_precision() {\n  return (double)ldexpf(1.0, -(int)rt);\n}\n\n\"\"\"\n\ncomparison_fp = \"\"\"\ntemplate <u8 lf, u8 rt>\nbool __compare_values(nsimd::fixed_point::fp_t<lf, rt> val, double ref){\n  return nsimd_scalar_abs_f64(double(val) - ref) <=\n           __get_numeric_precision<lf, rt>();\n}\n\n\"\"\"\n\ncomparison_log = \"\"\"\ntemplate <typename T, u8 lf, u8 rt>\nbool __check_logical_val(T val, nsimd::fixed_point::fp_t<lf, rt> v0,\n    nsimd::fixed_point::fp_t<lf, rt> v1)\n{{\n  return (((v0._raw {op_val} v1._raw) && (val != 0))\n      || (!(v0._raw {op_val} v1._raw) && (val == 0)));\n}}\n\n\"\"\"\n\ngen_random_val = \"\"\"\ntemplate <u8 lf, u8 rt>\nnsimd::fixed_point::fp_t<lf, rt> __gen_random_val() {{\n  float tmp = (float) rand() / (float) RAND_MAX;\n  return nsimd::fixed_point::fp_t<lf, rt>(0.5f * tmp + 1.0f);\n}}\n\n\"\"\"\n\n# ------------------------------------------------------------------------------\n# Template for arithmetic binary operators\n\narithmetic_test_template = \"\"\"\n{includes}\n\n// -----------------------------------------------------------------------------\n\n{decls}\n\n// -----------------------------------------------------------------------------\n\nint main() {{\n  typedef nsimd::fixed_point::fp_t<{lf}, {rt}> fp_t;\n  typedef nsimd::fixed_point::pack<fp_t> vec_t;\n  const size_t v_size = 
(size_t) nsimd::fixed_point::len(fp_t());\n\n  // FP vectors\n  fp_t *tab0_fp = (fp_t *) malloc(v_size * sizeof(fp_t));\n  fp_t *tab1_fp = (fp_t *) malloc(v_size * sizeof(fp_t));\n  fp_t *res_fp  = (fp_t *) malloc(v_size * sizeof(fp_t));\n\n  // Floating point equivalent\n  double *tab0_f = (double *) malloc(v_size * sizeof(double));\n  double *tab1_f = (double *) malloc(v_size * sizeof(double));\n  double *res_f  = (double *) malloc(v_size * sizeof(double));\n\n  for (size_t i = 0; i < v_size; i++) {{\n    tab0_fp[i] = __gen_random_val<{lf}, {rt}>();\n    tab1_fp[i] = __gen_random_val<{lf}, {rt}>();\n    tab0_f[i] = double(tab0_fp[i]);\n    tab1_f[i] = double(tab1_fp[i]);\n  }}\n\n  vec_t v0_fp = nsimd::fixed_point::loadu<vec_t>(tab0_fp);\n  vec_t v1_fp = nsimd::fixed_point::loadu<vec_t>(tab1_fp);\n  vec_t vres_fp = nsimd::fixed_point::{op_name}(v0_fp, v1_fp);\n  nsimd::fixed_point::storeu(res_fp, vres_fp);\n\n  for (size_t i = 0; i < v_size; i++) {{\n    res_f[i] = tab0_f[i] {op_val} tab1_f[i];\n  }}\n\n  for(size_t i = 0; i < v_size; i++) {{\n    CHECK(__compare_values(res_fp[i], res_f[i]));\n  }}\n\n  fprintf(stdout, \\\"test of {op_name} over fp_t<{lf},{rt}>... 
OK\\\\n\\\");\n  return EXIT_SUCCESS;\n}}\n\"\"\"\n\narithmetic_ops = [(\"add\", \"+\"), (\"sub\", \"-\"), (\"mul\", \"*\"), (\"div\",\"/\")]\n\ndef gen_arithmetic_ops_tests(lf, rt, opts):\n    for op_name, op_val in arithmetic_ops:\n        decls = check + limits + comparison_fp + gen_random_val\n        content_src = arithmetic_test_template.format(\n            op_name=op_name, op_val=op_val, lf=lf, rt=rt,\n            includes=includes, decls=decls)\n        filename = get_filename(opts, op_name, lf, rt)\n        if filename == None:\n            continue\n        with common.open_utf8(opts, filename) as fp:\n            fp.write(content_src)\n        common.clang_format(opts, filename)\n\n# ------------------------------------------------------------------------------\n# Min max operators template\n\nminmax_test_template = \"\"\"\n{includes}\n#define op_min(a, b) ((a) < (b) ?(a) : (b))\n#define op_max(a, b) ((a) > (b) ?(a) : (b))\n\n// -----------------------------------------------------------------------------\n\n{decls}\n\n// -----------------------------------------------------------------------------\n\nint main() {{\n  typedef nsimd::fixed_point::fp_t<{lf}, {rt}> fp_t;\n  typedef nsimd::fixed_point::pack<fp_t> vec_t;\n  const size_t v_size = (size_t) nsimd::fixed_point::len(fp_t());\n\n  // FP vectors\n  fp_t *tab0_fp = (fp_t *) malloc(v_size * sizeof(fp_t));\n  fp_t *tab1_fp = (fp_t *) malloc(v_size * sizeof(fp_t));\n  fp_t *res_fp  = (fp_t *) malloc(v_size * sizeof(fp_t));\n\n  int *res_ref  = (int *) malloc(v_size * sizeof(int));\n\n  for (size_t i = 0; i < v_size; i++) {{\n    tab0_fp[i] = __gen_random_val<{lf}, {rt}>();\n    tab1_fp[i] = __gen_random_val<{lf}, {rt}>();\n  }}\n\n  vec_t v0_fp = nsimd::fixed_point::loadu<vec_t>(tab0_fp);\n  vec_t v1_fp = nsimd::fixed_point::loadu<vec_t>(tab1_fp);\n  vec_t vres_fp = nsimd::fixed_point::{op_name}(v0_fp, v1_fp);\n  nsimd::fixed_point::storeu(res_fp, vres_fp);\n\n  for (size_t i = 0; i < v_size; i++) {{\n 
   res_ref[i] = op_{op_name}((int) tab0_fp[i]._raw, (int) tab1_fp[i]._raw);\n  }}\n\n  for(size_t i = 0; i < v_size; i++) {{\n    CHECK(res_fp[i]._raw == res_ref[i]);\n  }}\n\n  fprintf(stdout, \\\"test of {op_name} over fp_t<{lf},{rt}>... OK\\\\n\\\");\n  return EXIT_SUCCESS;\n}}\n\"\"\"\n\nminmax_ops = [\"min\", \"max\"]\ndef gen_minmax_ops_tests(lf, rt, opts):\n    for op_name in minmax_ops:\n        decls = check + limits + comparison_fp + gen_random_val\n        content_src = minmax_test_template.format(\n            op_name=op_name, lf=lf, rt=rt,\n            includes=includes, decls=decls)\n        filename = get_filename(opts, op_name, lf, rt)\n        if filename == None:\n            continue\n        with common.open_utf8(opts, filename) as fp:\n            fp.write(content_src)\n        common.clang_format(opts, filename)\n\n# ------------------------------------------------------------------------------\n# Ternary ops (FMA and co)\n\nternary_ops_template = \"\"\"\n{includes}\n\n// -----------------------------------------------------------------------------\n\n{decls}\n\n// -----------------------------------------------------------------------------\n\nint main() {{\n  typedef nsimd::fixed_point::fp_t<{lf}, {rt}> fp_t;\n  typedef nsimd::fixed_point::pack<fp_t> vec_t;\n  const size_t v_size = (size_t) nsimd::fixed_point::len(fp_t());\n\n  // FP vectors\n  fp_t *tab0_fp = (fp_t *) malloc(v_size * sizeof(fp_t));\n  fp_t *tab1_fp = (fp_t *) malloc(v_size * sizeof(fp_t));\n  fp_t *tab2_fp = (fp_t *) malloc(v_size * sizeof(fp_t));\n  fp_t *res_fp  = (fp_t *) malloc(v_size * sizeof(fp_t));\n\n  // Floating point equivalent\n  double *tab0_f = (double *) malloc(v_size * sizeof(double));\n  double *tab1_f = (double *) malloc(v_size * sizeof(double));\n  double *tab2_f = (double *) malloc(v_size * sizeof(double));\n  double *res_f  = (double *) malloc(v_size * sizeof(double));\n\n  for (size_t i = 0; i < v_size; i++) {{\n    tab0_fp[i] = __gen_random_val<{lf}, 
{rt}>();\n    tab1_fp[i] = __gen_random_val<{lf}, {rt}>();\n    tab2_fp[i] = __gen_random_val<{lf}, {rt}>();\n    tab0_f[i] = double(tab0_fp[i]);\n    tab1_f[i] = double(tab1_fp[i]);\n    tab2_f[i] = double(tab2_fp[i]);\n  }}\n\n  vec_t v0_fp = nsimd::fixed_point::loadu<vec_t>(tab0_fp);\n  vec_t v1_fp = nsimd::fixed_point::loadu<vec_t>(tab1_fp);\n  vec_t v2_fp = nsimd::fixed_point::loadu<vec_t>(tab2_fp);\n  vec_t vres_fp = nsimd::fixed_point::{op_name}(v0_fp, v1_fp, v2_fp);\n  nsimd::fixed_point::storeu(res_fp, vres_fp);\n\n  for(size_t i = 0; i < v_size; i++) {{\n    const double a = tab0_f[i];\n    const double b = tab1_f[i];\n    const double c = tab2_f[i];\n\n    {check_statement}\n  }}\n\n  for(size_t i = 0; i < v_size; i++) {{\n    CHECK(__compare_values(res_fp[i], res_f[i]));\n  }}\n\n  fprintf(stdout, \\\"test of {op_name} over fp_t<{lf},{rt}>... OK\\\\n\\\");\n  return EXIT_SUCCESS;\n}}\n\"\"\"\n\nternary_ops = [(\"fma\", \"res_f[i] = (a * b) + c;\")]\ndef gen_ternary_ops_tests(lf, rt, opts):\n    for op_name, statement in ternary_ops:\n        decls = check + limits + comparison_fp + gen_random_val\n        content_src = ternary_ops_template.format(\n            op_name=op_name, check_statement=statement.format(lf=lf, rt=rt),\n            lf=lf, rt=rt,includes=includes, decls=decls)\n        filename = get_filename(opts, op_name, lf, rt)\n        if filename == None:\n            continue\n        with common.open_utf8(opts, filename) as fp:\n            fp.write(content_src)\n        common.clang_format(opts, filename)\n\n# ------------------------------------------------------------------------------\n# Template for math operators\n\nrec_reference = \"\"\"\n// Rec operator on floating points (avoids to write a particular test for rec)\nstatic inline double rec(const double x) {{ return 1.0 / x; }}\n\"\"\"\n\nmath_test_template = \"\"\"\n{includes}\n\n// -----------------------------------------------------------------------------\n\n{decls}\n\n// 
-----------------------------------------------------------------------------\n\nint main() {{\n  typedef nsimd::fixed_point::fp_t<{lf}, {rt}> fp_t;\n  typedef nsimd::fixed_point::pack<fp_t> vec_t;\n  const size_t v_size = (size_t) nsimd::fixed_point::len(fp_t());\n\n  // FP vectors\n  fp_t *tab0_fp= (fp_t *) malloc(v_size * sizeof(fp_t));\n  fp_t *res_fp = (fp_t *) malloc(v_size * sizeof(fp_t));\n\n  // Floating point equivalent\n  double *tab0_f = (double *) malloc(v_size * sizeof(double));\n  double *res_f  = (double *) malloc(v_size * sizeof(double));\n\n  for (size_t i = 0; i < v_size; i++) {{\n    tab0_fp[i] = __gen_random_val<{lf}, {rt}>();\n    tab0_f[i] = double(tab0_fp[i]);\n  }}\n\n  vec_t v0_fp = nsimd::fixed_point::loadu<vec_t>(tab0_fp);\n  vec_t vres_fp = nsimd::fixed_point::{op_name}(v0_fp);\n  nsimd::fixed_point::storeu(res_fp, vres_fp);\n\n  for (size_t i = 0; i < v_size; i++) {{\n    res_f[i] = {ref_op_name}(tab0_f[i]);\n  }}\n\n  for(size_t i = 0; i < v_size; i++) {{\n    CHECK(__compare_values(res_fp[i], res_f[i]));\n  }}\n\n  fprintf(stdout, \\\"test of {op_name} over fp_t<{lf},{rt}>... 
OK\\\\n\\\");\n  return EXIT_SUCCESS;\n}}\n\"\"\"\n\nmath_ops = [\"rec\", \"abs\"]\ndef gen_math_functions_tests(lf, rt, opts):\n    for op_name in math_ops:\n        decls = check + limits + comparison_fp + gen_random_val\n        if op_name == \"rec\":\n            decls += rec_reference\n            ref_op_name = 'rec'\n        else:\n            ref_op_name = 'nsimd_scalar_abs_f64'\n        content_src = math_test_template.format(op_name=op_name, lf=lf, rt=rt,\n                                                ref_op_name=ref_op_name,\n                                                includes=includes, decls=decls)\n        filename = get_filename(opts, op_name, lf, rt)\n        if filename == None:\n            continue\n        with common.open_utf8(opts, filename) as fp:\n            fp.write(content_src)\n        common.clang_format(opts, filename)\n\n# ------------------------------------------------------------------------------\n# Comparison operators\n\ncomparison_test_template = \"\"\"\n{includes}\n\n// -----------------------------------------------------------------------------\n\n{decls}\n\n// -----------------------------------------------------------------------------\n\nint main(){{\n  typedef nsimd::fixed_point::fp_t<{lf}, {rt}> fp_t;\n  typedef nsimd::fixed_point::pack<fp_t> vec_t;\n  typedef nsimd::fixed_point::packl<fp_t> vecl_t;\n  typedef nsimd::fixed_point::packl<fp_t>::value_type log_t;\n  const size_t v_size = (size_t) nsimd::fixed_point::len(fp_t());\n\n  // FP vectors\n  fp_t *tab0_fp = (fp_t *) malloc(v_size * sizeof(fp_t));\n  fp_t *tab1_fp = (fp_t *) malloc(v_size * sizeof(fp_t));\n  log_t *resl_fp = (log_t *) malloc(v_size * sizeof(log_t));\n\n  for(size_t i = 0; i < v_size; i++) {{\n    tab0_fp[i] = __gen_random_val<{lf}, {rt}>();\n    tab1_fp[i] = __gen_random_val<{lf}, {rt}>();\n  }}\n  // Be sure there is at least one equality to test all the cases.\n  tab0_fp[0] = tab1_fp[0];\n\n  vec_t v0_fp = 
nsimd::fixed_point::loadu<vec_t>(tab0_fp);\n  vec_t v1_fp = nsimd::fixed_point::loadu<vec_t>(tab1_fp);\n  vecl_t vres_fp = nsimd::fixed_point::{op_name}(v0_fp, v1_fp);\n  nsimd::fixed_point::storelu(resl_fp, vres_fp);\n\n  for(size_t i = 0; i < v_size; i++) {{\n    CHECK((__check_logical_val<log_t, {lf}, {rt}>(\n        resl_fp[i], tab0_fp[i], tab1_fp[i])));\n  }}\n\n  fprintf(stdout, \\\"test of {op_name} over fp_t<{lf},{rt}>... OK\\\\n\\\");\n  return EXIT_SUCCESS;\n}}\n\"\"\"\n\ncomparison_ops = [(\"eq\",\"==\"), (\"ne\",\"!=\"), (\"le\",\"<=\"), (\"lt\",\"<\"),\n                  (\"ge\",\">=\"), (\"gt\",\">\")]\n\ndef gen_comparison_tests(lf, rt, opts):\n    for op_name, op_val in comparison_ops:\n        decls = check + limits + comparison_log.format(op_val=op_val) + gen_random_val\n        content_src = comparison_test_template.format(\n            op_name=op_name, op_val=op_val, lf=lf, rt=rt,\n            includes=includes, decls=decls)\n        filename = get_filename(opts, op_name, lf, rt)\n        if filename == None:\n            continue\n        with common.open_utf8(opts, filename) as fp:\n            fp.write(content_src)\n        common.clang_format(opts, filename)\n\n# ------------------------------------------------------------------------------\n# Bitwise binary operators\n\nbitwise_binary_test_template = \"\"\"\n{includes}\n#include <limits>\n\n// -----------------------------------------------------------------------------\n\n{decls}\n\n// -----------------------------------------------------------------------------\n\nint main() {{\n  typedef nsimd::fixed_point::fp_t<{lf}, {rt}> fp_t;\n  typedef nsimd::fixed_point::pack{l}<fp_t> vec{l}_t;\n  typedef nsimd::fixed_point::pack{l}<fp_t>::value_type raw_t;\n  const size_t v_size = (size_t) nsimd::fixed_point::len(fp_t());\n\n  raw_t *tab0 = (raw_t *) malloc(v_size * sizeof(raw_t));\n  raw_t *tab1 = (raw_t *) malloc(v_size * sizeof(raw_t));\n  raw_t *res  = (raw_t *) malloc(v_size * 
sizeof(raw_t));\n\n  for(size_t i = 0; i < v_size; i++)\n  {{\n    tab0[i] = {rand_statement}\n    tab1[i] = {rand_statement}\n  }}\n  // Be sure there is at least one equality to test all the cases.\n  tab0[0] = tab1[0];\n\n  vec{l}_t v0 = nsimd::fixed_point::load{l}u<vec{l}_t>(tab0);\n  vec{l}_t v1 = nsimd::fixed_point::load{l}u<vec{l}_t>(tab1);\n  vec{l}_t v_res = nsimd::fixed_point::{op_name}{term}(v0, v1);\n  nsimd::fixed_point::store{l}u(res, v_res);\n\n  for(size_t i = 0; i < v_size; i++)\n  {{\n    raw_t a = tab0[i];\n    raw_t b = tab1[i];\n    raw_t c = res[i];\n    CHECK({test_statement});\n  }}\n\n  fprintf(stdout, \\\"test of {op_name}{term} over fp_t<{lf},{rt}>... OK\\\\n\\\");\n  return EXIT_SUCCESS;\n}}\n\"\"\"\n\nbitwise_binary_ops = [(\"and\", \"c._raw == (a._raw & b._raw)\", \"c == (a & b)\"),\n                      (\"andnot\", \"c._raw == (a._raw & ~b._raw)\", \"c == (a & ~b)\"),\n                      (\"or\", \"c._raw == (a._raw | b._raw)\", \"c == (a | b)\"),\n                      (\"xor\",\"c._raw == ((~a._raw & b._raw) | (a._raw & ~b._raw))\",\n                       \"c == ((~a & b) | (a & ~b))\")]\ndef gen_bitwise_ops_tests(lf, rt, opts):\n    for op_name, s0, s1 in bitwise_binary_ops:\n        # {op}b\n        decls = check + limits + gen_random_val\n        content_src = bitwise_binary_test_template.format(\n            op_name=op_name, lf=lf, rt=rt,\n            includes=includes, decls=decls,\n            rand_statement=\"__gen_random_val<{lf}, {rt}>();\".format(lf=lf, rt=rt),\n            test_statement=s0, l=\"\", term=\"b\")\n        filename = get_filename(opts, op_name + \"b\", lf, rt)\n        if filename != None:\n            with common.open_utf8(opts, filename) as fp:\n                fp.write(content_src)\n            common.clang_format(opts, filename)\n\n        # {op}l\n        content_src = bitwise_binary_test_template.format(\n            op_name=op_name, lf=lf, rt=rt,\n            includes=includes, decls=decls,\n    
        rand_statement=\"(raw_t)(rand() % 2);\".format(lf=lf, rt=rt),\n            test_statement=s1, l=\"l\", term=\"l\")\n        filename = get_filename(opts, op_name + \"l\", lf, rt)\n        if filename != None:\n            with common.open_utf8(opts, filename) as fp:\n                fp.write(content_src)\n            common.clang_format(opts, filename)\n\n# ------------------------------------------------------------------------------\n# Bitwise unary operators\n\nbitwise_unary_test_template = \"\"\"\n{includes}\n\n// -----------------------------------------------------------------------------\n\n{decls}\n\n// -----------------------------------------------------------------------------\n\nint main() {{\n  typedef nsimd::fixed_point::fp_t<{lf}, {rt}> fp_t;\n  typedef nsimd::fixed_point::pack{l}<fp_t> vec{l}_t;\n  typedef nsimd::fixed_point::pack{l}<fp_t>::value_type raw_t;\n  const size_t v_size = (size_t) nsimd::fixed_point::len(fp_t());\n\n  raw_t *tab0 = (raw_t *) malloc(v_size * sizeof(raw_t));;\n  raw_t *res  = (raw_t *) malloc(v_size * sizeof(raw_t));;\n\n  for(size_t i = 0; i < v_size; i++)\n  {{\n    tab0[i] = {rand_statement}\n  }}\n\n  vec{l}_t v0 = nsimd::fixed_point::load{l}u<vec{l}_t>(tab0);\n  vec{l}_t v_res = nsimd::fixed_point::{op_name}{term}(v0);\n  nsimd::fixed_point::store{l}u(res, v_res);\n\n  for(size_t i = 0; i < v_size; i++)\n  {{\n    raw_t a = tab0[i];\n    raw_t b = res[i];\n    CHECK({test_statement});\n  }}\n\n  fprintf(stdout, \\\"test of {op_name}{term} over fp_t<{lf},{rt}>... 
OK\\\\n\\\");\n  return EXIT_SUCCESS;\n}}\n\"\"\"\n\nbitwise_unary_ops = [(\"not\", \"b._raw == ~a._raw\",\n                      \"((b == 0) && (a == 1)) | ((b == 1) && (a == 0))\")]\ndef gen_unary_ops_tests(lf, rt, opts):\n    for op_name, s0, s1 in bitwise_unary_ops:\n        decls = check + limits + gen_random_val\n        # {op}b\n        content_src = bitwise_unary_test_template.format(\n            op_name=op_name, lf=lf, rt=rt,\n            includes=includes, decls=decls,\n            rand_statement=\"__gen_random_val<{lf}, {rt}>();\".format(lf=lf,\n            rt=rt), test_statement=s0, l=\"\", term=\"b\")\n        filename = get_filename(opts, op_name + \"b\", lf, rt)\n        if filename != None:\n            with common.open_utf8(opts, filename) as fp:\n                fp.write(content_src)\n            common.clang_format(opts, filename)\n\n        # {op}l\n        content_src = bitwise_unary_test_template.format(\n            op_name=op_name, lf=lf, rt=rt,\n            includes=includes, decls=decls,\n            rand_statement=\"(raw_t)(rand() % 2);\".format(lf=lf, rt=rt),\n            test_statement=s1, l=\"l\", term=\"l\")\n        filename = get_filename(opts, op_name + \"l\", lf, rt)\n        if filename != None:\n            with common.open_utf8(opts, filename) as fp:\n                fp.write(content_src)\n            common.clang_format(opts, filename)\n\n# -----------------------------------------------------------------------------\n# if_else\n\nif_else_test_template = \"\"\"\n{includes}\n\n// -----------------------------------------------------------------------------\n\n{decls}\n\n// -----------------------------------------------------------------------------\n\nint main() {{\n  typedef nsimd::fixed_point::fp_t<{lf}, {rt}> fp_t;\n  typedef nsimd::fixed_point::pack<fp_t> vec_t;\n  typedef nsimd::fixed_point::packl<fp_t> vecl_t;\n  typedef nsimd::fixed_point::packl<fp_t>::value_type log_t;\n  const size_t v_size = (size_t) 
nsimd::fixed_point::len(fp_t());\n\n  fp_t *tab0_fp = (fp_t *) malloc(v_size * sizeof(fp_t));\n  fp_t *tab1_fp = (fp_t *) malloc(v_size * sizeof(fp_t));\n  fp_t *res_fp  = (fp_t *) malloc(v_size * sizeof(fp_t));\n  log_t *mask = (log_t *) malloc(v_size * sizeof(log_t));\n\n  for(size_t i = 0; i < v_size; i++) {{\n    tab0_fp[i] = __gen_random_val<{lf}, {rt}>();\n    tab1_fp[i] = __gen_random_val<{lf}, {rt}>();\n    mask[i] = (log_t) (rand() % 2);\n  }}\n\n  vec_t v0 = nsimd::fixed_point::loadu<vec_t>(tab0_fp);\n  vec_t v1 = nsimd::fixed_point::loadu<vec_t>(tab1_fp);\n  vecl_t vl = nsimd::fixed_point::loadlu<vecl_t>(mask);\n  vec_t v_res = nsimd::fixed_point::if_else1(vl, v0, v1);\n  nsimd::fixed_point::storeu(res_fp, v_res);\n\n  for(size_t i = 0; i < v_size; i++)\n  {{\n    fp_t ref = mask[i] ? tab0_fp[i] : tab1_fp[i];\n    CHECK(ref._raw == res_fp[i]._raw);\n  }}\n\n  fprintf(stdout, \\\"test of if_else1 over fp_t<{lf},{rt}>... OK\\\\n\\\");\n  return EXIT_SUCCESS;\n}}\n\"\"\"\n\ndef gen_if_else_tests(lf, rt, opts):\n    decls = check + limits + comparison_fp + gen_random_val\n    content_src = if_else_test_template.format(\n        lf=lf, rt=rt, includes=includes, decls=decls)\n    filename = get_filename(opts, \"if_else\", lf, rt)\n    if filename == None:\n        return\n    with common.open_utf8(opts, filename) as fp:\n        fp.write(content_src)\n    common.clang_format(opts, filename)\n# -------------------------------------------------------------------------------\n\nload_ops = [\"loadu\", \"loadlu\", \"loada\", \"loadla\"]\nstore_ops = [\"storeu\", \"storelu\", \"storea\", \"storela\"]\n\n# -------------------------------------------------------------------------------\n# Entry point\n\nlf_vals = [\"4\", \"8\", \"16\"]\nrt_vals = [\"1\", \"2\", \"3\", \"4\", \"5\", \"6\", \"7\", \"8\"]\n\ndef doit(opts):\n    common.myprint(opts, 'Generating tests for module fixed_point')\n    for lf in lf_vals:\n        for rt in rt_vals:\n            ## Arithmetic 
operators\n            gen_arithmetic_ops_tests(lf, rt, opts)\n\n            ## Min and max operators\n            gen_minmax_ops_tests(lf, rt, opts)\n\n            ## Ternary_operators\n            gen_ternary_ops_tests(lf, rt, opts)\n\n            ## Math functions\n            gen_math_functions_tests(lf, rt, opts)\n\n            ## Comparison operators\n            gen_comparison_tests(lf, rt, opts)\n\n            ## Bitwise binary operators\n            gen_bitwise_ops_tests(lf, rt, opts)\n\n            ## Bitwise unary operators\n            gen_unary_ops_tests(lf, rt, opts)\n\n            ## If_else\n            gen_if_else_tests(lf, rt, opts)\n"
  },
  {
    "path": "egg/modules/fixed_point/hatch.py",
    "content": "# Copyright (c) 2019 Agenium Scale\n#\n# Permission is hereby granted, free of charge, to any person obtaining a copy\n# of this software and associated documentation files (the \"Software\"), to deal\n# in the Software without restriction, including without limitation the rights\n# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell\n# copies of the Software, and to permit persons to whom the Software is\n# furnished to do so, subject to the following conditions:\n#\n# The above copyright notice and this permission notice shall be included in all\n# copies or substantial portions of the Software.\n#\n# THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\n# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\n# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\n# SOFTWARE.\n\n## -----------------------------------------------------------------------------\n\nop_list = [\n    'len',\n    'set1',\n    'loadu',\n    'loada',\n    'loadlu',\n    'loadla',\n    'storeu',\n    'storea',\n    'storelu',\n    'storela',\n    'add',\n    'sub',\n    'mul',\n    'div',\n    'fma',\n    'min',\n    'max',\n    'abs',\n    'rec',\n    'eq',\n    'ne',\n    'le',\n    'lt',\n    'ge',\n    'gt',\n    'ifelse1',\n    'andb',\n    'andnotb',\n    'notb',\n    'orb',\n    'xorb',\n    'andl',\n    'andnotl',\n    'notl',\n    'orl',\n    'xorl'\n]\n\n# -----------------------------------------------------------------------------\n# Imports\n\nimport modules.fixed_point.gen_tests\nimport modules.fixed_point.gen_doc\n\n# -----------------------------------------------------------------------------\n\ndef name():\n    return 'Fixed-point 
arithmetic'\n\ndef desc():\n    return '''This module provides vectorized fixed-point arithmetic through\na C++98 API. The programmer can choose the integral type and the place of the\ncomma for representing its fixed-point numbers. A number of operators are\nalso provided.'''\n\ndef doc_menu():\n    return {'Overview': 'overview', 'API reference': 'api'}\n\n# -----------------------------------------------------------------------------\n# Entry point\n\ndef doit(opts):\n    if opts.tests == True or opts.all == True:\n        modules.fixed_point.gen_tests.doit(opts)\n    if opts.doc == True or opts.all == True:\n        modules.fixed_point.gen_doc.doit(opts, op_list)\n"
  },
  {
    "path": "egg/modules/memory_management/hatch.py",
    "content": "# Copyright (c) 2020 Agenium Scale\n#\n# Permission is hereby granted, free of charge, to any person obtaining a copy\n# of this software and associated documentation files (the \"Software\"), to deal\n# in the Software without restriction, including without limitation the rights\n# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell\n# copies of the Software, and to permit persons to whom the Software is\n# furnished to do so, subject to the following conditions:\n#\n# The above copyright notice and this permission notice shall be included in all\n# copies or substantial portions of the Software.\n#\n# THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\n# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\n# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\n# SOFTWARE.\n\nimport common\n\n# -----------------------------------------------------------------------------\n\ndef name():\n    return 'Memory management'\n\ndef desc():\n    return '''This module provides C-style memory management functions:\nmalloc, calloc, free, copy to/from devices, etc... 
Its purpose is to facilitate\nthe use of data buffers in a portable way for systems with CPUs only and\nfor systems with CPUs and GPUs.'''\n\ndef doc_menu():\n    return dict()\n\n# -----------------------------------------------------------------------------\n\ndef doit(opts):\n    common.myprint(opts, 'Generating module memory_management')\n    if not opts.doc:\n        return\n    filename = common.get_markdown_file(opts, 'overview', 'memory_management')\n    if not common.can_create_filename(opts, filename):\n        return\n    with common.open_utf8(opts, filename) as fout:\n        fout.write('''# Overview\n\nThis module provides C-style memory management functions. Its purpose is not\nto become a fully featured container library. It is to provide portable\nmalloc, memcpy and free functions with a few helpers to copy data from and\nto the devices.\n\n# API reference\n\n## Equivalents of malloc, calloc, memcpy and free for devices\n\nNote that the below functions simply wrap the corresponding C functions\nwhen targeting a CPU.\n\n- `template <typename T> T *device_malloc(size_t sz)`{br}\n  Allocates `sz * sizeof(T)` bytes of memory on the device.\n  On error NULL is returned.\n\n- `template <typename T> T *device_calloc(size_t sz)`{br}\n  Allocates `sz * sizeof(T)` bytes of memory on the device and sets the\n  allocated memory to zero.\n  On error NULL is returned.\n\n- `template <typename T> void device_free(T *ptr)`{br}\n  Free the memory pointed to by the given pointer.\n\n- `template <typename T> void copy_to_device(T *device_ptr, T *host_ptr,\n  size_t sz)`{br}\n  Copy data from host to device.\n\n- `template <typename T> void copy_to_host(T *host_ptr, T *device_ptr,\n  size_t sz)`{br}\n  Copy data from device to host.\n\n- `#define nsimd_fill_dev_mem_func(func_name, expr)`{br}\n  Create a device function that will fill data with `expr`. To call the created\n  function one simply does `func_name(ptr, sz)`. 
The `expr` argument represents\n  some simple C++ expression that can depend only on `i` the i-th element in\n  the vector as shown in the example below.\n\n  ```c++\n  nsimd_fill_dev_mem_func(prng, ((i * 1103515245 + 12345) / 65536) % 32768)\n\n  int main() {{\n    prng(ptr, 1000);\n    return 0;\n  }}\n  ```\n\n## Pairs of pointers\n\nIt is often useful to allocate a pair of data buffers: one on the host and\none on the devices to perform data transfers. The below functions provide\nquick ways to malloc, calloc, free and memcpy pointers on host and devices at\nonce. Note that when targeting CPUs the pair of pointers is reduced to one\npointer that points to a single data buffer in which case memcpy's are not\nperformed. Note also that there is no implicit synchronization of data\nbetween both data buffers. It is up to the programmer to trigger memcpy's.\n\n```c++\ntemplate <typename T>\nstruct paired_pointers_t {{\n  T *device_ptr, *host_ptr;\n  size_t sz;\n}};\n```\n\nMembers of the above structure are not to be modified but can be passed as\narguments for reading/writing data from/to memory they point to.\n\n- `template <typename T> paired_pointers_t<T> pair_malloc(size_t sz)`{br}\n  Allocate `sz * sizeof(T)` bytes of memory on the host and on the device.\n  If an error occurs both pointers are NULL.\n\n- `template <typename T> paired_pointers_t<T> pair_malloc_or_exit(size_t\n  sz)`{br}\n  Allocate `sz * sizeof(T)` bytes of memory on the host and on the device.\n  If an error occurs, prints an error message on stderr and exit(3).\n\n- `template <typename T> paired_pointers_t<T> pair_calloc(size_t sz)`{br}\n  Allocate `sz * sizeof(T)` bytes of memory on the host and on the device.\n  Write both data buffers with zeros.\n  If an error occurs both pointers are NULL.\n\n- `template <typename T> paired_pointers_t<T> pair_calloc_or_exit(size_t\n  sz)`{br}\n  Allocate `sz * sizeof(T)` bytes of memory on the host and on the device.\n  Write both data buffers with 
zeros.\n  If an error occurs, prints an error message on stderr and exit(3).\n\n- `template <typename T> void pair_free(paired_pointers_t<T> p)`{br}\n  Free data buffers on the host and the device.\n\n- `template <typename T> void copy_to_device(paired_pointers_t<T> p)`{br}\n  Copy data from the host buffer to its corresponding device buffer.\n\n- `template <typename T> void copy_to_host(paired_pointers_t<T> p)`{br}\n  Copy data from the device buffer to its corresponding host buffer.\n'''.format(br='  '))\n\n"
  },
  {
    "path": "egg/modules/random/hatch.py",
    "content": "# Copyright (c) 2020 Agenium Scale\n#\n# Permission is hereby granted, free of charge, to any person obtaining a copy\n# of this software and associated documentation files (the \"Software\"), to deal\n# in the Software without restriction, including without limitation the rights\n# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell\n# copies of the Software, and to permit persons to whom the Software is\n# furnished to do so, subject to the following conditions:\n#\n# The above copyright notice and this permission notice shall be included in all\n# copies or substantial portions of the Software.\n#\n# THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\n# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\n# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\n# SOFTWARE.\n\nimport os\nimport common\nimport collections\n\n\n# -----------------------------------------------------------------------------\n\nrand_functions = list()\n\n\nclass MAddToRands(type):\n    def __new__(cls, name, bases, dct):\n        ret = type.__new__(cls, name, bases, dct)\n        if name != 'Rand':\n            rand_functions.append(ret())\n        return ret\n\nclass Rand(object, metaclass=MAddToRands):\n    def gen_function_name(self, nwords, word_size, nrounds):\n        return '{}_{}x{}_{}'.format(self.name, nwords, word_size, nrounds)\n\n    def gen_headers(self, opts):\n        res = ''\n\n        for word_size, nwords_nrounds in self.wordsize_nwords_nrounds.items():\n            for nwords, list_nrounds in nwords_nrounds.items():\n                for nrounds in list_nrounds:\n                    res += self.gen_signature(nwords, word_size, 
nrounds)+';'\n\n        return res\n\n    def gen_tests(self, opts, nrounds, word_size, nwords):\n\n        key_size = self.get_key_size(nwords)\n\n        key_initialization = 'nsimd::packx{}<u{}> key_pack;'. \\\n                format(key_size, word_size)\n        for i in range (0, key_size):\n            key_initialization += '''\n            i = {i};\n            for (int j = 0; j < len; j++) {{\n              key[j + i * len] = (u{word_size})(j + i * len);\n            }}\n            key_pack.v{i} = nsimd::loadu(&key[i*len], u{word_size}());\n            '''.format(i=i, word_size=word_size)\n\n        input_initilization = \\\n                'memset(in, 0, sizeof(u{}) * {} * ulen);\\n'. \\\n                format(word_size, nwords)\n        for i in range (0, nwords):\n            input_initilization += 'in_pack.v{} = nsimd::pack<u{}>(0);'. \\\n                    format(i, word_size)\n\n        compare = ''\n        for i in range (0, nwords):\n            compare += '''\n                if (i=={i}) {{\n                    nsimd::storeu(out_nsimd, out_pack.v{i});\n                }}\n                '''.format(i=i)\n\n        l = 'll' if word_size == 64 else ''\n        cast = '(nsimd_ulonglong)' if word_size == 64 else ''\n\n        res = '''\n        #include <nsimd/modules/random/functions.hpp>\n        #include \"reference.hpp\"\n        #include <iostream>\n\n        #ifdef NSIMD_LONGLONG_IS_EXTENSION\n          #if defined(NSIMD_IS_GCC) || defined(NSIMD_IS_CLANG)\n            #pragma GCC diagnostic ignored \"-Wformat\"\n          #endif\n        #endif\n\n        int main() {{\n          int res = EXIT_SUCCESS;\n          printf(\"Test of {function_name} ...\\\\n\");\n\n          nsimd::packx{nwords}<u{word_size}> in_pack;\n          nsimd::packx{nwords}<u{word_size}> out_pack;\n\n          const int len = nsimd::len(u{word_size}());\n          const unsigned int ulen = (unsigned int)len;\n\n          u{word_size} *key = (u{word_size}*)malloc(ulen 
*\n                                sizeof(u{word_size}) * {key_size});\n          u{word_size} *in = (u{word_size}*)malloc(ulen *\n                               sizeof(u{word_size}) * {nwords});\n          u{word_size} *out = (u{word_size}*)malloc(ulen *\n                                sizeof(u{word_size}) * {nwords});\n          u{word_size} *out_nsimd = (u{word_size}*)malloc(ulen *\n                                      sizeof(u{word_size}));\n\n          tab{word_size}x{nwords}_t in_ref;\n          tab{word_size}x{key_size}_t key_ref;\n          tab{word_size}x{nwords}_t out_ref;\n\n          int i;\n\n          // Keys\n          {key_initialization}\n\n          {input_initilization}\n\n          for (int cpt=0; cpt < 100000; ++cpt) {{\n            out_pack = nsimd::random::{function_name}(in_pack, key_pack);\n\n            for (int i=0; i<len; ++i) {{\n              for (int j=0; j<{nwords}; ++j) {{\n                  in_ref.v[j] = in[i + j * len];\n              }}\n\n              for (int j=0; j<{key_size}; ++j) {{\n                  key_ref.v[j] = key[i + j*len];\n              }}\n\n              out_ref = branson_{name}{nwords}x{word_size}_R({nrounds},\n                          in_ref, key_ref);\n\n              for (int j=0; j<{nwords}; ++j) {{\n                  out[i + j * len] = out_ref.v[j];\n              }}\n            }}\n\n            for (int i=0; i<{nwords}; ++i) {{\n              {compare}\n\n              if (memcmp(out_nsimd, &out[i * len],\n                         ulen * sizeof(u{word_size}))) {{\n                printf (\"%i\\\\n\", i);\n                for (int j=0; j<len; ++j) {{\n                  printf (\"%{l}u\\\\t(0x%{l}x)\\\\t\\\\t%{l}u\\\\t(0x%{l}x)\\\\n\",\n                          {cast}out[j+i*len], {cast}out[j+i*len],\n                          {cast}out_nsimd[j], {cast}out_nsimd[j]);\n                }}\n\n                res = EXIT_FAILURE;\n                printf(\"... 
FAILED\\\\n\");\n                goto cleanup;\n              }}\n            }}\n\n            in_pack = out_pack;\n            memcpy(in, out, sizeof(u{word_size}) * {nwords} * ulen);\n          }}\n\n          fprintf(stdout, \"... OK\\\\n\");\n\n        cleanup:\n          free(key);\n          free(in);\n          free(out);\n          free(out_nsimd);\n\n          return res;\n        }}\n        '''.format(function_name=self.gen_function_name(nwords, word_size,\n                   nrounds), word_size=word_size, key_size=key_size,\n                   nwords=nwords, key_initialization=key_initialization,\n                   nrounds=nrounds, input_initilization=input_initilization,\n                   compare=compare, l=l, name = self.name, cast=cast)\n\n        # Write file\n        return res\n\nclass Philox(Rand):\n    name = 'philox'\n\n    wordsize_nwords_nrounds = {32: {2: [10],\n                                    4: [7, 10]},\n                               64: {2: [6, 10],\n                                    4: [7, 10]}}\n\n    mullohi='''\n#if 1\nvoid mulhilo32(pack<u32> a, pack<u32> b, pack<u32> *low, pack<u32> *high) {\n  nsimd::packx2<u64> a64 = nsimd::upcvt(nsimd::packx2<u64>(), a);\n  nsimd::packx2<u64> b64 = nsimd::upcvt(nsimd::packx2<u64>(), b);\n\n  nsimd::packx2<u64> product;\n  product.v0 = a64.v0 * b64.v0;\n  product.v1 = a64.v1 * b64.v1;\n\n  *high =\n      nsimd::downcvt(nsimd::pack<u32>(), product.v0 >> 32, product.v1 >> 32);\n  *low = nsimd::downcvt(nsimd::pack<u32>(), product.v0, product.v1);\n}\n\n#else\n\nvoid mulhilo32(pack<u32> a, pack<u32> b, pack<u32> *low, pack<u32> *high) {\n  nsimd::pack<u32> ah = nsimd::shr(a, 16);\n  nsimd::pack<u32> bh = nsimd::shr(b, 16);\n\n  nsimd::pack<u32> al = nsimd::shr(nsimd::shl(a, 16), 16);\n  nsimd::pack<u32> bl = nsimd::shr(nsimd::shl(b, 16), 16);\n\n  nsimd::pack<u32> ahbh = ah * bh;\n  nsimd::pack<u32> ahbl = ah * bl;\n  nsimd::pack<u32> albh = al * bh;\n  nsimd::pack<u32> albl = al * bl;\n\n 
 nsimd::pack<u32> tmp1 = nsimd::shl(albh, 16);\n  nsimd::pack<u32> tmp2 = nsimd::shl(ahbl, 16);\n\n  nsimd::pack<u32> tmp3 = tmp1 + tmp2;\n\n  nsimd::pack<u32> _1 = nsimd::set1(nsimd::pack<u32>(), 1u);\n  nsimd::pack<u32> _0 = nsimd::set1(nsimd::pack<u32>(), 0u);\n\n  nsimd::pack<u32> carry =\n      nsimd::if_else1((tmp3 < tmp1) || (tmp3 < tmp2), _1, _0);\n\n  *low = tmp3 + albl;\n\n  carry = carry + nsimd::if_else1((*low < tmp3) || (*low < albl), _1, _0);\n\n  *high = ahbh + nsimd::shr(albh, 16) + nsimd::shr(ahbl, 16) + carry;\n}\n#endif\n\n#if 0\nvoid mulhilo64(pack<u64> a, pack<u64> b, pack<u64> *low, pack<u64> *high) {\n  u64 a_buf[8];\n  u64 b_buf[8];\n  u64 low_buf[8];\n  u64 high_buf[8];\n\n  nsimd::storeu(a_buf, a);\n  nsimd::storeu(b_buf, b);\n\n  for (int i = 0; i < nsimd::len(u64()); ++i) {\n    __uint128_t product = ((__uint128_t)a_buf[i]) * ((__uint128_t)b_buf[i]);\n    high_buf[i] = (u64)(product >> 64);\n    low_buf[i] = (u64)product;\n  }\n\n  *high = nsimd::loadu(high_buf, u64());\n  *low = nsimd::loadu(low_buf, u64());\n}\n\n#else\n\nvoid mulhilo64(pack<u64> a, pack<u64> b, pack<u64> *low, pack<u64> *high) {\n  nsimd::pack<u64> ah = nsimd::shr(a, 32);\n  nsimd::pack<u64> bh = nsimd::shr(b, 32);\n\n  nsimd::pack<u64> al = nsimd::shr(nsimd::shl(a, 32), 32);\n  nsimd::pack<u64> bl = nsimd::shr(nsimd::shl(b, 32), 32);\n\n  nsimd::pack<u64> ahbh = ah * bh;\n  nsimd::pack<u64> ahbl = ah * bl;\n  nsimd::pack<u64> albh = al * bh;\n  nsimd::pack<u64> albl = al * bl;\n\n  nsimd::pack<u64> tmp1 = nsimd::shl(albh, 32);\n  nsimd::pack<u64> tmp2 = nsimd::shl(ahbl, 32);\n\n  nsimd::pack<u64> tmp3 = tmp1 + tmp2;\n\n  nsimd::pack<u64> _1 = nsimd::set1(nsimd::pack<u64>(), (u64)1);\n  nsimd::pack<u64> _0 = nsimd::set1(nsimd::pack<u64>(), (u64)0);\n\n  nsimd::pack<u64> carry =\n      nsimd::if_else1((tmp3 < tmp1) || (tmp3 < tmp2), _1, _0);\n\n  *low = tmp3 + albl;\n\n  carry = carry + nsimd::if_else1((*low < tmp3) || (*low < albl), _1, _0);\n\n  *high = ahbh + 
nsimd::shr(albh, 32) + nsimd::shr(ahbl, 32) + carry;\n}\n#endif\n    '''\n\n    def gen_signature(self, nwords, word_size, nrounds):\n        return ('nsimd::packx{nwords}<u{word_size}> {fun_name}' \\\n                '(nsimd::packx{nwords}<u{word_size}> in, ' \\\n                'nsimd::packx{key_size}<u{word_size}> key)'). \\\n                format(nwords = nwords, word_size = word_size,\n                       fun_name = self.gen_function_name(nwords, word_size,\n                                                         nrounds),\n                       key_size = self.get_key_size(nwords))\n\n    def get_key_size(self, nwords):\n        return int(nwords/2)\n\n    def gen_func(self, opts, nrounds, word_size, nwords):\n        if nwords == 2:\n            bump_keys_init = \\\n            'nsimd::pack<u{word_size}> bump = ' \\\n            'nsimd::set1(nsimd::pack<u{word_size}>(), {bump});'.\\\n            format(word_size=word_size, bump = '(u64)0x9E3779B97F4A7C15ULL' \\\n                   if word_size == 64 else '(u32)0x9E3779B9U')\n            bump_keys = 'key.v0 = key.v0 + bump;'\n\n            round_init = '''\n            nsimd::pack<u{word_size}> mul =\n                nsimd::set1(nsimd::pack<u{word_size}>(), {mul});\n            nsimd::pack<u{word_size}> high, low;'''. 
\\\n            format(word_size=word_size, mul='(u64)0xD2B74407B1CE6E93ULL' \\\n                   if word_size == 64 else '(u32)0xD256D193U')\n\n            round='''\n              mulhilo{word_size}(mul, in.v0, &low, &high);\n\n              in.v0 = high ^ key.v0 ^ in.v1;\n              in.v1 = low;\n            '''.format(word_size=word_size)\n\n        elif nwords == 4:\n            bump_keys_init = '''\n            nsimd::pack<u{word_size}> bump0 =\n                nsimd::set1(nsimd::pack<u{word_size}>(), {bump0});\n            nsimd::pack<u{word_size}> bump1 =\n                nsimd::set1(nsimd::pack<u{word_size}>(), {bump1});'''.\\\n                format(word_size=word_size,\n                       bump0 = '(u64)0x9E3779B97F4A7C15ULL' \\\n                       if word_size == 64 else '(u32)0x9E3779B9U',\n                       bump1 = '(u64)0xBB67AE8584CAA73BULL' \\\n                       if word_size == 64 else '(u32)0xBB67AE85U')\n            bump_keys = 'key.v0 = key.v0 + bump0;\\nkey.v1 = key.v1 + bump1;'\n\n            round_init = '''\n            nsimd::pack<u{word_size}> mul0 =\n                nsimd::set1(nsimd::pack<u{word_size}>(), {mul0});\n            nsimd::pack<u{word_size}> mul1 =\n                nsimd::set1(nsimd::pack<u{word_size}>(), {mul1});\n            nsimd::pack<u{word_size}> low0, high0, low1, high1;\n            '''.format(word_size=word_size,\n                       mul0='(u64)0xD2E7470EE14C6C93ULL' \\\n                       if word_size == 64 else '(u32)0xD2511F53U',\n                       mul1='(u64)0xCA5A826395121157ULL' \\\n                       if word_size == 64 else '(u32)0xCD9E8D57U')\n\n            round='''\n            mulhilo{word_size}(mul0, in.v0, &low0, &high0);\n            mulhilo{word_size}(mul1, in.v2, &low1, &high1);\n\n            in.v0 = high1 ^ key.v0 ^ in.v1;\n            in.v1 = low1;\n            in.v2 = high0 ^ key.v1 ^ in.v3;\n            in.v3 = low0;'''.format(word_size=word_size)\n\n\n        
res = self.gen_signature (nwords, word_size, nrounds)\n        res += ' {{ nsimd::packx{}<u{}> out;'.format(nwords, word_size)\n        res += bump_keys_init\n        res += round_init\n\n        # Round 0:\n        res += round;\n\n        for i in range(1, nrounds):\n            res += bump_keys\n            res += round\n\n        res+='''\n            return in;\n        }\n        '''\n\n        return res\n\n    def generate(self, opts):\n        res = self.mullohi\n\n        for word_size, nwords_nrounds in self.wordsize_nwords_nrounds.items():\n            for nwords, list_nrounds in nwords_nrounds.items():\n                for nrounds in list_nrounds:\n                    res += self.gen_func(opts, nrounds, word_size, nwords)\n\n        return res\n\n\n\nclass ThreeFry(Rand):\n    name = 'threefry'\n\n    enums='''\n    enum enum_threefry32x2_rotations {\n      Rot_32x2_0 = 13,\n      Rot_32x2_1 = 15,\n      Rot_32x2_2 = 26,\n      Rot_32x2_3 = 6,\n      Rot_32x2_4 = 17,\n      Rot_32x2_5 = 29,\n      Rot_32x2_6 = 16,\n      Rot_32x2_7 = 24\n    };\n\n    enum enum_threefry32x4_rotations {\n      Rot_32x4_0_0 = 10,\n      Rot_32x4_0_2 = 26,\n      Rot_32x4_1_0 = 11,\n      Rot_32x4_1_2 = 21,\n      Rot_32x4_2_0 = 13,\n      Rot_32x4_2_2 = 27,\n      Rot_32x4_3_0 = 23,\n      Rot_32x4_3_2 = 5,\n      Rot_32x4_4_0 = 6,\n      Rot_32x4_4_2 = 20,\n      Rot_32x4_5_0 = 17,\n      Rot_32x4_5_2 = 11,\n      Rot_32x4_6_0 = 25,\n      Rot_32x4_6_2 = 10,\n      Rot_32x4_7_0 = 18,\n      Rot_32x4_7_2 = 20\n    };\n\n    enum enum_threefry64x2_rotations {\n      Rot_64x2_0 = 16,\n      Rot_64x2_1 = 42,\n      Rot_64x2_2 = 12,\n      Rot_64x2_3 = 31,\n      Rot_64x2_4 = 16,\n      Rot_64x2_5 = 32,\n      Rot_64x2_6 = 24,\n      Rot_64x2_7 = 21\n    };\n    enum enum_threefry64x4_rotations {\n\n      Rot_64x4_0_0 = 14,\n      Rot_64x4_0_2 = 16,\n      Rot_64x4_1_0 = 52,\n      Rot_64x4_1_2 = 57,\n      Rot_64x4_2_0 = 23,\n      Rot_64x4_2_2 = 40,\n      Rot_64x4_3_0 = 
5,\n      Rot_64x4_3_2 = 37,\n      Rot_64x4_4_0 = 25,\n      Rot_64x4_4_2 = 33,\n      Rot_64x4_5_0 = 46,\n      Rot_64x4_5_2 = 12,\n      Rot_64x4_6_0 = 58,\n      Rot_64x4_6_2 = 22,\n      Rot_64x4_7_0 = 32,\n      Rot_64x4_7_2 = 32\n    };\n    '''\n\n    # Following macros should not be changed to function : gcc can't inline them\n    rotations='''\n    #define SHIFT_MOD_32(x, N) ((x << (N & 31)) | (x >> ((32 - N) & 31)))\n    #define SHIFT_MOD_64(x, N) ((x << (N & 63)) | (x >> ((64 - N) & 63)))\n    '''\n\n    undef_macro='''\n    #undef SHIFT_MOD_32\n    #undef SHIFT_MOD_64\n    '''\n\n    wordsize_nwords_nrounds = {32: {2: [12, 20, 32],\n                                    4: [12, 20, 72]},\n                               64: {2: [13, 20, 32],\n                                    4: [12, 20, 72]}}\n\n    def gen_signature(self, nwords, word_size, nrounds):\n        return '''nsimd::packx{nwords}<u{word_size}> \\\n            {fun_name} \\\n            (nsimd::packx{nwords}<u{word_size}> in, \\\n            nsimd::packx{nwords}<u{word_size}> key)'''. \\\n            format(nwords=nwords, word_size = word_size,\n                   fun_name=self.gen_function_name(nwords, word_size, nrounds))\n\n    def get_key_size(self, nwords):\n        return nwords\n\n    def gen_body(self, opts, nrounds, word_size, nwords):\n        if word_size == 32:\n            initialize_keys = '''nsimd::pack<u32> ks{nwords} =\n                nsimd::set1(nsimd::pack<u32>(), 0x1BD11BDAU);'''. \\\n                        format(nwords=nwords)\n        elif word_size == 64:\n            initialize_keys = '''nsimd::pack<u64> ks{nwords} =\n            nsimd::set1(nsimd::pack<u64>(), (u64)0x1BD11BDAA9FC1A22ULL);'''. 
\\\n            format(nwords=nwords)\n\n        res = self.gen_signature(nwords, word_size, nrounds)\n\n        res += ' {{ nsimd::packx{}<u{}> out;'.format(nwords, word_size)\n\n        res += initialize_keys\n\n        initialisation_keys = '''\n        nsimd::pack<u{word_size}> ks{i};\n        ks{i} = key.v{i};\n        out.v{i} = in.v{i};\n        ks{nwords} = ks{nwords} ^ key.v{i};\n        out.v{i} = out.v{i} + key.v{i};\n        '''\n\n        for i in range(0,nwords):\n            res += initialisation_keys.format(i=i, nwords=nwords,\n                                              word_size=word_size)\n\n        for i in range(0, nrounds):\n            if nwords == 4:\n                indexes= [1 if i%2==0 else 3, 1 if i%2==1 else 3]\n\n                res += '''\n                out.v0 = out.v0 + out.v{index0};\n                out.v{index0} = SHIFT_MOD_{word_size}(out.v{index0},\n                                           Rot_{word_size}x{nwords}_{i_mod}_0);\n                out.v{index0} = out.v{index0} ^ out.v0;\n                out.v2 = out.v2 + out.v{index1};\n                out.v{index1} = SHIFT_MOD_{word_size}(out.v{index1},\n                                         Rot_{word_size}x{nwords}_{i_mod}_2);\n                out.v{index1} = out.v{index1} ^ out.v2;\n                '''.format(index0=indexes[0], index1=indexes[1], i_mod=i%8,\n                        word_size=word_size, nwords=nwords)\n            elif nwords == 2:\n                res += '''\n                out.v0 = out.v0 + out.v1;\n                out.v1 = SHIFT_MOD_{word_size}(out.v1,\n                             Rot_{word_size}x{nwords}_{i_mod});\n                out.v1 = out.v1 ^ out.v0;'''. 
\\\n                format(i_mod=i % 8, word_size=word_size, nwords=nwords)\n\n            #if (i % nwords) == nwords - 1:\n            if (i % 4) == 3:\n                d = int(i / 4 + 1)\n                res += '\\n'\n                for j in range(0, nwords):\n                    res += 'out.v{j} = out.v{j} + ks{calc};\\n'. \\\n                            format(j=j, calc=str(int((d+j)%(nwords+1))))\n\n                res += 'out.v{n} = out.v{n} + ' \\\n                       'nsimd::pack<u{word_size}>({d});\\n'. \\\n                       format(d=d, n=nwords-1, word_size=word_size)\n\n        res+='''\n            return out;\n        }\n        '''\n\n        return res\n\n    def generate(self, opts):\n        res = ''\n        res += self.enums\n        res += self.rotations\n\n        for word_size, nwords_nrounds in self.wordsize_nwords_nrounds.items():\n            for nwords, list_nrounds in nwords_nrounds.items():\n                for nrounds in list_nrounds:\n                    res += self.gen_body(opts, nrounds, word_size, nwords)\n\n        res += self.undef_macro\n\n        return res\n\ndef gen_functions(opts):\n    ## Write source files\n    #dirname = os.path.join(opts.include_dir, 'modules', 'random')\n    #common.mkdir_p(dirname)\n    #filename = os.path.join(dirname, 'functions.cpp')\n    #print(filename)\n    #with common.open_utf8(opts, filename) as out:\n    #    out.write('#include \"functions.hpp\"\\n')\n    #    out.write('{}\\n\\n'.format(common.hbar))\n    #    out.write(gen(opts))\n    #    out.write('#endif\\n')\n    #common.clang_format(opts, filename)\n\n    # Write headers\n    dirname = os.path.join(opts.include_dir, 'modules', 'random')\n    common.mkdir_p(dirname)\n    filename = os.path.join(dirname, 'functions.hpp')\n    with common.open_utf8(opts, filename) as out:\n        out.write(\n        '''#ifndef NSIMD_MODULES_RANDOM_FUNCTIONS_HPP\n           #define NSIMD_MODULES_RANDOM_FUNCTIONS_HPP\n\n           #include 
<nsimd/nsimd.h>\n           #include <nsimd/cxx_adv_api.hpp>\n           #include <nsimd/cxx_adv_api_functions.hpp>\n\n           #ifdef NSIMD_LONGLONG_IS_EXTENSION\n             #if defined(NSIMD_IS_GCC)\n               /* Not emitting the warning -Wlong-long is not possible */\n               /* with GCC <= 12. It is a bug. A workaround is to tell GCC   */\n               /* to consider this header file as a system header file so    */\n               /* that all warnings are not  emitted. This is not satisfying */\n               /* but necessary for the moment.                              */\n               #pragma GCC system_header\n               #pragma GCC diagnostic push\n               #pragma GCC diagnostic ignored \"-Wlong-long\"\n             #elif defined(NSIMD_IS_CLANG)\n               #pragma clang diagnostic push\n               #pragma clang diagnostic ignored \"-Wlong-long\"\n             #endif\n           #endif\n\n           namespace nsimd {\n           namespace random {\n\n           ''')\n\n        out.write('{}\\n\\n'.format(common.hbar))\n        for func in rand_functions:\n            out.write(func.gen_headers(opts))\n            out.write(func.generate(opts))\n\n        out.write(\n        '''#ifdef NSIMD_LONGLONG_IS_EXTENSION\n             #if defined(NSIMD_IS_GCC)\n               #pragma GCC diagnostic pop\n             #elif defined(NSIMD_IS_CLANG)\n               #pragma clang diagnostic pop\n             #endif\n           #endif\n\n           } // namespace nsimd\n           } // namespace random\n\n           #endif\n           ''')\n\n    common.clang_format(opts, filename)\n\ndef gen_tests(opts):\n    for func in rand_functions:\n        for word_size, nwords_nrounds in func.wordsize_nwords_nrounds.items():\n            for nwords, list_nrounds in nwords_nrounds.items():\n                for nrounds in list_nrounds:\n                    # Write headers\n                    dirname = os.path.join(opts.tests_dir, 'modules', 
'random')\n                    common.mkdir_p(dirname)\n                    filename = os.path.join(dirname, '{}.cpp'. \\\n                               format(func.gen_function_name(nwords, word_size,\n                                                             nrounds)))\n                    with common.open_utf8(opts, filename) as out:\n                        out.write(func.gen_tests(opts, nrounds, word_size,\n                                  nwords))\n\n                    common.clang_format(opts, filename)\n\n\n# -----------------------------------------------------------------------------\n\ndef name():\n    return 'Random number generators'\n\ndef desc():\n    return \\\n    'This module define functions that generate pseudorandom numbers using' \\\n    'algorithms described in Parallel Random Numbers: As Easy as 1,2,3, by' \\\n    'John K. Salmon, Mark A. Moraes, Ron O. Dror and David E.Shaw.'\n\ndef gen_doc(opts):\n    api =  ''\n    for func in rand_functions:\n        for word_size, nwords_nrounds in func.wordsize_nwords_nrounds.items():\n            for nwords, list_nrounds in nwords_nrounds.items():\n                for nrounds in list_nrounds:\n                    api += '- `' + func.gen_signature(nwords, word_size,\n                                                      nrounds) + '`;  \\n'\n                    api += '  Returns a random number using the ' \\\n                           '{func_name} generator\\n\\n'. \\\n                           format(func_name=func.name)\n\n    res = '''\n# NSIMD Random module overview\n\n{desc}\n\nTwo different algorithms are proposed : threefry and philox. 
Both should give\nhigh quality random number.\nThreefry is quicker on CPU, while philox is best used on GPU.\n\nBoth algorithms are counter based pseudorandom number generator, meaning that\nthey need two parameters:\n- a key, each key will generate an unique sequence,\n- a counter, which will give the different numbers in the sequence.\n\n# NSIMD Random API reference\n\n{api}\n'''.format(desc = desc(), api=api)\n\n\n    filename = common.get_markdown_file(opts, 'overview', 'random')\n    if not common.can_create_filename(opts, filename):\n        return\n    with common.open_utf8(opts, filename) as fout:\n        fout.write(res)\n\ndef doc_menu():\n    return dict()\n\n# -----------------------------------------------------------------------------\n\ndef doit(opts):\n    common.myprint(opts, 'Generating module random')\n\n    if opts.library:\n        gen_functions(opts)\n    if opts.tests:\n        gen_tests(opts)\n    if opts.doc:\n        gen_doc(opts)\n"
  },
  {
    "path": "egg/modules/spmd/hatch.py",
    "content": "# Copyright (c) 2020 Agenium Scale\n#\n# Permission is hereby granted, free of charge, to any person obtaining a copy\n# of this software and associated documentation files (the \"Software\"), to deal\n# in the Software without restriction, including without limitation the rights\n# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell\n# copies of the Software, and to permit persons to whom the Software is\n# furnished to do so, subject to the following conditions:\n#\n# The above copyright notice and this permission notice shall be included in all\n# copies or substantial portions of the Software.\n#\n# THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\n# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\n# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\n# SOFTWARE.\n\nimport os\nimport operators\nimport common\nimport gen_scalar_utilities\nimport gen_tests as nsimd_tests\n\n# -----------------------------------------------------------------------------\n# CUDA: default number of threads per block\n\ntpb = 128\ngpu_params = '(n + {}) / {}, {}'.format(tpb, tpb - 1, tpb)\n\n# -----------------------------------------------------------------------------\n# helpers\n\ndef append(s1, s2):\n    if s1 == '':\n        return s2\n    if s2 == '':\n        return s1\n    return s1 + ', ' + s2\n\nk_typ = {'i': 'k_int', 'u': 'k_uint', 'f': 'k_float'}\n\ndef get_signature(op):\n    args = ', '.join(['a{}'.format(i - 1) for i in range(1, len(op.params))])\n    if op.output_to == common.OUTPUT_TO_SAME_SIZE_TYPES or \\\n       op.name == 'to_mask':\n        args = append('to_type', args)\n    return '#define k_{}({})'.format(op.name, 
args)\n\n# -----------------------------------------------------------------------------\n\ndef gen_doc_overview(opts):\n    filename = common.get_markdown_file(opts, 'overview', 'spmd')\n    if not common.can_create_filename(opts, filename):\n        return\n    with common.open_utf8(opts, filename) as fout:\n        fout.write('''# Overview\n\n## What is SPMD?\n\nSPMD stands for *Single Program Multiple Data*. It is a programming paradigm.\nIt is used by NVIDIA CUDA. Its strengh lies in writing computation kernels.\nBasically you concentrate your attention on the kernel itself and not on\nhow to run it. An example is worth more than a long speech, let's take vector\naddition of `float`'s.\n\n```c++\nspmd_kernel_1d(add, float *dst, float *a, float *b)\n  k_store(dst, k_load(a) + k_load(b));\nspmd_kernel_end\n```\n\nIt would be written as follows for CUDA (assuming that the vector lenghts are\nmultiples of block's sizes).\n\n```c++\n__global__ add(float *dst, float *a, float *b) {\n  int i = blockIdx.x * blockDim.x + threadIdx.x;\n  dst[i] = a[i] + b[i];\n}\n```\n\nNSIMD's SPMD is a small DSL in standard C++98 that can be used to write\ncomputation kernels for GPUs (NVIDIA's and AMD's) and any SIMD units supported\nby NSIMD. 
On a more technical side, the DSL keywords are macros that:\n- translates to C-ish keywords for GPUs and\n- use masks for CPUs as Intel ISPC (<https://ispc.github.io/>).\n\nThe difference between NSIMD's SPMD is that a single code can be compiled\nto target GPUs and CPUs whereas:\n- NVIDIA CUDA only targets NVIDIA GPUs\n- AMD HIP only targets NVIDIA and AMD GPUs\n- INTEL ICP only targets Intel SIMD units and ARM NEON\n\n## Writing kernels and device functions\n\nAs for CUDA kernels you can write templated and non-templated CUDA kernels.\nDeclaring a kernel function and launching it is straight forward:\n\n```c++\nspmd_kernel_1d(kernel_name, arguments)\n  // kernel code\nspmd_kernel_end\n\nint main() {\n\n  spmd_launch_kernel_1d(kernel_name, bit_width, param,\n                        vector_size, arguments);\n\n  return 0;\n}\n```\n\nThe `bit_width` argument indicates the types width in bits that will be\navailable inside kernels. The `param` argument indicates the unroll factor for\nCPUs and the number of threads per block for GPUs. The `vector_size` argument\nindicates the vectors length passed as arguments.\n\nDevice functions can also been implemented. They are functions that will\nonly run on the device. As for kernels, they have the same restrictions.\n\n```c++\nspmd_dev_func(k_float device_func, k_float a, k_float b)\n  // Device function code\nspmd_dev_func_end\n\nspmd_kernel_1d(kernel, arguments)\n\n  // ...\n\n  spmd_call_dev_func(device_func, a, b);\n\n  // ...\n\nspmd_kernel_end\n```\n\nThe caveat with `spmd_dev_func` is that its first argument must be the return\ntype followed by the device function name.\n\nIt is also possible to write templated kernels. Due to C++ `__VA_ARGS__`\nlimitations the number of template argument is limited to one of kind\n`typename`. 
If more types or integers are to be passed to device kernels or\nfunctions they have to be boxed inside a struct.\n\n```c++\nstruct mul_t {\n  spmd_dev_func(static k_float dev_impl, k_float a, k_float b)\n    return a * b;\n  spmd_dev_func_end\n};\n\nstruct add_t {\n  spmd_dev_func(static k_float dev_impl, k_float a, k_float b)\n    return a + b;\n  spmd_dev_func_end\n};\n\n// Op is the template argument (typename Op in C++ code)\nspmd_tmpl_dev_func(k_float trampoline, Op, k_float a, k_float b)\n  return Op::template spmd_call_dev_func(dev_impl, a, b);\nspmd_dev_func_end\n\n// Op is the template argument (typename Op in C++ code)\nspmd_tmpl_kernel_1d(tmpl_kernel, Op, arguments)\n\n  // ...\n\n  spmd_call_tmpl_dev_func(trampoline, Op, a, b);\n\n  // ...\n\nspmd_kernel_end\n\nint main() {\n\n  // Kernel call for addition\n  spmd_launch_tmpl_kernel_1d(tmpl_kernel, add_t, 32, 1, N, arguments);\n\n  // Kernel call for multiplication\n  spmd_launch_tmpl_kernel_1d(tmpl_kernel, mul_t, 32, 1, N, arguments);\n\n  return 0;\n}\n```\n\n## The NSIMD SPMD C++ DSL\n\nThe DSL is of course constraint by C++ syntax and constructs. This implies\nsome strange syntax and the impossibility to use infix operator `=`.\nFor now (2020/05/16) the NSIMD SPMD DSL does only supports `if`'s, while-loops\nand `returns`. It seems that for-loops and do-while-loops cannot be nicely\nproposed, i.e. with a nice syntax, the switch-case keywords cannot be\nimplemented with a good conformence to the semantic of their C++ counterparts.\nGoto's also cannot be implemented properly.\n\n### Variables types available in kernels and device functions\n\nThe following self-explanatory variable types are available inside kernels\nand devices functions:\n\n- `k_int` for signed integers\n- `k_uint` for unsigned integers\n- `k_float` for floatting point numbers\n- `k_bool` for booleans\n\nAs explained above the bit-width of the above types are determined by the\nlaunch kernel function. 
Note that `k_float` does not exists for 8-bits types.\n\n### Load/store from/to memory\n\nGiven a pointer, the proper way to load data is to use `k_load(ptr)`. For\nstoring a value to memory `k_store` is to be used.\n\n```c++\nk_store(ptr, value);\nk_store(ptr, expression);\n```\n\nAs explained above, there is no need to compute the offset to apply to\npointers. This is hidden from the programmer.\n\n### Assignment operator (`operator=`)\n\nDue to C++ ADL (<https://en.cppreference.com/w/cpp/language/adl>) and the\nneed for keeping things simple for the compiler (which does not always mean\nsimple for the programmer) the use of infix operator `=` will not produce\na copmilation error but will give incorrect result. You should use `k_set`.\n\n```c++\nk_set(var, value);\nk_set(var, expression);\n```\n\nAs written above, `k_set` assign value or the result of an expression to a\nvariable.\n\n### if, then, else\n\nYou should not use plan C++ `if`'s or `else`'s. This will not cause compilation\nerror but will produce incorrect results at runtime. You should use `k_if`,\n`k_else`, `k_elseif` and `k_endif` instead. they have the same semantic as\ntheir C++ counterparts.\n\n```c++\nspmd_kernel_1d(if_elseif_else, float *dst, float *a_ptr)\n\n  k_float a, ret;\n  k_set(a, k_load(a_ptr));\n\n  k_if (a > 15.0f)\n\n    k_set(ret, 15.0f);\n\n  k_elseif ( a > 10.0f)\n\n    k_set(ret, 10.0f);\n\n  k_elseif ( a > 5.0f)\n\n    k_set(ret, 5.0f);\n\n  k_else\n\n    k_set(ret, 0.0f);\n\n  k_endif\n\n  k_store(dst, ret);\n\nspmd_kernel_end\n```\n\n### while loops\n\nYou should not use plan C++ `while`'s, `break`'s and `continue`'s. This will\nnot cause compilation error but will produce incorrect results at runtime.\nYou should use `k_while`, `k_break`, `k_continue` and `k_endif` instead. 
They\nhave the same semantic as their C++ counterparts.\n\n```c++\nspmd_kernel_1d(binpow, float *dst, float *a_ptr, int *p_ptr)\n\n  k_float a, ret;\n  k_set(a, k_load(a_ptr));\n  k_set(ret, 1.0f);\n  k_int p;\n  k_set(p, k_load(p_ptr));\n\n  k_while(p > 0)\n\n    k_if ((p & 1) != 0)\n\n      k_set(ret, ret * a);\n\n    k_endif\n\n    k_set(a, a * a);\n    k_set(p, p >> 1);\n\n  k_endwhile\n\n  k_store(dst, ret);\n\nspmd_kernel_end\n```\n\n### Returns\n\nReturns cannot be implemented as macros overloading is not possible in a\nstandard way with an overload taking zero arguments. So returning has to be\ndone correctly. The `k_return` keyword has the same semantic as the C++\n`return` keyword without arguments and can be used at will for kernels (as\nkernels return type is always `void`) and for device functions returning\n`void`.\n\nFor device functions returning a value it is recommanded to proceed this way:\n\n1. Declare a variable, say `ret`, to store the return value.\n2. Whereever you need to return, set the variable appropriately with `k_set`\n   and return with `k_return`.\n3. At the end of the function use `return ret;`.\n\n```c++\nspmd_dev_func(k_int func, k_int a)\n\n  k_float ret;\n\n  k_if (a == 0)\n    k_set(ret, 0);\n    k_return;\n  k_endif\n\n  k_if (a == 1)\n    k_set(ret, -1);\n    k_return;\n  k_endif\n\n  k_set(ret, a);\n\n  return ret;\n\nspmd_dev_func_end\n```\n\n## Advanced techniques and functions\n\nThis paragraph applies mainly when targeting CPUs. Using techniques described\nbelow won't affect GPUs.\n\nIf you are familiar with the SIMD technique of masking to emulate loops and\nif's you may know that `k_set` and `k_store` are implemented using respectively\n`nsimd::if_else` and `nsimd::maskz_storeu` which may incur performance\npenalties. 
When you know that a simple assignment or store is sufficient\nyou may use the unmasked variants:\n\n- `k_unmasked_set` translates into a C++ assignment.\n- `k_unmasked_store` translates into a C++ SIMD store.\n\nTheir arguments are exactly the same as `k_set` and `k_store`. Unmasked\noperations can usually be used at the beginning of device functions and also\ninside loops, on temporary variables, knowing that the result of the latter\nwon't be needed later.\n\nYou may also use C++ standard keywords and constructs. But be aware that doing\nso will apply all the same treatment too all SIMD lanes. This can be useful\nwhen the operations involved are independant of the processed data as in the\nexample below.\n\n```c++\nspmd_dev_func(k_float newton_raphson_sqrt, k_float a, k_float x0)\n  k_float ret;\n  for (int i = 0; i < 6; i++) {\n    k_unmasked_set(ret, (ret + ret * a) / 2.0f);\n  }\n  return ret;\nspmd_dev_func_end\n```\n''')\n\n# -----------------------------------------------------------------------------\n\ndef gen_doc_api(opts):\n    filename = common.get_markdown_file(opts, 'api', 'spmd')\n    if not common.can_create_filename(opts, filename):\n        return\n\n    # Build tree for api.md\n    api = dict()\n    for _, operator in operators.operators.items():\n        if not operator.has_scalar_impl:\n            continue\n        for c in operator.categories:\n            if c not in api:\n                api[c] = [operator]\n            else:\n                api[c].append(operator)\n\n    with common.open_utf8(opts, filename) as fout:\n        fout.write(\n'''# NSIMD SPMD API reference\n\nThis page contains the exhaustive API of the SPMD module. Note that most\noperators names follow the simple naming `k_[NSIMD name]` and have the same\nsemantics. This page is light, you may use CTRL+F to find the operator you\nare looking for.\n\nFor genericity on the base type you should use operator names instead of\ninfix operators, e.g. `k_add` instead of `+`. 
Indeed for `f16`'s NVIDIA CUDA\nand NSIMD do not provide overloads and therefore code using `+` will fail to\ncompile.\n\nNote that all operators accept literals and scalars. For example you may\nwrite `k_add(a, 1)` or `float s; k_add(a, s);`. This also applies when\nusing infix operators. But note that literals or scalars must have the\nsame type as the other operands.\n\n''')\n\n        for c, ops in api.items():\n            if len(ops) == 0:\n                continue\n            fout.write('\\n## {}\\n\\n'.format(c.title))\n            for op in ops:\n                fout.write('- `{}`  \\n'.format(get_signature(op)))\n                if op.cxx_operator != None:\n                    fout.write('  Infix operator: `{}` ' \\\n                               '(*for certain types only*)  \\n'.\\\n                               format(op.cxx_operator))\n                fout.write('  {}\\n\\n'.format(op.desc))\n\n# -----------------------------------------------------------------------------\n\ndef gen_tests_for_shifts(opts, t, operator):\n    op_name = operator.name\n    dirname = os.path.join(opts.tests_dir, 'modules', 'spmd')\n    common.mkdir_p(dirname)\n    filename = os.path.join(dirname, '{}.{}.cpp'.format(op_name, t))\n    if not common.can_create_filename(opts, filename):\n        return\n    with common.open_utf8(opts, filename) as out:\n        out.write(\n        '''#include <nsimd/modules/spmd.hpp>\n        #include <nsimd/modules/memory_management.hpp>\n        #include <nsimd/scalar_utilities.h>\n        #include \"../common.hpp\"\n\n        #if defined(NSIMD_CUDA)\n\n        __global__ void kernel({typ} *dst, {typ} *a0, int n, int s) {{\n          int i = threadIdx.x + blockIdx.x * blockDim.x;\n          if (i < n) {{\n            dst[i] = nsimd::gpu_{op_name}(a0[i], s);\n          }}\n        }}\n\n        void compute_result({typ} *dst, {typ} *a0, unsigned int n, int s) {{\n          kernel<<<{gpu_params}>>>(dst, a0, int(n), s);\n        }}\n\n       
 {cbprng_cuda}\n\n        #elif defined(NSIMD_ROCM)\n\n        __global__ void kernel({typ} *dst, {typ} *a0, size_t n, int s) {{\n          size_t i = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n          if (i < n) {{\n            dst[i] = nsimd::gpu_{op_name}(a0[i], s);\n          }}\n        }}\n\n        void compute_result({typ} *dst, {typ} *a0, size_t n, int s) {{\n          hipLaunchKernelGGL(kernel, {gpu_params}, 0, 0, dst, a0, n, s);\n        }}\n\n        {cbprng_hip}\n\n        #elif defined(NSIMD_ONEAPI)\n\n        inline void kernel({typ} *dst, {typ} *a0, const size_t n,\n                           const int s, sycl::nd_item<1> item) {{\n          const size_t ii = item.get_global_id().get(0);\n          if (ii < n){{\n            dst[ii] = nsimd::gpu_{op_name}(a0[ii], s);\n          }}\n        }}\n\n        void compute_result({typ} *dst, {typ} *a0, size_t n, int s) {{\n          size_t total_num_threads = (size_t)nsimd_kernel_param((int)n, {tpb});\n          sycl::queue q_ = nsimd::oneapi::default_queue();\n          q_.parallel_for(sycl::nd_range<1>(sycl::range<1>(total_num_threads),\n                                            sycl::range<1>({tpb})),\n                                            [=](sycl::nd_item<1> item){{\n              kernel(dst, a0, n, s, item);\n            }}).wait_and_throw();\n        }}\n\n        {cbprng_oneapi}\n\n        #else\n\n        void compute_result({typ} *dst, {typ} *a0, unsigned int n, int s) {{\n          for (unsigned int i = 0; i < n; i++) {{\n            dst[i] = nsimd::scalar_{op_name}(a0[i], s);\n          }}\n        }}\n\n        {cbprng_cpu}\n\n        #endif\n\n        // clang-format off\n\n        spmd_kernel_1d(kernel, {typ} *dst, {typ} *a0, int s)\n          k_store(dst, k_{op_name}(k_load(a0), s));\n        spmd_kernel_end\n\n        // clang-format on\n\n        int main() {{\n          unsigned int n_[3] = {{ 10, 1001, 10001 }};\n          for (int i = 0; i < (int)(sizeof(n_) / 
sizeof(int)); i++) {{\n            unsigned int n = n_[i];\n            for (int s = 0; s < {typnbits}; s++) {{\n              int ret = 0;\n              {typ} *a0 = nsimd::device_calloc<{typ}>(n);\n              random(a0, n, 0);\n              {typ} *ref = nsimd::device_calloc<{typ}>(n);\n              {typ} *out = nsimd::device_calloc<{typ}>(n);\n              spmd_launch_kernel_1d(kernel, {typnbits}, 1, n, out, a0, s);\n              compute_result(ref, a0, n, s);\n              if (!cmp(ref, out, n)) {{\n                ret = -1;\n              }}\n              nsimd::device_free(a0);\n              nsimd::device_free(ref);\n              nsimd::device_free(out);\n              if (ret != 0) {{\n                return ret;\n              }}\n            }}\n          }}\n          return 0;\n        }}\n        '''.format(typ=t, op_name=op_name, typnbits=t[1:], tpb=tpb,\n                   cbprng_cpu=nsimd_tests.cbprng(t, operator, 'cpu'),\n                   cbprng_cuda=nsimd_tests.cbprng(t, operator, 'cuda',\n                                                  gpu_params),\n                   cbprng_hip=nsimd_tests.cbprng(t, operator, 'hip',\n                                                 gpu_params),\n                   cbprng_oneapi=nsimd_tests.cbprng(t, operator, 'oneapi',\n                                                    ['(int)n', str(tpb)]),\n                   gpu_params=gpu_params))\n\n    common.clang_format(opts, filename, cuda=True)\n\n# -----------------------------------------------------------------------------\n\ndef gen_tests_for_cvt_reinterpret(opts, tt, t, operator):\n    op_name = operator.name\n    dirname = os.path.join(opts.tests_dir, 'modules', 'spmd')\n    common.mkdir_p(dirname)\n    filename = os.path.join(dirname, '{}.{}_{}.cpp'.format(op_name, t, tt))\n    if not common.can_create_filename(opts, filename):\n        return\n\n    with common.open_utf8(opts, filename) as out:\n        out.write(\n        '''#include 
<nsimd/modules/spmd.hpp>\n        #include <nsimd/modules/memory_management.hpp>\n        #include <nsimd/scalar_utilities.h>\n        #include \"../common.hpp\"\n\n        #if defined(NSIMD_CUDA)\n\n        __global__ void kernel({typ} *dst, {typ} *a0, int n) {{\n          int i = threadIdx.x + blockIdx.x * blockDim.x;\n          if (i < n) {{\n            dst[i] = nsimd::gpu_{op_name}({typ}(), nsimd::gpu_{op_name}(\n                         {totyp}(), a0[i]));\n          }}\n        }}\n\n        void compute_result({typ} *dst, {typ} *a0, unsigned int n) {{\n          kernel<<<{gpu_params}>>>(dst, a0, int(n));\n        }}\n\n        {cbprng_cuda}\n\n        #elif defined(NSIMD_ROCM)\n\n        __global__ void kernel({typ} *dst, {typ} *a0, size_t n) {{\n          size_t i = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n          if (i < n) {{\n            dst[i] = nsimd::gpu_{op_name}({typ}(), nsimd::gpu_{op_name}(\n                         {totyp}(), a0[i]));\n          }}\n        }}\n\n        void compute_result({typ} *dst, {typ} *a0, size_t n) {{\n          hipLaunchKernelGGL(kernel, {gpu_params}, 0, 0, dst, a0, n);\n        }}\n\n        {cbprng_hip}\n\n        #elif defined(NSIMD_ONEAPI)\n\n        inline void kernel({typ} *dst, {typ} *a0, const size_t n,\n                           sycl::nd_item<1> item) {{\n          const size_t ii = item.get_global_id().get(0);\n          if (ii < n){{\n            dst[ii] = nsimd::gpu_{op_name}({typ}(), nsimd::gpu_{op_name}(\n                              {totyp}(), a0[ii]));\n          }}\n        }}\n\n        void compute_result({typ} *dst, {typ} *a0, size_t n) {{\n          size_t total_num_threads = (size_t)nsimd_kernel_param((int)n, {tpb});\n          sycl::queue q_ = nsimd::oneapi::default_queue();\n          q_.parallel_for(sycl::nd_range<1>(sycl::range<1>(total_num_threads),\n                                            sycl::range<1>({tpb})),\n                                            [=](sycl::nd_item<1> 
item){{\n            kernel(dst, a0, n, item);\n          }}).wait_and_throw();\n        }}\n\n        {cbprng_oneapi}\n\n        #else\n\n        void compute_result({typ} *dst, {typ} *a0, unsigned int n) {{\n          for (unsigned int i = 0; i < n; i++) {{\n            dst[i] = nsimd::scalar_{op_name}({typ}(), nsimd::scalar_{op_name}(\n                         {totyp}(), a0[i]));\n          }}\n        }}\n\n        {cbprng_cpu}\n\n        #endif\n\n        // clang-format off\n\n        spmd_kernel_1d(kernel, {typ} *dst, {typ} *a0)\n          k_store(dst, k_{op_name}({k_typ}, k_{op_name}({k_totyp},\n                  k_load(a0))));\n        spmd_kernel_end\n\n        // clang-format on\n\n        int main() {{\n          unsigned int n_[3] = {{ 10, 1001, 10001 }};\n          for (int i = 0; i < (int)(sizeof(n_) / sizeof(int)); i++) {{\n            unsigned int n = n_[i];\n            int ret = 0;\n            {typ} *a0 = nsimd::device_calloc<{typ}>(n);\n            random(a0, n, 0);\n            {typ} *ref = nsimd::device_calloc<{typ}>(n);\n            {typ} *out = nsimd::device_calloc<{typ}>(n);\n            spmd_launch_kernel_1d(kernel, {typnbits}, 1, n, out, a0);\n            compute_result(ref, a0, n);\n            if (!cmp(ref, out, n)) {{\n              ret = -1;\n            }}\n            nsimd::device_free(a0);\n            nsimd::device_free(ref);\n            nsimd::device_free(out);\n            if (ret != 0) {{\n              return ret;\n            }}\n          }}\n          return 0;\n        }}\n        '''.format(typ=t, totyp=tt, op_name=op_name, typnbits=t[1:],\n                   gpu_params=gpu_params, k_typ=k_typ[t[0]], tpb=tpb,\n                   cbprng_cpu=nsimd_tests.cbprng(t, operator, 'cpu'),\n                   cbprng_cuda=nsimd_tests.cbprng(t, operator, 'cuda'),\n                   cbprng_hip=nsimd_tests.cbprng(t, operator, 'hip',\n                                                 gpu_params),\n                   
cbprng_oneapi=nsimd_tests.cbprng(t, operator, 'oneapi',\n                                                    ['(int)n', str(tpb)]),\n                   k_totyp=k_typ[tt[0]]))\n\n    common.clang_format(opts, filename, cuda=True)\n\n# -----------------------------------------------------------------------------\n\ndef gen_tests_for(opts, t, operator):\n    op_name = operator.name\n    dirname = os.path.join(opts.tests_dir, 'modules', 'spmd')\n    common.mkdir_p(dirname)\n    filename = os.path.join(dirname, '{}.{}.cpp'.format(op_name, t))\n    if not common.can_create_filename(opts, filename):\n        return\n\n    arity = len(operator.params[1:])\n    k_args = ', '.join(['{} *a{}'.format(t, i) for i in range(arity)])\n    k_call_args = ', '.join(['a{}'.format(i) for i in range(arity)])\n\n    fill_tabs = '\\n'.join(['{typ} *a{i} = nsimd::device_calloc<{typ}>(n);\\n' \\\n                           'random(a{i}, n, {i});'.format(typ=t, i=i) \\\n                           for i in range(arity)])\n\n    free_tabs = '\\n'.join(['nsimd::device_free(a{i});'. 
\\\n                           format(typ=t, i=i) for i in range(arity)])\n\n    # spmd\n    def get_cte_spmd(typ, cte):\n        if typ == 'f16':\n            return 'k_f32_to_f16((f32){})'.format(cte)\n        else:\n            return '({}){}'.format(typ, cte)\n\n    def spmd_load_code(param, typ, i):\n        if param == 'l':\n            return 'k_lt(k_load(a{}), {})'.format(i, get_cte_spmd(typ, 4))\n        if param == 'v':\n            return 'k_load(a{})'.format(i)\n\n    args = ', '.join([spmd_load_code(operator.params[i + 1], t, i) \\\n                      for i in range(arity)])\n    if op_name == 'to_mask':\n        args = k_typ[t[0]] + ', ' + args\n    if operator.params[0] == 'v':\n        k_code = 'k_store(dst, k_{}({}));'.format(op_name, args)\n    else:\n        k_code = '''k_if (k_{}({}))\n                      k_store(dst, 1);\n                    k_else\n                      k_store(dst, 0);\n                    k_endif'''.format(op_name, args)\n\n    # gpu\n    def get_cte_gpu(typ, cte, target):\n        if typ == 'f16' and target == 'cuda_rocm':\n            return '__float2half((f32){})'.format(cte)\n        else:\n            return '({}){}'.format(typ, cte)\n\n    def gpu_load_code(param, typ, i, target):\n        if param == 'l':\n            return 'nsimd::gpu_lt(a{}[i], {})'. 
\\\n                   format(i, get_cte_gpu(typ, 4, target))\n        if param == 'v':\n            return 'a{}[i]'.format(i)\n\n    args_cuda_rocm = ', '.join([gpu_load_code(operator.params[i + 1], t, i,\n                                              'cuda_rocm') \\\n                                              for i in range(arity)])\n    args_oneapi = ', '.join([gpu_load_code(operator.params[i + 1], t, i,\n                                           'oneapi') for i in range(arity)])\n    if op_name == 'to_mask':\n        args_cuda_rocm = t + '(), ' + args_cuda_rocm\n        args_oneapi = t + '(), ' + args_oneapi\n    if operator.params[0] == 'v':\n        cuda_rocm_kernel = 'dst[i] = nsimd::gpu_{}({});'. \\\n                           format(op_name, args_cuda_rocm)\n        oneapi_kernel = 'dst[i] = nsimd::gpu_{}({});'. \\\n                        format(op_name, args_oneapi)\n    else:\n        tmpl = '''if (nsimd::gpu_{}({{}})) {{{{\n                    dst[i] = {{}};\n                  }}}} else {{{{\n                    dst[i] = {{}};\n                  }}}}'''.format(op_name)\n        cuda_rocm_kernel = tmpl.format(args_cuda_rocm,\n                                       get_cte_gpu(t, 1, 'cuda_rocm'),\n                                       get_cte_gpu(t, 0, 'cuda_rocm'))\n        oneapi_kernel = tmpl.format(args_oneapi,\n                                    get_cte_gpu(t, 1, 'oneapi'),\n                                    get_cte_gpu(t, 0, 'oneapi'))\n\n    # cpu\n    def get_cte_cpu(typ, cte):\n        if typ == 'f16':\n            return 'nsimd_f32_to_f16((f32){})'.format(cte)\n        else:\n            return '({}){}'.format(typ, cte)\n\n    def cpu_load_code(param, typ, i):\n        if param == 'l':\n            return 'nsimd::scalar_lt(a{}[i], {})'. 
\\\n                   format(i, get_cte_cpu(typ, 4))\n        if param == 'v':\n            return 'a{}[i]'.format(i)\n\n    args = ', '.join([cpu_load_code(operator.params[i + 1], t, i) \\\n                      for i in range(arity)])\n    if op_name == 'to_mask':\n        args = t + '(), ' + args\n    if operator.params[0] == 'v':\n        cpu_kernel = 'dst[i] = nsimd::scalar_{}({});'.format(op_name, args)\n    else:\n        cpu_kernel = '''if (nsimd::scalar_{op_name}({args})) {{\n                          dst[i] = {one};\n                        }} else {{\n                          dst[i] = {zero};\n                        }}'''.format(op_name=op_name, args=args,\n                                     one=get_cte_cpu(t, 1),\n                                     zero=get_cte_cpu(t, 0))\n\n    comp = '!cmp(ref, out, n{})'.format('' if t in common.iutypes \\\n                                        else ', {}'.format(operator.ufp[t]))\n\n    with common.open_utf8(opts, filename) as out:\n        out.write(\n        '''#include <nsimd/modules/spmd.hpp>\n        #include <nsimd/modules/memory_management.hpp>\n        #include <nsimd/scalar_utilities.h>\n        #include \"../common.hpp\"\n\n        #if defined(NSIMD_CUDA)\n\n        __global__ void kernel({typ} *dst, {k_args}, int n) {{\n          int i = threadIdx.x + blockIdx.x * blockDim.x;\n          if (i < n) {{\n            {cuda_rocm_kernel}\n          }}\n        }}\n\n        void compute_result({typ} *dst, {k_args}, unsigned int n) {{\n          kernel<<<{gpu_params}>>>(dst, {k_call_args}, int(n));\n        }}\n\n        {cbprng_cuda}\n\n        #elif defined(NSIMD_ROCM)\n\n        __global__ void kernel({typ} *dst, {k_args}, size_t n) {{\n          size_t i = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n          if (i < n) {{\n            {cuda_rocm_kernel}\n          }}\n        }}\n\n        void compute_result({typ} *dst, {k_args}, size_t n) {{\n          hipLaunchKernelGGL(kernel, 
{gpu_params}, 0, 0, dst, {k_call_args},\n                             n);\n        }}\n\n        {cbprng_hip}\n\n        #elif defined(NSIMD_ONEAPI)\n\n        inline void kernel({typ} *dst, {k_args}, const size_t n,\n                           sycl::nd_item<1> item) {{\n          const size_t i = item.get_global_id().get(0);\n          if(i < n){{\n            {oneapi_kernel}\n          }}\n        }}\n\n        void compute_result({typ} *dst, {k_args}, size_t n) {{\n          size_t total_num_threads = (size_t)nsimd_kernel_param((int)n, {tpb});\n          sycl::queue q_ = nsimd::oneapi::default_queue();\n          q_.parallel_for(sycl::nd_range<1>(sycl::range<1>(total_num_threads),\n                                            sycl::range<1>({tpb})),\n                                            [=](sycl::nd_item<1> item){{\n            kernel(dst, {k_call_args}, n, item);\n          }}).wait_and_throw();\n        }}\n\n        {cbprng_oneapi}\n\n        #else\n\n        void compute_result({typ} *dst, {k_args}, unsigned int n) {{\n          for (unsigned int i = 0; i < n; i++) {{\n            {cpu_kernel}\n          }}\n        }}\n\n        {cbprng_cpu}\n\n        #endif\n\n        // clang-format off\n\n        spmd_kernel_1d(kernel, {typ} *dst, {k_args})\n          {k_code}\n        spmd_kernel_end\n\n        // clang-format on\n\n        #if defined(NSIMD_CUDA) || defined(NSIMD_ROCM) || defined(NSIMD_ONEAPI)\n        #define THREADS_PER_BLOCK 128\n        #else\n        #define THREADS_PER_BLOCK 1\n        #endif\n\n        int main() {{\n          unsigned int n_[3] = {{ 10, 1001, 10001 }};\n          for (int i = 0; i < (int)(sizeof(n_) / sizeof(int)); i++) {{\n            unsigned int n = n_[i];\n            int ret = 0;\n            {fill_tabs}\n            {typ} *ref = nsimd::device_calloc<{typ}>(n);\n            {typ} *out = nsimd::device_calloc<{typ}>(n);\n            spmd_launch_kernel_1d(kernel, {typnbits}, THREADS_PER_BLOCK, n,\n                      
            out, {k_call_args});\n            compute_result(ref, {k_call_args}, n);\n            if ({comp}) {{\n              ret = -1;\n            }}\n            nsimd::device_free(ref);\n            nsimd::device_free(out);\n            {free_tabs}\n            if (ret != 0) {{\n              return ret;\n            }}\n          }}\n          return 0;\n        }}\n        '''.format(typ=t, free_tabs=free_tabs, fill_tabs=fill_tabs,\n                   k_code=k_code, k_call_args=k_call_args, k_args=k_args,\n                   cpu_kernel=cpu_kernel, comp=comp,\n                   cuda_rocm_kernel=cuda_rocm_kernel,\n                   oneapi_kernel=oneapi_kernel,\n                   cbprng_cpu=nsimd_tests.cbprng(t, operator, 'cpu'),\n                   cbprng_cuda=nsimd_tests.cbprng(t, operator, 'cuda',\n                                                  gpu_params),\n                   cbprng_hip=nsimd_tests.cbprng(t, operator, 'hip',\n                                                 gpu_params),\n                   cbprng_oneapi=nsimd_tests.cbprng(t, operator, 'oneapi',\n                                                    ['(int)n', str(tpb)]),\n                   gpu_params=gpu_params, typnbits=t[1:], tpb=tpb))\n\n    common.clang_format(opts, filename, cuda=True)\n\ndef gen_tests(opts):\n    for op_name, operator in operators.operators.items():\n        if not operator.has_scalar_impl:\n            continue\n        not_closed = (operator.output_to == common.OUTPUT_TO_SAME_SIZE_TYPES \\\n                      or ('v' not in operator.params[1:] and 'l' not in\n                      operator.params[1:]))\n        for t in operator.types:\n            tts = common.get_output_types(t, operator.output_to)\n            for tt in tts:\n                if not nsimd_tests.should_i_do_the_test(operator, tt, t):\n                    continue\n                if operator.name in ['shl', 'shr', 'shra']:\n                    gen_tests_for_shifts(opts, t, operator)\n      
          elif operator.name in ['cvt', 'reinterpret', 'reinterpretl']:\n                    gen_tests_for_cvt_reinterpret(opts, tt, t, operator)\n                else:\n                    gen_tests_for(opts, t, operator)\n\n# -----------------------------------------------------------------------------\n\ndef gen_functions(opts):\n    functions = ''\n\n    for op_name, operator in operators.operators.items():\n        if not operator.has_scalar_impl:\n            continue\n\n        if operator.params[0] == 'l':\n            s_ret_typ = 'bool'\n            v_ret_typ = \\\n                'nsimd::packl<typename base_type<A0>::type, N>'\n        else:\n            s_ret_typ = 'T'\n            v_ret_typ = 'nsimd::pack<typename base_type<A0>::type, N>'\n\n        def s_typ(typ):\n            if typ == 'p':\n                return 'int'\n            if typ == 'v':\n                return 'T'\n            if typ == 'l':\n                return 'bool'\n\n        s_args = ', '.join(['{} a{}'.format(s_typ(operator.params[i]), i - 1) \\\n                            for i in range(1, len(operator.params))])\n        s_call_args = ', '.join(['a{}'.format(i - 1) \\\n                                 for i in range(1, len(operator.params))])\n        s_tmpl = 'typename T' if 'v' in operator.params[1:] else ''\n\n        def v_typ(typ, i):\n            if typ == 'p':\n                return 'int'\n            if typ in ['v', 'l']:\n                return 'A{}'.format(i)\n        v_args = ', '.join(['{} a{}'. 
\\\n                            format(v_typ(operator.params[i], i - 1), i - 1) \\\n                            for i in range(1, len(operator.params))])\n\n        def v_call_arg(typ, i):\n            if typ == 'p':\n                return '(int)a{}'.format(i)\n            if typ == 'v':\n                return 'spmd::to_pack<T, N>(a{})'.format(i)\n            if typ == 'l':\n                return 'spmd::to_packl<T, N>(a{})'.format(i)\n\n        v_call_args = ', '.join([v_call_arg(operator.params[i], i - 1) \\\n                                 for i in range(1, len(operator.params))])\n\n        v_tmpl = ', '.join(['typename A{}'.format(i - 1) \\\n                            for i in range(1, len(operator.params)) \\\n                            if operator.params[i] != 'p'])\n\n        m_call_args_cpu = s_call_args\n        m_call_args_gpu = s_call_args\n        to_type = ''\n        ToType = ''\n        v_op_name = op_name\n        s_op_name = op_name\n        template = ''\n\n        # Override for non closed operators\n        if operator.output_to == common.OUTPUT_TO_SAME_SIZE_TYPES or \\\n           op_name == 'to_mask':\n            s_ret_typ = 'ToType'\n            s_tmpl = append('typename ToType', s_tmpl)\n            m_call_args_gpu = append('to_type()', s_call_args)\n            s_call_args = append('ToType()', s_call_args)\n            v_tmpl = append('typename ToType', v_tmpl)\n            to_type = '<to_type>'\n            template = 'template '\n            v_ret_typ = 'ToType'\n            ToType = '<ToType>'\n\n        # special case for to_mask\n        if op_name == 'to_mask':\n            v_op_name = 'reinterpret'\n            v_call_args = 'to_mask({})'.format(v_call_args)\n\n        if v_tmpl != '':\n            v_tmpl = 'template <{}>'.format(v_tmpl)\n        if s_tmpl != '':\n            s_tmpl = 'template <{}>'.format(s_tmpl)\n\n        functions += \\\n        '''#if defined(NSIMD_CUDA) || defined(NSIMD_ROCM) || \\\n               
defined(NSIMD_ONEAPI)\n\n           {signature} nsimd::gpu_{s_op_name}({m_call_args_gpu})\n\n           #else\n\n           template <typename KernelType, int N> struct {op_name}_helper {{}};\n\n           template <int N> struct {op_name}_helper<spmd::KernelScalar, N> {{\n             {s_tmpl} static {s_ret_typ} impl({s_args}) {{\n               return nsimd::scalar_{s_op_name}({s_call_args});\n             }}\n           }};\n\n           template <int N> struct {op_name}_helper<spmd::KernelSIMD, N> {{\n             {v_tmpl} static {v_ret_typ} impl({v_args}) {{\n               typedef typename spmd::base_type<A0>::type T;\n               return nsimd::{v_op_name}{ToType}({v_call_args});\n             }}\n           }};\n\n           {signature} \\\\\n               spmd::{op_name}_helper<spmd_KernelType_, \\\\\n                                      spmd_N_>::{template}impl{to_type}( \\\\\n                                        {m_call_args_cpu})\n\n           #endif\n\n           {hbar}\n\n           '''.format(hbar=common.hbar, s_op_name=s_op_name, s_tmpl=s_tmpl,\n                      s_ret_typ=s_ret_typ, s_args=s_args, v_args=v_args,\n                      v_call_args=v_call_args, s_call_args=s_call_args,\n                      v_tmpl=v_tmpl, v_ret_typ=v_ret_typ, ToType=ToType,\n                      m_call_args_cpu=m_call_args_cpu, to_type=to_type,\n                      v_op_name=v_op_name, op_name=op_name, template=template,\n                      m_call_args_gpu=m_call_args_gpu,\n                      signature=get_signature(operator))\n\n    # Write the code to file\n    dirname = os.path.join(opts.include_dir, 'modules', 'spmd')\n    common.mkdir_p(dirname)\n    filename = os.path.join(dirname, 'functions.hpp')\n    if not common.can_create_filename(opts, filename):\n        return\n    with common.open_utf8(opts, filename) as out:\n        out.write('#ifndef NSIMD_MODULES_SPMD_FUNCTIONS_HPP\\n')\n        out.write('#define 
NSIMD_MODULES_SPMD_FUNCTIONS_HPP\\n\\n')\n        out.write('namespace spmd {\\n\\n')\n        out.write('{}\\n\\n'.format(common.hbar))\n        out.write(functions)\n        out.write('} // namespace spmd\\n\\n')\n        out.write('#endif\\n')\n    common.clang_format(opts, filename)\n\n# -----------------------------------------------------------------------------\n\ndef name():\n    return 'SPMD programming'\n\ndef desc():\n    return '''SPMD programming allows the programmer to focus on kernels and\nthe compiler to vectorize kernel code more effectively. Basically this\nmodule provides a \"à la CUDA\" programming C++ DSL to targets CPU SIMD as well\nas Intel, NVIDIA and AMD GPUs.'''\n\ndef doc_menu():\n    return {'Overview': 'overview', 'API reference': 'api'}\n\n# -----------------------------------------------------------------------------\n\ndef doit(opts):\n    common.myprint(opts, 'Generating module spmd')\n    if opts.library:\n        gen_functions(opts)\n    if opts.tests:\n        gen_tests(opts)\n    if opts.doc:\n        gen_doc_api(opts)\n        gen_doc_overview(opts)\n"
  },
  {
    "path": "egg/modules/tet1d/hatch.py",
    "content": "# Copyright (c) 2021 Agenium Scale\n#\n# Permission is hereby granted, free of charge, to any person obtaining a copy\n# of this software and associated documentation files (the \"Software\"), to deal\n# in the Software without restriction, including without limitation the rights\n# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell\n# copies of the Software, and to permit persons to whom the Software is\n# furnished to do so, subject to the following conditions:\n#\n# The above copyright notice and this permission notice shall be included in all\n# copies or substantial portions of the Software.\n#\n# THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\n# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\n# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\n# SOFTWARE.\n\nimport os\nimport operators\nimport common\nimport gen_scalar_utilities\nimport gen_tests as nsimd_tests\n\n# -----------------------------------------------------------------------------\n# CUDA: default number of threads per block\n\ntpb = 128\ngpu_params = '(n + {}) / {}, {}'.format(tpb, tpb - 1, tpb)\n\ndef is_not_closed(operator):\n    return (operator.output_to == common.OUTPUT_TO_SAME_SIZE_TYPES \\\n            or ('v' not in operator.params[1:] and 'l' not in\n            operator.params[1:]))\n\n# -----------------------------------------------------------------------------\n\ndef gen_doc_overview(opts):\n    filename = common.get_markdown_file(opts, 'overview', 'tet1d')\n    if not common.can_create_filename(opts, filename):\n        return\n    with common.open_utf8(opts, filename) as fout:\n        fout.write('''# Overview\n\n## What are 
expression templates?\n\nExpression templates are a C++ template metaprogramming technique that\nessentially allows high level programming for loop fusion. Take the following\nexemple.\n\n```c++\nstd::vector<float> operator+(std::vector<float> const &a,\n                             std::vector<float> const &b) {{\n  std::vector<float> ret(a.size());\n  for (size_t i = 0; i < a.size(); i++) {{\n    ret[i] = a[i] + b[i];\n  }}\n  return ret;\n}}\n\nint main() {{\n  std::vector<float> a, b, c, d, sum;\n\n  ...\n\n  sum = a + b + c + d;\n\n  ...\n\n  return 0;\n}}\n```\n\nThe expression `a + b + c + d` involves three calls to `operator+` and at least\nnine memory passes are necessary. This can be optimized as follows.\n\n```c++\nint main() {{\n  std::vector<float> a, b, c, d, sum;\n\n  ...\n\n  for (size_t i = 0; i < a.size(); i++) {{\n    ret[i] = a[i] + b[i] + c[i] + d[i];\n  }}\n\n  ...\n\n  return 0;\n}}\n```\n\nThe rewriting above requires only four memory passes which is of course better\nbut as humans we prefer the writing `a + b + c + d`. Expression templates\nsolves exactly this problem and allows the programmer to write `a + b + c + d`\nand the compiler to see the loop written above.\n\n## Expressions templates with NSIMD\n\nThis module provides expression templates on top of NSIMD core. As a\nconsequence the loops seen by the compiler deduced from the high-level\nexpressions are optimized using SIMD instructions. Note also that NVIDIA and\nAMD GPUs are supported through CUDA and ROCm/HIP. 
The API for expression\ntemplates in NSIMD is C++98 compatible and is able to work with any container\nas its only requirement for data is that it must be contiguous.\n\nAll inputs to an expression must be declared using `tet1d::in` while the\noutput must be declared using `tet1d::out`.\n\n```c++\nint main() {{\n  std::vector<float> a, b, c;\n\n  ...\n\n  tet1d::out(a) = tet1d::in(&a[0], a.size()) + tet1d::in(&b[0], b.size());\n\n  ...\n\n  return 0;\n}}\n```\n\n- `template <typename T, typename I> inline node in(const T *data, I sz);`{nl}\n  Construct an input for expression templates starting at address `data` and\n  containing `sz` elements. The return type of this functin `node` can be used\n  with the help of the `TET1D_IN(T)` macro where `T` if the underlying type of\n  data (ints, floats, doubles...).\n\n- `template <typename T> node out(T *data);`{nl}\n  Construct an output for expression templates starting at address `data`. Note\n  that memory must be allocated by the user before passing it to the expression\n  template engine. The output type can be used with the `TET1D_OUT(T)` where\n  `T` is the underlying type (ints, floats, doubles...).\n\nNote that it is possible to pass parameters to the expression template engine\nto specify the number of threads per block for GPUs or the SIMD extension to\nuse...\n\n- `template <typename T, typename Pack> node out(T *data, int\n  threads_per_block, void *stream);`{nl}\n  Construct an output for expression templates starting at address `data`. Note\n  that memory must be allocated by the user before passing it to the expression\n  template engine. The `Pack` parameter is useful when compiling for CPUs. The\n  type is `nsimd::pack<...>` allowing the developper to specify all details\n  about the NSIMD packs that will be used by the expression template engine.\n  The `threads_per_block` and `stream` arguments are used only when compiling\n  for GPUs. Their meaning is contained in their names. 
The output type can be\n  used with the `TET1D_OUT_EX(T, N, SimdExt)` where `T` is the underlying type\n  (ints, floats, doubles...), `N` is the unroll factor and `SimdExt` the SIMD\n  extension.\n\nMoreover a MATLAB-like syntax is provided. One can select a subrange of given\ninput. Indexes are understood as for Python: -1 represents the last element.\nThe contant `tet1d::end = -1` allows one to write portable code.\n\n```c++\nint main() {{\n  std::vector<float> a, b, c;\n\n  ...\n\n  TET1D_IN(float) va = tet1d::in(&a[0], a.size());\n  TET1D_IN(float) vb = tet1d::in(&b[0], b.size());\n  tet1d::out(c) = va(10, tet1d::end - 10) + vb;\n\n  ...\n\n  return 0;\n}}\n```\n\nOne can also specify which elements of the output must be rewritten with\nthe following syntax.\n\n```c++\nint main() {{\n  std::vector<float> a, b, c;\n\n  ...\n\n  TET1D_IN(float) va = tet1d::in(&a[0], a.size());\n  TET1D_IN(float) vb = tet1d::in(&b[0], b.size());\n  TET1D_OUT(float) vc = tet1d::out(&c[0]);\n  vc(va >= 10 && va < 20) = vb;\n\n  ...\n\n  return 0;\n}}\n```\n\nIn the exemple above, element `i` in `vc` is written only if `va[i] >= 10` and\n`va[i] < 20`. The expression appearing in the parenthesis can contain\narbitrary expression templates as soon as the underlying type is `bool`.\n\n## Warning using `auto`\n\nUsing auto can lead to surprising results. We advice you never to use auto\nwhen dealing with expression templates. Indeed using `auto` will make the\nvariable an obscure type representing the computation tree of the expression\ntemplate. This implies that you won't be able to get data from this variable\ni.e. get the `.data` member for exemple. 
Again this variable or its type cannot\nbe used in template arguments where you need it.\n'''.format(nl='  '))\n\n# -----------------------------------------------------------------------------\n\ndef gen_doc_api(opts):\n    filename = common.get_markdown_file(opts, 'api', 'tet1d')\n    if not common.can_create_filename(opts, filename):\n        return\n\n    # Build tree for api.md\n    api = dict()\n    for _, operator in operators.operators.items():\n        if not operator.has_scalar_impl:\n            continue\n        for c in operator.categories:\n            if c not in api:\n                api[c] = [operator]\n            else:\n                api[c].append(operator)\n\n    def get_signature(op):\n        def get_type(typ):\n            if typ == 'p':\n                return 'int'\n            elif typ == 'v':\n                return 'ExprNumber'\n            elif typ == 'l':\n                return 'ExprBool'\n        ret = get_type(op.params[0]) + ' ' + op.name + '('\n        if is_not_closed(op):\n            ret += 'ToType' + (', ' if len(op.params[1:]) > 0 else '')\n        ret += ', '.join(['{{t}} {{in{i}}}'.format(i=i). \\\n                          format(t=get_type(op.params[i + 1]), in0=common.in0,\n                          in1=common.in1, in2=common.in2, in3=common.in3) \\\n                          for i in range(len(op.params[1:]))])\n        ret += ');'\n        return ret\n\n    with common.open_utf8(opts, filename) as fout:\n        fout.write(\n'''# NSIMD TET1D API reference\n\nThis page contains the exhaustive API of the TET1D module. Note that most\noperators names follow their NSIMD counterparts and have the same\nsemantics. This page is light, you may use CTRL+F to find the operator you\nare looking for.\n\nNote that all operators accept literals and scalars. For example you may\nwrite `tet1d::add(a, 1)`. This also applies when using infix operators. 
Note\nthat literals or scalars of different types can be used with expression\ninvolving other types.\n\nIn all signature below the following pseudo types are used for simplification:\n- `ExprNumber` to designate an existing expression template on signed, unsigned\n  integers of floatting point types or a scalar of signed, unsigned integers or\n  floatting point types.\n- `ExprBool` to designate an existing expression template over booleans or\n  a boolean.\n- `ToType` to designate a base type (signed, unsigned integers or floatting\n  point types) and is used when a change in type is requested for example\n  when converting data.\n\n''')\n\n        for c, ops in api.items():\n            if len(ops) == 0:\n                continue\n            fout.write('\\n## {}\\n\\n'.format(c.title))\n            for op in ops:\n                fout.write('- `{}`  \\n'.format(get_signature(op)))\n                if op.cxx_operator != None:\n                    fout.write('  Infix operator: `{}`  \\n'. 
\\\n                               format(op.cxx_operator[8:]))\n                fout.write('  {}\\n\\n'.format(op.desc))\n\n# -----------------------------------------------------------------------------\n\ndef gen_tests_for_shifts(opts, t, operator):\n    op_name = operator.name\n    dirname = os.path.join(opts.tests_dir, 'modules', 'tet1d')\n    common.mkdir_p(dirname)\n    filename = os.path.join(dirname, '{}.{}.cpp'.format(op_name, t))\n    if not common.can_create_filename(opts, filename):\n        return\n    with common.open_utf8(opts, filename) as out:\n        out.write(\n        '''#include <nsimd/modules/tet1d.hpp>\n        #include <nsimd/modules/memory_management.hpp>\n        #include \"../common.hpp\"\n\n        #if defined(NSIMD_CUDA)\n\n        __global__ void kernel({t} *dst, {t} *tab0, int n, int s) {{\n          int i = threadIdx.x + blockIdx.x * blockDim.x;\n          if (i < n) {{\n            dst[i] = nsimd::gpu_{op_name}(tab0[i], s);\n          }}\n        }}\n\n        void compute_result({t} *dst, {t} *tab0, unsigned int n, int s) {{\n          kernel<<<{gpu_params}>>>(dst, tab0, int(n), s);\n        }}\n\n        {cbprng_cuda}\n\n        #elif defined(NSIMD_ROCM)\n\n        __global__ void kernel({t} *dst, {t} *tab0, size_t n, int s) {{\n          size_t i = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n          if (i < n) {{\n            dst[i] = nsimd::gpu_{op_name}(tab0[i], s);\n          }}\n        }}\n\n        void compute_result({t} *dst, {t} *tab0, size_t n, int s) {{\n          hipLaunchKernelGGL(kernel, {gpu_params}, 0, 0, dst, tab0, n, s);\n        }}\n\n        {cbprng_hip}\n\n        #elif defined(NSIMD_ONEAPI)\n\n        inline void kernel({t} *dst, {t} *tab0, const size_t n,\n                           const int s, sycl::nd_item<1> item) {{\n          size_t ii = item.get_global_id().get(0);\n          if (ii < n){{\n            dst[ii] = nsimd::gpu_{op_name}(tab0[ii], s);\n          }}\n        }}\n\n        void 
compute_result({t} *dst, {t} *tab0, size_t n, int s) {{\n          size_t total_num_threads = (size_t)nsimd_kernel_param((int)n, {tpb});\n          sycl::queue q_ = nsimd::oneapi::default_queue();\n          q_.parallel_for(sycl::nd_range<1>(sycl::range<1>(total_num_threads),\n                                            sycl::range<1>({tpb})),\n                                            [=](sycl::nd_item<1> item){{\n                                              kernel(dst, tab0, n, s, item);\n                                            }}).wait_and_throw();\n        }}\n\n        {cbprng_oneapi}\n\n        #else\n\n        void compute_result({t} *dst, {t} *tab0, unsigned int n, int s) {{\n          for (unsigned int i = 0; i < n; i++) {{\n            dst[i] = nsimd_scalar_{op_name}_{t}(tab0[i], s);\n          }}\n        }}\n\n        {cbprng_cpu}\n\n        #endif\n\n        int main() {{\n          unsigned int n_[3] = {{ 10, 1001, 10001 }};\n          for (int i = 0; i < (int)(sizeof(n_) / sizeof(int)); i++) {{\n            unsigned int n = n_[i];\n            for (int s = 0; s < {typnbits}; s++) {{\n              int ret = 0;\n              {t} *tab0 = nsimd::device_calloc<{t}>(n);\n              random(tab0, n, 0);\n              {t} *ref = nsimd::device_calloc<{t}>(n);\n              {t} *out = nsimd::device_calloc<{t}>(n);\n              compute_result(ref, tab0, n, s);\n              tet1d::out(out) = tet1d::{op_name}(tet1d::in(tab0, n), s);\n              if (!cmp(ref, out, n)) {{\n                ret = -1;\n              }}\n              nsimd::device_free(ref);\n              nsimd::device_free(out);\n              nsimd::device_free(tab0);\n              if (ret != 0) {{\n                return ret;\n              }}\n            }}\n          }}\n          return 0;\n        }}\n        '''.format(gpu_params=gpu_params, op_name=op_name, t=t,\n                   typnbits=t[1:], tpb=tpb,\n                   cbprng_cpu=nsimd_tests.cbprng(t, operator, 
'cpu'),\n                   cbprng_cuda=nsimd_tests.cbprng(t, operator, 'cuda',\n                                                  gpu_params),\n                   cbprng_hip=nsimd_tests.cbprng(t, operator, 'hip',\n                                                 gpu_params),\n                   cbprng_oneapi=nsimd_tests.cbprng(t, operator, 'oneapi',\n                                                    ['(int)n', str(tpb)])))\n    common.clang_format(opts, filename, cuda=True)\n\ndef gen_tests_for(opts, tt, t, operator):\n    op_name = operator.name\n    dirname = os.path.join(opts.tests_dir, 'modules', 'tet1d')\n    common.mkdir_p(dirname)\n    filename = os.path.join(dirname, '{}.{}.cpp'.format(op_name,\n               t if t == tt else '{}_{}'.format(t, tt)))\n    if not common.can_create_filename(opts, filename):\n        return\n\n    arity = len(operator.params[1:])\n    args_tabs = ', '.join(['{typ} *tab{i}'.format(typ=t, i=i) \\\n                           for i in range(arity)])\n    args_tabs_call = ', '.join(['tab{i}'.format(i=i) \\\n                                for i in range(arity)])\n    args_tabs_i_call = ', '.join(['tab{i}[i]'.format(i=i) \\\n                                  for i in range(arity)])\n    args_in_tabs_call = ', '.join(['tet1d::in(tab{i}, n)'. \\\n                                   format(i=i) \\\n                                   for i in range(arity)])\n\n    fill_tabs = '\\n'.join(['{typ} *tab{i} = nsimd::device_calloc<{typ}>(n);\\n' \\\n                           'random(tab{i}, n, {i});'.format(typ=t, i=i) \\\n                           for i in range(arity)])\n\n    free_tabs = '\\n'.join(['nsimd::device_free(tab{i});'. 
\\\n                           format(typ=t, i=i) for i in range(arity)])\n\n    zero = '{}(0)'.format(t) if t != 'f16' else '{f32_to_f16}(0.0f)'\n    one = '{}(1)'.format(t) if t != 'f16' else '{f32_to_f16}(1.0f)'\n    comp_tab0_to_1 = 'tab0[i] == {}(1)'.format(t) if t != 'f16' else \\\n                     '{f16_to_f32}(tab0[i]) == 1.0f'\n    comp_tab1_to_1 = 'tab1[i] == {}(1)'.format(t) if t != 'f16' else \\\n                     '{f16_to_f32}(tab1[i]) == 1.0f'\n\n    if op_name == 'cvt':\n        tet1d_code = \\\n            '''tet1d::out(out) = tet1d::cvt<{t}>(tet1d::cvt<{tt}>(\n                                     tet1d::in(tab0, n)));'''. \\\n                                     format(t=t, tt=tt)\n        compute_result_kernel = \\\n            '''dst[i] = nsimd::{{p}}_cvt({t}(), nsimd::{{p}}_cvt(\n                            {tt}(), tab0[i]));'''.format(t=t, tt=tt)\n    elif op_name == 'reinterpret':\n        tet1d_code = \\\n            '''tet1d::out(out) = tet1d::reinterpret<{t}>(\n                                     tet1d::reinterpret<{tt}>(tet1d::in(\n                                         tab0, n)));'''.format(t=t, tt=tt)\n        compute_result_kernel = \\\n            '''dst[i] = nsimd::{{p}}_reinterpret({t}(),\n                            nsimd::{{p}}_reinterpret({tt}(),\n                                tab0[i]));'''.format(t=t, tt=tt)\n    elif op_name in ['to_mask', 'to_logical']:\n        tet1d_code = \\\n            '''tet1d::out(out) = tet1d::to_mask(tet1d::to_logical(tet1d::in(\n                                     tab0, n)));'''\n        compute_result_kernel = \\\n            '''dst[i] = nsimd::{{p}}_to_mask({t}(),\n                            nsimd::{{p}}_to_logical(tab0[i]));'''. \\\n                            format(t=t)\n    elif operator.params == ['v'] * len(operator.params):\n        compute_result_kernel = \\\n            'dst[i] = nsimd::{{p}}_{op_name}({args_tabs_i_call});'. 
\\\n            format(op_name=op_name, args_tabs_i_call=args_tabs_i_call)\n        if operator.cxx_operator != None:\n            if len(operator.params[1:]) == 1:\n                tet1d_code = 'tet1d::out(out) = {cxx_op}tet1d::in(tab0, n);'. \\\n                             format(cxx_op=operator.cxx_operator)\n            else:\n                tet1d_code = 'tet1d::out(out) = tet1d::in(tab0, n) {cxx_op} ' \\\n                             'tet1d::in(tab1, n);'. \\\n                             format(cxx_op=operator.cxx_operator)\n        else:\n            tet1d_code = \\\n                'tet1d::out(out) = tet1d::{op_name}({args_in_tabs_call});'. \\\n                format(op_name=op_name, args_in_tabs_call=args_in_tabs_call)\n    elif operator.params == ['l', 'v', 'v']:\n        if operator.cxx_operator != None:\n            cond = 'A {} B'.format(operator.cxx_operator)\n        else:\n            cond = 'tet1d::{}(A, B)'.format(op_name)\n        tet1d_code = \\\n            '''TET1D_OUT({typ}) Z = tet1d::out(out);\n               TET1D_IN({typ}) A = tet1d::in(tab0, n);\n               TET1D_IN({typ}) B = tet1d::in(tab1, n);\n               Z({cond}) = 1;'''.format(cond=cond, typ=t)\n        compute_result_kernel = \\\n            '''if (nsimd::{{p}}_{op_name}(tab0[i], tab1[i])) {{{{\n                 dst[i] = {one};\n               }}}} else {{{{\n                 dst[i] = {zero};\n               }}}}'''.format(op_name=op_name, typ=t, one=one, zero=zero)\n    elif operator.params == ['l'] * len(operator.params):\n        if len(operator.params[1:]) == 1:\n            if operator.cxx_operator != None:\n                cond = '{}(A == 1)'.format(operator.cxx_operator)\n            else:\n                cond = 'tet1d::{}(A == 1)'.format(op_name)\n            tet1d_code = \\\n                '''TET1D_OUT({typ}) Z = tet1d::out(out);\n                   TET1D_IN({typ}) A = tet1d::in(tab0, n);\n                   Z({cond}) = 1;'''.format(cond=cond, typ=t)\n         
   compute_result_kernel = \\\n                '''if (nsimd::{{p}}_{op_name}({comp_tab0_to_1})) {{{{\n                     dst[i] = {one};\n                   }}}} else {{{{\n                     dst[i] = {zero};\n                   }}}}'''.format(op_name=op_name, typ=t, one=one, zero=zero,\n                                  comp_tab0_to_1=comp_tab0_to_1)\n        if len(operator.params[1:]) == 2:\n            if operator.cxx_operator != None:\n                cond = '(A == 1) {} (B == 1)'.format(operator.cxx_operator)\n            else:\n                cond = 'tet1d::{}(A == 1, B == 1)'.format(op_name)\n            tet1d_code = \\\n                '''TET1D_OUT({typ}) Z = tet1d::out(out);\n                   TET1D_IN({typ}) A = tet1d::in(tab0, n);\n                   TET1D_IN({typ}) B = tet1d::in(tab1, n);\n                   Z({cond}) = 1;'''.format(cond=cond, typ=t)\n            compute_result_kernel = \\\n                '''if (nsimd::{{p}}_{op_name}({comp_tab0_to_1},\n                                              {comp_tab1_to_1})) {{{{\n                     dst[i] = {one};\n                   }}}} else {{{{\n                     dst[i] = {zero};\n                   }}}}'''.format(op_name=op_name, typ=t, one=one, zero=zero,\n                                  comp_tab0_to_1=comp_tab0_to_1,\n                                  comp_tab1_to_1=comp_tab1_to_1)\n    else:\n        raise Exception('Unsupported operator: \"{}\"'.format(op_name))\n\n    cpu_kernel = compute_result_kernel.format(p='scalar',\n                                              f32_to_f16='nsimd_f32_to_f16',\n                                              f16_to_f32='nsimd_f16_to_f32')\n    cuda_rocm_kernel = compute_result_kernel.format(p='gpu',\n                                                    f32_to_f16='__float2half',\n                                                    f16_to_f32='__half2float')\n    oneapi_kernel = compute_result_kernel.format(p='gpu',\n                                    
             f32_to_f16='(f16)',\n                                                 f16_to_f32='(f32)')\n\n    comp = '!cmp(ref, out, n{})'.format('' if t in common.iutypes \\\n                                        else ', {}'.format(operator.ufp[t]))\n\n    with common.open_utf8(opts, filename) as out:\n        out.write(\n        '''#include <nsimd/modules/tet1d.hpp>\n        #include <nsimd/modules/memory_management.hpp>\n        #include \"../common.hpp\"\n\n        #if defined(NSIMD_CUDA)\n\n        __global__ void kernel({typ} *dst, {args_tabs}, int n) {{\n          int i = threadIdx.x + blockIdx.x * blockDim.x;\n          if (i < n) {{\n            {cuda_rocm_kernel}\n          }}\n        }}\n\n        void compute_result({typ} *dst, {args_tabs}, unsigned int n) {{\n          kernel<<<{gpu_params}>>>(dst, {args_tabs_call}, int(n));\n        }}\n\n        {cbprng_cuda}\n\n        #elif defined(NSIMD_ROCM)\n\n        __global__ void kernel({typ} *dst, {args_tabs}, size_t n) {{\n          size_t i = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n          if (i < n) {{\n            {cuda_rocm_kernel}\n          }}\n        }}\n\n        void compute_result({typ} *dst, {args_tabs}, size_t n) {{\n          hipLaunchKernelGGL(kernel, {gpu_params}, 0, 0, dst, {args_tabs_call},\n                             n);\n        }}\n\n        {cbprng_hip}\n\n        #elif defined(NSIMD_ONEAPI)\n\n        inline void kernel({typ} *dst, {args_tabs}, const size_t n,\n                           sycl::nd_item<1> item) {{\n          size_t i = item.get_global_id().get(0);\n          if (i < n) {{\n            {oneapi_kernel}\n          }}\n        }}\n\n        void compute_result({typ} *dst, {args_tabs}, const size_t n) {{\n\t  size_t total_num_threads = (size_t)nsimd_kernel_param((int)n, {tpb});\n\t  sycl::queue q_ = nsimd::oneapi::default_queue();\n\t  q_.parallel_for(sycl::nd_range<1>(sycl::range<1>(total_num_threads),\n\t                                    
sycl::range<1>({tpb})),\n\t                                    [=](sycl::nd_item<1> item){{\n            kernel(dst, {args_tabs_call}, n, item);\n          }}).wait_and_throw();\n        }}\n\n        {cbprng_oneapi}\n\n        #else\n\n        void compute_result({typ} *dst, {args_tabs}, unsigned int n) {{\n          for (unsigned int i = 0; i < n; i++) {{\n            {cpu_kernel}\n          }}\n        }}\n\n        {cbprng_cpu}\n\n        #endif\n\n        int main() {{\n          unsigned int n_[3] = {{ 10, 1001, 10001 }};\n          for (int i = 0; i < (int)(sizeof(n_) / sizeof(int)); i++) {{\n            unsigned int n = n_[i];\n            int ret = 0;\n            {fill_tabs}\n            {typ} *ref = nsimd::device_calloc<{typ}>(n);\n            {typ} *out = nsimd::device_calloc<{typ}>(n);\n            compute_result(ref, {args_tabs_call}, n);\n            {tet1d_code}\n            if ({comp}) {{\n              ret = -1;\n            }}\n            nsimd::device_free(ref);\n            nsimd::device_free(out);\n            {free_tabs}\n            if (ret != 0) {{\n              return ret;\n            }}\n          }}\n          return 0;\n        }}\n        '''.format(typ=t, args_tabs=args_tabs, fill_tabs=fill_tabs,\n                   args_tabs_call=args_tabs_call, gpu_params=gpu_params,\n                   free_tabs=free_tabs, tet1d_code=tet1d_code, comp=comp,\n                   cpu_kernel=cpu_kernel, tpb=tpb,\n                   cuda_rocm_kernel=cuda_rocm_kernel,\n                   oneapi_kernel=oneapi_kernel,\n                   cbprng_cpu=nsimd_tests.cbprng(t, operator, 'cpu'),\n                   cbprng_cuda=nsimd_tests.cbprng(t, operator, 'cuda',\n                                                  gpu_params),\n                   cbprng_hip=nsimd_tests.cbprng(t, operator, 'hip',\n                                                 gpu_params),\n                   cbprng_oneapi=nsimd_tests.cbprng(t, operator, 'oneapi',\n                            
                        ['(int)n', str(tpb)])))\n\n    common.clang_format(opts, filename, cuda=True)\n\ndef gen_tests(opts):\n    for op_name, operator in operators.operators.items():\n        if not operator.has_scalar_impl:\n            continue\n        for t in operator.types:\n            tts = common.get_output_types(t, operator.output_to)\n            for tt in tts:\n                if not nsimd_tests.should_i_do_the_test(operator, tt, t):\n                    continue\n                if operator.name in ['shl', 'shr', 'shra']:\n                    gen_tests_for_shifts(opts, t, operator)\n                else:\n                    gen_tests_for(opts, tt, t, operator)\n\n# -----------------------------------------------------------------------------\n\ndef gen_functions(opts):\n    functions = ''\n\n    for op_name, operator in operators.operators.items():\n        if not operator.has_scalar_impl:\n            continue\n\n        not_closed = is_not_closed(operator)\n        not_closed_tmpl_args = 'typename ToType, ' if not_closed else ''\n        not_closed_tmpl_params = 'ToType' if not_closed else 'none_t'\n\n        if op_name in ['shl', 'shr', 'shra']:\n            tmpl_args = 'typename Left'\n            tmpl_params = 'Left, none_t, none_t'\n            size = 'return left.size();'\n            args = 'Left const &left, int s'\n            members = 'Left left; int s;'\n            members_assignment = 'ret.left = to_node(left); ret.s = s;'\n            to_node_type = 'typename to_node_t<Left>::type, none_t, none_t'\n        elif len(operator.params) == 2:\n            tmpl_args = not_closed_tmpl_args + 'typename Left'\n            tmpl_params = 'Left, none_t, ' + not_closed_tmpl_params\n            size = 'return left.size();'\n            args = 'Left const &left'\n            members = 'Left left;'\n            members_assignment = 'ret.left = to_node(left);'\n            to_node_type = 'typename to_node_t<Left>::type, none_t, none_t'\n        elif 
len(operator.params) == 3:\n            tmpl_args = 'typename Left, typename Right'\n            tmpl_params = 'Left, Right, none_t'\n            size = 'return compute_size(left.size(), right.size());'\n            args = 'Left const &left, Right const &right'\n            members = 'Left left;\\nRight right;'\n            members_assignment = '''ret.left = to_node(left);\n                                    ret.right = to_node(right);'''\n            to_node_type = 'typename to_node_t<Left>::type, ' \\\n                           'typename to_node_t<Right>::type, none_t'\n        elif len(operator.params) == 4:\n            tmpl_args = 'typename Left, typename Right, typename Extra'\n            tmpl_params = 'Left, Right, Extra'\n            size = \\\n            'return compute_size(left.size(), right.size(), extra.size());'\n            args = 'Left const &left, Right const &right, Extra const &extra'\n            members = 'Left left;\\nRight right;\\nExtra extra;'\n            members_assignment = '''ret.left = to_node(left);\n                                    ret.right = to_node(right);\n                                    ret.extra = to_node(extra);'''\n            to_node_type = 'typename to_node_t<Left>::type, ' \\\n                           'typename to_node_t<Right>::type, ' \\\n                           'typename to_node_t<Extra>::type'\n\n        if operator.returns == 'v':\n            to_pack = 'to_pack_t'\n            return_type = 'out_type'\n        else:\n            to_pack = 'to_packl_t'\n            return_type = 'bool'\n\n        if not_closed:\n            to_typ_arg = 'out_type(), '\n            to_typ_tmpl_arg = '<typename {to_pack}<out_type, Pack>::type>'. 
\\\n                              format(to_pack=to_pack)\n            in_out_typedefs = '''typedef typename Left::out_type in_type;\n                                 typedef ToType out_type;'''\n            to_node_type = 'typename to_node_t<Left>::type, none_t, ToType'\n        else:\n            to_typ_arg = '' if op_name != 'to_mask' else 'out_type(), '\n            to_typ_tmpl_arg = ''\n            in_out_typedefs = '''typedef typename Left::out_type in_type;\n                                 typedef typename Left::out_type out_type;'''\n\n        impl_args = 'left.{cpu_gpu}_get{tmpl}(i)'\n        if (len(operator.params[1:]) >= 2):\n            if operator.params[2] == 'p':\n                impl_args += ', s'\n            else:\n                impl_args += ', right.{cpu_gpu}_get{tmpl}(i)'\n        if (len(operator.params[1:]) >= 3):\n            impl_args += ', extra.{cpu_gpu}_get{tmpl}(i)'\n\n        impl_scalar = 'return nsimd::scalar_{}({}{});'. \\\n                      format(op_name, to_typ_arg,\n                             impl_args.format(cpu_gpu='scalar', tmpl=''))\n\n        impl_gpu = 'return nsimd::gpu_{}({}{});'. \\\n                   format(op_name, to_typ_arg,\n                          impl_args.format(cpu_gpu='gpu', tmpl=''))\n\n        impl_simd = 'return nsimd::{}{}({});'. 
\\\n                      format(op_name, to_typ_tmpl_arg,\n                             impl_args.format(cpu_gpu='template simd',\n                                              tmpl='<Pack>'))\n\n        functions += \\\n        '''struct {op_name}_t {{}};\n\n        template <{tmpl_args}>\n        struct node<{op_name}_t, {tmpl_params}> {{\n          {in_out_typedefs}\n\n          {members}\n\n          nsimd::nat size() const {{\n            {size}\n          }}\n\n        #if defined(NSIMD_CUDA) || defined(NSIMD_ROCM)\n          __device__ {return_type} gpu_get(nsimd::nat i) const {{\n            {impl_gpu}\n          }}\n        #elif defined(NSIMD_ONEAPI)\n          {return_type} gpu_get(nsimd::nat i) const {{\n            {impl_gpu}\n          }}\n        #else\n          {return_type} scalar_get(nsimd::nat i) const {{\n            {impl_scalar}\n          }}\n          template <typename Pack> typename {to_pack}<out_type, Pack>::type\n          simd_get(nsimd::nat i) const {{\n            {impl_simd}\n          }}\n        #endif\n        }};\n\n        template<{tmpl_args}>\n        node<{op_name}_t, {to_node_type}> {op_name}({args}) {{\n          node<{op_name}_t, {to_node_type}> ret;\n          {members_assignment}\n          return ret;\n        }}'''.format(op_name=op_name, tmpl_args=tmpl_args, size=size,\n                     tmpl_params=tmpl_params, return_type=return_type,\n                     args=args, to_pack=to_pack, to_node_type=to_node_type,\n                     members=members, members_assignment=members_assignment,\n                     in_out_typedefs=in_out_typedefs,\n                     impl_gpu=impl_gpu,\n                     impl_scalar=impl_scalar,\n                     impl_simd=impl_simd)\n\n        if operator.cxx_operator != None and len(operator.params) == 2:\n            functions += \\\n            '''\n            template <typename Op, typename Left, typename Right,\n                      typename Extra>\n            
node<{op_name}_t, node<Op, Left, Right, Extra>, none_t, none_t>\n            operator{cxx_operator}(node<Op, Left, Right, Extra> const &node) {{\n              return tet1d::{op_name}(node);\n            }}'''.format(op_name=op_name,\n                         cxx_operator=operator.cxx_operator);\n        if operator.cxx_operator != None and len(operator.params) == 3:\n            functions += '''\n\n            template <typename Op, typename Left, typename Right,\n                      typename Extra, typename T>\n            node<{op_name}_t, node<Op, Left, Right, Extra>,\n                 node<scalar_t, none_t, none_t,\n                      typename node<Op, Left, Right, Extra>::in_type>, none_t>\n            operator{cxx_operator}(node<Op, Left, Right, Extra> const &node,\n                                   T a) {{\n              typedef typename tet1d::node<Op, Left, Right, Extra>::in_type S;\n              return tet1d::{op_name}(node, literal_to<S>::impl(a));\n            }}\n\n            template <typename T, typename Op, typename Left, typename Right,\n                      typename Extra>\n            node<{op_name}_t, node<scalar_t, none_t, none_t,\n                              typename node<Op, Left, Right, Extra>::in_type>,\n                 node<Op, Left, Right, Extra>, none_t>\n            operator{cxx_operator}(T a,\n                                   node<Op, Left, Right, Extra> const &node) {{\n              typedef typename tet1d::node<Op, Left, Right, Extra>::in_type S;\n              return tet1d::{op_name}(literal_to<S>::impl(a), node);\n            }}\n\n            template <typename LeftOp, typename LeftLeft, typename LeftRight,\n                      typename LeftExtra, typename RightOp, typename RightLeft,\n                      typename RightRight, typename RightExtra>\n            node<{op_name}_t, node<LeftOp, LeftLeft, LeftRight, LeftExtra>,\n                              node<RightOp, RightLeft, RightRight, RightExtra>,\n          
       none_t>\n            operator{cxx_operator}(node<LeftOp, LeftLeft, LeftRight,\n                                LeftExtra> const &left,\n                           node<RightOp, RightLeft, RightRight,\n                                RightExtra> const &right) {{\n              return tet1d::{op_name}(left, right);\n            }}'''.format(op_name=op_name,\n                         cxx_operator=operator.cxx_operator);\n\n        functions += '\\n\\n{}\\n\\n'.format(common.hbar)\n\n    # Write the code to file\n    dirname = os.path.join(opts.include_dir, 'modules', 'tet1d')\n    common.mkdir_p(dirname)\n    filename = os.path.join(dirname, 'functions.hpp')\n    if not common.can_create_filename(opts, filename):\n        return\n    with common.open_utf8(opts, filename) as out:\n        out.write('#ifndef NSIMD_MODULES_TET1D_FUNCTIONS_HPP\\n')\n        out.write('#define NSIMD_MODULES_TET1D_FUNCTIONS_HPP\\n\\n')\n        out.write('namespace tet1d {\\n\\n')\n        out.write('{}\\n\\n'.format(common.hbar))\n        out.write(functions)\n        out.write('} // namespace tet1d\\n\\n')\n        out.write('#endif\\n')\n    common.clang_format(opts, filename)\n\n# -----------------------------------------------------------------------------\n\ndef name():\n    return 'Tiny expression templates 1D'\n\ndef desc():\n    return '''This module provide a thin layer of expression templates above\nNSIMD core. 
It also allows the programmer to target Intel, NVIDIA and AMD GPUs.\nExpression template are a C++ technique that allows the programmer to write\ncode \"à la MATLAB\" where variables usually represents vectors and operators\nare itemwise.'''\n\ndef doc_menu():\n    return {'Overview': 'overview', 'API reference': 'api'}\n\n# -----------------------------------------------------------------------------\n\ndef doit(opts):\n    common.myprint(opts, 'Generating module tet1d')\n    if opts.library:\n        gen_functions(opts)\n    if opts.tests:\n        gen_tests(opts)\n    if opts.doc:\n        gen_doc_api(opts)\n        gen_doc_overview(opts)\n"
  },
  {
    "path": "egg/oneapi.py",
    "content": "\n# Copyright (c) 2021 Agenium Scale\n#\n# Permission is hereby granted, free of charge, to any person obtaining a copy\n# of this software and associated documentation files (the \"Software\"), to deal\n# in the Software without restriction, including without limitation the rights\n# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell\n# copies of the Software, and to permit persons to whom the Software is\n# furnished to do so, subject to the following conditions:\n#\n# The above copyright notice and this permission notice shall be included in all\n# copies or substantial portions of the Software.\n#\n# THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\n# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\n# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\n# SOFTWARE.\n\n# -----------------------------------------------------------------------------\n# References:\n\n# Functions: book:\n# Data Parallel C++\n# Mastering DPC++ for Programming of Heterogeneous Systems using\n# C++ and SYCL - Apress Open\n# Table page 475: list of maths functions. 
float16 supported\n\n# sycl half type (f16) API:\n# https://mmha.github.io/syclreference/libraries/types/half/\n# -----------------------------------------------------------------------------\n\nimport common\nimport scalar\n\nfmtspec = dict()\n\n# -----------------------------------------------------------------------------\n\ndef get_impl_f16(operator, totyp, typ):\n\n    # Case 1: rounding functions\n    # no sycl function available for half type\n    # sycl function available for f32\n    # use sycl defined conversions half --> f32 , f32 --> half\n\n    # Case 2: no sycl function available for half type\n    # sycl function available for f32\n    # use nsimd casts f32-->f16 + sycl function + f16-->f32\n\n    no_sycl_avail_f16_cast_use_sycl_f32 = \\\n        ['fma', 'fms', 'fnma', 'fnms', 'min', 'max', 'abs']\n\n    # Case 3: sycl provides functions supporting half type\n\n    sycl_avail_functions_f16 = \\\n        ['rec', 'rec8', 'rec11', 'rsqrt8', 'rsqrt11', 'rsqrt', 'sqrt']\n\n    # Case 4: sycl half's type provided comparison operators\n    # Note:\n    # not documented in the book\n    # source: sycl half type (f16) API:\n    # https://mmha.github.io/syclreference/libraries/types/half/\n\n    sycl_avail_cmp_op_f16 = {\n        'lt': 'return {in0} < {in1};',\n        'gt': 'return {in0} > {in1};',\n        'le': 'return {in0} <= {in1};',\n        'ge': 'return {in0} >= {in1};',\n        'ne': 'return {in0} != {in1};',\n        'eq': 'return {in0} == {in1};'\n    }\n\n    # Case 5: no sycl function available for any type\n    # use nsimd_scalar_[operator]_f16\n\n    # Dispatch\n\n    # Case 1\n    if operator.name in ['floor','ceil','trunc']:\n        return 'return f16(sycl::{op}(static_cast<f32>({in0})));'.\\\n               format(op=operator.name,**fmtspec)\n    elif operator.name == 'round_to_even':\n        return 'return f16(sycl::rint(static_cast<f32>({in0})));'.\\\n               format(**fmtspec)\n\n    # Case 2\n    elif operator.name in 
no_sycl_avail_f16_cast_use_sycl_f32:\n        if operator.name in ['fma', 'fms', 'fnma', 'fnms']:\n            neg = '-' if operator.name in ['fnma', 'fnms'] else ''\n            op = '-' if operator.name in ['fnms', 'fms'] else ''\n            return '''// cl::sycl::half::operator float\n                      f32 x0 = static_cast<f32>({in0});\n                      f32 x1 = static_cast<f32>({in1});\n                      f32 x2 = static_cast<f32>({in2});\n                      f32 res = sycl::fma({neg}x0, x1, {op}x2);\n                      // cl::sycl::half::half(const float& f)\n                      return f16(res);'''.format(neg=neg, op=op, **fmtspec)\n        elif operator.name in ['min', 'max']:\n            op = 'fmin' if operator.name == 'min' else 'fmax'\n            return '''// cl::sycl::half::operator float\n                      f32 x0 =  static_cast<f32>({in0});\n                      f32 x1 =  static_cast<f32>({in1});\n                      f32 res = sycl::{op}(x0, x1);\n                      // cl::sycl::half::half(const float& f)\n                      return f16(res);'''.format(op=op, **fmtspec)\n        elif operator.name == 'abs':\n            return '''// cl::sycl::half::operator float\n                      f32 x0 = static_cast<f32>({in0});\n                      f32 res = sycl::fabs(x0);\n                      // cl::sycl::half::half(const float& f)\n                      return f16(res);'''.format(**fmtspec)\n\n    # Case 3\n    elif operator.name in sycl_avail_functions_f16:\n        if operator.name in ['rec8', 'rec11', 'rec']:\n            return '''// sycl::recip available in native form only\n                      // availability in half-precision\n                      return f16(1.0f / {in0});'''.format(**fmtspec)\n        elif operator.name in ['rsqrt8', 'rsqrt11', 'rsqrt']:\n            return 'return sycl::rsqrt({in0});'.format(**fmtspec)\n        elif operator.name == 'sqrt':\n            return 'return 
sycl::sqrt({in0});'.format(**fmtspec)\n\n    # Case 4\n    elif operator.name in sycl_avail_cmp_op_f16:\n        return sycl_avail_cmp_op_f16[operator.name].format(**fmtspec)\n\n    # Case 5\n    else:\n        args = ', '.join(['{{in{}}}'.format(i).format(**fmtspec) \\\n                          for i in range(len(operator.params[1:]))])\n        return 'return nsimd_scalar_{op}_f16({args});'.\\\n               format(op=operator.name, args=args)\n\n# -----------------------------------------------------------------------------\n\ndef reinterpret(totyp, typ):\n    if typ == totyp:\n        return 'return {in0};'.format(**fmtspec)\n    elif ((typ in common.ftypes and totyp in common.iutypes) or \\\n         (typ in common.iutypes and totyp in common.ftypes)):\n        return 'return nsimd_scalar_reinterpret_{totyp}_{typ}({in0});'. \\\n               format(**fmtspec)\n    else:\n        return '''{totyp} ret;\n                  memcpy((void *)&ret, (void *)&{in0}, sizeof({in0}));\n                  return ret;'''.format(**fmtspec)\n\n# -----------------------------------------------------------------------------\n\ndef get_impl(operator, totyp, typ):\n\n    global fmtspec\n\n    fmtspec = {\n        'in0': common.in0,\n        'in1': common.in1,\n        'in2': common.in2,\n        'typ': typ,\n        'totyp': totyp,\n        'typnbits': typ[1:]\n    }\n\n    # src operators\n    if operator.src:\n        oneapi_ops = {\n          'sin_u35': 'sin',\n          'cos_u35': 'cos',\n          'tan_u35': 'tan',\n          'asin_u35': 'asin',\n          'acos_u35': 'acos',\n          'atan_u35': 'atan',\n          'atan2_u35': 'atan2',\n          'log_u35': 'log',\n          'cbrt_u35': 'cbrt',\n          'sin_u10': 'sin',\n          'cos_u10': 'cos',\n          'tan_u10': 'tan',\n          'asin_u10': 'asin',\n          'acos_u10': 'acos',\n          'atan_u10': 'atan',\n          'atan2_u10': 'atan2',\n          'log_u10': 'log',\n          'cbrt_u10': 'cbrt',\n        
  'exp_u10': 'exp',\n          'pow_u10': 'pow',\n          'sinh_u10': 'sinh',\n          'cosh_u10': 'cosh',\n          'tanh_u10': 'tanh',\n          'sinh_u35': 'sinh',\n          'cosh_u35': 'cosh',\n          'tanh_u35': 'tanh',\n          'fastsin_u3500': 'sin',\n          'fastcos_u3500': 'cos',\n          'fastpow_u3500': 'pow',\n          'asinh_u10': 'asinh',\n          'acosh_u10': 'acosh',\n          'atanh_u10': 'atanh',\n          'exp2_u10': 'exp2',\n          'exp2_u35': 'exp2',\n          'exp10_u10': 'exp10',\n          'exp10_u35': 'exp10',\n          'expm1_u10': 'expm1',\n          'log10_u10': 'log10',\n          'log2_u10': 'log2',\n          'log2_u35': 'log2',\n          'log1p_u10': 'log1p',\n          'sinpi_u05': 'sinpi',\n          'cospi_u05': 'cospi',\n          'hypot_u05': 'hypot',\n          'hypot_u35': 'hypot',\n          'remainder': 'remainder',\n          'fmod': 'fmod',\n          'lgamma_u10': 'lgamma',\n          'tgamma_u10': 'tgamma',\n          'erf_u10': 'erf',\n          'erfc_u15': 'erfc'\n        }\n        return 'return cl::sycl::{}({});'.format(\n                  oneapi_ops[operator.name],\n                  common.get_args(len(operator.params[1:])))\n\n    # bool first, no special treatment for f16's\n    bool_operators = [ 'andl', 'orl', 'xorl', 'andnotl', 'notl' ]\n    if operator.name in bool_operators:\n        if operator.name == 'notl':\n            return 'return nsimd_scalar_{op}({in0});'.\\\n                   format(op=operator.name,**fmtspec)\n        else:\n            return 'return nsimd_scalar_{op}({in0}, {in1});'.\\\n                   format(op=operator.name,**fmtspec)\n\n    # infix operators no special treatment for f16's\n    infix_operators = [ 'orb', 'andb', 'andnotb', 'notb', 'xorb' ]\n    if operator.name in infix_operators:\n        if operator.name == 'notb':\n            return 'return nsimd_scalar_{op}_{typ}({in0});'.\\\n                   format(op=operator.name,**fmtspec)\n        
else:\n            return 'return nsimd_scalar_{op}_{typ}({in0}, {in1});'.\\\n                   format(op=operator.name,**fmtspec)\n\n    # reinterpret\n    if operator.name == 'reinterpret':\n        return reinterpret(totyp, typ)\n\n    # cvt\n    if operator.name == 'cvt':\n        if 'f16' == totyp:\n            # conversion op: takes in a 32 bit float and converts it to 16 bits\n            return 'return sycl::half(static_cast<f32>({in0}));'. \\\n                   format(**fmtspec)\n        else:\n          return 'return nsimd_scalar_cvt_{totyp}_{typ}({in0});'. \\\n                 format(**fmtspec)\n\n    # to_mask\n    if operator.name == 'to_mask':\n        return 'return nsimd_scalar_to_mask_{totyp}({in0});'.format(**fmtspec)\n\n    # to_logical\n    if operator.name == 'to_logical':\n        return 'return nsimd_scalar_to_logical_{typ}({in0});'.format(**fmtspec)\n\n    # for all other operators, f16 has a special treatment\n    if typ == 'f16':\n        return get_impl_f16(operator, totyp, typ)\n\n    # infix operators - rec - f32, f64\n    infix_op_rec_ftypes = ['rec', 'rec8', 'rec11']\n\n    if typ in common.ftypes_no_f16 and operator.name in infix_op_rec_ftypes:\n        return '''// sycl::recip available in native form only\n                  return 1.0{f} / {in0};'''. 
\\\n                  format(f='f' if typ == 'f32' else '', **fmtspec)\n\n    # infix operators - cmp - f32, f64\n    infix_op_cmp_f32_f64 = {\n        'lt': 'return {cast_to_int}sycl::isless({in0}, {in1});',\n        'gt': 'return {cast_to_int}sycl::isgreater({in0}, {in1});',\n        'le': 'return {cast_to_int}sycl::islessequal({in0}, {in1});',\n        'ge': 'return {cast_to_int}sycl::isgreaterequal({in0}, {in1});',\n        'ne': 'return {cast_to_int}sycl::isnotequal({in0}, {in1});',\n        'eq': 'return {cast_to_int}sycl::isequal({in0}, {in1});'\n    }\n\n    if typ in common.ftypes_no_f16 and operator.name in infix_op_cmp_f32_f64:\n        return infix_op_cmp_f32_f64[operator.name]. \\\n               format(cast_to_int='(int)' if typ == 'f64' else '', **fmtspec)\n\n    # infix operators - cmp - integer types\n    infix_op_cmp_iutypes = [ 'lt', 'gt', 'le', 'ge', 'ne', 'eq' ]\n    if operator.name in infix_op_cmp_iutypes:\n      return 'return nsimd_scalar_{op}_{typ}({in0},{in1});'.\\\n        format(op=operator.name, **fmtspec)\n\n    # infix operators f32, f64 + integers\n    # ref: see Data Parallel C++ book, pages 480, 481, 482\n    # TODO: do the functions below call instrinsics/built-in\n    # functions on the device?\n    # 'add': 'return std::plus<{typ}>()({in0}, {in1});',\n    # 'sub': 'return std::minus<{typ}>()({in0}, {in1});',\n    # 'mul': 'return std::multiplies<{typ}>()({in0}, {in1});',\n    # 'div': 'return std::divides<{typ}>()({in0}, {in1});',\n\n    infix_op_t = [ 'add', 'sub', 'mul', 'div' ]\n    if operator.name in infix_op_t:\n        return 'return nsimd_scalar_{op}_{typ}({in0}, {in1});'. 
\\\n               format(op=operator.name, **fmtspec)\n\n    # neg\n    # ref: see Data Parallel C++ book, pages 480, 481, 482\n    # TODO: does the function below call an instrinsic/built-in\n    # function on the device?\n    # 'neg': 'return std::negate<{typ}>()({in0});'\n\n    if operator.name == 'neg':\n        return 'return nsimd_scalar_{op}_{typ}({in0});'. \\\n               format(op=operator.name, **fmtspec)\n\n    # shifts\n    shifts_op_ui_t = [ 'shl', 'shr', 'shra' ]\n    if operator.name in shifts_op_ui_t and typ in common.iutypes:\n        return 'return nsimd_scalar_{op}_{typ}({in0}, {in1});'. \\\n               format(op=operator.name, **fmtspec)\n\n    # adds\n    if operator.name == 'adds':\n        if typ in common.ftypes:\n            return 'return nsimd_scalar_add_{typ}({in0}, {in1});'. \\\n                   format(**fmtspec)\n        else:\n            return 'return sycl::add_sat({in0}, {in1});'.format(**fmtspec)\n\n    # subs\n    if operator.name == 'subs':\n        if typ in common.ftypes:\n            return 'return nsimd_scalar_sub_{typ}({in0}, {in1});'. \\\n                   format(**fmtspec)\n        else:\n            return 'return sycl::sub_sat({in0}, {in1});'.format(**fmtspec)\n\n    # fma's\n    if operator.name in ['fma', 'fms', 'fnma', 'fnms']:\n        if typ in common.ftypes:\n            neg = '-' if operator.name in ['fnma', 'fnms'] else ''\n            op = '-' if operator.name in ['fnms', 'fms'] else ''\n            return 'return sycl::fma({neg}{in0}, {in1}, {op}{in2});'. \\\n                   format(op=op, neg=neg, **fmtspec)\n        else:\n            return 'return nsimd_scalar_{op}_{typ}({in0}, {in1}, {in2});'. 
\\\n                   format(op=operator.name, **fmtspec)\n\n    # other operators\n    # round_to_even, ceil, floor, trunc, min, max, abs, sqrt\n\n    # round_to_even\n    if operator.name == 'round_to_even':\n        if typ in common.ftypes_no_f16:\n            return 'return sycl::rint({in0});'.format(**fmtspec)\n        else:\n            return 'return {in0};'.format(**fmtspec)\n\n    # other rounding operators\n    other_rounding_ops = ['ceil', 'floor', 'trunc']\n    if operator.name in other_rounding_ops:\n        if typ in common.iutypes:\n            return 'return nsimd_scalar_{op}_{typ}({in0});'. \\\n                   format(op=operator.name, **fmtspec)\n        else:\n            return 'return sycl::{op}({in0});'. \\\n                   format(op=operator.name, **fmtspec)\n\n    # min/max\n    if operator.name in ['min', 'max']:\n        if typ in common.iutypes:\n            return 'return sycl::{op}({in0}, {in1});'.\\\n                   format(op=operator.name, **fmtspec)\n        else:\n            op = 'sycl::fmin' if operator.name == 'min' else 'sycl::fmax'\n            return 'return {op}({in0}, {in1});'.format(op=op, **fmtspec)\n\n    # abs\n    if operator.name == 'abs':\n        if typ in common.itypes:\n            return 'return ({typ})sycl::abs({in0});'.format(**fmtspec)\n        elif typ in common.utypes:\n            return 'return nsimd_scalar_abs_{typ}({in0});'.format(**fmtspec)\n        else:\n            return 'return sycl::fabs({in0});'.format(**fmtspec)\n\n    # sqrt\n    if operator.name == 'sqrt' and typ in common.ftypes:\n          return 'return sycl::sqrt({in0});'.format(**fmtspec)\n\n    # rsqrt\n    if operator.name in ['rsqrt8', 'rsqrt11', 'rsqrt'] and typ in common.ftypes:\n          return 'return sycl::rsqrt({in0});'.format(**fmtspec)\n\n"
  },
  {
    "path": "egg/operators.py",
    "content": "# Use utf-8 encoding\n# -*- coding: utf-8 -*-\n\n# Copyright (c) 2021 Agenium Scale\n#\n# Permission is hereby granted, free of charge, to any person obtaining a copy\n# of this software and associated documentation files (the \"Software\"), to deal\n# in the Software without restriction, including without limitation the rights\n# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell\n# copies of the Software, and to permit persons to whom the Software is\n# furnished to do so, subject to the following conditions:\n#\n# The above copyright notice and this permission notice shall be included in all\n# copies or substantial portions of the Software.\n#\n# THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\n# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\n# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\n# SOFTWARE.\n\nif __name__ == 'operators':\n    import common\nelse:\n    from . import common\nimport collections\n\n# -----------------------------------------------------------------------------\n# Metaclass and class to gather all operator categories\n\ncategories = collections.OrderedDict()\n\nclass MAddToCategories(type):\n    def __new__(cls, name, bases, dct):\n        if name != 'DocCategory':\n            if 'title' not in dct:\n                raise Exception('No member title provided for class {}'. 
\\\n                                format(name))\n            dct['name'] = name\n            dct['id'] = '/categories/{}'.format(name)\n        ret = type.__new__(cls, name, bases, dct)\n        if name != 'DocCategory':\n            categories[name] = ret()\n        return ret\n\nclass DocCategory(object, metaclass=MAddToCategories):\n    pass\n\n# -----------------------------------------------------------------------------\n# Operators categories\n\nclass DocShuffle(DocCategory):\n    title = 'Shuffle functions'\n\nclass DocTrigo(DocCategory):\n    title = 'Trigonometric functions'\n\nclass DocHyper(DocCategory):\n    title = 'Hyperbolic functions'\n\nclass DocExpLog(DocCategory):\n    title = 'Exponential and logarithmic functions'\n\nclass DocBasicArithmetic(DocCategory):\n    title = 'Basic arithmetic operators'\n\nclass DocBitsOperators(DocCategory):\n    title = 'Bits manipulation operators'\n\nclass DocLogicalOperators(DocCategory):\n    title = 'Logicals operators'\n\nclass DocMisc(DocCategory):\n    title = 'Miscellaneous'\n\nclass DocLoadStore(DocCategory):\n    title = 'Loads & stores'\n\nclass DocComparison(DocCategory):\n    title = 'Comparison operators'\n\nclass DocRounding(DocCategory):\n    title = 'Rounding functions'\n\nclass DocConversion(DocCategory):\n    title = 'Conversion operators'\n\n# -----------------------------------------------------------------------------\n# Metaclass and class to gather all operators\n\noperators = collections.OrderedDict()\n\nclass MAddToOperators(type):\n    def __new__(cls, name, bases, dct):\n\n        def member_is_defined(member):\n            if member in dct:\n                return True\n            for bc in range(len(bases)):\n                if member in bases[bc].__dict__:\n                    return True\n            return False\n\n        def get_member_value(member):\n            if member in dct:\n                return dct[member]\n            for bc in range(len(bases)):\n                if 
member in bases[bc].__dict__:\n                    return bases[bc].__dict__[member]\n            raise Exception('Member does not exists in class {}'.format(name))\n\n        # We don't care about the parent class\n        if name == 'Operator' or name == 'SrcOperator':\n            return type.__new__(cls, name, bases, dct)\n\n        # Mandatory members\n        mm = ['categories', 'signature']\n        for m in mm:\n            if m not in dct:\n                raise Exception('Mandatory member \"{}\" not given in \"{}\"'. \\\n                                format(m, name))\n\n        # Check that all items in categories exists\n        for c in dct['categories']:\n            if type(c) == str:\n                raise Exception( \\\n                      'Category \"{}\" must not be a string for operator \"{}\"'. \\\n                      format(c, name))\n            if not hasattr(c, 'name'):\n                raise Exception( \\\n                      'Category \"{}\" does not exist for operator \"{}\"'. \\\n                      format(c.__class__.__name__, name))\n            if c.name not in categories:\n                raise Exception( \\\n                      'Category \"{}\" does not exist for operator \"{}\"'. 
\\\n                      format(c.__class__.__name__, name))\n\n        # Some defaults, that are fixed by the implementation\n        (dct['name'], dct['params']) = common.parse_signature(dct['signature'])\n        if 'output_to' in dct:\n            if dct['output_to'] == common.OUTPUT_TO_SAME_TYPE:\n                dct['closed'] = True\n            else:\n                dct['closed'] = False\n        else:\n            dct['closed'] = True\n            dct['output_to'] = common.OUTPUT_TO_SAME_TYPE\n\n        # If the operator takes as inputs vectors and returns a scalar, then\n        # by default we cannot autogenerate the C++ advanced API because we\n        # cannot guess how to combine pieces of a unrolled pack\n        if 'autogen_cxx_adv' not in dct:\n            if dct['params'][0] in ['p', 's']:\n                dct['autogen_cxx_adv'] = False\n            else:\n                dct['autogen_cxx_adv'] = True\n\n        # By default tests are done on random numbers depending on the type\n        # but sometimes one needs to produce only integers even if the\n        # type is a floating point type.\n        if 'tests_on_integers_only' not in dct:\n            dct['tests_on_integers_only'] = False;\n\n        # Fill domain, default is [-20 ; +20]\n        if 'domain' not in dct:\n            dct['domain'] = [[-20, 20], [-20, 20], [-20, 20]]\n\n        # Number of UFP (cf. documentation) for testing\n        if 'ufp' not in dct:\n            dct['ufp'] = {'f16': 8, 'f32': 18, 'f64': 45}\n\n        # Check that params is not empty\n        if len(dct['params']) == 0:\n            raise Exception('\"params\" is empty for operator \"{}\"'. 
\\\n                            format(name))\n\n        # Fill full_name, default is same as name\n        if 'full_name' not in dct:\n            dct['full_name'] = name\n\n        # Fill desc, default is a basic sentence using full_name\n        if 'desc' not in dct:\n            arg = 'arguments' if len(dct['params']) > 2 else 'argument'\n            if dct['params'][0] == '_':\n                dct['desc'] = '{} the {}.'. \\\n                              format(dct['full_name'].capitalize(), arg)\n            else:\n                dct['desc'] = 'Returns the {} of the {}.'.\\\n                              format(dct['full_name'], arg)\n\n        # Fill src, default is operator is in header not in source\n        if not member_is_defined('src'):\n            dct['src'] = False\n\n        # Fill load_store, default is operator is not for loading/storing\n        if 'load_store' not in dct:\n            dct['load_store'] = False\n\n        # Fill has_scalar_impl, default is based on several properties\n        if 'has_scalar_impl' not in dct:\n            if DocShuffle in dct['categories'] or \\\n               DocMisc in dct['categories'] or \\\n               'vx2' in dct['params'] or \\\n               'vx3' in dct['params'] or \\\n               'vx4' in dct['params'] or \\\n               dct['output_to'] in [common.OUTPUT_TO_UP_TYPES,\n                                    common.OUTPUT_TO_DOWN_TYPES] or \\\n               dct['load_store']:\n                dct['has_scalar_impl'] = False\n            else:\n                dct['has_scalar_impl'] = True\n\n        ret = type.__new__(cls, name, bases, dct)\n        operators[dct['name']] = ret()\n        return ret\n\nclass Operator(object, metaclass=MAddToOperators):\n\n    # Default values (for general purpose)\n    cxx_operator = None\n    autogen_cxx_adv = True\n    output_to = common.OUTPUT_TO_SAME_TYPE\n    types = common.types\n    params = []\n    aliases = []\n    signature = ''\n\n    # Enable bench 
by default\n    do_bench = True\n\n    # Default values (for documentation)\n    desc = ''\n\n    # Defaults values (for benches)\n    returns_any_type = False\n    bench_auto_against_cpu = True\n    bench_auto_against_mipp = False\n    bench_auto_against_sleef = False\n    bench_auto_against_std = False\n    use_for_parsing = True\n\n    @property\n    def returns(self):\n        return self.params[0]\n\n    @property\n    def args(self):\n        return self.params[1:]\n\n    def __init__(self):\n        (self.name, self.params) = common.parse_signature(self.signature)\n        super(Operator, self).__init__()\n\n    def get_return(self):\n        return self.params[0]\n\n    def tests_mpfr_name(self):\n        return 'mpfr_' + self.name\n\n    def bench_mipp_name(self, typ):\n        return 'mipp::{}<{}>'.format(self.name, typ)\n\n    def bench_mipp_types(self):\n        return common.ftypes_no_f16\n\n    def bench_sleef_name(self, simd, typ):\n        return common.sleef_name(self.name, simd, typ)\n\n    def bench_sleef_types(self):\n        return common.ftypes_no_f16\n\n    def bench_std_name(self, simd, typ):\n        return 'std::{}'.format(self.name)\n\n    def bench_std_types(self):\n        return self.types\n\n    # TODO: move to gen_archis.py\n    def get_header_guard(self, platform, simd_ext):\n        return 'NSIMD_{}_{}_{}_H'.format(platform.upper(),\n            simd_ext.upper(), self.name.upper())\n\n    def get_fmtspec(self, t, tt, simd_ext):\n        ret = {}\n        return_typ = common.get_one_type_specific(self.params[0], simd_ext, tt)\n        ret['return_typ'] = return_typ\n        ret['returns'] = '' if return_typ == 'void' else 'return '\n        args_list = common.enum([common.get_one_type_specific(p, simd_ext, t)\n                                 for p in self.params[1:]])\n        if len(args_list) > 0:\n            ret['c_args'] = ', '.join(['{} a{}'.format(i[1], i[0])\n                                       for i in args_list])\n     
       ret['cxx_args'] = ret['c_args'] + ', '\n        else:\n            ret['c_args'] = 'void'\n            ret['cxx_args'] = ''\n        if self.closed:\n            ret['cxx_args'] += '{}, {}'.format(t, simd_ext)\n        else:\n            ret['cxx_args'] += '{}, {}, {}'.format(t, tt, simd_ext)\n        ret['vas'] = ', '.join(['a{}'.format(i[0]) for i in args_list])\n        ret['suf'] = tt if self.closed else '{}_{}'.format(tt, t)\n        ret['name'] = self.name\n        ret['hbar'] = common.hbar\n        ret['simd_ext'] = simd_ext\n        if self.src and 'sleef_symbol_prefix' in self.__class__.__dict__:\n            ret['sleef_symbol_prefix'] = self.sleef_symbol_prefix\n        return ret\n\n    def get_generic_signature(self, lang):\n        if lang == 'c_base':\n            vas = common.get_args(len(self.params) - 1)\n            args = vas + (', ' if vas != '' else '')\n            args += 'from_type, to_type' if not self.closed else 'type'\n            return ['#define v{name}({args})'.format(name=self.name,\n                    args=args),\n                    '#define v{name}_e({args}, simd_ext)'. 
\\\n                    format(name=self.name, args=args)]\n        elif lang == 'c_adv':\n            args = ['a{}'.format(i - 1) for i in range(1, len(self.params))]\n            if not self.closed:\n                args = ['to_type'] + args\n            args = ', '.join(args)\n            return '#define nsimd_{}({})'.format(self.name, args)\n        elif lang == 'cxx_base':\n            def get_type(param, typename):\n                if param == '_':\n                    return 'void'\n                elif param == 'p':\n                    return 'int'\n                elif param == 's':\n                    return typename\n                elif param == '*':\n                    return '{}*'.format(typename)\n                elif param == 'c*':\n                    return '{} const*'.format(typename)\n                elif param == 'vi':\n                    return 'typename simd_traits<typename traits<{}>::itype,' \\\n                           ' NSIMD_SIMD>::simd_vector'.format(typename)\n                elif param == 'l':\n                    return \\\n                    'typename simd_traits<{}, NSIMD_SIMD>::simd_vectorl'. \\\n                    format(typename)\n                elif param.startswith('v'):\n                    return \\\n                    'typename simd_traits<{}, NSIMD_SIMD>::simd_vector{}'. 
\\\n                    format(typename, param[1:])\n                else:\n                    raise ValueError(\"Unknown param '{}'\".format(param))\n            return_typ = get_type(self.params[0], 'T')\n            args_list = common.enum(self.params[1:])\n\n            if not self.closed :\n                tmpl_args = 'NSIMD_CONCEPT_VALUE_TYPE F, ' \\\n                            'NSIMD_CONCEPT_VALUE_TYPE T'\n                typename = 'F'\n            else:\n                tmpl_args = 'NSIMD_CONCEPT_VALUE_TYPE T'\n                typename = 'T'\n\n            temp = ', '.join(['{} a{}'.format(get_type(a[1], typename),\n                              a[0]) for a in args_list])\n            temp += ', ' if temp != '' else ''\n            if not self.closed:\n                func_args = temp + 'F, T'\n                if self.output_to == common.OUTPUT_TO_SAME_SIZE_TYPES:\n                    cxx20_require = \\\n                        'NSIMD_REQUIRES(sizeof_v<F> == sizeof_v<T>) '\n                elif self.output_to == common.OUTPUT_TO_UP_TYPES:\n                    cxx20_require = \\\n                        'NSIMD_REQUIRES(2 * sizeof_v<F> == sizeof_v<T>) '\n                else:\n                    cxx20_require = \\\n                        'NSIMD_REQUIRES(sizeof_v<F> == 2 * sizeof_v<T>) '\n            else:\n                func_args = temp + 'T'\n                cxx20_require = ''\n\n            return 'template <{tmpl_args}> {cxx20_require}{return_typ} ' \\\n                   'NSIMD_VECTORCALL {name}({func_args});'. 
\\\n                   format(return_typ=return_typ, tmpl_args=tmpl_args,\n                          func_args=func_args, name=self.name,\n                          cxx20_require=cxx20_require)\n        elif lang == 'cxx_adv':\n            def get_type(param, typename, N):\n                if param == '_':\n                    return 'void'\n                elif param == 'p':\n                    return 'int'\n                elif param == 's':\n                    return typename\n                elif param == '*':\n                    return '{}*'.format(typename)\n                elif param == 'c*':\n                    return '{} const*'.format(typename)\n                elif param == 'vi':\n                    return 'pack<typename traits<{}>::itype, {}, SimdExt>'. \\\n                           format(typename, N)\n                elif param == 'l':\n                    return 'packl<{}, {}, SimdExt>'.format(typename, N)\n                elif param.startswith('v'):\n                    return 'pack{}<{}, {}, SimdExt>'. \\\n                    format(param[1:], typename, N)\n                else:\n                    raise ValueError(\"Unknown param '{}'\".format(param))\n            args_list = common.enum(self.params[1:])\n            # Do we need tag dispatching on pack<>? e.g. 
len, set1 and load*\n            inter = [i for i in ['v', 'l', 'vi', 'vx2', 'vx3', 'vx4'] \\\n                     if i in self.params[1:]]\n            tag_dispatching = (inter == [])\n\n            # Compute template arguments\n            tmpl_args1 = ['NSIMD_CONCEPT_VALUE_TYPE T',\n                          'NSIMD_CONCEPT_SIMD_EXT SimdExt']\n            tmpl_argsN = ['NSIMD_CONCEPT_VALUE_TYPE T', 'int N',\n                          'NSIMD_CONCEPT_SIMD_EXT SimdExt']\n            def get_PACK(arg):\n                if arg == 'l':\n                    return 'PACKL'\n                elif arg == 'v':\n                    return 'PACK'\n                else:\n                    return 'PACK{}'.format(arg[1:].upper())\n            if not self.closed:\n                tmpl = 'NSIMD_CONCEPT_{} ToPackType'. \\\n                       format(get_PACK(self.params[0]))\n                tmpl_args1 = [tmpl] + tmpl_args1\n                tmpl_argsN = [tmpl] + tmpl_argsN\n            tmpl_args1 = ', '.join(tmpl_args1)\n            tmpl_argsN = ', '.join(tmpl_argsN)\n\n            # Compute function arguments\n            def arg_type(arg, typename, N):\n                if arg in ['v', 'vi', 'vx2', 'vx3', 'vx4', 'l']:\n                    return '{} const&'.format(get_type(arg, typename, N))\n                else:\n                    return get_type(arg, typename, N)\n            args1 = ['{} a{}'.format(arg_type(i[1], 'T', '1'), i[0]) \\\n                     for i in args_list]\n            argsN = ['{} a{}'.format(arg_type(i[1], 'T', 'N'), i[0]) \\\n                     for i in args_list]\n\n            # Arguments without tag dispatching on pack\n            other_argsN = ', '.join(argsN)\n\n            # If we need tag dispatching, then the first argument type\n            # is the output type:\n            #   1. If not closed, then the output type is ToPackType\n            #   2. 
If closed, then the output type is pack<T, N, SimdExt>\n            if not self.closed:\n                args1 = ['ToPackType const&'] + args1\n                argsN = ['ToPackType const&'] + argsN\n            elif tag_dispatching:\n                args1 = [arg_type(self.params[0], 'T', '1')] + args1\n                argsN = [arg_type(self.params[0], 'T', 'N')] + argsN\n            args1 = ', '.join(args1)\n            argsN = ', '.join(argsN)\n\n            # Compute return type\n            if not self.closed:\n                ret1 = 'ToPackType'\n                retN = 'ToPackType'\n            else:\n                ret1 = get_type(self.params[0], 'T', '1')\n                retN = get_type(self.params[0], 'T', 'N')\n\n            # For non closed operators that need tag dispatching we have a\n            # require clause\n            cxx20_require = ''\n            if not self.closed:\n                tmpl = 'NSIMD_REQUIRES((' \\\n                    '{}sizeof_v<typename ToPackType::value_type> == ' \\\n                        '{}sizeof_v<T> && ' \\\n                    'ToPackType::unroll == {{}} && '\\\n                    'std::is_same_v<typename ToPackType::simd_ext, SimdExt>))'\n                if self.output_to == common.OUTPUT_TO_SAME_SIZE_TYPES:\n                    cxx20_require = tmpl.format('', '')\n                elif self.output_to == common.OUTPUT_TO_UP_TYPES:\n                    cxx20_require = tmpl.format('', '2 * ')\n                else:\n                    cxx20_require = tmpl.format('2 * ', '')\n\n            ret = { \\\n                '1': 'template <{tmpl_args1}> {cxx20_require}{ret1} ' \\\n                     '{cxx_name}({args1});'. 
\\\n                     format(tmpl_args1=tmpl_args1,\n                            cxx20_require=cxx20_require.format('1'),\n                            ret1=ret1, args1=args1, cxx_name=self.name),\n                'N': 'template <{tmpl_argsN}> {cxx20_require}{retN} ' \\\n                     '{cxx_name}({argsN});'. \\\n                     format(tmpl_argsN=tmpl_argsN,\n                            cxx20_require=cxx20_require.format('N'),\n                            retN=retN, argsN=argsN, cxx_name=self.name)\n            }\n            if self.cxx_operator:\n                ret.update({ \\\n                    'op1':\n                    '''template <{tmpl_args1}>\n                    {ret1} operator{cxx_name}({args1});'''. \\\n                    format(tmpl_args1=tmpl_args1, ret1=ret1, args1=args1,\n                           cxx_name=self.cxx_operator),\n                    'opN':\n                    '''template <{tmpl_argsN}>\n                    {retN} operator{cxx_name}({argsN});'''. \\\n                    format(tmpl_argsN=tmpl_argsN, retN=retN, argsN=argsN,\n                           cxx_name=self.cxx_operator)\n                })\n            if not self.closed:\n                ret['dispatch'] = \\\n                'template <{tmpl_argsN}> {cxx20_require}{retN} ' \\\n                '{cxx_name}({other_argsN});'. 
\\\n                format(tmpl_argsN=tmpl_argsN,\n                       cxx20_require=cxx20_require.format('N'),\n                       other_argsN=other_argsN, retN=retN, cxx_name=self.name)\n            elif tag_dispatching:\n                if [i for i in ['s', '*', 'c*'] if i in self.params[1:]] == []:\n                    tmpl_T = ''\n                    requires = ''\n                else:\n                    tmpl_T = ', NSIMD_CONCEPT_VALUE_TYPE T'\n                    requires = 'NSIMD_REQUIRES((' \\\n                        'std::is_same_v<typename SimdVector::value_type, T>))'\n                ret['dispatch'] = \\\n                '''template <NSIMD_CONCEPT_{PACK} SimdVector{tmpl_T}>{requires}\n                   SimdVector {cxx_name}({other_argsN});'''.format(\n                   PACK=get_PACK(self.params[0]), requires=requires,\n                   other_argsN=other_argsN, cxx_name=self.name, tmpl_T=tmpl_T)\n            return ret\n        else:\n            raise Exception('Lang must be one of c_base, cxx_base, cxx_adv')\n\n    def get_signature(self, typename, lang, simd_ext):\n        # Check that the type is available for this function\n        if typename not in self.types:\n            raise Exception('Type {} not supported for function {}'. 
\\\n                            format(typename, self.name))\n\n        fmtspec = self.get_fmtspec(typename, typename, simd_ext)\n\n        if lang == 'c_base':\n            sig = '{return_typ} NSIMD_VECTORCALL ' \\\n                  'nsimd_{name}_{simd_ext}_{suf}({c_args})'.format(**fmtspec)\n        elif lang == 'cxx_base':\n            sig = '{return_typ} NSIMD_VECTORCALL ' \\\n                  '{name}({cxx_args})'.format(**fmtspec)\n        elif lang == 'cxx_adv':\n            sig = ''\n            raise Exception('TODO cxx_adv for {}'.format(lang))\n        else:\n            raise Exception('Unknown langage {}'.format(lang))\n\n        return sig\n\n    def get_scalar_signature(self, cpu_gpu, t, tt, lang):\n        sig = '__device__ ' if cpu_gpu == 'gpu' else ''\n        sig += common.get_one_type_scalar(self.params[0], tt) + ' '\n        func_name = 'nsimd_' if lang == 'c' else ''\n        func_name += 'gpu_' if cpu_gpu in ['gpu', 'oneapi'] else 'scalar_'\n        func_name += self.name\n        operator_on_logicals = (self.params == ['l'] * len(self.params))\n        if lang == 'c' and not operator_on_logicals:\n            func_name += '_{}_{}'.format(tt, t) if not self.closed \\\n                                                else '_{}'.format(t)\n        sig += func_name\n        args_list = common.enum([common.get_one_type_scalar(p, t)\n                                 for p in self.params[1:]])\n        args = ['{} a{}'.format(i[1], i[0]) for i in args_list]\n        if lang == 'cxx' and (not self.closed or \\\n           ('v' not in self.params[1:] and not operator_on_logicals)):\n            args = [tt] + args\n        sig += '(' + ', '.join(args) + ')'\n        return sig\n\nclass SrcOperator(Operator):\n    src = True\n    types = common.ftypes\n\n# -----------------------------------------------------------------------------\n# List of functions/operators\n\nclass Len(Operator):\n    full_name = 'vector length'\n    signature = 'p len'\n    
categories = [DocMisc]\n\nclass Set1(Operator):\n    full_name = 'value broadcast'\n    signature = 'v set1 s'\n    categories = [DocMisc]\n    desc = 'Returns a vector whose all elements are set to the given value.'\n\nclass Set1l(Operator):\n    full_name = 'logical value broadcast'\n    signature = 'l set1l p'\n    categories = [DocMisc]\n    desc = 'Returns a vector whose all elements are set to the given ' \\\n           'boolean value: zero means false and nonzero means true.'\n\nclass Loadu(Operator):\n    signature = 'v loadu c*'\n    load_store = True\n    categories = [DocLoadStore]\n    desc = 'Load data from unaligned memory.'\n\nclass MaskoLoadu1(Operator):\n    signature = 'v masko_loadu1 l c* v'\n    load_store = True\n    categories = [DocLoadStore]\n    desc = 'Load data from unaligned memory corresponding to True elements.'\n\nclass MaskzLoadu1(Operator):\n    signature = 'v maskz_loadu1 l c*'\n    load_store = True\n    categories = [DocLoadStore]\n    desc = 'Load data from unaligned memory corresponding to True elements.'\n\nclass Load2u(Operator):\n    full_name = 'load array of structure'\n    signature = 'vx2 load2u c*'\n    load_store = True\n    categories = [DocLoadStore]\n    desc = 'Load array of structures of 2 members from unaligned memory.'\n\nclass Load3u(Operator):\n    full_name = 'load array of structure'\n    signature = 'vx3 load3u c*'\n    load_store = True\n    categories = [DocLoadStore]\n    desc = 'Load array of structures of 3 members from unaligned memory.'\n\nclass Load4u(Operator):\n    full_name = 'load array of structure'\n    signature = 'vx4 load4u c*'\n    load_store = True\n    categories = [DocLoadStore]\n    desc = 'Load array of structures of 4 members from unaligned memory.'\n\nclass Loada(Operator):\n    signature = 'v loada c*'\n    load_store = True\n    categories = [DocLoadStore]\n    desc = 'Load data from aligned memory.'\n\nclass MaskoLoada(Operator):\n    signature = 'v masko_loada1 l c* v'\n    
load_store = True\n    categories = [DocLoadStore]\n    desc = 'Load data from aligned memory.'\n\nclass MaskzLoada(Operator):\n    signature = 'v maskz_loada1 l c*'\n    load_store = True\n    categories = [DocLoadStore]\n    desc = 'Load data from aligned memory corresponding to True elements.'\n\nclass Load2a(Operator):\n    full_name = 'load array of structure'\n    signature = 'vx2 load2a c*'\n    load_store = True\n    categories = [DocLoadStore]\n    desc = 'Load array of structures of 2 members from aligned memory.'\n\nclass Load3a(Operator):\n    full_name = 'load array of structure'\n    signature = 'vx3 load3a c*'\n    load_store = True\n    categories = [DocLoadStore]\n    desc = 'Load array of structures of 3 members from aligned memory.'\n\nclass Load4a(Operator):\n    full_name = 'load array of structure'\n    signature = 'vx4 load4a c*'\n    load_store = True\n    categories = [DocLoadStore]\n    desc = 'Load array of structures of 4 members from aligned memory.'\n\nclass Loadlu(Operator):\n    full_name = 'load vector of logicals'\n    signature = 'l loadlu c*'\n    load_store = True\n    categories = [DocLoadStore]\n    desc = 'Load data from unaligned memory and interpret it as booleans. ' + \\\n           'Zero is interpreted as False and nonzero as True.'\n\nclass Loadla(Operator):\n    full_name = 'load vector of logicals'\n    signature = 'l loadla c*'\n    load_store = True\n    categories = [DocLoadStore]\n    desc = 'Load data from aligned memory and interpret it as booleans. 
' + \\\n           'Zero is interpreted as False and nonzero as True.'\n\nclass Storeu(Operator):\n    signature = '_ storeu * v'\n    load_store = True\n    categories = [DocLoadStore]\n    desc = 'Store SIMD vector into unaligned memory.'\n\nclass MaskStoreu1(Operator):\n    signature = '_ mask_storeu1 l * v'\n    load_store = True\n    categories = [DocLoadStore]\n    desc = 'Store active SIMD vector elements into unaligned memory.'\n\nclass Store2u(Operator):\n    signature = '_ store2u * v v'\n    load_store = True\n    categories = [DocLoadStore]\n    desc = 'Store 2 SIMD vectors as array of structures of 2 members into ' + \\\n           'unaligned memory.'\n\nclass Store3u(Operator):\n    full_name = 'store into array of structures'\n    signature = '_ store3u * v v v'\n    load_store = True\n    categories = [DocLoadStore]\n    desc = 'Store 3 SIMD vectors as array of structures of 3 members into ' + \\\n           'unaligned memory.'\n\nclass Store4u(Operator):\n    full_name = 'store into array of structures'\n    signature = '_ store4u * v v v v'\n    load_store = True\n    categories = [DocLoadStore]\n    desc = 'Store 4 SIMD vectors as array of structures of 4 members into ' + \\\n           'unaligned memory.'\n\nclass Storea(Operator):\n    signature = '_ storea * v'\n    load_store = True\n    categories = [DocLoadStore]\n    desc = 'Store SIMD vector into aligned memory.'\n\nclass MaskStorea1(Operator):\n    signature = '_ mask_storea1 l * v'\n    load_store = True\n    categories = [DocLoadStore]\n    desc = 'Store active SIMD vector elements into aligned memory.'\n\nclass Store2a(Operator):\n    full_name = 'store into array of structures'\n    signature = '_ store2a * v v'\n    load_store = True\n    categories = [DocLoadStore]\n    desc = 'Store 2 SIMD vectors as array of structures of 2 members into ' + \\\n           'aligned memory.'\n\nclass Store3a(Operator):\n    full_name = 'store into array of structures'\n    signature = '_ store3a * 
v v v'\n    load_store = True\n    categories = [DocLoadStore]\n    desc = 'Store 3 SIMD vectors as array of structures of 3 members into ' + \\\n           'aligned memory.'\n\nclass Store4a(Operator):\n    full_name = 'store into array of structures'\n    signature = '_ store4a * v v v v'\n    load_store = True\n    categories = [DocLoadStore]\n    desc = 'Store 4 SIMD vectors as array of structures of 4 members into ' + \\\n           'aligned memory.'\n\nclass Gather(Operator):\n    full_name = 'gather elements from memory into a SIMD vector'\n    signature = 'v gather c* vi'\n    load_store = True\n    categories = [DocLoadStore]\n    types = common.ftypes + ['i16', 'u16', 'u32', 'i32', 'i64', 'u64']\n    desc = 'Gather elements from memory with base address given as first ' \\\n           'argument and offsets given as second argument.'\n\nclass GatherLinear(Operator):\n    full_name = 'gather elements from memory into a SIMD vector'\n    signature = 'v gather_linear c* p'\n    load_store = True\n    categories = [DocLoadStore]\n    types = common.types\n    desc = 'Gather elements from memory with base address given as first ' \\\n           'argument and steps given as second argument. This operator ' \\\n           'using a SIMD register.'\n\n#class MaskzGather(Operator):\n#    full_name = 'gather active elements from SIMD vector to memory and put ' \\\n#                'zeros in inactive elements.'\n#    signature = 'v maskz_gather l * vi'\n#    load_store = True\n#    categories = [DocLoadStore]\n#    types = common.ftypes + ['i16', 'u16', 'u32', 'i32', 'i64', 'u64']\n#    desc = 'Gather elements from memory with base address given as second ' \\\n#           'argument and offsets given as third argument. 
Inactive elements ' \\\n#           '(first argument) are set to zero.'\n\n#class MaskoGather(Operator):\n#    full_name = 'gather active elements from SIMD vector to memory and put ' \\\n#                'zeros in inactive elements.'\n#    signature = 'v masko_gather l * vi v'\n#    load_store = True\n#    categories = [DocLoadStore]\n#    types = common.ftypes + ['i16', 'u16', 'u32', 'i32', 'i64', 'u64']\n#    desc = 'Gather elements from memory with base address given as second ' \\\n#           'argument and offsets given as third argument. Inactive elements ' \\\n#           '(first argument) are set to corresponding elements from fourth ' \\\n#           'argument.'\n\nclass Scatter(Operator):\n    full_name = 'scatter elements from SIMD vector to memory'\n    signature = '_ scatter * vi v'\n    load_store = True\n    categories = [DocLoadStore]\n    types = common.ftypes + ['i16', 'u16', 'u32', 'i32', 'i64', 'u64']\n    desc = 'Scatter elements from third argument to memory with base ' \\\n           'address given as first argument and offsets given as second ' \\\n           'argument.'\n\nclass ScatterLinear(Operator):\n    full_name = 'scatter elements from SIMD vector to memory'\n    signature = '_ scatter_linear * p v'\n    load_store = True\n    categories = [DocLoadStore]\n    types = common.types\n    desc = 'Scatter elements from third argument to memory with base ' \\\n           'address given as first argument and steps given as second ' \\\n           'argument. 
This operator avoids using a SIMD register.'\n\n#class MaskScatter(Operator):\n#    full_name = 'scatter active elements from SIMD vector to memory'\n#    signature = '_ mask_scatter l * vi v'\n#    load_store = True\n#    categories = [DocLoadStore]\n#    types = common.ftypes + ['i16', 'u16', 'u32', 'i32', 'i64', 'u64']\n#    desc = 'Scatter active (first argument) elements from fourth argument ' \\\n#           'to memory with base address given as second argument and ' \\\n#           'offsets given as third argument.'\n\nclass Storelu(Operator):\n    full_name = 'store vector of logicals'\n    signature = '_ storelu * l'\n    load_store = True\n    categories = [DocLoadStore]\n    desc = 'Store SIMD vector of booleans into unaligned memory. True is ' + \\\n           'stored as 1 and False as 0.'\n\nclass Storela(Operator):\n    full_name = 'store vector of logicals'\n    signature = '_ storela * l'\n    load_store = True\n    categories = [DocLoadStore]\n    desc = 'Store SIMD vector of booleans into aligned memory. 
True is ' + \\\n           'stored as 1 and False as 0.'\n\nclass Orb(Operator):\n    full_name = 'bitwise or'\n    signature = 'v orb v v'\n    cxx_operator = '|'\n    categories = [DocBitsOperators]\n\nclass Andb(Operator):\n    full_name = 'bitwise and'\n    signature = 'v andb v v'\n    cxx_operator = '&'\n    categories = [DocBitsOperators]\n\nclass Andnotb(Operator):\n    full_name = 'bitwise andnot'\n    signature = 'v andnotb v v'\n    categories = [DocBitsOperators]\n    desc = 'Returns the bitwise andnot of its arguments, more precisely ' \\\n           '\"arg1 and (not arg2)\"'\n\nclass Notb(Operator):\n    full_name = 'bitwise not'\n    signature = 'v notb v'\n    cxx_operator = '~'\n    categories = [DocBitsOperators]\n\nclass Xorb(Operator):\n    full_name = 'bitwise xor'\n    signature = 'v xorb v v'\n    cxx_operator = '^'\n    categories = [DocBitsOperators]\n\nclass Orl(Operator):\n    full_name = 'logical or'\n    signature = 'l orl l l'\n    cxx_operator = '||'\n    categories = [DocLogicalOperators]\n\nclass Andl(Operator):\n    full_name = 'logical and'\n    signature = 'l andl l l'\n    cxx_operator = '&&'\n    categories = [DocLogicalOperators]\n\nclass Andnotl(Operator):\n    full_name = 'logical andnot'\n    signature = 'l andnotl l l'\n    categories = [DocLogicalOperators]\n    desc = 'Returns the logical andnot of its arguments, more precisely ' \\\n           '\"arg1 and (not arg2)\"'\n\nclass Xorl(Operator):\n    full_name = 'logical xor'\n    signature = 'l xorl l l'\n    categories = [DocLogicalOperators]\n\nclass Notl(Operator):\n    full_name = 'logical not'\n    signature = 'l notl l'\n    cxx_operator = '!'\n    categories = [DocLogicalOperators]\n    bench_auto_against_std = True\n\nclass Add(Operator):\n    full_name = 'addition'\n    signature = 'v add v v'\n    cxx_operator = '+'\n    categories = [DocBasicArithmetic]\n    bench_auto_against_std = True\n    bench_auto_against_mipp = True\n\nclass Sub(Operator):\n    
full_name = 'subtraction'\n    signature = 'v sub v v'\n    cxx_operator = '-'\n    categories = [DocBasicArithmetic]\n    bench_auto_against_std = True\n    bench_auto_against_mipp = True\n\nclass Addv(Operator):\n    full_name = 'horizontal sum'\n    signature = 's addv v'\n    categories = [DocMisc]\n    desc = 'Returns the sum of all the elements contained in v'\n    do_bench = False\n    types = common.ftypes\n\nclass Mul(Operator):\n    full_name = 'multiplication'\n    signature = 'v mul v v'\n    cxx_operator = '*'\n    categories = [DocBasicArithmetic]\n\nclass Div(Operator):\n    full_name = 'division'\n    signature = 'v div v v'\n    cxx_operator = '/'\n    domain = [[-20, 20], [0.5, 20]]\n    categories = [DocBasicArithmetic]\n\nclass Neg(Operator):\n    full_name = 'opposite'\n    signature = 'v neg v'\n    cxx_operator = '-'\n    categories = [DocBasicArithmetic]\n\nclass Min(Operator):\n    full_name = 'minimum'\n    signature = 'v min v v'\n    categories = [DocBasicArithmetic]\n\nclass Max(Operator):\n    full_name = 'maximum'\n    signature = 'v max v v'\n    categories = [DocBasicArithmetic]\n\nclass Shr(Operator):\n    full_name = 'right shift in zeros'\n    signature = 'v shr v p'\n    types = common.iutypes\n    cxx_operator = '>>'\n    categories = [DocBitsOperators]\n\nclass Shl(Operator):\n    full_name = 'left shift'\n    signature = 'v shl v p'\n    types = common.iutypes\n    cxx_operator = '<<'\n    categories = [DocBitsOperators]\n\nclass Shra(Operator):\n    full_name = 'arithmetic right shift'\n    signature = 'v shra v p'\n    types = common.iutypes\n    categories = [DocBitsOperators]\n    desc = 'Performs a right shift operation with sign extension.'\n\nclass Eq(Operator):\n    full_name = 'compare for equality'\n    signature = 'l eq v v'\n    cxx_operator = '=='\n    categories = [DocComparison]\n\nclass Ne(Operator):\n    full_name = 'compare for inequality'\n    signature = 'l ne v v'\n    cxx_operator = '!='\n    categories 
= [DocComparison]\n    desc = 'Compare the inputs for inequality.'\n\nclass Gt(Operator):\n    full_name = 'compare for greater-than'\n    signature = 'l gt v v'\n    cxx_operator = '>'\n    categories = [DocComparison]\n    desc = 'Compare the inputs for greater-than.'\n\nclass Ge(Operator):\n    full_name = 'compare for greater-or-equal-than'\n    signature = 'l ge v v'\n    cxx_operator = '>='\n    categories = [DocComparison]\n    desc = 'Compare the inputs for greater-or-equal-than.'\n\nclass Lt(Operator):\n    full_name = 'compare for lesser-than'\n    signature = 'l lt v v'\n    cxx_operator = '<'\n    categories = [DocComparison]\n    desc = 'Compare the inputs for lesser-than.'\n\nclass Le(Operator):\n    full_name = 'compare for lesser-or-equal-than'\n    signature = 'l le v v'\n    cxx_operator = '<='\n    categories = [DocComparison]\n    desc = 'Compare the inputs for lesser-or-equal-than.'\n\nclass If_else1(Operator):\n    full_name = 'blend'\n    signature = 'v if_else1 l v v'\n    categories = [DocMisc]\n    desc = 'Blend the inputs using the vector of logical as a first ' + \\\n           'argument. 
Elements of the second input are taken when the ' + \\\n           'corresponding elements from the vector of logicals is true, ' + \\\n           'otherwise elements of the third input are taken.'\n\nclass Abs(Operator):\n    full_name = 'absolute value'\n    signature = 'v abs v'\n    categories = [DocBasicArithmetic]\n\nclass Fma(Operator):\n    full_name = 'fused multiply-add'\n    signature = 'v fma v v v'\n    categories = [DocBasicArithmetic]\n    desc = 'Multiply the first and second inputs and then adds the third ' + \\\n           'input.'\n    tests_on_integers_only = True\n\nclass Fnma(Operator):\n    full_name = 'fused negate-multiply-add'\n    signature = 'v fnma v v v'\n    categories = [DocBasicArithmetic]\n    desc = 'Multiply the first and second inputs, negate the intermediate ' + \\\n           'result and then adds the third input.'\n    tests_on_integers_only = True\n\nclass Fms(Operator):\n    full_name = 'fused multiply-subtract'\n    signature = 'v fms v v v'\n    categories = [DocBasicArithmetic]\n    desc = 'Subtracts the third input from the product of the first and ' + \\\n           'second inputs.'\n    tests_on_integers_only = True\n\nclass Fnms(Operator):\n    full_name = 'fused negate-multiply-subtract'\n    signature = 'v fnms v v v'\n    categories = [DocBasicArithmetic]\n    desc = 'Multiply the first and second inputs, negate the intermediate ' + \\\n           'result and then subtracts the third input from the ' + \\\n           'intermediate result.'\n    tests_on_integers_only = True\n\nclass Ceil(Operator):\n    full_name = 'rounding up to integer value'\n    signature = 'v ceil v'\n    categories = [DocRounding]\n\nclass Floor(Operator):\n    full_name = 'rounding down to integer value'\n    signature = 'v floor v'\n    categories = [DocRounding]\n\nclass Trunc(Operator):\n    full_name = 'rounding towards zero to integer value'\n    signature = 'v trunc v'\n    categories = [DocRounding]\n\nclass 
Round_to_even(Operator):\n    full_name = 'rounding to nearest integer value, tie to even'\n    signature = 'v round_to_even v'\n    categories = [DocRounding]\n\nclass All(Operator):\n    full_name = 'check all elements'\n    signature = 'p all l'\n    categories = [DocMisc]\n    desc = 'Return true if and only if all elements of the inputs are true.'\n\nclass Any(Operator):\n    full_name = 'check for one true elements'\n    signature = 'p any l'\n    categories = [DocMisc]\n    desc = 'Return true if and only if at least one element of the inputs ' + \\\n           'is true.'\n\nclass Nbtrue(Operator):\n    full_name = 'count true elements'\n    signature = 'p nbtrue l'\n    categories = [DocMisc]\n    desc = 'Return the number of true elements in the input.'\n\nclass Reinterpret(Operator):\n    full_name = 'reinterpret vector'\n    signature = 'v reinterpret v'\n    output_to = common.OUTPUT_TO_SAME_SIZE_TYPES\n    categories = [DocConversion]\n    desc = 'Reinterpret input vector into a different vector type ' + \\\n           'preserving all bits.'\n\nclass Reinterpretl(Operator):\n    full_name = 'reinterpret vector of logicals'\n    signature = 'l reinterpretl l'\n    categories = [DocConversion]\n    output_to = common.OUTPUT_TO_SAME_SIZE_TYPES\n    has_scalar_impl = False\n    desc = 'Reinterpret input vector of logicals into a different vector ' + \\\n           'type of logicals preserving all elements values. The output ' + \\\n           'type must have same length as input type.'\n\nclass Cvt(Operator):\n    full_name = 'convert vector'\n    signature = 'v cvt v'\n    output_to = common.OUTPUT_TO_SAME_SIZE_TYPES\n    categories = [DocConversion]\n    desc = 'Convert input vector into a different vector type. 
The output ' + \\\n           'type must have same length as input type.'\n\nclass Upcvt(Operator):\n    full_name = 'convert vector to larger type'\n    signature = 'vx2 upcvt v'\n    output_to = common.OUTPUT_TO_UP_TYPES\n    types = ['i8', 'u8', 'i16', 'u16', 'f16', 'i32', 'u32', 'f32']\n    categories = [DocConversion]\n    desc = 'Convert input vector into a different larger vector type. The ' + \\\n           'output type must be twice as large as the input type.'\n\nclass Downcvt(Operator):\n    full_name = 'convert vector to narrow type'\n    signature = 'v downcvt v v'\n    output_to = common.OUTPUT_TO_DOWN_TYPES\n    types = ['i16', 'u16', 'f16', 'i32', 'u32', 'f32', 'i64', 'u64', 'f64']\n    categories = [DocConversion]\n    desc = 'Convert input vector into a different narrow vector type. The ' + \\\n           'output type must be half as large as the input type.'\n\nclass Rec(Operator):\n    full_name = 'reciprocal'\n    signature = 'v rec v'\n    types = common.ftypes\n    domain = [[-20, -0.5, 0.5, 20]]\n    categories = [DocBasicArithmetic]\n\nclass Rec11(Operator):\n    full_name = 'reciprocal with relative error at most $2^{-11}$'\n    signature = 'v rec11 v'\n    types = common.ftypes\n    categories = [DocBasicArithmetic]\n    domain = [[-20, -0.5, 0.5, 20]]\n    ufp = { 'f16': 10, 'f32': 10, 'f64': 10 }\n\nclass Rec8(Operator):\n    full_name = 'reciprocal with relative error at most $2^{-8}$'\n    signature = 'v rec8 v'\n    types = common.ftypes\n    categories = [DocBasicArithmetic]\n    domain = [[-20, -0.5, 0.5, 20]]\n    ufp = { 'f16': 7, 'f32': 7, 'f64': 7 }\n\nclass Sqrt(Operator):\n    full_name = 'square root'\n    signature = 'v sqrt v'\n    types = common.ftypes\n    domain = [[0, 20]]\n    categories = [DocBasicArithmetic]\n\nclass Rsqrt11(Operator):\n    full_name = 'reciprocal square root with relative error at most $2^{-11}$'\n    signature = 'v rsqrt11 v'\n    types = common.ftypes\n    domain = [[0.5, 20]]\n    ufp = { 'f16': 10, 'f32': 
10, 'f64': 10 }\n    categories = [DocBasicArithmetic]\n\nclass Rsqrt8(Operator):\n    full_name = 'reciprocal square root with relative error at most $2^{-8}$'\n    signature = 'v rsqrt8 v'\n    types = common.ftypes\n    domain = [[0.5, 20]]\n    ufp = { 'f16': 7, 'f32': 7, 'f64': 7 }\n    categories = [DocBasicArithmetic]\n\nclass Ziplo(Operator):\n    full_name = 'zip low halves'\n    signature = 'v ziplo v v'\n    types = common.types\n    categories = [DocShuffle]\n    desc = 'Construct a vector where elements of the first low half input ' + \\\n           'are followed by the corresponding element of the second low ' + \\\n           'half input.'\n\nclass Ziphi(Operator):\n    full_name = 'zip high halves'\n    signature = 'v ziphi v v'\n    types = common.types\n    categories = [DocShuffle]\n    desc = 'Construct a vector where elements of the first high half ' + \\\n           'input are followed by the corresponding element of the second ' + \\\n           'high half input.'\n\nclass Unziplo(Operator):\n    full_name = 'unziplo'\n    signature = 'v unziplo v v'\n    types = common.types\n    categories = [DocShuffle]\n\nclass Unziphi(Operator):\n    full_name = 'unziphi'\n    signature = 'v unziphi v v'\n    types = common.types\n    categories = [DocShuffle]\n\nclass Zip(Operator):\n    full_name = 'zip'\n    signature = 'vx2 zip v v'\n    types = common.types\n    categories = [DocShuffle]\n\nclass Unzip(Operator):\n    full_name = 'unzip'\n    signature = 'vx2 unzip v v'\n    types = common.types\n    categories = [DocShuffle]\n\nclass ToMask(Operator):\n    full_name = 'build mask from logicals'\n    signature = 'v to_mask l'\n    categories = [DocLogicalOperators]\n    desc = 'Returns a mask consisting of all ones for true elements and ' + \\\n           'all zeros for false elements.'\n\nclass ToLogical(Operator):\n    full_name = 'build logicals from data'\n    signature = 'l to_logical v'\n    categories = [DocLogicalOperators]\n    desc = 'Returns a 
vector of logicals. Set true when the corresponding ' + \\\n           'elements are non zero (at least one bit to 1) and false ' + \\\n           'otherwise.'\n\nclass Iota(Operator):\n    full_name = 'fill vector with increasing values'\n    signature = 'v iota'\n    categories = [DocMisc]\n    desc = 'Returns a vectors whose first element is zero, the second is ' \\\n           'one and so on.'\n\nclass MaskForLoopTail(Operator):\n    full_name = 'build mask for ending loops'\n    signature = 'l mask_for_loop_tail p p'\n    categories = [DocMisc]\n    desc = 'Returns a mask for loading/storing data at loop tails by ' \\\n           'setting the first elements to True and the last to False. ' \\\n           'The first argument is index in a loop whose number of elements ' \\\n           'is given by the second argument.'\n\nclass Adds(Operator):\n    full_name = 'addition using saturation'\n    signature = 'v adds v v'\n    categories = [DocBasicArithmetic]\n    desc = 'Returns the saturated sum of the two vectors given as arguments'\n\nclass Subs(Operator):\n    full_name = 'subtraction using saturation'\n    signature = 'v subs v v'\n    categories = [DocBasicArithmetic]\n    desc = 'Returns the saturated subtraction of the two vectors given as ' \\\n           'arguments'\n\nclass Sin_u35(SrcOperator):\n    full_name = 'sine'\n    signature = 'v sin_u35 v'\n    sleef_symbol_prefix = 'nsimd_sleef_sin_u35'\n    categories = [DocTrigo]\n    desc = 'Compute the sine of its argument with a precision of 3.5 ulps. ' \\\n           'For more informations visit <https://sleef.org/purec.xhtml>.'\n\nclass Cos_u35(SrcOperator):\n    full_name = 'cosine'\n    signature = 'v cos_u35 v'\n    sleef_symbol_prefix = 'nsimd_sleef_cos_u35'\n    categories = [DocTrigo]\n    desc = 'Compute the cosine of its argument with a precision of ' \\\n           '3.5 ulps. 
' \\\n           'For more informations visit <https://sleef.org/purec.xhtml>.'\n\nclass Tan_u35(SrcOperator):\n    full_name = 'tangent'\n    signature = 'v tan_u35 v'\n    sleef_symbol_prefix = 'nsimd_sleef_tan_u35'\n    domain = [[-4.7, -1.6, -1.5, 1.5, 1.6, 4.7]]\n    categories = [DocTrigo]\n    desc = 'Compute the tangent of its argument with a precision of ' \\\n           '3.5 ulps. ' \\\n           'For more informations visit <https://sleef.org/purec.xhtml>.'\n\nclass Asin_u35(SrcOperator):\n    full_name = 'arcsine'\n    signature = 'v asin_u35 v'\n    sleef_symbol_prefix = 'nsimd_sleef_asin_u35'\n    domain = [[-0.9, 0.9]]\n    categories = [DocTrigo]\n    desc = 'Compute the arcsine of its argument with a precision of ' \\\n           '3.5 ulps. ' \\\n           'For more informations visit <https://sleef.org/purec.xhtml>.'\n\nclass Acos_u35(SrcOperator):\n    full_name = 'arccosine'\n    signature = 'v acos_u35 v'\n    sleef_symbol_prefix = 'nsimd_sleef_acos_u35'\n    domain = [[-0.9, 0.9]]\n    categories = [DocTrigo]\n    desc = 'Compute the arccosine of its argument with a ' \\\n           'precision of 3.5 ulps. ' \\\n           'For more informations visit <https://sleef.org/purec.xhtml>.'\n\nclass Atan_u35(SrcOperator):\n    full_name = 'arctangent'\n    signature = 'v atan_u35 v'\n    sleef_symbol_prefix = 'nsimd_sleef_atan_u35'\n    categories = [DocTrigo]\n    desc = 'Compute the arctangent of its argument with a ' \\\n           'precision of 3.5 ulps. ' \\\n           'For more informations visit <https://sleef.org/purec.xhtml>.'\n\nclass Atan2_u35(SrcOperator):\n    full_name = 'arctangent'\n    signature = 'v atan2_u35 v v'\n    sleef_symbol_prefix = 'nsimd_sleef_atan2_u35'\n    domain = [[-20, 20], [-20, -0.5, 0.5, 20]]\n    categories = [DocTrigo]\n    desc = 'Compute the arctangent of its argument with a ' \\\n           'precision of 3.5 ulps. 
' \\\n           'For more informations visit <https://sleef.org/purec.xhtml>.'\n\nclass Log_u35(SrcOperator):\n    full_name = 'natural logarithm'\n    signature = 'v log_u35 v'\n    sleef_symbol_prefix = 'nsimd_sleef_log_u35'\n    domain = [[0.5, 20]]\n    categories = [DocExpLog]\n    desc = 'Compute the natural logarithm of its argument with a ' \\\n           'precision of 3.5 ulps. ' \\\n           'For more informations visit <https://sleef.org/purec.xhtml>.'\n\nclass Cbrt_u35(SrcOperator):\n    full_name = 'cube root'\n    signature = 'v cbrt_u35 v'\n    sleef_symbol_prefix = 'nsimd_sleef_cbrt_u35'\n    categories = [DocBasicArithmetic]\n    desc = 'Compute the cube root of its argument with a precision of ' \\\n           '3.5 ulps. ' \\\n           'For more informations visit <https://sleef.org/purec.xhtml>.'\n\nclass Sin_u10(SrcOperator):\n    full_name = 'sine'\n    signature = 'v sin_u10 v'\n    sleef_symbol_prefix = 'nsimd_sleef_sin_u10'\n    categories = [DocTrigo]\n    desc = 'Compute the sine of its argument with a precision of 1.0 ulps. ' \\\n           'For more informations visit <https://sleef.org/purec.xhtml>.'\n\nclass Cos_u10(SrcOperator):\n    full_name = 'cosine'\n    signature = 'v cos_u10 v'\n    sleef_symbol_prefix = 'nsimd_sleef_cos_u10'\n    categories = [DocTrigo]\n    desc = 'Compute the cosine of its argument with a precision of ' \\\n           '1.0 ulps. ' \\\n           'For more informations visit <https://sleef.org/purec.xhtml>.'\n\nclass Tan_u10(SrcOperator):\n    full_name = 'tangent'\n    signature = 'v tan_u10 v'\n    sleef_symbol_prefix = 'nsimd_sleef_tan_u10'\n    domain = [[-4.7, -1.6, -1.5, 1.5, 1.6, 4.7]]\n    categories = [DocTrigo]\n    desc = 'Compute the tangent of its argument with a precision of ' \\\n           '1.0 ulps. 
' \\\n           'For more informations visit <https://sleef.org/purec.xhtml>.'\n\nclass Asin_u10(SrcOperator):\n    full_name = 'arcsine'\n    signature = 'v asin_u10 v'\n    sleef_symbol_prefix = 'nsimd_sleef_asin_u10'\n    domain = [[-0.9, 0.9]]\n    categories = [DocTrigo]\n    desc = 'Compute the arcsine of its argument with a precision of ' \\\n           '1.0 ulps. ' \\\n           'For more informations visit <https://sleef.org/purec.xhtml>.'\n\nclass Acos_u10(SrcOperator):\n    full_name = 'arccosine'\n    signature = 'v acos_u10 v'\n    sleef_symbol_prefix = 'nsimd_sleef_acos_u10'\n    domain = [[-0.9, 0.9]]\n    categories = [DocTrigo]\n    desc = 'Compute the arccosine of its argument with a precision of ' \\\n           '1.0 ulps. ' \\\n           'For more informations visit <https://sleef.org/purec.xhtml>.'\n\nclass Atan_u10(SrcOperator):\n    full_name = 'arctangent'\n    signature = 'v atan_u10 v'\n    sleef_symbol_prefix = 'nsimd_sleef_atan_u10'\n    categories = [DocTrigo]\n    desc = 'Compute the arctangent of its argument with a precision of ' \\\n           '1.0 ulps. ' \\\n           'For more informations visit <https://sleef.org/purec.xhtml>.'\n\nclass Atan2_u10(SrcOperator):\n    full_name = 'arctangent'\n    signature = 'v atan2_u10 v v'\n    sleef_symbol_prefix = 'nsimd_sleef_atan2_u10'\n    domain = [[-20, 20], [-20, -0.5, 0.5, 20]]\n    categories = [DocTrigo]\n    desc = 'Compute the arctangent of its argument with a precision of ' \\\n           '1.0 ulps. ' \\\n           'For more informations visit <https://sleef.org/purec.xhtml>.'\n\nclass Log_u10(SrcOperator):\n    full_name = 'natural logarithm'\n    signature = 'v log_u10 v'\n    sleef_symbol_prefix = 'nsimd_sleef_log_u10'\n    domain = [[0.5, 20]]\n    categories = [DocExpLog]\n    desc = 'Compute the natural logarithm of its argument with a ' \\\n           'precision of 1.0 ulps. 
' \\\n           'For more informations visit <https://sleef.org/purec.xhtml>.'\n\nclass Cbrt_u10(SrcOperator):\n    full_name = 'cube root'\n    signature = 'v cbrt_u10 v'\n    sleef_symbol_prefix = 'nsimd_sleef_cbrt_u10'\n    categories = [DocBasicArithmetic]\n    desc = 'Compute the cube root of its argument with a precision of ' \\\n           '1.0 ulps. ' \\\n           'For more informations visit <https://sleef.org/purec.xhtml>.'\n\nclass Exp_u10(SrcOperator):\n    full_name = 'base-e exponential'\n    signature = 'v exp_u10 v'\n    sleef_symbol_prefix = 'nsimd_sleef_exp_u10'\n    domain = [[-20, 5]]\n    categories = [DocExpLog]\n    desc = 'Compute the base-e exponential of its argument with a ' \\\n           'precision of 1.0 ulps. ' \\\n           'For more informations visit <https://sleef.org/purec.xhtml>.'\n\nclass Pow_u10(SrcOperator):\n    full_name = 'power'\n    signature = 'v pow_u10 v v'\n    sleef_symbol_prefix = 'nsimd_sleef_pow_u10'\n    domain = [[0, 5], [-5, 5]]\n    categories = [DocExpLog]\n    desc = 'Compute the power of its argument with a precision of 1.0 ulps. ' \\\n           'For more informations visit <https://sleef.org/purec.xhtml>.'\n\nclass Sinh_u10(SrcOperator):\n    full_name = 'hyperbolic sine'\n    signature = 'v sinh_u10 v'\n    sleef_symbol_prefix = 'nsimd_sleef_sinh_u10'\n    categories = [DocHyper]\n    desc = 'Compute the hyperbolic sine of its argument with a ' \\\n           'precision of 1.0 ulps. ' \\\n           'For more informations visit <https://sleef.org/purec.xhtml>.'\n\nclass Cosh_u10(SrcOperator):\n    full_name = 'hyperbolic cosine'\n    signature = 'v cosh_u10 v'\n    sleef_symbol_prefix = 'nsimd_sleef_cosh_u10'\n    categories = [DocHyper]\n    desc = 'Compute the hyperbolic cosine of its argument with a ' \\\n           'precision of 1.0 ulps. 
' \\\n           'For more informations visit <https://sleef.org/purec.xhtml>.'\n\nclass Tanh_u10(SrcOperator):\n    full_name = 'hyperbolic tangent'\n    signature = 'v tanh_u10 v'\n    sleef_symbol_prefix = 'nsimd_sleef_tanh_u10'\n    categories = [DocHyper]\n    desc = 'Compute the hyperbolic tangent of its argument with a ' \\\n           'precision of 1.0 ulps. ' \\\n           'For more informations visit <https://sleef.org/purec.xhtml>.'\n\nclass Sinh_u35(SrcOperator):\n    full_name = 'hyperbolic sine'\n    signature = 'v sinh_u35 v'\n    sleef_symbol_prefix = 'nsimd_sleef_sinh_u35'\n    categories = [DocHyper]\n    desc = 'Compute the hyperbolic sine of its argument with a ' \\\n           'precision of 3.5 ulps. ' \\\n           'For more informations visit <https://sleef.org/purec.xhtml>.'\n\nclass Cosh_u35(SrcOperator):\n    full_name = 'hyperbolic cosine'\n    signature = 'v cosh_u35 v'\n    sleef_symbol_prefix = 'nsimd_sleef_cosh_u35'\n    categories = [DocHyper]\n    desc = 'Compute the hyperbolic cosine of its argument with a ' \\\n           'precision of 3.5 ulps. ' \\\n           'For more informations visit <https://sleef.org/purec.xhtml>.'\n\nclass Tanh_u35(SrcOperator):\n    full_name = 'hyperbolic tangent'\n    signature = 'v tanh_u35 v'\n    sleef_symbol_prefix = 'nsimd_sleef_tanh_u35'\n    categories = [DocHyper]\n    desc = 'Compute the hyperbolic tangent of its argument with a ' \\\n           'precision of 3.5 ulps. ' \\\n           'For more informations visit <https://sleef.org/purec.xhtml>.'\n\nclass Asinh_u10(SrcOperator):\n    full_name = 'inverse hyperbolic sine'\n    signature = 'v asinh_u10 v'\n    sleef_symbol_prefix = 'nsimd_sleef_asinh_u10'\n    categories = [DocHyper]\n    desc = 'Compute the inverse hyperbolic sine of its argument with a ' \\\n           'precision of 1.0 ulps. 
' \\\n           'For more informations visit <https://sleef.org/purec.xhtml>.'\n\nclass Acosh_u10(SrcOperator):\n    full_name = 'inverse hyperbolic cosine'\n    signature = 'v acosh_u10 v'\n    sleef_symbol_prefix = 'nsimd_sleef_acosh_u10'\n    categories = [DocHyper]\n    domain = [[1, 20]]\n    desc = 'Compute the inverse hyperbolic cosine of its argument with a ' \\\n           'precision of 1.0 ulps. ' \\\n           'For more informations visit <https://sleef.org/purec.xhtml>.'\n\nclass Atanh_u10(SrcOperator):\n    full_name = 'inverse hyperbolic tangent'\n    signature = 'v atanh_u10 v'\n    sleef_symbol_prefix = 'nsimd_sleef_atanh_u10'\n    domain = [[-0.9, 0.9]]\n    categories = [DocHyper]\n    desc = 'Compute the inverse hyperbolic tangent of its argument with a ' \\\n           'precision of 1.0 ulps. ' \\\n           'For more informations visit <https://sleef.org/purec.xhtml>.'\n\nclass Exp2_u10(SrcOperator):\n    full_name = 'base-2 exponential'\n    signature = 'v exp2_u10 v'\n    sleef_symbol_prefix = 'nsimd_sleef_exp2_u10'\n    domain = [[-20, 5]]\n    categories = [DocExpLog]\n    desc = 'Compute the base-2 exponential of its argument with a ' \\\n           'precision of 1.0 ulps. ' \\\n           'For more informations visit <https://sleef.org/purec.xhtml>.'\n\nclass Exp2_u35(SrcOperator):\n    full_name = 'base-2 exponential'\n    signature = 'v exp2_u35 v'\n    sleef_symbol_prefix = 'nsimd_sleef_exp2_u35'\n    domain = [[-20, 5]]\n    categories = [DocExpLog]\n    desc = 'Compute the base-2 exponential of its argument with a ' \\\n           'precision of 3.5 ulps. 
' \\\n           'For more informations visit <https://sleef.org/purec.xhtml>.'\n\nclass Exp10_u10(SrcOperator):\n    full_name = 'base-10 exponential'\n    signature = 'v exp10_u10 v'\n    sleef_symbol_prefix = 'nsimd_sleef_exp10_u10'\n    domain = [[-5, 3]]\n    categories = [DocExpLog]\n    desc = 'Compute the base-10 exponential of its argument with a ' \\\n           'precision of 1.0 ulps. ' \\\n           'For more informations visit <https://sleef.org/purec.xhtml>.'\n\nclass Exp10_u35(SrcOperator):\n    full_name = 'base-10 exponential'\n    signature = 'v exp10_u35 v'\n    sleef_symbol_prefix = 'nsimd_sleef_exp10_u35'\n    domain = [[-5, 3]]\n    categories = [DocExpLog]\n    desc = 'Compute the base-10 exponential of its argument with a ' \\\n           'precision of 3.5 ulps. ' \\\n           'For more informations visit <https://sleef.org/purec.xhtml>.'\n\nclass Expm1_u10(SrcOperator):\n    full_name = 'exponential minus 1'\n    signature = 'v expm1_u10 v'\n    sleef_symbol_prefix = 'nsimd_sleef_expm1_u10'\n    domain = [[-5, 3]]\n    categories = [DocExpLog]\n    desc = 'Compute the exponential minus 1 of its argument with a ' \\\n           'precision of 1.0 ulps. ' \\\n           'For more informations visit <https://sleef.org/purec.xhtml>.'\n\nclass Log10_u10(SrcOperator):\n    full_name = 'base-10 logarithm'\n    signature = 'v log10_u10 v'\n    sleef_symbol_prefix = 'nsimd_sleef_log10_u10'\n    domain = [[0.5, 20]]\n    categories = [DocExpLog]\n    desc = 'Compute the base-10 logarithm of its argument with a precision ' \\\n           'of 1.0 ulps. ' \\\n           'For more informations visit <https://sleef.org/purec.xhtml>.'\n\nclass Log2_u10(SrcOperator):\n    full_name = 'base-2 logarithm'\n    signature = 'v log2_u10 v'\n    sleef_symbol_prefix = 'nsimd_sleef_log2_u10'\n    domain = [[0.5, 20]]\n    categories = [DocExpLog]\n    desc = 'Compute the base-2 logarithm of its argument with a precision ' \\\n           'of 1.0 ulps. 
' \\\n           'For more informations visit <https://sleef.org/purec.xhtml>.'\n\nclass Log2_u35(SrcOperator):\n    full_name = 'base-2 logarithm'\n    signature = 'v log2_u35 v'\n    sleef_symbol_prefix = 'nsimd_sleef_log2_u35'\n    domain = [[0.5, 20]]\n    categories = [DocExpLog]\n    desc = 'Compute the base-2 logarithm of its argument with a ' \\\n           'precision of 3.5 ulps. ' \\\n           'For more informations visit <https://sleef.org/purec.xhtml>.'\n\nclass Log1p_u10(SrcOperator):\n    full_name = 'logarithm of 1 plus argument'\n    signature = 'v log1p_u10 v'\n    sleef_symbol_prefix = 'nsimd_sleef_log1p_u10'\n    domain = [[-0.5, 19]]\n    categories = [DocExpLog]\n    desc = 'Compute the logarithm of 1 plus argument of its argument with ' \\\n           'a precision of 1.0 ulps. ' \\\n           'For more informations visit <https://sleef.org/purec.xhtml>.'\n\nclass Sinpi_u05(SrcOperator):\n    full_name = 'sine of pi times argument'\n    signature = 'v sinpi_u05 v'\n    sleef_symbol_prefix = 'nsimd_sleef_sinpi_u05'\n    categories = [DocTrigo]\n    desc = 'Compute the sine of pi times argument of its argument with a ' \\\n           'precision of 0.5 ulps. ' \\\n           'For more informations visit <https://sleef.org/purec.xhtml>.'\n\nclass Cospi_u05(SrcOperator):\n    full_name = 'cosine of pi times argument'\n    signature = 'v cospi_u05 v'\n    sleef_symbol_prefix = 'nsimd_sleef_cospi_u05'\n    categories = [DocTrigo]\n    desc = 'Compute the cosine of pi times argument of its argument with ' \\\n           'a precision of 0.5 ulps. ' \\\n           'For more informations visit <https://sleef.org/purec.xhtml>.'\n\nclass Hypot_u05(SrcOperator):\n    full_name = 'Euclidean distance'\n    signature = 'v hypot_u05 v v'\n    sleef_symbol_prefix = 'nsimd_sleef_hypot_u05'\n    categories = [DocBasicArithmetic]\n    desc = 'Compute the Euclidean distance of its argument with a ' \\\n           'precision of 0.5 ulps. 
' \\\n           'For more informations visit <https://sleef.org/purec.xhtml>.'\n\nclass Hypot_u35(SrcOperator):\n    full_name = 'Euclidean distance'\n    signature = 'v hypot_u35 v v'\n    sleef_symbol_prefix = 'nsimd_sleef_hypot_u35'\n    categories = [DocBasicArithmetic]\n    desc = 'Compute the Euclidean distance of its argument with a ' \\\n           'precision of 3.5 ulps. ' \\\n           'For more informations visit <https://sleef.org/purec.xhtml>.'\n\nclass Remainder(SrcOperator):\n    full_name = 'floating-point remainder'\n    signature = 'v remainder v v'\n    sleef_symbol_prefix = 'nsimd_sleef_remainder'\n    domain = [[1, 20], [1, 20]]\n    categories = [DocBasicArithmetic]\n    desc = 'Compute the floating-point remainder of its arguments. ' \\\n           'For more informations visit <https://sleef.org/purec.xhtml>.'\n\nclass Fmod(SrcOperator):\n    full_name = 'floating-point remainder'\n    signature = 'v fmod v v'\n    sleef_symbol_prefix = 'nsimd_sleef_fmod'\n    domain = [[1, 20], [1, 20]]\n    categories = [DocBasicArithmetic]\n    desc = 'Compute the floating-point remainder of its argument. ' \\\n           'For more informations visit <https://sleef.org/purec.xhtml>.'\n\nclass Lgamma_u10(SrcOperator):\n    full_name = 'log gamma'\n    signature = 'v lgamma_u10 v'\n    sleef_symbol_prefix = 'nsimd_sleef_lgamma_u10'\n    domain = [[0.5, 20]]\n    categories = [DocExpLog]\n    desc = 'Compute the log gamma of its argument with a precision of ' \\\n           '1.0 ulps. ' \\\n           'For more informations visit <https://sleef.org/purec.xhtml>.'\n\nclass Tgamma_u10(SrcOperator):\n    full_name = 'true gamma'\n    signature = 'v tgamma_u10 v'\n    sleef_symbol_prefix = 'nsimd_sleef_tgamma_u10'\n    domain = [[0.5, 5]]\n    categories = [DocExpLog]\n    desc = 'Compute the true gamma of its argument with a precision of ' \\\n           '1.0 ulps. 
' \\\n           'For more informations visit <https://sleef.org/purec.xhtml>.'\n\nclass Erf_u10(SrcOperator):\n    full_name = 'error function'\n    signature = 'v erf_u10 v'\n    sleef_symbol_prefix = 'nsimd_sleef_erf_u10'\n    categories = [DocExpLog]\n    desc = 'Compute the error function of its argument with a ' \\\n           'precision of 1.0 ulps. ' \\\n           'For more informations visit <https://sleef.org/purec.xhtml>.'\n\nclass Erfc_u15(SrcOperator):\n    full_name = 'complementary error function'\n    signature = 'v erfc_u15 v'\n    sleef_symbol_prefix = 'nsimd_sleef_erfc_u15'\n    categories = [DocExpLog]\n    desc = 'Compute the complementary error function of its argument ' \\\n           'with a precision of 1.5 ulps. ' \\\n           'For more informations visit <https://sleef.org/purec.xhtml>.'\n\n
  },
  {
    "path": "egg/platform_arm.py",
    "content": "# Copyright (c) 2020 Agenium Scale\n#\n# Permission is hereby granted, free of charge, to any person obtaining a copy\n# of this software and associated documentation files (the \"Software\"), to deal\n# in the Software without restriction, including without limitation the rights\n# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell\n# copies of the Software, and to permit persons to whom the Software is\n# furnished to do so, subject to the following conditions:\n#\n# The above copyright notice and this permission notice shall be included in all\n# copies or substantial portions of the Software.\n#\n# THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\n# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\n# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\n# SOFTWARE.\n\n# This file gives the implementation of platform ARM, i.e. ARM SIMD.\n# Reading this file is rather straightforward. ARM SIMD extensions are rather\n# coherent and consistent. It implements the following architectures:\n#   - ARMv7   -> 128 bits registers without f16 and f64 support\n#   - Aarch32 -> 128 bits registers with optional f16 and without f64 support\n#   - Aarch64 -> 128 bits registers with optional f16 and f64 support\n#   - SVE     -> up to 2048 bits registers\n# The first three SIMD extensions are collectively called NEON. Aarch32 and\n# Aarch64 correspond respectively to ARMv8 32 and 64 bits chips. 
Note that\n# the ARM documentation says that ARMv7, Aarch32 are different but it seems\n# that they differ by only a handful of intrinsics which are not in the scope\n# of NSIMD so we have implemented the following:\n#\n#   - ARMv7   \\  -> neon128\n#   - Aarch32 /\n#   - Aarch64    -> aarch64\n#   - SVE        -> sve\n\nimport common\n\n# -----------------------------------------------------------------------------\n# Helpers\n\n\ndef neon_typ(typ):\n    prefix = {'i': 'int', 'u': 'uint', 'f': 'float'}\n    return '{}{}x{}_t'.format(prefix[typ[0]], typ[1:], 128 // int(typ[1:]))\n\ndef half_neon64_typ(typ):\n    prefix = {'i': 'int', 'u': 'uint', 'f': 'float'}\n    return '{}{}x{}_t'.format(prefix[typ[0]], typ[1:], 64 // int(typ[1:]))\n\n\ndef sve_typ(typ):\n    prefix = {'i': 'svint', 'u': 'svuint', 'f': 'svfloat'}\n    return '{}{}_t'.format(prefix[typ[0]], typ[1:])\n\ndef suf(typ):\n    if typ[0] == 'i':\n        return 's{}'.format(typ[1:])\n    else:\n        return typ\n\nneon = ['neon128', 'aarch64']\nfixed_sized_sve = ['sve128', 'sve256', 'sve512', 'sve1024', 'sve2048']\nsve = ['sve'] + fixed_sized_sve\nfmtspec = {}\n\ndef convert_from_predicate(opts, op):\n    if opts.sve_emulate_bool:\n        return '''svsel({op},\n                    svdup_n_u{typnbits}_x({svtrue}, (u{typnbits})~0),\n                    svdup_n_u{typnbits}_x({svtrue}, 0))'''. 
\\\n                            format(op=op, **fmtspec)\n    else:\n        return op\n\ndef convert_to_predicate(opts, op):\n    if opts.sve_emulate_bool:\n        # TODO: the casts are a workaround to avoid a bug in gcc trunk for sve\n        # it needs to be deleted when the bug is corrected\n        return '''svcmpeq({svtrue}, (svuint{typnbits}_t){op},\n                          svdup_n_u{typnbits}_x({svtrue},\n                          (u{typnbits})~0))'''.format(op=op, **fmtspec)\n    else:\n        return op\n\n# -----------------------------------------------------------------------------\n# Implementation of mandatory functions for this module\n\ndef get_simd_exts():\n    return ['neon128', 'aarch64', 'sve', 'sve128', 'sve256', 'sve512',\n            'sve1024', 'sve2048']\n\ndef get_prev_simd_ext(simd_ext):\n    if simd_ext in ['neon128', 'aarch64']:\n        return 'cpu'\n    elif simd_ext in sve:\n        return 'aarch64'\n    raise ValueError('Unknown SIMD extension \"{}\"'.format(simd_ext))\n\ndef emulate_fp16(simd_ext):\n    if not simd_ext in get_simd_exts():\n        raise ValueError('Unknown SIMD extension \"{}\"'.format(simd_ext))\n    if simd_ext in sve:\n        return False\n    else:\n        return True\n\ndef get_type(opts, simd_ext, typ, nsimd_typ):\n    if simd_ext in neon:\n        if typ == 'f64':\n            if simd_ext == 'neon128':\n                return 'typedef struct {{ double v0; double v1; }} {};'. 
\\\n                       format(nsimd_typ)\n            else:\n                return 'typedef {} {};'.format(neon_typ('f64'), nsimd_typ)\n        elif typ == 'f16':\n            return '''\n                   #ifdef NSIMD_ARM_FP16\n                     typedef float16x8_t {nsimd_typ};\n                   #else\n                     typedef struct {{ float32x4_t v0; float32x4_t v1; }}\n                         {nsimd_typ};\n                   #endif\n                   '''.format(nsimd_typ=nsimd_typ) # extra \\n are necessary\n        else:\n            return 'typedef {} {};'.format(neon_typ(typ), nsimd_typ)\n    elif simd_ext == 'sve':\n        return 'typedef {} {};'.format(sve_typ(typ), nsimd_typ)\n    elif simd_ext in fixed_sized_sve:\n        return 'typedef {} {} __attribute__((arm_sve_vector_bits({})));'. \\\n               format(sve_typ(typ), nsimd_typ, simd_ext[3:])\n    else:\n        raise ValueError('Unknown SIMD extension \"{}\"'.format(simd_ext))\n\ndef get_logical_type(opts, simd_ext, typ, nsimd_typ):\n    if typ not in common.types:\n        raise ValueError('Unknown type \"{}\"'.format(typ))\n    if simd_ext not in get_simd_exts():\n        raise ValueError('Unknown SIMD extension \"{}\"'.format(simd_ext))\n\n    if typ in common.ftypes + common.itypes:\n        typ2 = 'u{}'.format(typ[1:]);\n    else:\n        typ2 = typ\n\n    if simd_ext == 'neon128':\n        if typ == 'f16':\n            return \\\n            '''\n            #ifdef NSIMD_ARM_FP16\n              typedef uint16x8_t {nsimd_typ};\n            #else\n              typedef struct {{ uint32x4_t v0; uint32x4_t v1; }} {nsimd_typ};\n            #endif\n            '''.format(nsimd_typ=nsimd_typ) # extra \\n are necessary\n        elif typ == 'f64':\n            return 'typedef struct {{ u64 v0; u64 v1; }} {};'.format(nsimd_typ)\n        else:\n            return get_type(opts, simd_ext, typ2, nsimd_typ)\n    if simd_ext == 'aarch64':\n        if typ == 'f16':\n            return 
get_logical_type(opts, 'neon128', 'f16', nsimd_typ)\n        else:\n            return get_type(opts, simd_ext, typ2, nsimd_typ)\n    elif simd_ext in sve:\n        if opts.sve_emulate_bool:\n            return get_type(opts, simd_ext, 'u' + typ[1:], nsimd_typ)\n        elif simd_ext in fixed_sized_sve:\n            return \\\n            'typedef svbool_t {} __attribute__((arm_sve_vector_bits({})));'. \\\n            format(nsimd_typ, simd_ext[3:])\n        else:\n            return 'typedef svbool_t {};'.format(nsimd_typ)\n\ndef get_nb_registers(simd_ext):\n    if simd_ext in neon:\n        return '16'\n    elif simd_ext in sve:\n        return '32'\n    else:\n        raise ValueError('Unknown SIMD extension \"{}\"'.format(simd_ext))\n\n\ndef get_native_soa_typ(simd_ext, typ, deg):\n    prefix = { 'i': 'int', 'u': 'uint', 'f': 'float' }[typ[0]]\n    if simd_ext in sve:\n        return 'sv{}x{}_t'.format(prefix + typ[1:], deg)\n    else:\n        return '{}{}x{}x{}_t'.format(prefix, typ[1:], 128 // int(typ[1:]),\n                                     deg)\n\n\ndef get_SoA_type(simd_ext, typ, deg, nsimd_typ):\n    if simd_ext != 'sve':\n        raise ValueError('SIMD extension must be \"sve\"')\n    prefix = { 'i': 'int', 'u': 'uint', 'f': 'float' }[typ[0]]\n    return 'typedef {} {};'.format(get_native_soa_typ(simd_ext, typ, deg),\n                                   nsimd_typ)\n\n\ndef has_compatible_SoA_types(simd_ext):\n    if simd_ext not in neon + sve:\n        raise ValueError('Unknown SIMD extension \"{}\"'.format(simd_ext))\n    return False\n\n# -----------------------------------------------------------------------------\n\ndef get_additional_include(func, platform, simd_ext):\n    ret = '''#include <nsimd/cpu/cpu/{}.h>\n             '''.format(func)\n    if simd_ext in sve:\n        ret += '''#include <nsimd/arm/aarch64/{}.h>\n                  '''.format(func)\n    if func in ['load2u', 'load3u', 'load4u', 'load2a', 'load3a', 'load4a']:\n        deg = 
func[4]\n        ret += '''#if NSIMD_CXX > 0\n                  extern \"C\" {{\n                  #endif\n\n                  NSIMD_INLINE nsimd_{simd_ext}_vu16x{deg}\n                  nsimd_{func}_{simd_ext}_u16(const u16*);\n\n                  # if NSIMD_CXX > 0\n                  }} // extern \"C\"\n                  #endif\n\n                  '''.format(func=func, deg=deg, simd_ext=simd_ext)\n    if func in ['mask_storea1', 'mask_storeu1', 'masko_loada1',\n                'masko_loadu1', 'maskz_loada1', 'maskz_loadu1'] and \\\n                simd_ext not in sve:\n        ret += '''#include <nsimd/scalar_utilities.h>\n                  '''\n    if func == 'mask_for_loop_tail' and simd_ext not in sve:\n        ret += '''#include <nsimd/arm/{simd_ext}/set1.h>\n                  #include <nsimd/arm/{simd_ext}/set1l.h>\n                  #include <nsimd/arm/{simd_ext}/iota.h>\n                  #include <nsimd/arm/{simd_ext}/lt.h>\n                  '''.format(simd_ext=simd_ext)\n    if simd_ext == 'neon128' and func == 'notl':\n        ret += '''#include <nsimd/arm/neon128/notb.h>\n                  '''\n    if simd_ext in neon and func == 'ne':\n        ret += '''#include <nsimd/arm/{simd_ext}/eq.h>\n                  # include <nsimd/arm/{simd_ext}/notl.h>\n                  '''.format(simd_ext=simd_ext)\n    if simd_ext in neon and func in ['fms', 'fnms']:\n        ret += '''#include <nsimd/arm/{simd_ext}/ne.h>\n                  #include <nsimd/arm/{simd_ext}/fma.h>\n                  #include <nsimd/arm/{simd_ext}/fnma.h>\n                  '''.format(simd_ext=simd_ext)\n    if func == 'shra':\n        ret += '''#include <nsimd/arm/{simd_ext}/shr.h>\n        '''.format(simd_ext=simd_ext)\n\n    if func in ['loadlu', 'loadla']:\n        ret += '''#include <nsimd/arm/{simd_ext}/eq.h>\n                  # include <nsimd/arm/{simd_ext}/set1.h>\n                  # include <nsimd/arm/{simd_ext}/{load}.h>\n                  # include 
<nsimd/arm/{simd_ext}/notl.h>\n                  '''.format(load='load' + func[5], simd_ext=simd_ext)\n    if func in ['storelu', 'storela']:\n        ret += '''#include <nsimd/arm/{simd_ext}/if_else1.h>\n                  # include <nsimd/arm/{simd_ext}/set1.h>\n                  # include <nsimd/arm/{simd_ext}/{store}.h>\n                  '''.format(store='store' + func[6], simd_ext=simd_ext)\n    if func == 'to_logical':\n        ret += '''#include <nsimd/arm/{simd_ext}/reinterpret.h>\n                  #include <nsimd/arm/{simd_ext}/ne.h>\n                  ''' .format(simd_ext=simd_ext)\n    if func == 'zip':\n        ret += '''#include <nsimd/arm/{simd_ext}/ziplo.h>\n                  #include <nsimd/arm/{simd_ext}/ziphi.h>\n                  '''.format(simd_ext=simd_ext)\n    if func == 'unzip':\n        ret += '''#include <nsimd/arm/{simd_ext}/unziplo.h>\n                  #include <nsimd/arm/{simd_ext}/unziphi.h>\n                  '''.format(simd_ext=simd_ext)\n    if func == 'adds':\n        ret += '''#include <nsimd/arm/{simd_ext}/add.h>\n                  '''.format(simd_ext=simd_ext)\n    if func == 'subs':\n        ret += '''#include <nsimd/arm/{simd_ext}/sub.h>\n                  '''.format(simd_ext=simd_ext)\n    if func in ['gather', 'scatter'] and simd_ext == 'sve':\n        ret += '''#include <nsimd/arm/sve/len.h>\n                  '''\n    return ret\n\n# -----------------------------------------------------------------------------\n# Emulators\n\ndef emulate_op1(op, simd_ext, typ):\n    if simd_ext in neon:\n        le = 128 // int(typ[1:]);\n        return '''int i;\n                  {typ} buf[{le}];\n                  vst1q_{suf}(buf, {in0});\n                  for (i=0; i < {le}; i += nsimd_len_cpu_{typ}()) {{\n                    nsimd_storeu_cpu_{typ}( & buf[i], nsimd_{op}_cpu_{typ}(\n                      nsimd_loadu_cpu_{typ}(&buf[i])));}}\n                  return vld1q_{suf}(buf); '''. 
\\\n                  format(op=op, le=le, **fmtspec)\n    if simd_ext in sve:\n        le = 2048 // int(typ[1:]);\n        return '''int i;\n                  {typ} buf[{le}];\n                  svst1_{suf}({svtrue}, buf, {in0});\n                  for (i=0; i < simd_len_{simd_ext}_{typ}();\n                       i += nsimd_len_cpu_{typ}()) {{\n                    nsimd_storeu_cpu_{typ}( & buf[i], nsimd_{op}_cpu_{typ}(\n                      nsimd_loadu_cpu_{typ}(&buf[i])));}}\n                  return svld1_{suf}({svtrue}, buf); '''. \\\n                  format(op=op, le=le, **fmtspec)\n\ndef emulate_op2(op, simd_ext, typ):\n    if simd_ext in neon:\n        le = 128 // int(typ[1:]);\n        return '''int i;\n                  {typ} buf0[{le}], buf1[{le}];\n                  vst1q_{suf}(buf0, {in0});\n                  vst1q_{suf}(buf1, {in1});\n                  for (i=0; i < {le}; i++) {{\n                    buf0[i] = ({typ})(buf0[i] {op} buf1[i]);}}\n                  return vld1q_{suf}(buf0); '''. \\\n                  format(op=op, le=le, **fmtspec)\n    if simd_ext in sve:\n        le = 2048 // int(typ[1:]);\n        return '''int i;\n                  {typ} buf0[{le}], buf1[{le}];\n                  svst1_{suf}({svtrue}, buf0, {in0});\n                  svst1_{suf}({svtrue}, buf1, {in1});\n                  for (i=0; i < nsimd_len_{simd_ext}_{typ}(); i++) {{\n                    buf0[i] = ({typ})(buf0[i] {op} buf1[i]);}}\n                  return svld1_{suf}({svtrue}, buf0); '''. \\\n                  format(op=op, le=le, **fmtspec)\n\ndef emulate_lop2_neon(opts, op, simd_ext, typ):\n    le = 128 // int(typ[1:]);\n    ltyp = get_logical_type(opts, simd_ext, typ)\n    lsuf = suf(ltyp)\n    return '''int i;\n              {ltyp} buf0[{le}], buf1[{le}];\n              vst1q_{lsuf}(buf0, {in0});\n              vst1q_{lsuf}(buf1, {in1});\n              for (i = 0; i < {le}; i++) {{\n                buf0[i] = buf0[i] {op} buf1[i] ? 
({ltyp})-1 : 0;\n              }}\n              return vld1q_{lsuf}(buf0);'''. \\\n              format(op=op, le=le, ltyp=ltyp, lsuf=lsuf, **fmtspec)\n\ndef emulate_op3_neon(op, simd_ext, typ):\n    le = 128 // int(typ[1:]);\n    return '''int i;\n              {typ} buf0[{le}], buf1[{le}], buf2[{le}];\n              vst1q_{suf}(buf0, {in0});\n              vst1q_{suf}(buf1, {in1});\n              vst1q_{suf}(buf2, {in2});\n              for (i = 0; i < {le}; i += nsimd_len_cpu_{typ}()) {{\n                nsimd_storeu_cpu_{typ}(&buf0[i], nsimd_{op}_cpu_{typ}(\n                  nsimd_loadu_cpu_{typ}(&buf0[i]),\n                  nsimd_loadu_cpu_{typ}(&buf1[i]),\n                  nsimd_loadu_cpu_{typ}(&buf2[i])));\n              }}\n              return vld1q_{suf}(buf0);'''.format(op=op, le=le, **fmtspec)\n\ndef emulate_f64_neon(simd_ext, op, params):\n    fmtspec2 = fmtspec.copy()\n    fmtspec2['op'] = op\n    fmtspec2['buf_ret_decl'] = 'nsimd_cpu_{}f64 buf_ret;'. \\\n                               format('v' if params[0] == 'v' else 'vl')\n    fmtspec2['buf_decl'] = '\\n'.join(['nsimd_cpu_{}f64 buf{};'. \\\n                           format('v' if p[1] == 'v' else 'vl', p[0]) \\\n                           for p in common.enum(params[1:])])\n    fmtspec2['bufs'] = ','.join(['buf{}'.format(i) \\\n                                 for i in range(0, len(params) - 1)])\n    fmtspec2['ret_decl'] = 'nsimd_{}_{}f64 ret;'. \\\n                           format(simd_ext, 'v' if params[0] == 'v' else 'vl')\n    buf_set = '\\n'.join('''buf{i}.v0 = {ini}.v0;\n                           buf{i}.v1 = {ini}.v1;'''. 
\\\n                           format(i=i, ini=fmtspec['in{}'.format(i)]) \\\n                           for i in range(0, len(params) - 1))\n    return '''{buf_ret_decl}\n              {buf_decl}\n              {ret_decl}\n              {buf_set}\n              buf_ret = nsimd_{op}_cpu_f64({bufs});\n              ret.v0 = buf_ret.v0;\n              ret.v1 = buf_ret.v1;\n              return ret;'''.format(buf_set=buf_set, **fmtspec2)\n\n# -----------------------------------------------------------------------------\n\ndef f16f64(simd_ext, typ, op, armop, arity, forced_intrinsics = ''):\n    fmtspec2 = fmtspec.copy()\n    tmpl = ', '.join(['{{in{}}}.v{{{{i}}}}'.format(i).format(**fmtspec) \\\n                      for i in range(0, arity)])\n    fmtspec2['args1'] = tmpl.format(i='0')\n    fmtspec2['args2'] = tmpl.format(i='1')\n    fmtspec2['armop'] = armop\n    fmtspec2['op'] = op\n    if simd_ext in neon and typ == 'f16':\n        if forced_intrinsics != '':\n            fmtspec2['intrinsics'] = forced_intrinsics\n        else:\n            temp = ', '.join(['{{in{}}}'.format(i).format(**fmtspec) \\\n                              for i in range(0, arity)])\n            fmtspec2['intrinsics'] = 'return v{}q_f16({});'.format(armop, temp)\n        return '''#ifdef NSIMD_ARM_FP16\n                    {intrinsics}\n                  #else\n                    nsimd_{simd_ext}_vf16 ret;\n                    ret.v0 = nsimd_{op}_{simd_ext}_f32({args1});\n                    ret.v1 = nsimd_{op}_{simd_ext}_f32({args2});\n                    return ret;\n                  #endif'''.format(**fmtspec2)\n    elif simd_ext == 'neon128' and typ == 'f64':\n        return emulate_f64_neon(simd_ext, op, ['v'] * (arity + 1))\n    return ''\n\n# -----------------------------------------------------------------------------\n# Lenghts\n\ndef max_len(simd_ext, typ):\n    if simd_ext == 'sve':\n        return 2048 // int(typ[1:])\n    elif simd_ext in fixed_sized_sve:\n        return 
int(simd_ext[3:]) // int(typ[1:])\n    else:\n        return 128 // int(typ[1:])\n\ndef real_len(simd_ext, typ):\n    if simd_ext == 'sve':\n        return 'nsimd_len_sve_{typ}()'.format(**fmtspec)\n    else:\n        return max_len(simd_ext, typ)\n\n# -----------------------------------------------------------------------------\n# Loads of degree 1, 2, 3 and 4\n\ndef load1234(opts, simd_ext, typ, deg):\n    if simd_ext in neon:\n        if deg == 1:\n            normal = 'return vld{deg}q_{suf}({in0});'. \\\n                     format(deg=deg, **fmtspec)\n            if typ == 'f16':\n                return \\\n                '''#ifdef NSIMD_ARM_FP16\n                     {normal}\n                   #else\n                     /* Note that we can do much better but is it useful? */\n                     nsimd_{simd_ext}_vf16 ret;\n                     f32 buf[4];\n                     buf[0] = nsimd_u16_to_f32(*(u16*){in0});\n                     buf[1] = nsimd_u16_to_f32(*((u16*){in0} + 1));\n                     buf[2] = nsimd_u16_to_f32(*((u16*){in0} + 2));\n                     buf[3] = nsimd_u16_to_f32(*((u16*){in0} + 3));\n                     ret.v0 = vld1q_f32(buf);\n                     buf[0] = nsimd_u16_to_f32(*((u16*){in0} + 4));\n                     buf[1] = nsimd_u16_to_f32(*((u16*){in0} + 5));\n                     buf[2] = nsimd_u16_to_f32(*((u16*){in0} + 6));\n                     buf[3] = nsimd_u16_to_f32(*((u16*){in0} + 7));\n                     ret.v1 = vld1q_f32(buf);\n                     return ret;\n                   #endif'''.format(normal=normal, **fmtspec)\n            elif typ == 'f64' and simd_ext == 'neon128':\n                return \\\n                '''nsimd_neon128_vf64 ret;\n                   ret.v0 = *{in0};\n                   ret.v1 = *({in0} + 1);\n                   return ret;'''.format(**fmtspec)\n            else:\n                return normal\n        else:\n            normal = \\\n            
'''nsimd_{simd_ext}_v{typ}x{deg} ret;\n               {soa_typ} buf = vld{deg}q_{suf}({in0});\n               {assignment}\n               return ret;'''. \\\n               format(deg=deg, soa_typ=get_native_soa_typ(simd_ext, typ, deg),\n                      assignment='\\n'.join(['ret.v{i} = buf.val[{i}];'. \\\n                      format(i=i) for i in range(0, deg)]), **fmtspec)\n            if typ == 'f16':\n                assignment = \\\n                '''vst1q_u16(buf, temp.val[{{i}}]);\n                   ret.v{{i}} = nsimd_loadu_{simd_ext}_f16((f16 *)buf);'''. \\\n                   format(**fmtspec)\n                return \\\n                '''{soa_typ} temp = vld{deg}q_u16((u16 *){in0});\n                   u16 buf[8];\n                   nsimd_{simd_ext}_vf16x{deg} ret;\n                   {assignment}\n                   return ret;'''. \\\n                   format(deg=deg, assignment='\\n'.join([assignment. \\\n                          format(i=i) for i in range(0, deg)]),\n                          soa_typ=get_native_soa_typ(simd_ext, 'u16', deg),\n                          **fmtspec)\n            elif typ in 'f64' and simd_ext == 'neon128':\n                return \\\n                'nsimd_neon128_vf64x{} ret;\\n'.format(deg) + \\\n                '\\n'.join(['ret.v{i}.v0 = *({in0} + {i});'. \\\n                           format(i=i, **fmtspec) for i in range(0, deg)]) + \\\n                '\\n'.join(['ret.v{i}.v1 = *({in0} + {ipd});'. 
\\\n                           format(i=i, ipd=i + deg, **fmtspec) \\\n                           for i in range(0, deg)]) + \\\n                '\\nreturn ret;\\n'\n            elif typ in ['i64', 'u64'] and simd_ext == 'neon128':\n                return \\\n                '''nsimd_neon128_v{typ}x{deg} ret;\n                   {typ} buf[2];'''.format(deg=deg, **fmtspec) + \\\n                '\\n'.join(['''buf[0] = *({in0} + {i});\n                              buf[1] = *({in0} + {ipd});\n                              ret.v{i} = vld1q_{suf}(buf);'''. \\\n                              format(i=i, ipd=i + deg, **fmtspec) \\\n                              for i in range(0, deg)]) + \\\n                '\\nreturn ret;\\n'\n            else:\n                return normal\n    else:\n        if deg == 1:\n            return 'return svld{deg}_{suf}({svtrue}, {in0});'. \\\n                   format(deg=deg, **fmtspec)\n        else:\n            return \\\n            '''nsimd_{simd_ext}_v{typ}x{deg} ret;\n               {sve_typ} buf = svld{deg}_{suf}({svtrue}, {in0});\n               {assignment}\n               return ret;'''.format(assignment=\\\n               '\\n'.join(['ret.v{i} = svget{deg}_{suf}(buf, {i});'. \\\n                          format(i=i, deg=deg, **fmtspec) \\\n                          for i in range(deg)]),\n                          sve_typ=get_native_soa_typ('sve', typ, deg),\n                          deg=deg, **fmtspec)\n\n# -----------------------------------------------------------------------------\n# Mask loads\n\ndef maskoz_load(oz, simd_ext, typ):\n    if simd_ext in sve:\n        return 'return svsel_{suf}({in0}, svld1_{suf}({in0}, {in1}), {oz});'. 
\\\n               format(oz='{in2}'.format(**fmtspec) if oz == 'o' \\\n                      else 'svdup_n_{suf}(({typ})0)'.format(**fmtspec),\n                      **fmtspec)\n    if typ == 'f64' and simd_ext == 'neon128':\n        return '''nsimd_neon128_vf64 ret;\n                  if ({in0}.v0) {{\n                    ret.v0 = {in1}[0];\n                  }} else {{\n                    ret.v0 = {oz0};\n                  }}\n                  if ({in0}.v1) {{\n                    ret.v1 = {in1}[1];\n                  }} else {{\n                    ret.v1 = {oz1};\n                  }}\n                  return ret;'''.format(\n                  oz0 = '0.0f' if oz == 'z' else '{in2}.v0'.format(**fmtspec),\n                  oz1 = '0.0f' if oz == 'z' else '{in2}.v1'.format(**fmtspec),\n                  **fmtspec)\n    le = 128 // int(typ[1:])\n    normal = '''int i;\n                {typ} buf[{le}];\n                u{typnbits} mask[{le}];\n                vst1q_{suf}(buf, {oz});\n                vst1q_u{typnbits}(mask, {in0});\n                for (i = 0; i < {le}; i++) {{\n                  if (mask[i]) {{\n                    buf[i] = {in1}[i];\n                  }}\n                }}\n                return vld1q_{suf}(buf);'''. 
\\\n                format(oz='vdupq_n_{suf}(({typ})0)'.format(**fmtspec) \\\n                          if oz == 'z' else '{in2}'.format(**fmtspec),\n                          le=le, **fmtspec)\n    if typ == 'f16':\n        return '''#ifdef NSIMD_ARM_FP16\n                    {normal}\n                  #else\n                    int i;\n                    nsimd_{simd_ext}_vf16 ret;\n                    f32 buf[8];\n                    u32 mask[8];\n                    vst1q_f32(buf, {oz0});\n                    vst1q_f32(buf + 4, {oz1});\n                    vst1q_u32(mask, {in0}.v0);\n                    vst1q_u32(mask + 4, {in0}.v1);\n                    for (i = 0; i < 8; i++) {{\n                      if (mask[i]) {{\n                        buf[i] = nsimd_f16_to_f32({in1}[i]);\n                      }}\n                    }}\n                    ret.v0 = vld1q_f32(buf);\n                    ret.v1 = vld1q_f32(buf + 4);\n                    return ret;\n                  #endif'''. \\\n                  format(oz0='vdupq_n_f32(0.0f)'.format(**fmtspec) \\\n                             if oz == 'z' else '{in2}.v0'.format(**fmtspec),\n                         oz1='vdupq_n_f32(0.0f)'.format(**fmtspec) \\\n                             if oz == 'z' else '{in2}.v1'.format(**fmtspec),\n                             normal=normal, **fmtspec)\n    return normal\n\n# -----------------------------------------------------------------------------\n# Stores of degree 1, 2, 3 and 4\n\ndef store1234(opts, simd_ext, typ, deg):\n    if simd_ext in neon:\n        if deg == 1:\n            normal = 'vst{deg}q_{suf}({in0}, {in1});'. 
\\\n                     format(deg=deg, **fmtspec)\n            if typ == 'f16':\n                return \\\n                '''#ifdef NSIMD_ARM_FP16\n                     {normal}\n                   #else\n                     f32 buf[4];\n                     vst1q_f32(buf, {in1}.v0);\n                     *((u16*){in0}    ) = nsimd_f32_to_u16(buf[0]);\n                     *((u16*){in0} + 1) = nsimd_f32_to_u16(buf[1]);\n                     *((u16*){in0} + 2) = nsimd_f32_to_u16(buf[2]);\n                     *((u16*){in0} + 3) = nsimd_f32_to_u16(buf[3]);\n                     vst1q_f32(buf, {in1}.v1);\n                     *((u16*){in0} + 4) = nsimd_f32_to_u16(buf[0]);\n                     *((u16*){in0} + 5) = nsimd_f32_to_u16(buf[1]);\n                     *((u16*){in0} + 6) = nsimd_f32_to_u16(buf[2]);\n                     *((u16*){in0} + 7) = nsimd_f32_to_u16(buf[3]);\n                   #endif'''.format(normal=normal, **fmtspec)\n            elif typ == 'f64' and simd_ext == 'neon128':\n                return \\\n                '''*{in0} = {in1}.v0;\n                   *({in0} + 1) = {in1}.v1;'''.format(**fmtspec)\n            else:\n                return normal\n        else:\n            normal = \\\n            '''{soa_typ} buf;\n               {assignment}\n               vst{deg}q_{suf}({in0}, buf);'''. \\\n               format(deg=deg, assignment='\\n'.join([\n                      'buf.val[{{}}] = {{in{}}};'.format(i). 
\\\n                      format(i - 1, **fmtspec) for i in range(1, deg + 1)]),\n                      soa_typ=get_native_soa_typ(simd_ext, typ, deg),\n                      **fmtspec)\n            if typ == 'f16':\n                assignment = \\\n                '''nsimd_storeu_{{simd_ext}}_f16((f16 *)buf, {{in{}}});\n                   temp.val[{{}}] = vld1q_u16(buf);'''\n                return \\\n                '''#ifdef NSIMD_ARM_FP16\n                     {normal}\n                   #else\n                     {soa_typ} temp;\n                     u16 buf[8];\n                     {assignment}\n                     vst{deg}q_u16((u16 *){in0}, temp);\n                   #endif'''. \\\n                   format(assignment='\\n'.join([assignment.format(i). \\\n                          format(i - 1, **fmtspec) \\\n                          for i in range(1, deg + 1)]),\n                          deg=deg, normal=normal,\n                          soa_typ=get_native_soa_typ(simd_ext, 'u16', deg),\n                          **fmtspec)\n            elif typ == 'f64' and simd_ext == 'neon128':\n                return \\\n                '\\n'.join(['*({{in0}} + {}) = {{in{}}}.v0;'. \\\n                           format(i - 1, i).format(**fmtspec) \\\n                           for i in range(1, deg + 1)]) + '\\n' + \\\n                '\\n'.join(['*({{in0}} + {}) = {{in{}}}.v1;'. \\\n                           format(i + deg - 1, i).format(**fmtspec) \\\n                           for i in range(1, deg + 1)])\n            elif typ in ['i64', 'u64'] and simd_ext == 'neon128':\n                return \\\n                '{typ} buf[{biglen}];'.format(biglen=2 * deg, **fmtspec) + \\\n                '\\n'.join(['vst1q_{{suf}}(buf + {im1x2}, {{in{i}}});'. 
\\\n                           format(im1x2=2 * (i - 1), i=i).format(**fmtspec) \\\n                           for i in range(1, deg + 1)]) + \\\n                '\\n'.join(['''*({in0} + {i}) = buf[{ix2}];\n                              *({in0} + {ipd}) = buf[{ix2p1}];'''. \\\n                              format(i=i, ipd=i + deg, ix2=i * 2,\n                                     ix2p1=2 * i + 1, **fmtspec) \\\n                              for i in range(0, deg)])\n            else:\n                return normal\n    else:\n        if deg == 1:\n            return 'svst{deg}_{suf}({svtrue}, {in0}, {in1});'. \\\n                   format(deg=deg, **fmtspec)\n        fill_soa_typ = \\\n            '\\n'.join(['tmp = svset{{deg}}_{{suf}}(tmp, {im1}, {{in{i}}});'. \\\n            format(im1=i - 1, i=i).format(deg=deg, **fmtspec) \\\n            for i in range(1, deg + 1)])\n        return \\\n        '''{soa_typ} tmp = svundef{deg}_{suf}();\n           {fill_soa_typ}\n           svst{deg}_{suf}({svtrue}, {in0}, tmp);'''. 
\\\n           format(soa_typ=get_native_soa_typ('sve', typ, deg), deg=deg,\n                  fill_soa_typ=fill_soa_typ, **fmtspec)\n\n# -----------------------------------------------------------------------------\n# Mask stores\n\ndef mask_store(simd_ext, typ):\n    if simd_ext in sve:\n        return 'svst1_{suf}({in0}, {in1}, {in2});'.format(**fmtspec)\n    if typ == 'f64' and simd_ext == 'neon128':\n        return '''if ({in0}.v0) {{\n                    {in1}[0] = {in2}.v0;\n                  }}\n                  if ({in0}.v1) {{\n                    {in1}[1] = {in2}.v1;\n                  }}'''.format(**fmtspec)\n    le = 128 // int(typ[1:])\n    normal = '''int i;\n                {typ} buf[{le}];\n                u{typnbits} mask[{le}];\n                vst1q_{suf}(buf, {in2});\n                vst1q_u{typnbits}(mask, {in0});\n                for (i = 0; i < {le}; i++) {{\n                  if (mask[i]) {{\n                    {in1}[i] = buf[i];\n                  }}\n                }}'''.format(le=le, **fmtspec)\n    if typ == 'f16':\n        return \\\n        '''#ifdef NSIMD_ARM_FP16\n             {normal}\n           #else\n             f32 buf[8];\n             u32 mask[8];\n             int i;\n             vst1q_u32(mask, {in0}.v0);\n             vst1q_u32(mask + 4, {in0}.v1);\n             vst1q_f32(buf, {in2}.v0);\n             vst1q_f32(buf + 4, {in2}.v1);\n             for (i = 0; i < 8; i++) {{\n               if (mask[i]) {{\n                 {in1}[i] = nsimd_f32_to_f16(buf[i]);\n               }}\n             }}\n           #endif'''.format(normal=normal, **fmtspec)\n    return normal\n\n# -----------------------------------------------------------------------------\n# Length\n\ndef len1(simd_ext, typ):\n    if simd_ext in neon:\n        return 'return {};'.format(128 // int(typ[1:]))\n    elif simd_ext == 'sve':\n        return 'return (int)svcntp_b{typnbits}({svtrue}, {svtrue});'. 
\\\n               format(**fmtspec)\n    elif simd_ext in fixed_sized_sve:\n        return 'return {};'.format(int(simd_ext[3:]) // int(typ[1:]))\n\n# -----------------------------------------------------------------------------\n# Add/sub\n\ndef addsub(op, simd_ext, typ):\n    ret = f16f64(simd_ext, typ, op, op, 2)\n    if ret != '':\n        return ret\n    if simd_ext in neon:\n        return 'return v{op}q_{suf}({in0}, {in1});'. \\\n               format(op=op, **fmtspec)\n    else:\n        return 'return sv{op}_{suf}_x({svtrue}, {in0}, {in1});'. \\\n               format(op=op, **fmtspec)\n\n# -----------------------------------------------------------------------------\n# Multiplication\n\ndef mul2(simd_ext, typ):\n    ret = f16f64(simd_ext, typ, 'mul', 'mul', 2)\n    if ret != '':\n        return ret\n    elif simd_ext in neon and typ in ['i64', 'u64']:\n        return emulate_op2('*', simd_ext, typ)\n    else:\n        if simd_ext in neon:\n            return 'return vmulq_{suf}({in0}, {in1});'.format(**fmtspec)\n        else:\n            return 'return svmul_{suf}_x({svtrue}, {in0}, {in1});'. \\\n                   format(**fmtspec)\n\n# -----------------------------------------------------------------------------\n# Division\n\ndef div2(simd_ext, typ):\n    if simd_ext == 'aarch64' and typ in ['f32', 'f64']:\n        return 'return vdivq_{suf}({in0}, {in1});'.format(**fmtspec)\n    elif simd_ext in sve and \\\n         typ in ['f16', 'f32', 'f64', 'i32', 'u32', 'i64', 'u64']:\n        return 'return svdiv_{suf}_x({svtrue}, {in0}, {in1});'. 
\\\n               format(**fmtspec)\n    else:\n        ret = f16f64(simd_ext, typ, 'div', 'div', 2)\n        if ret != '':\n            return ret\n    return emulate_op2('/', simd_ext, typ)\n\n# -----------------------------------------------------------------------------\n# Binary operators: and, or, xor, andnot\n\ndef binop2(op, simd_ext, typ):\n    armop = {'orb': 'orr', 'xorb': 'eor', 'andb': 'and', 'andnotb': 'bic'}\n    if typ in common.iutypes:\n        if simd_ext in neon:\n            return 'return v{armop}q_{suf}({in0}, {in1});'. \\\n                   format(armop=armop[op], **fmtspec)\n        else:\n            return 'return sv{armop}_{suf}_x({svtrue}, {in0}, {in1});'. \\\n                   format(armop=armop[op], **fmtspec)\n    # From here only float types\n    if typ == 'f16':\n        intrinsics = \\\n        '''return vreinterpretq_f16_u16(v{armop}q_u16(vreinterpretq_u16_f16(\n                    {in0}), vreinterpretq_u16_f16({in1})));'''. \\\n                    format(armop=armop[op], **fmtspec)\n    else:\n        intrinsics = ''\n    ret = f16f64(simd_ext, typ, op, armop[op], 2, intrinsics)\n    if ret != '':\n        return ret\n    if simd_ext in neon:\n        return \\\n        '''return vreinterpretq_f{typnbits}_u{typnbits}(v{armop}q_u{typnbits}(\n                    vreinterpretq_u{typnbits}_f{typnbits}({in0}),\n                      vreinterpretq_u{typnbits}_f{typnbits}({in1})));'''. \\\n                      format(armop=armop[op], **fmtspec)\n    else:\n        return \\\n        '''return svreinterpret_f{typnbits}_u{typnbits}(\n                    sv{armop}_u{typnbits}_x({svtrue},\n                      svreinterpret_u{typnbits}_f{typnbits}({in0}),\n                      svreinterpret_u{typnbits}_f{typnbits}({in1})));'''. 
\\\n                      format(armop=armop[op], **fmtspec)\n\n# -----------------------------------------------------------------------------\n# Binary not\n\ndef not1(simd_ext, typ):\n    if typ in common.iutypes:\n        if simd_ext in neon:\n            if typ in ['i8', 'u8', 'i16', 'u16', 'i32', 'u32']:\n                return 'return vmvnq_{suf}({in0});'.format(**fmtspec)\n            else:\n                return \\\n                '''return vreinterpretq_{suf}_u32(vmvnq_u32(\n                            vreinterpretq_u32_{suf}({in0})));'''. \\\n                            format(**fmtspec)\n        if simd_ext in sve:\n            return 'return svnot_{suf}_x({svtrue}, {in0});'.format(**fmtspec)\n    # From here only float types\n    if typ == 'f16':\n        intrinsics = \\\n        '''return vreinterpretq_f16_u16(vmvnq_u16(vreinterpretq_u16_f16(\n                    {in0})));'''.format(**fmtspec)\n    else:\n        intrinsics = ''\n    ret = f16f64(simd_ext, typ, 'notb', 'mvn', 1, intrinsics)\n    if ret != '':\n        return ret\n    if simd_ext in neon:\n        return \\\n        '''return vreinterpretq_{suf}_u32(vmvnq_u32(\n                    vreinterpretq_u32_{suf}({in0})));'''. \\\n                    format(**fmtspec)\n    else:\n        return \\\n        '''return svreinterpret_{suf}_u{typnbits}(svnot_u{typnbits}_x(\n                    {svtrue}, svreinterpret_u{typnbits}_{suf}({in0})));'''. 
\\\n                    format(**fmtspec)\n\n# -----------------------------------------------------------------------------\n# Logical operators: and, or, xor, andnot\n\ndef lop2(opts, op, simd_ext, typ):\n    armop = {'orl': 'orr', 'xorl': 'eor', 'andl': 'and', 'andnotl': 'bic'}\n    if simd_ext in neon:\n        if typ == 'f16':\n            return \\\n            '''#ifdef NSIMD_ARM_FP16\n                 return v{armop}q_u16({in0}, {in1});\n               #else\n                 nsimd_{simd_ext}_vlf16 ret;\n                 ret.v0 = v{armop}q_u32({in0}.v0, {in1}.v0);\n                 ret.v1 = v{armop}q_u32({in0}.v1, {in1}.v1);\n                 return ret;\n               #endif'''.format(armop=armop[op], **fmtspec)\n        elif simd_ext == 'neon128' and typ == 'f64':\n            if op == 'andnotl':\n                return '''nsimd_{simd_ext}_vlf64 ret;\n                          ret.v0 = {in0}.v0 & (~{in1}.v0);\n                          ret.v1 = {in0}.v1 & (~{in1}.v1);\n                          return ret;'''.format(**fmtspec)\n            else:\n                cpuop = {'orl': '|', 'xorl': '^', 'andl': '&'}\n                return '''nsimd_{simd_ext}_vlf64 ret;\n                          ret.v0 = {in0}.v0 {cpuop} {in1}.v0;\n                          ret.v1 = {in0}.v1 {cpuop} {in1}.v1;\n                          return ret;'''.format(cpuop=cpuop[op], **fmtspec)\n        else:\n            return 'return v{armop}q_u{typnbits}({in0}, {in1});'. \\\n                   format(armop=armop[op], **fmtspec)\n    else:\n        if opts.sve_emulate_bool:\n            # TODO: the casts are a workaround to avoid a bug in gcc trunk for sve\n            # it needs to be deleted when the bug is corrected\n            return \\\n            '''return sv{armop}_x({svtrue},\n                                  (svuint{typnbits}_t){in0},\n                                  (svuint{typnbits}_t){in1});'''. 
\\\n            format(armop=armop[op], **fmtspec)\n        else:\n            return '''return sv{armop}_z({svtrue}, {in0}, {in1});'''. \\\n            format(armop=armop[op], **fmtspec)\n\n# -----------------------------------------------------------------------------\n# Logical not\n\ndef lnot1(opts, simd_ext, typ):\n    if simd_ext in neon:\n        if typ == 'f16':\n            return \\\n            '''#ifdef NSIMD_ARM_FP16\n                 return vmvnq_u16({in0});\n               #else\n                 nsimd_{simd_ext}_vlf16 ret;\n                 ret.v0 = vmvnq_u32({in0}.v0);\n                 ret.v1 = vmvnq_u32({in0}.v1);\n                 return ret;\n               #endif'''.format(**fmtspec)\n        elif simd_ext == 'neon128' and typ == 'f64':\n            return '''nsimd_neon128_vlf64 ret;\n                      ret.v0 = ~{in0}.v0;\n                      ret.v1 = ~{in0}.v1;\n                      return ret;'''.format(**fmtspec)\n        elif typ in ['i64', 'u64', 'f64']:\n            return '''return vreinterpretq_u{typnbits}_u32(vmvnq_u32(\n                               vreinterpretq_u32_u{typnbits}({in0})));'''. 
\\\n                               format(**fmtspec)\n        else:\n            return 'return vmvnq_u{typnbits}({in0});'.format(**fmtspec)\n    elif simd_ext in sve:\n        if opts.sve_emulate_bool:\n            # TODO: the cast is a workaround to avoid a bug in gcc trunk for sve\n            # it needs to be deleted when the bug is corrected\n            return 'return svnot_x({svtrue}, (svuint{typnbits}_t){in0});'.format(**fmtspec)\n        else:\n            return 'return svnot_z({svtrue}, {in0});'.format(**fmtspec)\n\n# -----------------------------------------------------------------------------\n# Square root\n\ndef sqrt1(simd_ext, typ):\n    if simd_ext == 'neon128':\n        if typ in 'f16':\n            return '''nsimd_neon128_vf16 ret;\n                      ret.v0 = nsimd_sqrt_neon128_f32({in0}.v0);\n                      ret.v1 = nsimd_sqrt_neon128_f32({in0}.v1);\n                      return ret;'''.format(**fmtspec)\n        elif typ == 'f64':\n            return f16f64('neon128', 'f64', 'sqrt', 'sqrt', 1)\n        else:\n            return emulate_op1('sqrt', simd_ext, typ)\n    elif simd_ext == 'aarch64':\n        if typ == 'f16':\n            return f16f64('aarch64', 'f16', 'sqrt', 'sqrt', 1)\n        else:\n            return 'return vsqrtq_{suf}({in0});'.format(**fmtspec)\n    else:\n        return 'return svsqrt_{suf}_x({svtrue}, {in0});'.format(**fmtspec)\n\n# -----------------------------------------------------------------------------\n# Shifts\n\ndef shl_shr(op, simd_ext, typ):\n    if simd_ext in neon:\n        sign = '-' if op == 'shr' else ''\n        if typ in common.utypes:\n            return '''return vshlq_{suf}({in0}, vdupq_n_s{typnbits}(\n                                 (i{typnbits})({sign}{in1})));'''. 
\\\n                                 format(sign=sign, **fmtspec)\n        else:\n            return \\\n            '''return vreinterpretq_s{typnbits}_u{typnbits}(vshlq_u{typnbits}(\n                        vreinterpretq_u{typnbits}_s{typnbits}({in0}),\n                          vdupq_n_s{typnbits}((i{typnbits})({sign}{in1}))));'''. \\\n                          format(sign=sign, **fmtspec)\n    else:\n       armop = 'lsl' if op == 'shl' else 'lsr'\n       if op == 'shr' and typ in common.itypes:\n           return \\\n           '''return svreinterpret_{suf}_{suf2}(sv{armop}_{suf2}_x({svtrue},\n                       svreinterpret_{suf2}_{suf}({in0}),\n                       svdup_n_u{typnbits}((u{typnbits}){in1})));'''. \\\n                       format(suf2=common.bitfield_type[typ], armop=armop,\n                              **fmtspec)\n       else:\n           return '''return sv{armop}_{suf}_x({svtrue}, {in0},\n                              svdup_n_u{typnbits}((u{typnbits}){in1}));'''. \\\n                              format(armop=armop, **fmtspec)\n\ndef shra(simd_ext, typ):\n    if typ in common.utypes:\n        return '''return nsimd_shr_{simd_ext}_{typ}({in0}, {in1});'''. 
\\\n                format(**fmtspec)\n\n    if simd_ext in neon:\n        return  '''return vshlq_{suf}(\n        {in0}, vdupq_n_s{typnbits}((i{typnbits})-{in1}));'''.\\\n            format(**fmtspec)\n    elif simd_ext in sve:\n        if typ[0] == 'i':\n            return '''return svasr_n_{suf}_x({svtrue}, {in0},\n                (u{typnbits}){in1});'''.\\\n                format(**fmtspec)\n        elif typ[0] == 'u':\n            return 'return svlsl_n_{suf}_x({svtrue}, {in0}, (u64){in1});'.\\\n                format(**fmtspec)\n\n# -----------------------------------------------------------------------------\n# Set1\n\ndef set1(simd_ext, typ):\n    if simd_ext in neon:\n        if typ == 'f16':\n            return '''#ifdef NSIMD_ARM_FP16\n                        return vdupq_n_f16({in0});\n                      #else\n                        nsimd_{simd_ext}_vf16 ret;\n                        f32 f = nsimd_f16_to_f32({in0});\n                        ret.v0 = nsimd_set1_{simd_ext}_f32(f);\n                        ret.v1 = nsimd_set1_{simd_ext}_f32(f);\n                        return ret;\n                      #endif'''.format(**fmtspec)\n        elif simd_ext == 'neon128' and typ == 'f64':\n            return '''nsimd_neon128_vf64 ret;\n                      ret.v0 = {in0};\n                      ret.v1 = {in0};\n                      return ret;'''.format(**fmtspec)\n        else:\n            return 'return vdupq_n_{suf}({in0});'.format(**fmtspec)\n    else:\n        return 'return svdup_n_{suf}({in0});'.format(**fmtspec)\n\n# -----------------------------------------------------------------------------\n# Set1l\n\ndef lset1(simd_ext, typ):\n    if simd_ext in sve:\n        return '''if ({in0}) {{\n                    return svptrue_b{typnbits}();\n                  }} else {{\n                    return svpfalse_b();\n                  }}'''.format(**fmtspec)\n    # getting here means no NEON and AARCH64 only\n    mask = 
'vdupq_n_u{typnbits}((u{typnbits}){{}})'.format(**fmtspec)\n    normal = '''if ({in0}) {{\n                  return {ones};\n                }} else {{\n                  return {zeros};\n                }}'''.format(ones=mask.format('-1'), zeros=mask.format('0'),\n                             **fmtspec)\n    if typ == 'f16':\n        return '''#ifdef NSIMD_ARM_FP16\n                    {normal}\n                  #else\n                    nsimd_{simd_ext}_vlf16 ret;\n                    ret.v0 = nsimd_set1l_{simd_ext}_f32({in0});\n                    ret.v1 = ret.v0;\n                    return ret;\n                  #endif'''.format(normal=normal, **fmtspec)\n    if typ == 'f64' and simd_ext == 'neon128':\n        return '''nsimd_neon128_vlf64 ret;\n                  ret.v0 = (u64)({in0} ? -1 : 0);\n                  ret.v1 = ret.v0;\n                  return ret;'''.format(**fmtspec)\n    return normal\n\n# -----------------------------------------------------------------------------\n# Comparison operators: ==, <, <=, >, >=\n\ndef cmp2(opts, op, simd_ext, typ):\n    binop = {'eq': '==', 'lt': '<', 'le': '<=', 'gt': '>', 'ge': '>='}\n    armop = {'eq': 'eq', 'lt': 'lt', 'le': 'le', 'gt': 'gt', 'ge': 'ge'}\n    if simd_ext in neon:\n        emul_f16 = '''nsimd_{simd_ext}_vlf16 ret;\n                      ret.v0 = nsimd_{op}_{simd_ext}_f32({in0}.v0, {in1}.v0);\n                      ret.v1 = nsimd_{op}_{simd_ext}_f32({in0}.v1, {in1}.v1);\n                      return ret;'''.format(op=op, **fmtspec)\n        normal = 'return vc{armop}q_{suf}({in0}, {in1});'. 
\\\n                 format(armop=armop[op], **fmtspec)\n        if typ == 'f16':\n            if simd_ext == 'neon128':\n                return emul_f16\n            else:\n                return \\\n                '''#ifdef NSIMD_ARM_FP16\n                     {}\n                   #else\n                     {}\n                   #endif'''.format(normal, emul_f16)\n        if simd_ext == 'neon128' and typ == 'f64':\n            return '''nsimd_{simd_ext}_vl{typ} ret;\n                      ret.v0 = {in0}.v0 {op} {in1}.v0 ? (u64)-1 : 0;\n                      ret.v1 = {in0}.v1 {op} {in1}.v1 ? (u64)-1 : 0;\n                      return ret;'''.format(op=binop[op], **fmtspec)\n        elif simd_ext == 'neon128' and typ in ['i64', 'u64']:\n            return '''{typ} buf0[2], buf1[2];\n                      u64 ret[2];\n                      vst1q_{suf}(buf0, {in0});\n                      vst1q_{suf}(buf1, {in1});\n                      ret[0] = buf0[0] {op} buf1[0] ? (u64)-1 : 0;\n                      ret[1] = buf0[1] {op} buf1[1] ? (u64)-1 : 0;\n                      return vld1q_u64(ret);'''. \\\n                      format(op=binop[op], **fmtspec)\n        else:\n            return normal\n    elif simd_ext in sve:\n        if opts.sve_emulate_bool:\n            # TODO: the casts are a workaround to avoid a bug in gcc trunk for sve\n            # it needs to be deleted when the bug is corrected\n            comp = 'svcmp{op}_{suf}({svtrue}, ({svetyp}){in0}, ({svetyp}){in1})'. \\\n                format(op=armop[op], **fmtspec)\n            return 'return {};'.format(convert_from_predicate(opts, comp))\n        else:\n            return 'return svcmp{op}_{suf}({svtrue}, {in0}, {in1});'. 
\\\n                    format(op=armop[op], **fmtspec)\n\n# -----------------------------------------------------------------------------\n# Not equal\n\ndef neq2(opts, simd_ext, typ):\n    if simd_ext in neon:\n        return '''return nsimd_notl_{simd_ext}_{typ}(\n                      nsimd_eq_{simd_ext}_{typ}({in0}, {in1}));'''. \\\n                      format(**fmtspec)\n    elif simd_ext in sve:\n        comp='svcmpne_{suf}({svtrue}, {in0}, {in1})'. \\\n                format(**fmtspec)\n        return 'return {};'.format(convert_from_predicate(opts, comp))\n\n\n# -----------------------------------------------------------------------------\n# If_else\n\ndef if_else3(opts, simd_ext, typ):\n    if simd_ext in neon:\n        intrinsic = 'return vbslq_{suf}({in0}, {in1}, {in2});'. \\\n                    format(**fmtspec)\n        if typ == 'f16':\n            return \\\n            '''#ifdef NSIMD_ARM_FP16\n                 {intrinsic}\n               #else\n                 nsimd_{simd_ext}_vf16 ret;\n                 ret.v0 = nsimd_if_else1_{simd_ext}_f32(\n                            {in0}.v0, {in1}.v0, {in2}.v0);\n                 ret.v1 = nsimd_if_else1_{simd_ext}_f32(\n                            {in0}.v1, {in1}.v1, {in2}.v1);\n                 return ret;\n               #endif'''.format(intrinsic=intrinsic, **fmtspec)\n        elif simd_ext == 'neon128' and typ == 'f64':\n            return '''nsimd_neon128_vf64 ret;\n                      ret.v0 = {in0}.v0 != 0u ? {in1}.v0 : {in2}.v0;\n                      ret.v1 = {in0}.v1 != 0u ? 
{in1}.v1 : {in2}.v1;\n                      return ret;'''.format(**fmtspec)\n        else:\n            return intrinsic\n    elif simd_ext in sve:\n        if opts.sve_emulate_bool:\n            # TODO: the casts are a workaround to avoid a bug in gcc trunk for sve\n            # it needs to be deleted when the bug is corrected\n            return 'return svsel_{suf}({cond}, ({svetyp}){in1}, ({svetyp}){in2});' \\\n                    .format(cond=convert_to_predicate(opts,\n                                '{in0}'.format(**fmtspec)),\n                            **fmtspec)\n        else:\n            return 'return svsel_{suf}({in0}, {in1}, {in2});' \\\n                    .format(**fmtspec)\n\n# -----------------------------------------------------------------------------\n# Minimum and maximum\n\ndef minmax2(op, simd_ext, typ):\n    ret = f16f64(simd_ext, typ, op, op, 2)\n    if ret != '':\n        return ret\n    if simd_ext in neon:\n        if typ in ['i64', 'u64']:\n            binop = '<' if op == 'min' else '>'\n            return '''{typ} buf0[2], buf1[2];\n                      vst1q_{suf}(buf0, {in0});\n                      vst1q_{suf}(buf1, {in1});\n                      buf0[0] = buf0[0] {binop} buf1[0] ? buf0[0] : buf1[0];\n                      buf0[1] = buf0[1] {binop} buf1[1] ? buf0[1] : buf1[1];\n                      return vld1q_{suf}(buf0);'''. \\\n                      format(binop=binop, **fmtspec)\n        else:\n            return 'return v{op}q_{suf}({in0}, {in1});'. \\\n                   format(op=op, **fmtspec)\n    else:\n        return 'return sv{op}_{suf}_x({svtrue}, {in0}, {in1});'. 
\\\n               format(op=op, **fmtspec)\n\n# -----------------------------------------------------------------------------\n# Abs\n\ndef abs1(simd_ext, typ):\n    if typ in common.utypes:\n        return 'return {in0};'.format(**fmtspec)\n    elif simd_ext in neon:\n        if typ == 'f16':\n            return f16f64(simd_ext, 'f16', 'abs', 'abs', 1)\n        elif (typ in ['i8', 'i16', 'i32', 'f32']) or \\\n             (simd_ext == 'aarch64' and typ in ['i64', 'f64']):\n            return 'return vabsq_{suf}({in0});'.format(**fmtspec)\n        elif typ == 'i64':\n            return emulate_op1('abs', 'neon128', 'i64')\n        else:\n            return f16f64(simd_ext, 'f64', 'abs', 'abs', 1)\n    else:\n        return 'return svabs_{suf}_x({svtrue}, {in0});'. \\\n               format(**fmtspec)\n\n# -----------------------------------------------------------------------------\n# Round, trunc, ceil and round_to_even\n\ndef round1(op, simd_ext, typ):\n    if typ in common.iutypes:\n        return 'return {in0};'.format(**fmtspec)\n    armop = {'floor': 'rndm', 'ceil': 'rndp', 'trunc': 'rnd',\n             'round_to_even': 'rndn'}\n    if simd_ext == 'neon128':\n        ret = f16f64('neon128', typ, op, 'v{armop}q_{suf}'. \\\n                     format(armop=armop, **fmtspec), 1)\n        if ret != '':\n            return ret\n        return emulate_op1(op, 'neon128', typ);\n    elif simd_ext == 'aarch64':\n        if typ == 'f16':\n            return f16f64('aarch64', 'f16', op, armop[op], 1)\n        else:\n            return 'return v{armop}q_{suf}({in0});'. \\\n                   format(armop=armop[op], **fmtspec)\n    else:\n        armop = {'floor': 'rintm', 'ceil': 'rintp', 'trunc': 'rintz',\n                 'round_to_even': 'rintn'}\n        return 'return sv{armop}_{suf}_x({svtrue}, {in0});'. 
\\\n               format(armop=armop[op], **fmtspec)\n\n# -----------------------------------------------------------------------------\n# FMA and FNMA\n\ndef fmafnma3(op, simd_ext, typ):\n    if typ in common.ftypes and simd_ext == 'aarch64':\n        armop = {'fma': 'fma', 'fnma': 'fms'}\n    else:\n        armop = {'fma': 'mla', 'fnma': 'mls'}\n    if simd_ext in neon:\n        normal = 'return v{armop}q_{suf}({in2}, {in1}, {in0});'. \\\n                 format(armop=armop[op], **fmtspec)\n        emul = emulate_op3_neon(op, simd_ext, typ)\n        if typ == 'f16':\n            using_f32 = \\\n            '''nsimd_{simd_ext}_vf16 ret;\n               ret.v0 = nsimd_{op}_{simd_ext}_f32({in0}.v0, {in1}.v0, {in2}.v0);\n               ret.v1 = nsimd_{op}_{simd_ext}_f32({in0}.v1, {in1}.v1, {in2}.v1);\n               return ret;'''.format(op=op, **fmtspec)\n            if simd_ext == 'aarch64':\n                return \\\n                '''#ifdef NSIMD_ARM_FP16\n                     {}\n                   #else\n                     {}\n                   #endif'''.format(emul, using_f32)\n            else:\n                return using_f32\n        elif simd_ext == 'neon128' and typ == 'f64':\n            return emulate_f64_neon('neon128', op, ['v'] * 4)\n        elif simd_ext == 'aarch64' and typ == 'f64':\n            return normal\n        elif typ in ['i64', 'u64']:\n            return emul\n        else:\n            return normal\n    else:\n        return 'return sv{armop}_{suf}_x({svtrue}, {in2}, {in1}, {in0});'. \\\n               format(armop=armop[op], **fmtspec)\n\n# -----------------------------------------------------------------------------\n# FMS and FNMS\n\ndef fmsfnms3(op, simd_ext, typ):\n    if typ in common.iutypes:\n        return \\\n        '''return nsimd_neg_{simd_ext}_{typ}(nsimd_{op2}_{simd_ext}_{typ}(\n                      {in0}, {in1}, {in2}));'''. 
\\\n                      format(op2='fma' if op == 'fnms' else 'fnma', **fmtspec)\n    if simd_ext in neon:\n        return \\\n        '''return nsimd_{op2}_{simd_ext}_{typ}({in0}, {in1},\n                      nsimd_neg_{simd_ext}_{typ}({in2}));'''. \\\n                      format(op2='fma' if op == 'fms' else 'fnma', **fmtspec)\n    else:\n        armop = {'fnms': 'nmla', 'fms': 'nmls'}\n        return 'return sv{armop}_{suf}_x({svtrue}, {in2}, {in1}, {in0});'. \\\n               format(armop=armop[op], **fmtspec)\n\n# -----------------------------------------------------------------------------\n# Neg\n\ndef neg1(simd_ext, typ):\n    if simd_ext in neon:\n        normal = 'return vnegq_{suf}({in0});'.format(**fmtspec)\n        if typ == 'f16':\n            return f16f64(simd_ext, 'f16', 'neg', 'neg', 1)\n        elif typ in ['i8', 'i16', 'i32', 'f32']:\n            return normal\n        elif typ in ['u8', 'u16', 'u32']:\n            return \\\n            '''return vreinterpretq_{suf}_s{typnbits}(\n                        vnegq_s{typnbits}(\n                          vreinterpretq_s{typnbits}_{suf}({in0})));'''. \\\n                          format(**fmtspec)\n        elif simd_ext == 'neon128' and typ in ['i64', 'u64']:\n            return emulate_op1('neg', simd_ext, typ)\n        elif simd_ext == 'neon128' and typ == 'f64':\n            return \\\n            '''nsimd_neon128_vf64 ret;\n               ret.v0 = -{in0}.v0;\n               ret.v1 = -{in0}.v1;\n               return ret;'''.format(**fmtspec)\n        elif simd_ext == 'aarch64' and typ in ['f64', 'i64']:\n            return normal\n        elif simd_ext == 'aarch64' and typ == 'u64':\n            return \\\n            '''return vreinterpretq_u64_s64(vnegq_s64(\n                          vreinterpretq_s64_u64({in0})));'''. 
\\\n                          format(**fmtspec)\n    else:\n        if typ in common.utypes:\n            return \\\n            '''return svreinterpret_{suf}_s{typnbits}(\n                        svneg_s{typnbits}_x({svtrue},\n                          svreinterpret_s{typnbits}_{suf}({in0})));'''. \\\n                          format(**fmtspec)\n        else:\n            return 'return svneg_{suf}_x({svtrue}, {in0});'.format(**fmtspec)\n\n# -----------------------------------------------------------------------------\n# Reciprocals\n\ndef recs1(op, simd_ext, typ):\n    cte = '({typ})1'.format(**fmtspec) if typ != 'f16' \\\n          else 'nsimd_f32_to_f16(1.0f)'\n    if op in ['rec', 'rec11']:\n        return \\\n        '''return nsimd_div_{simd_ext}_{typ}(\n                      nsimd_set1_{simd_ext}_{typ}({cte}), {in0});'''. \\\n                      format(cte=cte, **fmtspec)\n    elif op == 'rsqrt11':\n        return \\\n        '''return nsimd_div_{simd_ext}_{typ}(\n                      nsimd_set1_{simd_ext}_{typ}({cte}),\n                      nsimd_sqrt_{simd_ext}_{typ}({in0}));'''. \\\n                      format(cte=cte, **fmtspec)\n    elif op in ['rec8', 'rsqrt8']:\n        armop = 'recpe' if op == 'rec8' else 'rsqrte'\n        if simd_ext in sve:\n            return 'return sv{armop}_{suf}({in0});'. \\\n            format(armop=armop, **fmtspec)\n        else:\n            ret = f16f64(simd_ext, typ, op, armop, 1)\n            if ret != '':\n                return ret\n            return 'return v{armop}q_{suf}({in0});'. 
\\\n            format(armop=armop, **fmtspec)\n\n# Rec11 and rsqrt11\n# According to http://infocenter.arm.com/help/topic/com.arm.doc.faqs/ka14282.html\n# reciprocal estimates only work when inputs is restrained in some small\n# interval so we comment these for now and return full-precision reciprocals.\n\n# def rec11rsqrt11(op, simd_ext, typ):\n#    armop = {'rec11': 'recpe', 'rsqrt11': 'rsqrte'}\n#    if simd_ext in neon:\n#        ret = f16f64(simd_ext, typ, op, armop[op], 1)\n#        if ret != '':\n#            return ret\n#        return 'return v{armop}q_{suf}({in0});'. \\\n#               format(armop=armop[op], **fmtspec)\n#    else:\n#        return 'return sv{armop}_{suf}({in0});'. \\\n#               format(armop=armop[op], **fmtspec)\n\n# -----------------------------------------------------------------------------\n# Load of logicals\n\ndef loadl(aligned, simd_ext, typ):\n    return \\\n    '''/* This can surely be improved but it is not our priority. */\n       return nsimd_notl_{simd_ext}_{typ}(nsimd_eq_{simd_ext}_{typ}(\n                nsimd_load{align}_{simd_ext}_{typ}(\n                  {in0}), nsimd_set1_{simd_ext}_{typ}({zero})));'''. \\\n       format(align='a' if aligned else 'u',\n              zero = 'nsimd_f32_to_f16(0.0f)' if typ == 'f16'\n              else '({})0'.format(typ), **fmtspec)\n\n# -----------------------------------------------------------------------------\n# Store of logicals\n\ndef storel(aligned, simd_ext, typ):\n    return \\\n    '''/* This can surely be improved but it is not our priority. */\n       nsimd_store{align}_{simd_ext}_{typ}({in0},\n         nsimd_if_else1_{simd_ext}_{typ}({in1},\n           nsimd_set1_{simd_ext}_{typ}({one}),\n           nsimd_set1_{simd_ext}_{typ}({zero})));'''. 
\\\n       format(align = 'a' if aligned else 'u',\n              one = 'nsimd_f32_to_f16(1.0f)' if typ == 'f16'\n              else '({})1'.format(typ),\n              zero = 'nsimd_f32_to_f16(0.0f)' if typ == 'f16'\n              else '({})0'.format(typ), **fmtspec)\n\n# -----------------------------------------------------------------------------\n# All and any\n\ndef allany1(opts, op, simd_ext, typ):\n    binop = '&&' if  op == 'all' else '||'\n    if simd_ext == 'neon128':\n        if typ == 'f16':\n            return \\\n            '''return nsimd_{op}_neon128_f32({in0}.v0) {binop}\n                      nsimd_{op}_neon128_f32({in0}.v1);'''. \\\n                      format(op=op, binop=binop, **fmtspec)\n        elif typ == 'f64':\n            return 'return {in0}.v0 {binop} {in0}.v1;'. \\\n                   format(binop=binop, **fmtspec)\n        else:\n            return 'return ' + \\\n            binop.join(['vgetq_lane_u{typnbits}({in0}, {i})'. \\\n                        format(i=i, **fmtspec) \\\n                        for i in range(0, 128 // int(fmtspec['typnbits']))]) + \\\n                        ';'\n    elif simd_ext == 'aarch64':\n        armop = {'all': 'min', 'any': 'max'}\n        normal = 'return v{armop}vq_u{typnbits}({in0}) != 0;'. \\\n                 format(armop=armop[op], **fmtspec)\n        if typ == 'f16':\n            return \\\n            '''#ifdef NSIMD_ARM_FP16\n                 {normal}\n               #else\n                 return nsimd_{op}_aarch64_f32({in0}.v0) {binop}\n                        nsimd_{op}_aarch64_f32({in0}.v1);\n               #endif'''.format(normal=normal, op=op, binop=binop, **fmtspec)\n        elif typ in ['i64', 'u64', 'f64']:\n            return \\\n            'return v{armop}vq_u32(vreinterpretq_u32_u64({in0})) != 0;'. 
\\\n            format(armop=armop[op], **fmtspec)\n        else:\n            return normal\n    elif simd_ext in sve:\n        if op == 'any':\n            operand= convert_to_predicate(opts, '{in0}'.format(**fmtspec))\n            return '''return svptest_any({svtrue}, {operand});'''. \\\n                    format(operand=operand, **fmtspec)\n        else:\n            operand='svnot_z({svtrue}, {op})'. \\\n            format(op=convert_to_predicate(opts, '{in0}'.format(**fmtspec)),\n                   **fmtspec)\n\n            return '''return !svptest_any({svtrue}, {operand});'''. \\\n                    format(operand=operand, **fmtspec)\n\n# -----------------------------------------------------------------------------\n# nbtrue\n\ndef nbtrue1(opts, simd_ext, typ):\n    if simd_ext == 'neon128':\n        if typ == 'f16':\n            return \\\n            '''return nsimd_nbtrue_neon128_f32({in0}.v0) +\n                      nsimd_nbtrue_neon128_f32({in0}.v1);'''. \\\n                      format(**fmtspec)\n        elif typ == 'f64':\n            return 'return -(int)((i64){in0}.v0 + (i64){in0}.v1);'. \\\n                   format(**fmtspec)\n        else:\n            return \\\n            '''nsimd_neon128_vi{typnbits} temp =\n                   vreinterpretq_s{typnbits}_u{typnbits}({in0});\n               return -(int)('''.format(**fmtspec) + \\\n            '+'.join(['vgetq_lane_s{typnbits}(temp, {i})'. \\\n                      format(i=i, **fmtspec) \\\n                      for i in range(0, 128 // int(fmtspec['typnbits']))]) + \\\n                      ');'\n    elif simd_ext == 'aarch64':\n        normal = \\\n        '''return -(int)vaddvq_s{typnbits}(\n                          vreinterpretq_s{typnbits}_u{typnbits}({in0}));'''. 
\\\n                     format(**fmtspec)\n        if typ == 'f16':\n            return \\\n            '''#ifdef NSIMD_ARM_FP16\n                 {normal}\n               #else\n                 return nsimd_nbtrue_aarch64_f32({in0}.v0) +\n                        nsimd_nbtrue_aarch64_f32({in0}.v1);\n               #endif'''.format(normal=normal, **fmtspec)\n        elif typ in ['i64', 'u64', 'f64']:\n            return \\\n            '''return -(vaddvq_s32(vreinterpretq_s32_u64({in0})) >> 1);'''. \\\n                         format(**fmtspec)\n        else:\n            return normal\n    elif simd_ext in sve:\n        return 'return (int)svcntp_b{typnbits}({svtrue}, {op});'. \\\n               format(op=convert_to_predicate(opts, '{in0}'.format(**fmtspec)),\n                      **fmtspec)\n\n# -----------------------------------------------------------------------------\n# Reinterpret logical\n\ndef reinterpretl1(simd_ext, from_typ, to_typ):\n    if from_typ == to_typ or simd_ext in sve:\n        return 'return {in0};'.format(**fmtspec)\n    to_f16_with_f32 = \\\n    '''nsimd_{simd_ext}_vlf16 ret;\n       u32 buf[4];\n       buf[0] = (vgetq_lane_u16({in0}, 0) ? (u32)-1 : 0);\n       buf[1] = (vgetq_lane_u16({in0}, 1) ? (u32)-1 : 0);\n       buf[2] = (vgetq_lane_u16({in0}, 2) ? (u32)-1 : 0);\n       buf[3] = (vgetq_lane_u16({in0}, 3) ? (u32)-1 : 0);\n       ret.v0 = vld1q_u32(buf);\n       buf[0] = (vgetq_lane_u16({in0}, 4) ? (u32)-1 : 0);\n       buf[1] = (vgetq_lane_u16({in0}, 5) ? (u32)-1 : 0);\n       buf[2] = (vgetq_lane_u16({in0}, 6) ? (u32)-1 : 0);\n       buf[3] = (vgetq_lane_u16({in0}, 7) ? (u32)-1 : 0);\n       ret.v1 = vld1q_u32(buf);\n       return ret;'''.format(**fmtspec)\n    from_f16_with_f32 = \\\n    '''u16 buf[8];\n       buf[0] = (vgetq_lane_u32({in0}.v0, 0) ? (u16)-1 : 0);\n       buf[1] = (vgetq_lane_u32({in0}.v0, 1) ? (u16)-1 : 0);\n       buf[2] = (vgetq_lane_u32({in0}.v0, 2) ? 
(u16)-1 : 0);\n       buf[3] = (vgetq_lane_u32({in0}.v0, 3) ? (u16)-1 : 0);\n       buf[4] = (vgetq_lane_u32({in0}.v1, 0) ? (u16)-1 : 0);\n       buf[5] = (vgetq_lane_u32({in0}.v1, 1) ? (u16)-1 : 0);\n       buf[6] = (vgetq_lane_u32({in0}.v1, 2) ? (u16)-1 : 0);\n       buf[7] = (vgetq_lane_u32({in0}.v1, 3) ? (u16)-1 : 0);\n       return vld1q_u16(buf);'''.format(**fmtspec)\n    if simd_ext == 'neon128':\n        if to_typ == 'f16':\n            return to_f16_with_f32\n        elif from_typ == 'f16':\n            return from_f16_with_f32\n        elif to_typ == 'f64':\n            return '''nsimd_neon128_vlf64 ret;\n                      ret.v0 = vgetq_lane_u64({in0}, 0);\n                      ret.v1 = vgetq_lane_u64({in0}, 1);\n                      return ret;'''.format(**fmtspec)\n        elif from_typ == 'f64':\n            return '''u64 buf[2];\n                      buf[0] = {in0}.v0;\n                      buf[1] = {in0}.v1;\n                      return vld1q_u64(buf);'''.format(**fmtspec)\n        else:\n            return 'return {in0};'.format(**fmtspec)\n    elif simd_ext == 'aarch64':\n        if to_typ == 'f16':\n            return '''#ifdef NSIMD_ARM_FP16\n                        return {in0};\n                      #else\n                        {using_f32}\n                      #endif'''.format(using_f32=to_f16_with_f32, **fmtspec)\n        elif from_typ == 'f16':\n            return '''#ifdef NSIMD_ARM_FP16\n                        return {in0};\n                      #else\n                        {using_f32}\n                      #endif'''.format(using_f32=from_f16_with_f32, **fmtspec)\n        else:\n            return 'return {in0};'.format(**fmtspec)\n\n# -----------------------------------------------------------------------------\n# Convert\n\ndef convert1(simd_ext, from_typ, to_typ):\n    fmtspec2 = fmtspec.copy()\n    fmtspec2['to_suf'] = suf(to_typ)\n    fmtspec2['from_suf'] = suf(from_typ)\n    if from_typ == to_typ:\n        return 
'return {in0};'.format(**fmtspec)\n    if from_typ in common.iutypes and to_typ in common.iutypes:\n        if simd_ext in neon:\n            return 'return vreinterpretq_{to_suf}_{from_suf}({in0});'. \\\n                   format(**fmtspec2)\n        else:\n            return 'return svreinterpret_{to_suf}_{from_suf}({in0});'. \\\n                   format(**fmtspec2)\n    if simd_ext in sve:\n        return 'return svcvt_{to_suf}_{from_suf}_x({svtrue}, {in0});'. \\\n               format(**fmtspec2)\n    to_f16_with_f32 = \\\n    '''nsimd_{simd_ext}_vf16 ret;\n       f32 buf[4];\n       buf[0] = (f32)vgetq_lane_{from_suf}({in0}, 0);\n       buf[1] = (f32)vgetq_lane_{from_suf}({in0}, 1);\n       buf[2] = (f32)vgetq_lane_{from_suf}({in0}, 2);\n       buf[3] = (f32)vgetq_lane_{from_suf}({in0}, 3);\n       ret.v0 = vld1q_f32(buf);\n       buf[0] = (f32)vgetq_lane_{from_suf}({in0}, 4);\n       buf[1] = (f32)vgetq_lane_{from_suf}({in0}, 5);\n       buf[2] = (f32)vgetq_lane_{from_suf}({in0}, 6);\n       buf[3] = (f32)vgetq_lane_{from_suf}({in0}, 7);\n       ret.v1 = vld1q_f32(buf);\n       return ret;'''.format(**fmtspec2)\n    from_f16_with_f32 = \\\n    '''{to_typ} buf[8];\n       buf[0] = ({to_typ})vgetq_lane_f32({in0}.v0, 0);\n       buf[1] = ({to_typ})vgetq_lane_f32({in0}.v0, 1);\n       buf[2] = ({to_typ})vgetq_lane_f32({in0}.v0, 2);\n       buf[3] = ({to_typ})vgetq_lane_f32({in0}.v0, 3);\n       buf[4] = ({to_typ})vgetq_lane_f32({in0}.v1, 0);\n       buf[5] = ({to_typ})vgetq_lane_f32({in0}.v1, 1);\n       buf[6] = ({to_typ})vgetq_lane_f32({in0}.v1, 2);\n       buf[7] = ({to_typ})vgetq_lane_f32({in0}.v1, 3);\n       return vld1q_{to_suf}(buf);'''.format(**fmtspec2)\n    if simd_ext == 'neon128':\n        if to_typ == 'f16':\n            return to_f16_with_f32\n        elif from_typ == 'f16':\n            return from_f16_with_f32\n        elif to_typ == 'f64':\n            return '''nsimd_neon128_vf64 ret;\n                      ret.v0 = 
(f64)vgetq_lane_{from_suf}({in0}, 0);\n                      ret.v1 = (f64)vgetq_lane_{from_suf}({in0}, 1);\n                      return ret;'''.format(**fmtspec2)\n        elif from_typ == 'f64':\n            return '''{to_typ} buf[2];\n                      buf[0] = ({to_typ}){in0}.v0;\n                      buf[1] = ({to_typ}){in0}.v1;\n                      return vld1q_{to_suf}(buf);'''.format(**fmtspec2)\n        else:\n            return 'return vcvtq_{to_suf}_{from_suf}({in0});'. \\\n                   format(**fmtspec2)\n    elif simd_ext == 'aarch64':\n        if to_typ == 'f16':\n            return '''#ifdef NSIMD_ARM_FP16\n                        return vcvtq_{to_suf}_{from_suf}({in0});\n                      #else\n                        {using_f32}\n                      #endif'''.format(using_f32=to_f16_with_f32, **fmtspec2)\n        elif from_typ == 'f16':\n            return '''#ifdef NSIMD_ARM_FP16\n                        return vcvtq_{to_suf}_{from_suf}({in0});\n                      #else\n                        {using_f32}\n                      #endif'''.format(using_f32=from_f16_with_f32, **fmtspec2)\n        else:\n            return 'return vcvtq_{to_suf}_{from_suf}({in0});'. \\\n                   format(**fmtspec2)\n\n# -----------------------------------------------------------------------------\n# Reinterpret\n\ndef reinterpret1(simd_ext, from_typ, to_typ):\n    fmtspec2 = fmtspec.copy()\n    fmtspec2['to_suf'] = suf(to_typ)\n    fmtspec2['from_suf'] = suf(from_typ)\n    if from_typ == to_typ:\n        return 'return {in0};'.format(**fmtspec)\n    if simd_ext in sve:\n        return 'return svreinterpret_{to_suf}_{from_suf}({in0});'. 
\\\n               format(**fmtspec2)\n    to_f16_with_f32 = \\\n    '''nsimd_{simd_ext}_vf16 ret;\n       f32 buf[4];\n       buf[0] = nsimd_u16_to_f32((u16)vgetq_lane_{from_suf}({in0}, 0));\n       buf[1] = nsimd_u16_to_f32((u16)vgetq_lane_{from_suf}({in0}, 1));\n       buf[2] = nsimd_u16_to_f32((u16)vgetq_lane_{from_suf}({in0}, 2));\n       buf[3] = nsimd_u16_to_f32((u16)vgetq_lane_{from_suf}({in0}, 3));\n       ret.v0 = vld1q_f32(buf);\n       buf[0] = nsimd_u16_to_f32((u16)vgetq_lane_{from_suf}({in0}, 4));\n       buf[1] = nsimd_u16_to_f32((u16)vgetq_lane_{from_suf}({in0}, 5));\n       buf[2] = nsimd_u16_to_f32((u16)vgetq_lane_{from_suf}({in0}, 6));\n       buf[3] = nsimd_u16_to_f32((u16)vgetq_lane_{from_suf}({in0}, 7));\n       ret.v1 = vld1q_f32(buf);\n       return ret;'''.format(**fmtspec2)\n    from_f16_with_f32 = \\\n    '''{to_typ} buf[8];\n       buf[0] = ({to_typ})nsimd_f32_to_u16(vgetq_lane_f32({in0}.v0, 0));\n       buf[1] = ({to_typ})nsimd_f32_to_u16(vgetq_lane_f32({in0}.v0, 1));\n       buf[2] = ({to_typ})nsimd_f32_to_u16(vgetq_lane_f32({in0}.v0, 2));\n       buf[3] = ({to_typ})nsimd_f32_to_u16(vgetq_lane_f32({in0}.v0, 3));\n       buf[4] = ({to_typ})nsimd_f32_to_u16(vgetq_lane_f32({in0}.v1, 0));\n       buf[5] = ({to_typ})nsimd_f32_to_u16(vgetq_lane_f32({in0}.v1, 1));\n       buf[6] = ({to_typ})nsimd_f32_to_u16(vgetq_lane_f32({in0}.v1, 2));\n       buf[7] = ({to_typ})nsimd_f32_to_u16(vgetq_lane_f32({in0}.v1, 3));\n       return vld1q_{to_suf}(buf);'''.format(**fmtspec2)\n    if simd_ext == 'neon128':\n        if to_typ == 'f16':\n            return to_f16_with_f32\n        elif from_typ == 'f16':\n            return from_f16_with_f32\n        elif to_typ == 'f64':\n            return '''nsimd_neon128_vf64 ret;\n                      union {{ f64 to; {from_typ} from; }} buf;\n                      buf.from = vgetq_lane_{from_suf}({in0}, 0);\n                      ret.v0 = buf.to;\n                      buf.from = vgetq_lane_{from_suf}({in0}, 1);\n 
                     ret.v1 = buf.to;\n                      return ret;'''.format(**fmtspec2)\n        elif from_typ == 'f64':\n            return '''union {{ f64 from; {to_typ} to; }} buf_;\n                      {to_typ} buf[2];\n                      buf_.from = {in0}.v0;\n                      buf[0] = buf_.to;\n                      buf_.from = {in0}.v1;\n                      buf[1] = buf_.to;\n                      return vld1q_{to_suf}(buf);'''.format(**fmtspec2)\n        else:\n            return 'return vreinterpretq_{to_suf}_{from_suf}({in0});'. \\\n                   format(**fmtspec2)\n    elif simd_ext == 'aarch64':\n        if to_typ == 'f16':\n            return '''#ifdef NSIMD_ARM_FP16\n                        return vreinterpretq_{to_suf}_{from_suf}({in0});\n                      #else\n                        {using_f32}\n                      #endif'''.format(using_f32=to_f16_with_f32, **fmtspec2)\n        elif from_typ == 'f16':\n            return '''#ifdef NSIMD_ARM_FP16\n                        return vreinterpretq_{to_suf}_{from_suf}({in0});\n                      #else\n                        {using_f32}\n                      #endif'''.format(using_f32=from_f16_with_f32, **fmtspec2)\n        else:\n            return 'return vreinterpretq_{to_suf}_{from_suf}({in0});'. 
\\\n                   format(**fmtspec2)\n\n# -----------------------------------------------------------------------------\n# reverse\n\ndef reverse1(simd_ext, typ):\n    armtyp = suf(typ)\n    if simd_ext in sve:\n        return '''return svrev_{suf}( {in0} );'''.format(**fmtspec)\n    elif simd_ext == 'neon128' and typ == 'f64':\n        return '''nsimd_neon128_vf64 ret;\n                  ret.v0 = {in0}.v1;\n                  ret.v1 = {in0}.v0;\n                  return ret;'''.format(**fmtspec)\n    elif typ in [ 'i64', 'u64', 'f64' ]:\n        return '''return vcombine_{armtyp}(vget_high_{armtyp}({in0}),\n                                           vget_low_{armtyp}({in0}));'''. \\\n                                           format(armtyp=armtyp, **fmtspec)\n    elif typ == 'f16':\n        return '''nsimd_{simd_ext}_vf16 ret;\n                  ret.v0 = nsimd_reverse_{simd_ext}_f32(a0.v1);\n                  ret.v1 = nsimd_reverse_{simd_ext}_f32(a0.v0);\n                  return ret;'''.format(**fmtspec)\n    else:\n        return '''{in0} = vrev64q_{armtyp}({in0});\n                  return vcombine_{armtyp}(vget_high_{armtyp}({in0}),\n                                           vget_low_{armtyp}({in0}));'''. 
\\\n                                           format(armtyp=armtyp, **fmtspec)\n\n# -----------------------------------------------------------------------------\n# Horizontal sum\n\ndef addv(simd_ext, typ):\n\n    if simd_ext == 'neon128':\n        if typ == 'f64':\n            return 'return ({typ})({in0}.v0 + {in0}.v1);'.format(**fmtspec)\n        elif typ == 'f16':\n            return \\\n            '''#ifdef NSIMD_ARM_FP16\n                 {t} tmp = vadd_{suf}(vget_low_{suf}({in0}),\n                                      vget_high_{suf}({in0}));\n                 tmp = vadd_{suf}(tmp, vext_{suf}(tmp, tmp, 3));\n                 tmp = vadd_{suf}(tmp, vext_{suf}(tmp, tmp, 0));\n                 return vget_lane_{suf}(tmp, 0);\n               #else\n                 float32x2_t tmp0 = vadd_f32(vget_low_f32({in0}.v0),\n                                             vget_high_f32({in0}.v0));\n                 tmp0 = vadd_f32(tmp0, vext_f32(tmp0, tmp0, 1));\n                 float32x2_t tmp1 = vadd_f32(vget_low_f32({in0}.v1),\n                                             vget_high_f32({in0}.v1));\n                 tmp1 = vadd_f32(tmp1, vext_f32(tmp1, tmp1, 1));\n                 return nsimd_f32_to_f16(vget_lane_f32(tmp0, 0) +\n                                         vget_lane_f32(tmp1, 0));\n               #endif''' .format(t=half_neon64_typ(typ), **fmtspec)\n        elif typ == 'f32':\n            return \\\n            '''{t} tmp = vadd_{suf}(vget_low_{suf}({in0}),\n                                    vget_high_{suf}({in0}));\n               tmp = vadd_{suf}(tmp, vext_{suf}(tmp, tmp, 1));\n               return vget_lane_{suf}(tmp, 0);'''. 
\\\n               format(t=half_neon64_typ(typ), **fmtspec)\n        elif typ[0] in ['i', 'u']:\n            le = 128 // int(typ[1:]);\n            return \\\n            '''{typ} res = ({typ})0;\n               {typ} buf[{le}];\n               vst1q_{suf}(buf, {in0});\n               for (int i = 0; i < {le}; i++) {{\n                 res += buf[i];\n               }}\n               return res;'''. \\\n               format(le=le, **fmtspec)\n    elif simd_ext == 'aarch64':\n        if typ == 'f16':\n            return \\\n            '''#ifdef NSIMD_ARM_FP16\n                 {t} tmp = vadd_{suf}(vget_low_{suf}({in0}),\n                                      vget_high_{suf}({in0}));\n                 tmp = vadd_{suf}(tmp, vext_{suf}(tmp, tmp, 3));\n                 tmp = vadd_{suf}(tmp, vext_{suf}(tmp, tmp, 0));\n                 return vget_lane_{suf}(tmp, 0);\n               #else\n                 float32x2_t tmp0 = vadd_f32(vget_low_f32({in0}.v0),\n                                             vget_high_f32({in0}.v0));\n                 tmp0 = vadd_f32(tmp0, vext_f32(tmp0, tmp0, 1));\n                 float32x2_t tmp1 = vadd_f32(vget_low_f32({in0}.v1),\n                                             vget_high_f32({in0}.v1));\n                 tmp1 = vadd_f32(tmp1, vext_f32(tmp1, tmp1, 1));\n                 return nsimd_f32_to_f16(vget_lane_f32(tmp0, 0) +\n                                         vget_lane_f32(tmp1, 0));\n               #endif''' .format(t=half_neon64_typ(typ), **fmtspec)\n        elif typ in ['f32', 'f64']:\n            return 'return vaddvq_{suf}({in0});'.format(**fmtspec)\n    elif simd_ext in sve:\n        return 'return svaddv_{suf}({svtrue}, {in0});' .format(**fmtspec)\n\n# -----------------------------------------------------------------------------\n# Up convert\n\ndef upcvt1(simd_ext, from_typ, to_typ):\n    # For integer upcast, due to 2's complement representation\n    # _s : signed   -> bigger signed\n    # _s : signed   -> bigger 
unsigned\n    # _u : unsigned -> bigger signed\n    # _u : unsigned -> bigger unsigned\n    if simd_ext in neon:\n        if from_typ == 'f16' and to_typ == 'f32':\n            return \\\n            '''#ifdef NSIMD_ARM_FP16\n                 nsimd_{simd_ext}_vf32x2 ret;\n                 ret.v0 = vcvt_f32_f16(vget_low_{suf}({in0}));\n                 ret.v1 = vcvt_f32_f16(vget_high_{suf}({in0}));\n                 return ret;\n               #else\n                 nsimd_{simd_ext}_vf32x2 ret;\n                 ret.v0 = {in0}.v0;\n                 ret.v1 = {in0}.v1;\n                 return ret;\n               #endif'''.format(**fmtspec)\n        elif from_typ == 'f32' and to_typ == 'f64':\n            if simd_ext == 'neon128':\n                return \\\n                '''nsimd_neon128_vf64x2 ret;\n                   f32 buf[4];\n                   vst1q_f32(buf, {in0});\n                   ret.v0.v0 = (f64)buf[0];\n                   ret.v0.v1 = (f64)buf[1];\n                   ret.v1.v0 = (f64)buf[2];\n                   ret.v1.v1 = (f64)buf[3];\n                   return ret;'''.format(**fmtspec)\n            else:\n                return \\\n                '''nsimd_aarch64_vf64x2 ret;\n                   ret.v0 = vcvt_f64_f32(vget_low_{suf}({in0}));\n                   ret.v1 = vcvt_f64_f32(vget_high_{suf}({in0}));\n                   return ret;'''.format(**fmtspec)\n        elif (from_typ in common.itypes and to_typ in common.itypes) or \\\n             (from_typ in common.utypes and to_typ in common.utypes):\n            return '''nsimd_{simd_ext}_v{to_typ}x2 ret;\n                      ret.v0 = vmovl_{suf}(vget_low_{suf}({in0}));\n                      ret.v1 = vmovl_{suf}(vget_high_{suf}({in0}));\n                      return ret;'''.format(**fmtspec)\n        elif (from_typ in common.itypes and to_typ in common.utypes) or \\\n             (from_typ in common.utypes and to_typ in common.itypes):\n            return '''nsimd_{simd_ext}_v{to_typ}x2 
ret;\n                      ret.v0 = vreinterpretq_{suf_to_typ}_{suf_int_typ}(\n                                 vmovl_{suf}(vget_low_{suf}({in0})));\n                      ret.v1 = vreinterpretq_{suf_to_typ}_{suf_int_typ}(\n                                 vmovl_{suf}(vget_high_{suf}({in0})));\n                      return ret;'''. \\\n                      format(suf_to_typ=suf(to_typ),\n                             suf_int_typ=suf(from_typ[0] + to_typ[1:]),\n                             **fmtspec)\n        else:\n            return \\\n            '''nsimd_{simd_ext}_v{to_typ}x2 ret;\n               nsimd_{simd_ext}_v{int_typ}x2 tmp;\n               tmp = nsimd_upcvt_{simd_ext}_{int_typ}_{from_typ}({in0});\n               ret.v0 = nsimd_cvt_{simd_ext}_{to_typ}_{int_typ}(tmp.v0);\n               ret.v1 = nsimd_cvt_{simd_ext}_{to_typ}_{int_typ}(tmp.v1);\n               return ret;'''. \\\n               format(int_typ=from_typ[0] + to_typ[1:], **fmtspec)\n\n    # Getting here means that we deal with SVE\n    if (from_typ in common.itypes and to_typ in common.itypes) or \\\n       (from_typ in common.utypes and to_typ in common.utypes):\n        return '''nsimd_{simd_ext}_v{to_typ}x2 ret;\n                  ret.v0 = svunpklo_{suf_to_typ}({in0});\n                  ret.v1 = svunpkhi_{suf_to_typ}({in0});\n                  return ret;'''.format(suf_to_typ=suf(to_typ), **fmtspec)\n    elif (from_typ in common.itypes and to_typ in common.utypes) or \\\n         (from_typ in common.utypes and to_typ in common.itypes):\n        return \\\n        '''nsimd_{simd_ext}_v{to_typ}x2 ret;\n           ret.v0 = svreinterpret_{suf_to_typ}_{suf_int_typ}(\n                      svunpklo_{suf_int_typ}({in0}));\n           ret.v1 = svreinterpret_{suf_to_typ}_{suf_int_typ}(\n                      svunpkhi_{suf_int_typ}({in0}));\n           return ret;'''. 
\\\n           format(suf_to_typ=suf(to_typ),\n                  suf_int_typ=suf(from_typ[0] + to_typ[1:]), **fmtspec)\n    elif from_typ in common.iutypes and to_typ in common.ftypes:\n        return \\\n        '''nsimd_{simd_ext}_v{to_typ}x2 ret;\n           ret.v0 = svcvt_{suf_to_typ}_{suf_int_typ}_x(\n                      {svtrue}, svunpklo_{suf_int_typ}({in0}));\n           ret.v1 = svcvt_{suf_to_typ}_{suf_int_typ}_x(\n                      {svtrue}, svunpkhi_{suf_int_typ}({in0}));\n           return ret;'''. \\\n           format(suf_to_typ=suf(to_typ),\n                  suf_int_typ=suf(from_typ[0] + to_typ[1:]), **fmtspec)\n    else:\n        return \\\n        '''nsimd_{simd_ext}_v{to_typ}x2 ret;\n           ret.v0 = svcvt_{suf_to_typ}_{suf}_x({svtrue}, svzip1_{suf}(\n                      {in0}, {in0}));\n           ret.v1 = svcvt_{suf_to_typ}_{suf}_x({svtrue}, svzip2_{suf}(\n                      {in0}, {in0}));\n           return ret;'''.format(suf_to_typ=suf(to_typ), **fmtspec)\n\n# -----------------------------------------------------------------------------\n# Down convert\n\ndef downcvt1(simd_ext, from_typ, to_typ):\n    if simd_ext in neon:\n        if from_typ == 'f64' and to_typ == 'f32':\n            if simd_ext == 'neon128':\n                return '''f32 buf[4];\n                          buf[0] = (f32){in0}.v0;\n                          buf[1] = (f32){in0}.v1;\n                          buf[2] = (f32){in1}.v0;\n                          buf[3] = (f32){in1}.v1;\n                          return vld1q_f32(buf);'''.format(**fmtspec)\n            else:\n                return '''return vcombine_f32(vcvt_f32_f64({in0}),\n                                              vcvt_f32_f64({in1}));'''. 
\\\n                                              format(**fmtspec)\n        elif from_typ == 'f32' and to_typ == 'f16':\n            return '''#ifdef NSIMD_ARM_FP16\n                        return vcombine_f16(vcvt_f16_f32({in0}),\n                                            vcvt_f16_f32({in1}));\n                      #else\n                        nsimd_{simd_ext}_vf16 ret;\n                        ret.v0 = {in0};\n                        ret.v1 = {in1};\n                        return ret;\n                      #endif'''.format(**fmtspec)\n        elif (from_typ in common.itypes and to_typ in common.itypes) or \\\n             (from_typ in common.utypes and to_typ in common.utypes):\n            return '''return vcombine_{suf_to_typ}(vmovn_{suf}({in0}),\n                               vmovn_{suf}({in1}));'''. \\\n                               format(suf_to_typ=suf(to_typ), **fmtspec)\n        elif (from_typ in common.itypes and to_typ in common.itypes) or \\\n             (from_typ in common.utypes and to_typ in common.utypes):\n            return '''return vreinterpretq_{suf_to_typ}(\n                               vcombine_{suf_to_typ}(vmovn_{suf}({in0}),\n                                 vmovn_{suf}({in1}));'''. \\\n                                 format(suf_to_typ=suf(to_typ), **fmtspec)\n        else:\n            return \\\n            '''return nsimd_downcvt_{simd_ext}_{to_typ}_{int_typ}(\n                        nsimd_cvt_{simd_ext}_{int_typ}_{from_typ}({in0}),\n                        nsimd_cvt_{simd_ext}_{int_typ}_{from_typ}({in1}));'''.\\\n                        format(int_typ=to_typ[0] + from_typ[1:], **fmtspec)\n\n    # Getting here means that we deal with SVE\n    if from_typ in common.iutypes and to_typ in common.iutypes:\n        return '''return svuzp1_{suf_to_typ}(\n                           svreinterpret_{suf_to_typ}_{suf}({in0}),\n                           svreinterpret_{suf_to_typ}_{suf}({in1}));'''. 
\\\n                           format(suf_to_typ=suf(to_typ), **fmtspec)\n    elif from_typ in common.ftypes and to_typ in common.iutypes:\n        return \\\n        '''return svuzp1_{suf_to_typ}(svreinterpret_{suf_to_typ}_{suf_int_typ}(\n                    svcvt_{suf_int_typ}_{suf}_x({svtrue}, {in0})),\n                      svreinterpret_{suf_to_typ}_{suf_int_typ}(\n                        svcvt_{suf_int_typ}_{suf}_x({svtrue}, {in1})));'''. \\\n                        format(suf_to_typ=suf(to_typ),\n                               suf_int_typ=suf(to_typ[0] + from_typ[1:]),\n                               **fmtspec)\n    else:\n        return \\\n        '''return svuzp1_{suf_to_typ}(svcvt_{suf_to_typ}_{suf}_x(\n                    {svtrue}, {in0}), svcvt_{suf_to_typ}_{suf}_x(\n                      {svtrue}, {in1}));'''. \\\n                    format(suf_to_typ=suf(to_typ), **fmtspec)\n\n# -----------------------------------------------------------------------------\n# adds\n\ndef adds(simd_ext, from_typ):\n    if from_typ in common.ftypes:\n        return 'return nsimd_add_{simd_ext}_{from_typ}({in0}, {in1});'. \\\n               format(**fmtspec)\n    if simd_ext in neon:\n        return 'return vqaddq_{suf}({in0}, {in1});'.format(**fmtspec)\n    else:\n        return 'return svqadd_{suf}({in0}, {in1});'.format(**fmtspec)\n\n# -----------------------------------------------------------------------------\n# subs\n\ndef subs(simd_ext, from_typ):\n    if from_typ in common.ftypes:\n        return 'return nsimd_sub_{simd_ext}_{from_typ}({in0}, {in1});'. 
\\\n               format(**fmtspec)\n    elif simd_ext in neon:\n        return 'return vqsubq_{suf}({in0}, {in1});'.format(**fmtspec)\n    else:\n        return 'return svqsub_{suf}({in0}, {in1});'.format(**fmtspec)\n\n# -----------------------------------------------------------------------------\n# to_mask\n\ndef to_mask1(opts, simd_ext, typ):\n    if typ in common.itypes + common.ftypes:\n        normal = 'return vreinterpretq_{suf}_u{typnbits}({in0});'. \\\n                 format(**fmtspec)\n    else:\n        normal = 'return {in0};'.format(**fmtspec)\n    emulate_f16 = '''nsimd_{simd_ext}_vf16 ret;\n                     ret.v0 = nsimd_to_mask_{simd_ext}_f32({in0}.v0);\n                     ret.v1 = nsimd_to_mask_{simd_ext}_f32({in0}.v1);\n                     return ret;'''.format(**fmtspec)\n    if simd_ext == 'neon128' and typ == 'f16':\n        return emulate_f16\n    elif simd_ext == 'neon128' and typ == 'f64':\n        return '''nsimd_neon128_vf64 ret;\n                  ret.v0 = nsimd_scalar_reinterpret_f64_u64({in0}.v0);\n                  ret.v1 = nsimd_scalar_reinterpret_f64_u64({in0}.v1);\n                  return ret;'''.format(**fmtspec)\n    elif simd_ext == 'aarch64' and typ == 'f16':\n        return '''#ifdef NSIMD_ARM_FP16\n                    {normal}\n                  #else\n                    {emulate_f16}\n                  #endif'''.format(normal=normal, emulate_f16=emulate_f16)\n    elif simd_ext in sve:\n        if opts.sve_emulate_bool:\n            return 'return svreinterpret_{suf}_u{typnbits}({in0});'. \\\n                    format(**fmtspec)\n        else:\n           utyp = 'u{}'.format(fmtspec['typnbits'])\n           return '''return svreinterpret_{suf}_{utyp}(svsel_{utyp}(\n                          {in0}, svdup_n_{utyp}(({utyp})-1),\n                          svdup_n_{utyp}(({utyp})0)));'''. 
\\\n                          format(utyp=utyp, **fmtspec)\n    else:\n        return normal\n\n# -----------------------------------------------------------------------------\n# iota\n\ndef iota(simd_ext, typ):\n    if simd_ext in sve:\n        if typ in common.iutypes:\n            return 'return svindex_{suf}(0, 1);'.format(**fmtspec)\n        else:\n            return \\\n            '''return svcvt_{suf}_s{typnbits}_x({svtrue},\n                        svindex_s{typnbits}(0, 1));'''.format(**fmtspec)\n    if typ == 'f64' and simd_ext == 'neon128':\n        return '''nsimd_neon128_vf64 ret;\n                  ret.v0 = 0.0;\n                  ret.v1 = 1.0;\n                  return ret;'''.format(**fmtspec)\n    typ2 = 'f32' if typ == 'f16' else typ\n    le = 128 // int(typ[1:])\n    iota = ', '.join(['({typ2}){i}'.format(typ2=typ2, i=i) \\\n                      for i in range(le)])\n    normal = '''{typ} buf[{le}] = {{ {iota} }};\n                return vld1q_{suf}(buf);'''. \\\n                format(le=le, iota=iota, **fmtspec)\n    if typ == 'f16':\n        return '''#ifdef NSIMD_ARM_FP16\n                    {normal}\n                  #else\n                    f32 buf[8] = {{ {iota} }};\n                    nsimd_{simd_ext}_vf16 ret;\n                    ret.v0 = vld1q_f32(buf);\n                    ret.v1 = vld1q_f32(buf + 4);\n                    return ret;\n                  #endif'''.format(iota=iota, normal=normal, **fmtspec)\n    return normal\n\n# -----------------------------------------------------------------------------\n# mask_for_loop_tail\n\ndef mask_for_loop_tail(simd_ext, typ):\n    if typ == 'f16':\n        threshold = 'nsimd_f32_to_f16((f32)({in1} - {in0}))'.format(**fmtspec)\n    else:\n        threshold = '({typ})({in1} - {in0})'.format(**fmtspec)\n    if simd_ext == 'sve':\n        le = 'nsimd_len_sve_{typ}()'.format(**fmtspec)\n    elif simd_ext in fixed_sized_sve:\n        le = int(simd_ext[3:]) // int(typ[1:])\n    else:\n        
le = 128 // int(typ[1:])\n    return '''if ({in0} >= {in1}) {{\n                return nsimd_set1l_{simd_ext}_{typ}(0);\n              }}\n              if ({in1} - {in0} < {le}) {{\n                nsimd_{simd_ext}_v{typ} n =\n                      nsimd_set1_{simd_ext}_{typ}({threshold});\n                return nsimd_lt_{simd_ext}_{typ}(\n                           nsimd_iota_{simd_ext}_{typ}(), n);\n              }} else {{\n                return nsimd_set1l_{simd_ext}_{typ}(1);\n              }}'''.format(le=le, threshold=threshold, **fmtspec)\n\n# -----------------------------------------------------------------------------\n# to_logical\n\ndef to_logical1(opts, simd_ext, typ):\n    if typ in common.iutypes:\n        return '''return nsimd_ne_{simd_ext}_{typ}({in0},\n                           nsimd_set1_{simd_ext}_{typ}(({typ})0));'''. \\\n                           format(**fmtspec)\n    normal_fp = \\\n    '''return nsimd_reinterpretl_{simd_ext}_{suf}_{utyp}(\n                nsimd_ne_{simd_ext}_{utyp}(\n                  nsimd_reinterpret_{simd_ext}_{utyp}_{typ}(\n                    {in0}), nsimd_set1_{simd_ext}_{utyp}(({utyp})0)));'''. 
\\\n                    format(utyp='u{}'.format(fmtspec['typnbits']), **fmtspec)\n    if typ in ['f32', 'f64'] or (typ == 'f16' and simd_ext in sve):\n        return normal_fp\n    emulate_fp16 = \\\n    '''nsimd_{simd_ext}_vlf16 ret;\n       ret.v0 = nsimd_to_logical_{simd_ext}_f32({in0}.v0);\n       ret.v1 = nsimd_to_logical_{simd_ext}_f32({in0}.v1);\n       return ret;'''.format(**fmtspec)\n    if simd_ext == 'aarch64':\n        return '''#ifdef NSIMD_ARM_FP16\n                    {normal_fp}\n                  #else\n                    {emulate_fp16}\n                  #endif'''.format(normal_fp=normal_fp,\n                                   emulate_fp16=emulate_fp16)\n    elif simd_ext == 'neon128':\n        return emulate_fp16\n\n# -----------------------------------------------------------------------------\n# unpack functions\n\ndef zip_unzip_half(func, simd_ext, typ):\n    if simd_ext == 'aarch64' or simd_ext in sve:\n        if typ =='f16' and simd_ext == 'aarch64':\n            if func in ['zip1', 'zip2']:\n                return '''\\\n                #ifdef NSIMD_ARM_FP16\n                  return {s}v{op}{q}_{suf}({in0}, {in1});\n                #else\n                  nsimd_{simd_ext}_v{typ} ret;\n                  ret.v0 = {s}vzip1{q}_f32({in0}.v{i}, {in1}.v{i});\n                  ret.v1 = {s}vzip2{q}_f32({in0}.v{i}, {in1}.v{i});\n                  return ret;\n                #endif\n                '''.format(op=func,\n                           i = '0' if func in ['zip1', 'uzp1'] else '1',\n                           s = 's' if simd_ext in sve else '',\n                           q = '' if simd_ext in sve else 'q', **fmtspec)\n            else:\n                return '''\\\n                #ifdef NSIMD_ARM_FP16\n                  return {s}v{op}{q}_{suf}({in0}, {in1});\n                #else\n                  nsimd_{simd_ext}_v{typ} ret;\n                  ret.v0 = {s}v{func}{q}_f32({in0}.v0, {in0}.v1);\n                  ret.v1 = 
{s}v{func}{q}_f32({in1}.v0, {in1}.v1);\n                  return ret;\n                #endif'''.format(op=func, func=func,\n                          s = 's' if simd_ext in sve else '',\n                          q = '' if simd_ext in sve else 'q', **fmtspec)\n        else:\n            return 'return {s}v{op}{q}_{suf}({in0}, {in1});'. \\\n                format(op=func, s = 's' if simd_ext in sve else '',\n                       q = '' if simd_ext in sve else 'q', **fmtspec)\n    elif simd_ext == 'neon128':\n        armop = {'zip1': 'zipq', 'zip2': 'zipq', 'uzp1': 'uzpq',\n                 'uzp2': 'uzpq'}\n        prefix = { 'i': 'int', 'u': 'uint', 'f': 'float' }\n        neon_typ = '{}{}x{}x2_t'. \\\n            format(prefix[typ[0]], typ[1:], 128 // int(typ[1:]))\n        if typ == 'f16':\n            if func in ['zip1', 'zip2']:\n                return '''\\\n                nsimd_{simd_ext}_v{typ} ret;\n                float32x4x2_t tmp = v{op}_f32({in0}.v{i}, {in1}.v{i});\n                ret.v0 = tmp.val[0];\n                ret.v1 = tmp.val[1];\n                return ret;\n                '''.format(i = '0' if func == 'zip1' else '1',\n                           op=armop[func], **fmtspec)\n            else:\n                return '''\\\n                nsimd_{simd_ext}_v{typ} ret;\n                float32x4x2_t tmp0 = vuzpq_f32({in0}.v0, {in0}.v1);\n                float32x4x2_t tmp1 = vuzpq_f32({in1}.v0, {in1}.v1);\n                ret.v0 = tmp0.val[{i}];\n                ret.v1 = tmp1.val[{i}];\n                return ret;\n                '''.format(i = '0' if func == 'uzp1' else '1', **fmtspec)\n        elif typ in ['i64', 'u64']:\n            return '''\\\n            {typ} buf0[2], buf1[2];\n            {typ} ret[2];\n            vst1q_{suf}(buf0, {in0});\n            vst1q_{suf}(buf1, {in1});\n            ret[0] = buf0[{i}];\n            ret[1] = buf1[{i}];\n            return vld1q_{suf}(ret);'''. 
\\\n                format(**fmtspec, i= '0' if func in ['zip1', 'uzp1'] else '1')\n        elif  typ == 'f64' :\n            return '''\\\n            nsimd_{simd_ext}_v{typ} ret;\n            ret.v0 = {in0}.v{i};\n            ret.v1 = {in1}.v{i};\n            return ret;'''. \\\n                format(**fmtspec, i= '0' if func in ['zip1', 'uzp1'] else '1')\n        else :\n            return '''\\\n            {neon_typ} res;\n            res = v{op}_{suf}({in0}, {in1});\n            return res.val[{i}];'''. \\\n                format(neon_typ=neon_typ, op=armop[func], **fmtspec,\n                       i = '0' if func in ['zip1', 'uzp1'] else '1')\n\ndef zip_unzip(func, simd_ext, typ):\n    lo_hi = '''\\\n    nsimd_{simd_ext}_v{typ}x2 ret;\n    ret.v0 = nsimd_{func}lo_{simd_ext}_{typ}({in0}, {in1});\n    ret.v1 = nsimd_{func}hi_{simd_ext}_{typ}({in0}, {in1});\n    return ret;\n    '''.format(func='zip' if func == 'zip' else 'unzip', **fmtspec)\n    if simd_ext == 'aarch64' or simd_ext in sve:\n        content = '''\\\n        nsimd_{simd_ext}_v{typ}x2 ret;\n        ret.v0 = {s}v{func}1{q}_{suf}({in0}, {in1});\n        ret.v1 = {s}v{func}2{q}_{suf}({in0}, {in1});\n        return ret;'''.format(s = 's' if simd_ext in sve else '',\n                              q = '' if simd_ext in sve else 'q',\n                              func=func, **fmtspec)\n        if typ == 'f16':\n            return '''\\\n            #ifdef NSIMD_ARM_FP16\n            {c}\n            #else\n            {default}\n            #endif'''.\\\n                format(c=content, default=lo_hi, s = 's' if simd_ext in sve else '',\n                       **fmtspec)\n        else:\n            return content\n    else:\n       prefix = { 'i': 'int', 'u': 'uint', 'f': 'float' }\n       neon_typ = '{}{}x{}x2_t'.\\\n               format(prefix[typ[0]], typ[1:], 128 // int(typ[1:]))\n       content = '''\\\n       nsimd_{simd_ext}_v{typ}x2 ret;\n       {neon_typ} tmp = v{func}q_{suf}({in0}, 
{in1});\n       ret.v0 = tmp.val[0];\n       ret.v1 = tmp.val[1];\n       return ret;'''\\\n           .format(func=func, neon_typ=neon_typ, **fmtspec)\n       if typ in ['u64', 'i64', 'f64']:\n           return lo_hi\n       elif typ == 'f16':\n           return '''\\\n           #ifdef NSIMD_ARM_FP16\n           {content}\n           #else\n           {default}\n           #endif'''.\\\n               format(content=content, default=lo_hi,\n                      f='zip' if func == 'zip' else 'unzip', **fmtspec)\n       else:\n           return content\n\n# -----------------------------------------------------------------------------\n# gather\n\ndef gather(simd_ext, typ):\n    le = max_len(simd_ext, typ)\n    real_le = real_len(simd_ext, typ)\n\n    if simd_ext in sve:\n        emul = '''int i;\n                  {typ} buf[{le}];\n                  i{typnbits} offset_buf[{le}];\n                  svst1_s{typnbits}({svtrue}, offset_buf, {in1});\n                  for (i = 0; i < {real_le}; i++) {{\n                    buf[i] = {in0}[offset_buf[i]];\n                  }}\n                  return svld1_{suf}({svtrue}, buf);'''. \\\n                  format(le=le, real_le=real_le, **fmtspec)\n    else:\n        emul = \\\n        '''nsimd_{simd_ext}_v{typ} ret;\n           ret = vdupq_n_{suf}({in0}[vgetq_lane_s{typnbits}({in1}, 0)]);'''. \\\n           format(**fmtspec) + ''.join([\n        '''ret = vsetq_lane_{suf}({in0}[\n                     vgetq_lane_s{typnbits}({in1}, {i})], ret, {i});\\n'''. 
\\\n                     format(i=i, **fmtspec) for i in range(1, le)]) + \\\n          'return ret;'\n    if typ == 'f16':\n        if simd_ext in sve:\n            return emul\n        return '''#ifdef NSIMD_ARM_FP16\n                    {emul}\n                  #else\n                    nsimd_{simd_ext}_vf16 ret;\n                    f32 buf[8];\n                  '''.format(emul=emul, **fmtspec) + \\\n                  ''.join(['buf[{i}] = nsimd_f16_to_f32({in0}[' \\\n                           'vgetq_lane_s16({in1}, {i})]);\\n'. \\\n                           format(i=i, **fmtspec) for i in range(4)]) + \\\n                  ''.join(['buf[4 + {i}] = nsimd_f16_to_f32({in0}[' \\\n                           'vgetq_lane_s16({in1}, 4 + {i})]);\\n'. \\\n                           format(i=i, **fmtspec) for i in range(4)]) + \\\n               '''  ret.v0 = vld1q_f32(buf);\n                    ret.v1 = vld1q_f32(buf + 4);\n                    return ret;\n                  #endif'''.format(**fmtspec)\n    if simd_ext == 'neon128' and typ == 'f64':\n        return '''nsimd_neon128_vf64 ret;\n                  i64 offset_buf[2];\n                  vst1q_s64(offset_buf, {in1});\n                  ret.v0 = {in0}[offset_buf[0]];\n                  ret.v1 = {in0}[offset_buf[1]];\n                  return ret;'''.format(**fmtspec)\n    if simd_ext in neon or typ in ['i8', 'u8', 'i16', 'u16']:\n        return emul\n    # getting here means SVE\n    return 'return svld1_gather_s{typnbits}index_{suf}({svtrue}, {in0}, ' \\\n           '{in1});'.format(**fmtspec)\n\n# -----------------------------------------------------------------------------\n# linear gather\n\ndef gather_linear(simd_ext, typ):\n    if simd_ext in sve:\n        if typ in ['i8', 'u8', 'i16', 'u16', 'f16']:\n            le = max_len(simd_ext, typ)\n            real_le = real_len(simd_ext, typ)\n            return '''{typ} buf[{le}];\n                      int i;\n                      for (i = 0; i < 
{real_le}; i++) {{\n                        buf[i] = {in0}[i * {in1}];\n                      }}\n                      return svld1_{suf}({svtrue}, buf);'''. \\\n                      format(le=le, real_le=real_le, **fmtspec)\n        else:\n            return 'return svld1_gather_s{typnbits}index_{suf}({svtrue}, ' \\\n                   '{in0}, svindex_s{typnbits}(0, (i{typnbits}){in1}));'. \\\n                   format(**fmtspec)\n    # getting here means neon128 and aarch64\n    intrinsic = '''nsimd_{simd_ext}_v{typ} ret;\n                   ret = vdupq_n_{suf}({in0}[0]);\n                '''.format(**fmtspec) + ''.join([\n                  'ret = vsetq_lane_{suf}({in0}[{i} * {in1}], ret, {i});\\n'. \\\n                  format(i=i, **fmtspec) \\\n                  for i in range(1, 128 // int(fmtspec['typnbits']))]) + \\\n               '''return ret;'''\n    if typ == 'f16':\n        return '''#ifdef NSIMD_ARM_FP16\n                    {intrinsic}\n                  #else\n                    nsimd_{simd_ext}_vf16 ret;\n                    f32 buf[8];\n                    int i;\n                    for (i = 0; i < 8; i++) {{\n                      buf[i] = nsimd_f16_to_f32({in0}[i * {in1}]);\n                    }}\n                    ret.v0 = vld1q_f32(buf);\n                    ret.v1 = vld1q_f32(buf + 4);\n                    return ret;\n                  #endif'''.format(intrinsic=intrinsic, **fmtspec)\n    if typ == 'f64' and simd_ext == 'neon128':\n        return '''nsimd_neon128_vf64 ret;\n                  ret.v0 = {in0}[0];\n                  ret.v1 = {in0}[{in1}];\n                  return ret;'''.format(**fmtspec)\n    return intrinsic\n\n# -----------------------------------------------------------------------------\n# masked gather\n\ndef maskoz_gather(oz, simd_ext, typ):\n    le = max_len(simd_ext, typ)\n    real_le = real_len(simd_ext, typ)\n\n    if simd_ext in sve:\n        utyp = 'u{typnbits}'.format(**fmtspec)\n        store = 
'''svst1_s{typnbits}({svtrue}, offset_buf, {in2});\n                   svst1_{utyp}({svtrue}, mask, svsel_{utyp}(\n                       {in0}, svdup_n_{utyp}(({utyp})-1), svdup_n_{utyp}(\n                         ({utyp})0)));\n                         '''.format(utyp=utyp, **fmtspec)\n        if oz == 'z':\n            store += 'svst1_{suf}({svtrue}, buf, svdup_n_{suf}(({typ})0));'. \\\n                     format(**fmtspec)\n        else:\n            store += 'svst1_{suf}({svtrue}, buf, {in3});'.format(**fmtspec)\n        load = 'svld1_{suf}({svtrue}, buf)'.format(**fmtspec)\n    else:\n        store = '''vst1q_s{typnbits}(offset_buf, {in2});\n                   vst1q_u{typnbits}(mask, {in0});'''.format(**fmtspec)\n        if oz == 'z':\n            store += 'vst1q_{suf}(buf, vdupq_n_{suf}(({typ})0));'. \\\n                     format(**fmtspec)\n        else:\n            store += 'vst1q_{suf}(buf, {in3});'.format(**fmtspec)\n        load = 'vld1q_{suf}(buf)'.format(**fmtspec)\n\n    emul = '''int i;\n              {typ} buf[{le}];\n              u{typnbits} mask[{le}];\n              i{typnbits} offset_buf[{le}];\n              {store}\n              for (i = 0; i < {real_le}; i++) {{\n                if (mask[i]) {{\n                  buf[i] = {in1}[offset_buf[i]];\n                }}\n              }}\n              return {load};'''. 
\\\n              format(le=le, real_le=real_le, store=store, load=load, **fmtspec)\n    if typ == 'f16':\n        if simd_ext in sve:\n            return emul\n        if oz == 'z':\n            oz0 = 'vdupq_n_f32(0.0f)'\n            oz1 = oz0\n        else:\n            oz0 = '{in3}.v0'.format(**fmtspec)\n            oz1 = '{in3}.v1'.format(**fmtspec)\n        return '''#ifdef NSIMD_ARM_FP16\n                    {emul}\n                  #else\n                    nsimd_{simd_ext}_vf16 ret;\n                    int i;\n                    f32 buf[{le}];\n                    u32 mask[{le}];\n                    i16 offset_buf[{le}];\n                    vst1q_s16(offset_buf, {in2});\n                    vst1q_f32(buf, {oz0});\n                    vst1q_f32(buf + {leo2}, {oz1});\n                    vst1q_u32(mask, {in0}.v0);\n                    vst1q_u32(mask + {leo2}, {in0}.v1);\n                    for (i = 0; i < {le}; i++) {{\n                      if (mask[i]) {{\n                        buf[i] = nsimd_f16_to_f32({in1}[offset_buf[i]]);\n                      }}\n                    }}\n                    ret.v0 = vld1q_f32(buf);\n                    ret.v1 = vld1q_f32(buf + {leo2});\n                    return ret;\n                  #endif'''.format(emul=emul, leo2=le // 2, le=le, oz0=oz0,\n                                   oz1=oz1, **fmtspec)\n    if simd_ext == 'neon128' and typ == 'f64':\n        oz0 = '0.0' if oz == 'z' else '{in3}.v0'.format(**fmtspec)\n        oz1 = '0.0' if oz == 'z' else '{in3}.v1'.format(**fmtspec)\n        return '''nsimd_neon128_vf64 ret;\n                  i64 offset_buf[2];\n                  vst1q_s64(offset_buf, {in2});\n                  if ({in0}.v0) {{\n                    ret.v0 = {in1}[offset_buf[0]];\n                  }} else {{\n                    ret.v0 = {oz0};\n                  }}\n                  if ({in0}.v1) {{\n                    ret.v1 = {in1}[offset_buf[1]];\n                  }} else {{\n              
      ret.v1 = {oz1};\n                  }}\n                  return ret;'''.format(oz0=oz0, oz1=oz1, **fmtspec)\n    if simd_ext in neon or typ in ['i8', 'u8', 'i16', 'u16']:\n        return emul\n    # getting here means SVE\n    oz0 = 'svdup_n_{suf}(({typ})0)'.format(**fmtspec) if oz == 'z' \\\n          else '{in3}'.format(**fmtspec)\n    return '''return svsel_{suf}({in0}, svld1_gather_s{typnbits}index_{suf}(\n                         {in0}, {in1}, {in2}), {oz0});'''. \\\n                         format(oz0=oz0, **fmtspec)\n\n# -----------------------------------------------------------------------------\n# scatter\n\ndef scatter(simd_ext, typ):\n    le = max_len(simd_ext, typ)\n    real_le = real_len(simd_ext, typ)\n\n    if simd_ext in sve:\n        emul = '''int i;\n                  {typ} buf[{le}];\n                  i{typnbits} offset_buf[{le}];\n                  svst1_s{typnbits}({svtrue}, offset_buf, {in1});\n                  svst1_{suf}({svtrue}, buf, {in2});\n                  for (i = 0; i < {real_le}; i++) {{\n                    {in0}[offset_buf[i]] = buf[i];\n                  }}'''.format(le=le, real_le=real_le, **fmtspec)\n    else:\n        emul = '\\n'.join(['{in0}[vgetq_lane_s{typnbits}({in1}, {i})] = ' \\\n                          'vgetq_lane_{suf}({in2}, {i});\\n'. 
\\\n                          format(i=i, **fmtspec) for i in range(int(le))])\n\n    if typ == 'f16':\n        if simd_ext in sve:\n            return emul\n        return '''#ifdef NSIMD_ARM_FP16\n                    {emul}\n                  #else\n                  '''.format(emul=emul) + \\\n                  '\\n'.join(['{in0}[vgetq_lane_s16({in1}, {i})] = ' \\\n                             'nsimd_f32_to_f16(vgetq_lane_f32({in2}.v0, '\n                             '{i}));\\n'.format(i=i, **fmtspec) \\\n                             for i in range(4)]) + \\\n                  '\\n'.join(['{in0}[vgetq_lane_s16({in1}, 4 + {i})] = ' \\\n                             'nsimd_f32_to_f16(vgetq_lane_f32({in2}.v1, '\n                             '{i}));\\n'.format(i=i, **fmtspec) \\\n                             for i in range(4)]) + \\\n               '''\n                  #endif'''\n    if simd_ext == 'neon128' and typ == 'f64':\n        return '''i64 offset_buf[2];\n                  vst1q_s64(offset_buf, {in1});\n                  {in0}[offset_buf[0]] = {in2}.v0;\n                  {in0}[offset_buf[1]] = {in2}.v1;'''.format(**fmtspec)\n    if simd_ext in neon or typ in ['i8', 'u8', 'i16', 'u16']:\n        return emul\n    # getting here means SVE\n    return 'svst1_scatter_s{typnbits}index_{suf}({svtrue}, {in0}, ' \\\n           '{in1}, {in2});'.format(le=le, **fmtspec)\n\n# -----------------------------------------------------------------------------\n# linear scatter\n\ndef scatter_linear(simd_ext, typ):\n    if simd_ext in sve:\n        if typ in ['i8', 'u8', 'i16', 'u16', 'f16']:\n            le = max_len(simd_ext, typ)\n            real_le = real_len(simd_ext, typ)\n            return '''{typ} buf[{le}];\n                      int i;\n                      svst1_{suf}({svtrue}, buf, {in2});\n                      for (i = 0; i < {real_le}; i++) {{\n                        {in0}[i * {in1}] = buf[i];\n                      }}'''.format(le=le, real_le=real_le, 
**fmtspec)\n        else:\n            return 'svst1_scatter_s{typnbits}index_{suf}({svtrue}, {in0}, ' \\\n                   'svindex_s{typnbits}(0, (i{typnbits}){in1}), {in2});'. \\\n                   format(**fmtspec)\n    # getting here means neon128 and aarch64\n    intrinsic = '\\n'.join([\n      '{in0}[{i} * {in1}] = vgetq_lane_{suf}({in2}, {i});'. \\\n      format(i=i, **fmtspec) for i in range(128 // int(fmtspec['typnbits']))])\n    if typ == 'f16':\n        return '''#ifdef NSIMD_ARM_FP16\n                    {intrinsic}\n                  #else\n                    f32 buf[8];\n                    int i;\n                    vst1q_f32(buf, {in2}.v0);\n                    vst1q_f32(buf + 4, {in2}.v1);\n                    for (i = 0; i < 8; i++) {{\n                      {in0}[i * {in1}] = nsimd_f32_to_f16(buf[i]);\n                    }}\n                  #endif'''.format(intrinsic=intrinsic, **fmtspec)\n    if typ == 'f64' and simd_ext == 'neon128':\n        return '''{in0}[0] = {in2}.v0;\n                  {in0}[{in1}] = {in2}.v1;'''.format(**fmtspec)\n    return intrinsic\n\n# -----------------------------------------------------------------------------\n# mask_scatter\n\ndef mask_scatter(simd_ext, typ):\n    le = max_len(simd_ext, typ)\n    real_le = real_len(simd_ext, typ)\n\n    if simd_ext in sve:\n        store = '''svst1_s{typnbits}({svtrue}, offset_buf, {in2});\n                   svst1_u{typnbits}({svtrue}, mask, svsel_u{typnbits}(\n                       {in0}, svdup_n_u{typnbits}((u{typnbits})1),\n                              svdup_n_u{typnbits}((u{typnbits})0)));\n                   svst1_{suf}({svtrue}, buf, {in3});'''.format(**fmtspec)\n    else:\n        store = '''vst1q_s{typnbits}(offset_buf, {in2});\n                   vst1q_{suf}(buf, {in3});\n                   vst1q_u{typnbits}(mask, {in0});'''.format(**fmtspec)\n\n    emul = '''int i;\n              {typ} buf[{le}];\n              u{typnbits} mask[{le}];\n              
i{typnbits} offset_buf[{le}];\n              {store}\n              for (i = 0; i < {real_le}; i++) {{\n                if (mask[i]) {{\n                  {in1}[offset_buf[i]] = buf[i];\n                }}\n              }}'''.format(le=le, real_le=real_le, store=store, **fmtspec)\n    if typ == 'f16':\n        if simd_ext in sve:\n            return emul\n        return '''#ifdef NSIMD_ARM_FP16\n                    {emul}\n                  #else\n                    int i;\n                    f32 buf[{le}];\n                    u32 mask[{le}];\n                    i16 offset_buf[{le}];\n                    vst1q_s16(offset_buf, {in2});\n                    vst1q_f32(buf, {in3}.v0);\n                    vst1q_f32(buf + {leo2}, {in3}.v1);\n                    vst1q_u32(mask, {in0}.v0);\n                    vst1q_u32(mask + {leo2}, {in0}.v1);\n                    for (i = 0; i < {le}; i++) {{\n                      if (mask[i]) {{\n                        {in1}[offset_buf[i]] = nsimd_f32_to_f16(buf[i]);\n                      }}\n                    }}\n                  #endif'''.format(emul=emul, le=le, leo2=le // 2, **fmtspec)\n    if simd_ext == 'neon128' and typ == 'f64':\n        return '''i64 offset_buf[2];\n                  vst1q_s64(offset_buf, {in2});\n                  if ({in0}.v0) {{\n                    {in1}[offset_buf[0]] = {in3}.v0;\n                  }}\n                  if ({in0}.v1) {{\n                    {in1}[offset_buf[1]] = {in3}.v1;\n                  }}'''.format(**fmtspec)\n    if simd_ext in neon or typ in ['i8', 'u8', 'i16', 'u16']:\n        return emul\n    # getting here means SVE\n    return 'svst1_scatter_s{typnbits}index_{suf}({in0}, {in1}, ' \\\n           '{in2}, {in3});'.format(le=le, **fmtspec)\n\n\n# -----------------------------------------------------------------------------\n# get_impl function\n\ndef get_impl(opts, func, simd_ext, from_typ, to_typ):\n    global fmtspec\n\n    simd_ext2 = simd_ext if not simd_ext in 
fixed_sized_sve else 'sve'\n\n    fmtspec = {\n      'simd_ext': simd_ext,\n      'simd_ext2': simd_ext2,\n      'typ': from_typ,\n      'from_typ': from_typ,\n      'to_typ': to_typ,\n      'suf': suf(from_typ),\n      'in0': common.in0,\n      'in1': common.in1,\n      'in2': common.in2,\n      'in3': common.in3,\n      'in4': common.in4,\n      'in5': common.in5,\n      'typnbits': from_typ[1:],\n      'svtrue': 'svptrue_b{}()'.format(from_typ[1:]),\n      'svetyp': sve_typ(from_typ),\n    }\n\n    impls = {\n        'loada': lambda: load1234(opts, simd_ext, from_typ, 1),\n        'masko_loada1': lambda: maskoz_load('o', simd_ext, from_typ),\n        'maskz_loada1': lambda: maskoz_load('z', simd_ext, from_typ),\n        'load2a': lambda: load1234(opts, simd_ext, from_typ, 2),\n        'load3a': lambda: load1234(opts, simd_ext, from_typ, 3),\n        'load4a': lambda: load1234(opts, simd_ext, from_typ, 4),\n        'loadu': lambda: load1234(opts, simd_ext, from_typ, 1),\n        'masko_loadu1': lambda: maskoz_load('o', simd_ext, from_typ),\n        'maskz_loadu1': lambda: maskoz_load('z', simd_ext, from_typ),\n        'load2u': lambda: load1234(opts, simd_ext, from_typ, 2),\n        'load3u': lambda: load1234(opts, simd_ext, from_typ, 3),\n        'load4u': lambda: load1234(opts, simd_ext, from_typ, 4),\n        'storea': lambda: store1234(opts, simd_ext, from_typ, 1),\n        'mask_storea1': lambda: mask_store(simd_ext, from_typ),\n        'store2a': lambda: store1234(opts, simd_ext, from_typ, 2),\n        'store3a': lambda: store1234(opts, simd_ext, from_typ, 3),\n        'store4a': lambda: store1234(opts, simd_ext, from_typ, 4),\n        'storeu': lambda: store1234(opts, simd_ext, from_typ, 1),\n        'mask_storeu1': lambda: mask_store(simd_ext, from_typ),\n        'store2u': lambda: store1234(opts, simd_ext, from_typ, 2),\n        'store3u': lambda: store1234(opts, simd_ext, from_typ, 3),\n        'store4u': lambda: store1234(opts, simd_ext, from_typ, 
4),\n        'gather': lambda: gather(simd_ext, from_typ),\n        'gather_linear': lambda: gather_linear(simd_ext, from_typ),\n        'maskz_gather': lambda: maskoz_gather('z', simd_ext, from_typ),\n        'masko_gather': lambda: maskoz_gather('o', simd_ext, from_typ),\n        'scatter': lambda: scatter(simd_ext, from_typ),\n        'scatter_linear': lambda: scatter_linear(simd_ext, from_typ),\n        'mask_scatter': lambda: mask_scatter(simd_ext, from_typ),\n        'andb': lambda: binop2(\"andb\", simd_ext2, from_typ),\n        'xorb': lambda: binop2(\"xorb\", simd_ext2, from_typ),\n        'orb': lambda: binop2(\"orb\", simd_ext2, from_typ),\n        'andl': lambda: lop2(opts, \"andl\", simd_ext2, from_typ),\n        'xorl': lambda: lop2(opts, \"xorl\", simd_ext2, from_typ),\n        'orl': lambda: lop2(opts, \"orl\", simd_ext2, from_typ),\n        'notb': lambda: not1(simd_ext2, from_typ),\n        'notl': lambda: lnot1(opts, simd_ext2, from_typ),\n        'andnotb': lambda: binop2(\"andnotb\", simd_ext2, from_typ),\n        'andnotl': lambda: lop2(opts, \"andnotl\", simd_ext2, from_typ),\n        'add': lambda: addsub(\"add\", simd_ext2, from_typ),\n        'sub': lambda: addsub(\"sub\", simd_ext2, from_typ),\n        'adds': lambda: adds(simd_ext2, from_typ),\n        'subs': lambda: subs(simd_ext2, from_typ),\n        'div': lambda: div2(simd_ext2, from_typ),\n        'sqrt': lambda: sqrt1(simd_ext2, from_typ),\n        'len': lambda: len1(simd_ext, from_typ),\n        'mul': lambda: mul2(simd_ext2, from_typ),\n        'shl': lambda: shl_shr(\"shl\", simd_ext2, from_typ),\n        'shr': lambda: shl_shr(\"shr\", simd_ext2, from_typ),\n        'shra': lambda: shra(simd_ext2, from_typ),\n        'set1': lambda: set1(simd_ext2, from_typ),\n        'set1l': lambda: lset1(simd_ext2, from_typ),\n        'eq': lambda: cmp2(opts, \"eq\", simd_ext2, from_typ),\n        'lt': lambda: cmp2(opts, \"lt\", simd_ext2, from_typ),\n        'le': lambda: cmp2(opts, 
\"le\", simd_ext2, from_typ),\n        'gt': lambda: cmp2(opts, \"gt\", simd_ext2, from_typ),\n        'ge': lambda: cmp2(opts, \"ge\", simd_ext2, from_typ),\n        'ne': lambda: neq2(opts, simd_ext2, from_typ),\n        'if_else1': lambda: if_else3(opts, simd_ext2, from_typ),\n        'min': lambda: minmax2(\"min\", simd_ext2, from_typ),\n        'max': lambda: minmax2(\"max\", simd_ext2, from_typ),\n        'loadla': lambda: loadl(True, simd_ext2, from_typ),\n        'loadlu': lambda: loadl(False, simd_ext2, from_typ),\n        'storela': lambda: storel(True, simd_ext2, from_typ),\n        'storelu': lambda: storel(False, simd_ext2, from_typ),\n        'abs': lambda: abs1(simd_ext2, from_typ),\n        'fma': lambda: fmafnma3(\"fma\", simd_ext2, from_typ),\n        'fnma': lambda: fmafnma3(\"fnma\", simd_ext2, from_typ),\n        'fms': lambda: fmsfnms3(\"fms\", simd_ext2, from_typ),\n        'fnms': lambda: fmsfnms3(\"fnms\", simd_ext2, from_typ),\n        'ceil': lambda: round1(\"ceil\", simd_ext2, from_typ),\n        'floor': lambda: round1(\"floor\", simd_ext2, from_typ),\n        'trunc': lambda: round1(\"trunc\", simd_ext2, from_typ),\n        'round_to_even': lambda: round1(\"round_to_even\", simd_ext2, from_typ),\n        'all': lambda: allany1(opts, \"all\", simd_ext2, from_typ),\n        'any': lambda: allany1(opts, \"any\", simd_ext2, from_typ),\n        'reinterpret': lambda: reinterpret1(simd_ext2, from_typ, to_typ),\n        'reinterpretl': lambda: reinterpretl1(simd_ext2, from_typ, to_typ),\n        'cvt': lambda: convert1(simd_ext2, from_typ, to_typ),\n        'rec11': lambda: recs1(\"rec11\", simd_ext2, from_typ),\n        'rec8': lambda: recs1(\"rec8\", simd_ext2, from_typ),\n        'rsqrt11': lambda: recs1(\"rsqrt11\", simd_ext2, from_typ),\n        'rsqrt8': lambda: recs1(\"rsqrt8\", simd_ext2, from_typ),\n        'rec': lambda: recs1(\"rec\", simd_ext2, from_typ),\n        'neg': lambda: neg1(simd_ext2, from_typ),\n        'nbtrue': 
lambda: nbtrue1(opts, simd_ext2, from_typ),\n        'reverse': lambda: reverse1(simd_ext2, from_typ),\n        'addv': lambda: addv(simd_ext2, from_typ),\n        'upcvt': lambda: upcvt1(simd_ext2, from_typ, to_typ),\n        'downcvt': lambda: downcvt1(simd_ext2, from_typ, to_typ),\n        'to_logical': lambda: to_logical1(opts, simd_ext2, from_typ),\n        'to_mask': lambda: to_mask1(opts, simd_ext2, from_typ),\n        'ziplo': lambda: zip_unzip_half(\"zip1\", simd_ext2, from_typ),\n        'ziphi': lambda: zip_unzip_half(\"zip2\", simd_ext2, from_typ),\n        'unziplo': lambda: zip_unzip_half(\"uzp1\", simd_ext2, from_typ),\n        'unziphi': lambda: zip_unzip_half(\"uzp2\", simd_ext2, from_typ),\n        'zip' : lambda: zip_unzip(\"zip\", simd_ext2, from_typ),\n        'unzip' : lambda: zip_unzip(\"uzp\", simd_ext2, from_typ),\n        'mask_for_loop_tail': lambda : mask_for_loop_tail(simd_ext, from_typ),\n        'iota': lambda : iota(simd_ext2, from_typ)\n    }\n    if simd_ext not in get_simd_exts():\n        raise ValueError('Unknown SIMD extension \"{}\"'.format(simd_ext))\n    if not from_typ in common.types:\n        raise ValueError('Unknown type \"{}\"'.format(from_typ))\n    if not func in impls:\n        return common.NOT_IMPLEMENTED\n    else:\n        return impls[func]()\n"
  },
  {
    "path": "egg/platform_cpu.py",
    "content": "# Copyright (c) 2020 Agenium Scale\n#\n# Permission is hereby granted, free of charge, to any person obtaining a copy\n# of this software and associated documentation files (the \"Software\"), to deal\n# in the Software without restriction, including without limitation the rights\n# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell\n# copies of the Software, and to permit persons to whom the Software is\n# furnished to do so, subject to the following conditions:\n#\n# The above copyright notice and this permission notice shall be included in all\n# copies or substantial portions of the Software.\n#\n# THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\n# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\n# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\n# SOFTWARE.\n\n# This file gives the implementation of platform CPU, i.e. scalar emulation.\n# Reading this file is straightforward. For each function, e.g. the addition,\n# code looks like:\n#\n#     return 'return {} + {};'.format(common.in0, common.in1)\n#\n# with an 'if' before to handle the FP16 special case.\n\nimport common\nimport scalar\n\n# -----------------------------------------------------------------------------\n# Emulation parameters\n#\n# When emulating, we need to choose a vector length to fit the philosophy of\n# SIMD. By default we choose 64 bits. 
It must be a multiple of 64 bits.\n\nNBITS = common.CPU_NBITS\n\ndef get_nb_el(typ):\n    return NBITS // int(typ[1:])\n\n# -----------------------------------------------------------------------------\n# Implementation of mandatory functions for this module\n\ndef get_simd_exts():\n    return ['cpu']\n\ndef get_prev_simd_ext(simd_ext):\n    if simd_ext != 'cpu':\n        raise ValueError('Unknown SIMD extension \"{}\"'.format(simd_ext))\n    return ''\n\ndef get_simd_strings(simd_ext):\n    if simd_ext == 'cpu':\n        return ['cpu']\n    else:\n        raise ValueError('Unknown SIMD extension \"{}\"'.format(simd_ext))\n\ndef emulate_fp16(simd_ext):\n    if simd_ext != 'cpu':\n        raise ValueError('Unknown SIMD extension \"{}\"'.format(simd_ext))\n    return True\n\ndef get_type(opts, simd_ext, typ, nsimd_typ):\n    if simd_ext != 'cpu':\n        raise ValueError('Unknown SIMD extension \"{}\"'.format(simd_ext))\n    if typ not in common.types:\n        raise ValueError('Unknown type \"{}\"'.format(typ))\n    typ2 = typ if typ != 'f16' else 'f32'\n    members = '\\n'.join('{} v{};'.format(typ2, i) \\\n                        for i in range(0, get_nb_el(typ)))\n    return 'typedef struct {{ {} }} {};'.format(members, nsimd_typ)\n\ndef get_logical_type(opts, simd_ext, typ, nsimd_typ):\n    if simd_ext != 'cpu':\n        raise ValueError('Unknown SIMD extension \"{}\"'.format(simd_ext))\n    if typ not in common.types:\n        raise ValueError('Unknown type \"{}\"'.format(typ))\n    members = '\\n'.join('unsigned int v{};'.format(i) \\\n                        for i in range(0, get_nb_el(typ)))\n    return 'typedef struct {{ {} }} {};'.format(members, nsimd_typ)\n\ndef get_nb_registers(simd_ext):\n    if simd_ext != 'cpu':\n        raise ValueError('Unknown SIMD extension \"{}\"'.format(simd_ext))\n    return '1'\n\ndef has_compatible_SoA_types(simd_ext):\n    if simd_ext != 'cpu':\n        raise ValueError('Unknown SIMD extension \"{}\"'.format(simd_ext))\n   
 return False\n\ndef get_additional_include(func, platform, simd_ext):\n    if func in ['adds', 'subs', 'orb', 'andb', 'andnotb', 'xorb', 'min', 'max',\n                'notb', 'sqrt', 'shr', 'shl', 'shra', 'abs', 'fma', 'fnma',\n                'fms', 'fnms', 'ceil', 'floor', 'trunc', 'round_to_even',\n                'rec11', 'rec8', 'rsqrt11', 'rsqrt8', 'rec', 'neg',\n                'lgamma_u10', 'tgamma_u10', 'erf_u10', 'erfc_u15']:\n        return '''#include <nsimd/scalar_utilities.h>\n                  '''\n    elif func == 'zip':\n        return '''#include <nsimd/cpu/cpu/ziplo.h>\n                  #include <nsimd/cpu/cpu/ziphi.h>\n                  '''\n    elif func == 'unzip':\n         return '''#include <nsimd/cpu/cpu/unziplo.h>\n                   #include <nsimd/cpu/cpu/unziphi.h>\n                  '''\n    return ''\n\n# -----------------------------------------------------------------------------\n# Returns C code for func\n\nfmtspec = {}\n\ndef repeat_stmt(fmt, typ):\n    return '\\n'.join(fmt.format(i=i) for i in range(0, get_nb_el(typ)))\n\n# -----------------------------------------------------------------------------\n\ndef func_body(fmt, typ2, logical = False):\n    return '''nsimd_cpu_v{logical}{typ2} ret;\n              {content}\n              return ret;'''.format(logical='l' if logical else '', typ2=typ2,\n                                    content=repeat_stmt(fmt, typ2), **fmtspec)\n\n# -----------------------------------------------------------------------------\n\ndef op2(op, typ):\n    return func_body('ret.v{{i}} = {cast}({in0}.v{{i}} {op} {in1}.v{{i}});'. \\\n                     format(cast='({})'.format(typ) if typ in common.iutypes \\\n                            else '', op=op, **fmtspec), typ)\n\n# -----------------------------------------------------------------------------\n\ndef lop2(op, typ):\n    return func_body('ret.v{{i}} = {in0}.v{{i}} {op} {in1}.v{{i}};'. 
\\\n                     format(op=op, **fmtspec), typ, True)\n\n# -----------------------------------------------------------------------------\n\ndef landnot2(typ):\n    return func_body('ret.v{{i}} = {in0}.v{{i}} & (~{in1}.v{{i}});'.\\\n                     format(**fmtspec), typ, True)\n\n# -----------------------------------------------------------------------------\n\ndef lnot1(typ):\n    return func_body('ret.v{{i}} = ~{in0}.v{{i}};'.\\\n                     format(**fmtspec), typ, True)\n\n# -----------------------------------------------------------------------------\n\ndef scalar_impl(func, typ, arity):\n    typ2 = 'f32' if typ == 'f16' else typ\n    # special case for shl, shr, shra\n    if func in ['shl', 'shr', 'shra']:\n        args = '{in0}.v{{i}}, {in1}'.format(**fmtspec)\n    else:\n        args = ', '.join(['{{in{}}}'.format(i).format(**fmtspec) \\\n                          + '.v{i}' for i in range(arity)])\n    return func_body('ret.v{{i}} = nsimd_scalar_{func}_{typ2}({args});'. \\\n                     format(func=func, typ2=typ2, args=args, **fmtspec), typ)\n\n# -----------------------------------------------------------------------------\n\ndef cmp2(op, typ):\n    return '''nsimd_cpu_vl{typ} ret;\n              {content}\n              return ret;'''.format(content=repeat_stmt(\n              '''ret.v{{i}} = (u32)({in0}.v{{i}} {op} {in1}.v{{i}}\n                                    ? -1 : 0);'''. \\\n                                    format(op=op, **fmtspec), typ), **fmtspec)\n\n# -----------------------------------------------------------------------------\n\ndef set1(typ):\n    if typ == 'f16':\n        content = repeat_stmt('ret.v{{i}} = nsimd_f16_to_f32({in0});'. 
\\\n                              format(**fmtspec), typ)\n    else:\n        content = repeat_stmt('ret.v{{i}} = {in0};'.format(**fmtspec), typ)\n    return '''nsimd_cpu_v{typ} ret;\n              {content}\n              return ret;'''.format(content=content, **fmtspec)\n\n# -----------------------------------------------------------------------------\n\ndef set1l(typ):\n    return func_body('ret.v{{i}} = (u32)({in0} ? -1 : 0);'. \\\n                     format(**fmtspec), typ, True)\n\n# -----------------------------------------------------------------------------\n\ndef load(typ):\n    if typ == 'f16':\n        content = repeat_stmt(\n                  'ret.v{{i}} = nsimd_u16_to_f32(((u16 *){in0})[{{i}}]);'. \\\n                  format(**fmtspec), typ)\n    else:\n        content = repeat_stmt('ret.v{{i}} = {in0}[{{i}}];'.format(**fmtspec),\n                  typ)\n    return '''nsimd_cpu_v{typ} ret;\n              {content}\n              return ret;'''.format(content=content, **fmtspec)\n\n# -----------------------------------------------------------------------------\n\ndef maskoz_load(oz, typ):\n    if typ == 'f16':\n        else_value = '0.0f' if oz == 'z' else '{in2}.v{{i}}'.format(**fmtspec)\n        content = repeat_stmt(\n                  '''ret.v{{i}} = {in0}.v{{i}}\n                                ? nsimd_u16_to_f32(((u16 *){in1})[{{i}}])\n                                : {else_value};'''. \\\n                                format(else_value=else_value, **fmtspec), typ)\n    else:\n        else_value = '({typ})0'.format(**fmtspec) if oz == 'z' else \\\n                     '{in2}.v{{i}}'.format(**fmtspec)\n        content = repeat_stmt(\n                  'ret.v{{i}} = {in0}.v{{i}} ? {in1}[{{i}}] : {else_value};'. 
\\\n                  format(else_value=else_value, **fmtspec), typ)\n    return '''nsimd_cpu_v{typ} ret;\n              {content}\n              return ret;'''.format(content=content, **fmtspec)\n\n# -----------------------------------------------------------------------------\n\ndef load_deg234(typ, deg):\n    if typ == 'f16':\n        buf = repeat_stmt(\n              '''ret.v{{{{j}}}}.v{{i}} =\n                     nsimd_u16_to_f32(\n                       ((u16 *){in0})[{deg} * {{i}} + {{{{j}}}}]);'''. \\\n                       format(deg=deg, **fmtspec), typ)\n    else:\n        buf = repeat_stmt(\n              'ret.v{{{{j}}}}.v{{i}} = {in0}[{deg} * {{i}} + {{{{j}}}}];'. \\\n              format(deg=deg, **fmtspec), typ)\n    content = '\\n'.join(buf.format(j=j) for j in range(0, deg))\n    return '''nsimd_cpu_v{typ}x{deg} ret;\n              {content}\n              return ret;'''.format(deg=deg, content=content, **fmtspec)\n\n# -----------------------------------------------------------------------------\n\ndef store_deg234(typ, deg):\n    content = ''\n    for i in range(0, get_nb_el(typ)):\n        for j in range(1, deg + 1):\n            arg = fmtspec['in{}'.format(j)]\n            if typ == 'f16':\n                content += \\\n                '''((u16 *){in0})[{deg} * {i} + {j}] =\n                       nsimd_f32_to_u16({arg}.v{i});\\n'''. \\\n                       format(deg=deg, i=i, j=j - 1, arg=arg, **fmtspec)\n            else:\n                content += \\\n                '{in0}[{deg} * {i} + {j}] = {arg}.v{i};\\n'. \\\n                format(deg=deg, i=i, j=j - 1, arg=arg, **fmtspec)\n    return content[:-1]\n\n# -----------------------------------------------------------------------------\n\ndef loadl(typ):\n    if typ == 'f16':\n        content = repeat_stmt(\n                  '''ret.v{{i}} = (u32)(nsimd_u16_to_f32(((u16 *){in0})[{{i}}])\n                                      == 0.0f ? 0 : -1);'''. 
\\\n                                      format(**fmtspec), typ)\n    else:\n        content = repeat_stmt(\n                  '''ret.v{{i}} = (u32)({in0}[{{i}}] == ({typ})0\n                                        ? 0 : -1);'''. \\\n                                        format(**fmtspec), typ)\n    return '''nsimd_cpu_vl{typ} ret;\n              {content}\n              return ret;'''.format(content=content, **fmtspec)\n\n# -----------------------------------------------------------------------------\n\ndef store(typ):\n    if typ == 'f16':\n        return repeat_stmt(\n               '((u16*){in0})[{{i}}] = nsimd_f32_to_u16({in1}.v{{i}});'. \\\n               format(**fmtspec), typ)\n    else:\n        return repeat_stmt('{in0}[{{i}}] = {in1}.v{{i}};'. \\\n                           format(**fmtspec), typ)\n\n# -----------------------------------------------------------------------------\n\ndef mask_store(typ):\n    if typ == 'f16':\n        return repeat_stmt(\n               '''if ({in0}.v{{i}}) {{{{\n                    ((u16*){in1})[{{i}}] = nsimd_f32_to_u16({in2}.v{{i}});\n                  }}}}'''.format(**fmtspec), typ)\n    else:\n        return repeat_stmt('''if ({in0}.v{{i}}) {{{{\n                                {in1}[{{i}}] = {in2}.v{{i}};\n                              }}}}'''.format(**fmtspec), typ)\n\n# -----------------------------------------------------------------------------\n\ndef storel(typ):\n    if typ == 'f16':\n        content = repeat_stmt(\n                  '''((u16*){in0})[{{i}}] = (u16)({in1}.v{{i}} == (u32)0\n                                            ? nsimd_f32_to_u16(0.0f)\n                                            : nsimd_f32_to_u16(1.0f));'''. \\\n                                            format(**fmtspec), typ)\n    else:\n        content = repeat_stmt(\n                  '''{in0}[{{i}}] = ({typ})({in1}.v{{i}} == (u32)0\n                                  ? ({typ})0 : ({typ})1);'''. 
\\\n                                  format(**fmtspec), typ)\n    return content\n\n# -----------------------------------------------------------------------------\n\ndef if_else1(typ):\n    typ2 = 'f32' if typ == 'f16' else typ\n    return func_body(\n           '''ret.v{{i}} = ({typ2})({in0}.v{{i}} != (u32)0\n                                    ? {in1}.v{{i}} : {in2}.v{{i}});'''. \\\n                                    format(typ2=typ2, **fmtspec), typ)\n\n# -----------------------------------------------------------------------------\n\ndef all_any(typ, func):\n    op = '&&' if func == 'all' else '||'\n    if get_nb_el(typ) == 1:\n        cond = '{in0}.v0 == (u32)-1'.format(**fmtspec)\n    else:\n        cond = op.join('({in0}.v{i} == (u32)-1)'.format(i=i, **fmtspec) \\\n                       for i in range(0, get_nb_el(typ)))\n    return '''if ({cond}) {{\n                return -1;\n              }} else {{\n                return 0;\n              }}'''.format(cond=cond)\n\n# -----------------------------------------------------------------------------\n\ndef reinterpret1(from_typ, to_typ):\n    if from_typ == to_typ:\n        return func_body('ret.v{{i}} = {in0}.v{{i}};'.format(**fmtspec),\n                         to_typ)\n    return '''char buf[{len}];\n              nsimd_storeu_cpu_{from_typ}(({from_typ} *)buf, {in0});\n              return nsimd_loadu_cpu_{to_typ}(({to_typ} *)buf);'''. 
\\\n              format(len=NBITS // 8, **fmtspec)\n\n# -----------------------------------------------------------------------------\n\ndef reinterpretl1(from_typ, to_typ):\n    return func_body('ret.v{{i}} = {in0}.v{{i}};'.format(**fmtspec), to_typ,\n                     True);\n\n# -----------------------------------------------------------------------------\n\ndef convert1(from_typ, to_typ):\n    if to_typ == from_typ:\n        return func_body('ret.v{{i}} = {in0}.v{{i}};'.format(**fmtspec),\n                         to_typ)\n    typ2 = 'f32' if to_typ == 'f16' else to_typ\n    return func_body('ret.v{{i}} = ({typ2}){in0}.v{{i}};'. \\\n                     format(typ2=typ2, **fmtspec), to_typ)\n\n# -----------------------------------------------------------------------------\n\ndef nbtrue1(typ):\n    acc_code = repeat_stmt('acc += {in0}.v{{i}} == (u32)-1 ? 1 : 0;'. \\\n                           format(**fmtspec), typ)\n    return '''int acc = 0;\n              {acc_code}\n              return acc;'''.format(acc_code=acc_code)\n\n# -----------------------------------------------------------------------------\n\ndef reverse1(typ):\n    n = get_nb_el(typ)\n    content = '\\n'.join('ret.v{i} = {in0}.v{j}'. 
\\\n                        format(i=i, j=n - i, **fmtspec) \\\n                        for i in range(0, n))\n    return '''nsimd_cpu_v{typ} ret;\n              {content}\n              return ret;'''.format(content=content, **fmtspec)\n\n# -----------------------------------------------------------------------------\n\ndef addv1(typ):\n    content = '+'.join('{in0}.v{i}'.format(i=i, **fmtspec) \\\n                       for i in range(0, get_nb_el(typ)))\n    if typ == 'f16':\n        return 'return nsimd_f32_to_f16({});'.format(content)\n    else:\n        return 'return {};'.format(content)\n\n# -----------------------------------------------------------------------------\n\ndef upcvt1(from_typ, to_typ):\n    n = get_nb_el(to_typ)\n    to_typ2 = 'f32' if to_typ == 'f16' else to_typ\n    lower_half = '\\n'.join('ret.v0.v{i} = ({to_typ2}){in0}.v{i};'. \\\n                           format(i=i, to_typ2=to_typ2, **fmtspec) \\\n                           for i in range(0, n))\n    upper_half = '\\n'.join('ret.v1.v{i} = ({to_typ2}){in0}.v{j};'. \\\n                           format(i=i, j=i + n, to_typ2=to_typ2, **fmtspec) \\\n                           for i in range(0, n))\n    return '''nsimd_cpu_v{to_typ}x2 ret;\n              {lower_half}\n              {upper_half}\n              return ret;'''.format(lower_half=lower_half,\n                                    upper_half=upper_half, **fmtspec)\n\n# -----------------------------------------------------------------------------\n\ndef downcvt2(from_typ, to_typ):\n    n = get_nb_el(from_typ)\n    to_typ2 = 'f32' if to_typ == 'f16' else to_typ\n    lower_half = '\\n'.join('ret.v{i} = ({to_typ2}){in0}.v{i};'. \\\n                           format(i=i, to_typ2=to_typ2, **fmtspec) \\\n                           for i in range(0, n))\n    upper_half = '\\n'.join('ret.v{j} = ({to_typ2}){in1}.v{i};'. 
\\\n                           format(i=i, j=i + n, to_typ2=to_typ2, **fmtspec) \\\n                           for i in range(0, n))\n    return '''nsimd_cpu_v{to_typ} ret;\n              {lower_half}\n              {upper_half}\n              return ret;'''.format(lower_half=lower_half,\n                                    upper_half=upper_half, **fmtspec)\n\n# -----------------------------------------------------------------------------\n\ndef len1(typ):\n    return 'return {};'.format(get_nb_el(typ))\n\n# -----------------------------------------------------------------------------\n\ndef to_logical1(typ):\n    unsigned_to_logical = \\\n        'ret.v{{i}} = ({in0}.v{{i}} == ({utyp})0 ? (u32)0 : (u32)-1);'. \\\n        format(**fmtspec)\n    if typ in common.utypes:\n        return func_body(unsigned_to_logical, typ, True)\n    else:\n        unsigned_to_logical = \\\n            'ret.v{{i}} = (buf.v{{i}} == ({utyp})0 ? (u32)0 : (u32)-1);'. \\\n            format(**fmtspec)\n        return '''nsimd_cpu_vl{typ} ret;\n                  nsimd_cpu_vu{typnbits} buf;\n                  buf = nsimd_reinterpret_cpu_u{typnbits}_{typ}({in0});\n                  {unsigned_to_logical}\n                  return ret;'''. \\\n                  format(unsigned_to_logical=repeat_stmt(unsigned_to_logical,\n                                                         typ), **fmtspec)\n\n# -----------------------------------------------------------------------------\n\ndef to_mask1(typ):\n    logical_to_unsigned = \\\n        'ret.v{{i}} = ({utyp})({in0}.v{{i}} ? -1 : 0);'. \\\n        format(**fmtspec)\n    if typ in common.utypes:\n        return func_body(logical_to_unsigned, typ)\n    elif typ == 'f16':\n        return '''union {{ f32 f; u32 u; }} buf;\n                  nsimd_cpu_vf16 ret;\n                  {u32_to_f32}\n                  return ret;'''. \\\n                  format(u32_to_f32=repeat_stmt(\n                      'buf.u = {in0}.v{{i}}; ret.v{{i}} = buf.f;'. 
\\\n                      format(**fmtspec), 'f16'), **fmtspec)\n    else:\n        return '''nsimd_cpu_vu{typnbits} ret;\n                  {logical_to_unsigned}\n                  return nsimd_reinterpret_cpu_{typ}_u{typnbits}(ret);'''. \\\n                  format(logical_to_unsigned=repeat_stmt(logical_to_unsigned,\n                                                         typ), **fmtspec)\n\n# -----------------------------------------------------------------------------\n\ndef zip_half(func, typ):\n    n = get_nb_el(typ)\n    if func == \"ziplo\":\n      content = '\\n'.join('ret.v{j1} = {in0}.v{i}; ret.v{j2} = {in1}.v{i};'. \\\n                          format(i=i, j1=i*2, j2=i*2+1, **fmtspec) \\\n                          for i in range(0, int(n/2)))\n    else :\n      content = '\\n'.join('ret.v{j1} = {in0}.v{i}; ret.v{j2} = {in1}.v{i};'. \\\n                          format(i=i+int(n/2), j1=i*2, j2=i*2+1, **fmtspec) \\\n                          for i in range(0, int(n/2)))\n\n    return '''nsimd_cpu_v{typ} ret;\n            {content}\n            return ret;'''.format(content=content, **fmtspec)\n\n# -----------------------------------------------------------------------------\n\ndef unzip_half(func, typ):\n    n = get_nb_el(typ)\n    content = ''\n    if func == \"unziplo\":\n        content = '\\n'.join('ret.v{i} = {in0}.v{j}; '. \\\n                    format(i=i, j=i*2, **fmtspec) \\\n                    for i in range(0, int(n/2)))\n        content = content + '\\n'.join('ret.v{i} = {in1}.v{j}; '. \\\n                    format(i=i, j=2*(i-int(n/2)), **fmtspec) \\\n                    for i in range(int(n/2), n))\n    else :\n        content = '\\n'.join('ret.v{i} = {in0}.v{j}; '. \\\n                    format(i=i, j=i*2+1, **fmtspec) \\\n                    for i in range(0, int(n/2)))\n        content = content + '\\n'.join('ret.v{i} = {in1}.v{j}; '. 
\\\n                    format(i=i, j=2*(i-int(n/2))+1, **fmtspec)\\\n                    for i in range(int(n/2), n))\n    return '''nsimd_cpu_v{typ} ret;\n              {content}\n              return ret;'''.format(content=content, **fmtspec)\n\ndef zip(from_typ):\n    return '''nsimd_{simd_ext}_v{typ}x2 ret;\n              ret.v0 = nsimd_ziplo_cpu_{typ}({in0}, {in1});\n              ret.v1 = nsimd_ziphi_cpu_{typ}({in0}, {in1});\n              return ret;'''.format(**fmtspec)\n\ndef unzip(from_typ):\n    return '''nsimd_{simd_ext}_v{typ}x2 ret;\n              ret.v0 = nsimd_unziplo_cpu_{typ}({in0}, {in1});\n              ret.v1 = nsimd_unziphi_cpu_{typ}({in0}, {in1});\n              return ret;'''.format(**fmtspec)\n\n# -----------------------------------------------------------------------------\n\ndef mask_for_loop_tail(typ):\n    return func_body(\n           'ret.v{{i}} = {in0} + {{i}} < {in1} ? (u32)-1 : (u32)0;'. \\\n           format(**fmtspec), typ, True)\n\n# -----------------------------------------------------------------------------\n\ndef iota(typ):\n    typ2 = 'f32' if typ == 'f16' else typ\n    return func_body('ret.v{{i}} = ({typ2}){{i}};'. \\\n                     format(typ2=typ2, **fmtspec), typ)\n\n# -----------------------------------------------------------------------------\n\ndef gather(typ):\n    if typ == 'f16':\n        return func_body(\n               'ret.v{{i}} = nsimd_f16_to_f32({in0}[{in1}.v{{i}}]);'. \\\n               format(**fmtspec), typ)\n    return func_body('ret.v{{i}} = {in0}[{in1}.v{{i}}];'. \\\n                     format(**fmtspec), typ)\n\n# -----------------------------------------------------------------------------\n\ndef gather_linear(typ):\n    if typ == 'f16':\n        return func_body(\n               'ret.v{{i}} = nsimd_f16_to_f32({in0}[{{i}} * {in1}]);'. \\\n               format(**fmtspec), typ)\n    return func_body('ret.v{{i}} = {in0}[{{i}} * {in1}];'. 
\\\n                     format(**fmtspec), typ)\n\n# -----------------------------------------------------------------------------\n\ndef maskoz_gather(op, typ):\n    if typ == 'f16':\n        oz = '0.0f' if op == 'z' else '{in3}.v{{i}}'\n        return func_body(\n               ('''if ({in0}.v{{i}}) {{{{\n                     ret.v{{i}} = nsimd_f16_to_f32({in1}[{in2}.v{{i}}]);\n                   }}}} else {{{{\n                     ret.v{{i}} = ''' + oz + ''';\n                   }}}}''').format(**fmtspec), typ)\n\n    oz = '({typ})0' if op == 'z' else '{in3}.v{{i}}'\n    return func_body(('''if ({in0}.v{{i}}) {{{{\n                           ret.v{{i}} = {in1}[{in2}.v{{i}}];\n                         }}}} else {{{{\n                           ret.v{{i}} = ''' + oz + ''';\n                         }}}}''').format(**fmtspec), typ)\n\n# -----------------------------------------------------------------------------\n\ndef scatter(typ):\n    if typ == 'f16':\n        return repeat_stmt(\n               '{in0}[{in1}.v{{i}}] = nsimd_f32_to_f16({in2}.v{{i}});'. \\\n               format(**fmtspec), typ)\n    return repeat_stmt('{in0}[{in1}.v{{i}}] = {in2}.v{{i}};'. \\\n                       format(**fmtspec), typ)\n\n# -----------------------------------------------------------------------------\n\ndef scatter_linear(typ):\n    if typ == 'f16':\n        return repeat_stmt(\n               '{in0}[{{i}} * {in1}] = nsimd_f32_to_f16({in2}.v{{i}});'. \\\n               format(**fmtspec), typ)\n    return repeat_stmt('{in0}[{{i}} * {in1}] = {in2}.v{{i}};'. 
\\\n                       format(**fmtspec), typ)\n\n# -----------------------------------------------------------------------------\n\ndef mask_scatter(typ):\n    if typ == 'f16':\n        return repeat_stmt(\n               '''if ({in0}.v{{i}}) {{{{\n                    {in1}[{in2}.v{{i}}] = nsimd_f32_to_f16({in3}.v{{i}});\n                  }}}}'''.format(**fmtspec), typ)\n    return repeat_stmt('''if ({in0}.v{{i}}) {{{{\n                            {in1}[{in2}.v{{i}}] = {in3}.v{{i}};\n                          }}}}'''.format(**fmtspec), typ)\n\n# -----------------------------------------------------------------------------\n\ndef get_impl(opts, func, simd_ext, from_typ, to_typ=''):\n\n    global fmtspec\n    fmtspec = {\n      'simd_ext': simd_ext,\n      'typ': from_typ,\n      'from_typ': from_typ,\n      'to_typ': to_typ,\n      'utyp': common.bitfield_type[from_typ],\n      'in0': common.in0,\n      'in1': common.in1,\n      'in2': common.in2,\n      'in3': common.in3,\n      'in4': common.in4,\n      'typnbits': from_typ[1:]\n    }\n\n    impls = {\n        'loada': lambda: load(from_typ),\n        'maskz_loada1': lambda: maskoz_load('z', from_typ),\n        'masko_loada1': lambda: maskoz_load('o', from_typ),\n        'load2a': lambda: load_deg234(from_typ, 2),\n        'load3a': lambda: load_deg234(from_typ, 3),\n        'load4a': lambda: load_deg234(from_typ, 4),\n        'loadu': lambda: load(from_typ),\n        'maskz_loadu1': lambda: maskoz_load('z', from_typ),\n        'masko_loadu1': lambda: maskoz_load('o', from_typ),\n        'load2u': lambda: load_deg234(from_typ, 2),\n        'load3u': lambda: load_deg234(from_typ, 3),\n        'load4u': lambda: load_deg234(from_typ, 4),\n        'storea': lambda: store(from_typ),\n        'mask_storea1': lambda: mask_store(from_typ),\n        'store2a': lambda: store_deg234(from_typ, 2),\n        'store3a': lambda: store_deg234(from_typ, 3),\n        'store4a': lambda: store_deg234(from_typ, 4),\n        
'storeu': lambda: store(from_typ),\n        'mask_storeu1': lambda: mask_store(from_typ),\n        'store2u': lambda: store_deg234(from_typ, 2),\n        'store3u': lambda: store_deg234(from_typ, 3),\n        'store4u': lambda: store_deg234(from_typ, 4),\n        'loadla': lambda: loadl(from_typ),\n        'loadlu': lambda: loadl(from_typ),\n        'gather': lambda: gather(from_typ),\n        'gather_linear': lambda: gather_linear(from_typ),\n        'maskz_gather': lambda: maskoz_gather('z', from_typ),\n        'masko_gather': lambda: maskoz_gather('o', from_typ),\n        'scatter': lambda: scatter(from_typ),\n        'scatter_linear': lambda: scatter_linear(from_typ),\n        'mask_scatter': lambda: mask_scatter(from_typ),\n        'storela': lambda: storel(from_typ),\n        'storelu': lambda: storel(from_typ),\n        'add': lambda: op2('+', from_typ),\n        'mul': lambda: op2('*', from_typ),\n        'div': lambda: op2('/', from_typ),\n        'sub': lambda: op2('-', from_typ),\n        'adds' : lambda: scalar_impl('adds', from_typ, 2),\n        'subs' : lambda: scalar_impl('subs', from_typ, 2),\n        'orb': lambda: scalar_impl('orb', from_typ, 2),\n        'orl': lambda: lop2('|', from_typ),\n        'andb': lambda: scalar_impl('andb', from_typ, 2),\n        'andnotb': lambda: scalar_impl('andnotb', from_typ, 2),\n        'andnotl': lambda: landnot2(from_typ),\n        'andl': lambda: lop2('&', from_typ),\n        'xorb': lambda: scalar_impl('xorb', from_typ, 2),\n        'xorl': lambda: lop2('^', from_typ),\n        'min': lambda: scalar_impl('min', from_typ, 2),\n        'max': lambda: scalar_impl('max', from_typ, 2),\n        'notb': lambda: scalar_impl('notb', from_typ, 1),\n        'notl': lambda: lnot1(from_typ),\n        'sqrt': lambda: scalar_impl('sqrt', from_typ, 1),\n        'set1': lambda: set1(from_typ),\n        'set1l': lambda: set1l(from_typ),\n        'shr': lambda: scalar_impl('shr', from_typ, 2),\n        'shl': lambda: 
scalar_impl('shl', from_typ, 2),\n        'shra': lambda: scalar_impl('shra', from_typ, 2),\n        'eq': lambda: cmp2('==', from_typ),\n        'ne': lambda: cmp2('!=', from_typ),\n        'gt': lambda: cmp2('>', from_typ),\n        'ge': lambda: cmp2('>=', from_typ),\n        'lt': lambda: cmp2('<', from_typ),\n        'le': lambda: cmp2('<=', from_typ),\n        'len': lambda: len1(from_typ),\n        'if_else1': lambda: if_else1(from_typ),\n        'abs': lambda: scalar_impl('abs', from_typ, 1),\n        'fma': lambda: scalar_impl('fma', from_typ, 3),\n        'fnma': lambda: scalar_impl('fnma', from_typ, 3),\n        'fms': lambda: scalar_impl('fms', from_typ, 3),\n        'fnms': lambda: scalar_impl('fnms', from_typ, 3),\n        'ceil': lambda: scalar_impl('ceil', from_typ, 1),\n        'floor': lambda: scalar_impl('floor', from_typ, 1),\n        'trunc': lambda: scalar_impl('trunc', from_typ, 1),\n        'round_to_even': lambda: scalar_impl('round_to_even', from_typ, 1),\n        'all': lambda: all_any(from_typ, 'all'),\n        'any': lambda: all_any(from_typ, 'any'),\n        'reinterpret': lambda: reinterpret1(from_typ, to_typ),\n        'reinterpretl': lambda: reinterpretl1(from_typ, to_typ),\n        'cvt': lambda: convert1(from_typ, to_typ),\n        'rec11': lambda: scalar_impl('rec11', from_typ, 1),\n        'rec8': lambda: scalar_impl('rec8', from_typ, 1),\n        'rsqrt11': lambda: scalar_impl('rsqrt11', from_typ, 1),\n        'rsqrt8': lambda: scalar_impl('rsqrt8', from_typ, 1),\n        'rec': lambda: scalar_impl('rec', from_typ, 1),\n        'neg': lambda: scalar_impl('neg', from_typ, 1),\n        'nbtrue': lambda: nbtrue1(from_typ),\n        'reverse': lambda: reverse1(from_typ),\n        'addv': lambda: addv1(from_typ),\n        'upcvt': lambda: upcvt1(from_typ, to_typ),\n        'downcvt': lambda: downcvt2(from_typ, to_typ),\n        'to_logical': lambda: to_logical1(from_typ),\n        'to_mask': lambda: to_mask1(from_typ),\n        
'ziplo': lambda: zip_half('ziplo', from_typ),\n        'ziphi': lambda: zip_half('ziphi', from_typ),\n        'unziplo': lambda: unzip_half('unziplo', from_typ),\n        'unziphi': lambda: unzip_half('unziphi', from_typ),\n        'zip' : lambda : zip(from_typ),\n        'unzip' : lambda : unzip(from_typ),\n        'mask_for_loop_tail': lambda : mask_for_loop_tail(from_typ),\n        'iota': lambda : iota(from_typ)\n    }\n    if simd_ext != 'cpu':\n        raise ValueError('Unknown SIMD extension \"{}\"'.format(simd_ext))\n    if not from_typ in common.types:\n        raise ValueError('Unknown from_type \"{}\"'.format(from_typ))\n    if not func in impls:\n        return common.NOT_IMPLEMENTED\n    return impls[func]()\n"
  },
  {
    "path": "egg/platform_ppc.py",
    "content": "# Copyright (c) 2021 Agenium Scale\n#\n# Permission is hereby granted, free of charge, to any person obtaining a copy\n# of this software and associated documentation files (the \"Software\"), to deal\n# in the Software without restriction, including without limitation the rights\n# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell\n# copies of the Software, and to permit persons to whom the Software is\n# furnished to do so, subject to the following conditions:\n#\n# The above copyright notice and this permission notice shall be included in all\n# copies or substantial portions of the Software.\n#\n# THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\n# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\n# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\n# SOFTWARE.\n\n# This file gives the implementation for the Power PC platform.\n# This script tries to be as readable as possible. 
It implements VMX and VSX.\n\n# Documentation found from:\n# https://www.nxp.com/docs/en/reference-manual/ALTIVECPIM.pdf\n# https://www.ibm.com/docs/en/xl-c-and-cpp-linux/13.1.6?topic=functions-vector-built-in\n# https://gcc.gnu.org/onlinedocs/gcc-9.1.0/gcc/PowerPC-AltiVec-Built-in-Functions-Available-on-ISA-2_002e06.html\n\nimport common\n\nfmtspec = {}\n\n# -----------------------------------------------------------------------------\n# Helpers\n\ndef has_to_be_emulated(simd_ext, typ):\n    if typ == 'f16':\n        return True\n    if simd_ext == 'vmx' and typ in ['f64', 'i64', 'u64']:\n        return True\n    return False\n\n# Returns the power pc type corresponding to the nsimd type\ndef native_type(typ):\n    if typ == 'u8':\n        return '__vector unsigned char'\n    elif typ == 'i8':\n        return '__vector signed char'\n    elif typ == 'u16':\n        return '__vector unsigned short'\n    elif typ == 'i16':\n        return '__vector signed short'\n    elif typ == 'u32':\n        return '__vector unsigned int'\n    elif typ == 'u64':\n        return '__vector unsigned long long'\n    elif typ == 'i32':\n        return '__vector signed int'\n    elif typ == 'i64':\n        return '__vector signed long long'\n    elif typ == 'f32':\n        return '__vector float'\n    elif typ == 'f64':\n        return '__vector double'\n    else:\n        raise ValueError('Type \"{}\" not supported'.format(typ))\n\n# Returns the logical power pc type corresponding to the nsimd type\ndef native_typel(typ):\n    if typ in ['i8', 'u8']:\n        return '__vector __bool char'\n    elif typ in ['i16', 'u16']:\n        return '__vector __bool short'\n    elif typ in ['i32', 'u32', 'f32']:\n        return '__vector __bool int'\n    elif typ in ['f64', 'i64', 'u64']:\n        return '__vector __bool long long'\n    else:\n        raise ValueError('Type \"{}\" not supported'.format(typ))\n\n# Length of a vector with elements of type typ\ndef get_len(typ):\n    return 128 // 
int(typ[1:])\n\n# Emulate 64 bits types for vmx only\ndef emulate_64(op, typ, params):\n    def arg(param, i):\n        if param == 'v':\n            return '{}.v{{i}}'.format(common.get_arg(i))\n        elif param == 'l':\n            return '(int)({}.v{{i}} & ((u64)1))'.format(common.get_arg(i))\n        else:\n            return common.get_arg(i)\n    args = ', '.join(arg(params[i + 1], i) for i in range(len(params[1:])))\n    args0 = args.format(i=0)\n    args1 = args.format(i=1)\n    if params[0] == 'v':\n        return '''nsimd_vmx_v{typ} ret;\n                  ret.v0 = nsimd_scalar_{op}_{typ}({args0});\n                  ret.v1 = nsimd_scalar_{op}_{typ}({args1});\n                  return ret;'''. \\\n                  format(typ=typ, op=op, args0=args0, args1=args1)\n    else:\n        return \\\n        '''nsimd_vmx_vl{typ} ret;\n           ret.v0 = (u64)(nsimd_scalar_{op}{suf}({args0}) ? -1 : 0);\n           ret.v1 = (u64)(nsimd_scalar_{op}{suf}({args1}) ? -1 : 0);\n           return ret;'''. \\\n           format(suf='' if params == ['l'] * len(params) else '_' + typ,\n                  typ=typ, op=op, args0=args0, args1=args1)\n\ndef emulate_f16(op, simd_ext, params):\n    tmpl = ', '.join(['{{in{}}}.v{{{{i}}}}'.format(i).format(**fmtspec) \\\n                      for i in range(len(params[1:]))])\n    args1 = tmpl.format(i=0)\n    args2 = tmpl.format(i=1)\n    l = 'l' if params[0] == 'l' else ''\n    return '''nsimd_{simd_ext}_v{l}f16 ret;\n              ret.v0 = nsimd_{op}_{simd_ext}_f32({args1});\n              ret.v1 = nsimd_{op}_{simd_ext}_f32({args2});\n              return ret;'''. 
def emulation_code(op, simd_ext, typ, params):
    '''Return C code emulating operator `op` for a type that has no native
    PPC implementation: f16 is emulated with two f32 vectors, and 64-bit
    types on vmx are emulated with a struct of two scalars.'''
    if typ == 'f16':
        return emulate_f16(op, simd_ext, params)
    elif simd_ext == 'vmx' and typ in ['f64', 'i64', 'u64']:
        return emulate_64(op, typ, params)
    else:
        # BUGFIX: was `format(func, ...)` — `func` is not defined in this
        # scope (the parameter is named `op`), so reaching this branch
        # raised NameError instead of the intended ValueError.
        raise ValueError('Automatic emulation for {}/{}/{} is not supported'. \
                         format(op, simd_ext, typ))

def emulate_with_scalar(op, simd_ext, typ, params):
    '''Return C code implementing `op` lane by lane with the corresponding
    nsimd scalar function. `params` is the nsimd signature: params[0] is the
    return kind ('v' vector or 'l' logical), the rest describe arguments.'''
    def arg(param, i):
        # Build the C expression extracting lane {i} of argument i.
        if param == 'v':
            return 'vec_extract({}, {{i}})'.format(common.get_arg(i))
        elif param == 'l':
            return '(int)(vec_extract({}, {{i}}) & ((u{})1))'. \
                   format(common.get_arg(i), typ[1:])
        else:
            return common.get_arg(i)
    args = ', '.join(arg(params[i + 1], i) for i in range(len(params[1:])))
    if params[0] == 'v':
        return '''nsimd_{simd_ext}_v{typ} ret;
                  ret = vec_splats(nsimd_scalar_{op}_{typ}({args0}));
                  '''.format(typ=typ, op=op, args0=args.format(i=0),
                             simd_ext=simd_ext) + '\n' + \
               '\n'.join('ret = vec_insert('\
                         'nsimd_scalar_{op}_{typ}({argsi}), ret, {i});'. \
                         format(op=op, typ=typ, argsi=args.format(i=i), i=i) \
                         for i in range(1, get_len(typ))) + '\nreturn ret;'
    else:
        utyp = 'u' + typ[1:]
        return \
        '''nsimd_{simd_ext}_vl{typ} ret;
           ret = ({ppc_typl})vec_splats(({utyp})(
                     nsimd_scalar_{op}_{typ}({args0}) ? -1 : 0));
           '''.format(typ=typ, op=op, args0=args.format(i=0), utyp=utyp,
                      ppc_typl=native_typel(typ), simd_ext=simd_ext) + '\n' + \
           '\n'.join(
               'ret = ({ppc_typl})vec_insert(({utyp})(' \
               'nsimd_scalar_{op}_{typ}({argsi}) ? -1 : 0), ret, {i});'. \
               format(op=op, typ=typ, utyp=utyp, argsi=args.format(i=i),
                      ppc_typl=native_typel(typ), i=i) \
               for i in range(1, get_len(typ))) + '\nreturn ret;'

# -----------------------------------------------------------------------------
# Implementation of mandatory functions for this module

def emulate_fp16(simd_ext):
    # f16 is always emulated on PPC (two f32 vectors).
    return True

def get_simd_exts():
    return ['vmx', 'vsx']

def get_type(opts, simd_ext, typ, nsimd_typ):
    '''Return the C typedef mapping nsimd vector type `nsimd_typ` for
    `typ` on `simd_ext`.'''
    if simd_ext not in get_simd_exts():
        raise ValueError('Unknown SIMD extension "{}"'.format(simd_ext))
    if typ not in common.types:
        raise ValueError('Unknown type "{}"'.format(typ))
    if typ == 'f16':
        struct = 'struct {__vector float v0; __vector float v1;}'
    elif simd_ext == 'vmx' and typ in ['i64', 'u64', 'f64']:
        # vmx has no native 64-bit lanes: emulate with two scalars.
        struct = 'struct {{ {} v0; {} v1; }}'.format(typ, typ)
    else:
        struct = native_type(typ)
    return 'typedef {} {};'.format(struct, nsimd_typ)

def get_logical_type(opts, simd_ext, typ, nsimd_typ):
    '''Return the C typedef mapping nsimd logical vector type `nsimd_typ`
    for `typ` on `simd_ext`.'''
    if simd_ext not in get_simd_exts():
        raise ValueError('Unknown SIMD extension "{}"'.format(simd_ext))
    if typ not in common.types:
        raise ValueError('Unknown type "{}"'.format(typ))
    if typ == 'f16':
        struct = 'struct {__vector __bool int v0; __vector __bool int v1;}'
    elif simd_ext == 'vmx' and typ in ['i64', 'u64', 'f64']:
        struct = 'struct { u64 v0; u64 v1; }'
    else:
        struct = native_typel(typ)
    return 'typedef {} {};'.format(struct, nsimd_typ)

def get_nb_registers(simd_ext):
    if simd_ext == 'vsx':
        return '64'
    elif simd_ext == 'vmx':
        return '32'
    else:
        raise ValueError('Unknown SIMD extension "{}"'.format(simd_ext))

def has_compatible_SoA_types(simd_ext):
    if simd_ext in get_simd_exts():
        return False
    else:
        raise ValueError('Unknown SIMD extension "{}"'.format(simd_ext))

def get_additional_include(func, platform, simd_ext):
    '''Return the extra #include/#define lines required by the header
    generated for operator `func` on `simd_ext`.'''
    ret = '''#include <nsimd/cpu/cpu/{}.h>
             '''.format(func)
    if simd_ext == 'vsx':
        # vsx implementations may fall back on their vmx counterparts.
        ret += '''#include <nsimd/ppc/vmx/{}.h>
                  '''.format(func)

    if func == 'neq':
        ret += '''#include <nsimd/ppc/{simd_ext}/eq.h>
                  #include <nsimd/ppc/{simd_ext}/notl.h>
                  '''.format(simd_ext=simd_ext)

    elif func in ['loadlu', 'loadla']:
        ret += '''#include <nsimd/ppc/{simd_ext}/eq.h>
                  #include <nsimd/ppc/{simd_ext}/set1.h>
                  #include <nsimd/ppc/{simd_ext}/{load}.h>
                  #include <nsimd/ppc/{simd_ext}/notl.h>
                  '''.format(load='load' + func[5], **fmtspec)

    elif func in ['storelu']:
        ret += '''#include <nsimd/ppc/{simd_ext}/if_else1.h>
                  #include <nsimd/ppc/{simd_ext}/set1.h>
                  '''.format(**fmtspec)

    elif func in ['shr', 'shl']:
        ret += '''#include <nsimd/ppc/{simd_ext}/set1.h>
                  '''.format(**fmtspec)

    elif func == "shra":
        ret += '''#include <nsimd/scalar_utilities.h>
                  '''

    elif func in ['zip', 'unzip']:
        ret += '''#include <nsimd/ppc/{simd_ext}/{unzip_prefix}ziplo.h>
                  #include <nsimd/ppc/{simd_ext}/{unzip_prefix}ziphi.h>
                  '''.format(unzip_prefix="" if func == "zip" else "un",
                             **fmtspec)

    elif func in ['unziplo', 'unziphi']:
        ret += '''#include <nsimd/ppc/{simd_ext}/ziplo.h>
                  #include <nsimd/ppc/{simd_ext}/ziphi.h>
                  #include <math.h>
                  '''.format(**fmtspec)

    elif func[:5] in ['masko', 'maskz']:
        ret += '''#include <nsimd/scalar_utilities.h>
                  '''

    elif func == 'mask_for_loop_tail':
        ret += '''#include <nsimd/ppc/{simd_ext}/set1.h>
                  #include <nsimd/ppc/{simd_ext}/set1l.h>
                  #include <nsimd/ppc/{simd_ext}/iota.h>
                  #include <nsimd/ppc/{simd_ext}/lt.h>
                  '''.format(simd_ext=simd_ext)

    # NOTE(review): NSIMD_PERMUTE_MASK_64 below emits only 4 bytes while a
    # full vec_perm mask has 16 — confirm against its users before changing.
    elif func[:4] == 'load':
        ret += '''
        #include <nsimd/ppc/{simd_ext}/unzip.h>

        #define NSIMD_PERMUTE_MASK_64(a, b)                        \
                {{ (unsigned char)(8 * a), (unsigned char)(8 * a + 1), \
                   (unsigned char)(8 * b), (unsigned char)(8 * b + 1) }}


        #define NSIMD_PERMUTE_MASK_32(a, b, c, d)                        \
                {{ (unsigned char)(4 * a), (unsigned char)(4 * a + 1),     \
                   (unsigned char)(4 * a + 2), (unsigned char)(4 * a + 3),  \
                   (unsigned char)(4 * b), (unsigned char)(4 * b + 1),      \
                   (unsigned char)(4 * b + 2), (unsigned char)(4 * b + 3),  \
                   (unsigned char)(4 * c), (unsigned char)(4 * c + 1),      \
                   (unsigned char)(4 * c + 2), (unsigned char)(4 * c + 3),  \
                   (unsigned char)(4 * d), (unsigned char)(4 * d + 1),      \
                   (unsigned char)(4 * d + 2), (unsigned char)(4 * d + 3) }}

         #define NSIMD_PERMUTE_MASK_16(a, b, c, d, e, f, g, h)           \
               {{ (unsigned char)(2 * a + 0), (unsigned char)(2 * a + 1),  \
                  (unsigned char)(2 * b + 0), (unsigned char)(2 * b + 1),  \
                  (unsigned char)(2 * c + 0), (unsigned char)(2 * c + 1),  \
                  (unsigned char)(2 * d + 0), (unsigned char)(2 * d + 1),  \
                  (unsigned char)(2 * e + 0), (unsigned char)(2 * e + 1),  \
                  (unsigned char)(2 * f + 0), (unsigned char)(2 * f + 1),  \
                  (unsigned char)(2 * g + 0), (unsigned char)(2 * g + 1),  \
                  (unsigned char)(2 * h + 0), (unsigned char)(2 * h + 1) }}

         #define NSIMD_PERMUTE_MASK_8(a, b, c, d, e, f, g, h,            \
                                      i, j, k, l, m, n, o, p)            \
              {{ (unsigned char)(a), (unsigned char)(b),                  \
                 (unsigned char)(c), (unsigned char)(d),                  \
                 (unsigned char)(e), (unsigned char)(f),                  \
                 (unsigned char)(g), (unsigned char)(h),                  \
                 (unsigned char)(i), (unsigned char)(j),                  \
                 (unsigned char)(k), (unsigned char)(l),                  \
                 (unsigned char)(m), (unsigned char)(n),                  \
                 (unsigned char)(o), (unsigned char)(p) }}
        '''.format(**fmtspec)

    return ret

# -----------------------------------------------------------------------------

def printf2(*args0):
    """
    debugging purposes
    decorate the function with it and when executed on test, it will print the
    environnements *args0 are the name of var to printf
    """
    to_print = []
    for arg in args0:
        if isinstance(arg, str):
            to_print.append(arg)

    def decorator(func):
        import inspect

        def wrapper(*args, **kwargs):
            func_args = inspect.signature(func).bind(*args, **kwargs).arguments
            func_args_str = '{} called on {}\\n'. \
                            format(func.__name__, fmtspec['typ']) + \
                            ', "'.join('{} = {!r}'.format(*item) \
                                       for item in func_args.items())
            ret = ''
            if not DEBUG:
                return func(*args)
            typ = ''
            if 'typ' in func_args:
                typ = func_args['typ']
            else:
                typ = func_args['from_typ']
            ret += 'int k;\n'
            if func.__name__ == 'store1234' and typ in ['f64', 'i64', 'u64']:
                ret += '''
                       printf("element to store: %ld %ld", {in1}{suf0},
                              {in1}{suf1});
                       printf("\\n");
                       '''.format(**fmtspec, **get_suf64(typ))
            elif func.__name__ == 'store1234' and typ[1:] == '32':
                ret += '''
                       printf("element to store:");
                       for (k = 0; k < 4; k++) {{
                         printf(" %lx", {in1}[k]);
                       }}
                       printf("\\n");
                       '''.format(**fmtspec, nbits=get_len(typ))
            #print var passed as parameter on printf2
            for var in to_print:
                if ppc_is_vec_type(typ):
                    ret += '''
                           printf("values of {var}:");
                           for (k = 0; k < {nbits}; k++) {{
                             printf(" %lld", {var}[k]);
                           }}
                           printf("\\n");
                           '''.format(var=var, **fmtspec, nbits=get_len(typ))
            return '''
                   printf("\\n---------------\\n");
                   printf("{}.{} ( {} )\\n");
                   '''.format(func.__module__, func.__qualname__,
                              func_args_str) + ret + func(*args)

        return wrapper

    return decorator


# -----------------------------------------------------------------------------
# Loads of degree 1, 2, 3 and 4
# About unaligned loads/stores for Altivec:
# https://developer.ibm.com/technologies/systems/articles/pa-dalign/

def load1234(simd_ext, typ, deg, aligned):
    '''Return C code for load / load2 / load3 / load4 (aligned or not) of
    `typ` on `simd_ext`; deg is the interleaving degree.'''
    if typ in ['f64', 'i64', 'u64']:
        if deg == 1:
            if simd_ext == 'vmx':
                return '''nsimd_{simd_ext}_v{typ} ret;
                          ret.v0 = {in0}[0];
                          ret.v1 = {in0}[1];
                          return ret;'''.format(**fmtspec)
            else:
                return '''nsimd_{simd_ext}_v{typ} ret;
                          ret = vec_splats({in0}[0]);
                          ret = vec_insert({in0}[1], ret, 1);
                          return ret;'''.format(**fmtspec)
        else:
            if simd_ext == 'vmx':
                return \
                'nsimd_{simd_ext}_v{typ}x{} ret;\n'.format(deg, **fmtspec) + \
                '\n'.join(['ret.v{i}.v0 = *({in0} + {i});'. \
                           format(i=i, **fmtspec) \
                           for i in range(0, deg)]) + \
                '\n'.join(['ret.v{i}.v1 = *({in0} + {ipd});'. \
                           format(i=i, ipd=i + deg, **fmtspec) \
                           for i in range(0, deg)]) + \
                '\nreturn ret;'
            else:
                return \
                'nsimd_{simd_ext}_v{typ}x{} ret;\n'.format(deg, **fmtspec) + \
                '\n'.join(
                'ret.v{i} = vec_splats({in0}[{i}]);'.format(i=i, **fmtspec) \
                for i in range(0, deg)) + \
                '\n'.join(
                'ret.v{i} = vec_insert({in0}[{ipd}], ret.v{i}, 1);'. \
                format(i=i, ipd=i + deg, **fmtspec) for i in range(0, deg)) + \
                '\nreturn ret;'
    if typ == 'f16':
        if deg == 1:
            return \
            '''nsimd_{simd_ext}_vf16 ret;
               u16 *ptr = (u16 *){in0};
               ret.v0 = vec_splats(nsimd_u16_to_f32(ptr[0]));
               ret.v0 = vec_insert(nsimd_u16_to_f32(ptr[1]), ret.v0, 1);
               ret.v0 = vec_insert(nsimd_u16_to_f32(ptr[2]), ret.v0, 2);
               ret.v0 = vec_insert(nsimd_u16_to_f32(ptr[3]), ret.v0, 3);
               ret.v1 = vec_splats(nsimd_u16_to_f32(ptr[4]));
               ret.v1 = vec_insert(nsimd_u16_to_f32(ptr[5]), ret.v1, 1);
               ret.v1 = vec_insert(nsimd_u16_to_f32(ptr[6]), ret.v1, 2);
               ret.v1 = vec_insert(nsimd_u16_to_f32(ptr[7]), ret.v1, 3);
               return ret;'''.format(**fmtspec)
        else:
            ret = '''nsimd_{simd_ext}_vf16x{deg} ret;
                     u16 *ptr = (u16 *){in0};
                     '''.format(deg=deg, **fmtspec)

            # Deinterleave lane by lane: element o = i + k * 4 * deg + j * deg
            # goes to half-vector k, lane j of output vector i.
            for i in range(0, deg):
                for k in range(0, 2):
                    ret += 'ret.v{}.v{} = vec_splats(' \
                           'nsimd_u16_to_f32(ptr[{}]));\n'. \
                           format(i, k, i + k * 4 * deg)
                    for j in range(1, 4):
                        ret += 'ret.v{i}.v{k} = vec_insert(nsimd_u16_to_f32(' \
                               'ptr[{o}]), ret.v{i}.v{k}, {j});\n'. \
                               format(i=i, k=k, j=j,
                                      o=i + k * 4 * deg + j * deg)
            ret += 'return ret;'
            return ret
    if deg == 1:
        if aligned:
            return 'return vec_ld(0, {in0});'.format(**fmtspec)
        else:
            return 'return *({ppc_typ}*){in0};'. \
                   format(ppc_typ=native_type(typ), **fmtspec)

    # From here deg >= 2

    if aligned:
        load = 'nsimd_{simd_ext}_v{typ}x{deg} ret;\n'. \
               format(deg=deg, **fmtspec) + \
               '\n'.join(
                 'nsimd_{simd_ext}_v{typ} in{i} = vec_ld({o}, {in0});'. \
                 format(i=i, o=i * 16, **fmtspec) for i in range(deg))
    else:
        load = \
        'nsimd_{simd_ext}_v{typ}x{deg} ret;\n'. \
        format(deg=deg, **fmtspec) + \
        '\n'.join(
          'nsimd_{simd_ext}_v{typ} in{i} = *(({ppc_typ}*){in0} + {i});'. \
          format(i=i, ppc_typ=native_type(typ), **fmtspec) \
                 for i in range(0, deg))
    if deg == 2:
        return '''{load}
                  ret = nsimd_unzip_{simd_ext}_{typ}(in0, in1);
                  return ret;'''.format(load=load, **fmtspec)
    elif deg == 3:
        if typ in ['i32', 'u32', 'f32']:
            return \
            '''__vector unsigned char perm1 = NSIMD_PERMUTE_MASK_32(
                                                  0, 3, 6, 0);

               {load}

               nsimd_{simd_ext}_v{typ} tmp0 = vec_perm(in0, in1, perm1);
               nsimd_{simd_ext}_v{typ} tmp1 = vec_perm(in1, in2, perm1);
               nsimd_{simd_ext}_v{typ} tmp2 = vec_perm(in2, in0, perm1);

               __vector unsigned char perm2 = NSIMD_PERMUTE_MASK_32(
                                                  0, 1, 2, 5);
               __vector unsigned char perm3 = NSIMD_PERMUTE_MASK_32(
                                                  5, 0, 1, 2);
               __vector unsigned char perm4 = NSIMD_PERMUTE_MASK_32(
                                                  2, 5, 0, 1);

               ret.v0 = vec_perm(tmp0, in2, perm2);
               ret.v1 = vec_perm(tmp1, in0, perm3);
               ret.v2 = vec_perm(tmp2, in1, perm4);

               return ret;'''.format(load=load, **fmtspec)
        elif typ in ['i16', 'u16']:
            return \
            '''{load}

               __vector unsigned char permRAB = NSIMD_PERMUTE_MASK_16(
                                           0, 3, 6, 9, 12, 15, 0, 0);
               __vector unsigned char permRDC = NSIMD_PERMUTE_MASK_16(
                                           0, 1, 2, 3, 4, 5, 10, 13);

               nsimd_{simd_ext}_v{typ} tmp0 = vec_perm(in0, in1, permRAB);
               ret.v0 = vec_perm(tmp0, in2, permRDC);

               __vector unsigned char permGAB = NSIMD_PERMUTE_MASK_16(
                                           1, 4, 7, 10, 13, 0, 0, 0);
               __vector unsigned char permGEC = NSIMD_PERMUTE_MASK_16(
                                           0, 1, 2, 3, 4, 8, 11, 14);

               nsimd_{simd_ext}_v{typ} tmp1 = vec_perm(in0, in1, permGAB);
               ret.v1 = vec_perm(tmp1, in2, permGEC);

               __vector unsigned char permBAB = NSIMD_PERMUTE_MASK_16(
                                           2, 5, 8, 11, 14, 0, 0, 0);
               __vector unsigned char permBFC = NSIMD_PERMUTE_MASK_16(
                                           0, 1, 2, 3, 4, 9, 12, 15);

               nsimd_{simd_ext}_v{typ} tmp2 = vec_perm(in0, in1, permBAB);
               ret.v2 = vec_perm(tmp2, in2, permBFC);

               return ret;'''.format(load=load, **fmtspec)
        elif typ in ['i8', 'u8']:
            return \
            '''{load}

               __vector unsigned char permRAB = NSIMD_PERMUTE_MASK_8(
                   0, 3, 6, 9, 12, 15, 18, 21, 24, 27, 30, 0, 0, 0, 0, 0);
               __vector unsigned char permRDC = NSIMD_PERMUTE_MASK_8(
                   0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 17, 20, 23, 26, 29);

               nsimd_{simd_ext}_v{typ} tmp0 = vec_perm(in0, in1, permRAB);
               ret.v0 = vec_perm(tmp0, in2, permRDC);

               __vector unsigned char permGAB = NSIMD_PERMUTE_MASK_8(
                   1, 4, 7, 10, 13, 16, 19, 22, 25, 28, 31, 0, 0, 0, 0, 0);
               __vector unsigned char permGEC = NSIMD_PERMUTE_MASK_8(
                   0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 18, 21, 24, 27, 30);

               nsimd_{simd_ext}_v{typ} tmp1 = vec_perm(in0, in1, permGAB);
               ret.v1 = vec_perm(tmp1, in2, permGEC);

               __vector unsigned char permBAB = NSIMD_PERMUTE_MASK_8(
                   2, 5, 8, 11, 14, 17, 20, 23, 26, 29, 0, 0, 0, 0, 0, 0);
               __vector unsigned char permBFC = NSIMD_PERMUTE_MASK_8(
                   0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 16, 19, 22, 25, 28, 31);

               nsimd_{simd_ext}_v{typ} tmp2 = vec_perm(in0, in1, permBAB);
               ret.v2 = vec_perm(tmp2, in2, permBFC);

               return ret;'''.format(load=load, **fmtspec)
    else:
        if typ in ['i32', 'u32', 'f32']:
            return \
            '''{load}

               nsimd_{simd_ext}_v{typ} tmp0 = vec_mergeh(in0, in2);
               nsimd_{simd_ext}_v{typ} tmp1 = vec_mergel(in0, in2);
               nsimd_{simd_ext}_v{typ} tmp2 = vec_mergeh(in1, in3);
               nsimd_{simd_ext}_v{typ} tmp3 = vec_mergel(in1, in3);

               ret.v0 = vec_mergeh(tmp0, tmp2);
               ret.v1 = vec_mergel(tmp0, tmp2);
               ret.v2 = vec_mergeh(tmp1, tmp3);
               ret.v3 = vec_mergel(tmp1, tmp3);

               return ret;'''.format(load=load, **fmtspec)
        elif typ in ['i16', 'u16']:
            return \
            '''{load}

               ret.v0 = vec_mergeh(in0, in2);
               ret.v1 = vec_mergel(in0, in2);
               ret.v2 = vec_mergeh(in1, in3);
               ret.v3 = vec_mergel(in1, in3);

               nsimd_{simd_ext}_v{typ} tmp0 = vec_mergeh(ret.v0, ret.v2);
               nsimd_{simd_ext}_v{typ} tmp1 = vec_mergel(ret.v0, ret.v2);
               nsimd_{simd_ext}_v{typ} tmp2 = vec_mergeh(ret.v1, ret.v3);
               nsimd_{simd_ext}_v{typ} tmp3 = vec_mergel(ret.v1, ret.v3);

               ret.v0 = vec_mergeh(tmp0, tmp2);
               ret.v1 = vec_mergel(tmp0, tmp2);
               ret.v2 = vec_mergeh(tmp1, tmp3);
               ret.v3 = vec_mergel(tmp1, tmp3);

               return ret;'''.format(load=load, **fmtspec)
        elif typ in ['i8', 'u8']:
            return \
            '''{load}

               nsimd_{simd_ext}_v{typ} tmp0 = vec_mergeh(in0, in2);
               nsimd_{simd_ext}_v{typ} tmp1 = vec_mergel(in0, in2);
               nsimd_{simd_ext}_v{typ} tmp2 = vec_mergeh(in1, in3);
               nsimd_{simd_ext}_v{typ} tmp3 = vec_mergel(in1, in3);

               ret.v0 = vec_mergeh(tmp0, tmp2);
               ret.v1 = vec_mergel(tmp0, tmp2);
               ret.v2 = vec_mergeh(tmp1, tmp3);
               ret.v3 = vec_mergel(tmp1, tmp3);

               tmp0 = vec_mergeh(ret.v0, ret.v2);
               tmp1 = vec_mergel(ret.v0, ret.v2);
               tmp2 = vec_mergeh(ret.v1, ret.v3);
               tmp3 = vec_mergel(ret.v1, ret.v3);

               ret.v0 = vec_mergeh(tmp0, tmp2);
               ret.v1 = vec_mergel(tmp0, tmp2);
               ret.v2 = vec_mergeh(tmp1, tmp3);
               ret.v3 = vec_mergel(tmp1, tmp3);

               return ret;'''.format(load=load, **fmtspec)

# -----------------------------------------------------------------------------
# Stores of degree 1, 2, 3 and 4

def store1234(simd_ext, typ, deg, aligned):
    '''Return C code for store / store2 / store3 / store4 (aligned or not)
    of `typ` on `simd_ext`; deg is the interleaving degree.'''
    if typ in ['f64', 'i64', 'u64']:
        if simd_ext == 'vmx':
            return '\n'.join('{}[{}] = {}.v0;'. \
                             format(common.in0, i, common.get_arg(i + 1)) \
                             for i in range(deg)) + '\n' + \
                   '\n'.join('{}[{}] = {}.v1;'. \
                             format(common.in0, i + deg,
                                    common.get_arg(i + 1)) for i in range(deg))
        else:
            return '\n'.join('{}[{}] = vec_extract({}, 0);'. \
                             format(common.in0, i, common.get_arg(i + 1)) \
                             for i in range(deg)) + '\n' + \
                   '\n'.join('{}[{}] = vec_extract({}, 1);'. \
                             format(common.in0, i + deg,
                                    common.get_arg(i + 1)) for i in range(deg))
    if typ == 'f16':
        if deg == 1:
            return \
            '''u16 *ptr = (u16 *){in0};
               ptr[0] = nsimd_f32_to_u16(vec_extract({in1}.v0, 0));
               ptr[1] = nsimd_f32_to_u16(vec_extract({in1}.v0, 1));
               ptr[2] = nsimd_f32_to_u16(vec_extract({in1}.v0, 2));
               ptr[3] = nsimd_f32_to_u16(vec_extract({in1}.v0, 3));
               ptr[4] = nsimd_f32_to_u16(vec_extract({in1}.v1, 0));
               ptr[5] = nsimd_f32_to_u16(vec_extract({in1}.v1, 1));
               ptr[6] = nsimd_f32_to_u16(vec_extract({in1}.v1, 2));
               ptr[7] = nsimd_f32_to_u16(vec_extract({in1}.v1, 3));'''. \
               format(**fmtspec)
        else:
            ret = 'u16 *ptr = (u16 *){in0};\n'.format(**fmtspec)
            # Interleave back: lane j of half-vector k of argument i goes to
            # element i + k * 4 * deg + j * deg.
            for i in range(0, deg):
                for k in range(0, 2):
                    for j in range(0, 4):
                        ret += 'ptr[{o}] = nsimd_f32_to_u16(' \
                               'vec_extract({a}.v{k}, {j}));\n'. \
                               format(a=common.get_arg(i + 1), j=j, k=k,
                                      o=i + k * 4 * deg + j * deg, **fmtspec)
            return ret
    if deg == 1:
        if aligned:
            return 'vec_st({in1}, 0, {in0});'.format(**fmtspec)
        else:
            return '*({ppc_typ} *){in0} = {in1};'. \
                   format(ppc_typ=native_type(typ), **fmtspec)

    # From here deg >= 2

    if aligned:
        store = '\n'.join('vec_st(ret{i}, {o}, {in0});'. \
                          format(i=i, o=i * 16, **fmtspec) \
                          for i in range(0, deg))
    else:
        store = '\n'.join('*({ppc_typ} *)({in0} + {o}) = ret{i};'. \
                          format(o=i * get_len(typ), ppc_typ=native_type(typ),
                                 i=i, **fmtspec) for i in range(deg))
    if deg == 2:
        return \
        '''nsimd_{simd_ext}_v{typ} ret0 = vec_mergeh({in1}, {in2});
           nsimd_{simd_ext}_v{typ} ret1 = vec_mergel({in1}, {in2});

           {store}'''.format(store=store, **fmtspec)
    elif deg == 3:
        if typ in ['i32', 'u32', 'f32']:
            return \
            '''__vector unsigned char perm1 = NSIMD_PERMUTE_MASK_32(
                                                  0, 2, 4, 6);
               __vector unsigned char perm2 = NSIMD_PERMUTE_MASK_32(
                                                  0, 2, 5, 7);
               __vector unsigned char perm3 = NSIMD_PERMUTE_MASK_32(
                                                  1, 3, 5, 7);

               nsimd_{simd_ext}_v{typ} tmp0 = vec_perm({in1}, {in2}, perm1);
               nsimd_{simd_ext}_v{typ} tmp1 = vec_perm({in3}, {in1}, perm2);
               nsimd_{simd_ext}_v{typ} tmp2 = vec_perm({in2}, {in3}, perm3);

               nsimd_{simd_ext}_v{typ} ret0 = vec_perm(tmp0, tmp1, perm1);
               nsimd_{simd_ext}_v{typ} ret1 = vec_perm(tmp2, tmp0, perm2);
               nsimd_{simd_ext}_v{typ} ret2 = vec_perm(tmp1, tmp2, perm3);

               {store}'''.format(store=store, **fmtspec)
        elif typ in ['i16', 'u16']:
            return \
            '''__vector unsigned char permARG = NSIMD_PERMUTE_MASK_16(
                                           0, 8, 0, 1, 9, 0, 2, 10);
               __vector unsigned char permAXB = NSIMD_PERMUTE_MASK_16(
                                           0, 1, 8, 3, 4, 9, 6, 7);

               nsimd_{simd_ext}_v{typ} tmp0 = vec_perm({in1}, {in2}, permARG);
               nsimd_{simd_ext}_v{typ} ret0 = vec_perm(tmp0, {in3}, permAXB);

               __vector unsigned char permBRG = NSIMD_PERMUTE_MASK_16(
                                           0, 3, 11, 0, 4, 12, 0, 5);
               __vector unsigned char permBYB = NSIMD_PERMUTE_MASK_16(
                                           10, 1, 2, 11, 4, 5, 12, 7);

               nsimd_{simd_ext}_v{typ} tmp1 = vec_perm({in1}, {in2}, permBRG);
               nsimd_{simd_ext}_v{typ} ret1 = vec_perm(tmp1, {in3}, permBYB);

               __vector unsigned char permCRG = NSIMD_PERMUTE_MASK_16(
                                           13, 0, 6, 14, 0, 7, 15, 0);
               __vector unsigned char permCZB = NSIMD_PERMUTE_MASK_16(
                                           0, 13, 2, 3, 14, 5, 6, 15);

               nsimd_{simd_ext}_v{typ} tmp2 = vec_perm({in1}, {in2}, permCRG);
               nsimd_{simd_ext}_v{typ} ret2 = vec_perm(tmp2, {in3}, permCZB);

               {store}'''.format(store=store, **fmtspec)
        elif typ in ['i8', 'u8']:
            return \
            '''__vector unsigned char mARG = NSIMD_PERMUTE_MASK_8(
                   0, 16, 0, 1, 17, 0, 2, 18, 0, 3, 19, 0, 4, 20, 0, 5);
               __vector unsigned char mAXB = NSIMD_PERMUTE_MASK_8(
                   0, 1, 16, 3, 4, 17, 6, 7, 18, 9, 10, 19, 12, 13, 20, 15);

               nsimd_{simd_ext}_v{typ} tmp0 = vec_perm({in1}, {in2}, mARG);
               nsimd_{simd_ext}_v{typ} ret0 = vec_perm(tmp0, {in3}, mAXB);

               __vector unsigned char mBRG = NSIMD_PERMUTE_MASK_8(
                   21, 0, 6, 22, 0, 7, 23, 0, 8, 24, 0, 9, 25, 0, 10, 26);
               __vector unsigned char mBYB = NSIMD_PERMUTE_MASK_8(
                   0, 21, 2, 3, 22, 5, 6, 23, 8, 9, 24, 11, 12, 25, 14, 15);

               nsimd_{simd_ext}_v{typ} tmp1 = vec_perm({in1}, {in2}, mBRG);
               nsimd_{simd_ext}_v{typ} ret1 = vec_perm(tmp1, {in3}, mBYB);

               __vector unsigned char mCRG = NSIMD_PERMUTE_MASK_8(
                   0, 11, 27, 0, 12, 28, 0, 13, 29, 0, 14, 30, 0, 15, 31, 0);
               __vector unsigned char mCZB = NSIMD_PERMUTE_MASK_8(
                   26, 1, 2, 27, 4, 5, 28, 7, 8, 29, 10, 11, 30, 13, 14, 31);

               nsimd_{simd_ext}_v{typ} tmp2 = vec_perm({in1}, {in2}, mCRG);
               nsimd_{simd_ext}_v{typ} ret2 = vec_perm(tmp2, {in3}, mCZB);

               {store}'''.format(store=store, **fmtspec)
    else:
        if typ in ['i32', 'u32', 'f32']:
            return \
            '''nsimd_{simd_ext}_v{typ} tmp0 = vec_mergeh({in1}, {in3});
               nsimd_{simd_ext}_v{typ} tmp1 = vec_mergel({in1}, {in3});
               nsimd_{simd_ext}_v{typ} tmp2 = vec_mergeh({in2}, {in4});
               nsimd_{simd_ext}_v{typ} tmp3 = vec_mergel({in2}, {in4});

               nsimd_{simd_ext}_v{typ} ret0 = vec_mergeh(tmp0, tmp2);
               nsimd_{simd_ext}_v{typ} ret1 = vec_mergel(tmp0, tmp2);
               nsimd_{simd_ext}_v{typ} ret2 = vec_mergeh(tmp1, tmp3);
               nsimd_{simd_ext}_v{typ} ret3 = vec_mergel(tmp1, tmp3);

               {store}'''.format(store=store, **fmtspec)
        elif typ in ['i16', 'u16']:
            return \
            '''nsimd_{simd_ext}_v{typ} tmp0 = vec_mergeh({in1}, {in3});
               nsimd_{simd_ext}_v{typ} tmp1 = vec_mergel({in1}, {in3});
               nsimd_{simd_ext}_v{typ} tmp2 = vec_mergeh({in2}, {in4});
               nsimd_{simd_ext}_v{typ} tmp3 = vec_mergel({in2}, {in4});

               nsimd_{simd_ext}_v{typ} ret0 = vec_mergeh(tmp0, tmp2);
               nsimd_{simd_ext}_v{typ} ret1 = vec_mergel(tmp0, tmp2);
               nsimd_{simd_ext}_v{typ} ret2 = vec_mergeh(tmp1, tmp3);
               nsimd_{simd_ext}_v{typ} ret3 = vec_mergel(tmp1, tmp3);

               {store}'''.format(store=store, **fmtspec)
        elif typ in ['i8', 'u8']:
            return \
            '''nsimd_{simd_ext}_v{typ} tmp0 = vec_mergeh({in1}, {in3});
               nsimd_{simd_ext}_v{typ} tmp1 = vec_mergel({in1}, {in3});
               nsimd_{simd_ext}_v{typ} tmp2 = vec_mergeh({in2}, {in4});
               nsimd_{simd_ext}_v{typ} tmp3 = vec_mergel({in2}, {in4});

               nsimd_{simd_ext}_v{typ} ret0 = vec_mergeh(tmp0, tmp2);
               nsimd_{simd_ext}_v{typ} ret1 = vec_mergel(tmp0, tmp2);
               nsimd_{simd_ext}_v{typ} ret2 = vec_mergeh(tmp1, tmp3);
               nsimd_{simd_ext}_v{typ} ret3 = vec_mergel(tmp1, tmp3);

               {store}'''.format(store=store, **fmtspec)

# -----------------------------------------------------------------------------
# Length

def len1(simd_ext, typ):
    # 128-bit registers: lane count is 128 / type-width-in-bits.
    return 'return {};'.format(128 // int(typ[1:]))

# -----------------------------------------------------------------------------
# Other helper functions

def simple_op2(op, simd_ext, typ):
    if has_to_be_emulated(simd_ext, typ):
        return emulation_code(op, simd_ext, typ, ['v', 'v', 'v'])
    return 'return vec_{op}({in0}, {in1});'.format(op=op, **fmtspec)

# Binary operators: and, or, xor, andnot
def binary_op2(op, simd_ext, typ):
    if has_to_be_emulated(simd_ext, typ):
        return emulation_code(op, simd_ext, typ, ['v', 'v', 'v'])
    else:
        ppcop = {'orb': 'or', 'xorb': 'xor', 'andb': 'and', 'andnotb': 'andc'}
        return 'return vec_{op}({in0}, {in1});'.format(op=ppcop[op], **fmtspec)

# Logical operators: and, or, xor, andnot
def logical_op2(op, simd_ext, typ):
    if has_to_be_emulated(simd_ext, typ):
        return emulation_code(op, simd_ext, typ, ['l', 'l', 'l'])
    ppcop = {'orl': 'or', 'xorl': 'xor', 'andl': 'and', 'andnotl': 'andc'}
    return 'return vec_{op}({in0}, {in1});'.format(op=ppcop[op], **fmtspec)

# -----------------------------------------------------------------------------

def div2(simd_ext, typ):
    if has_to_be_emulated(simd_ext, typ):
        return emulation_code('div', simd_ext, typ, ['v', 'v', 'v'])
    elif typ in common.ftypes:
        return 'return vec_div({in0}, {in1});'.format(**fmtspec)
    elif typ in common.iutypes:
        # No integer vector division on PPC: divide lane by lane.
        return '''nsimd_{simd_ext}_v{typ} ret;
                  ret = vec_splats(({typ})(vec_extract({in0}, 0) /
                                   vec_extract({in1}, 0)));
                  '''.format(**fmtspec) + \
               '\n'.join(
               '''ret = vec_insert(({typ})(vec_extract({in0}, {i}) /
                                   vec_extract({in1}, {i})), ret, {i});'''. \
                                   format(i=i, **fmtspec) \
                                   for i in range(get_len(typ))) + \
               '\nreturn ret;'

# -----------------------------------------------------------------------------

def not1(simd_ext, typ):
    if has_to_be_emulated(simd_ext, typ):
        return emulation_code('notb', simd_ext, typ, ['v', 'v'])
    return 'return vec_nor({in0}, {in0});'.format(**fmtspec)

# -----------------------------------------------------------------------------

def lnot1(simd_ext, typ):
    if has_to_be_emulated(simd_ext, typ):
        return emulation_code('notl', simd_ext, typ, ['l', 'l'])
    return 'return vec_nor({in0}, {in0});'.format(**fmtspec)

# -----------------------------------------------------------------------------

def sqrt1(simd_ext, typ):
    if has_to_be_emulated(simd_ext, typ):
        return emulation_code('sqrt', simd_ext, typ, ['v', 'v'])
    return 'return vec_sqrt({in0});'.format(**fmtspec)

# -----------------------------------------------------------------------------

def shift2(op, simd_ext, typ):
    if has_to_be_emulated(simd_ext, typ):
        return emulation_code(op, simd_ext, typ, ['v', 'v', 'p'])
    return 'return vec_{ppcop}({in0}, vec_splats((u{typnbits}){in1}));'. \
           format(ppcop={'shl': 'sl', 'shr': 'sr', 'shra': 'sra'}[op],
                  **fmtspec)

# -----------------------------------------------------------------------------

def set1(simd_ext, typ):
    if typ == 'f16':
        return '''nsimd_{simd_ext}_vf16 ret;
                  f32 tmp = nsimd_f16_to_f32({in0});
                  ret.v0 = vec_splats(tmp);
                  ret.v1 = ret.v0;
                  return ret;'''.format(**fmtspec)
    elif has_to_be_emulated(simd_ext, typ):
        return '''nsimd_{simd_ext}_v{typ} ret;
                  ret.v0 = {in0};
                  ret.v1 = {in0};
                  return ret;'''.format(**fmtspec)
    else:
        return 'return vec_splats({in0});'.format(**fmtspec)

# -----------------------------------------------------------------------------

def lset1(simd_ext, typ):
    if typ == 'f16':
        return \
        '''nsimd_{simd_ext}_vlf16 ret;
           ret.v0 = (__vector __bool int)vec_splats((u32)({in0} ? -1 : 0));
           ret.v1 = ret.v0;
           return ret;'''.format(**fmtspec)
    elif has_to_be_emulated(simd_ext, typ):
        return '''nsimd_{simd_ext}_vl{typ} ret;
                  ret.v0 = (u64)({in0} ? -1 : 0);
                  ret.v1 = (u64)({in0} ? -1 : 0);
                  return ret;'''.format(**fmtspec)
    else:
        return '''if ({in0}) {{
                    return ({ppc_typ})vec_splats((u{typnbits})-1);
                  }} else {{
                    return {lzeros};
                  }}'''.format(ppc_typ=native_typel(typ), **fmtspec)

# -----------------------------------------------------------------------------

def cmp2(op, simd_ext, typ):
    if has_to_be_emulated(simd_ext, typ):
        return emulation_code(op, simd_ext, typ, ['l', 'v', 'v'])
    elif typ in common.iutypes:
        if op == 'ne':
            # No vec_cmpne: negate vec_cmpeq.
            return '''nsimd_{simd_ext}_vl{typ} tmp;
                      tmp = vec_cmpeq({in0}, {in1});
                      return vec_nor(tmp, tmp);'''.format(op=op, **fmtspec)
        else:
            return 'return vec_cmp{op}({in0}, {in1});'.format(op=op, **fmtspec)
    else:
        return emulate_with_scalar(op, simd_ext, typ, ['l', 'v', 'v'])

# -----------------------------------------------------------------------------

def if_else3(simd_ext, typ):
    if typ == 'f16':
        return emulate_f16('if_else1', simd_ext, ['v', 'l', 'v', 'v'])
    elif has_to_be_emulated(simd_ext, typ):
        return '''nsimd_{simd_ext}_v{typ} ret;
                  ret.v0 = ({in0}.v0 ? {in1}.v0 : {in2}.v0);
                  ret.v1 = ({in0}.v1 ? {in1}.v1 : {in2}.v1);
                  return ret;'''.format(**fmtspec)
    return 'return vec_sel({in2}, {in1}, {in0});'.format(**fmtspec)

# -----------------------------------------------------------------------------

def minmax2(op, simd_ext, typ):
    if has_to_be_emulated(simd_ext, typ):
        return emulation_code(op, simd_ext, typ, ['v', 'v', 'v'])
    return 'return vec_{op}({in0}, {in1});'.format(op=op, **fmtspec)

# -----------------------------------------------------------------------------

def abs1(simd_ext, typ):
    if typ in common.utypes:
        # abs is the identity on unsigned types.
        return 'return {in0};'.format(**fmtspec)
    elif has_to_be_emulated(simd_ext, typ):
        return emulation_code('abs', simd_ext, typ, ['v', 'v'])
    return 'return vec_abs({in0});'.format(**fmtspec)

# -----------------------------------------------------------------------------

def round1(op, simd_ext, typ):
    if typ in common.iutypes:
        # Rounding is the identity on integer types.
        return 'return {in0};'.format(**fmtspec)
    elif has_to_be_emulated(simd_ext, typ):
        return emulation_code(op, simd_ext, typ, ['v', 'v'])
    if op == 'round_to_even':
        return emulate_with_scalar('round_to_even', simd_ext, typ, ['v', 'v'])
    ppcop = { 'trunc': 'trunc', 'ceil': 'ceil', 'floor': 'floor' }
    return 'return vec_{op}({in0});'.format(op=ppcop[op], **fmtspec)
{in1}));'.format(**fmtspec)\n        elif op == 'fnms':\n            return '''return vec_sub(nsimd_neg_{simd_ext}_{typ}({in2}),\n                                 vec_mul({in0}, {in1}));'''.format(**fmtspec)\n    elif typ in common.ftypes:\n        ppcop = { 'fma': 'vec_madd', 'fms': 'vec_msub', 'fnms': 'vec_nmadd',\n                  'fnma': 'vec_nmsub' }\n        return 'return {ppcop}({in0}, {in1}, {in2});'. \\\n               format(ppcop=ppcop[op], **fmtspec)\n\n# -----------------------------------------------------------------------------\n\ndef neg1(simd_ext, typ):\n    if has_to_be_emulated(simd_ext, typ):\n        return emulation_code('neg', simd_ext, typ, ['v', 'v'])\n    elif typ in common.itypes or typ in common.ftypes:\n        return 'return vec_neg({in0});'.format(**fmtspec)\n    else:\n        return 'return vec_sub({zeros}, {in0});'.format(**fmtspec)\n\n# -----------------------------------------------------------------------------\n\ndef recs1(op, simd_ext, typ):\n    if has_to_be_emulated(simd_ext, typ):\n        return emulation_code(op, simd_ext, typ, ['v', 'v'])\n    elif op == 'rec':\n        return 'return vec_div(vec_splats(({typ})1), {in0});'. \\\n               format(**fmtspec)\n    elif op in ['rec8', 'rec11']:\n        return 'return vec_re({in0});'.format(**fmtspec)\n    elif op in ['rsqrt8', 'rsqrt11']:\n        return 'return vec_rsqrte({in0});'.format(**fmtspec)\n\n# -----------------------------------------------------------------------------\n\ndef loadl(aligned, simd_ext, typ):\n    return \\\n    '''/* This can surely be improved but it is not our priority. */\n       return nsimd_notl_{simd_ext}_{typ}(nsimd_eq_{simd_ext}_{typ}(\n                nsimd_load{align}_{simd_ext}_{typ}(\n                  {in0}), nsimd_set1_{simd_ext}_{typ}({zero})));'''. 
\\\n                  format(align='a' if aligned else 'u',\n                         zero='nsimd_f32_to_f16(0.0f)' if typ == 'f16'\n                         else '({})0'.format(typ), **fmtspec)\n\n# -----------------------------------------------------------------------------\n\ndef storel(aligned, simd_ext, typ):\n    return \\\n    '''/* This can surely be improved but it is not our priority. */\n       nsimd_store{align}_{simd_ext}_{typ}({in0},\n         nsimd_if_else1_{simd_ext}_{typ}({in1},\n           nsimd_set1_{simd_ext}_{typ}({one}),\n           nsimd_set1_{simd_ext}_{typ}({zero})));'''. \\\n           format(align='a' if aligned else 'u',\n                  one='nsimd_f32_to_f16(1.0f)' if typ == 'f16'\n                  else '({})1'.format(typ),\n                  zero='nsimd_f32_to_f16(0.0f)' if typ == 'f16'\n                  else '({})0'.format(typ), **fmtspec)\n\n# -----------------------------------------------------------------------------\n\ndef allany1(op, simd_ext, typ):\n    binop = '&&' if op == 'all' else '||'\n    if typ == 'f16':\n        return \\\n        '''return nsimd_{op}_{simd_ext}_f32({in0}.v0) {binop}\n                  nsimd_{op}_{simd_ext}_f32({in0}.v1);'''. \\\n                  format(op=op, binop=binop, **fmtspec)\n    elif has_to_be_emulated(simd_ext, typ):\n        return 'return {in0}.v0 {binop} {in0}.v1;'. \\\n               format(binop=binop, **fmtspec)\n    return 'return vec_{op}_ne({in0}, ({lzeros}));'.format(op=op, **fmtspec)\n\n# -----------------------------------------------------------------------------\n\ndef nbtrue1(simd_ext, typ):\n    if typ == 'f16':\n        return \\\n        '''return nsimd_nbtrue_{simd_ext}_f32({in0}.v0) +\n                  nsimd_nbtrue_{simd_ext}_f32({in0}.v1);'''. \\\n                  format(**fmtspec)\n    elif has_to_be_emulated(simd_ext, typ):\n        return 'return -(int)((i64)({in0}.v0) + (i64)({in0}.v1));'. \\\n               format(**fmtspec)\n    return 'return {};'. 
\\\n           format(' + '.join('(vec_extract({in0}, {i}) ? 1 : 0)'. \\\n                             format(i=i, **fmtspec) \\\n                             for i in range(get_len(typ))))\n\n# -----------------------------------------------------------------------------\n\ndef reinterpretl1(simd_ext, from_typ, to_typ):\n    if from_typ == to_typ:\n        return 'return {in0};'.format(**fmtspec)\n    elif simd_ext == 'vmx' and from_typ in ['f64', 'i64', 'u64']:\n        return \\\n        '''nsimd_{simd_ext}_vl{to_typ} ret;\n           ret.v0 = {in0}.v0;\n           ret.v1 = {in0}.v1;\n           return ret;'''.format(**fmtspec)\n    elif from_typ == 'f16':\n        return \\\n        '''nsimd_{simd_ext}_vl{to_typ} ret =\n               (__vector __bool short)vec_splats(\n                   (u16)vec_extract({in0}.v0, 0));\n           ret = (__vector __bool short)vec_insert(\n                     (u16)vec_extract({in0}.v0, 1), ret, 1);\n           ret = (__vector __bool short)vec_insert(\n                     (u16)vec_extract({in0}.v0, 2), ret, 2);\n           ret = (__vector __bool short)vec_insert(\n                     (u16)vec_extract({in0}.v0, 3), ret, 3);\n           ret = (__vector __bool short)vec_insert(\n                     (u16)vec_extract({in0}.v1, 0), ret, 4);\n           ret = (__vector __bool short)vec_insert(\n                     (u16)vec_extract({in0}.v1, 1), ret, 5);\n           ret = (__vector __bool short)vec_insert(\n                     (u16)vec_extract({in0}.v1, 2), ret, 6);\n           ret = (__vector __bool short)vec_insert(\n                     (u16)vec_extract({in0}.v1, 3), ret, 7);\n           return ret;'''.format(**fmtspec)\n    elif to_typ == 'f16':\n        return \\\n        '''nsimd_{simd_ext}_vlf16 ret;\n           ret.v0 = (__vector __bool int)vec_splats(\n                        (u32)(vec_extract({in0}, 0) ? -1 : 0));\n           ret.v0 = (__vector __bool int)vec_insert(\n                        (u32)(vec_extract({in0}, 1) ? 
-1 : 0), ret.v0, 1);\n           ret.v0 = (__vector __bool int)vec_insert(\n                        (u32)(vec_extract({in0}, 2) ? -1 : 0), ret.v0, 2);\n           ret.v0 = (__vector __bool int)vec_insert(\n                        (u32)(vec_extract({in0}, 3) ? -1 : 0), ret.v0, 3);\n           ret.v1 = (__vector __bool int)vec_splats(\n                        (u32)(vec_extract({in0}, 4) ? -1 : 0));\n           ret.v1 = (__vector __bool int)vec_insert(\n                        (u32)(vec_extract({in0}, 5) ? -1 : 0), ret.v1, 1);\n           ret.v1 = (__vector __bool int)vec_insert(\n                        (u32)(vec_extract({in0}, 6) ? -1 : 0), ret.v1, 2);\n           ret.v1 = (__vector __bool int)vec_insert(\n                        (u32)(vec_extract({in0}, 7) ? -1 : 0), ret.v1, 3);\n           return ret;'''.format(**fmtspec)\n    else:\n        return 'return ({ppc_to_typ}){in0};'. \\\n               format(ppc_to_typ=native_typel(to_typ), **fmtspec)\n\n# -----------------------------------------------------------------------------\n\ndef convert1(simd_ext, from_typ, to_typ):\n    if from_typ == to_typ:\n        return 'return {in0};'.format(**fmtspec)\n    elif from_typ == 'f16' and to_typ == 'u16':\n        return \\\n        '''return vec_pack((__vector unsigned int)vec_ctu({in0}.v0, 0),\n                           (__vector unsigned int)vec_ctu({in0}.v1, 0));'''. \\\n                           format(**fmtspec)\n    elif from_typ == 'f16' and to_typ == 'i16':\n        return \\\n        '''return vec_pack((__vector signed int)vec_cts({in0}.v0, 0),\n                           (__vector signed int)vec_cts({in0}.v1, 0));'''. 
\\\n                           format(**fmtspec)\n    elif from_typ == 'u16' and to_typ == 'f16':\n        return \\\n        '''nsimd_{simd_ext}_vf16 ret;\n           /* Unpack extends the sign, we need to remove the extra 1s */\n           __vector int mask = vec_splats((int)0xFFFF);\n           ret.v0 = vec_ctf(vec_and(vec_unpackh((__vector short){in0}), mask),\n                            0);\n           ret.v1 = vec_ctf(vec_and(vec_unpackl((__vector short){in0}), mask),\n                            0);\n           return ret;'''.format(**fmtspec)\n    elif from_typ == 'i16' and to_typ == 'f16':\n        return '''nsimd_{simd_ext}_vf16 ret;\n                  ret.v0 = vec_ctf(vec_unpackh({in0}), 0);\n                  ret.v1 = vec_ctf(vec_unpackl({in0}), 0);\n                  return ret;'''.format(**fmtspec)\n    elif has_to_be_emulated(simd_ext, to_typ):\n        return '''nsimd_{simd_ext}_v{to_typ} ret;\n                  ret.v0 = nsimd_scalar_cvt_{to_typ}_{from_typ}({in0}.v0);\n                  ret.v1 = nsimd_scalar_cvt_{to_typ}_{from_typ}({in0}.v1);\n                  return ret;'''.format(**fmtspec)\n    elif from_typ in ['f32', 'f64'] and to_typ in ['i32', 'i64']:\n        return 'return vec_cts({in0}, 0);'.format(**fmtspec)\n    elif from_typ in ['f32', 'f64'] and to_typ in ['u32', 'u64']:\n        return 'return vec_ctu({in0}, 0);'.format(**fmtspec)\n    elif from_typ in ['i32', 'i64', 'u32', 'u64'] and to_typ in ['f32', 'f64']:\n        return 'return vec_ctf({in0}, 0);'.format(**fmtspec)\n    elif from_typ in common.iutypes and to_typ in common.iutypes:\n        return 'return ({ppctyp}){in0};'. 
\\\n               format(ppctyp=native_type(to_typ), **fmtspec)\n\n# -----------------------------------------------------------------------------\n\ndef reinterpret1(simd_ext, from_typ, to_typ):\n    if from_typ == to_typ:\n        return 'return {in0};'.format(**fmtspec)\n    elif simd_ext == 'vmx' and from_typ in ['f64', 'i64', 'u64']:\n        return \\\n        '''nsimd_{simd_ext}_v{to_typ} ret;\n           ret.v0 = nsimd_scalar_reinterpret_{to_typ}_{from_typ}({in0}.v0);\n           ret.v1 = nsimd_scalar_reinterpret_{to_typ}_{from_typ}({in0}.v1);\n           return ret;'''.format(**fmtspec)\n    elif from_typ == 'f16' and to_typ == 'u16':\n        return \\\n        '''nsimd_{simd_ext}_vu16 ret;\n           ret = vec_splats(nsimd_f32_to_u16(vec_extract({in0}.v0, 0)));\n           ret = vec_insert(nsimd_f32_to_u16(vec_extract({in0}.v0, 1)),\n                            ret, 1);\n           ret = vec_insert(nsimd_f32_to_u16(vec_extract({in0}.v0, 2)),\n                            ret, 2);\n           ret = vec_insert(nsimd_f32_to_u16(vec_extract({in0}.v0, 3)),\n                            ret, 3);\n           ret = vec_insert(nsimd_f32_to_u16(vec_extract({in0}.v1, 0)),\n                            ret, 4);\n           ret = vec_insert(nsimd_f32_to_u16(vec_extract({in0}.v1, 1)),\n                            ret, 5);\n           ret = vec_insert(nsimd_f32_to_u16(vec_extract({in0}.v1, 2)),\n                            ret, 6);\n           ret = vec_insert(nsimd_f32_to_u16(vec_extract({in0}.v1, 3)),\n                            ret, 7);\n           return ret;'''.format(**fmtspec)\n    elif from_typ == 'f16' and to_typ == 'i16':\n        return \\\n        '''nsimd_{simd_ext}_vi16 ret;\n           ret = vec_splats(nsimd_scalar_reinterpret_i16_u16(\n                     nsimd_f32_to_u16(vec_extract({in0}.v0, 0))));\n           ret = vec_insert(nsimd_scalar_reinterpret_i16_u16(\n                     nsimd_f32_to_u16(vec_extract({in0}.v0, 1))), ret, 1);\n           ret 
= vec_insert(nsimd_scalar_reinterpret_i16_u16(\n                     nsimd_f32_to_u16(vec_extract({in0}.v0, 2))), ret, 2);\n           ret = vec_insert(nsimd_scalar_reinterpret_i16_u16(\n                     nsimd_f32_to_u16(vec_extract({in0}.v0, 3))), ret, 3);\n           ret = vec_insert(nsimd_scalar_reinterpret_i16_u16(\n                     nsimd_f32_to_u16(vec_extract({in0}.v1, 0))), ret, 4);\n           ret = vec_insert(nsimd_scalar_reinterpret_i16_u16(\n                     nsimd_f32_to_u16(vec_extract({in0}.v1, 1))), ret, 5);\n           ret = vec_insert(nsimd_scalar_reinterpret_i16_u16(\n                     nsimd_f32_to_u16(vec_extract({in0}.v1, 2))), ret, 6);\n           ret = vec_insert(nsimd_scalar_reinterpret_i16_u16(\n                     nsimd_f32_to_u16(vec_extract({in0}.v1, 3))), ret, 7);\n           return ret;'''.format(**fmtspec)\n    elif from_typ == 'u16' and to_typ == 'f16':\n        return \\\n        '''nsimd_{simd_ext}_vf16 ret;\n           ret.v0 = vec_splats(nsimd_u16_to_f32(vec_extract({in0}, 0)));\n           ret.v0 = vec_insert(nsimd_u16_to_f32(vec_extract({in0}, 1)),\n                               ret.v0, 1);\n           ret.v0 = vec_insert(nsimd_u16_to_f32(vec_extract({in0}, 2)),\n                               ret.v0, 2);\n           ret.v0 = vec_insert(nsimd_u16_to_f32(vec_extract({in0}, 3)),\n                               ret.v0, 3);\n           ret.v1 = vec_splats(nsimd_u16_to_f32(vec_extract({in0}, 4)));\n           ret.v1 = vec_insert(nsimd_u16_to_f32(vec_extract({in0}, 5)),\n                               ret.v1, 1);\n           ret.v1 = vec_insert(nsimd_u16_to_f32(vec_extract({in0}, 6)),\n                               ret.v1, 2);\n           ret.v1 = vec_insert(nsimd_u16_to_f32(vec_extract({in0}, 7)),\n                               ret.v1, 3);\n           return ret;'''.format(**fmtspec)\n    elif from_typ == 'i16' and to_typ == 'f16':\n        return \\\n        '''nsimd_{simd_ext}_vf16 ret;\n           ret.v0 = 
vec_splats(nsimd_u16_to_f32(\n                        nsimd_scalar_reinterpret_u16_i16(\n                            vec_extract({in0}, 0))));\n           ret.v0 = vec_insert(nsimd_u16_to_f32(\n                        nsimd_scalar_reinterpret_u16_i16(\n                            vec_extract({in0}, 1))), ret.v0, 1);\n           ret.v0 = vec_insert(nsimd_u16_to_f32(\n                        nsimd_scalar_reinterpret_u16_i16(\n                            vec_extract({in0}, 2))), ret.v0, 2);\n           ret.v0 = vec_insert(nsimd_u16_to_f32(\n                        nsimd_scalar_reinterpret_u16_i16(\n                            vec_extract({in0}, 3))), ret.v0, 3);\n           ret.v1 = vec_splats(nsimd_u16_to_f32(\n                        nsimd_scalar_reinterpret_u16_i16(\n                            vec_extract({in0}, 4))));\n           ret.v1 = vec_insert(nsimd_u16_to_f32(\n                        nsimd_scalar_reinterpret_u16_i16(\n                            vec_extract({in0}, 5))), ret.v1, 1);\n           ret.v1 = vec_insert(nsimd_u16_to_f32(\n                        nsimd_scalar_reinterpret_u16_i16(\n                            vec_extract({in0}, 6))), ret.v1, 2);\n           ret.v1 = vec_insert(nsimd_u16_to_f32(\n                        nsimd_scalar_reinterpret_u16_i16(\n                            vec_extract({in0}, 7))), ret.v1, 3);\n           return ret;'''.format(**fmtspec)\n    else:\n        return 'return ({ppc_typ}){in0};'. 
\\\n               format(ppc_typ=native_type(to_typ), **fmtspec)\n\n# -----------------------------------------------------------------------------\n\ndef reverse1(simd_ext, typ):\n    if typ == 'f16':\n        return emulate_f16('reverse', simd_ext, ['v', 'v'])\n    elif has_to_be_emulated(simd_ext, typ):\n        return '''nsimd_{simd_ext}_v{typ} ret;\n                  ret.v0 = {in0}.v1;\n                  ret.v1 = {in0}.v0;\n                  return ret;'''.format(**fmtspec)\n    elif typ in ['i8', 'u8']:\n        return '''return vec_perm({in0}, {in0}, (__vector unsigned char)\n                                  {{ 15, 14, 13, 12, 11, 10, 9, 8,\n                                      7,  6,  5,  4,  3,  2, 1, 0 }});'''. \\\n                                      format(**fmtspec)\n    elif typ in ['i16', 'u16']:\n        return '''return vec_perm({in0}, {in0}, (__vector unsigned char)\n                                  {{ 14, 15, 12, 13, 10, 11, 8, 9,\n                                      6,  7,  4,  5,  2,  3, 0, 1 }});'''. \\\n                                      format(**fmtspec)\n    elif typ in ['i32', 'u32', 'f32']:\n        return '''return vec_perm({in0}, {in0}, (__vector unsigned char)\n                                  {{ 12, 13, 14, 15,  8,  9, 10, 11,\n                                      4,  5,  6,  7,  0,  1,  2,  3 }});'''. \\\n                                      format(**fmtspec)\n    elif typ in ['f64', 'i64', 'u64']:\n        return '''return vec_perm({in0}, {in0}, (__vector unsigned char)\n                                  {{ 8, 9, 10, 11, 12, 13, 14, 15,\n                                     0, 1,  2,  3,  4,  5,  6,  7  }});'''. 
\\\n                                      format(**fmtspec)\n\n# -----------------------------------------------------------------------------\n\ndef addv(simd_ext, typ):\n    if typ == 'f16':\n        return '''return nsimd_f32_to_f16(\n                    nsimd_addv_{simd_ext}_f32({in0}.v0) +\n                    nsimd_addv_{simd_ext}_f32({in0}.v1));'''.format(**fmtspec)\n    elif has_to_be_emulated(simd_ext, typ):\n        return 'return {in0}.v0 + {in0}.v1;'.format(**fmtspec)\n    return 'return ({})({});'. \\\n           format(typ, ' + '.join('vec_extract({in0}, {i})'. \\\n                                  format(i=i, **fmtspec) \\\n                                  for i in range(get_len(typ))))\n\n# -----------------------------------------------------------------------------\n\ndef add_sub_s(op, simd_ext, typ):\n    if has_to_be_emulated(simd_ext, typ):\n        return emulation_code(op, simd_ext, typ, ['v', 'v', 'v'])\n    if typ in common.ftypes:\n        return 'return vec_{op}({in0}, {in1});'.format(op=op[:-1], **fmtspec)\n    elif typ in ['i64', 'u64']:\n        return '''nsimd_{simd_ext}_v{typ} ret;\n                  ret = vec_splats(nsimd_scalar_{op}_{typ}(\n                            vec_extract({in0}, 0), vec_extract({in1}, 0)));\n                  ret = vec_insert(nsimd_scalar_{op}_{typ}(\n                            vec_extract({in0}, 1), vec_extract({in1}, 1)),\n                            ret, 1);\n                  return ret;'''.format(op=op, **fmtspec)\n    return 'return vec_{op}({in0}, {in1});'.format(op=op, **fmtspec)\n\n# -----------------------------------------------------------------------------\n\ndef upcvt1(simd_ext, from_typ, to_typ):\n    if from_typ in ['i8', 'u8'] and to_typ == 'f16':\n        return '''nsimd_{simd_ext}_vf16x2 ret;\n                  nsimd_{simd_ext}_vi16x2 tmp;\n                  tmp = nsimd_upcvt_{simd_ext}_i16_{from_typ}(a0);\n                  ret.v0 = nsimd_cvt_{simd_ext}_f16_i16(tmp.v0);\n               
   ret.v1 = nsimd_cvt_{simd_ext}_f16_i16(tmp.v1);\n                  return ret;'''.format(**fmtspec)\n    elif from_typ == 'f16' and to_typ == 'f32':\n        return '''nsimd_{simd_ext}_v{to_typ}x2 ret;\n                  ret.v0 = {in0}.v0;\n                  ret.v1 = {in0}.v1;\n                  return ret;'''.format(**fmtspec)\n    elif from_typ == 'f16' and to_typ in ['i32', 'u32']:\n        sign = 'u' if to_typ[0] == 'u' else 's'\n        return '''nsimd_{simd_ext}_v{to_typ}x2 ret;\n                  ret.v0 = vec_ct{sign}({in0}.v0, 0);\n                  ret.v1 = vec_ct{sign}({in0}.v1, 0);\n                  return ret;'''.format(sign=sign, **fmtspec)\n    elif from_typ == 'f32' and to_typ in ['f64', 'i64', 'u64']:\n        if simd_ext == 'vmx':\n            return '''nsimd_vmx_v{to_typ}x2 ret;\n                      ret.v0.v0 = ({to_typ})vec_extract({in0}, 0);\n                      ret.v0.v1 = ({to_typ})vec_extract({in0}, 1);\n                      ret.v1.v0 = ({to_typ})vec_extract({in0}, 2);\n                      ret.v1.v1 = ({to_typ})vec_extract({in0}, 3);\n                      return ret;'''.format(**fmtspec)\n        else:\n            return \\\n            '''nsimd_vsx_v{to_typ}x2 ret;\n               ret.v0 = vec_splats(({to_typ})vec_extract({in0}, 0));\n               ret.v0 = vec_insert(({to_typ})vec_extract({in0}, 1), ret.v0, 1);\n               ret.v1 = vec_splats(({to_typ})vec_extract({in0}, 2));\n               ret.v1 = vec_insert(({to_typ})vec_extract({in0}, 3), ret.v1, 1);\n               return ret;'''.format(**fmtspec)\n    elif (from_typ in ['i16', 'u16'] and to_typ == 'f32') or \\\n         (from_typ in ['i32', 'u32'] and to_typ == 'f64'):\n        return '''nsimd_{simd_ext}_v{to_typ}x2 ret;\n                  nsimd_{simd_ext}_v{sto_typ}x2 tmp;\n                  tmp = nsimd_upcvt_{simd_ext}_{sto_typ}_{from_typ}({in0});\n                  ret.v0 = nsimd_cvt_{simd_ext}_{to_typ}_{sto_typ}(tmp.v0);\n                  ret.v1 = 
nsimd_cvt_{simd_ext}_{to_typ}_{sto_typ}(tmp.v1);\n                  return ret;'''.format(sto_typ=from_typ[0] + to_typ[1:],\n                                        **fmtspec)\n    elif from_typ in ['u8', 'u16']:\n        mask='(i{})0x{}'.format(to_typ[1:], 'F' * (int(from_typ[1:]) // 4))\n        ppc_sto_typ = native_type('i' + to_typ[1:])\n        ppc_sfrom_typ = '({})'.format(native_type('i' + from_typ[1:]))\n        ppc_to_typ = '({})'.format(native_type(to_typ)) \\\n                     if to_typ in common.utypes else ''\n        return '''nsimd_{simd_ext}_v{to_typ}x2 ret;\n                  {ppc_sto_typ} mask = vec_splats({mask});\n                  ret.v0 = {ppc_to_typ}vec_and(\n                               vec_unpackh({ppc_sfrom_typ}{in0}), mask);\n                  ret.v1 = {ppc_to_typ}vec_and(\n                               vec_unpackl({ppc_sfrom_typ}{in0}), mask);\n                  return ret;'''.format(mask=mask, ppc_sto_typ=ppc_sto_typ,\n                                        ppc_sfrom_typ=ppc_sfrom_typ,\n                                        ppc_to_typ=ppc_to_typ, **fmtspec)\n    elif from_typ in ['i8', 'i16']:\n        ppc_to_typ = '({})'.format(native_type(to_typ)) \\\n                     if to_typ in common.utypes else ''\n        return '''nsimd_{simd_ext}_v{to_typ}x2 ret;\n                  ret.v0 = {ppc_to_typ}vec_unpackh({in0});\n                  ret.v1 = {ppc_to_typ}vec_unpackl({in0});\n                  return ret;'''.format(ppc_to_typ=ppc_to_typ, **fmtspec)\n    elif from_typ in ['i32', 'u32']:\n        if simd_ext == 'vmx':\n            return '''nsimd_vmx_v{to_typ}x2 ret;\n                      ret.v0.v0 = ({to_typ})vec_extract({in0}, 0);\n                      ret.v0.v1 = ({to_typ})vec_extract({in0}, 1);\n                      ret.v1.v0 = ({to_typ})vec_extract({in0}, 2);\n                      ret.v1.v1 = ({to_typ})vec_extract({in0}, 3);\n                      return ret;'''.format(**fmtspec)\n        else:\n            return 
\\\n            '''nsimd_vsx_v{to_typ}x2 ret;\n               ret.v0 = vec_splats(({to_typ})vec_extract({in0}, 0));\n               ret.v0 = vec_insert(({to_typ})vec_extract({in0}, 1), ret.v0, 1);\n               ret.v1 = vec_splats(({to_typ})vec_extract({in0}, 2));\n               ret.v1 = vec_insert(({to_typ})vec_extract({in0}, 3), ret.v1, 1);\n               return ret;'''.format(**fmtspec)\n\n# -----------------------------------------------------------------------------\n\ndef downcvt1(simd_ext, from_typ, to_typ):\n    if from_typ in ['f64', 'i64', 'u64']:\n        if simd_ext == 'vmx':\n            return '''nsimd_vmx_v{to_typ} ret;\n                      ret = vec_splats(({to_typ}){in0}.v0);\n                      ret = vec_insert(({to_typ}){in0}.v1, ret, 1);\n                      ret = vec_insert(({to_typ}){in1}.v0, ret, 2);\n                      ret = vec_insert(({to_typ}){in1}.v1, ret, 3);\n                      return ret;'''.format(**fmtspec)\n        else:\n            return \\\n            '''nsimd_vsx_v{to_typ} ret;\n               ret = vec_splats(({to_typ})vec_extract({in0}, 0));\n               ret = vec_insert(({to_typ})vec_extract({in0}, 1), ret, 1);\n               ret = vec_insert(({to_typ})vec_extract({in1}, 0), ret, 2);\n               ret = vec_insert(({to_typ})vec_extract({in1}, 1), ret, 3);\n               return ret;'''.format(**fmtspec)\n    elif from_typ in common.iutypes and to_typ in common.iutypes:\n        return 'return {cast}vec_pack({in0}, {in1});'. 
\\\n               format(cast='({})'.format(native_type(to_typ)) \\\n                      if from_typ[0] != to_typ[0] else '', **fmtspec)\n    elif from_typ == 'f32' and to_typ == 'f16':\n        return '''nsimd_{simd_ext}_vf16 ret;\n                  ret.v0 = {in0};\n                  ret.v1 = {in1};\n                  return ret;'''.format(**fmtspec)\n    elif from_typ == 'f32' and to_typ in common.iutypes:\n        return 'return vec_pack(vec_ct{s}({in0}, 0), vec_ct{s}({in1}, 0));'. \\\n               format(s='s' if to_typ == 'i16' else 'u', **fmtspec)\n    elif from_typ in common.iutypes and to_typ == 'f16':\n        return '''nsimd_{simd_ext}_vf16 ret;\n                  ret.v0 = vec_ctf({in0}, 0);\n                  ret.v1 = vec_ctf({in1}, 0);\n                  return ret;'''.format(**fmtspec)\n    elif from_typ == 'f16':\n        return \\\n        '''return vec_pack(vec_pack(vec_ct{s}({in0}.v0, 0),\n                                    vec_ct{s}({in0}.v1, 0)),\n                           vec_pack(vec_ct{s}({in1}.v0, 0),\n                                    vec_ct{s}({in1}.v1, 0)));'''. 
\\\n                                    format(s='s' if to_typ == 'i8' else 'u',\n                                           **fmtspec)\n\n# -----------------------------------------------------------------------------\n\ndef unzip(func, simd_ext, typ):\n    if typ == 'f16':\n        return '''nsimd_{simd_ext}_vf16 ret;\n                  ret.v0 = nsimd_{func}_{simd_ext}_f32({in0}.v0, {in0}.v1);\n                  ret.v1 = nsimd_{func}_{simd_ext}_f32({in1}.v0, {in1}.v1);\n                  return ret;'''.format(func=func, **fmtspec)\n    elif typ in ['f64', 'i64', 'u64']:\n        if simd_ext == 'vmx':\n            return '''nsimd_vmx_v{typ} ret;\n                      ret.v0 = {in0}.v{i};\n                      ret.v1 = {in1}.v{i};\n                      return ret;'''.format(i=0 if func == 'unziplo' else 1,\n                                            **fmtspec)\n        else:\n            return '''nsimd_vsx_v{typ} ret;\n                      ret = vec_splats(vec_extract({in0}, {i}));\n                      ret = vec_insert(vec_extract({in1}, {i}), ret, 1);\n                      return ret;'''.format(i=0 if func == 'unziplo' else 1,\n                                            **fmtspec)\n    elif typ in ['i8', 'u8', 'i16', 'u16', 'i32', 'u32', 'f32']:\n        perm = []\n        le = get_len(typ)\n        for i in range(le):\n            sz = int(typ[1:]) // 8\n            for j in range(0, sz):\n                perm += ['(unsigned char)' + str(2 * sz * i + \\\n                         (0 if func == 'unziplo' else sz) + j)]\n        return \\\n        '''__vector unsigned char perm = {{ {perm} }};\n           return vec_perm({in0}, {in1}, perm);'''. 
\\\n           format(perm=', '.join(perm), **fmtspec)\n\n# -----------------------------------------------------------------------------\n\ndef zip(op, simd_ext, typ):\n    if typ == 'f16':\n        return '''nsimd_{simd_ext}_vf16 ret;\n                  ret.v0 = vec_splats(vec_extract({in0}.v{i}, 0));\n                  ret.v0 = vec_insert(vec_extract({in0}.v{i}, 1), ret.v0, 2);\n                  ret.v1 = vec_splats(vec_extract({in0}.v{i}, 2));\n                  ret.v1 = vec_insert(vec_extract({in0}.v{i}, 3), ret.v1, 2);\n                  ret.v0 = vec_insert(vec_extract({in1}.v{i}, 0), ret.v0, 1);\n                  ret.v0 = vec_insert(vec_extract({in1}.v{i}, 1), ret.v0, 3);\n                  ret.v1 = vec_insert(vec_extract({in1}.v{i}, 2), ret.v1, 1);\n                  ret.v1 = vec_insert(vec_extract({in1}.v{i}, 3), ret.v1, 3);\n                  return ret;'''.format(i=0 if op == 'ziplo' else 1,\n                                        **fmtspec)\n    elif simd_ext == 'vmx' and typ in ['f64', 'i64', 'u64']:\n        return '''nsimd_{simd_ext}_v{typ} ret;\n                  ret.v0 = {in0}.v{i};\n                  ret.v1 = {in1}.v{i};\n                  return ret;'''.format(i='1' if op == 'ziphi' else '0',\n                                        **fmtspec)\n    return 'return vec_merge{suf}({in0}, {in1});'. 
\\\n           format(suf='l' if op == 'ziphi' else 'h', **fmtspec)\n\n# -----------------------------------------------------------------------------\n\ndef zip_unzip_basic(op, simd_ext, typ):\n    return \\\n    '''nsimd_{simd_ext}_v{typ}x2 ret;\n       ret.v0 = nsimd_{pre}ziplo_{simd_ext}_{typ}({in0}, {in1});\n       ret.v1 = nsimd_{pre}ziphi_{simd_ext}_{typ}({in0}, {in1});\n       return ret;'''.format(pre='un' if op == 'unzip' else '', **fmtspec)\n\n# -----------------------------------------------------------------------------\n\ndef to_mask(simd_ext, typ):\n    if typ == 'f16':\n        return '''nsimd_{simd_ext}_vf16 ret;\n                  ret.v0 = (__vector float){in0}.v0;\n                  ret.v1 = (__vector float){in0}.v1;\n                  return ret;'''.format(**fmtspec)\n    if simd_ext == 'vmx' and typ in ['f64', 'i64']:\n        return '''nsimd_{simd_ext}_v{typ} ret;\n                  ret.v0 = nsimd_scalar_reinterpret_{typ}_u64({in0}.v0);\n                  ret.v1 = nsimd_scalar_reinterpret_{typ}_u64({in0}.v1);\n                  return ret;'''.format(**fmtspec)\n    elif simd_ext == 'vmx' and typ == 'u64':\n        return '''nsimd_{simd_ext}_vu64 ret;\n                  ret.v0 = {in0}.v0;\n                  ret.v1 = {in0}.v1;\n                  return ret;'''.format(**fmtspec)\n    return 'return ({ppc_typ}){in0};'. 
\\\n           format(ppc_typ=native_type(typ), **fmtspec)\n\n# -----------------------------------------------------------------------------\n\ndef iota(simd_ext, typ):\n    if typ == 'f16':\n        return '''nsimd_{simd_ext}_vf16 ret;\n                  ret.v0 = vec_splats(0.0f);\n                  ret.v0 = vec_insert(1.0f, ret.v0, 1);\n                  ret.v0 = vec_insert(2.0f, ret.v0, 2);\n                  ret.v0 = vec_insert(3.0f, ret.v0, 3);\n                  ret.v1 = vec_splats(4.0f);\n                  ret.v1 = vec_insert(5.0f, ret.v1, 1);\n                  ret.v1 = vec_insert(6.0f, ret.v1, 2);\n                  ret.v1 = vec_insert(7.0f, ret.v1, 3);\n                  return ret;'''.format(**fmtspec)\n    elif has_to_be_emulated(simd_ext, typ):\n        return '''nsimd_vmx_v{typ} ret;\n                  ret.v0 = ({typ})0;\n                  ret.v1 = ({typ})1;\n                  return ret;'''.format(**fmtspec)\n    return 'nsimd_{simd_ext}_v{typ} ret;\\n' \\\n           'ret = vec_splats(({typ})0);\\n'.format(**fmtspec) + \\\n           '\\n'.join('ret = vec_insert(({}){}, ret, {});'.format(typ, i, i) \\\n                     for i in range(1, get_len(typ))) + \\\n           '\\nreturn ret;'\n\n# -----------------------------------------------------------------------------\n\ndef mask_for_loop_tail(simd_ext, typ):\n    le = get_len(typ)\n    if typ == 'f16':\n        threshold = 'nsimd_f32_to_f16((f32)({in1} - {in0}))'.format(**fmtspec)\n    else:\n        threshold = '({typ})({in1} - {in0})'.format(**fmtspec)\n    return '''if ({in0} >= {in1}) {{\n                return nsimd_set1l_{simd_ext}_{typ}(0);\n              }}\n              if ({in1} - {in0} < {le}) {{\n                nsimd_{simd_ext}_v{typ} n =\n                      nsimd_set1_{simd_ext}_{typ}({threshold});\n                return nsimd_lt_{simd_ext}_{typ}(\n                           nsimd_iota_{simd_ext}_{typ}(), n);\n              }} else {{\n                return 
nsimd_set1l_{simd_ext}_{typ}(1);\n              }}'''.format(le=le, threshold=threshold, **fmtspec)\n\n# -----------------------------------------------------------------------------\n\ndef scatter(simd_ext, typ):\n    le = get_len(typ)\n    if typ == 'f16':\n        return \\\n        '''{in0}[vec_extract({in1}, 0)] = nsimd_f32_to_f16(\n                                              vec_extract({in2}.v0, 0));\n           {in0}[vec_extract({in1}, 1)] = nsimd_f32_to_f16(\n                                              vec_extract({in2}.v0, 1));\n           {in0}[vec_extract({in1}, 2)] = nsimd_f32_to_f16(\n                                              vec_extract({in2}.v0, 2));\n           {in0}[vec_extract({in1}, 3)] = nsimd_f32_to_f16(\n                                              vec_extract({in2}.v0, 3));\n           {in0}[vec_extract({in1}, 4)] = nsimd_f32_to_f16(\n                                              vec_extract({in2}.v1, 0));\n           {in0}[vec_extract({in1}, 5)] = nsimd_f32_to_f16(\n                                              vec_extract({in2}.v1, 1));\n           {in0}[vec_extract({in1}, 6)] = nsimd_f32_to_f16(\n                                              vec_extract({in2}.v1, 2));\n           {in0}[vec_extract({in1}, 7)] = nsimd_f32_to_f16(\n                                              vec_extract({in2}.v1, 3));'''. 
\\\n                                              format(**fmtspec)\n    elif has_to_be_emulated(simd_ext, typ):\n        return '''{in0}[{in1}.v0] = {in2}.v0;\n                  {in0}[{in1}.v1] = {in2}.v1;'''.format(**fmtspec)\n    return '\\n'.join(['{in0}[vec_extract({in1}, {i})] = ' \\\n                      'vec_extract({in2}, {i});'.format(i=i, **fmtspec) \\\n                      for i in range(get_len(typ))])\n\n# -----------------------------------------------------------------------------\n\ndef gather(simd_ext, typ):\n    if typ == 'f16':\n        return \\\n        '''nsimd_{simd_ext}_v{typ} ret;\n           ret.v0 = vec_splats(nsimd_f16_to_f32({in0}[vec_extract({in1}, 0)]));\n           ret.v0 = vec_insert(nsimd_f16_to_f32({in0}[vec_extract({in1}, 1)]),\n                               ret.v0, 1);\n           ret.v0 = vec_insert(nsimd_f16_to_f32({in0}[vec_extract({in1}, 2)]),\n                               ret.v0, 2);\n           ret.v0 = vec_insert(nsimd_f16_to_f32({in0}[vec_extract({in1}, 3)]),\n                               ret.v0, 3);\n           ret.v1 = vec_splats(nsimd_f16_to_f32({in0}[vec_extract({in1}, 4)]));\n           ret.v1 = vec_insert(nsimd_f16_to_f32({in0}[vec_extract({in1}, 5)]),\n                               ret.v1, 1);\n           ret.v1 = vec_insert(nsimd_f16_to_f32({in0}[vec_extract({in1}, 6)]),\n                               ret.v1, 2);\n           ret.v1 = vec_insert(nsimd_f16_to_f32({in0}[vec_extract({in1}, 7)]),\n                               ret.v1, 3);\n           return ret;'''.format(**fmtspec)\n    elif has_to_be_emulated(simd_ext, typ):\n        return '''nsimd_{simd_ext}_v{typ} ret;\n                  ret.v0 = {in0}[{in1}.v0];\n                  ret.v1 = {in0}[{in1}.v1];\n                  return ret;'''.format(**fmtspec)\n    return '''nsimd_{simd_ext}_v{typ} ret;\n              ret = vec_splats({in0}[vec_extract({in1}, 0)]);\n              '''.format(**fmtspec) + \\\n           '\\n'.join('ret = 
vec_insert({in0}[vec_extract({in1}, {i})], ' \\\n                     'ret, {i});'.format(i=i, **fmtspec) \\\n                     for i in range(1, get_len(typ))) + '\\n' + \\\n           'return ret;'\n\n# -----------------------------------------------------------------------------\n\ndef gather_linear(simd_ext, typ):\n    if typ == 'f16':\n        return \\\n        '''nsimd_{simd_ext}_v{typ} ret;\n           ret.v0 = vec_splats(nsimd_f16_to_f32({in0}[0]));\n           ret.v0 = vec_insert(nsimd_f16_to_f32({in0}[{in1}]), ret.v0, 1);\n           ret.v0 = vec_insert(nsimd_f16_to_f32({in0}[2 * {in1}]), ret.v0, 2);\n           ret.v0 = vec_insert(nsimd_f16_to_f32({in0}[3 * {in1}]), ret.v0, 3);\n           ret.v1 = vec_splats(nsimd_f16_to_f32({in0}[4 * {in1}]));\n           ret.v1 = vec_insert(nsimd_f16_to_f32({in0}[5 * {in1}]), ret.v1, 1);\n           ret.v1 = vec_insert(nsimd_f16_to_f32({in0}[6 * {in1}]), ret.v1, 2);\n           ret.v1 = vec_insert(nsimd_f16_to_f32({in0}[7 * {in1}]), ret.v1, 3);\n           return ret;'''.format(**fmtspec)\n    elif has_to_be_emulated(simd_ext, typ):\n        return '''nsimd_{simd_ext}_v{typ} ret;\n                  ret.v0 = {in0}[0];\n                  ret.v1 = {in0}[{in1}];\n                  return ret;'''.format(**fmtspec)\n    return '''nsimd_{simd_ext}_v{typ} ret;\n           ret = vec_splats({in0}[0]);\n           '''.format(**fmtspec) + \\\n        '\\n'.join('ret = vec_insert({in0}[{in1} * {i}], ret, {i});'. 
\\\n                  format(i=i, **fmtspec) for i in range(1, get_len(typ))) + \\\n        '\\nreturn ret;'\n\n# -----------------------------------------------------------------------------\n\ndef scatter_linear(simd_ext, typ):\n    if typ == 'f16':\n        return \\\n        '''{in0}[0] = nsimd_f32_to_f16(vec_extract({in2}.v0, 0));\n           {in0}[{in1}] = nsimd_f32_to_f16(vec_extract({in2}.v0, 1));\n           {in0}[2 * {in1}] = nsimd_f32_to_f16(vec_extract({in2}.v0, 2));\n           {in0}[3 * {in1}] = nsimd_f32_to_f16(vec_extract({in2}.v0, 3));\n           {in0}[4 * {in1}] = nsimd_f32_to_f16(vec_extract({in2}.v1, 0));\n           {in0}[5 * {in1}] = nsimd_f32_to_f16(vec_extract({in2}.v1, 1));\n           {in0}[6 * {in1}] = nsimd_f32_to_f16(vec_extract({in2}.v1, 2));\n           {in0}[7 * {in1}] = nsimd_f32_to_f16(vec_extract({in2}.v1, 3));'''. \\\n           format(**fmtspec)\n    elif has_to_be_emulated(simd_ext, typ):\n        return '''{in0}[0] = {in2}.v0;\n                  {in0}[{in1}] = {in2}.v1;'''.format(**fmtspec)\n    return '\\n'.join(['{in0}[{in1} * {i}] = vec_extract({in2}, {i});'. 
\\\n                      format(i=i, **fmtspec) for i in range(get_len(typ))])\n\n# -----------------------------------------------------------------------------\n\ndef maskoz_load(oz, simd_ext, typ):\n    if typ == 'f16':\n        return \\\n        '''nsimd_{simd_ext}_vf16 ret;\n           ret.v0 = vec_splats(0.0f);\n           ret.v0 = vec_insert(vec_extract({in0}.v0, 0) ?\n                               nsimd_f16_to_f32({in1}[0]) : {oz0}, ret.v0, 0);\n           ret.v0 = vec_insert(vec_extract({in0}.v0, 1) ?\n                               nsimd_f16_to_f32({in1}[1]) : {oz1}, ret.v0, 1);\n           ret.v0 = vec_insert(vec_extract({in0}.v0, 2) ?\n                               nsimd_f16_to_f32({in1}[2]) : {oz2}, ret.v0, 2);\n           ret.v0 = vec_insert(vec_extract({in0}.v0, 3) ?\n                               nsimd_f16_to_f32({in1}[3]) : {oz3}, ret.v0, 3);\n           ret.v1 = ret.v0;\n           ret.v1 = vec_insert(vec_extract({in0}.v1, 0) ?\n                               nsimd_f16_to_f32({in1}[4]) : {oz4}, ret.v1, 0);\n           ret.v1 = vec_insert(vec_extract({in0}.v1, 1) ?\n                               nsimd_f16_to_f32({in1}[5]) : {oz5}, ret.v1, 1);\n           ret.v1 = vec_insert(vec_extract({in0}.v1, 2) ?\n                               nsimd_f16_to_f32({in1}[6]) : {oz6}, ret.v1, 2);\n           ret.v1 = vec_insert(vec_extract({in0}.v1, 3) ?\n                               nsimd_f16_to_f32({in1}[7]) : {oz7}, ret.v1, 3);\n           return ret;'''. 
\\\n           format(oz0='0.0f' if oz == 'z' else 'vec_extract({in2}.v0, 0)',\n                  oz1='0.0f' if oz == 'z' else 'vec_extract({in2}.v0, 1)',\n                  oz2='0.0f' if oz == 'z' else 'vec_extract({in2}.v0, 2)',\n                  oz3='0.0f' if oz == 'z' else 'vec_extract({in2}.v0, 3)',\n                  oz4='0.0f' if oz == 'z' else 'vec_extract({in2}.v1, 0)',\n                  oz5='0.0f' if oz == 'z' else 'vec_extract({in2}.v1, 1)',\n                  oz6='0.0f' if oz == 'z' else 'vec_extract({in2}.v1, 2)',\n                  oz7='0.0f' if oz == 'z' else 'vec_extract({in2}.v1, 3)',\n                  **fmtspec).format(**fmtspec)\n    elif has_to_be_emulated(simd_ext, typ):\n        if oz == 'z':\n            return '''nsimd_{simd_ext}_v{typ} ret;\n                      ret.v0 = {in0}.v0 ? {in1}[0] : ({typ})0;\n                      ret.v1 = {in0}.v1 ? {in1}[1] : ({typ})0;\n                      return ret;'''.format(**fmtspec)\n        else:\n            return '''nsimd_{simd_ext}_v{typ} ret;\n                      ret.v0 = {in0}.v0 ? {in1}[0] : {in2}.v0;\n                      ret.v1 = {in0}.v1 ? {in1}[1] : {in2}.v1;\n                      return ret;'''.format(**fmtspec)\n    return 'nsimd_{simd_ext}_v{typ} ret = {zeros};\\n'.format(**fmtspec) + \\\n           '\\n'.join(\n           '''if (vec_extract({in0}, {i})) {{\n                ret = vec_insert({in1}[{i}], ret, {i});\n              }} else {{\n                ret = vec_insert({v}, ret, {i});\n              }}'''.format(i=i, v='({})0'.format(typ) if oz == 'z' \\\n                                else 'vec_extract({in2}, {i})'. 
\\\n                                     format(i=i, **fmtspec), **fmtspec) \\\n                                     for i in range(get_len(typ))) + \\\n                                     '\\nreturn ret;'\n\n# -----------------------------------------------------------------------------\n\ndef mask_store(simd_ext, typ):\n    if typ == 'f16':\n        return \\\n        '''if (vec_extract({in0}.v0, 0)) {{\n             {in1}[0] = nsimd_f32_to_f16(vec_extract({in2}.v0, 0));\n           }}\n           if (vec_extract({in0}.v0, 1)) {{\n             {in1}[1] = nsimd_f32_to_f16(vec_extract({in2}.v0, 1));\n           }}\n           if (vec_extract({in0}.v0, 2)) {{\n             {in1}[2] = nsimd_f32_to_f16(vec_extract({in2}.v0, 2));\n           }}\n           if (vec_extract({in0}.v0, 3)) {{\n             {in1}[3] = nsimd_f32_to_f16(vec_extract({in2}.v0, 3));\n           }}\n           if (vec_extract({in0}.v1, 0)) {{\n             {in1}[4] = nsimd_f32_to_f16(vec_extract({in2}.v1, 0));\n           }}\n           if (vec_extract({in0}.v1, 1)) {{\n             {in1}[5] = nsimd_f32_to_f16(vec_extract({in2}.v1, 1));\n           }}\n           if (vec_extract({in0}.v1, 2)) {{\n             {in1}[6] = nsimd_f32_to_f16(vec_extract({in2}.v1, 2));\n           }}\n           if (vec_extract({in0}.v1, 3)) {{\n             {in1}[7] = nsimd_f32_to_f16(vec_extract({in2}.v1, 3));\n           }}'''.format(**fmtspec)\n    elif has_to_be_emulated(simd_ext, typ):\n        return '''if ({in0}.v0) {{\n                    {in1}[0] = {in2}.v0;\n                  }}\n                  if ({in0}.v1) {{\n                    {in1}[1] = {in2}.v1;\n                  }}'''.format(**fmtspec)\n    return '\\n'.join(\n           '''if (vec_extract({in0}, {i})) {{\n                {in1}[{i}] = vec_extract({in2}, {i});\n              }}'''.format(i=i, **fmtspec) for i in range(get_len(typ)))\n\n# -----------------------------------------------------------------------------\n\ndef to_logical(simd_ext, 
typ):\n    if typ == 'f16':\n        return emulate_f16('to_logical', simd_ext, ['l', 'v'])\n    elif has_to_be_emulated(simd_ext, typ):\n        if typ in ['i64', 'u64']:\n            return '''nsimd_{simd_ext}_vl{typ} ret;\n                      ret.v0 = (u64)({in0}.v0 != ({typ})0 ? -1 : 0);\n                      ret.v1 = (u64)({in0}.v1 != ({typ})0 ? -1 : 0);\n                      return ret;'''.format(**fmtspec)\n        elif typ == 'f64':\n            return '''nsimd_{simd_ext}_vl{typ} ret;\n                      ret.v0 = (u64)(nsimd_scalar_reinterpret_u64_f64(\n                                       {in0}.v0) != (u64)0 ? -1 : 0);\n                      ret.v1 = (u64)(nsimd_scalar_reinterpret_u64_f64(\n                                       {in0}.v1) != (u64)0 ? -1 : 0);\n                      return ret;'''.format(**fmtspec)\n    elif typ in common.iutypes:\n        return 'return nsimd_ne_{simd_ext}_{typ}({in0}, {zeros});'. \\\n               format(**fmtspec)\n    elif typ in ['f32', 'f64']:\n        return '''return nsimd_ne_{simd_ext}_u{typnbits}(\n                             nsimd_reinterpret_{simd_ext}_u{typnbits}_{typ}(\n                                 {in0}), vec_splats((u{typnbits})0));'''. \\\n                                 format(**fmtspec)\n\n# -----------------------------------------------------------------------------\n\ndef get_impl(opts, func, simd_ext, from_typ, to_typ):\n    global fmtspec\n\n    fmtspec = {\n        'simd_ext': simd_ext,\n        'typ': from_typ,\n        'styp': get_type(opts, simd_ext, from_typ, to_typ),\n        'from_typ': from_typ,\n        'to_typ': to_typ,\n        'in0': common.in0,\n        'in1': common.in1,\n        'in2': common.in2,\n        'in3': common.in3,\n        'in4': common.in4,\n        'in5': common.in5,\n        'zeros': 'vec_splats(({})0)'.format(from_typ),\n        'lzeros': '({})vec_splats((u{})0)'. 
\\\n                  format(native_typel(from_typ), from_typ[1:]) \\\n                  if not has_to_be_emulated(simd_ext, from_typ) else '',\n        'typnbits': from_typ[1:]\n    }\n\n    impls = {\n        'loada': 'load1234(simd_ext, from_typ, 1, True)',\n        'load2a': 'load1234(simd_ext, from_typ, 2, True)',\n        'load3a': 'load1234(simd_ext, from_typ, 3, True)',\n        'load4a': 'load1234(simd_ext, from_typ, 4, True)',\n        'loadu': 'load1234(simd_ext, from_typ, 1, False)',\n        'load2u': 'load1234(simd_ext, from_typ, 2, False)',\n        'load3u': 'load1234(simd_ext, from_typ, 3, False)',\n        'load4u': 'load1234(simd_ext, from_typ, 4, False)',\n        'storea': 'store1234(simd_ext, from_typ, 1, True)',\n        'store2a': 'store1234(simd_ext, from_typ, 2, True)',\n        'store3a': 'store1234(simd_ext, from_typ, 3, True)',\n        'store4a': 'store1234(simd_ext, from_typ, 4, True)',\n        'storeu': 'store1234(simd_ext, from_typ, 1, False)',\n        'store2u': 'store1234(simd_ext, from_typ, 2, False)',\n        'store3u': 'store1234(simd_ext, from_typ, 3, False)',\n        'store4u': 'store1234(simd_ext, from_typ, 4, False)',\n        'andb': 'binary_op2(\"andb\", simd_ext, from_typ)',\n        'xorb': 'binary_op2(\"xorb\", simd_ext, from_typ)',\n        'orb': 'binary_op2(\"orb\", simd_ext, from_typ)',\n        'andl': 'logical_op2(\"andl\", simd_ext, from_typ)',\n        'xorl': 'logical_op2(\"xorl\", simd_ext, from_typ)',\n        'orl': 'logical_op2(\"orl\", simd_ext, from_typ)',\n        'notb': 'not1(simd_ext, from_typ)',\n        'notl': 'lnot1(simd_ext, from_typ)',\n        'andnotb': 'binary_op2(\"andnotb\", simd_ext, from_typ)',\n        'andnotl': 'logical_op2(\"andnotl\", simd_ext, from_typ)',\n        'add': 'simple_op2(\"add\", simd_ext, from_typ)',\n        'adds': 'add_sub_s(\"adds\",simd_ext, from_typ)',\n        'sub': 'simple_op2(\"sub\", simd_ext, from_typ)',\n        'subs': 'add_sub_s(\"subs\",simd_ext, 
from_typ)',\n        'div': 'div2(simd_ext, from_typ)',\n        'sqrt': 'sqrt1(simd_ext, from_typ)',\n        'len': 'len1(simd_ext, from_typ)',\n        'mul': 'simple_op2(\"mul\", simd_ext, from_typ)',\n        'shl': 'shift2(\"shl\", simd_ext, from_typ)',\n        'shr': 'shift2(\"shr\", simd_ext, from_typ)',\n        'shra': 'shift2(\"shra\", simd_ext, from_typ)',\n        'set1': 'set1(simd_ext, from_typ)',\n        'set1l': 'lset1(simd_ext, from_typ)',\n        'eq': 'cmp2(\"eq\", simd_ext, from_typ)',\n        'lt': 'cmp2(\"lt\", simd_ext, from_typ)',\n        'le': 'cmp2(\"le\", simd_ext, from_typ)',\n        'gt': 'cmp2(\"gt\", simd_ext, from_typ)',\n        'ge': 'cmp2(\"ge\", simd_ext, from_typ)',\n        'ne': 'cmp2(\"ne\", simd_ext, from_typ)',\n        'if_else1': 'if_else3(simd_ext, from_typ)',\n        'min': 'minmax2(\"min\", simd_ext, from_typ)',\n        'max': 'minmax2(\"max\", simd_ext, from_typ)',\n        'loadla': 'loadl(True, simd_ext, from_typ)',\n        'loadlu': 'loadl(False, simd_ext, from_typ)',\n        'storela': 'storel(True, simd_ext, from_typ)',\n        'storelu': 'storel(False, simd_ext, from_typ)',\n        'abs': 'abs1(simd_ext, from_typ)',\n        'fma': 'fma(\"fma\", simd_ext, from_typ)',\n        'fnma': 'fma(\"fnma\", simd_ext, from_typ)',\n        'fms': 'fma(\"fms\", simd_ext, from_typ)',\n        'fnms': 'fma(\"fnms\", simd_ext, from_typ)',\n        'ceil': 'round1(\"ceil\", simd_ext, from_typ)',\n        'floor': 'round1(\"floor\", simd_ext, from_typ)',\n        'trunc': 'round1(\"trunc\", simd_ext, from_typ)',\n        'round_to_even': 'round1(\"round_to_even\", simd_ext, from_typ)',\n        'all': 'allany1(\"all\", simd_ext, from_typ)',\n        'any': 'allany1(\"any\", simd_ext, from_typ)',\n        'reinterpret': 'reinterpret1(simd_ext, from_typ, to_typ)',\n        'reinterpretl': 'reinterpretl1(simd_ext, from_typ, to_typ)',\n        'cvt': 'convert1(simd_ext, from_typ, to_typ)',\n        'rec8': 
'recs1(\"rec8\", simd_ext, from_typ)',\n        'rec11': 'recs1(\"rec11\", simd_ext, from_typ)',\n        'rsqrt8': 'recs1(\"rsqrt8\", simd_ext, from_typ)',\n        'rsqrt11': 'recs1(\"rsqrt11\", simd_ext, from_typ)',\n        'rec': 'recs1(\"rec\", simd_ext, from_typ)',\n        'neg': 'neg1(simd_ext, from_typ)',\n        'nbtrue': 'nbtrue1(simd_ext, from_typ)',\n        'reverse': 'reverse1(simd_ext, from_typ)',\n        'addv': 'addv(simd_ext, from_typ)',\n        'upcvt': 'upcvt1(simd_ext, from_typ, to_typ)',\n        'downcvt': 'downcvt1(simd_ext, from_typ, to_typ)',\n        'iota': 'iota(simd_ext, from_typ)',\n        'to_logical': 'to_logical(simd_ext, from_typ)',\n        'mask_for_loop_tail': 'mask_for_loop_tail(simd_ext, from_typ)',\n        'masko_loadu1': 'maskoz_load(\"o\", simd_ext, from_typ)',\n        'maskz_loadu1': 'maskoz_load(\"z\", simd_ext, from_typ)',\n        'masko_loada1': 'maskoz_load(\"o\", simd_ext, from_typ)',\n        'maskz_loada1': 'maskoz_load(\"z\", simd_ext, from_typ)',\n        'mask_storea1': 'mask_store(simd_ext, from_typ)',\n        'mask_storeu1': 'mask_store(simd_ext, from_typ)',\n        'gather': 'gather(simd_ext, from_typ)',\n        'scatter': 'scatter(simd_ext, from_typ)',\n        'gather_linear': 'gather_linear(simd_ext, from_typ)',\n        'scatter_linear': 'scatter_linear(simd_ext, from_typ)',\n        'to_mask': 'to_mask(simd_ext, from_typ)',\n        'ziplo': 'zip(\"ziplo\", simd_ext, from_typ)',\n        'ziphi': 'zip(\"ziphi\", simd_ext, from_typ)',\n        'zip': 'zip_unzip_basic(\"zip\", simd_ext, from_typ)',\n        'unzip': 'zip_unzip_basic(\"unzip\", simd_ext, from_typ)',\n        'unziplo': 'unzip(\"unziplo\", simd_ext, from_typ)',\n        'unziphi': 'unzip(\"unziphi\", simd_ext, from_typ)'\n    }\n    if simd_ext not in get_simd_exts():\n        raise ValueError('Unknown SIMD extension \"{}\"'.format(simd_ext))\n    if not from_typ in common.types:\n        raise ValueError('Unknown type 
\"{}\"'.format(from_typ))\n    if not func in impls:\n        return common.NOT_IMPLEMENTED\n    else:\n        return eval(impls[func])\n"
  },
  {
    "path": "egg/platform_x86.py",
    "content": "# Copyright (c) 2019 Agenium Scale\n#\n# Permission is hereby granted, free of charge, to any person obtaining a copy\n# of this software and associated documentation files (the \"Software\"), to deal\n# in the Software without restriction, including without limitation the rights\n# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell\n# copies of the Software, and to permit persons to whom the Software is\n# furnished to do so, subject to the following conditions:\n#\n# The above copyright notice and this permission notice shall be included in all\n# copies or substantial portions of the Software.\n#\n# THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\n# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\n# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\n# SOFTWARE.\n\n# This file gives the implementation of platform x86, i.e. Intel/AMD SIMD.\n# Reading this file is NOT straightforward. X86 SIMD extensions is a mess.\n# This script nonetheless tries to be as readable as possible. 
It implements\n# SSE2, SSE42, AVX, AVX2, AVX512 as found on KNLs and AVX512 as found on Xeon\n# Skylakes.\n\nimport common\nimport x86_load_store_deg234 as ldst234\n\n# -----------------------------------------------------------------------------\n# Helpers\n\nsse = ['sse2', 'sse42']\navx = ['avx', 'avx2']\navx512 = ['avx512_knl', 'avx512_skylake']\n\n# -----------------------------------------------------------------------------\n# Implementation of mandatory functions for this module\n\n\ndef get_simd_exts():\n    return ['sse2', 'sse42', 'avx', 'avx2', 'avx512_knl', 'avx512_skylake']\n\n\ndef get_prev_simd_ext(simd_ext):\n    if simd_ext == 'sse2':\n        return 'cpu'\n    elif simd_ext == 'sse42':\n        return 'sse2'\n    elif simd_ext == 'avx':\n        return 'sse42'\n    elif simd_ext == 'avx2':\n        return 'avx'\n    elif simd_ext in avx512:\n        return 'avx2'\n    raise ValueError('Unknown SIMD extension \"{}\"'.format(simd_ext))\n\n\ndef emulate_fp16(simd_ext):\n    if not simd_ext in get_simd_exts():\n        raise ValueError('Unknown SIMD extension \"{}\"'.format(simd_ext))\n    return True\n\n\ndef get_native_typ(simd_ext, typ):\n    # Number of bits\n    if simd_ext in sse:\n        bits = '128'\n    elif simd_ext in avx:\n        bits = '256'\n    elif simd_ext in avx512:\n        bits = '512'\n    else:\n        raise ValueError('Unknown SIMD extension \"{}\"'.format(simd_ext))\n    if typ == 'f32':\n        return '__m{}'.format(bits)\n    elif typ == 'f64':\n        return '__m{}d'.format(bits)\n    elif typ in common.iutypes:\n        return '__m{}i'.format(bits)\n\n\ndef get_type(opts, simd_ext, typ, nsimd_typ):\n    if typ not in common.types:\n        raise ValueError('Unknown type \"{}\"'.format(typ))\n    if typ == 'f16':\n        return 'typedef struct {{{t} v0; {t} v1; }} {nsimd_typ};'. 
\\\n               format(t=get_native_typ(simd_ext, 'f32'), nsimd_typ=nsimd_typ)\n    else:\n        return 'typedef {} {};'.format(get_native_typ(simd_ext, typ),\n                                       nsimd_typ)\n\n\ndef get_logical_type(opts, simd_ext, typ, nsimd_typ):\n    if typ not in common.types:\n        raise ValueError('Unknown type \"{}\"'.format(typ))\n    if simd_ext in sse + avx:\n        return get_type(opts, simd_ext, typ, nsimd_typ)\n    elif simd_ext in avx512:\n        if typ == 'f16':\n            return 'typedef struct {{ __mmask16 v0; __mmask16 v1; }} {};'. \\\n                   format(nsimd_typ)\n        return 'typedef __mmask{} {};'. \\\n               format(512 // common.bitsize(typ), nsimd_typ)\n    else:\n        raise ValueError('Unknown SIMD extension \"{}\"'.format(simd_ext))\n\n\ndef get_nb_registers(simd_ext):\n    if simd_ext in sse + avx:\n        return '16'\n    elif simd_ext in avx512:\n        return '32'\n    else:\n        raise ValueError('Unknown SIMD extension \"{}\"'.format(simd_ext))\n\n\ndef has_compatible_SoA_types(simd_ext):\n    if simd_ext not in sse + avx + avx512:\n        raise ValueError('Unknown SIMD extension \"{}\"'.format(simd_ext))\n    else:\n        return False\n\n\ndef get_additional_include(func, platform, simd_ext):\n    ret = ''\n    if simd_ext == 'sse2':\n        ret += '''#include <nsimd/cpu/cpu/{}.h>\n                  '''.format(func)\n    elif simd_ext == 'sse42':\n        ret += '''#include <nsimd/x86/sse2/{}.h>\n                  '''.format(func)\n    elif simd_ext == 'avx':\n        ret += '''#include <nsimd/x86/sse42/{}.h>\n                  '''.format(func)\n    elif simd_ext == 'avx2':\n        ret += '''#include <nsimd/x86/avx/{}.h>\n                  '''.format(func)\n    elif simd_ext == 'avx512_knl':\n        ret += '''#include <nsimd/x86/avx2/{}.h>\n                  '''.format(func)\n    elif simd_ext == 'avx512_skylake':\n        ret += '''#include <nsimd/x86/avx2/{}.h>\n      
            '''.format(func)\n    if func == 'shra':\n        ret += '''#include <nsimd/x86/{simd_ext}/shr.h>\n                  '''.format(simd_ext=simd_ext)\n    if func in ['loadla', 'loadlu', 'storela', 'storelu']:\n        ret += '''#include <nsimd/x86/{simd_ext}/set1.h>\n                  # include <nsimd/x86/{simd_ext}/eq.h>\n                  # include <nsimd/x86/{simd_ext}/notl.h>\n                  # include <nsimd/x86/{simd_ext}/if_else1.h>\n                  '''.format(simd_ext=simd_ext)\n    if func in ['masko_loada1', 'masko_loadu1', 'maskz_loada1',\n                'maskz_loadu1', 'mask_storea1', 'mask_storeu1']:\n        ret += '''#include <nsimd/scalar_utilities.h>\n                  '''\n    if func in ['notb']:\n        ret += '''#include <nsimd/x86/{simd_ext}/andnotb.h>\n                  '''.format(simd_ext=simd_ext)\n    if func in ['notl']:\n        ret += '''#include <nsimd/x86/{simd_ext}/andnotb.h>\n                  # include <nsimd/x86/{simd_ext}/andnotl.h>\n                  '''.format(simd_ext=simd_ext)\n    if func in ['min', 'max']:\n        ret += '''#include <nsimd/x86/{simd_ext}/gt.h>\n                  '''.format(simd_ext=simd_ext)\n    if func in ['lt']:\n        ret += '''#include <nsimd/x86/{simd_ext}/gt.h>\n                  '''.format(simd_ext=simd_ext)\n    if func in ['ge']:\n        ret += '''#include <nsimd/x86/{simd_ext}/lt.h>\n                  '''.format(simd_ext=simd_ext)\n    if func in ['if_else1']:\n        ret += '''#include <nsimd/x86/{simd_ext}/notb.h>\n                  # include <nsimd/x86/{simd_ext}/orb.h>\n                  # include <nsimd/x86/{simd_ext}/andnotb.h>\n                  # include <nsimd/x86/{simd_ext}/andb.h>\n                  '''.format(simd_ext=simd_ext)\n    if func in ['abs']:\n        ret += '''#include <nsimd/x86/{simd_ext}/if_else1.h>\n                  # include <nsimd/x86/{simd_ext}/set1.h>\n                  '''.format(simd_ext=simd_ext)\n    if func == 'reinterpretl' and simd_ext 
in ['sse', 'avx']:\n        ret += '''#include <nsimd/x86/{simd_ext}/storeu.h>\n                  # include <nsimd/x86/{simd_ext}/loadu.h>\n                  '''.format(simd_ext=simd_ext)\n    if func == 'upcvt':\n        ret += '''#include <nsimd/x86/{simd_ext}/cvt.h>\n                  '''.format(simd_ext=simd_ext)\n    if func == 'ziplo' and simd_ext in ['avx512_knl', 'avx512_skylake']:\n        ret += '''#include <nsimd/x86/avx2/ziphi.h>\n                  '''.format(simd_ext=simd_ext)\n    if func == 'ziphi' and simd_ext in ['avx512_knl', 'avx512_skylake']:\n        ret += '''#include <nsimd/x86/avx2/ziplo.h>\n                  '''.format(simd_ext=simd_ext)\n    if func == 'zip':\n        ret += '''#include <nsimd/x86/{simd_ext}/ziplo.h>\n                  #include <nsimd/x86/{simd_ext}/ziphi.h>\n                  '''.format(simd_ext=simd_ext)\n    if func == 'unzip':\n        ret += '''#include <nsimd/x86/{simd_ext}/unziplo.h>\n                  #include <nsimd/x86/{simd_ext}/unziphi.h>\n                  '''.format(simd_ext=simd_ext)\n    if simd_ext in avx512 and func in ['loadlu', 'loadla']:\n        ret += '''\n                  # if NSIMD_CXX > 0\n                  extern \"C\" {{\n                  # endif\n\n                  NSIMD_INLINE nsimd_{simd_ext}_vlu16 NSIMD_VECTORCALL\n                  nsimd_{func}_{simd_ext}_u16(const u16*);\n\n                  # if NSIMD_CXX > 0\n                  }} // extern \"C\"\n                  # endif\n                  '''.format(func=func, simd_ext=simd_ext)\n    if func in ['load2u', 'load3u', 'load4u', 'load2a', 'load3a', 'load4a']:\n        ret += '''\n                  # include <nsimd/x86/{simd_ext}/loadu.h>\n                  # include <nsimd/x86/{simd_ext}/storeu.h>\n\n                  # if NSIMD_CXX > 0\n                  extern \"C\" {{\n                  # endif\n\n                  NSIMD_INLINE nsimd_{simd_ext}_vu16x{deg} NSIMD_VECTORCALL\n                  nsimd_{func}_{simd_ext}_u16(const 
u16*);\n\n                  # if NSIMD_CXX > 0\n                  }} // extern \"C\"\n                  # endif\n                  '''.format(func=func, deg=func[4], simd_ext=simd_ext)\n    if func in ['store2u', 'store3u', 'store4u', 'store2a', 'store3a',\n                'store4a']:\n        deg = func[5]\n        args = ','.join(['nsimd_{simd_ext}_vu16'.format(simd_ext=simd_ext)\n                         for i in range(1, int(deg) + 1)])\n        ret += '''\n                  # include <nsimd/x86/{simd_ext}/loadu.h>\n                  # include <nsimd/x86/{simd_ext}/storeu.h>\n\n                  # if NSIMD_CXX > 0\n                  extern \"C\" {{\n                  # endif\n\n                  NSIMD_INLINE void NSIMD_VECTORCALL\n                  nsimd_{func}_{simd_ext}_u16(u16*, {args});\n\n                  # if NSIMD_CXX > 0\n                  }} // extern \"C\"\n                  # endif\n                  '''.format(func=func, deg=deg, args=args, simd_ext=simd_ext)\n    if func == 'to_logical':\n        ret += '''#include <nsimd/x86/{simd_ext}/ne.h>\n                  #include <nsimd/x86/{simd_ext}/reinterpretl.h>\n                  '''.format(simd_ext=simd_ext)\n    if func == 'adds':\n        ret += '''#include <nsimd/x86/{simd_ext}/reinterpret.h>\n                  #include <nsimd/x86/{simd_ext}/add.h>\n                  #include <nsimd/x86/{simd_ext}/set1.h>\n                  #include <nsimd/x86/{simd_ext}/shr.h>\n                  #include <nsimd/x86/{simd_ext}/orb.h>\n                  #include <nsimd/x86/{simd_ext}/xorb.h>\n                  #include <nsimd/x86/{simd_ext}/notb.h>\n                  #include <nsimd/x86/{simd_ext}/if_else1.h>\n\n                  #if NSIMD_CXX > 0\n                      #include <climits>\n                  #else\n                      #include <limits.h>\n                  #endif\n                  ''' .format(simd_ext=simd_ext)\n        if simd_ext in avx512:\n            ret += '''#include 
<nsimd/x86/{simd_ext}/to_logical.h>\n                      '''.format(simd_ext=simd_ext)\n    if func == 'subs':\n        ret += '''#include <nsimd/x86/{simd_ext}/adds.h>\n                  #include <nsimd/x86/{simd_ext}/neg.h>\n                  #include <nsimd/x86/{simd_ext}/sub.h>\n                  #include <nsimd/x86/{simd_ext}/gt.h>\n                  #include <nsimd/x86/{simd_ext}/set1.h>\n                  #include <nsimd/x86/{simd_ext}/if_else1.h>\n                  '''.format(simd_ext=simd_ext)\n    if func == 'mask_for_loop_tail':\n        ret += '''#include <nsimd/x86/{simd_ext}/lt.h>\n                  #include <nsimd/x86/{simd_ext}/set1l.h>\n                  #include <nsimd/x86/{simd_ext}/iota.h>\n                  #include <nsimd/x86/{simd_ext}/set1.h>\n                  '''.format(simd_ext=simd_ext)\n    return ret\n\n# -----------------------------------------------------------------------------\n# Function prefixes and suffixes\n\ndef pre(simd_ext):\n    # Number of bits\n    if simd_ext in sse:\n        bits = ''\n    elif simd_ext in avx:\n        bits = '256'\n    elif simd_ext in avx512:\n        bits = '512'\n    else:\n        raise ValueError('Unknown SIMD extension \"{}\"'.format(simd_ext))\n    return '_mm{}_'.format(bits)\n\ndef suf_ep(typ):\n    if typ == 'f16':\n        return '_ph'\n    elif typ == 'f32':\n        return '_ps'\n    elif typ == 'f64':\n        return '_pd'\n    elif typ in common.iutypes:\n        return '_epi{}'.format(typ[1:])\n    else:\n        raise ValueError('Unknown type \"{}\"'.format(typ))\n\ndef nbits(simd_ext):\n    if simd_ext in sse:\n        return '128'\n    elif simd_ext in avx:\n        return '256'\n    else:\n        return '512'\n\ndef suf_si(simd_ext, typ):\n    if typ == 'f16':\n        return '_ph'\n    elif typ == 'f32':\n        return '_ps'\n    elif typ == 'f64':\n        return '_pd'\n    elif typ in common.iutypes:\n        return '_si{}'.format(nbits(simd_ext))\n    else:\n        raise 
ValueError('Unknown type \"{}\"'.format(typ))\n\n# -----------------------------------------------------------------------------\n# Other helper functions\n\nfmtspec = {}\n\nLO = 0\nHI = 1\n\ndef castsi(simd_ext, typ):\n    if typ in common.ftypes:\n        return ''\n    else:\n        return '(__m{}i *)'.format(nbits(simd_ext))\n\ndef extract(simd_ext, typ, lohi, var):\n    if simd_ext in avx:\n        lohi_arg = '0' if lohi == LO else '1'\n        if typ == 'f32':\n            if lohi == LO:\n                return '_mm256_castps256_ps128({})'.format(var)\n            else:\n                return '_mm256_extractf128_ps({}, 1)'.format(var)\n        elif typ == 'f64':\n            if lohi == LO:\n                return '_mm256_castpd256_pd128({})'.format(var)\n            else:\n                return '_mm256_extractf128_pd({}, 1)'.format(var)\n        else:\n            if lohi == LO:\n                return '_mm256_castsi256_si128({})'.format(var)\n            else:\n                return '_mm256_extractf128_si256({}, 1)'.format(var)\n    elif simd_ext in avx512:\n        lohi_arg = '0' if lohi == LO else '1'\n        if typ == 'f32':\n            if lohi == LO:\n                return '_mm512_castps512_ps256({})'.format(var)\n            else:\n                return '''_mm256_castsi256_ps(_mm512_extracti64x4_epi64(\n                              _mm512_castps_si512({}), 1))'''.format(var)\n        elif typ == 'f64':\n            if lohi == LO:\n                return '_mm512_castpd512_pd256({})'.format(var)\n            else:\n                return '_mm512_extractf64x4_pd({}, 1)'.format(var)\n        else:\n            if lohi == LO:\n                return '_mm512_castsi512_si256({})'.format(var)\n            else:\n                return '_mm512_extracti64x4_epi64({}, 1)'.format(var)\n\ndef setr(simd_ext, typ, var1, var2):\n    if simd_ext in avx:\n        if typ == 'f32':\n            return '''_mm256_insertf128_ps(_mm256_castps128_ps256(\n               
         {}), {}, 1)'''.format(var1, var2)\n        elif typ == 'f64':\n            return '''_mm256_insertf128_pd(_mm256_castpd128_pd256(\n                        {}), {}, 1)'''.format(var1, var2)\n        else:\n            return '''_mm256_insertf128_si256(_mm256_castsi128_si256(\n                        {}), {}, 1)'''.format(var1, var2)\n    elif simd_ext in avx512:\n        if typ == 'f32':\n            return '''_mm512_castpd_ps(_mm512_insertf64x4(_mm512_castps_pd(\n                        _mm512_castps256_ps512({})), _mm256_castps_pd(\n                          {}), 1))'''. \\\n                      format(var1, var2)\n        elif typ == 'f64':\n            return '''_mm512_insertf64x4(_mm512_castpd256_pd512(\n                        {}), {}, 1)'''.format(var1, var2)\n        else:\n            return '''_mm512_inserti64x4(_mm512_castsi256_si512(\n                        {}), {}, 1)'''.format(var1, var2)\n\ndef set_lane(simd_ext, typ, var_name, scalar, i):\n    # No code for f16's\n    if typ == 'f16':\n        return ''\n\n    # Inserting a\n\n    # Code for reinterpreting bits of input:\n    #   All intrinscis manipulates only integers. So we use them.\n    if typ in ['u8', 'u16']:\n        vin0 = var_name\n        if simd_ext in sse:\n            vin1 = '(int)({})'.format(scalar)\n        else:\n            vin1 = scalar\n    if typ in ['i8', 'i16']:\n        vin0 = var_name\n        vin1 =  '(int)nsimd_scalar_reinterpret_{}_{}({})'. \\\n               format('u' + typ[1:], typ, scalar)\n    elif typ in ['i32', 'i64']:\n        vin0 = var_name\n        vin1 = scalar\n    elif typ in ['u32', 'f32', 'u64', 'f64']:\n        if typ in ['u32', 'u64']:\n            vin0 = var_name\n        else:\n            vin0 = '{pre}cast{pspd}_si{nbits}({var_name})'. \\\n                   format(pspd='ps' if typ == 'f32' else 'pd',\n                          var_name=var_name, **fmtspec)\n        vin1 = 'nsimd_scalar_reinterpret_{}_{}({})'. 
\\\n               format('i' + typ[1:], typ, scalar)\n\n    # Code for inserting bits\n    if simd_ext == 'sse2':\n        if typ[1:] == '8':\n            if i % 2 == 0:\n                tmp = '_mm_insert_epi16({vin0}, ' \\\n                      '(_mm_extract_epi16({vin0}, {io2}) & 65280) | {vin1}, ' \\\n                      '{io2})'.format(vin0=vin0, vin1=vin1, io2=int(i // 2))\n            else:\n                tmp = '_mm_insert_epi16({vin0}, ' \\\n                      '(_mm_extract_epi16({vin0}, {io2}) & 255) | ' \\\n                      '({vin1} << 8), {io2})'. \\\n                      format(vin0=vin0, vin1=vin1, io2=int(i // 2))\n        if typ[1:] == '16':\n            tmp = '_mm_insert_epi16({}, {}, {})'.format(vin0, vin1, i)\n        if typ[1:] == '32':\n            tmp = '_mm_insert_epi16(_mm_insert_epi16({vin0}, {vin1} & 65535,' \\\n                  ' {ix2}), (int)nsimd_scalar_reinterpret_u32_i32(' \\\n                  '{vin1}) >> 16, {ix2p1})'.format(vin0=vin0, vin1=vin1,\n                  ix2=i * 2, ix2p1=(i * 2) + 1)\n        if typ[1:] == '64':\n            if i == 0:\n                tmp = '_mm_unpackhi_epi64(_mm_slli_si128(' \\\n                      '_mm_cvtsi64_si128({vin1}), 8), {vin0})'. \\\n                      format(vin0=vin0, vin1=vin1)\n            elif i == 1:\n                tmp = '_mm_unpacklo_epi64({vin0}, ' \\\n                      '_mm_cvtsi64_si128({vin1}))'.format(vin0=vin0, vin1=vin1)\n    elif simd_ext in ['sse42'] + avx:\n        tmp = '{pre}insert_epi{typnbits}({vin0}, {vin1}, {i})'. \\\n              format(vin0=vin0, vin1=vin1, i=i, **fmtspec)\n    elif simd_ext in avx512:\n        half = int(nbits(simd_ext)) // 2 // int(typ[1:])\n        if i < half:\n            tmp = '_mm512_inserti64x4({vin0}, _mm256_insert_epi{typnbits}(' \\\n                  '_mm512_castsi512_si256({vin0}), {vin1}, {i}), 0)'. 
\\\n                  format(vin0=vin0, vin1=vin1, i=i, **fmtspec)\n        else:\n            tmp = '_mm512_inserti64x4({vin0}, _mm256_insert_epi{typnbits}(' \\\n                  '_mm512_extracti64x4_epi64({vin0}, 1), {vin1}, {i}),' \\\n                  ' 1)'.format(vin0=vin0, vin1=vin1, i=i - half, **fmtspec)\n\n    # Then code for reinterpreting bits of output:\n    if typ in common.iutypes:\n        return '{} = {};'.format(var_name, tmp)\n    elif typ in ['f32', 'f64']:\n        return '{var_name} = {pre}castsi{nbits}_{pdps}({tmp});'. \\\n               format(var_name=var_name, pdps='ps' if typ == 'f32' else 'pd',\n                      tmp=tmp, **fmtspec)\n\ndef get_lane(simd_ext, typ, var_name, i):\n    # No code for f16's\n    if typ == 'f16':\n        return ''\n\n    # Code for reinterpreting bits of input:\n    #   All intrinscis manipulates only integers. So we use them.\n    if typ in common.iutypes:\n        vin = var_name\n    elif typ in ['f32', 'f64']:\n        vin = '{pre}cast{pdps}_si{nbits}({v})'. \\\n              format(pdps='ps' if typ == 'f32' else 'pd', v=var_name,\n                     **fmtspec)\n\n    # Code for extracting bits\n    if simd_ext == 'sse2':\n        if typ[1:] == '8':\n            lane = '(_mm_cvtsi128_si32(_mm_srli_si128({vin}, {i})) & 255)'. \\\n                   format(vin=vin, i=i, **fmtspec)\n        if typ[1:] == '16':\n            lane = '_mm_extract_epi16({}, {})'.format(vin, i)\n        if typ[1:] in ['32', '64']:\n            lane = '(_mm_cvtsi128_si{}(_mm_srli_si128({}, {})))'. \\\n                   format(typ[1:], vin, i * int(typ[1:]) // 8)\n    elif simd_ext in ['sse42', 'avx2']:\n        lane = '{pre}extract_epi{typnbits}({vin}, {i})'. \\\n               format(vin=vin, i=i, **fmtspec)\n    elif simd_ext in ['avx'] + avx512:\n        if simd_ext == 'avx' and typ[1:] in ['32', '64']:\n            lane = '{pre}extract_epi{typnbits}({vin}, {i})'. 
\\\n                   format(vin=vin, i=i, **fmtspec)\n        else:\n            half = int(nbits(simd_ext)) // 2 // int(typ[1:])\n            if i < half:\n                ext_half = extract(simd_ext, 'i' + typ[1:], LO, vin)\n                lane = '{}extract_epi{}({}, {})'.format(\n                           '_mm_' if simd_ext == 'avx' else '_mm256_',\n                           typ[1:], ext_half, i)\n            else:\n                ext_half = extract(simd_ext, 'i' + typ[1:], HI, vin)\n                lane = '{}extract_epi{}({}, {})'.format(\n                           '_mm_' if simd_ext == 'avx' else '_mm256_',\n                           typ[1:], ext_half, i - half)\n\n    # Then code for reinterpreting bits of output:\n    #   - For 8 and 16-bits types intrinsics returns an 32-bits int\n    #   - For 32 and 64-bits types intrinsics returns an int of that size\n    if typ in ['u8', 'u16']:\n        return '({})({})'.format(typ, lane)\n    if typ in ['i8', 'i16']:\n        return 'nsimd_scalar_reinterpret_{}_{}(({})({}))'. \\\n               format(typ, 'u' + typ[1:], 'u' + typ[1:], lane)\n    elif typ in ['i32', 'i64']:\n        return lane\n    elif typ in ['u32', 'f32', 'u64', 'f64']:\n        return 'nsimd_scalar_reinterpret_{}_{}({})'. \\\n               format(typ, 'i' + typ[1:], lane)\n\ndef get_undefined(simd_ext, typ):\n    if typ in ['f32', 'f64']:\n        return '{pre}undefined{suf}()'.format(**fmtspec)\n    elif typ in common.iutypes:\n        if simd_ext in sse + avx:\n            return '{pre}undefined{sufsi}()'.format(**fmtspec)\n        elif simd_ext in avx512:\n            return '{pre}undefined_epi32()'.format(**fmtspec)\n\n# Signature must be a list of 'v', 's'\n#   'v' means vector so code to extract has to be emitted\n#   's' means base type so no need to write code for extraction\ndef get_emulation_code(func, signature, simd_ext, typ):\n    # Trick using insert and extract\n    trick = 'nsimd_{simd_ext}_v{typ} ret = {undef};\\n'. 
\\\n           format(undef=get_undefined(simd_ext, typ), **fmtspec)\n    arity = len(signature)\n    trick += typ + ' ' + \\\n            ', '.join(['tmp{}'.format(i) \\\n                       for i in range(arity) if signature[i] == 'v']) + ';\\n'\n    args = ', '.join(['{{in{}}}'.format(i).format(**fmtspec) \\\n                      if signature[i] == 's' else 'tmp{}'.format(i) \\\n                      for i in range(arity)])\n    for i in range(fmtspec['le']):\n        trick += '\\n'.join(['tmp{} = {};'. \\\n                format(j, get_lane(simd_ext, typ,\n                       '{{in{}}}'.format(j).format(**fmtspec), i)) \\\n                       for j in range(arity) if signature[j] == 'v']) + '\\n'\n        trick += set_lane(simd_ext, typ, 'ret',\n                         'nsimd_scalar_{func}_{typ}({args})'. \\\n                         format(func=func, args=args, **fmtspec), i) + '\\n'\n    trick += 'return ret;'\n\n    # but in 32-bits mode insert and extract instrinsics are almost never\n    # available so we emulate\n    emulation = 'int i;\\n{typ} ret[{le}];\\n'.format(**fmtspec)\n    emulation += typ + ' ' + \\\n                 ', '.join(['buf{}[{}]'.format(i, fmtspec['le']) \\\n                            for i in range(arity) if signature[i] == 'v']) + \\\n                            ';\\n'\n    emulation += '\\n'.join(['{{pre}}store{{sufsi}}({cast}buf{i}, {{in{i}}});'. \\\n                            format(i=i, cast=castsi(simd_ext, typ)). 
\\\n                            format(**fmtspec) \\\n                            for i in range(arity) if signature[i] == 'v']) + \\\n                            '\\n'\n    args = ', '.join(['{{in{}}}'.format(i).format(**fmtspec) \\\n                      if signature[i] == 's' else 'buf{}[i]'.format(i) \\\n                      for i in range(arity)])\n    emulation += '''for (i = 0; i < {le}; i++) {{\n                      ret[i] = nsimd_scalar_{func}_{typ}({args});\n                    }}\n                    return {pre}loadu{sufsi}({cast}ret);'''. \\\n                    format(args=args, cast=castsi(simd_ext, typ), func=func,\n                           **fmtspec)\n\n    if simd_ext == 'sse42' and \\\n       typ in ['i8', 'u8', 'i16', 'u16', 'i32', 'u32', 'f32']:\n        return trick\n    else:\n        return '''#if NSIMD_WORD_SIZE == 32\n                    {}\n                  #else\n                    {}\n                  #endif'''.format(emulation, trick)\n\ndef how_it_should_be_op2(func, simd_ext, typ):\n    if typ == 'f16':\n        return '''nsimd_{simd_ext}_vf16 ret;\n                  ret.v0 = {pre}{func}_ps({in0}.v0, {in1}.v0);\n                  ret.v1 = {pre}{func}_ps({in0}.v1, {in1}.v1);\n                  return ret;'''.format(func=func, **fmtspec)\n    else:\n        return 'return {pre}{func}{suf}({in0}, {in1});'. \\\n               format(func=func, **fmtspec)\n\ndef split_opn(func, simd_ext, typ, n):\n    simd_ext2 = 'sse42' if simd_ext in avx else 'avx2'\n    inp = [common.in0, common.in1, common.in2]\n    defi = ''\n    for i in range(0, n):\n        defi += \\\n        '''nsimd_{simd_ext2}_v{typ} v{i}0 = {extract_loi};\n           nsimd_{simd_ext2}_v{typ} v{i}1 = {extract_hii};'''. 
\\\n           format(simd_ext2=simd_ext2, typ=typ, i=i,\n                  extract_loi=extract(simd_ext, typ, LO, inp[i]),\n                  extract_hii=extract(simd_ext, typ, HI, inp[i]))\n    vlo = ', '.join(['v{}0'.format(i) for i in range(0, n)])\n    vhi = ', '.join(['v{}1'.format(i) for i in range(0, n)])\n    return '''{defi}\n              v00 = nsimd_{func}_{simd_ext2}_{typ}({vlo});\n              v01 = nsimd_{func}_{simd_ext2}_{typ}({vhi});\n              return {merge};'''. \\\n              format(defi=defi, vlo=vlo, vhi=vhi,\n                     func=func, simd_ext2=simd_ext2, typ=typ,\n                     merge=setr(simd_ext, typ, 'v00', 'v01'))\n\ndef split_op2(func, simd_ext, typ):\n    return split_opn(func, simd_ext, typ, 2)\n\ndef emulate_op2(opts, op, simd_ext, typ):\n    func = {'/': 'div', '*': 'mul'}\n    return get_emulation_code(func[op], ['v', 'v'], simd_ext, typ)\n\ndef emulate_op1(opts, func, simd_ext, typ):\n    return get_emulation_code(func, ['v'], simd_ext, typ)\n\ndef split_cmp2(func, simd_ext, typ):\n    simd_ext2 = 'sse42' if simd_ext in avx else 'avx2'\n    leo2 = int(fmtspec['le']) // 2\n    if simd_ext in avx512:\n        if typ in ['i8', 'u8', 'f32', 'f64']:\n            merge = \\\n            '''return (__mmask{le})(u32)_mm256_movemask{suf}(\n                        v00) | ((__mmask{le})(u32)_mm256_movemask{suf}(\n                          v01) << {leo2});'''. \\\n                       format(leo2=leo2, **fmtspec)\n        elif typ in ['i32', 'u32', 'i64', 'u64']:\n            ftyp = 'f{typnbits}'.format(**fmtspec)\n            merge = \\\n            '''return (__mmask{le})(u32)_mm256_movemask{fsuf}(\n                        _mm256_castsi256{suf}(v00)) |\n                          (((__mmask{le})(u32)_mm256_movemask{fsuf}(\n                            _mm256_castsi256{suf}(v01))) << {leo2});'''. 
\\\n                            format(fsuf=suf_ep(ftyp), leo2=leo2, **fmtspec)\n        else:\n            merge = \\\n            '''v00 = _mm256_permute4x64_epi64(v00, 216); /* exchange middle qwords */\n               nsimd_avx2_vi16 lo1 = _mm256_unpacklo_epi16(v00, v00);\n               nsimd_avx2_vi16 hi1 = _mm256_unpackhi_epi16(v00, v00);\n               v01 = _mm256_permute4x64_epi64(v01, 216); /* exchange middle qwords */\n               nsimd_avx2_vi16 lo2 = _mm256_unpacklo_epi16(v01, v01);\n               nsimd_avx2_vi16 hi2 = _mm256_unpackhi_epi16(v01, v01);\n               return (__mmask32)(u32)_mm256_movemask_ps(\n                                   _mm256_castsi256_ps(lo1)) |\n                      (__mmask32)((u32)_mm256_movemask_ps(\n                                   _mm256_castsi256_ps(hi1)) << 8) |\n                      (__mmask32)((u32)_mm256_movemask_ps(\n                                   _mm256_castsi256_ps(lo2)) << 16) |\n                      (__mmask32)((u32)_mm256_movemask_ps(\n                                   _mm256_castsi256_ps(hi2)) << 24);'''. \\\n                                   format(**fmtspec)\n    else:\n        merge = 'return {};'.format(setr(simd_ext, typ, 'v00', 'v01'))\n    return '''nsimd_{simd_ext2}_v{typ} v00 = {extract_lo0};\n              nsimd_{simd_ext2}_v{typ} v01 = {extract_hi0};\n              nsimd_{simd_ext2}_v{typ} v10 = {extract_lo1};\n              nsimd_{simd_ext2}_v{typ} v11 = {extract_hi1};\n              v00 = nsimd_{func}_{simd_ext2}_{typ}(v00, v10);\n              v01 = nsimd_{func}_{simd_ext2}_{typ}(v01, v11);\n              {merge}'''. 
\\\n              format(simd_ext2=simd_ext2,\n                     extract_lo0=extract(simd_ext, typ, LO, common.in0),\n                     extract_hi0=extract(simd_ext, typ, HI, common.in0),\n                     extract_lo1=extract(simd_ext, typ, LO, common.in1),\n                     extract_hi1=extract(simd_ext, typ, HI, common.in1),\n                     func=func, merge=merge, **fmtspec)\n\ndef f16_cmp2(func, simd_ext):\n    return '''nsimd_{simd_ext}_vlf16 ret;\n              ret.v0 = nsimd_{func}_{simd_ext}_f32({in0}.v0, {in1}.v0);\n              ret.v1 = nsimd_{func}_{simd_ext}_f32({in0}.v1, {in1}.v1);\n              return ret;'''.format(func=func, **fmtspec)\n\ndef cmp2_with_add(func, simd_ext, typ):\n    cte = { 'u8': '0x80', 'u16': '0x8000', 'u32': '0x80000000',\n            'u64': '0x8000000000000000' }\n    return \\\n    '''nsimd_{simd_ext}_v{typ} cte = nsimd_set1_{simd_ext}_{typ}({cte});\n       return nsimd_{func}_{simd_ext}_{ityp}(\n                {pre}add{suf}({in0}, cte),\n                {pre}add{suf}({in1}, cte));'''. \\\n                format(func=func, cte=cte[typ],\n                       ityp='i{}'.format(typ[1:]), **fmtspec)\n\n# -----------------------------------------------------------------------------\n# Returns C code for func\n\n# Load\n\ndef load(simd_ext, typ, aligned):\n    align = '' if aligned else 'u'\n    cast = castsi(simd_ext, typ)\n    if typ == 'f16':\n        if simd_ext in sse:\n            return \\\n            '''#ifdef NSIMD_FP16\n                 nsimd_{simd_ext}_vf16 ret;\n                 __m128i v = _mm_load{align}_si128((__m128i*){in0});\n                 ret.v0 = _mm_cvtph_ps(v);\n                 v = _mm_shuffle_epi32(v, 14); /* = (3 << 2) | (2 << 0) */\n                 ret.v1 = _mm_cvtph_ps(v);\n                 return ret;\n               #else\n                 /* Note that we can do much better but is it useful? 
*/\n                 nsimd_{simd_ext}_vf16 ret;\n                 f32 buf[4];\n                 buf[0] = nsimd_u16_to_f32(*(u16*){in0});\n                 buf[1] = nsimd_u16_to_f32(*((u16*){in0} + 1));\n                 buf[2] = nsimd_u16_to_f32(*((u16*){in0} + 2));\n                 buf[3] = nsimd_u16_to_f32(*((u16*){in0} + 3));\n                 ret.v0 = _mm_loadu_ps(buf);\n                 buf[0] = nsimd_u16_to_f32(*((u16*){in0} + 4));\n                 buf[1] = nsimd_u16_to_f32(*((u16*){in0} + 5));\n                 buf[2] = nsimd_u16_to_f32(*((u16*){in0} + 6));\n                 buf[3] = nsimd_u16_to_f32(*((u16*){in0} + 7));\n                 ret.v1 = _mm_loadu_ps(buf);\n                 return ret;\n               #endif'''.format(align=align, **fmtspec)\n        elif simd_ext in avx:\n            return '''#ifdef NSIMD_FP16\n                        nsimd_{simd_ext}_vf16 ret;\n                        ret.v0 = _mm256_cvtph_ps(_mm_load{align}_si128(\n                                   (__m128i*){in0}));\n                        ret.v1 = _mm256_cvtph_ps(_mm_load{align}_si128(\n                                   (__m128i*){in0} + 1));\n                        return ret;\n                      #else\n                        /* Note that we can do much better but is it useful? 
*/\n                        nsimd_{simd_ext}_vf16 ret;\n                        f32 buf[8];\n                        int i;\n                        for (i = 0; i < 8; i++) {{\n                          buf[i] = nsimd_u16_to_f32(*((u16*){in0} + i));\n                        }}\n                        ret.v0 = _mm256_loadu_ps(buf);\n                        for (i = 0; i < 8; i++) {{\n                          buf[i] = nsimd_u16_to_f32(*((u16*){in0} + (8 + i)));\n                        }}\n                        ret.v1 = _mm256_loadu_ps(buf);\n                        return ret;\n                      #endif'''.format(align=align, **fmtspec)\n        elif simd_ext in avx512:\n            return '''nsimd_{simd_ext}_vf16 ret;\n                      ret.v0 = _mm512_cvtph_ps(\n                                 _mm256_load{align}_si256((__m256i*){in0})\n                               );\n                      ret.v1 = _mm512_cvtph_ps(\n                                 _mm256_load{align}_si256((__m256i*){in0} + 1)\n                               );\n                      return ret;\n                      '''.format(align=align, **fmtspec)\n    else:\n        return 'return {pre}load{align}{sufsi}({cast}{in0});'. \\\n               format(align=align, cast=cast, **fmtspec)\n\n# -----------------------------------------------------------------------------\n# masked loads\n\ndef maskoz_load(simd_ext, typ, oz, aligned):\n    if typ == 'f16':\n        le2 = fmtspec['le'] // 2\n        if simd_ext in sse + avx:\n            store_mask = '''{pre}storeu_ps(mask, {in0}.v0);\n                            {pre}storeu_ps(mask + {le2}, {in0}.v1);'''. 
\\\n                            format(le2=le2, **fmtspec)\n        else:\n            store_mask = '''_mm512_storeu_ps(mask, _mm512_maskz_mov_ps(\n                              {in0}.v0, _mm512_set1_ps(1.0f)));\n                            _mm512_storeu_ps(mask + {le2}, _mm512_maskz_mov_ps(\n                              {in0}.v1, _mm512_set1_ps(1.0f)));'''. \\\n                            format(le2=le2, **fmtspec)\n        return '''int i;\n                  nsimd_{simd_ext}_vf16 ret;\n                  f32 buf[{le}], mask[{le}];\n                  {store_mask}\n                  {pre}storeu_ps(buf, {oz0});\n                  {pre}storeu_ps(buf + {le2}, {oz1});\n                  for (i = 0; i < {le}; i++) {{\n                    if (nsimd_scalar_reinterpret_u32_f32(mask[i]) != (u32)0) {{\n                      buf[i] = nsimd_f16_to_f32({in1}[i]);\n                    }}\n                  }}\n                  ret.v0 = {pre}loadu_ps(buf);\n                  ret.v1 = {pre}loadu_ps(buf + {le2});\n                  return ret;'''.format(le2=fmtspec['le'] // 2,\n                  oz0 = '{pre}setzero_ps()'.format(**fmtspec) if oz == 'z' \\\n                        else '{in2}.v0'.format(**fmtspec),\n                  oz1 = '{pre}setzero_ps()'.format(**fmtspec) if oz == 'z' \\\n                        else '{in2}.v1'.format(**fmtspec),\n                  store_mask=store_mask, **fmtspec)\n    if (typ in ['i8', 'u8', 'i16', 'u16'] and simd_ext != 'avx512_skylake') \\\n       or (typ in ['i32', 'u32', 'f32', 'i64', 'u64', 'f64'] and \\\n           simd_ext in sse):\n        cast = castsi(simd_ext, typ)\n        if simd_ext == 'avx512_knl':\n            mask_decl = 'u64 mask;'\n            store_mask = 'mask = (u64){in0};'.format(**fmtspec)\n            cond = '(mask >> i) & 1'\n        else:\n            mask_decl = '{typ} mask[{le}];'.format(**fmtspec)\n            store_mask = '{pre}storeu{sufsi}({cast}mask, {in0});'. 
\\\n                         format(cast=cast, **fmtspec)\n            cond = 'nsimd_scalar_reinterpret_{utyp}_{typ}(mask[i]) != '\\\n                   '({utyp})0'.format(utyp='u' + typ[1:], **fmtspec)\n        return \\\n        '''int i;\n           {typ} buf[{le}];\n           {mask_decl}\n           {pre}storeu{sufsi}({cast}buf, {oz});\n           {store_mask}\n           for (i = 0; i < {le}; i++) {{\n             if ({cond}) {{\n               buf[i] = {in1}[i];\n             }}\n           }}\n           return {pre}loadu{sufsi}({cast}buf);'''. \\\n           format(cast=cast, mask_decl=mask_decl, store_mask=store_mask,\n                  cond=cond, oz='{in2}'.format(**fmtspec) if oz == 'o' else \\\n                  '{pre}setzero{sufsi}()'.format(**fmtspec), **fmtspec)\n    # Here typ is 32 of 64-bits wide except\n    if simd_ext in avx:\n        suf2 = 'ps' if typ[1:] == '32' else 'pd'\n        if typ in common.ftypes:\n            maskload = \\\n            '{pre}maskload{suf}({in1}, {pre}cast{suf2}_si256({in0}))'. \\\n            format(suf2=suf2, **fmtspec)\n            if oz == 'z':\n                return 'return {};'.format(maskload)\n            else:\n                return \\\n                'return {pre}blendv{suf}({in2}, {maskload}, {in0});'. \\\n                format(maskload=maskload, **fmtspec)\n        else:\n            if simd_ext == 'avx2':\n                maskload = '{pre}maskload{suf}({cast}{in1}, {in0})'. \\\n                           format(cast='(nsimd_longlong *)' \\\n                                  if typ in ['i64', 'u64'] else '(int *)',\n                                  **fmtspec)\n                if oz == 'z':\n                    return 'return {};'.format(maskload)\n                else:\n                    return \\\n                    'return {pre}blendv_epi8({in2}, {maskload}, {in0});'. 
\\\n                    format(maskload=maskload, **fmtspec)\n            else:\n                maskload = '{pre}maskload_{suf2}(({ftyp}*){in1}, {in0})'. \\\n                           format(suf2=suf2, ftyp='f' + typ[1:], **fmtspec)\n                if oz == 'z':\n                    return 'return {pre}cast{suf2}_si256({maskload});'. \\\n                           format(maskload=maskload, suf2=suf2, **fmtspec)\n                else:\n                    return \\\n                    '''return {pre}cast{suf2}_si256({pre}blendv_{suf2}(\n                                {pre}castsi256_{suf2}({in2}), {maskload},\n                                  {pre}castsi256_{suf2}({in0})));'''. \\\n                                  format(suf2=suf2, maskload=maskload,\n                                         **fmtspec)\n    # getting here means avx512 with intrinsics\n    mask = {\n      'z': 'return {pre}maskz_load{{}}{suf}({in0}, (void*){in1});'. \\\n           format(**fmtspec),\n      'o': 'return {pre}mask_load{{}}{suf}({in2}, {in0}, (void*){in1});'. \\\n           format(**fmtspec)\n    }\n    if typ in ['i32', 'u32', 'f32', 'i64', 'u64', 'f64']:\n        return mask[oz].format('' if aligned else 'u')\n    else:\n        return mask[oz].format('u')\n\n# -----------------------------------------------------------------------------\n# Loads of degree 2, 3 and 4\n\ndef load_deg234(simd_ext, typ, align, deg):\n    if typ == 'f16':\n        a = 'a' if align else 'u'\n        code = '\\n'.join([ \\\n               '''nsimd_storeu_{simd_ext}_u16(buf, tmp.v{i});\n                  ret.v{i} = nsimd_loadu_{simd_ext}_f16((f16 *)buf);'''. 
\\\n                  format(i=i, **fmtspec) for i in range(0, deg)])\n        return \\\n        '''nsimd_{simd_ext}_v{typ}x{deg} ret;\n           u16 buf[{le}];\n           nsimd_{simd_ext}_vu16x{deg} tmp =\n               nsimd_load{deg}{a}_{simd_ext}_u16((u16*)a0);\n           {code}\n           return ret;'''.format(code=code, a=a, deg=deg, **fmtspec)\n    if simd_ext in sse:\n        if deg == 2:\n            return ldst234.load2_sse(simd_ext, typ, align, fmtspec)\n        if deg == 3:\n            return ldst234.load3_sse(simd_ext, typ, align, fmtspec)\n        if deg == 4:\n            return ldst234.load4_sse(simd_ext, typ, align, fmtspec)\n    if simd_ext in avx:\n        if deg == 2:\n            return ldst234.load2_avx(simd_ext, typ, align, fmtspec)\n        if deg == 3:\n            return ldst234.load3_avx(simd_ext, typ, align, fmtspec)\n        if deg == 4:\n            return ldst234.load4_avx(simd_ext, typ, align, fmtspec)\n    if simd_ext in avx512:\n        if deg == 2:\n            return ldst234.load2_avx512(simd_ext, typ, align, fmtspec)\n        if deg == 3:\n            return ldst234.load3_avx512(simd_ext, typ, align, fmtspec)\n        if deg == 4:\n            return ldst234.load4_avx512(simd_ext, typ, align, fmtspec)\n    return common.NOT_IMPLEMENTED\n\n# -----------------------------------------------------------------------------\n# Stores of degree 2, 3 and 4\n\ndef store_deg234(simd_ext, typ, align, deg):\n    if typ == 'f16':\n        a = 'a' if align else 'u'\n        variables = ', '.join(['v{}'.format(i) for i in range(0, deg)])\n        code = '\\n'.join([ \\\n               '''nsimd_storeu_{{simd_ext}}_f16((f16 *)buf, {{in{ip1}}});\n                  v{i} = nsimd_loadu_{{simd_ext}}_u16((u16 *)buf);'''. 
\\\n                  format(i=i, ip1=i + 1).format(**fmtspec) \\\n                  for i in range(0, deg)])\n        return \\\n        '''nsimd_{simd_ext}_vu16 {variables};\n           u16 buf[{le}];\n           {code}\n           nsimd_store{deg}{a}_{simd_ext}_u16((u16 *){in0}, {variables});'''. \\\n           format(variables=variables, code=code, a=a, deg=deg, **fmtspec)\n    if simd_ext in sse:\n        if deg == 2:\n            return ldst234.store2(simd_ext, typ, align, fmtspec)\n        if deg == 3:\n            return ldst234.store3_sse(simd_ext, typ, align, fmtspec)\n        if deg == 4:\n            return ldst234.store4_sse(typ, align, fmtspec)\n    if simd_ext in avx:\n        if deg == 2:\n            return ldst234.store2(simd_ext, typ, align, fmtspec)\n        if deg == 3:\n            return ldst234.store3_avx(simd_ext, typ, align, fmtspec)\n        if deg == 4:\n            return ldst234.store4_avx(simd_ext, typ, align, fmtspec)\n    if simd_ext in avx512:\n        if deg == 2:\n            return ldst234.store2(simd_ext, typ, align, fmtspec)\n        if deg == 3:\n            return ldst234.store3_avx512(simd_ext, typ, align, fmtspec)\n        if deg == 4:\n            return ldst234.store4_avx512(simd_ext, typ, align, fmtspec)\n    return common.NOT_IMPLEMENTED\n\n# -----------------------------------------------------------------------------\n# Store\n\ndef store(simd_ext, typ, aligned):\n    align = '' if aligned else 'u'\n    cast = castsi(simd_ext, typ)\n    if typ == 'f16':\n        if simd_ext in sse:\n            return \\\n            '''#ifdef NSIMD_FP16\n                 __m128i v0 = _mm_cvtps_ph({in1}.v0, 4);\n                 __m128i v1 = _mm_cvtps_ph({in1}.v1, 4);\n                 __m128d v = _mm_shuffle_pd(_mm_castsi128_pd(v0),\n                               _mm_castsi128_pd(v1),\n                                 0 /* = (0 << 1) | (0 << 0) */);\n                 _mm_store{align}_pd((f64*){in0}, v);\n               #else\n      
           /* Note that we can do much better but is it useful? */\n                 f32 buf[4];\n                 _mm_storeu_ps(buf, {in1}.v0);\n                 *((u16*){in0}    ) = nsimd_f32_to_u16(buf[0]);\n                 *((u16*){in0} + 1) = nsimd_f32_to_u16(buf[1]);\n                 *((u16*){in0} + 2) = nsimd_f32_to_u16(buf[2]);\n                 *((u16*){in0} + 3) = nsimd_f32_to_u16(buf[3]);\n                 _mm_storeu_ps(buf, {in1}.v1);\n                 *((u16*){in0} + 4) = nsimd_f32_to_u16(buf[0]);\n                 *((u16*){in0} + 5) = nsimd_f32_to_u16(buf[1]);\n                 *((u16*){in0} + 6) = nsimd_f32_to_u16(buf[2]);\n                 *((u16*){in0} + 7) = nsimd_f32_to_u16(buf[3]);\n               #endif'''.format(align=align, **fmtspec)\n        elif simd_ext in avx:\n            return \\\n            '''#ifdef NSIMD_FP16\n                 _mm_store{align}_si128((__m128i*){in0},\n                   _mm256_cvtps_ph({in1}.v0, 4));\n                 _mm_store{align}_si128((__m128i*){in0} + 1,\n                   _mm256_cvtps_ph({in1}.v1, 4));\n               #else\n                 /* Note that we can do much better but is it useful? */\n                 int i;\n                 f32 buf[8];\n                 _mm256_storeu_ps(buf, {in1}.v0);\n                 for (i = 0; i < 8; i++) {{\n                   *((u16*){in0} + i) = nsimd_f32_to_u16(buf[i]);\n                 }}\n                 _mm256_storeu_ps(buf, {in1}.v1);\n                 for (i = 0; i < 8; i++) {{\n                   *((u16*){in0} + (8 + i)) = nsimd_f32_to_u16(buf[i]);\n                 }}\n               #endif'''.format(align=align, **fmtspec)\n        elif simd_ext in avx512:\n            return \\\n            '''_mm256_store{align}_si256((__m256i*){in0},\n                   _mm512_cvtps_ph({in1}.v0, 4));\n               _mm256_store{align}_si256((__m256i*){in0} + 1,\n                   _mm512_cvtps_ph({in1}.v1, 4));'''. 
\\\n                        format(align=align, **fmtspec)\n    else:\n        return '{pre}store{align}{sufsi}({cast}{in0}, {in1});'. \\\n               format(align=align, cast=cast, **fmtspec)\n\n# masked store\n\ndef mask_store(simd_ext, typ, aligned):\n    if typ == 'f16':\n        le2 = fmtspec['le'] // 2\n        if simd_ext in sse + avx:\n            store_mask = '''{pre}storeu_ps(mask, {in0}.v0);\n                            {pre}storeu_ps(mask + {le2}, {in0}.v1);'''. \\\n                            format(le2=le2, **fmtspec)\n        else:\n            store_mask = '''_mm512_storeu_ps(mask, _mm512_maskz_mov_ps(\n                              {in0}.v0, _mm512_set1_ps(1.0f)));\n                            _mm512_storeu_ps(mask + {le2}, _mm512_maskz_mov_ps(\n                              {in0}.v1, _mm512_set1_ps(1.0f)));'''. \\\n                            format(le2=le2, **fmtspec)\n        return '''f32 mask[{le}], buf[{le}];\n                  int i;\n                  {store_mask}\n                  {pre}storeu_ps(buf, {in2}.v0);\n                  {pre}storeu_ps(buf + {le2}, {in2}.v1);\n                  for (i = 0; i < {le}; i++) {{\n                    if (nsimd_scalar_reinterpret_u32_f32(mask[i]) != (u32)0) {{\n                      {in1}[i] = nsimd_f32_to_f16(buf[i]);\n                    }}\n                  }}'''.format(store_mask=store_mask, le2=le2, **fmtspec)\n    suf2 = 'ps' if typ[1:] == '32' else 'pd'\n    if simd_ext in sse:\n        if typ in common.iutypes:\n            return '_mm_maskmoveu_si128({in2}, {in0}, (char *){in1});'. \\\n                   format(**fmtspec)\n        else:\n            return '''_mm_maskmoveu_si128(_mm_cast{suf2}_si128({in2}),\n                                          _mm_cast{suf2}_si128({in0}),\n                                          (char *){in1});'''. 
\\\n                                          format(suf2=suf2, **fmtspec)\n    if typ in ['i8', 'u8', 'i16', 'u16'] and simd_ext != 'avx512_skylake':\n        if simd_ext == 'avx512_knl':\n            return \\\n            '''int i;\n               u64 mask;\n               {typ} buf[{le}];\n               {pre}storeu{sufsi}((__m512i *)buf, {in2});\n               mask = (u64){in0};\n               for (i = 0; i < {le}; i++) {{\n                 if ((mask >> i) & 1) {{\n                   {in1}[i] = buf[i];\n                 }}\n               }}'''.format(utyp='u' + typ[1:], **fmtspec)\n        else:\n            return \\\n            '''nsimd_{op_name}_sse42_{typ}({mask_lo}, {in1}, {val_lo});\n               nsimd_{op_name}_sse42_{typ}({mask_hi}, {in1} + {le2},\n                                           {val_hi});\n               '''.format(le2=fmtspec['le'] // 2,\n               op_name='mask_store{}1'.format('a' if  aligned else 'u'),\n               mask_lo=extract(simd_ext, typ, LO, common.in0),\n               mask_hi=extract(simd_ext, typ, HI, common.in0),\n               val_lo=extract(simd_ext, typ, LO, common.in2),\n               val_hi=extract(simd_ext, typ, HI, common.in2), **fmtspec)\n    # Here typ is 32 of 64-bits wide except\n    if simd_ext in avx:\n        if typ in common.ftypes:\n            return '''{pre}maskstore{suf}({in1},\n                          {pre}cast{suf2}_si256({in0}), {in2});'''. \\\n                          format(suf2=suf2, **fmtspec)\n        else:\n            if simd_ext == 'avx2':\n                return '{pre}maskstore{suf}({cast}{in1}, {in0}, {in2});'. \\\n                       format(cast='(nsimd_longlong *)' \\\n                              if typ in ['i64', 'u64'] \\\n                              else '(int *)', **fmtspec)\n            else:\n                return '''{pre}maskstore_{suf2}(({ftyp}*){in1}, {in0},\n                            {pre}castsi256_{suf2}({in2}));'''. 
\\\n                            format(suf2=suf2, ftyp='f' + typ[1:], **fmtspec)\n    # getting here means avx512 with intrinsics\n    code = '{pre}mask_store{{}}{suf}((void*){in1}, {in0}, {in2});'. \\\n           format(**fmtspec)\n    if typ in ['i32', 'u32', 'f32', 'i64', 'u64', 'f64']:\n        return code.format('' if aligned else 'u')\n    else:\n        return code.format('u')\n\n# -----------------------------------------------------------------------------\n# Code for binary operators: and, or, xor\n\ndef binop2(func, simd_ext, typ, logical=False):\n    logical = 'l' if logical else ''\n    func = func[0:-1]\n    if typ == 'f16':\n        return \\\n        '''nsimd_{simd_ext}_v{logi}f16 ret;\n           ret.v0 = nsimd_{func}{logi2}_{simd_ext}_f32({in0}.v0, {in1}.v0);\n           ret.v1 = nsimd_{func}{logi2}_{simd_ext}_f32({in0}.v1, {in1}.v1);\n           return ret;'''.format(logi='l' if logical else '', func=func,\n                                 logi2='l' if logical else 'b', **fmtspec)\n    normal = 'return {pre}{func}{sufsi}({in0}, {in1});'. \\\n             format(func=func, **fmtspec)\n    if simd_ext in sse:\n        return normal\n    if simd_ext in avx:\n        if simd_ext == 'avx2' or typ in ['f32', 'f64']:\n            return normal\n        else:\n            return '''return _mm256_castpd_si256(_mm256_{func}_pd(\n                               _mm256_castsi256_pd({in0}),\n                                 _mm256_castsi256_pd({in1})));'''. \\\n                                 format(func=func, **fmtspec)\n    if simd_ext in avx512:\n        if simd_ext == 'avx512_skylake' or typ in common.iutypes:\n            return normal\n        else:\n            return \\\n            '''return _mm512_castsi512{suf}(_mm512_{func}_si512(\n                        _mm512_cast{typ2}_si512({in0}),\n                          _mm512_cast{typ2}_si512({in1})));'''. 
\\\n                          format(func=func, typ2=suf_ep(typ)[1:], **fmtspec)\n\n# -----------------------------------------------------------------------------\n# Code for logical binary operators: andl, orl, xorl\n\ndef binlop2(func, simd_ext, typ):\n    op = { 'orl': '|', 'xorl': '^', 'andl': '&' }\n    op_fct = { 'orl': 'kor', 'xorl': 'kxor', 'andl': 'kand' }\n    if simd_ext not in avx512:\n        if typ == 'f16':\n            return binop2(func, simd_ext, typ, True)\n        else:\n            return binop2(func, simd_ext, typ)\n    elif simd_ext == 'avx512_knl':\n        if typ == 'f16':\n            return '''nsimd_{simd_ext}_vlf16 ret;\n                      ret.v0 = _{op_fct}_mask16({in0}.v0, {in1}.v0);\n                      ret.v1 = _{op_fct}_mask16({in0}.v1, {in1}.v1);\n                      return ret;'''. \\\n                      format(op_fct=op_fct[func], **fmtspec)\n        elif typ in ['f32', 'u32', 'i32']:\n            return 'return _{op_fct}_mask16({in0}, {in1});'. \\\n                   format(op_fct=op_fct[func], **fmtspec)\n        else:\n            return 'return (__mmask{le})({in0} {op} {in1});'. \\\n                   format(op=op[func], **fmtspec)\n    elif simd_ext == 'avx512_skylake':\n        if typ == 'f16':\n            return '''nsimd_{simd_ext}_vlf16 ret;\n                      #if defined(NSIMD_IS_GCC) || defined(NSIMD_IS_CLANG)\n                        ret.v0 = (__mmask16)({in0}.v0 {op} {in1}.v0);\n                        ret.v1 = (__mmask16)({in0}.v1 {op} {in1}.v1);\n                      #else\n                        ret.v0 = _{op_fct}_mask16({in0}.v0, {in1}.v0);\n                        ret.v1 = _{op_fct}_mask16({in0}.v1, {in1}.v1);\n                      #endif\n                      return ret;'''. 
\\\n                      format(op_fct=op_fct[func], op=op[func], **fmtspec)\n        else:\n            return '''#if defined(NSIMD_IS_GCC) || defined(NSIMD_IS_CLANG)\n                        return (__mmask{le})({in0} {op} {in1});\n                      #else\n                        return _{op_fct}_mask{le}({in0}, {in1});\n                      #endif'''.format(op_fct=op_fct[func], op=op[func],\n                                       **fmtspec)\n\n# -----------------------------------------------------------------------------\n# andnot\n\ndef andnot2(simd_ext, typ, logical=False):\n    if typ == 'f16':\n        return \\\n        '''nsimd_{simd_ext}_v{logi}f16 ret;\n           ret.v0 = nsimd_andnot{logi2}_{simd_ext}_f32({in0}.v0, {in1}.v0);\n           ret.v1 = nsimd_andnot{logi2}_{simd_ext}_f32({in0}.v1, {in1}.v1);\n           return ret;'''.format(logi='l' if logical else '',\n                                 logi2='l' if logical else 'b', **fmtspec)\n    if simd_ext in sse:\n        return 'return _mm_andnot{sufsi}({in1}, {in0});'.format(**fmtspec)\n    if simd_ext in avx:\n        if simd_ext == 'avx2' or typ in ['f32', 'f64']:\n            return 'return _mm256_andnot{sufsi}({in1}, {in0});'. \\\n                   format(**fmtspec)\n        else:\n            return '''return _mm256_castpd_si256(_mm256_andnot_pd(\n                               _mm256_castsi256_pd({in1}),\n                               _mm256_castsi256_pd({in0})));'''. \\\n                               format(**fmtspec)\n    if simd_ext in avx512:\n        if simd_ext == 'avx512_skylake' or typ in common.iutypes:\n            return 'return _mm512_andnot{sufsi}({in1}, {in0});'. \\\n                   format(**fmtspec)\n        else:\n            return '''return _mm512_castsi512{suf}(_mm512_andnot_si512(\n                               _mm512_cast{suf2}_si512({in1}),\n                               _mm512_cast{suf2}_si512({in0})));'''. 
\\\n                               format(suf2=fmtspec['suf'][1:], **fmtspec)\n\n# -----------------------------------------------------------------------------\n# logical andnot\n\ndef landnot2(simd_ext, typ):\n    if simd_ext in avx512:\n        if typ == 'f16':\n            return '''nsimd_{simd_ext}_vlf16 ret;\n                      ret.v0 = (__mmask16)({in0}.v0 & (~{in1}.v0));\n                      ret.v1 = (__mmask16)({in0}.v1 & (~{in1}.v1));\n                      return ret;'''.format(**fmtspec)\n        else:\n            return 'return (__mmask{le})({in0} & (~{in1}));'.format(**fmtspec)\n    return andnot2(simd_ext, typ, True)\n\n# -----------------------------------------------------------------------------\n# Code for unary not\n\ndef not1(simd_ext, typ, logical=False):\n    if typ == 'f16':\n        return \\\n        '''nsimd_{simd_ext}_v{logi}f16 ret;\n           nsimd_{simd_ext}_vf32 cte = {pre}castsi{nbits}_ps(\n                                         {pre}set1_epi8(-1));\n           ret.v0 = nsimd_andnot{logi2}_{simd_ext}_f32(cte, {in0}.v0);\n           ret.v1 = nsimd_andnot{logi2}_{simd_ext}_f32(cte, {in0}.v1);\n           return ret;'''.format(logi='l' if logical else '',\n                                 logi2='l' if logical else 'b', **fmtspec)\n    elif typ in ['f32', 'f64']:\n        return '''return nsimd_andnotb_{simd_ext}_{typ}(\n                           {pre}castsi{nbits}{suf}(\n                             {pre}set1_epi8(-1)), {in0});'''.format(**fmtspec)\n    else:\n        return '''return nsimd_andnotb_{simd_ext}_{typ}(\n                           {pre}set1_epi8(-1), {in0});'''.format(**fmtspec)\n\n# -----------------------------------------------------------------------------\n# Code for unary logical lnot\n\ndef lnot1(simd_ext, typ):\n    if simd_ext in avx512:\n        if typ == 'f16':\n            return '''nsimd_{simd_ext}_vlf16 ret;\n                      ret.v0 = (__mmask16)(~{in0}.v0);\n                      ret.v1 = 
(__mmask16)(~{in0}.v1);\n                      return ret;'''.format(**fmtspec)\n        else:\n            return 'return (__mmask{le})(~{in0});'.format(**fmtspec)\n    return not1(simd_ext, typ, True)\n\n# -----------------------------------------------------------------------------\n# Addition and substraction\n\ndef addsub(func, simd_ext, typ):\n    if typ in common.ftypes or simd_ext in sse or \\\n       (simd_ext in avx512 and typ in ['u32', 'i32', 'u64', 'i64']):\n        return how_it_should_be_op2(func, simd_ext, typ)\n    else:\n        if simd_ext in ['avx2', 'avx512_skylake']:\n            return how_it_should_be_op2(func, simd_ext, typ)\n        else:\n            return split_op2(func, simd_ext, typ)\n\n# -----------------------------------------------------------------------------\n# Len\n\ndef len1(simd_ext, typ):\n    return 'return {le};'.format(**fmtspec)\n\n# -----------------------------------------------------------------------------\n# Division\n\ndef div2(opts, simd_ext, typ):\n    if typ in common.ftypes:\n        return how_it_should_be_op2('div', simd_ext, typ)\n    return emulate_op2(opts, '/', simd_ext, typ)\n\n# -----------------------------------------------------------------------------\n# Multiplication\n\ndef mul2(opts, simd_ext, typ):\n    emulate = emulate_op2(opts, '*', simd_ext, typ)\n    split = split_op2('mul', simd_ext, typ)\n    # Floats\n    if typ in common.ftypes:\n        return how_it_should_be_op2('mul', simd_ext, typ)\n    # Integers 16, 32 on SSE\n    if simd_ext in sse and typ in ['i16', 'u16']:\n        return 'return _mm_mullo_epi16({in0}, {in1});'.format(**fmtspec)\n    if simd_ext in sse and typ in ['i32', 'u32']:\n        if simd_ext == 'sse42':\n            return 'return _mm_mullo_epi32({in0}, {in1});'.format(**fmtspec)\n        else:\n            return emulate\n    # Integers 16, 32 on AVX\n    if simd_ext in avx and typ in ['i16', 'u16', 'i32', 'u32']:\n        if simd_ext == 'avx2':\n            return 
'return _mm256_mullo{suf}({in0}, {in1});'.format(**fmtspec)\n        else:\n            return split\n    # Integers 64 on SSE on AVX\n    if simd_ext in sse + avx and typ in ['i64', 'u64']:\n        return emulate_op2(opts, '*', simd_ext, typ)\n    # Integers 16 on AVX512\n    if simd_ext in avx512 and typ in ['i16', 'u16']:\n        if simd_ext == 'avx512_skylake':\n            return 'return _mm512_mullo_epi16({in0}, {in1});'.format(**fmtspec)\n        else:\n            return split\n    # Integers 32 on AVX512\n    if simd_ext in avx512 and typ in ['i32', 'u32']:\n        return 'return _mm512_mullo_epi32({in1}, {in0});'.format(**fmtspec)\n    # Integers 64 on AVX512\n    if simd_ext in avx512 and typ in ['i64', 'u64']:\n        if simd_ext == 'avx512_skylake':\n            return 'return _mm512_mullo_epi64({in0}, {in1});'.format(**fmtspec)\n        else:\n            return emulate\n    # Integers 8 on SSE\n    with_epi16 = '''nsimd_{simd_ext}_v{typ} lo =\n                        {pre}mullo_epi16({in0}, {in1});\n                    nsimd_{simd_ext}_v{typ} hi = {pre}slli_epi16(\n                        {pre}mullo_epi16({pre}srli_epi16({in0}, 8),\n                          {pre}srli_epi16({in1}, 8)), 8);\n                    return {pre}or{sufsi}({pre}and{sufsi}(\n                              lo, {pre}set1_epi16(255)),hi);'''. 
\\\n                    format(**fmtspec)\n    split_epi16 = split_op2('mul', simd_ext, typ)\n    if simd_ext in sse and typ in ['i8', 'u8']:\n        return with_epi16\n    if simd_ext in avx + avx512 and typ in ['i8', 'u8']:\n        if simd_ext in ['avx2', 'avx512_skylake']:\n            return with_epi16\n        else:\n            return split_epi16\n\n# -----------------------------------------------------------------------------\n# Shift left and right\n\ndef shl_shr(func, simd_ext, typ):\n    if typ in ['f16', 'f32', 'f64']:\n        return ''\n    intrinsic = 'srl' if func == 'shr' else 'sll'\n    simd_ext2 = 'sse42' if simd_ext in avx else 'avx2'\n    split = '''nsimd_{simd_ext2}_v{typ} v0 = {extract_lo};\n               nsimd_{simd_ext2}_v{typ} v1 = {extract_hi};\n               v0 = nsimd_{func}_{simd_ext2}_{typ}(v0, {in1});\n               v1 = nsimd_{func}_{simd_ext2}_{typ}(v1, {in1});\n               return {merge};'''. \\\n               format(simd_ext2=simd_ext2, func=func,\n                      extract_lo=extract(simd_ext, typ, LO, common.in0),\n                      extract_hi=extract(simd_ext, typ, HI, common.in0),\n                      merge=setr(simd_ext, typ, 'v0', 'v1'), **fmtspec)\n    normal_16_32_64 = '''return {pre}{intrinsic}{suf}(\n                           {in0}, _mm_set1_epi64x({in1}));'''. 
\\\n                      format(intrinsic=intrinsic, **fmtspec)\n    FFs = '0x' + ('F' * int((int(typ[1:]) // 4)))\n    FFOOs = FFs  + ('0' * int((int(typ[1:]) // 4)))\n    with_2n_for_n = '''nsimd_{simd_ext}_v{typ} lo = {pre}and{sufsi}(\n                         {pre}{intrinsic}_epi{typ2nbits}(\n                           {in0}, _mm_set1_epi64x({in1})),\n                             nsimd_set1_{simd_ext}_u{typ2nbits}({masklo}));\n                       nsimd_{simd_ext}_v{typ} hi =\n                         {pre}{intrinsic}_epi{typ2nbits}({pre}and{sufsi}({in0},\n                           nsimd_set1_{simd_ext}_u{typ2nbits}({maskhi})),\n                             _mm_set1_epi64x({in1}));\n                       return {pre}or{sufsi}(hi, lo);'''. \\\n                       format(intrinsic=intrinsic, typ2nbits=2 * int(typ[1:]),\n                              masklo=FFs if func == 'shl' else FFOOs,\n                              maskhi=FFOOs if func == 'shl' else FFs, **fmtspec)\n    with_32_for_8 = '''nsimd_{simd_ext}_v{typ} masklo =\n                         nsimd_set1_{simd_ext}_u32(0xFF00FF);\n                       nsimd_{simd_ext}_v{typ} lo =\n                         {pre}and{sufsi}({pre}{intrinsic}_epi32(\n                           {pre}and{sufsi}({in0}, masklo),\n                             _mm_set1_epi64x({in1})), masklo);\n                       nsimd_{simd_ext}_v{typ} maskhi =\n                         nsimd_set1_{simd_ext}_u32(0xFF00FF00);\n                       nsimd_{simd_ext}_v{typ} hi =\n                           {pre}and{sufsi}({pre}{intrinsic}_epi32(\n                             {pre}and{sufsi}({in0}, maskhi),\n                               _mm_set1_epi64x({in1})), maskhi);\n                       return {pre}or{sufsi}(hi, lo);'''. 
\\\n                       format(intrinsic=intrinsic, **fmtspec)\n    if simd_ext in sse:\n        if typ in ['i8', 'u8']:\n            return with_2n_for_n\n        if typ in ['i16', 'u16', 'i32', 'u32', 'i64', 'u64']:\n            return normal_16_32_64\n    if simd_ext in avx:\n        if typ in ['i8', 'u8']:\n            return with_2n_for_n if simd_ext == 'avx2' else split\n        if typ in ['i16', 'u16', 'i32', 'u32', 'i64', 'u64']:\n            return normal_16_32_64 if simd_ext == 'avx2' else split\n    if simd_ext in avx512:\n        if typ in ['i8', 'u8']:\n            return with_2n_for_n if simd_ext == 'avx512_skylake' \\\n                                 else with_32_for_8\n        if typ in ['i16', 'u16']:\n            return normal_16_32_64 if simd_ext == 'avx512_skylake' \\\n                                   else with_2n_for_n\n        if typ in ['i32', 'u32', 'i64', 'u64']:\n            return normal_16_32_64\n\n# -----------------------------------------------------------------------------\n# Arithmetic shift right\n\ndef shra(opts, simd_ext, typ):\n    if typ in common.utypes:\n        # For unsigned type, logical shift\n        return 'return nsimd_shr_{simd_ext}_{typ}({in0}, {in1});'. \\\n               format(**fmtspec)\n\n    intrinsic = 'return {pre}sra{suf}({in0}, _mm_set1_epi64x((i64){in1}));'. \\\n                format(**fmtspec)\n\n    simd_ext2 = 'sse42' if simd_ext in avx else 'avx2'\n    split = '''nsimd_{simd_ext2}_v{typ} v0 = {extract_lo};\n               nsimd_{simd_ext2}_v{typ} v1 = {extract_hi};\n               v0 = nsimd_shra_{simd_ext2}_{typ}(v0, {in1});\n               v1 = nsimd_shra_{simd_ext2}_{typ}(v1, {in1});\n               return {merge};'''. 
\\\n               format(simd_ext2=simd_ext2,\n                      extract_lo=extract(simd_ext, typ, LO, common.in0),\n                      extract_hi=extract(simd_ext, typ, HI, common.in0),\n                      merge=setr(simd_ext, typ, 'v0', 'v1'), **fmtspec)\n\n    trick_for_i8 = \\\n    '''__m128i count = _mm_set1_epi64x((i64){in1});\n       nsimd_{simd_ext}_vi16 lo, hi;\n       hi = {pre}andnot{sufsi}({pre}set1_epi16(255),\n                               {pre}sra_epi16({in0}, count));\n       lo = {pre}srli_epi16({pre}sra_epi16(\n                {pre}slli_epi16({in0}, 8), count), 8);\n       return {pre}or{sufsi}(hi, lo);'''.format(**fmtspec)\n\n    emulation = get_emulation_code('shra', ['v', 's'], simd_ext, typ)\n\n    if simd_ext in sse + ['avx2']:\n        if typ == 'i8':\n            return trick_for_i8\n        elif typ in ['i16', 'i32']:\n            return intrinsic\n        elif typ == 'i64':\n            return emulation\n    elif simd_ext == 'avx':\n        if typ in ['i8', 'i16', 'i32']:\n            return split\n        elif typ == 'i64':\n            return emulation\n    elif simd_ext == 'avx512_knl':\n        if typ in ['i8', 'i16']:\n            return split\n        elif typ in ['i32', 'i64']:\n            return intrinsic\n    elif simd_ext == 'avx512_skylake':\n        if typ == 'i8':\n            return trick_for_i8\n        elif typ in ['i16', 'i32', 'i64']:\n            return intrinsic\n\n# -----------------------------------------------------------------------------\n# set1 or splat function\n\ndef set1(simd_ext, typ):\n    if typ == 'f16':\n        return '''nsimd_{simd_ext}_vf16 ret;\n                  f32 f = nsimd_f16_to_f32({in0});\n                  ret.v0 = {pre}set1_ps(f);\n                  ret.v1 = {pre}set1_ps(f);\n                  return ret;'''.format(**fmtspec)\n    if simd_ext in sse + avx:\n        if typ == 'i64':\n            return 'return {pre}set1_epi64x({in0});'.format(**fmtspec)\n        if typ == 
'u64':\n            return '''union {{ u64 u; i64 i; }} buf;\n                      buf.u = {in0};\n                      return {pre}set1_epi64x(buf.i);'''.format(**fmtspec)\n    if typ in ['u8', 'u16', 'u32', 'u64']:\n        return '''union {{ {typ} u; i{typnbits} i; }} buf;\n                  buf.u = {in0};\n                  return {pre}set1{suf}(buf.i);'''.format(**fmtspec)\n    return 'return {pre}set1{suf}({in0});'.format(**fmtspec)\n\n# -----------------------------------------------------------------------------\n# set1l or splat function for logical\n\ndef set1l(simd_ext, typ):\n    if typ == 'f16':\n        return '''nsimd_{simd_ext}_vlf16 ret;\n                  ret.v0 = nsimd_set1l_{simd_ext}_f32({in0});\n                  ret.v1 = ret.v0;\n                  return ret;'''.format(**fmtspec)\n    if simd_ext in sse + avx:\n        if simd_ext in sse:\n            ones = '_mm_cmpeq_pd(_mm_setzero_pd(), _mm_setzero_pd())'\n        else:\n            ones = '_mm256_cmp_pd(_mm256_setzero_pd(), _mm256_setzero_pd(), ' \\\n                   '_CMP_EQ_OQ)'\n        if typ != 'f64':\n            ones = '{pre}castpd{sufsi}({ones})'.format(ones=ones, **fmtspec)\n        return '''if ({in0}) {{\n                    return {ones};\n                  }} else {{\n                    return {pre}setzero{sufsi}();\n                  }}'''.format(ones=ones, **fmtspec)\n    else:\n        return '''if ({in0}) {{\n                    return (__mmask{le})(~(__mmask{le})(0));\n                  }} else {{\n                    return (__mmask{le})(0);\n                  }}'''.format(**fmtspec)\n\n# -----------------------------------------------------------------------------\n# Equality\n\ndef eq2(simd_ext, typ):\n    if typ == 'f16':\n        return f16_cmp2('eq', simd_ext)\n    if simd_ext in sse:\n        if typ in ['i64', 'u64']:\n            if simd_ext == 'sse42':\n                return how_it_should_be_op2('cmpeq', simd_ext, typ)\n            else:\n                
return \\\n                '''__m128i t = _mm_cmpeq_epi32({in0}, {in1});\n                   return _mm_and_si128(t,\n                            _mm_shuffle_epi32(t, 177) /* = 2|3|0|1 */);'''. \\\n                            format(**fmtspec)\n        else:\n            return how_it_should_be_op2('cmpeq', simd_ext, typ)\n    if simd_ext in avx:\n        if typ in ['f32', 'f64']:\n            return 'return _mm256_cmp{suf}({in0}, {in1}, _CMP_EQ_OQ);'. \\\n                   format(**fmtspec)\n        else:\n            if simd_ext == 'avx2':\n                return how_it_should_be_op2('cmpeq', simd_ext, typ)\n            else:\n                return split_cmp2('eq', simd_ext, typ)\n    if simd_ext in avx512:\n        if typ in ['f32', 'f64']:\n            return 'return _mm512_cmp{suf}_mask({in0}, {in1}, _CMP_EQ_OQ);'. \\\n                   format(**fmtspec)\n        elif typ in ['i32', 'u32', 'i64', 'u64']:\n            return \\\n            'return _mm512_cmp{suf}_mask({in0}, {in1}, _MM_CMPINT_EQ);'. \\\n            format(**fmtspec)\n        else:\n            if simd_ext == 'avx512_skylake':\n                return \\\n                'return _mm512_cmp{suf}_mask({in0}, {in1}, _MM_CMPINT_EQ);'. \\\n                format(**fmtspec)\n            else:\n                return split_cmp2('eq', simd_ext, typ)\n\n# -----------------------------------------------------------------------------\n# not equal\n\ndef neq2(simd_ext, typ):\n    if typ == 'f16':\n        return f16_cmp2('ne', simd_ext)\n    if simd_ext in sse and typ in ['f32', 'f64']:\n        return how_it_should_be_op2('cmpneq', simd_ext, typ)\n    if simd_ext in avx and typ in ['f32', 'f64']:\n        return 'return _mm256_cmp{suf}({in0}, {in1}, _CMP_NEQ_UQ);'. \\\n               format(**fmtspec)\n    if simd_ext in avx512 and typ in ['f32', 'f64']:\n        return 'return _mm512_cmp{suf}_mask({in0}, {in1}, _CMP_NEQ_UQ);'. 
\\\n               format(**fmtspec)\n    noteq = '''return nsimd_notl_{simd_ext}_{typ}(\n                        nsimd_eq_{simd_ext}_{typ}({in0}, {in1}));'''. \\\n                        format(**fmtspec)\n    if simd_ext in avx512:\n        intrinsic = \\\n            'return _mm512_cmp{suf}_mask({in0}, {in1}, _MM_CMPINT_NE);'. \\\n            format(**fmtspec)\n        if typ in ['i32', 'u32', 'i64', 'u64']:\n            return intrinsic\n        else:\n            return intrinsic if  simd_ext == 'avx512_skylake' else noteq\n    return noteq\n\n# -----------------------------------------------------------------------------\n# Greater than\n\ndef gt2(simd_ext, typ):\n    if typ == 'f16':\n        return f16_cmp2('gt', simd_ext)\n    if simd_ext in sse:\n        if typ in ['f32', 'f64', 'i8', 'i16', 'i32']:\n            return how_it_should_be_op2('cmpgt', simd_ext, typ)\n        if typ == 'i64':\n            if simd_ext == 'sse42':\n                return how_it_should_be_op2('cmpgt', simd_ext, typ)\n            #return '''return _mm_sub_epi64(_mm_setzero_si128(), _mm_srli_epi64(\n            #                   _mm_sub_epi64({in1}, {in0}), 63));'''. \\\n            #                   format(**fmtspec)\n            return '''{typ} buf0[2], buf1[2];\n                      _mm_storeu_si128((__m128i*)buf0, {in0});\n                      _mm_storeu_si128((__m128i*)buf1, {in1});\n                      buf0[0] = -(buf0[0] > buf1[0]);\n                      buf0[1] = -(buf0[1] > buf1[1]);\n                      return _mm_loadu_si128((__m128i*)buf0);'''. \\\n                      format(**fmtspec)\n        return cmp2_with_add('gt', simd_ext, typ)\n    if simd_ext in avx:\n        if typ in ['f32', 'f64']:\n            return 'return _mm256_cmp{suf}({in0}, {in1}, _CMP_GT_OQ);'. 
\
                   format(**fmtspec)
        if typ in ['i8', 'i16', 'i32', 'i64']:
            if simd_ext == 'avx2':
                return how_it_should_be_op2('cmpgt', simd_ext, typ)
            else:
                return split_cmp2('gt', simd_ext, typ)
        if simd_ext == 'avx2':
            return cmp2_with_add('gt', simd_ext, typ)
        else:
            return split_cmp2('gt', simd_ext, typ)
    # AVX512
    if typ in ['f32', 'f64', 'i32', 'i64']:
        return \
        'return _mm512_cmp{suf}_mask({in0}, {in1}, {cte});'. \
        format(cte='_CMP_GT_OQ' if typ in ['f32', 'f64'] else '_MM_CMPINT_NLE',
               **fmtspec)
    if typ in ['u32', 'u64']:
        return \
        'return _mm512_cmp_epu{typ2}_mask({in0}, {in1}, _MM_CMPINT_NLE);'. \
        format(typ2=typ[1:], **fmtspec)
    if simd_ext == 'avx512_skylake':
        return \
        'return _mm512_cmp_ep{typ}_mask({in0}, {in1}, _MM_CMPINT_NLE);'. \
        format(**fmtspec)
    else:
        return split_cmp2('gt', simd_ext, typ)

# -----------------------------------------------------------------------------
# lesser than

def lt2(simd_ext, typ):
    '''C body for lt: implemented as gt with the two operands swapped.'''
    return 'return nsimd_gt_{simd_ext}_{typ}({in1}, {in0});'. \
           format(**fmtspec)

# -----------------------------------------------------------------------------
# greater or equal

def geq2(simd_ext, typ):
    '''C body for ge. Uses a native compare intrinsic when one exists for
       (simd_ext, typ); otherwise falls back to notl(lt(a, b)).'''
    if typ == 'f16':
        return f16_cmp2('ge', simd_ext)
    # Fallback: a >= b  <=>  !(a < b).
    notlt = '''return nsimd_notl_{simd_ext}_{typ}(
                        nsimd_lt_{simd_ext}_{typ}({in0}, {in1}));'''. \
            format(**fmtspec)
    if simd_ext in sse:
        if typ in ['f32', 'f64']:
            return how_it_should_be_op2('cmpge', simd_ext, typ)
    if simd_ext in avx:
        if typ in ['f32', 'f64']:
            return 'return _mm256_cmp{suf}({in0}, {in1}, _CMP_GE_OQ);'. \
                   format(**fmtspec)
    if simd_ext in avx512:
        if typ in ['i32', 'i64', 'u32', 'u64']:
            return \
              'return _mm512_cmp_ep{typ}_mask({in0}, {in1}, _MM_CMPINT_NLT);'. \
              format(**fmtspec)
        if typ in ['f32', 'f64']:
            return 'return _mm512_cmp{suf}_mask({in0}, {in1}, _CMP_GE_OQ);'. \
                   format(**fmtspec)
        if simd_ext == 'avx512_skylake':
            return \
            'return _mm512_cmp_ep{typ}_mask({in0}, {in1}, _MM_CMPINT_NLT);'. \
            format(**fmtspec)
        else:
            return notlt
    return notlt

# -----------------------------------------------------------------------------
# lesser or equal

def leq2(simd_ext, typ):
    '''C body for le. Native compare intrinsic when available, otherwise
       the fallback notl(gt(a, b)).'''
    if typ == 'f16':
        return f16_cmp2('le', simd_ext)
    # Fallback: a <= b  <=>  !(a > b).
    notgt = '''return nsimd_notl_{simd_ext}_{typ}(
                        nsimd_gt_{simd_ext}_{typ}({in0}, {in1}));'''. \
                        format(**fmtspec)
    if simd_ext in sse and typ in ['f32', 'f64']:
        return 'return _mm_cmple{suf}({in0}, {in1});'.format(**fmtspec)
    if simd_ext in avx and typ in ['f32', 'f64']:
            return 'return _mm256_cmp{suf}({in0}, {in1}, _CMP_LE_OQ);'. \
                   format(**fmtspec)
    if simd_ext in avx512:
        if typ in ['i32', 'i64', 'u32', 'u64']:
            return \
              'return _mm512_cmp_ep{typ}_mask({in0}, {in1}, _MM_CMPINT_LE);'. \
              format(**fmtspec)
        if typ in ['f32', 'f64']:
            return 'return _mm512_cmp{suf}_mask({in0}, {in1}, _CMP_LE_OQ);'. \
                   format(**fmtspec)
        if simd_ext == 'avx512_skylake':
            return \
            'return _mm512_cmp_ep{typ}_mask({in0}, {in1}, _MM_CMPINT_LE);'. 
\
                       format(**fmtspec)
            else:
                return '''int i;
                          {typ} buf0[{le}], buf1[{le}];
                          _mm512_storeu_si512(buf0, {in1});
                          _mm512_storeu_si512(buf1, {in2});
                          for (i = 0; i < {le}; i++) {{
                            if ((({in0} >> i) & 1) == 0) {{
                              buf0[i] = buf1[i];
                            }}
                          }}
                          return _mm512_loadu_si512(buf0);'''.format(**fmtspec)

# -----------------------------------------------------------------------------
# min and max functions

def minmax(func, simd_ext, typ):
    '''C body for min/max (func is 'min' or 'max'). Uses native
       min/max intrinsics when they exist for (simd_ext, typ), otherwise
       falls back to if_else1 over a gt comparison.'''
    if typ in ['f16', 'f32', 'f64']:
        return how_it_should_be_op2(func, simd_ext, typ)
    # Fallback: max(a, b) = gt(a, b) ? a : b, min swaps the compare operands.
    with_if_else = '''return nsimd_if_else1_{simd_ext}_{typ}(
                               nsimd_gt_{simd_ext}_{typ}(
                                 {args}), {in0}, {in1});'''. \
                   format(args = '{in0}, {in1}'.format(**fmtspec)
                            if func == 'max'
                            else '{in1}, {in0}'.format(**fmtspec), **fmtspec)
    if simd_ext in sse:
        if typ in ['u8', 'i16']:
            return 'return _mm_{func}_ep{typ}({in0}, {in1});'. \
                   format(func=func, **fmtspec)
        if typ in ['i8', 'u16', 'i32', 'u32']:
            if simd_ext == 'sse42':
                return 'return _mm_{func}_ep{typ}({in0}, {in1});'. \
                       format(func=func, **fmtspec)
            else:
                return with_if_else
    if simd_ext in avx and typ in ['i8', 'u8', 'i16', 'u16', 'i32', 'u32']:
        if simd_ext == 'avx2':
            return 'return _mm256_{func}_ep{typ}({in0}, {in1});'. \
                   format(func=func, **fmtspec)
        else:
            return split_op2(func, simd_ext, typ)
    if simd_ext in avx512:
        if typ in ['i32', 'u32', 'i64', 'u64']:
            return 'return _mm512_{func}_ep{typ}({in0}, {in1});'. \
                   format(func=func, **fmtspec)
        else:
            if simd_ext == 'avx512_skylake':
                return 'return _mm512_{func}_ep{typ}({in0}, {in1});'. \
                       format(func=func, **fmtspec)
            else:
                return split_op2(func, simd_ext, typ)
    return with_if_else

# -----------------------------------------------------------------------------
# sqrt

def sqrt1(simd_ext, typ):
    '''C body for sqrt; f16 is handled as two f32 halves (.v0/.v1).'''
    if typ == 'f16':
        return '''nsimd_{simd_ext}_vf16 ret;
                  ret.v0 = {pre}sqrt_ps({in0}.v0);
                  ret.v1 = {pre}sqrt_ps({in0}.v1);
                  return ret;'''.format(**fmtspec)
    return 'return {pre}sqrt{suf}({in0});'.format(**fmtspec)

# -----------------------------------------------------------------------------
# Load logical

def loadl(simd_ext, typ, aligned):
    '''C body for loadl{a,u}: load a vector of booleans from memory
       (nonzero element -> true lane).'''
    if simd_ext in avx512:
        if typ == 'f16':
            return '''/* This can surely be improved but it is not our
                         priority. Note that we take advantage of the fact that
                         floating zero is represented as integer zero to
                         simplify code. */
                      nsimd_{simd_ext}_vlf16 ret;
                      __mmask32 tmp = nsimd_loadlu_{simd_ext}_u16((u16*){in0});
                      ret.v0 = (__mmask16)(tmp & 0xFFFF);
                      ret.v1 = (__mmask16)((tmp >> 16) & 0xFFFF);
                      return ret;'''.format(**fmtspec)
        return '''/* This can surely be improved but it is not our priority. 
*/
                  int i;
                  __mmask{le} ret = 0;
                  for (i = 0; i < {le}; i++) {{
                    if ({in0}[i] != ({typ})0) {{
                      ret |= (__mmask{le})((__mmask{le})1 << i);
                    }}
                  }}
                  return ret;'''.format(**fmtspec)
    return \
    '''/* This can surely be improved but it is not our priority. */
       return nsimd_notl_{simd_ext}_{typ}(nsimd_eq_{simd_ext}_{typ}(
                nsimd_load{align}_{simd_ext}_{typ}(
                  {in0}), nsimd_set1_{simd_ext}_{typ}({zero})));'''. \
       format(align='a' if aligned else 'u',
              zero = 'nsimd_f32_to_f16(0.0f)' if typ == 'f16'
              else '({})0'.format(typ), **fmtspec)

# -----------------------------------------------------------------------------
# Store logical

def storel(simd_ext, typ, aligned):
    '''C body for storel{a,u}: store a vector of booleans to memory as
       one/zero values of typ.'''
    if simd_ext in avx512:
        if typ == 'f16':
            return '''/* This can surely be improved but it is not our
                         priority. Note that we take advantage of the fact that
                         floating zero is represented as integer zero to
                         simplify code. */
                      int i;
                      u16 one = 0x3C00; /* FP16 IEEE754 representation of 1 */
                      for (i = 0; i < 16; i++) {{
                        ((u16*){in0})[i] = (u16)((({in1}.v0 >> i) & 1) ? one
                                                                       : 0);
                      }}
                      for (i = 0; i < 16; i++) {{
                        ((u16*){in0})[i + 16] = (u16)((({in1}.v1 >> i) & 1)
                                                      ? one : 0);
                      }}'''.format(**fmtspec)
        return '''/* This can surely be improved but it is not our priority. 
*/
                  int i;
                  for (i = 0; i < {le}; i++) {{
                    {in0}[i] = ({typ})((({in1} >> i) & 1) ? 1 : 0);
                  }}'''.format(**fmtspec)
    return \
    '''/* This can surely be improved but it is not our priority. */
       nsimd_store{align}_{simd_ext}_{typ}({in0},
         nsimd_if_else1_{simd_ext}_{typ}({in1},
           nsimd_set1_{simd_ext}_{typ}({one}),
           nsimd_set1_{simd_ext}_{typ}({zero})));'''. \
           format(align = 'a' if aligned else 'u',
                  one = 'nsimd_f32_to_f16(1.0f)' if typ == 'f16'
                  else '({})1'.format(typ),
                  zero = 'nsimd_f32_to_f16(0.0f)' if typ == 'f16'
                  else '({})0'.format(typ), **fmtspec)

# -----------------------------------------------------------------------------
# Absolute value

def abs1(simd_ext, typ):
    '''C body for abs. Floats mask off the sign bit, unsigned ints are a
       no-op, signed ints use native abs intrinsics when available or
       bit-twiddling/split fallbacks.'''
    def mask(typ):
        # All-ones except the sign bit: 0x7F, 0x7FFF, 0x7FFFFFFF, ...
        return '0x7F' + ('F' * int(((int(typ[1:]) - 8) // 4)))
    if typ == 'f16':
        return \
        '''nsimd_{simd_ext}_vf16 ret;
           nsimd_{simd_ext}_vf32 mask = {pre}castsi{nbits}_ps(
                                          nsimd_set1_{simd_ext}_u32({mask}));
           ret.v0 = nsimd_andb_{simd_ext}_f32({in0}.v0, mask);
           ret.v1 = nsimd_andb_{simd_ext}_f32({in0}.v1, mask);
           return ret;'''.format(mask=mask('f32'), **fmtspec)
    if typ in ['u8', 'u16', 'u32', 'u64']:
        return 'return {in0};'.format(**fmtspec)
    if typ in ['f32', 'f64']:
        return \
        '''nsimd_{simd_ext}_v{typ} mask = {pre}castsi{nbits}{suf}(
               nsimd_set1_{simd_ext}_u{typnbits}({mask}));
           return nsimd_andb_{simd_ext}_{typ}({in0}, mask);'''. \
           format(mask=mask(typ), **fmtspec)
    # abs(x) = (x + (x >> (n-1))) ^ (x >> (n-1)) with an arithmetic shift.
    bit_twiddling_arith_shift = \
    '''nsimd_{simd_ext}_v{typ} mask = {pre}srai{suf}({in0}, {typnbitsm1});
       return {pre}xor{sufsi}({pre}add{suf}({in0}, mask), mask);'''. \
       format(typnbitsm1=int(typ[1:]) - 1, **fmtspec)
    # Same trick but builds the mask as 0 - (x >>(logical) (n-1)) for types
    # with no arithmetic-shift intrinsic.
    bit_twiddling_no_arith_shift = \
    '''nsimd_{simd_ext}_v{typ} mask = {pre}sub{suf}({pre}setzero{sufsi}(),
                                        nsimd_shr_{simd_ext}_{typ}(
                                          {in0}, {typnbitsm1}));
       return {pre}xor{sufsi}({pre}add{suf}({in0}, mask), mask);'''. \
       format(typnbitsm1=int(typ[1:]) - 1, **fmtspec)
    with_blendv = \
    '''return _mm256_castpd_si256(_mm256_blendv_pd(
        _mm256_castsi256_pd({in0}),
        _mm256_castsi256_pd(_mm256_sub_epi64(_mm256_setzero_si256(), {in0})),
        _mm256_castsi256_pd({in0})));'''.format(**fmtspec)
    if simd_ext in sse:
        if typ in ['i16', 'i32']:
            if simd_ext == 'sse42':
                return 'return _mm_abs{suf}({in0});'.format(**fmtspec)
            else:
                return bit_twiddling_arith_shift
        if typ == 'i8':
            if simd_ext == 'sse42':
                return 'return _mm_abs{suf}({in0});'.format(**fmtspec)
            else:
                return bit_twiddling_no_arith_shift
        if typ == 'i64':
            return bit_twiddling_no_arith_shift
    if simd_ext in avx:
        if typ in ['i8', 'i16', 'i32']:
            if simd_ext == 'avx2':
                return 'return _mm256_abs{suf}({in0});'.format(**fmtspec)
            else:
                return split_opn('abs', simd_ext, typ, 1)
        else:
            if simd_ext == 'avx2':
                return with_blendv
            else:
                return split_opn('abs', simd_ext, typ, 1)
    if simd_ext in avx512:
        if typ in ['i32', 'i64']:
            return 'return _mm512_abs{suf}({in0});'.format(**fmtspec)
        else:
            if simd_ext == 'avx512_skylake':
                return 'return _mm512_abs{suf}({in0});'.format(**fmtspec)
            else:
                return split_opn('abs', simd_ext, typ, 1)

# 
-----------------------------------------------------------------------------
# FMA and FMS

def fma_fms(func, simd_ext, typ):
    '''C body for fma/fms/fnma/fnms. Uses hardware FMA intrinsics when
       available (guarded by NSIMD_FMA on SSE/AVX), otherwise emulates with
       mul/add/sub (and neg for the fnma/fnms variants).'''
    op = 'add' if func in ['fma', 'fnma'] else 'sub'
    neg = 'n' if func in ['fnma', 'fnms'] else ''
    if typ == 'f16':
        return \
        '''nsimd_{simd_ext}_vf16 ret;
           ret.v0 = nsimd_{func}_{simd_ext}_f32({in0}.v0, {in1}.v0, {in2}.v0);
           ret.v1 = nsimd_{func}_{simd_ext}_f32({in0}.v1, {in1}.v1, {in2}.v1);
           return ret;'''.format(neg=neg, func=func, **fmtspec)
    if neg == '':
        emulate = '''return nsimd_{op}_{simd_ext}_{typ}(
                              nsimd_mul_{simd_ext}_{typ}({in0}, {in1}),
                                {in2});'''.format(op=op, **fmtspec)
    else:
        emulate = '''return nsimd_{op}_{simd_ext}_{typ}(
                              nsimd_mul_{simd_ext}_{typ}(
                                nsimd_neg_{simd_ext}_{typ}({in0}), {in1}),
                                    {in2});'''.format(op=op, **fmtspec)
    # One could use only emulate and no split. But to avoid splitting and
    # merging SIMD register for each operation: sub, mul and add, we use
    # emulation only for SIMD extensions that have natively add, sub and mul
    # intrinsics.
    split = split_opn(func, simd_ext, typ, 3)
    if typ in ['f32', 'f64']:
        if simd_ext in sse + avx:
            # NOTE(review): '# else' / '# endif' with a space after '#' is
            # valid C preprocessor syntax, though unusual-looking.
            return '''#ifdef NSIMD_FMA
                        return {pre}f{neg}m{op}{suf}({in0}, {in1}, {in2});
                      # else
                        {emulate}
                      # endif'''.format(op=op, neg=neg, emulate=emulate,
                                       **fmtspec)
        else:
            return 'return {pre}f{neg}m{op}{suf}({in0}, {in1}, {in2});'. \
                   format(op=op, neg=neg, **fmtspec)
    if simd_ext in avx:
        return emulate if simd_ext == 'avx2' else split
    if simd_ext in avx512:
        return emulate if simd_ext == 'avx512_skylake' else split
    return emulate

# -----------------------------------------------------------------------------
# Ceil and floor

def round1(opts, func, simd_ext, typ):
    '''C body for ceil/floor (func). Integers are returned unchanged;
       SSE2 has no round intrinsic so it goes through emulate_op1.'''
    if typ == 'f16':
        return '''nsimd_{simd_ext}_vf16 ret;
                  ret.v0 = nsimd_{func}_{simd_ext}_f32({in0}.v0);
                  ret.v1 = nsimd_{func}_{simd_ext}_f32({in0}.v1);
                  return ret;'''.format(func=func, **fmtspec)
    if typ in ['f32', 'f64']:
        normal = 'return {pre}{func}{suf}({in0});'.format(func=func, **fmtspec)
        if simd_ext not in sse:
            return normal
        if simd_ext == 'sse42':
            return normal
        else:
            return emulate_op1(opts, func, simd_ext, typ)
    return 'return {in0};'.format(**fmtspec)

# -----------------------------------------------------------------------------
# Trunc

def trunc1(opts, simd_ext, typ):
    '''C body for trunc (round toward zero). Integers are returned
       unchanged; AVX512 computes floor for positive lanes and ceil for
       the others via if_else1.'''
    if typ == 'f16':
        return '''nsimd_{simd_ext}_vf16 ret;
                  ret.v0 = nsimd_trunc_{simd_ext}_f32({in0}.v0);
                  ret.v1 = nsimd_trunc_{simd_ext}_f32({in0}.v1);
                  return ret;'''.format(**fmtspec)
    if typ in ['f32', 'f64']:
        normal = '''return {pre}round{suf}({in0}, _MM_FROUND_TO_ZERO |
                               _MM_FROUND_NO_EXC);'''.format(**fmtspec)
        if simd_ext == 'sse2':
            return emulate_op1(opts, 'trunc', simd_ext, typ)
        if simd_ext == 'sse42':
            return normal
        if simd_ext in avx:
            return normal
        if simd_ext in avx512:
            return \
            '''__mmask{le} cond = nsimd_gt_{simd_ext}_{typ}(
                                    {in0}, _mm512_setzero{sufsi}());
               return nsimd_if_else1_{simd_ext}_{typ}(cond,
                        nsimd_floor_{simd_ext}_{typ}({in0}),
                          nsimd_ceil_{simd_ext}_{typ}({in0}));'''. \
                          format(**fmtspec)
    return 'return {in0};'.format(**fmtspec)

# -----------------------------------------------------------------------------
# Round to even

def round_to_even1(opts, simd_ext, typ):
    '''C body for round-to-nearest-even. Integers are returned unchanged;
       SSE2 goes through emulate_op1.'''
    if typ == 'f16':
        return '''nsimd_{simd_ext}_vf16 ret;
                  ret.v0 = nsimd_round_to_even_{simd_ext}_f32({in0}.v0);
                  ret.v1 = nsimd_round_to_even_{simd_ext}_f32({in0}.v1);
                  return ret;'''.format(**fmtspec)
    if typ in ['f32', 'f64']:
        normal = '''return {pre}round{suf}({in0}, _MM_FROUND_TO_NEAREST_INT |
                               _MM_FROUND_NO_EXC);'''.format(**fmtspec)
        if simd_ext == 'sse2':
            return emulate_op1(opts, 'round_to_even', simd_ext, typ)
        if simd_ext == 'sse42':
            return normal
        if simd_ext in avx:
            return normal
        if simd_ext in avx512:
            return 'return _mm512_roundscale{suf}({in0}, 0);'.format(**fmtspec)
    return 'return {in0};'.format(**fmtspec)

# -----------------------------------------------------------------------------
# All and any functions

def all_any(func, simd_ext, typ):
    '''C body for all/any reductions over a vector of booleans (func is
       'all' or 'any'), built on movemask for SSE/AVX and on the mask
       register itself for AVX512.'''
    if typ == 'f16':
        return \
        '''return nsimd_{func}_{simd_ext}_f32({in0}.v0) {and_or}
                  nsimd_{func}_{simd_ext}_f32({in0}.v1);'''. \
                  format(func=func, and_or='&&' if func == 'all' else '||',
                         **fmtspec)
    if simd_ext in sse:
        if typ in common.iutypes:
            return 'return (u32)_mm_movemask_epi8({in0}) {test};'. 
\
                   format(test='== 0xFFFF' if func == 'all' else '!= 0u',
                          **fmtspec)
        else:
            # movemask_ps/pd yields one bit per lane: 4 bits for f32,
            # 2 bits for f64.
            mask = '0xF' if typ == 'f32' else '0x3'
            return 'return (u32)_mm_movemask{suf}({in0}) {test};'. \
                   format(test='== ' + mask if func == 'all' else '!= 0u',
                          **fmtspec)
    if simd_ext in avx:
        if typ in common.iutypes:
            if simd_ext == 'avx2':
                return 'return _mm256_movemask_epi8({in0}) {test};'. \
                       format(test='== -1' if func == 'all' else '!= 0',
                              **fmtspec)
            else:
                # AVX1 has no integer movemask: reduce each 128-bit half
                # with the SSE4.2 implementation.
                return \
                '''nsimd_sse42_v{typ} lo = {extract_lo};
                   nsimd_sse42_v{typ} hi = {extract_hi};
                   return nsimd_{func}_sse42_{typ}(lo) {and_or}
                          nsimd_{func}_sse42_{typ}(hi);'''. \
                   format(extract_lo=extract(simd_ext, typ, LO, common.in0),
                          extract_hi=extract(simd_ext, typ, HI, common.in0),
                          func=func, and_or='&&' if func == 'all' else '||',
                          **fmtspec)
        else:
            mask = '0xFF' if typ == 'f32' else '0xF'
            return 'return _mm256_movemask{suf}({in0}) {test};'. \
                   format(test='== ' + mask if func == 'all' else '!= 0',
                          **fmtspec)
    if simd_ext in avx512:
        # One mask bit per lane: 512 / typnbits bits, written in hex.
        all_test = '== 0x' + ('F' * int((512 // int(typ[1:]) // 4)))
        return 'return {in0} {test};'. \
               format(test=all_test if func == 'all' else '!= 0', **fmtspec)

# -----------------------------------------------------------------------------
# Reinterpret (bitwise_cast)

def reinterpret1(simd_ext, from_typ, to_typ):
    '''C body for reinterpret (bit-preserving cast) between from_typ and
       to_typ. f16 conversions use cvtph/cvtps intrinsics when NSIMD_FP16
       is defined, otherwise a store/load round trip through memory.'''
    if from_typ == to_typ:
        return 'return {in0};'.format(**fmtspec)
    if to_typ == 'f16':
        emulate = '''{from_typ} buf[{le}];
                     nsimd_storeu_{simd_ext}_{from_typ}(buf, {in0});
                     return nsimd_loadu_{simd_ext}_f16((f16*)buf);'''. \
                     format(**fmtspec)
        native = '''nsimd_{simd_ext}_vf16 ret;
                    ret.v0 = {pre}cvtph_ps({extract_lo});
                    ret.v1 = {pre}cvtph_ps({extract_hi});
                    return ret;'''.format(
                    extract_lo=extract(simd_ext, 'u16', LO, common.in0),
                    extract_hi=extract(simd_ext, 'u16', HI, common.in0),
                    **fmtspec)
        if simd_ext in sse:
            return \
            '''#ifdef NSIMD_FP16
                 nsimd_{simd_ext}_vf16 ret;
                 ret.v0 = _mm_cvtph_ps({in0});
                 {in0} = _mm_shuffle_epi32({in0}, 14); /* = (3 << 2) | (2 << 0) */
                 ret.v1 = _mm_cvtph_ps({in0});
                 return ret;
               #else
                 {emulate}
               #endif'''.format(emulate=emulate, **fmtspec)
        if simd_ext in avx:
            return \
            '''#ifdef NSIMD_FP16
                 {}
               #else
                 {}
               #endif'''.format(native, emulate)
        if simd_ext in avx512:
            return native
    if from_typ == 'f16':
        emulate = \
        '''u16 buf[{le}];
           nsimd_storeu_{simd_ext}_f16((f16*)buf, {in0});
           return nsimd_loadu_{simd_ext}_{to_typ}(({to_typ}*)buf);'''. 
\
           format(**fmtspec)
        native = 'return {};'.format(setr(simd_ext, 'u16',
                 '{pre}cvtps_ph({in0}.v0, 4)'.format(**fmtspec),
                 '{pre}cvtps_ph({in0}.v1, 4)'.format(**fmtspec)))
        if simd_ext in sse:
            return \
            '''#ifdef NSIMD_FP16
                 __m128i lo = _mm_cvtps_ph({in0}.v0, 4);
                 __m128i hi = _mm_cvtps_ph({in0}.v1, 4);
                 return _mm_castpd_si128(_mm_shuffle_pd(
                          _mm_castsi128_pd(lo), _mm_castsi128_pd(hi), 0));
               #else
                 {emulate}
               #endif'''.format(emulate=emulate, **fmtspec)
        if simd_ext in avx:
            return \
            '''#ifdef NSIMD_FP16
                 {}
               #else
                 {}
               #endif'''.format(native, emulate)
        if simd_ext in avx512:
            return native
    if from_typ in common.iutypes and to_typ in common.iutypes:
        return 'return {in0};'.format(**fmtspec)
    if to_typ in ['f32', 'f64']:
        return 'return {pre}castsi{nbits}{to_suf}({in0});'. \
               format(to_suf=suf_ep(to_typ), **fmtspec)
    if from_typ in ['f32', 'f64']:
        return 'return {pre}cast{from_suf}_si{nbits}({in0});'. \
               format(from_suf=suf_ep(from_typ)[1:], **fmtspec)

# -----------------------------------------------------------------------------
# Reinterpretl, i.e. reinterpret on logicals

def reinterpretl1(simd_ext, from_typ, to_typ):
    '''C body for reinterpretl: reinterpret between vectors of booleans.
       Only the f16 cases need real work (the f16 logical type is a pair
       of f32-sized halves); other cases defer to reinterpret1 or are
       no-ops on AVX512 mask registers.'''
    if from_typ == to_typ:
        return 'return {in0};'.format(**fmtspec)
    if to_typ == 'f16':
        if simd_ext in sse:
            # Widen each 16-bit lane to 32 bits by duplicating it.
            return \
            '''nsimd_{simd_ext}_vlf16 ret;
               ret.v0 = _mm_castsi128_ps(_mm_unpacklo_epi16({in0}, {in0}));
               ret.v1 = _mm_castsi128_ps(_mm_unpackhi_epi16({in0}, {in0}));
               return ret;'''.format(**fmtspec)
        if simd_ext == 'avx':
            return \
            '''nsimd_{simd_ext}_vlf16 ret;
               nsimd_sse42_vlf16 tmp1 =
                   nsimd_reinterpretl_sse42_f16_{from_typ}(
                     _mm256_castsi256_si128({in0}));
               nsimd_sse42_vlf16 tmp2 =
                   nsimd_reinterpretl_sse42_f16_{from_typ}(
                      _mm256_extractf128_si256({in0}, 1));
               ret.v0 = {setr_tmp1};
               ret.v1 = {setr_tmp2};
               return ret;'''. \
               format(setr_tmp1=setr('avx', 'f32', 'tmp1.v0', 'tmp1.v1'),
                      setr_tmp2=setr('avx', 'f32', 'tmp2.v0', 'tmp2.v1'),
                      **fmtspec)
        if simd_ext == 'avx2':
            return \
            '''nsimd_{simd_ext}_vlf16 ret;
               ret.v0 = _mm256_castsi256_ps(_mm256_cvtepi16_epi32(
                          _mm256_castsi256_si128({in0})));
               ret.v1 = _mm256_castsi256_ps(_mm256_cvtepi16_epi32(
                          _mm256_extractf128_si256({in0}, 1)));
               return ret;'''.format(**fmtspec)
        if simd_ext in avx512:
            return '''nsimd_{simd_ext}_vlf16 ret;
                      ret.v0 = (__mmask16)({in0} & 0xFFFF);
                      ret.v1 = (__mmask16)(({in0} >> 16) & 0xFFFF);
                      return ret;'''.format(**fmtspec)
    if from_typ == 'f16':
        if simd_ext in sse + avx:
            return '''f32 in[{le}];
                      {to_typ} 
out[{le}];\n                      int i;\n                      nsimd_storeu_{simd_ext}_f32(in, {in0}.v0);\n                      nsimd_storeu_{simd_ext}_f32(in + {leo2}, {in0}.v1);\n                      for (i = 0; i < {le}; i++) {{\n                        out[i] = ({to_typ})(in[i] != 0.0f ? -1 : 0);\n                      }}\n                      return nsimd_loadu_{simd_ext}_{to_typ}(out);'''. \\\n                      format(leo2=int(fmtspec['le']) // 2, **fmtspec)\n        if simd_ext in avx512:\n            return \\\n            'return (__mmask32){in0}.v0 | ((__mmask32){in0}.v1 << 16);'. \\\n            format(**fmtspec)\n    if simd_ext in sse + avx:\n        return reinterpret1(simd_ext, from_typ, to_typ)\n    else:\n        return 'return {in0};'.format(**fmtspec)\n\n# -----------------------------------------------------------------------------\n# Convert\n\ndef convert1(simd_ext, from_typ, to_typ):\n    if to_typ == from_typ or \\\n       to_typ in common.iutypes and from_typ in common.iutypes:\n        return 'return {in0};'.format(**fmtspec)\n    if to_typ == 'f16':\n        if simd_ext in sse:\n            getlo = '{in0}'.format(**fmtspec)\n            gethi = '_mm_unpackhi_epi64({in0}, {in0})'.format(**fmtspec)\n        if simd_ext in avx:\n            getlo = '_mm256_castsi256_si128({in0})'.format(**fmtspec)\n            gethi = '_mm256_extractf128_si256({in0}, 1)'.format(**fmtspec)\n        if simd_ext in avx512:\n            getlo = '_mm512_castsi512_si256({in0})'.format(**fmtspec)\n            gethi = '_mm512_extracti64x4_epi64({in0}, 1)'.format(**fmtspec)\n        through_epi32 = \\\n        '''nsimd_{simd_ext}_v{to_typ} ret;\n           ret.v0 = {pre}cvtepi32_ps({pre}cvtep{from_typ}_epi32({getlo}));\n           ret.v1 = {pre}cvtepi32_ps({pre}cvtep{from_typ}_epi32({gethi}));\n           return ret;'''.format(getlo=getlo, gethi=gethi, **fmtspec)\n        emulate = '''{from_typ} in[{le}];\n                     f32 out[{leo2}];\n               
      nsimd_{simd_ext}_vf16 ret;\n                     int i;\n                     nsimd_storeu_{simd_ext}_{from_typ}(in, {in0});\n                     for (i = 0; i < {leo2}; i++) {{\n                       out[i] = (f32)in[i];\n                     }}\n                     ret.v0 = nsimd_loadu_{simd_ext}_f32(out);\n                     for (i = 0; i < {leo2}; i++) {{\n                       out[i] = (f32)in[i + {leo2}];\n                     }}\n                     ret.v1 = nsimd_loadu_{simd_ext}_f32(out);\n                     return ret;'''.format(leo2=int(fmtspec['le']) // 2,\n                                           **fmtspec)\n        if simd_ext in ['sse42', 'avx2']:\n            return through_epi32\n        if simd_ext in ['sse2', 'avx']:\n            return emulate\n        if simd_ext in avx512:\n            return through_epi32\n    if from_typ == 'f16':\n        return '''f32 in[{leo2}];\n                  {to_typ} out[{le}];\n                  int i;\n                  nsimd_storeu_{simd_ext}_f32(in, {in0}.v0);\n                  for (i = 0; i < {leo2}; i++) {{\n                    out[i] = ({to_typ})in[i];\n                  }}\n                  nsimd_storeu_{simd_ext}_f32(in, {in0}.v1);\n                  for (i = 0; i < {leo2}; i++) {{\n                    out[i + {leo2}] = ({to_typ})in[i];\n                  }}\n                  return nsimd_loadu_{simd_ext}_{to_typ}(out);'''. \\\n                  format(leo2=int(fmtspec['le']) // 2, **fmtspec)\n    emulate = '''{from_typ} in[{le}];\n                 {to_typ} out[{le}];\n                 int i;\n                 nsimd_storeu_{simd_ext}_{from_typ}(in, {in0});\n                 for (i = 0; i < {le}; i++) {{\n                   out[i] = ({to_typ})in[i];\n                 }}\n                 return nsimd_loadu_{simd_ext}_{to_typ}(out);'''. 
\\\n                 format(**fmtspec)\n    if to_typ == 'f64' or from_typ == 'f64':\n        if simd_ext == 'avx512_skylake':\n            return 'return _mm512_cvt{from_suf}{to_suf}({in0});'. \\\n                   format(from_suf=suf_ep(from_typ)[1:], to_suf=suf_ep(to_typ),\n                          **fmtspec)\n        else:\n            return emulate\n    if to_typ == 'f32' and from_typ == 'i32':\n        return 'return {pre}cvtepi32_ps({in0});'.format(**fmtspec)\n    if to_typ == 'f32' and from_typ == 'u32':\n        if simd_ext in sse + avx:\n            return emulate\n        if simd_ext in avx512:\n            return 'return _mm512_cvtepu32_ps({in0});'.format(**fmtspec)\n    if to_typ == 'i32' and from_typ == 'f32':\n        return 'return {pre}cvtps_epi32({in0});'.format(**fmtspec)\n    if to_typ == 'u32' and from_typ == 'f32':\n        if simd_ext in sse + avx:\n            return emulate\n        if simd_ext in avx512:\n            return 'return _mm512_cvtps_epu32({in0});'.format(**fmtspec)\n\n# -----------------------------------------------------------------------------\n# Reciprocal (at least 11 bits of precision)\n\ndef rec11_rsqrt11(func, simd_ext, typ):\n    if typ == 'f16':\n        return '''nsimd_{simd_ext}_vf16 ret;\n                  ret.v0 = nsimd_{func}11_{simd_ext}_f32({in0}.v0);\n                  ret.v1 = nsimd_{func}11_{simd_ext}_f32({in0}.v1);\n                  return ret;'''. \\\n                  format(func='rec' if func == 'rcp' else 'rsqrt', **fmtspec)\n    if typ == 'f32':\n        if simd_ext in sse + avx:\n            return 'return {pre}{func}_ps({in0});'.format(func=func, **fmtspec)\n        if simd_ext in avx512:\n            return 'return _mm512_{func}14_ps({in0});'. 
\\\n                   format(func=func, **fmtspec)\n    if typ == 'f64':\n        if simd_ext in sse + avx:\n            one = '{pre}set1_pd(1.0)'.format(**fmtspec)\n            if func == 'rcp':\n                return 'return {pre}div{suf}({one}, {in0});'.format(one=one, **fmtspec)\n            else:\n                return 'return {pre}div{suf}({one}, {pre}sqrt{suf}({in0}));'. \\\n                        format(one=one, **fmtspec)\n            format(func=func, **fmtspec)\n        if simd_ext in avx512:\n            return 'return _mm512_{func}14_pd({in0});'. \\\n                   format(func=func, **fmtspec)\n\n# -----------------------------------------------------------------------------\n# Reciprocal (IEEE)\n\ndef rec1(simd_ext, typ):\n    one = '{pre}set1_ps(1.0f)'.format(**fmtspec) if typ in ['f16', 'f32'] \\\n          else '{pre}set1_pd(1.0)'.format(**fmtspec)\n    if typ == 'f16':\n        return '''nsimd_{simd_ext}_vf16 ret;\n                  nsimd_{simd_ext}_vf32 one = {one};\n                  ret.v0 = {pre}div_ps(one, {in0}.v0);\n                  ret.v1 = {pre}div_ps(one, {in0}.v1);\n                  return ret;'''.format(one=one, **fmtspec)\n    return 'return {pre}div{suf}({one}, {in0});'.format(one=one, **fmtspec)\n\n# -----------------------------------------------------------------------------\n# Negative\n\ndef neg1(simd_ext, typ):\n    cte = '0x80000000' if typ in ['f16', 'f32'] else '0x8000000000000000'\n    fsuf = '_ps' if typ in ['f16', 'f32'] else '_pd'\n    utyp = 'u32' if typ in ['f16', 'f32'] else 'u64'\n    vmask = '{pre}castsi{nbits}{fsuf}(nsimd_set1_{simd_ext}_{utyp}({cte}))'. 
\
            format(cte=cte, utyp=utyp, fsuf=fsuf, **fmtspec)
    if typ == 'f16':
        return '''nsimd_{simd_ext}_vf16 ret;
                  nsimd_{simd_ext}_vf32 mask = {vmask};
                  ret.v0 = nsimd_xorb_{simd_ext}_f32(mask, {in0}.v0);
                  ret.v1 = nsimd_xorb_{simd_ext}_f32(mask, {in0}.v1);
                  return ret;'''.format(vmask=vmask, **fmtspec)
    if typ in ['f32', 'f64']:
        return 'return nsimd_xorb_{simd_ext}_{typ}({vmask}, {in0});'. \
               format(vmask=vmask, **fmtspec)
    return '''return nsimd_sub_{simd_ext}_{typ}(
                  {pre}setzero_si{nbits}(), {in0});'''. \
              format(**fmtspec)

# -----------------------------------------------------------------------------
# nbtrue

def nbtrue1(simd_ext, typ):
    '''C body for nbtrue: count the number of true lanes in a vector of
       booleans, via popcount of a movemask (SSE/AVX) or of the mask
       register (AVX512).'''
    if typ == 'f16':
        return '''return nsimd_nbtrue_{simd_ext}_f32({in0}.v0) +
                         nsimd_nbtrue_{simd_ext}_f32({in0}.v1);'''. \
                         format(**fmtspec)
    if typ in ['i8', 'u8']:
        code = 'return nsimd_popcnt32_((u32){pre}movemask_epi8({in0}));'. \
               format(**fmtspec)
    elif typ in ['i16', 'u16']:
        # movemask_epi8 sets two bits per 16-bit lane, hence the >> 1 on
        # the popcount.
        code = 'return nsimd_popcnt32_((u32){pre}movemask_epi8({in0})) >> 1;'. \
               format(**fmtspec)
    elif typ in ['i32', 'u32', 'i64', 'u64']:
        code = '''return nsimd_popcnt32_((u32){pre}movemask{fsuf}(
                      {pre}castsi{nbits}{fsuf}({in0})));'''. \
                      format(fsuf='_ps' if typ in ['i32', 'u32'] else '_pd',
                             **fmtspec)
    else:
        code = 'return nsimd_popcnt32_((u32){pre}movemask{suf}({in0}));'. \
               format(**fmtspec)
    if simd_ext in sse:
        return code
    if simd_ext in avx:
        if typ in ['i32', 'u32', 'i64', 'u64', 'f32', 'f64']:
            return code
        else:
            if simd_ext == 'avx2':
                return code
            else:
                # AVX1 has no integer movemask: count each 128-bit half
                # with the SSE4.2 implementation.
                return \
                '''return nsimd_nbtrue_sse42_{typ}(
                            _mm256_castsi256_si128({in0})) +
                              nsimd_nbtrue_sse42_{typ}(
                                _mm256_extractf128_si256({in0}, 1));'''. \
                                format(**fmtspec)
    if simd_ext in avx512:
        return 'return nsimd_popcnt64_((u64){in0});'.format(**fmtspec)

# -----------------------------------------------------------------------------
# reverse

def reverse1(simd_ext, typ):
    '''C body for reverse: reverse the order of the lanes.'''
    # 8-bit int
    if typ in ['i8', 'u8']:
        if simd_ext == 'sse2':
            # No pshufb on SSE2: reverse 16-bit pairs with shuffles, then
            # swap the bytes inside each pair with shifts and an or.
            return '''{in0} = _mm_shufflehi_epi16({in0}, _MM_SHUFFLE(0,1,2,3));
                      {in0} = _mm_shufflelo_epi16({in0}, _MM_SHUFFLE(0,1,2,3));
                      {in0} = _mm_castpd_si128(_mm_shuffle_pd(
                                _mm_castsi128_pd({in0}), _mm_castsi128_pd(
                                  {in0}), 1));
                      nsimd_{simd_ext}_v{typ} r0 = _mm_srli_epi16({in0}, 8);
                      nsimd_{simd_ext}_v{typ} r1 = _mm_slli_epi16({in0}, 8);
                      return _mm_or_si128(r0, r1);'''.format(**fmtspec)
        elif simd_ext == 'sse42':
            return '''nsimd_{simd_ext}_v{typ} mask = _mm_set_epi8(
                        0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
                      return _mm_shuffle_epi8({in0}, mask);'''. \
                      format(**fmtspec)
        elif simd_ext == 'avx':
            return \
            '''nsimd_sse42_v{typ} r0 = _mm_shuffle_epi8(
                 _mm256_extractf128_si256({in0}, 0), _mm_set_epi8(
                   0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15));
               nsimd_sse42_v{typ} r1 = _mm_shuffle_epi8(
                 _mm256_extractf128_si256({in0}, 1), _mm_set_epi8(
                   0,  1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15));
               {in0} = _mm256_insertf128_si256({in0}, r0, 1);
               return _mm256_insertf128_si256({in0}, r1, 0);'''. \
               format(**fmtspec)
        elif simd_ext == 'avx2':
             return \
             '''{in0} = _mm256_shuffle_epi8({in0}, _mm256_set_epi8(
                   0,  1,  2,  3,  4,  5,  6,  7,
                   8,  9, 10, 11, 12, 13, 14, 15,
                  16, 17, 18, 19, 20, 21, 22, 23,
                  24, 25, 26, 27, 28, 29, 30, 31));
                return _mm256_permute2x128_si256({in0}, {in0}, 1);'''. 
\\\n                format(**fmtspec)\n        # AVX-512F and above.\n        else:\n             return \\\n             '''nsimd_avx2_v{typ} r0 = _mm512_extracti64x4_epi64({in0}, 0);\n                nsimd_avx2_v{typ} r1 = _mm512_extracti64x4_epi64({in0}, 1);\n                r0 = _mm256_shuffle_epi8(r0, _mm256_set_epi8(\n                     0,  1,  2,  3,  4,  5,  6,  7,\n                     8,  9, 10, 11, 12, 13, 14, 15,\n                    16, 17, 18, 19, 20, 21, 22, 23,\n                    24, 25, 26, 27, 28, 29, 30, 31));\n                r1 = _mm256_shuffle_epi8(r1, _mm256_set_epi8(\n                      0,  1,  2,  3,  4,  5,  6,  7,\n                      8,  9, 10, 11, 12, 13, 14, 15,\n                     16, 17, 18, 19, 20, 21, 22, 23,\n                     24, 25, 26, 27, 28, 29, 30, 31));\n                r0 = _mm256_permute2x128_si256(r0, r0, 1);\n                r1 = _mm256_permute2x128_si256(r1, r1, 1);\n                {in0} = _mm512_insertf64x4({in0}, r0, 1);\n                return _mm512_insertf64x4({in0}, r1, 0);'''.format(**fmtspec)\n    # 16-bit int\n    elif typ in ['i16', 'u16']:\n        if simd_ext == 'sse2':\n            return '''{in0} = _mm_shufflehi_epi16( {in0}, _MM_SHUFFLE(0,1,2,3) );\n                      {in0} = _mm_shufflelo_epi16( {in0}, _MM_SHUFFLE(0,1,2,3) );\n                      return _mm_castpd_si128(_mm_shuffle_pd(\n                               _mm_castsi128_pd({in0}),\n                               _mm_castsi128_pd({in0}), 1));'''. 
\\\n                               format(**fmtspec)\n        elif simd_ext == 'sse42':\n            return \\\n            '''return _mm_shuffle_epi8({in0}, _mm_set_epi8(\n                        1,  0,  3,  2,  5,  4,  7, 6,\n                        9,  8, 11, 10, 13, 12, 15, 14));'''.format(**fmtspec)\n        elif simd_ext == 'avx':\n            return \\\n            '''nsimd_sse42_v{typ} r0 = _mm_shuffle_epi8(\n                 _mm256_extractf128_si256({in0}, 0), _mm_set_epi8(\n                   1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14));\n               nsimd_sse42_v{typ} r1 = _mm_shuffle_epi8(\n                 _mm256_extractf128_si256({in0}, 1), _mm_set_epi8(\n                   1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14));\n               {in0} = _mm256_insertf128_si256( {in0}, r0, 1);\n               return _mm256_insertf128_si256({in0}, r1, 0);'''. \\\n               format(**fmtspec)\n        elif simd_ext == 'avx2':\n            return \\\n            '''{in0} = _mm256_shuffle_epi8({in0}, _mm256_set_epi8(\n                           1,  0,  3,  2,  5,  4,  7,  6,\n                           9,  8, 11, 10, 13, 12, 15, 14,\n                          17, 16, 19, 18, 21, 20, 23, 22,\n                          25, 24, 27, 26, 29, 28, 31, 30));\n               return _mm256_permute2x128_si256({in0}, {in0}, 1);'''. 
\\\n               format(**fmtspec)\n        # AVX-512F\n        elif simd_ext == 'avx512_knl':\n            return \\\n            '''{in0} = _mm512_permutexvar_epi32(_mm512_set_epi32(\n                 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15),\n                 {in0});\n               nsimd_{simd_ext}_v{typ} r0 = _mm512_srli_epi32({in0}, 16);\n               nsimd_{simd_ext}_v{typ} r1 = _mm512_slli_epi32({in0}, 16);\n               return _mm512_or_si512(r0, r1);'''.format(**fmtspec)\n        # AVX-512F+BW (Skylake) + WORKAROUND GCC<=8\n        else:\n            return \\\n            '''return _mm512_permutexvar_epi16(_mm512_set_epi32(\n                 (0<<16)  | 1,  (2<<16)  | 3,  (4<<16)  | 5,  (6<<16)  | 7,\n                 (8<<16)  | 9,  (10<<16) | 11, (12<<16) | 13, (14<<16) | 15,\n                 (16<<16) | 17, (18<<16) | 19, (20<<16) | 21, (22<<16) | 23,\n                 (24<<16) | 25, (26<<16) | 27, (28<<16) | 29, (30<<16) | 31),\n                 {in0} );'''.format(**fmtspec)\n    # 32-bit int\n    elif typ in ['i32', 'u32']:\n        if simd_ext in ['sse2', 'sse42']:\n            return 'return _mm_shuffle_epi32({in0}, _MM_SHUFFLE(0,1,2,3));'. \\\n                   format(**fmtspec)\n        elif simd_ext == 'avx':\n            return '''{in0} = _mm256_castps_si256(_mm256_shuffle_ps(\n                                _mm256_castsi256_ps({in0}),\n                                _mm256_castsi256_ps({in0}),\n                                _MM_SHUFFLE(0,1,2,3)));\n                      return _mm256_permute2f128_si256({in0}, {in0}, 1);'''. \\\n                      format(**fmtspec)\n        elif simd_ext == 'avx2':\n            return \\\n            '''{in0} = _mm256_shuffle_epi32({in0}, _MM_SHUFFLE(0,1,2,3));\n               return _mm256_permute2x128_si256({in0}, {in0}, 1);'''. 
\\\n               format(**fmtspec)\n        else:\n            return \\\n            '''return _mm512_permutexvar_epi32(_mm512_set_epi32(\n                 0, 1,  2,  3,  4,  5,  6,  7,\n                 8, 9, 10, 11, 12, 13, 14, 15), {in0});'''. \\\n                 format(**fmtspec)\n    elif typ in ['i64', 'u64']:\n        if simd_ext in ['sse2', 'sse42']:\n            return '''return _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(\n                               {in0}), _mm_castsi128_pd({in0}), 1));'''. \\\n                               format(**fmtspec)\n        elif simd_ext == 'avx':\n            return '''{in0} = _mm256_castpd_si256(\n                                  _mm256_shuffle_pd(\n                                     _mm256_castsi256_pd({in0}),\n                                     _mm256_castsi256_pd({in0}),\n                                     (1<<2) | 1\n                                  )\n                              );\n                       return _mm256_permute2f128_si256({in0}, {in0}, 1);'''. \\\n                       format(**fmtspec)\n        elif simd_ext == 'avx2':\n           return '''return _mm256_permute4x64_epi64({in0},\n                              _MM_SHUFFLE(0, 1, 2, 3));'''.format(**fmtspec)\n        else:\n           return '''return _mm512_permutexvar_epi64(_mm512_set_epi64(\n                              0, 1, 2, 3, 4, 5, 6, 7), {in0});'''. 
\\\n                              format(**fmtspec)\n    # 16-bit float\n    elif typ == 'f16':\n        return '''nsimd_{simd_ext}_vf16 ret;\n                  ret.v0 = nsimd_reverse_{simd_ext}_f32({in0}.v0);\n                  ret.v1 = nsimd_reverse_{simd_ext}_f32({in0}.v1);\n                  return ret;'''.format(**fmtspec)\n    # 32-bit float\n    elif typ == 'f32':\n        if simd_ext in ['sse2', 'sse42']:\n            return '''return _mm_shuffle_ps({in0}, {in0},\n                               _MM_SHUFFLE(0, 1, 2, 3));'''.format(**fmtspec)\n        elif simd_ext in ['avx', 'avx2']:\n            return '''{in0} = _mm256_shuffle_ps({in0}, {in0},\n                                _MM_SHUFFLE(0, 1, 2, 3));\n                      return _mm256_permute2f128_ps({in0}, {in0}, 1);'''. \\\n                      format(**fmtspec)\n        else:\n            return \\\n            '''return _mm512_permutexvar_ps(_mm512_set_epi32(\n                        0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15),\n                        {in0} );'''.format(**fmtspec)\n    # 64-bit float\n    else:\n        if simd_ext in ['sse2', 'sse42']:\n            return 'return _mm_shuffle_pd({in0}, {in0}, 1);'.format(**fmtspec)\n        elif simd_ext == 'avx':\n            return '''{in0} = _mm256_shuffle_pd({in0}, {in0}, (1<<2) | 1);\n                      return _mm256_permute2f128_pd({in0}, {in0}, 1);'''. \\\n                      format(**fmtspec)\n        elif simd_ext == 'avx2':\n            return '''return _mm256_permute4x64_pd({in0},\n                               _MM_SHUFFLE(0, 1, 2, 3));'''.format(**fmtspec)\n        else:\n            return '''return _mm512_permutexvar_pd(_mm512_set_epi64(\n                               0, 1, 2, 3, 4, 5, 6, 7), {in0});'''. 
\\\n                               format(**fmtspec)\n\n# -----------------------------------------------------------------------------\n# addv\n\ndef addv(simd_ext, typ):\n    if simd_ext in sse:\n        if typ == 'f64':\n            return \\\n            '''return _mm_cvtsd_f64(_mm_add_pd({in0},\n                                    _mm_shuffle_pd({in0}, {in0}, 0x01)));'''. \\\n                                    format(**fmtspec)\n        elif typ == 'f32':\n            return \\\n            '''nsimd_{simd_ext}_vf32 tmp = _mm_add_ps({in0}, _mm_shuffle_ps(\n                                             {in0}, {in0}, 0xb1));\n               return _mm_cvtss_f32(_mm_add_ps(tmp, _mm_shuffle_ps(\n                        tmp, tmp, 0x4e)));''' .format(**fmtspec)\n        elif typ == 'f16':\n            return \\\n            '''nsimd_{simd_ext}_vf32 tmp0 = _mm_add_ps({in0}.v0,\n                 _mm_shuffle_ps({in0}.v0, {in0}.v0, 0xb1));\n               nsimd_{simd_ext}_vf32 tmp1 = _mm_add_ps({in0}.v1,\n                 _mm_shuffle_ps({in0}.v1, {in0}.v1, 0xb1));\n               return nsimd_f32_to_f16(_mm_cvtss_f32(_mm_add_ps(\n                 tmp0, _mm_shuffle_ps(tmp0, tmp0, 0x4e))) +\n                   _mm_cvtss_f32(_mm_add_ps(tmp1, _mm_shuffle_ps(\n                     tmp1, tmp1, 0x4e))));''' .format(**fmtspec)\n    elif simd_ext in avx:\n        if typ == 'f64':\n            return \\\n            '''__m128d tmp = _mm_add_pd(_mm256_extractf128_pd({in0}, 1),\n                                        _mm256_extractf128_pd({in0}, 0));\n               return _mm_cvtsd_f64(_mm_add_pd(tmp, _mm_shuffle_pd(\n                        tmp, tmp, 0x01)));''' .format(**fmtspec)\n        elif typ == 'f32':\n            return \\\n            '''__m128 tmp0 = _mm_add_ps(_mm256_extractf128_ps({in0}, 1),\n                                        _mm256_extractf128_ps({in0}, 0));\n               __m128 tmp1 = _mm_add_ps(tmp0, _mm_shuffle_ps(tmp0, tmp0, 0xb1));\n               return 
_mm_cvtss_f32(_mm_add_ps(tmp1, _mm_shuffle_ps(\n                        tmp1, tmp1, 0x4e)));''' .format(**fmtspec)\n        elif typ == 'f16':\n            return \\\n            '''__m128 tmp00 = _mm_add_ps(_mm256_extractf128_ps({in0}.v0, 1),\n                                         _mm256_extractf128_ps({in0}.v0, 0));\n               __m128 tmp01 = _mm_add_ps(tmp00, _mm_shuffle_ps(\n                                tmp00, tmp00, 0xb1));\n               __m128 tmp10 = _mm_add_ps(_mm256_extractf128_ps({in0}.v1, 1),\n                                         _mm256_extractf128_ps({in0}.v1, 0));\n               __m128 tmp11 = _mm_add_ps(tmp10, _mm_shuffle_ps(\n                                tmp10, tmp10, 0xb1));\n               return nsimd_f32_to_f16(_mm_cvtss_f32(_mm_add_ps(\n                        tmp01, _mm_shuffle_ps(tmp01, tmp01, 0x4e))) +\n                          _mm_cvtss_f32(_mm_add_ps(tmp11, _mm_shuffle_ps(\n                            tmp11, tmp11, 0x4e))));\n                    ''' .format(**fmtspec)\n    elif simd_ext in avx512:\n        if typ == 'f64':\n            return \\\n            '''__m256d tmp0 = _mm256_add_pd(_mm512_extractf64x4_pd({in0}, 0),\n                                            _mm512_extractf64x4_pd({in0}, 1));\n               __m128d tmp1 = _mm_add_pd(_mm256_extractf128_pd(tmp0, 1),\n                                         _mm256_extractf128_pd(tmp0, 0));\n               return _mm_cvtsd_f64(_mm_add_pd(tmp1, _mm_shuffle_pd(\n                        tmp1, tmp1, 0x01)));''' .format(**fmtspec)\n        elif typ == 'f32':\n            return \\\n            '''__m128 tmp0 = _mm_add_ps(_mm_add_ps(_mm512_extractf32x4_ps(\n                               {in0}, 0), _mm512_extractf32x4_ps({in0}, 1)),\n                               _mm_add_ps(_mm512_extractf32x4_ps({in0}, 2),\n                               _mm512_extractf32x4_ps({in0}, 3)));\n               __m128 tmp1 = _mm_add_ps(tmp0, _mm_shuffle_ps(\n                               
tmp0, tmp0, 0xb1));\n               return _mm_cvtss_f32(_mm_add_ps(tmp1, _mm_shuffle_ps(\n                        tmp1, tmp1, 0x4e)));''' .format(**fmtspec)\n        elif typ == 'f16':\n            return \\\n            '''f32 res;\n               __m128 tmp0 = _mm_add_ps(\n                   _mm_add_ps(_mm512_extractf32x4_ps({in0}.v0, 0),\n                               _mm512_extractf32x4_ps({in0}.v0, 1)),\n                   _mm_add_ps(_mm512_extractf32x4_ps({in0}.v0, 2),\n                               _mm512_extractf32x4_ps({in0}.v0, 3)));\n               __m128 tmp1 = _mm_add_ps(tmp0, _mm_shuffle_ps(\n                               tmp0, tmp0, 0xb1));\n               res = _mm_cvtss_f32(_mm_add_ps(tmp1, _mm_shuffle_ps(\n                       tmp1, tmp1, 0x4e)));\n               tmp0 = _mm_add_ps(\n                   _mm_add_ps(_mm512_extractf32x4_ps({in0}.v1, 0),\n                               _mm512_extractf32x4_ps({in0}.v1, 1)),\n                   _mm_add_ps(_mm512_extractf32x4_ps({in0}.v1, 2),\n                               _mm512_extractf32x4_ps({in0}.v1, 3)));\n               tmp1 = _mm_add_ps(tmp0, _mm_shuffle_ps(tmp0, tmp0, 0xb1));\n               return nsimd_f32_to_f16(res + _mm_cvtss_f32(_mm_add_ps(\n                        tmp1, _mm_shuffle_ps(tmp1, tmp1, 0x4e))));''' . 
\\\n                        format(**fmtspec)\n\n# -----------------------------------------------------------------------------\n# upconvert\n\ndef upcvt1(simd_ext, from_typ, to_typ):\n    # From f16 is easy\n    if from_typ == 'f16':\n        if to_typ == 'f32':\n            return \\\n            '''nsimd_{simd_ext}_vf32x2 ret;\n               ret.v0 = {in0}.v0;\n               ret.v1 = {in0}.v1;\n               return ret;'''.format(**fmtspec)\n        else:\n            return \\\n            '''nsimd_{simd_ext}_v{to_typ}x2 ret;\n               ret.v0 = nsimd_cvt_{simd_ext}_{to_typ}_f32({in0}.v0);\n               ret.v1 = nsimd_cvt_{simd_ext}_{to_typ}_f32({in0}.v1);\n               return ret;'''.format(**fmtspec)\n\n    # To f16 is easy\n    if to_typ == 'f16':\n        return \\\n        '''nsimd_{simd_ext}_vf16x2 ret;\n           nsimd_{simd_ext}_v{iu}16x2 buf;\n           buf = nsimd_upcvt_{simd_ext}_{iu}16_{iu}8({in0});\n           ret.v0 = nsimd_cvt_{simd_ext}_f16_{iu}16(buf.v0);\n           ret.v1 = nsimd_cvt_{simd_ext}_f16_{iu}16(buf.v1);\n           return ret;'''.format(iu=from_typ[0], **fmtspec)\n\n    # For integer upcast, due to 2's complement representation\n    # epi_epi : signed   -> bigger signed\n    # epi_epi : signed   -> bigger unsigned\n    # epu_epi : unsigned -> bigger signed\n    # epu_epi : unsigned -> bigger unsigned\n    if from_typ in common.iutypes:\n        suf_epep = 'ep{ui}{typnbits}_epi{typnbits2}'. 
\\\n                   format(ui='u' if from_typ in common.utypes else 'i',\n                          typnbits2=str(int(fmtspec['typnbits']) * 2),\n                          **fmtspec)\n    else:\n        suf_epep = 'ps_pd'\n\n    # compute lower half\n    if simd_ext in sse:\n        lower_half = '{in0}'.format(**fmtspec)\n    else:\n        lower_half = extract(simd_ext, from_typ, LO, fmtspec['in0'])\n\n    # compute upper half\n    if simd_ext in sse:\n        if from_typ in common.iutypes:\n            upper_half = '_mm_shuffle_epi32({in0}, 14 /* 2 | 3 */)'. \\\n                         format(**fmtspec)\n        else:\n            upper_half = '''{pre}castpd_ps({pre}shuffle_pd(\n                                {pre}castps_pd({in0}),\n                                {pre}castps_pd({in0}), 1))'''.format(**fmtspec)\n    else:\n        upper_half = extract(simd_ext, from_typ, HI, fmtspec['in0'])\n\n    # When intrinsics are provided\n    # for conversions integers <-> floating point, there is no intrinsics, so\n    # we use cvt's\n    if from_typ == 'i32' and to_typ == 'f64':\n        with_intrinsic = \\\n        '''nsimd_{simd_ext}_vf64x2 ret;\n           ret.v0 = {pre}cvtepi32_pd({lower_half});\n           ret.v1 = {pre}cvtepi32_pd({upper_half});\n           return ret;'''.format(upper_half=upper_half,\n                                 lower_half=lower_half, **fmtspec)\n    elif (from_typ in common.iutypes and to_typ in common.iutypes) or \\\n         (from_typ == 'f32' and to_typ == 'f64'):\n        with_intrinsic = \\\n        '''nsimd_{simd_ext}_v{to_typ}x2 ret;\n           ret.v0 = {pre}cvt{suf_epep}({lower_half});\n           ret.v1 = {pre}cvt{suf_epep}({upper_half});\n           return ret;'''.format(upper_half=upper_half, lower_half=lower_half,\n                                 suf_epep=suf_epep, **fmtspec)\n    else:\n        from_typ2 = from_typ[0] + str(int(fmtspec['typnbits']) * 2)\n        if from_typ not in common.iutypes:\n            # getting 
here means that from_typ=f32 and to_typ=f64\n            with_intrinsic = \\\n            '''nsimd_{simd_ext}_vf64x2 ret;\n               ret.v0 = nsimd_cvt_{simd_ext}_{to_typ}_f64({pre}cvtps_pd(\n                            {lower_half}));\n               ret.v1 = nsimd_cvt_{simd_ext}_{to_typ}_f64({pre}cvtps_pd(\n                            {upper_half}));\n               return ret;'''. \\\n               format(upper_half=upper_half, lower_half=lower_half,\n                      from_typ2=from_typ2, suf_epep=suf_epep, **fmtspec)\n\n    # When no intrinsic is given for going from integers to floating or\n    # from floating to integer we can go through a cvt\n    if to_typ in common.ftypes:\n        int_float = \\\n        '''nsimd_{simd_ext}_v{to_typ}x2 ret;\n           nsimd_{simd_ext}_v{int_typ}x2 tmp;\n           tmp = nsimd_upcvt_{simd_ext}_{int_typ}_{from_typ}({in0});\n           ret.v0 = nsimd_cvt_{simd_ext}_{to_typ}_{int_typ}(tmp.v0);\n           ret.v1 = nsimd_cvt_{simd_ext}_{to_typ}_{int_typ}(tmp.v1);\n           return ret;'''. \\\n           format(int_typ=from_typ[0] + to_typ[1:], lower_half=lower_half,\n                  upper_half=upper_half, **fmtspec)\n    else:\n        int_float = \\\n        '''return nsimd_upcvt_{simd_ext}_{to_typ}_{int_typ}(\n                      nsimd_cvt_{simd_ext}_{int_typ}_{from_typ}({in0}));'''. \\\n                      format(int_typ=to_typ[0] + from_typ[1:],\n                             lower_half=lower_half, upper_half=upper_half,\n                             **fmtspec)\n\n    # When no intrinsic is given we can use the trick of falling back to\n    # the lower SIMD extension\n    split_trick = \\\n    '''nsimd_{simd_ext}_v{to_typ}x2 ret;\n       nsimd_{simd_ext2}_v{to_typ}x2 ret2;\n       ret2 = nsimd_upcvt_{simd_ext2}_{to_typ}_{from_typ}({lo});\n       ret.v0 = {merge};\n       ret2 = nsimd_upcvt_{simd_ext2}_{to_typ}_{from_typ}({hi});\n       ret.v1 = {merge};\n       return ret;'''. 
\\\n       format(simd_ext2='sse42' if simd_ext == 'avx' else 'avx2',\n              lo=extract(simd_ext, from_typ, LO, common.in0),\n              hi=extract(simd_ext, from_typ, HI, common.in0),\n              merge=setr(simd_ext, to_typ, 'ret2.v0', 'ret2.v1'), **fmtspec)\n\n    # return C code\n    if from_typ == 'i32' and to_typ == 'f64':\n        return with_intrinsic\n    if (from_typ in common.ftypes and to_typ in common.iutypes) or \\\n       (from_typ in common.iutypes and to_typ in common.ftypes):\n        return int_float\n    # if simd_ext == 'sse2':\n    if simd_ext in sse:\n        if from_typ in common.itypes and to_typ in common.iutypes:\n            return \\\n            '''nsimd_{simd_ext}_v{to_typ}x2 ret;\n               __m128i mask = _mm_cmpgt{suf}(_mm_setzero_si128(), {in0});\n               ret.v0 = _mm_unpacklo{suf}({in0}, mask);\n               ret.v1 = _mm_unpackhi{suf}({in0}, mask);\n               return ret;'''.format(**fmtspec)\n        elif from_typ in common.utypes and to_typ in common.iutypes:\n            return \\\n            '''nsimd_{simd_ext}_v{to_typ}x2 ret;\n               ret.v0 = _mm_unpacklo{suf}({in0}, _mm_setzero_si128());\n               ret.v1 = _mm_unpackhi{suf}({in0}, _mm_setzero_si128());\n               return ret;'''.format(**fmtspec)\n        else:\n            return with_intrinsic\n    # elif simd_ext == 'sse42':\n    #    return with_intrinsic\n    elif simd_ext == 'avx':\n        if from_typ == 'i32' and to_typ == 'f64':\n            return with_intrinsic\n        else:\n            return split_trick\n    elif simd_ext == 'avx2':\n        return with_intrinsic\n    elif simd_ext == 'avx512_knl':\n        if from_typ in ['i16', 'u16', 'i32', 'u32', 'f32']:\n            return with_intrinsic\n        else:\n            return split_trick\n    else:\n        return with_intrinsic\n\n# -----------------------------------------------------------------------------\n# downconvert\n\ndef downcvt1(opts, simd_ext, 
from_typ, to_typ):\n    # From f16 is easy\n    if from_typ == 'f16':\n        le_to_typ = int(fmtspec['le']) * 2\n        le_1f32 = le_to_typ // 4\n        le_2f32 = 2 * le_to_typ // 4\n        le_3f32 = 3 * le_to_typ // 4\n        cast = castsi(simd_ext, to_typ)\n        return \\\n        '''{to_typ} dst[{le_to_typ}];\n           f32 src[{le_to_typ}];\n           int i;\n           {pre}storeu_ps(src, {in0}.v0);\n           {pre}storeu_ps(src + {le_1f32}, {in0}.v1);\n           {pre}storeu_ps(src + {le_2f32}, {in1}.v0);\n           {pre}storeu_ps(src + {le_3f32}, {in1}.v1);\n           for (i = 0; i < {le_to_typ}; i++) {{\n             dst[i] = ({to_typ})src[i];\n           }}\n           return {pre}loadu_si{nbits}({cast}dst);'''. \\\n           format(le_to_typ=le_to_typ, le_1f32=le_1f32, le_2f32=le_2f32,\n                  le_3f32=le_3f32, cast=cast, **fmtspec)\n\n    # To f16 is easy\n    if to_typ == 'f16':\n        if from_typ == 'f32':\n            return \\\n            '''nsimd_{simd_ext}_vf16 ret;\n               ret.v0 = {in0};\n               ret.v1 = {in1};\n               return ret;'''.format(**fmtspec)\n        else:\n            return \\\n            '''nsimd_{simd_ext}_vf16 ret;\n               ret.v0 = nsimd_cvt_{simd_ext}_f32_{from_typ}({in0});\n               ret.v1 = nsimd_cvt_{simd_ext}_f32_{from_typ}({in1});\n               return ret;'''.format(**fmtspec)\n\n    # f64 --> f32 have intrinsics\n    if from_typ == 'f64' and to_typ == 'f32':\n        if simd_ext in sse:\n            return '''return _mm_movelh_ps(_mm_cvtpd_ps({in0}),\n                                           _mm_cvtpd_ps({in1}));'''. 
\\\n                                           format(**fmtspec)\n        else:\n            return 'return {};'.format(setr(simd_ext, 'f32',\n                                '{pre}cvtpd_ps({in0})'.format(**fmtspec),\n                                '{pre}cvtpd_ps({in1})'.format(**fmtspec)))\n\n    # integer conversions intrinsics are only available with AVX-512\n    if simd_ext in avx512:\n        if (from_typ in ['i32', 'i64'] and to_typ in common.itypes) or \\\n           (simd_ext == 'avx512_skylake' and from_typ == 'i16' and \\\n            to_typ == 'i8'):\n            return 'return {};'.format(setr(simd_ext, to_typ,\n                   '{pre}cvtep{from_typ}_ep{to_typ}({in0})'.format(**fmtspec),\n                   '{pre}cvtep{from_typ}_ep{to_typ}({in1})'.format(**fmtspec)))\n        elif from_typ == 'i64' and to_typ == 'f32':\n            return 'return nsimd_cvt_{simd_ext}_f32_i32({});'. \\\n                   format(setr(simd_ext, from_typ,\n                          '{pre}cvtepi64_epi32({in0})'.format(**fmtspec),\n                          '{pre}cvtepi64_epi32({in1})'.format(**fmtspec)),\n                          **fmtspec)\n\n    # and then emulation\n    le_to_typ = 2 * int(fmtspec['le'])\n    cast_src = '(__m{nbits}i *)'.format(**fmtspec) \\\n               if from_typ in common.iutypes else ''\n    cast_dst = '(__m{nbits}i *)'.format(**fmtspec) \\\n               if to_typ in common.iutypes else ''\n    return \\\n    '''{to_typ} dst[{le_to_typ}];\n       {from_typ} src[{le_to_typ}];\n       int i;\n       {pre}storeu{sufsi}({cast_src}src, {in0});\n       {pre}storeu{sufsi}({cast_src}(src + {le}), {in1});\n       for (i = 0; i < {le_to_typ}; i++) {{\n         dst[i] = ({to_typ})src[i];\n       }}\n       return {pre}loadu{sufsi_to_typ}({cast_dst}dst);'''. 
\\\n       format(cast_src=cast_src, cast_dst=cast_dst, le_to_typ=le_to_typ,\n              sufsi_to_typ=suf_si(simd_ext, to_typ), **fmtspec)\n\n# -----------------------------------------------------------------------------\n# adds / subs helper\n\ndef adds_subs_intrinsic_instructions_i8_i16_u8_u16(which_op, simd_ext, typ):\n\n    valid_types = ('i8', 'i16', 'u8', 'u16')\n    if typ not in valid_types:\n        raise TypeError(\n    '''def adds_subs_intrinsic_instructions_i8_i16_u8_u16(...):\n     {typ} must belong to the following types set: {valid_types}'''.\\\n        format(typ=typ, valid_types=valid_types)\n    )\n    if 'sse2' in simd_ext or 'sse42' in simd_ext:\n        return'''\n        return _mm_{which_op}_ep{typ}({in0}, {in1});\n        '''.format(which_op=which_op, **fmtspec)\n    if 'avx' == simd_ext:\n        return split_opn(which_op, simd_ext, typ, 2)\n    if simd_ext in ('avx2', 'avx512_skylake'):\n        return 'return {pre}{which_op}_ep{typ}({in0}, {in1});'. \\\n            format(which_op=which_op, **fmtspec)\n    if 'avx512_knl' == simd_ext:\n        return split_opn(which_op, simd_ext, typ, 2)\n\ndef get_avx512_sse2_i32_i64_dependent_code(simd_ext, typ):\n    if 'avx512' in simd_ext or 'sse2' in simd_ext:\n        mask_processing = \\\n        '''/* For avx512/sse2 */\n           const nsimd_{simd_ext}_vu{typnbits} mask_strong_bit =\n               nsimd_shr_{simd_ext}_u{typnbits}(\n                   mask, sizeof(u{typnbits}) * CHAR_BIT - 1);\n           const nsimd_{simd_ext}_vi{typnbits} imask_strong_bit =\n               nsimd_reinterpret_{simd_ext}_i{typnbits}_u{typnbits}(\n                   mask_strong_bit);\n           const nsimd_{simd_ext}_vli{typnbits} limask_strong_bit =\n               nsimd_to_logical_{simd_ext}_i{typnbits}(imask_strong_bit);'''. 
\\\n               format(**fmtspec)\n        if_else = \\\n        '''/* For avx512/sse2 */\n           return nsimd_if_else1_{simd_ext}_i{typnbits}(\n                      limask_strong_bit, ires, i_max_min);'''. \\\n                      format(**fmtspec)\n    else:\n        mask_processing = '/* Before avx512: is_same(__m128i, ' \\\n                          'vector<signed>, vector<unsigned>, ' \\\n                          'vector<logical>) */'\n        suf2 = 'ps' if typ in ['i32', 'u32'] else 'pd'\n        if_else = '''return {pre}cast{suf2}_si{nbits}({pre}blendv_{suf2}(\n                                {pre}castsi{nbits}_{suf2}(i_max_min),\n                                {pre}castsi{nbits}_{suf2}(ires),\n                                {pre}castsi{nbits}_{suf2}(mask)));\n                                '''.format(suf2=suf2, **fmtspec)\n\n    return { 'mask_processing': mask_processing, 'if_else': if_else }\n\n# -----------------------------------------------------------------------------\n# adds\n\ndef adds(simd_ext, typ):\n\n    if typ in common.ftypes:\n        return 'return nsimd_add_{simd_ext}_{typ}({in0}, {in1});'. \\\n               format(**fmtspec)\n\n    if typ in ('i8', 'i16', 'u8', 'u16'):\n        return adds_subs_intrinsic_instructions_i8_i16_u8_u16(\n                   'adds', simd_ext, typ)\n\n    if typ in common.utypes:\n        return \\\n        '''/* Algo pseudo code: */\n           /* ures = a + b */\n           /* if overflow then ures < a && ures < b */\n           /* --> test against a single value: if(ures < a){{ overflow ; }} */\n           /* return ures < a ? 
{type_max} : ures */\n\n           const nsimd_{simd_ext}_v{typ} ures =\n               nsimd_add_{simd_ext}_{typ}({in0}, {in1});\n           const nsimd_{simd_ext}_v{typ} type_max =\n               nsimd_set1_{simd_ext}_{typ}(({typ}){type_max});\n           return nsimd_if_else1_{simd_ext}_{typ}(\n                    nsimd_lt_{simd_ext}_{typ}(ures, {in0}),\n                    type_max, ures);'''. \\\n                    format(type_max=common.limits[typ]['max'], **fmtspec)\n\n    avx512_sse2_i32_i64_dependent_code = \\\n        get_avx512_sse2_i32_i64_dependent_code(simd_ext, typ)\n\n    return \\\n    '''/* Algo pseudo code: */\n\n       /* if ( ( same_sign(ux, uy) && same_sign(uy, res) ) || */\n       /*      ! same_sign(ux, uy) ): */\n       /*     neither overflow nor underflow happened */\n       /* else: */\n       /*     if(ux > 0 && uy > 0): res = MAX // overflow */\n       /*     else: res = MIN // underflow */\n\n       /* Step 1: reinterpret to unsigned to work with the bits */\n\n       nsimd_{simd_ext}_vu{typnbits} ux =\n           nsimd_reinterpret_{simd_ext}_u{typnbits}_i{typnbits}({in0});\n       const nsimd_{simd_ext}_vu{typnbits} uy =\n           nsimd_reinterpret_{simd_ext}_u{typnbits}_i{typnbits}({in1});\n       const nsimd_{simd_ext}_vu{typnbits} ures =\n           nsimd_add_{simd_ext}_u{typnbits}(ux, uy);\n\n       /* Step 2: check signs different: ux, uy, res */\n\n       /* xor_ux_uy's most significant bit will be zero if both ux and */\n       /* uy have same sign */\n\n       const nsimd_{simd_ext}_vu{typnbits} xor_ux_uy =\n           nsimd_xorb_{simd_ext}_u{typnbits}(ux, uy);\n\n       /* xor_uy_res's most significant bit will be zero if both uy and */\n       /* ures have same sign */\n\n       const nsimd_{simd_ext}_vu{typnbits} xor_uy_res =\n           nsimd_xorb_{simd_ext}_u{typnbits}(uy, ures);\n\n       /* Step 3: Construct the MIN/MAX vector */\n\n       /* Pseudo code: */\n\n       /* Both positive --> overflow possible */\n     
  /* --> get the MAX: */\n\n       /* (signed)ux >= 0 && (signed)uy >= 0 */\n       /* <=> ((unsigned)ux | (unsigned)uy) >> 31 == 0 */\n       /* --> MAX + ( (ux | uy) >> 31 ) == MAX + 0 == MAX */\n\n       /* At least one negative */\n       /* --> overflow not possible / underflow possible if both negative */\n       /* --> get the MIN: */\n\n       /* unsigned tmp = (unsigned)MAX + */\n       /*                ( ( (ux | uy) >> 31 ) == (unsigned)MAX + 1 ) */\n       /* --> MIN = (reinterpret signed)tmp */\n\n       /* ux | uy */\n       const nsimd_{simd_ext}_vu{typnbits} ux_uy_orb =\n           nsimd_orb_{simd_ext}_u{typnbits}(ux, uy);\n\n       /* (ux | uy) >> 31 --> Vector of 0's and 1's */\n       const nsimd_{simd_ext}_vu{typnbits} u_zeros_ones =\n           nsimd_shr_{simd_ext}_u{typnbits}(\n               ux_uy_orb, sizeof(u{typnbits}) * CHAR_BIT - 1);\n\n       /* MIN/MAX vector */\n\n       /* i{typnbits} tmp = sMAX + 1 --> undefined behavior */\n       /* u{typnbits} tmp = (u{typnbits})sMAX + 1 */\n       /* i{typnbits} sMIN = *(i{typnbits}*)(&tmp) */\n\n       const nsimd_{simd_ext}_vu{typnbits} u_max =\n           nsimd_set1_{simd_ext}_u{typnbits}((u{typnbits}){type_max});\n       const nsimd_{simd_ext}_vu{typnbits} u_max_min =\n           nsimd_add_{simd_ext}_u{typnbits}(u_max, u_zeros_ones);\n       const nsimd_{simd_ext}_vi{typnbits} i_max_min =\n           nsimd_reinterpret_{simd_ext}_i{typnbits}_u{typnbits}(u_max_min);\n\n       /* Step 4: Construct the mask vector */\n\n       /* mask == ( not_same_sign(ux, uy) || same_sign(uy, res) ) */\n       /* mask: True (no underflow/overflow) / False (underflow/overflow) */\n       /* mask = xor_ux_uy | ~ xor_uy_res */\n\n       const nsimd_{simd_ext}_vu{typnbits} not_xor_uy_res =\n           nsimd_notb_{simd_ext}_u{typnbits}(xor_uy_res);\n       const nsimd_{simd_ext}_vu{typnbits} mask =\n           nsimd_orb_{simd_ext}_u{typnbits}(xor_ux_uy, not_xor_uy_res);\n\n       
{avx512_sse2_dependent_mask_processing}\n\n       /* Step 5: Apply the Mask */\n\n       const nsimd_{simd_ext}_vi{typnbits} ires =\n           nsimd_reinterpret_{simd_ext}_i{typnbits}_u{typnbits}(ures);\n\n       {avx512_sse2_dependent_if_else}'''. \\\n       format(type_max = common.limits[typ]['max'],\n              avx512_sse2_dependent_mask_processing = \\\n                  avx512_sse2_i32_i64_dependent_code['mask_processing'],\n              avx512_sse2_dependent_if_else = \\\n                  avx512_sse2_i32_i64_dependent_code['if_else'], **fmtspec)\n\n# -----------------------------------------------------------------------------\n# subs\n\ndef subs(simd_ext, typ):\n\n    if typ in common.ftypes:\n        return 'return nsimd_sub_{simd_ext}_{typ}({in0}, {in1});'. \\\n               format(**fmtspec)\n\n    if typ in ('i8', 'i16', 'u8', 'u16'):\n        return adds_subs_intrinsic_instructions_i8_i16_u8_u16(\n                   'subs', simd_ext, typ)\n\n    if typ in common.itypes:\n        return 'return nsimd_adds_{simd_ext}_{typ}({in0}, ' \\\n               'nsimd_neg_{simd_ext}_{typ}({in1}));'.format(**fmtspec)\n\n    min_ = common.limits[typ]['min']\n\n    return \\\n    '''/* Algo pseudo code: */\n\n       /* unsigned only */\n       /* a > 0; b > 0 ==> a - b --> possibility for underflow only */\n       /* if b > a --> underflow */\n\n       const nsimd_{simd_ext}_v{typ} ures =\n           nsimd_sub_{simd_ext}_{typ}({in0}, {in1});\n       const nsimd_{simd_ext}_vl{typ} is_underflow =\n           nsimd_gt_{simd_ext}_{typ}({in1}, {in0});\n       const nsimd_{simd_ext}_v{typ} umin =\n           nsimd_set1_{simd_ext}_{typ}(({typ}){min_});\n       return nsimd_if_else1_{simd_ext}_{typ}(is_underflow, umin, ures);'''. 
\\\n       format(min_=min_, **fmtspec)\n\n# -----------------------------------------------------------------------------\n# to_mask\n\ndef to_mask1(simd_ext, typ):\n    if typ == 'f16':\n        return '''nsimd_{simd_ext}_vf16 ret;\n                  ret.v0 = nsimd_to_mask_{simd_ext}_f32({in0}.v0);\n                  ret.v1 = nsimd_to_mask_{simd_ext}_f32({in0}.v1);\n                  return ret;'''.format(**fmtspec)\n    if simd_ext in sse + avx:\n        return 'return {in0};'.format(**fmtspec)\n    elif simd_ext == 'avx512_skylake':\n        if typ in common.iutypes:\n            return 'return _mm512_movm_epi{typnbits}({in0});'. \\\n                   format(**fmtspec)\n        elif typ in ['f32', 'f64']:\n            return '''return _mm512_castsi512{suf}(\n                               _mm512_movm_epi{typnbits}({in0}));'''. \\\n                               format(**fmtspec)\n    else:\n        if typ in ['i32', 'u32', 'i64', 'u64']:\n            return '''return _mm512_mask_mov{suf}(_mm512_setzero_si512(),\n                                 {in0}, _mm512_set1_epi32(-1));'''. \\\n                                 format(**fmtspec)\n        elif typ in ['f32', 'f64']:\n            return '''return _mm512_mask_mov{suf}(_mm512_castsi512{suf}(\n                               _mm512_setzero_si512()), {in0},\n                                 _mm512_castsi512{suf}(\n                                   _mm512_set1_epi32(-1)));'''. 
\\\n                                   format(**fmtspec)\n        else:\n            return '''{typ} buf[{le}];\n                      int i;\n                      for (i = 0; i < {le}; i++) {{\n                        if (({in0} >> i) & 1) {{\n                          buf[i] = ({typ})-1;\n                        }} else {{\n                          buf[i] = ({typ})0;\n                        }}\n                      }}\n                      return _mm512_loadu_si512(buf);'''.format(**fmtspec)\n\n# -----------------------------------------------------------------------------\n# to_logical\n\ndef to_logical1(simd_ext, typ):\n    if typ in common.iutypes:\n        return '''return nsimd_ne_{simd_ext}_{typ}(\n                           {in0}, {pre}setzero{sufsi}());'''.format(**fmtspec)\n    elif typ in ['f32', 'f64']:\n        return '''return nsimd_reinterpretl_{simd_ext}_{typ}_{utyp}(\n                           nsimd_ne_{simd_ext}_{utyp}(\n                             {pre}cast{suf2}_si{nbits}({in0}),\n                               {pre}setzero_si{nbits}()));'''. 
\\\n                               format(suf2=suf_si(simd_ext, typ)[1:],\n                                      utyp='u{}'.format(fmtspec['typnbits']),\n                                      **fmtspec)\n    else:\n        return '''nsimd_{simd_ext}_vlf16 ret;\n                  ret.v0 = nsimd_to_logical_{simd_ext}_f32({in0}.v0);\n                  ret.v1 = nsimd_to_logical_{simd_ext}_f32({in0}.v1);\n                  return ret;'''.format(**fmtspec)\n\n# -----------------------------------------------------------------------------\n# zip functions\n\ndef zip_half(func, simd_ext, typ):\n    simd_ext2 = 'sse42' if simd_ext in avx else 'avx2'\n    if simd_ext in sse:\n        if typ == 'f16':\n            return '''nsimd_{simd_ext}_v{typ} ret;\n                      ret.v0 = _mm_unpacklo_ps({in0}.v{k}, {in1}.v{k});\n                      ret.v1 = _mm_unpackhi_ps({in0}.v{k}, {in1}.v{k});\n                      return ret;'''. \\\n                      format(k='0' if func == 'ziplo' else '1', **fmtspec)\n        else:\n            return 'return {pre}unpack{lo}{suf}({in0}, {in1});'. 
\\\n                   format(lo='lo' if func == 'ziplo' else 'hi', **fmtspec)\n    elif simd_ext in avx:\n        # Currently, 256 and 512 bits vectors are split into 128 bits\n        # vectors in order to perform the ziplo/hi operation using the\n        # unpacklo/hi sse operations.\n        if typ == 'f16':\n            in0vk = '{in0}.v{k}'.format(k='0' if func == 'ziplo' else '1',\n                                        **fmtspec)\n            in1vk = '{in1}.v{k}'.format(k='0' if func == 'ziplo' else '1',\n                                        **fmtspec)\n            return \\\n            '''nsimd_{simd_ext}_v{typ} ret;\n               __m128 v_tmp0 = {get_low_in0vk};\n               __m128 v_tmp1 = {get_low_in1vk};\n               __m128 v_tmp2 = {get_high_in0vk};\n               __m128 v_tmp3 = {get_high_in1vk};\n               __m128 vres_lo0 = _mm_unpacklo_ps(v_tmp0, v_tmp1);\n               __m128 vres_hi0 = _mm_unpackhi_ps(v_tmp0, v_tmp1);\n               ret.v0 = {merge0};\n               __m128 vres_lo1 = _mm_unpacklo_ps(v_tmp2, v_tmp3);\n               __m128 vres_hi1 = _mm_unpackhi_ps(v_tmp2, v_tmp3);\n               ret.v1 = {merge1};\n               return ret;\n               '''.format(get_low_in0vk=extract(simd_ext, 'f32', LO, in0vk),\n                          get_low_in1vk=extract(simd_ext, 'f32', LO, in1vk),\n                          get_high_in0vk=extract(simd_ext, 'f32', HI, in0vk),\n                          get_high_in1vk=extract(simd_ext, 'f32', HI, in1vk),\n                          merge0=setr(simd_ext, 'f32', 'vres_lo0', 'vres_hi0'),\n                          merge1=setr(simd_ext, 'f32', 'vres_lo1', 'vres_hi1'),\n                          **fmtspec)\n        else:\n            hl = LO if func == 'ziplo' else HI\n            return \\\n            '''{nat} v_tmp0 = {half_in0};\n               {nat} v_tmp1 = {half_in1};\n               {nat} vres_lo = _mm_unpacklo{suf}(v_tmp0, v_tmp1);\n               {nat} vres_hi = 
_mm_unpackhi{suf}(v_tmp0, v_tmp1);\n               return {merge};\n               '''.format(nat=get_native_typ(simd_ext2, typ),\n                          half_in0=extract(simd_ext, typ, hl, common.in0),\n                          half_in1=extract(simd_ext, typ, hl, common.in1),\n                          merge=setr(simd_ext, typ, 'vres_lo', 'vres_hi'),\n                          **fmtspec)\n    else:\n        if typ == 'f16':\n            return \\\n            '''nsimd_{simd_ext}_v{typ} ret;\n               __m512 v0 = {in0}.v{k};\n               __m512 v1 = {in1}.v{k};\n               __m256 v_tmp0, v_tmp1, vres_lo, vres_hi;\n               /* Low part */\n               v_tmp0 = {low_v0};\n               v_tmp1 = {low_v1};\n               vres_lo = nsimd_ziplo_avx2_f32(v_tmp0, v_tmp1);\n               vres_hi = nsimd_ziphi_avx2_f32(v_tmp0, v_tmp1);\n               ret.v0 = {merge};\n               /* High part */\n               v_tmp0 = {high_v0};\n               v_tmp1 = {high_v1};\n               vres_lo = nsimd_ziplo_avx2_f32(v_tmp0, v_tmp1);\n               vres_hi = nsimd_ziphi_avx2_f32(v_tmp0, v_tmp1);\n               ret.v1 = {merge};\n               return ret;'''. 
\\\n               format(k='0' if func == 'ziplo' else '1',\n                      low_v0=extract(simd_ext, 'f32', LO, 'v0'),\n                      low_v1=extract(simd_ext, 'f32', LO, 'v1'),\n                      high_v0=extract(simd_ext, 'f32', HI, 'v0'),\n                      high_v1=extract(simd_ext, 'f32', HI, 'v1'),\n                      merge=setr(simd_ext, 'f32', 'vres_lo', 'vres_hi'),\n                      **fmtspec)\n        else:\n            hl = LO if func == 'ziplo' else HI\n            return \\\n            '''{nat} v_tmp0, v_tmp1;\n               v_tmp0 = {half_in0};\n               v_tmp1 = {half_in1};\n               {nat} vres_lo = nsimd_ziplo_avx2_{typ}(v_tmp0, v_tmp1);\n               {nat} vres_hi = nsimd_ziphi_avx2_{typ}(v_tmp0, v_tmp1);\n               return {merge};'''. \\\n               format(nat=get_native_typ(simd_ext2, typ),\n                      half_in0=extract(simd_ext, typ, hl, common.in0),\n                      half_in1=extract(simd_ext, typ, hl, common.in1),\n                      merge=setr(simd_ext, typ, 'vres_lo', 'vres_hi'),\n                      **fmtspec)\n\ndef zip(simd_ext, typ):\n    return '''nsimd_{simd_ext}_v{typ}x2 ret;\n              ret.v0 = nsimd_ziplo_{simd_ext}_{typ}({in0}, {in1});\n              ret.v1 = nsimd_ziphi_{simd_ext}_{typ}({in0}, {in1});\n              return ret;\n              '''.format(**fmtspec)\n\n# -----------------------------------------------------------------------------\n# unzip functions\n\ndef unzip_half(opts, func, simd_ext, typ):\n    loop = '''{typ} tab[{lex2}];\n              {typ} res[{le}];\n              int i;\n              nsimd_storeu_{simd_ext}_{typ}(tab, {in0});\n              nsimd_storeu_{simd_ext}_{typ}(tab + {le}, {in1});\n              for(i = 0; i < {le}; i++) {{\n                res[i] = tab[2 * i + {offset}];\n              }}\n              return nsimd_loadu_{simd_ext}_{typ}(res);\n              '''.format(lex2=2 * int(fmtspec['le']),\n                   
      offset='0' if func == 'unziplo' else '1', **fmtspec)\n\n    if simd_ext in sse:\n        if typ in ['f32', 'i32', 'u32']:\n            v0 = ('_mm_castsi128_ps({in0})' if typ in ['i32', 'u32'] \\\n                                            else '{in0}').format(**fmtspec)\n            v1 = ('_mm_castsi128_ps({in1})' if typ in ['i32', 'u32'] \\\n                                            else '{in1}').format(**fmtspec)\n            ret = ('_mm_castps_si128(v_res)' if typ in ['i32', 'u32'] \\\n                                             else 'v_res').format(**fmtspec)\n            return '''__m128 v_res;\n                      v_res = _mm_shuffle_ps({v0}, {v1}, {mask});\n                      return {ret};'''.format(\n                      mask='_MM_SHUFFLE(2, 0, 2, 0)' if func == 'unziplo' \\\n                      else '_MM_SHUFFLE(3, 1, 3, 1)',\n                      v0=v0, v1=v1, ret=ret, **fmtspec)\n        elif typ == 'f16':\n            return \\\n            '''nsimd_{simd_ext}_v{typ} v_res;\n               v_res.v0 = _mm_shuffle_ps({in0}.v0, {in0}.v1, {mask});\n               v_res.v1 = _mm_shuffle_ps({in1}.v0, {in1}.v1, {mask});\n               return v_res;'''.format(mask='_MM_SHUFFLE(2, 0, 2, 0)' \\\n                                       if func == 'unziplo' \\\n                                       else '_MM_SHUFFLE(3, 1, 3, 1)',\n                                       **fmtspec)\n        elif typ in ['f64', 'i64', 'u64']:\n            v0 = ('_mm_castsi128_pd({in0})' if typ in ['i64', 'u64'] \\\n                                            else '{in0}').format(**fmtspec)\n            v1 = ('_mm_castsi128_pd({in1})' if typ in ['i64', 'u64'] \\\n                                            else '{in1}').format(**fmtspec)\n            ret = ('_mm_castpd_si128(v_res)' if typ in ['i64', 'u64'] \\\n                                             else 'v_res').format(**fmtspec)\n            return '''__m128d v_res;\n                      v_res = 
_mm_shuffle_pd({v0}, {v1}, {mask});\n                      return {ret};\n                      '''.format(mask='0' if func == 'unziplo' else '3',\n                                 v0=v0, v1=v1, ret=ret, **fmtspec)\n        elif typ in ['i16', 'u16']:\n            return '''__m128i v_tmp0 = _mm_shufflelo_epi16(\n                                           {in0}, _MM_SHUFFLE(3, 1, 2, 0));\n                      v_tmp0 = _mm_shufflehi_epi16(v_tmp0,\n                                   _MM_SHUFFLE(3, 1, 2, 0));\n                      __m128i v_tmp1 = _mm_shufflelo_epi16({in1},\n                                   _MM_SHUFFLE(3, 1, 2, 0));\n                      v_tmp1 = _mm_shufflehi_epi16(v_tmp1,\n                                   _MM_SHUFFLE(3, 1, 2, 0));\n                      __m128 v_res = _mm_shuffle_ps(_mm_castsi128_ps(v_tmp0),\n                                         _mm_castsi128_ps(v_tmp1), {mask});\n                      return _mm_castps_si128(v_res);\n                      '''.format(mask='_MM_SHUFFLE(2, 0, 2, 0)' \\\n                                 if func == 'unziplo' \\\n                                 else '_MM_SHUFFLE(3, 1, 3, 1)', **fmtspec)\n        else:\n            return loop\n    elif simd_ext in avx:\n        ret_template = \\\n        '''v_tmp0 = _mm256_permute2f128_{t}({v0}, {v0}, 0x01);\n           v_tmp0 = _mm256_shuffle_{t}({v0}, v_tmp0, {mask});\n           v_tmp1 = _mm256_permute2f128_{t}({v1}, {v1}, 0x01);\n           v_tmp1 = _mm256_shuffle_{t}({v1}, v_tmp1, {mask});\n           v_res  = _mm256_permute2f128_{t}(v_tmp0, v_tmp1, 0x20);\n           {ret} = {v_res};'''\n        if typ in ['f32', 'i32', 'u32']:\n            v0 = '_mm256_castsi256_ps({in0})' \\\n                 if typ in ['i32', 'u32'] else '{in0}'\n            v1 = '_mm256_castsi256_ps({in1})' \\\n                 if typ in ['i32', 'u32'] else '{in1}'\n            v_res = '_mm256_castps_si256(v_res)' \\\n                    if typ in ['i32', 'u32'] else 'v_res'\n        
    ret = 'ret'\n            src = ret_template.format(mask='_MM_SHUFFLE(2, 0, 2, 0)' \\\n                      if func == 'unziplo' else '_MM_SHUFFLE(3, 1, 3, 1)',\n                      v0=v0, v1=v1, v_res=v_res, ret=ret, t='ps', **fmtspec)\n            return '''nsimd_{simd_ext}_v{typ} ret;\n                      __m256 v_res, v_tmp0, v_tmp1;\n                      {src}\n                      return ret;'''. \\\n                      format(src=src.format(**fmtspec), **fmtspec)\n        elif typ == 'f16':\n            src0 = ret_template.format(mask='_MM_SHUFFLE(2, 0, 2, 0)' \\\n                       if func == 'unziplo' else '_MM_SHUFFLE(3, 1, 3, 1)',\n                       v0='{in0}.v0', v1='{in0}.v1', v_res='v_res',\n                       ret='ret.v0', t='ps')\n            src1 = ret_template.format(mask='_MM_SHUFFLE(2, 0, 2, 0)' \\\n                       if func == 'unziplo' else '_MM_SHUFFLE(3, 1, 3, 1)',\n                       v0='{in1}.v0', v1='{in1}.v1', v_res='v_res',\n                       ret='ret.v1', t='ps')\n            return '''nsimd_{simd_ext}_v{typ} ret;\n                      __m256 v_res, v_tmp0, v_tmp1;\n                      {src0}\n                      {src1}\n                      return ret;'''.format(src0=src0.format(**fmtspec),\n                                            src1=src1.format(**fmtspec),\n                                            **fmtspec)\n        elif typ in ['f64', 'i64', 'u64']:\n            v0 = ('_mm256_castsi256_pd({in0})' \\\n                      if typ in ['i64', 'u64'] else '{in0}').format(**fmtspec)\n            v1 = ('_mm256_castsi256_pd({in1})' \\\n                      if typ in ['i64', 'u64'] else '{in1}').format(**fmtspec)\n            v_res = ('_mm256_castpd_si256(v_res)' \\\n                         if typ in ['i64', 'u64'] else 'v_res'). 
\\\n                         format(**fmtspec)\n            src = ret_template.format(mask='0x00' if func == 'unziplo' \\\n                      else '0x03', v0=v0, v1=v1, ret='ret', v_res=v_res,\n                      t='pd')\n            return '''nsimd_{simd_ext}_v{typ} ret;\n                      __m256d v_res, v_tmp0, v_tmp1;\n                      {src}\n                      return ret;'''.format(src=src.format(**fmtspec),\n                                            **fmtspec)\n        elif typ in ['i16', 'u16']:\n            return \\\n            '''__m128i v_tmp0_hi = {hi0};\n               __m128i v_tmp0_lo = {lo0};\n               __m128i v_tmp1_hi = {hi1};\n               __m128i v_tmp1_lo = {lo1};\n               v_tmp0_lo = nsimd_{func}_sse2_{typ}(v_tmp0_lo, v_tmp0_hi);\n               v_tmp1_lo = nsimd_{func}_sse2_{typ}(v_tmp1_lo, v_tmp1_hi);\n               return {merge};'''. \\\n               format(hi0=extract(simd_ext, typ, HI, common.in0),\n                      lo0=extract(simd_ext, typ, LO, common.in0),\n                      hi1=extract(simd_ext, typ, HI, common.in1),\n                      lo1=extract(simd_ext, typ, LO, common.in1),\n                      merge=setr(simd_ext, typ, 'v_tmp0_lo', 'v_tmp1_lo'),\n                      func=func, **fmtspec)\n        else:\n            return loop\n    else:\n        if typ == 'f16':\n            return \\\n            '''nsimd_{simd_ext}_v{typ} ret;\n               __m256 v_tmp0, v_tmp1, v_res_lo, v_res_hi;\n               v_tmp0 = {loin0v0};\n               v_tmp1 = {hiin0v0};\n               v_res_lo = nsimd_{func}_avx2_f32(v_tmp0, v_tmp1);\n               v_tmp0 = {loin0v1};\n               v_tmp1 = {hiin0v1};\n               v_res_hi = nsimd_{func}_avx2_f32(v_tmp0, v_tmp1);\n               ret.v0 = {merge};\n               v_tmp0 = {loin1v0};\n               v_tmp1 = {hiin1v0};\n               v_res_lo = nsimd_{func}_avx2_f32(v_tmp0, v_tmp1);\n               v_tmp0 = {loin1v1};\n           
    v_tmp1 = {hiin1v1};\n               v_res_hi = nsimd_{func}_avx2_f32(v_tmp0, v_tmp1);\n               ret.v1 = {merge};\n               return ret;'''.format(\n                   loin0v0=extract(simd_ext, 'f32', LO, common.in0 + '.v0'),\n                   hiin0v0=extract(simd_ext, 'f32', HI, common.in0 + '.v0'),\n                   loin0v1=extract(simd_ext, 'f32', LO, common.in0 + '.v1'),\n                   hiin0v1=extract(simd_ext, 'f32', HI, common.in0 + '.v1'),\n                   loin1v0=extract(simd_ext, 'f32', LO, common.in1 + '.v0'),\n                   hiin1v0=extract(simd_ext, 'f32', HI, common.in1 + '.v0'),\n                   loin1v1=extract(simd_ext, 'f32', LO, common.in1 + '.v1'),\n                   hiin1v1=extract(simd_ext, 'f32', HI, common.in1 + '.v1'),\n                   merge=setr(simd_ext, 'f32', 'v_res_lo', 'v_res_hi'),\n                   func=func, **fmtspec)\n        else:\n            return '''nsimd_avx2_v{typ} v00 = {extract_lo0};\n                      nsimd_avx2_v{typ} v01 = {extract_hi0};\n                      nsimd_avx2_v{typ} v10 = {extract_lo1};\n                      nsimd_avx2_v{typ} v11 = {extract_hi1};\n                      v00 = nsimd_{func}_avx2_{typ}(v00, v01);\n                      v01 = nsimd_{func}_avx2_{typ}(v10, v11);\n                      return {merge};'''.format(\n                          func=func,\n                          extract_lo0=extract(simd_ext, typ, LO, common.in0),\n                          extract_lo1=extract(simd_ext, typ, LO, common.in1),\n                          extract_hi0=extract(simd_ext, typ, HI, common.in0),\n                          extract_hi1=extract(simd_ext, typ, HI, common.in1),\n                          merge=setr(simd_ext, typ, 'v00', 'v01'), **fmtspec)\n\ndef unzip(simd_ext, typ):\n    return '''nsimd_{simd_ext}_v{typ}x2 ret;\n              ret.v0 = nsimd_unziplo_{simd_ext}_{typ}({in0}, {in1});\n              ret.v1 = nsimd_unziphi_{simd_ext}_{typ}({in0}, {in1});\n        
      return ret;'''.format(**fmtspec)\n\n# -----------------------------------------------------------------------------\n# mask_for_loop_tail\n\ndef mask_for_loop_tail(simd_ext, typ):\n    if typ == 'f16':\n        fill_n = '''n.v0 = {pre}set1_ps((f32)({in1} - {in0}));\n                    n.v1 = n.v0;'''.format(**fmtspec)\n    else:\n        fill_n = 'n = nsimd_set1_{simd_ext}_{typ}(({typ})({in1} - {in0}));'. \\\n                 format(**fmtspec)\n    return '''if ({in0} >= {in1}) {{\n                return nsimd_set1l_{simd_ext}_{typ}(0);\n              }}\n              if ({in1} - {in0} < {le}) {{\n                nsimd_{simd_ext}_v{typ} n;\n                {fill_n}\n                return nsimd_lt_{simd_ext}_{typ}(\n                         nsimd_iota_{simd_ext}_{typ}(), n);\n              }} else {{\n                return nsimd_set1l_{simd_ext}_{typ}(1);\n              }}'''.format(fill_n=fill_n, **fmtspec)\n\n# -----------------------------------------------------------------------------\n# iota\n\ndef iota(simd_ext, typ):\n    typ2 = 'f32' if typ == 'f16' else typ\n    iota = ', '.join(['({typ2}){i}'.format(typ2=typ2, i=i) \\\n                      for i in range(int(fmtspec['le']))])\n    if typ == 'f16':\n        return '''f32 buf[{le}] = {{ {iota} }};\n                  nsimd_{simd_ext}_vf16 ret;\n                  ret.v0 = {pre}loadu_ps(buf);\n                  ret.v1 = {pre}loadu_ps(buf + {le2});\n                  return ret;'''. \\\n                  format(iota=iota, le2=fmtspec['le'] // 2, **fmtspec)\n    return '''{typ} buf[{le}] = {{ {iota} }};\n              return {pre}loadu{sufsi}({cast}buf);'''. 
\\\n              format(iota=iota, cast='(__m{nbits}i*)'.format(**fmtspec) \\\n                                if typ in common.iutypes else '', **fmtspec)\n\n# -----------------------------------------------------------------------------\n# scatter\n\ndef scatter(simd_ext, typ):\n    if typ == 'f16':\n        return '''int i;\n                  f32 buf[{le}];\n                  i16 offset_buf[{le}];\n                  {pre}storeu_si{nbits}((__m{nbits}i *)offset_buf, {in1});\n                  {pre}storeu_ps(buf, {in2}.v0);\n                  {pre}storeu_ps(buf + {leo2}, {in2}.v1);\n                  for (i = 0; i < {le}; i++) {{\n                    {in0}[offset_buf[i]] = nsimd_f32_to_f16(buf[i]);\n                  }}'''.format(leo2=int(fmtspec['le']) // 2, **fmtspec)\n    if simd_ext in (sse + avx) or typ in ['i8', 'u8', 'i16', 'u16']:\n        cast = castsi(simd_ext, typ)\n        return '''int i;\n                  {typ} buf[{le}];\n                  {ityp} offset_buf[{le}];\n                  {pre}storeu_si{nbits}((__m{nbits}i *)offset_buf, {in1});\n                  {pre}storeu{sufsi}({cast}buf, {in2});\n                  for (i = 0; i < {le}; i++) {{\n                    {in0}[offset_buf[i]] = buf[i];\n                  }}'''.format(ityp='i' + typ[1:], cast=cast, **fmtspec)\n    # getting here means 32 and 64-bits types for avx512\n    return '''{pre}i{typnbits}scatter{suf}(\n                  (void *){in0}, {in1}, {in2}, {scale});'''. 
\\\n                  format(scale=int(typ[1:]) // 8, **fmtspec)\n\n# -----------------------------------------------------------------------------\n# linear scatter\n\ndef scatter_linear(simd_ext, typ):\n    if typ == 'f16':\n        return '''int i;\n                  f32 buf[{le}];\n                  {pre}storeu_ps(buf, {in2}.v0);\n                  {pre}storeu_ps(buf + {leo2}, {in2}.v1);\n                  for (i = 0; i < {le}; i++) {{\n                    {in0}[i * {in1}] = nsimd_f32_to_f16(buf[i]);\n                  }}'''.format(leo2=int(fmtspec['le']) // 2, **fmtspec)\n    if simd_ext in avx512:\n        return '''nsimd_scatter_linear_avx2_{typ}({in0}, {in1}, {lo});\n                  nsimd_scatter_linear_avx2_{typ}({in0} + ({leo2} * {in1}),\n                                                  {in1}, {hi});'''. \\\n                  format(leo2=int(fmtspec['le']) // 2,\n                         lo=extract(simd_ext, typ, LO, fmtspec['in2']),\n                         hi=extract(simd_ext, typ, HI, fmtspec['in2']),\n                         **fmtspec)\n    emulation = '''int i;\n                   {typ} buf[{le}];\n                   {pre}storeu{sufsi}({cast}buf, {in2});\n                   for (i = 0; i < {le}; i++) {{\n                     {in0}[i * {in1}] = buf[i];\n                   }}'''.format(cast=castsi(simd_ext, typ), **fmtspec)\n    if (simd_ext == 'sse2' and typ in ['i16', 'u16']) or \\\n       (simd_ext == 'avx' and \\\n        typ in ['i32', 'u32', 'f32', 'i64', 'u64', 'f64']) or \\\n       (simd_ext in ['sse42', 'avx2']):\n        trick = '\\n'.join([\n        '{in0}[{i} * {in1}] = {get_lane};'.format(i=i,\n        get_lane=get_lane(simd_ext, typ, '{in2}'.format(**fmtspec), i),\n        **fmtspec) for i in range(int(fmtspec['le']))])\n        return '''#if NSIMD_WORD_SIZE == 32\n                    {}\n                  #else\n                    {}\n                  #endif'''.format(emulation, trick)\n    else:\n        return emulation\n\n# 
-----------------------------------------------------------------------------\n# mask_scatter\n\ndef mask_scatter(simd_ext, typ):\n    if typ == 'f16':\n        le2 = fmtspec['le'] // 2\n        if simd_ext in sse + avx:\n            store_mask = '''{pre}storeu_ps(mask, {in0}.v0);\n                            {pre}storeu_ps(mask + {le2}, {in0}.v1);'''. \\\n                            format(le2=le2, **fmtspec)\n        else:\n            store_mask = '''_mm512_storeu_ps(mask, _mm512_maskz_mov_ps(\n                              {in0}.v0, _mm512_set1_ps(1.0f)));\n                            _mm512_storeu_ps(mask + {le2}, _mm512_maskz_mov_ps(\n                              {in0}.v1, _mm512_set1_ps(1.0f)));'''. \\\n                            format(le2=le2, **fmtspec)\n        return '''int i;\n                  f32 mask[{le}], buf[{le}];\n                  i16 offset_buf[{le}];\n                  {store_mask}\n                  {pre}storeu_si{nbits}((__m{nbits}i *)offset_buf, {in2});\n                  {pre}storeu_ps(buf, {in3}.v0);\n                  {pre}storeu_ps(buf + {le2}, {in3}.v1);\n                  for (i = 0; i < {le}; i++) {{\n                    if (nsimd_scalar_reinterpret_u32_f32(mask[i]) != (u32)0) {{\n                      {in1}[offset_buf[i]] = nsimd_f32_to_f16(buf[i]);\n                    }}\n                  }}'''.format(le2=le2, store_mask=store_mask, **fmtspec)\n    if simd_ext in (sse + avx) or typ in ['i8', 'u8', 'i16', 'u16']:\n        cast = castsi(simd_ext, typ)\n        if simd_ext in avx512:\n            mask_decl = 'u64 mask;'\n            store_mask = 'mask = (u64){in0};'.format(**fmtspec)\n            cond = '(mask >> i) & 1'\n        else:\n            mask_decl = '{typ} mask[{le}];'.format(**fmtspec)\n            store_mask = '{pre}storeu{sufsi}({cast}mask, {in0});'. 
\\\n                         format(cast=cast, **fmtspec)\n            cond = 'nsimd_scalar_reinterpret_{utyp}_{typ}(mask[i]) != '\\\n                   '({utyp})0'.format(utyp='u' + typ[1:], **fmtspec)\n        return '''int i;\n                  {typ} buf[{le}];\n                  {mask_decl}\n                  {ityp} offset_buf[{le}];\n                  {store_mask}\n                  {pre}storeu_si{nbits}((__m{nbits}i *)offset_buf, {in2});\n                  {pre}storeu{sufsi}({cast}buf, {in3});\n                  for (i = 0; i < {le}; i++) {{\n                    if ({cond}) {{\n                      {in1}[offset_buf[i]] = buf[i];\n                    }}\n                  }}'''.format(ityp='i' + typ[1:], cast=cast, cond=cond,\n                               mask_decl=mask_decl, store_mask=store_mask,\n                               **fmtspec)\n    # getting here means 32 and 64-bits types for avx512\n    return '''{pre}mask_i{typnbits}scatter{suf}(\n                  (void *){in1}, {in0}, {in2}, {in3}, {scale});'''. 
\\\n                  format(scale=int(typ[1:]) // 8, **fmtspec)\n\n# -----------------------------------------------------------------------------\n# gather\n\ndef gather(simd_ext, typ):\n    if typ == 'f16':\n        return '''nsimd_{simd_ext}_vf16 ret;\n                  int i;\n                  f32 buf[{le}];\n                  i16 offset_buf[{le}];\n                  {pre}storeu_si{nbits}((__m{nbits}i *)offset_buf, {in1});\n                  for (i = 0; i < {le}; i++) {{\n                    buf[i] = nsimd_f16_to_f32({in0}[offset_buf[i]]);\n                  }}\n                  ret.v0 = {pre}loadu_ps(buf);\n                  ret.v1 = {pre}loadu_ps(buf + {leo2});\n                  return ret;'''.format(leo2=int(fmtspec['le']) // 2,\n                                        **fmtspec)\n    if simd_ext in (sse + ['avx']) or typ in ['i8', 'u8', 'i16', 'u16']:\n        cast = castsi(simd_ext, typ)\n        return '''int i;\n                  {typ} buf[{le}];\n                  {ityp} offset_buf[{le}];\n                  {pre}storeu_si{nbits}((__m{nbits}i *)offset_buf, {in1});\n                  for (i = 0; i < {le}; i++) {{\n                    buf[i] = {in0}[offset_buf[i]];\n                  }}\n                  return {pre}loadu{sufsi}({cast}buf);'''. \\\n                  format(ityp='i' + typ[1:], cast=cast, **fmtspec)\n    # getting here means 32 and 64-bits types for avx2 and avx512\n    if simd_ext == 'avx2':\n        if typ in ['i64', 'u64']:\n            cast = '(nsimd_longlong *)'\n        elif typ in ['i32', 'u32']:\n            cast = '(int *)'\n        else:\n            cast = '({typ} *)'.format(**fmtspec)\n        return '''return {pre}i{typnbits}gather{suf}(\n                             {cast}{in0}, {in1}, {scale});'''. 
\\\n                             format(scale=int(typ[1:]) // 8, cast=cast,\n                                    **fmtspec)\n    elif simd_ext in avx512:\n        return 'return {pre}i{typnbits}gather{suf}({in1}, ' \\\n                      '(const void *){in0}, {scale});'. \\\n                      format(scale=int(typ[1:]) // 8, **fmtspec)\n\n# -----------------------------------------------------------------------------\n# linear gather\n\ndef gather_linear(simd_ext, typ):\n    le = int(fmtspec['le'])\n    cast = castsi(simd_ext, typ)\n    if typ == 'f16':\n        return '''nsimd_{simd_ext}_vf16 ret;\n                  f32 buf[{le}];\n                  int i;\n                  for (i = 0; i < {le}; i++) {{\n                    buf[i] = nsimd_f16_to_f32({in0}[i * {in1}]);\n                  }}\n                  ret.v0 = {pre}loadu_ps(buf);\n                  ret.v1 = {pre}loadu_ps(buf + {leo2});\n                  return ret;'''.format(leo2=le // 2, **fmtspec)\n    emulation = '''{typ} buf[{le}];\n                   int i;\n                   for (i = 0; i < {le}; i++) {{\n                     buf[i] = {in0}[i * {in1}];\n                   }}\n                   return {pre}loadu{sufsi}({cast}buf);'''. \\\n                   format(cast=cast, **fmtspec)\n    if simd_ext == 'sse2' and typ not in ['i16', 'u16']:\n        return emulation\n    if simd_ext in sse + avx:\n        trick = \\\n        '''nsimd_{simd_ext}_v{typ} ret;\n           ret = {pre}undefined{sufsi}();\n           '''.format(**fmtspec) + ''.join([\n           set_lane(simd_ext, typ, 'ret', '{in0}[{i} * {in1}]'. 
\\\n                                          format(i=i, **fmtspec), i) + '\\n' \\\n                                          for i in range(le)]) + \\\n        '''return ret;'''\n        return '''#if NSIMD_WORD_SIZE == 32\n                    {}\n                  #else\n                    {}\n                  #endif\n                  '''.format(emulation, trick)\n    # getting here means AVX-512\n    return \\\n    '''nsimd_avx2_v{typ} lo = _mm256_undefined{sufsi2}();\n       nsimd_avx2_v{typ} hi = _mm256_undefined{sufsi2}();\n       lo = nsimd_gather_linear_avx2_{typ}({in0}, {in1});\n       hi = nsimd_gather_linear_avx2_{typ}({in0} + ({leo2} * {in1}), {in1});\n       return {merge};'''.format(merge=setr(simd_ext, typ, 'lo', 'hi'),\n                                 sufsi2=suf_si('avx2', typ),\n                                 leo2=le // 2, **fmtspec)\n\n# -----------------------------------------------------------------------------\n# masked gather\n\ndef maskoz_gather(oz, simd_ext, typ):\n    if typ == 'f16':\n        le2 = fmtspec['le'] // 2\n        if simd_ext in sse + avx:\n            store_mask = '''{pre}storeu_ps(mask, {in0}.v0);\n                            {pre}storeu_ps(mask + {le2}, {in0}.v1);'''. \\\n                            format(le2=le2, **fmtspec)\n        else:\n            store_mask = '''_mm512_storeu_ps(mask, _mm512_maskz_mov_ps(\n                              {in0}.v0, _mm512_set1_ps(1.0f)));\n                            _mm512_storeu_ps(mask + {le2}, _mm512_maskz_mov_ps(\n                              {in0}.v1, _mm512_set1_ps(1.0f)));'''. \\\n                            format(le2=le2, **fmtspec)\n        if oz == 'z':\n            store_oz = '''{pre}storeu_ps(buf, {pre}setzero_ps());\n                          {pre}storeu_ps(buf + {le2}, {pre}setzero_ps());'''. 
\\\n                          format(le2=le2, **fmtspec)\n        else:\n            store_oz = '''{pre}storeu_ps(buf, {in3}.v0);\n                          {pre}storeu_ps(buf + {le2}, {in3}.v1);'''. \\\n                          format(le2=le2, **fmtspec)\n        return '''nsimd_{simd_ext}_vf16 ret;\n                  int i;\n                  f32 buf[{le}], mask[{le}];\n                  i16 offset_buf[{le}];\n                  {store_mask}\n                  {store_oz}\n                  {pre}storeu_si{nbits}((__m{nbits}i *)offset_buf, {in2});\n                  for (i = 0; i < {le}; i++) {{\n                    if (nsimd_scalar_reinterpret_u32_f32(mask[i]) != (u32)0) {{\n                      buf[i] = nsimd_f16_to_f32({in1}[offset_buf[i]]);\n                    }}\n                  }}\n                  ret.v0 = {pre}loadu_ps(buf);\n                  ret.v1 = {pre}loadu_ps(buf + {leo2});\n                  return ret;'''.format(leo2=le2, store_mask=store_mask,\n                                        store_oz=store_oz, **fmtspec)\n    if simd_ext in (sse + ['avx']) or typ in ['i8', 'u8', 'i16', 'u16']:\n        cast = castsi(simd_ext, typ)\n        if simd_ext in sse + avx:\n            mask_decl = '{typ} mask[{le}];'.format(**fmtspec)\n            store_mask = '{pre}storeu{sufsi}({cast}mask, {in0});'. \\\n                         format(cast=cast, **fmtspec)\n            if typ in common.iutypes:\n                comp = 'mask[i]'\n            else:\n                comp = 'nsimd_scalar_reinterpret_u{typnbits}_{typ}(mask[i])'. \\\n                       format(**fmtspec)\n        else:\n            mask_decl = 'u64 mask;'\n            store_mask = 'mask = (u64){in0};'.format(**fmtspec)\n            comp = '(mask >> i) & 1'\n        if oz == 'z':\n            store_oz = '''{pre}storeu{sufsi}({cast}buf,\n                                             {pre}setzero{sufsi}());'''. 
\\\n                                             format(cast=cast, **fmtspec)\n        else:\n            store_oz = '{pre}storeu{sufsi}({cast}buf, {in3});'. \\\n                       format(cast=cast, **fmtspec)\n        return '''int i;\n                  {typ} buf[{le}];\n                  {mask_decl}\n                  {ityp} offset_buf[{le}];\n                  {store_mask}\n                  {store_oz}\n                  {pre}storeu_si{nbits}((__m{nbits}i *)offset_buf, {in2});\n                  for (i = 0; i < {le}; i++) {{\n                    if ({comp}) {{\n                      buf[i] = {in1}[offset_buf[i]];\n                    }}\n                  }}\n                  return {pre}loadu{sufsi}({cast}buf);'''. \\\n                  format(ityp='i' + typ[1:], cast=cast, store_mask=store_mask,\n                         store_oz=store_oz, comp=comp, mask_decl=mask_decl,\n                         **fmtspec)\n    # getting here means 32 and 64-bits types for avx2 and avx512\n    if oz == 'o':\n        src = '{in3}'.format(**fmtspec)\n    else:\n        src = '{pre}setzero{sufsi}()'.format(**fmtspec)\n    if simd_ext == 'avx2':\n        if typ in ['i64', 'u64']:\n            cast = '(nsimd_longlong *)'\n        elif typ in ['i32', 'u32']:\n            cast = '(int *)'\n        else:\n            cast = '({typ} *)'.format(**fmtspec)\n        return '''return {pre}mask_i{typnbits}gather{suf}({src},\n                             {cast}{in1}, {in2}, {in0}, {scale});'''. \\\n                             format(scale=int(typ[1:]) // 8, cast=cast,\n                                    src=src, **fmtspec)\n    elif simd_ext in avx512:\n        return 'return {pre}mask_i{typnbits}gather{suf}({src}, {in0}, ' \\\n                      '{in2}, (const void *){in1}, {scale});'. 
\\\n                      format(src=src, scale=int(typ[1:]) // 8, **fmtspec)\n\n\n# -----------------------------------------------------------------------------\n# get_impl function\n\ndef get_impl(opts, func, simd_ext, from_typ, to_typ):\n    global fmtspec\n\n    fmtspec = {\n      'simd_ext': simd_ext,\n      'typ': from_typ,\n      'styp': get_native_typ(simd_ext, from_typ),\n      'from_typ': from_typ,\n      'to_typ': to_typ,\n      'pre': pre(simd_ext),\n      'suf': suf_ep(from_typ),\n      'sufsi': suf_si(simd_ext, from_typ),\n      'in0': common.in0,\n      'in1': common.in1,\n      'in2': common.in2,\n      'in3': common.in3,\n      'in4': common.in4,\n      'in5': common.in5,\n      'nbits': nbits(simd_ext),\n      'le': int(nbits(simd_ext)) // int(from_typ[1:]),\n      'typnbits': from_typ[1:]\n    }\n\n    impls = {\n        'loada': lambda: load(simd_ext, from_typ, True),\n        'masko_loada1': lambda: maskoz_load(simd_ext, from_typ, 'o', True),\n        'maskz_loada1': lambda: maskoz_load(simd_ext, from_typ, 'z', True),\n        'load2a': lambda: load_deg234(simd_ext, from_typ, True, 2),\n        'load3a': lambda: load_deg234(simd_ext, from_typ, True, 3),\n        'load4a': lambda: load_deg234(simd_ext, from_typ, True, 4),\n        'loadu': lambda: load(simd_ext, from_typ, False),\n        'masko_loadu1': lambda: maskoz_load(simd_ext, from_typ, 'o', False),\n        'maskz_loadu1': lambda: maskoz_load(simd_ext, from_typ, 'z', False),\n        'load2u': lambda: load_deg234(simd_ext, from_typ, False, 2),\n        'load3u': lambda: load_deg234(simd_ext, from_typ, False, 3),\n        'load4u': lambda: load_deg234(simd_ext, from_typ, False, 4),\n        'storea': lambda: store(simd_ext, from_typ, True),\n        'mask_storea1': lambda: mask_store(simd_ext, from_typ, True),\n        'store2a': lambda: store_deg234(simd_ext, from_typ, True, 2),\n        'store3a': lambda: store_deg234(simd_ext, from_typ, True, 3),\n        'store4a': lambda: 
store_deg234(simd_ext, from_typ, True, 4),\n        'storeu': lambda: store(simd_ext, from_typ, False),\n        'mask_storeu1': lambda: mask_store(simd_ext, from_typ, False),\n        'store2u': lambda: store_deg234(simd_ext, from_typ, False, 2),\n        'store3u': lambda: store_deg234(simd_ext, from_typ, False, 3),\n        'store4u': lambda: store_deg234(simd_ext, from_typ, False, 4),\n        'gather': lambda: gather(simd_ext, from_typ),\n        'gather_linear': lambda: gather_linear(simd_ext, from_typ),\n        'masko_gather': lambda: maskoz_gather('o', simd_ext, from_typ),\n        'maskz_gather': lambda: maskoz_gather('z', simd_ext, from_typ),\n        'scatter': lambda: scatter(simd_ext, from_typ),\n        'scatter_linear': lambda: scatter_linear(simd_ext, from_typ),\n        'mask_scatter': lambda: mask_scatter(simd_ext, from_typ),\n        'andb': lambda: binop2('andb', simd_ext, from_typ),\n        'xorb': lambda: binop2('xorb', simd_ext, from_typ),\n        'orb': lambda: binop2('orb', simd_ext, from_typ),\n        'andl': lambda: binlop2('andl', simd_ext, from_typ),\n        'xorl': lambda: binlop2('xorl', simd_ext, from_typ),\n        'orl': lambda: binlop2('orl', simd_ext, from_typ),\n        'notb': lambda: not1(simd_ext, from_typ),\n        'notl': lambda: lnot1(simd_ext, from_typ),\n        'andnotb': lambda: andnot2(simd_ext, from_typ),\n        'andnotl': lambda: landnot2(simd_ext, from_typ),\n        'add': lambda: addsub('add', simd_ext, from_typ),\n        'sub': lambda: addsub('sub', simd_ext, from_typ),\n        'adds': lambda: adds(simd_ext, from_typ),\n        'subs': lambda: subs(simd_ext, from_typ),\n        'div': lambda: div2(opts, simd_ext, from_typ),\n        'sqrt': lambda: sqrt1(simd_ext, from_typ),\n        'len': lambda: len1(simd_ext, from_typ),\n        'mul': lambda: mul2(opts, simd_ext, from_typ),\n        'shl': lambda: shl_shr('shl', simd_ext, from_typ),\n        'shr': lambda: shl_shr('shr', simd_ext, from_typ),\n     
   'shra': lambda: shra(opts, simd_ext, from_typ),\n        'set1': lambda: set1(simd_ext, from_typ),\n        'set1l': lambda: set1l(simd_ext, from_typ),\n        'eq': lambda: eq2(simd_ext, from_typ),\n        'ne': lambda: neq2(simd_ext, from_typ),\n        'gt': lambda: gt2(simd_ext, from_typ),\n        'lt': lambda: lt2(simd_ext, from_typ),\n        'ge': lambda: geq2(simd_ext, from_typ),\n        'le': lambda: leq2(simd_ext, from_typ),\n        'if_else1': lambda: if_else1(simd_ext, from_typ),\n        'min': lambda: minmax('min', simd_ext, from_typ),\n        'max': lambda: minmax('max', simd_ext, from_typ),\n        'loadla': lambda: loadl(simd_ext, from_typ, True),\n        'loadlu': lambda: loadl(simd_ext, from_typ, False),\n        'storela': lambda: storel(simd_ext, from_typ, True),\n        'storelu': lambda: storel(simd_ext, from_typ, False),\n        'abs': lambda: abs1(simd_ext, from_typ),\n        'fma': lambda: fma_fms('fma', simd_ext, from_typ),\n        'fnma': lambda: fma_fms('fnma', simd_ext, from_typ),\n        'fms': lambda: fma_fms('fms', simd_ext, from_typ),\n        'fnms': lambda: fma_fms('fnms', simd_ext, from_typ),\n        'ceil': lambda: round1(opts, 'ceil', simd_ext, from_typ),\n        'floor': lambda: round1(opts, 'floor', simd_ext, from_typ),\n        'trunc': lambda: trunc1(opts, simd_ext, from_typ),\n        'round_to_even': lambda: round_to_even1(opts, simd_ext, from_typ),\n        'all': lambda: all_any('all', simd_ext, from_typ),\n        'any': lambda: all_any('any', simd_ext, from_typ),\n        'reinterpret': lambda: reinterpret1(simd_ext, from_typ, to_typ),\n        'reinterpretl': lambda: reinterpretl1(simd_ext, from_typ, to_typ),\n        'cvt': lambda: convert1(simd_ext, from_typ, to_typ),\n        'rec11': lambda: rec11_rsqrt11('rcp', simd_ext, from_typ),\n        'rec8': lambda: rec11_rsqrt11('rcp', simd_ext, from_typ),\n        'rsqrt11': lambda: rec11_rsqrt11('rsqrt', simd_ext, from_typ),\n        'rsqrt8': 
lambda: rec11_rsqrt11('rsqrt', simd_ext, from_typ),\n        'rec': lambda: rec1(simd_ext, from_typ),\n        'neg': lambda: neg1(simd_ext, from_typ),\n        'nbtrue': lambda: nbtrue1(simd_ext, from_typ),\n        'reverse': lambda: reverse1(simd_ext, from_typ),\n        'addv': lambda: addv(simd_ext, from_typ),\n        'upcvt': lambda: upcvt1(simd_ext, from_typ, to_typ),\n        'downcvt': lambda: downcvt1(opts, simd_ext, from_typ, to_typ),\n        'to_mask': lambda: to_mask1(simd_ext, from_typ),\n        'to_logical': lambda: to_logical1(simd_ext, from_typ),\n        'ziplo': lambda: zip_half('ziplo', simd_ext, from_typ),\n        'ziphi': lambda: zip_half('ziphi', simd_ext, from_typ),\n        'unziplo': lambda: unzip_half(opts, 'unziplo', simd_ext, from_typ),\n        'unziphi': lambda: unzip_half(opts, 'unziphi', simd_ext, from_typ),\n        'zip' : lambda : zip(simd_ext, from_typ),\n        'unzip' : lambda : unzip(simd_ext, from_typ),\n        'mask_for_loop_tail': lambda : mask_for_loop_tail(simd_ext, from_typ),\n        'iota': lambda : iota(simd_ext, from_typ)\n    }\n    if simd_ext not in get_simd_exts():\n        raise ValueError('Unknown SIMD extension \"{}\"'.format(simd_ext))\n    if not from_typ in common.types:\n        raise ValueError('Unknown type \"{}\"'.format(from_typ))\n    if not func in impls:\n        return common.NOT_IMPLEMENTED\n    else:\n        return impls[func]()\n"
  },
  {
    "path": "egg/rocm.py",
    "content": "# Copyright (c) 2020 Agenium Scale\n#\n# Permission is hereby granted, free of charge, to any person obtaining a copy\n# of this software and associated documentation files (the \"Software\"), to deal\n# in the Software without restriction, including without limitation the rights\n# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell\n# copies of the Software, and to permit persons to whom the Software is\n# furnished to do so, subject to the following conditions:\n#\n# The above copyright notice and this permission notice shall be included in all\n# copies or substantial portions of the Software.\n#\n# THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\n# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\n# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\n# SOFTWARE.\n\nimport cuda\n\n# -----------------------------------------------------------------------------\n\ndef get_impl(operator, totyp, typ):\n    return cuda.get_impl(operator, totyp, typ)\n"
  },
  {
    "path": "egg/scalar.py",
    "content": "# Copyright (c) 2020 Agenium Scale\n#\n# Permission is hereby granted, free of charge, to any person obtaining a copy\n# of this software and associated documentation files (the \"Software\"), to deal\n# in the Software without restriction, including without limitation the rights\n# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell\n# copies of the Software, and to permit persons to whom the Software is\n# furnished to do so, subject to the following conditions:\n#\n# The above copyright notice and this permission notice shall be included in all\n# copies or substantial portions of the Software.\n#\n# THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\n# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\n# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\n# SOFTWARE.\n\nimport common\n\nfmtspec = dict()\n\n# -----------------------------------------------------------------------------\n\ndef opnum(func, typ):\n    normal = 'return ({typ})({func});'. \\\n             format(func=func.format(**fmtspec), **fmtspec)\n    if typ == 'f16':\n        return \\\n        '''#ifdef NSIMD_ARM_FP16\n             {normal}\n           #else\n             return nsimd_f32_to_f16({func});\n           #endif'''.format(normal=normal, func=func. \\\n           format(in0='nsimd_f16_to_f32({in0})',\n                  in1='nsimd_f16_to_f32({in1})',\n                  in2='nsimd_f16_to_f32({in2})').format(**fmtspec))\n    else:\n        return normal\n\n# -----------------------------------------------------------------------------\n\ndef cmp(func, typ):\n    normal = 'return ({func});'. 
\\\n             format(func=func.format(**fmtspec), **fmtspec)\n    if typ == 'f16':\n        return \\\n        '''#ifdef NSIMD_ARM_FP16\n             {normal}\n           #else\n             return ({func});\n           #endif'''.format(normal=normal, func=func. \\\n           format(in0='nsimd_f16_to_f32({in0})',\n                  in1='nsimd_f16_to_f32({in1})',\n                  in2='nsimd_f16_to_f32({in2})').format(**fmtspec))\n    else:\n        return normal\n\n# -----------------------------------------------------------------------------\n\ndef opbit(func, typ):\n    in0 = '{in0}'.format(**fmtspec) if typ in common.utypes else \\\n          'nsimd_scalar_reinterpret_u{typnbits}_{typ}({in0})'.format(**fmtspec)\n    in1 = '{in1}'.format(**fmtspec) if typ in common.utypes else \\\n          'nsimd_scalar_reinterpret_u{typnbits}_{typ}({in1})'.format(**fmtspec)\n    if typ in common.utypes:\n        return 'return ({typ})({func});'. \\\n               format(func=func.format(in0=in0, in1=in1), **fmtspec)\n    else:\n        return '''return nsimd_scalar_reinterpret_{typ}_u{typnbits}(\n                             (u{typnbits})({func}));'''.format(\n                             func=func.format(in0=in0, in1=in1), **fmtspec)\n\n# -----------------------------------------------------------------------------\n\ndef shift(func, typ):\n    if func == 'shl':\n        return 'return ({typ})({in0} << {in1});'.format(**fmtspec)\n    # getting here means shr or shra\n    if typ in common.utypes:\n        return 'return ({typ})({in0} >> {in1});'.format(**fmtspec)\n    # getting here means shr or shra on signed type\n    utyp = common.bitfield_type[typ]\n    if func == 'shr':\n        return '''return nsimd_scalar_reinterpret_{typ}_{utyp}(\n                           ({utyp})(nsimd_scalar_reinterpret_{utyp}_{typ}(\n                             {in0}) >> {in1}));'''.format(utyp=utyp, **fmtspec)\n    # getting here means shra on signed type\n    return \\\n    '''if ({in1} 
== 0) {{\n         return {in0};\n       }}\n       if ({in0} >= 0) {{\n         return nsimd_scalar_reinterpret_{typ}_{utyp}(({utyp})(\n                  nsimd_scalar_reinterpret_{utyp}_{typ}({in0}) >> {in1}));\n       }} else {{\n         {utyp} mask = ({utyp})((({utyp})-1) << ({typnbits} - {in1}));\n         return nsimd_scalar_reinterpret_{typ}_{utyp}(({utyp})(mask |\n                  ({utyp})(nsimd_scalar_reinterpret_{utyp}_{typ}(\n                    {in0}) >> {in1})));\n       }}'''.format(utyp=utyp, **fmtspec)\n\n# -----------------------------------------------------------------------------\n\ndef libm_opn(func, arity, typ, until_cpp11, c89_code):\n    cxx_version = '> 0' if not until_cpp11 else '>= 2011'\n    comment = \\\n    '''/* {func} is not available in C89 but is given by POSIX 2001 */\n       /* and C99. But we do not want to pollute the user includes  */\n       /* and POSIX value if set so we play dirty.                  */'''. \\\n       format(func=func)\n    args = ', '.join(['{{in{}}}'.format(i).format(**fmtspec) \\\n                      for i in range(arity)])\n    args_f16 = ', '.join(['nsimd_f16_to_f32({{in{}}})'.format(i). \\\n                          format(**fmtspec) for i in range(arity)])\n    args_f64 = ', '.join(['(f64){{in{}}}'.format(i).format(**fmtspec) \\\n                          for i in range(arity)])\n    args_f64_f16 = ', '.join(['(f64)nsimd_f16_to_f32({{in{}}})'.format(i). \\\n                              format(**fmtspec) for i in range(arity)])\n    if typ == 'f16':\n        c99_code = 'return nsimd_f32_to_f16({}f({}));'.format(func, args_f16)\n        if c89_code == '':\n            c89_code = 'return nsimd_f32_to_f16((f32){}({}));'. 
\\\n                       format(func, args_f64_f16)\n        return \\\n        '''  {comment}\n           #if defined(NSIMD_IS_MSVC) && _MSC_VER <= 1800 /* VS 2012 */\n             {c89_code}\n           #else\n             #if NSIMD_CXX {cxx_version} || NSIMD_C >= 1999 || \\\n                 _POSIX_C_SOURCE >= 200112L\n               {c99_code}\n             #else\n               {c89_code}\n             #endif\n           #endif'''. \\\n           format(comment=comment, cxx_version=cxx_version, c89_code=c89_code,\n                  c99_code=c99_code)\n    elif typ == 'f32':\n        c99_code = 'return {}f({});'.format(func, args)\n        if c89_code == '':\n            c89_code = 'return (f32){}({});'.format(func, args_f64)\n        return \\\n        '''  {comment}\n           #if defined(NSIMD_IS_MSVC) && _MSC_VER <= 1800 /* VS 2012 */\n             {c89_code}\n           #else\n             #if NSIMD_CXX {cxx_version} || NSIMD_C >= 1999 || \\\n                 _POSIX_C_SOURCE >= 200112L\n               {c99_code}\n             #else\n               {c89_code}\n             #endif\n           #endif'''. \\\n           format(comment=comment, cxx_version=cxx_version, c89_code=c89_code,\n                  c99_code=c99_code)\n    else:\n        normal = 'return {}({});'.format(func, args)\n        if c89_code == '':\n            return normal\n        return \\\n        '''  {comment}\n           #if NSIMD_CXX {cxx_version} || NSIMD_C >= 1999 || \\\n               _POSIX_C_SOURCE >= 200112L\n             {normal}\n           #else\n             {c89_code}\n           #endif'''. 
\\\n           format(comment=comment, normal=normal, c89_code=c89_code,\n                  cxx_version=cxx_version)\n\n# -----------------------------------------------------------------------------\n\ndef round_to_even(typ):\n    if typ in ['f32', 'f64']:\n        return \\\n        '''{typ} fl = nsimd_scalar_floor_{typ}({in0});\n           {typ} ce = nsimd_scalar_ceil_{typ}({in0});\n           {typ} df = {in0} - fl; /* exactly representable in IEEE754 */\n           {typ} dc = ce - {in0}; /* exactly representable in IEEE754 */\n           if (df < dc) {{\n             return fl;\n           }} else if (df > dc) {{\n             return ce;\n           }} else {{\n             {typ} fld2 = fl * 0.5{f}; /* exactly representable in IEEE754 */\n             if (fld2 == nsimd_scalar_floor_{typ}(fld2)) {{\n               return fl;\n             }} else {{\n               return ce;\n             }}\n           }}'''.format(f='f' if typ == 'f32' else '', **fmtspec)\n    elif typ == 'f16':\n        return \\\n        '''f32 in0 = nsimd_f16_to_f32({in0});\n           f32 fl = nsimd_scalar_floor_f32(in0);\n           f32 ce = nsimd_scalar_ceil_f32(in0);\n           f32 df = in0 - fl; /* exactly representable in IEEE754 */\n           f32 dc = ce - in0; /* exactly representable in IEEE754 */\n           if (df < dc) {{\n             return nsimd_f32_to_f16(fl);\n           }} else if (df > dc) {{\n             return nsimd_f32_to_f16(ce);\n           }} else {{\n             f32 fld2 = fl * 0.5f; /* exactly representable in IEEE754 */\n             if (fld2 == nsimd_scalar_floor_f32(fld2)) {{\n               return nsimd_f32_to_f16(fl);\n             }} else {{\n               return nsimd_f32_to_f16(ce);\n             }}\n           }}'''.format(**fmtspec)\n    else:\n        return 'return {in0};'.format(**fmtspec)\n\n# -----------------------------------------------------------------------------\n\ndef reinterpret(totyp, typ):\n    if totyp == typ:\n        return 
'return {in0};'.format(**fmtspec)\n    via_union = '''union {{ {typ} from; {totyp} to; }} buf;\n                   buf.from = {in0};\n                   return buf.to;'''.format(**fmtspec)\n    via_memcpy = '''{totyp} ret;\n                    memcpy((void *)&ret, (void *)&{in0}, sizeof(ret));\n                    return ret;'''.format(**fmtspec)\n    if typ == 'f16':\n        if totyp == 'u16':\n            emulated = 'return {in0}.u;'.format(**fmtspec)\n        else:\n            emulated = 'return nsimd_scalar_reinterpret_i16_u16({in0}.u);'. \\\n                       format(**fmtspec)\n        return \\\n        '''#if defined(NSIMD_ARM_FP16) && defined(NSIMD_IS_GCC)\n             {via_union}\n           #elif (defined(NSIMD_ARM_FP16) && !defined(NSIMD_IS_GCC)) || \\\n                 defined(NSIMD_CUDA) || defined(NSIMD_ROCM) || \\\n                 defined(NSIMD_ONEAPI)\n             {via_memcpy}\n           #else\n             {emulated}\n           #endif'''.format(via_union=via_union, via_memcpy=via_memcpy,\n                            emulated=emulated)\n    if totyp == 'f16':\n        if typ == 'u16':\n            emulated = '''f16 ret;\n                          ret.u = {in0};\n                          return ret;'''.format(**fmtspec)\n        else:\n            emulated = '''f16 ret;\n                          ret.u = nsimd_scalar_reinterpret_u16_i16({in0});\n                          return ret;'''.format(**fmtspec)\n        return \\\n        '''#if defined(NSIMD_ARM_FP16) && defined(NSIMD_IS_GCC)\n             {via_union}\n           #elif (defined(NSIMD_ARM_FP16) && !defined(NSIMD_IS_GCC)) || \\\n                 defined(NSIMD_CUDA) || defined(NSIMD_ROCM) || \\\n                 defined(NSIMD_ONEAPI)\n             {via_memcpy}\n           #else\n             {emulated}\n           #endif'''.format(via_union=via_union, via_memcpy=via_memcpy,\n                            emulated=emulated)\n    return '''#ifdef NSIMD_IS_GCC\n                
{via_union}\n              #else\n                {via_memcpy}\n              #endif'''.format(via_union=via_union, via_memcpy=via_memcpy)\n\n# -----------------------------------------------------------------------------\n\ndef cvt(totyp, typ):\n    if totyp == typ:\n        return 'return {in0};'.format(**fmtspec)\n    if typ == 'f16':\n        return '''#ifdef NSIMD_ARM_FP16\n                      return ({totyp}){in0};\n                  #else\n                      return ({totyp})nsimd_f16_to_f32({in0});\n                  #endif'''.format(**fmtspec)\n    if totyp == 'f16':\n        return '''#ifdef NSIMD_ARM_FP16\n                      return (f16){in0};\n                  #else\n                      return nsimd_f32_to_f16((f32){in0});\n                  #endif'''.format(**fmtspec)\n    return 'return ({totyp}){in0};'.format(**fmtspec)\n\n# -----------------------------------------------------------------------------\n\ndef adds(typ):\n    if typ in common.ftypes:\n        return opnum('{in0} + {in1}', typ)\n    if typ in common.utypes:\n        return '''{typ} tmp = ({typ})({in0} + {in1});\n                  if (tmp < {in0} || tmp < {in1}) {{\n                    return ({typ})-1;\n                  }} else {{\n                    return tmp;\n                  }}\n                  '''.format(**fmtspec)\n    # Getting here means typ is signed\n    int_max = 'NSIMD_' + typ.upper() + '_MAX'\n    int_min = 'NSIMD_' + typ.upper() + '_MIN'\n    return '''if (({in0} >= 0 && {in1} <= 0) || ({in0} <= 0 && {in1} >= 0)) {{\n                return ({typ})({in0} + {in1});\n              }} else {{\n                if ({in0} > 0) {{\n                  if ({in1} > {int_max} - {in0}) {{\n                    return {int_max};\n                  }} else {{\n                    return ({typ})({in0} + {in1});\n                  }}\n                }} else {{\n                  if ({in1} < {int_min} - {in0}) {{\n                    return {int_min};\n                  }} 
else {{\n                    return ({typ})({in0} + {in1});\n                  }}\n                }}\n              }}'''.format(int_min=int_min, int_max=int_max, **fmtspec)\n\n# -----------------------------------------------------------------------------\n\ndef subs(typ):\n    if typ in common.ftypes:\n        return opnum('{in0} - {in1}', typ)\n    if typ in common.utypes:\n        return '''if ({in0} < {in1}) {{\n                    return ({typ})0;\n                  }} else {{\n                    return ({typ})({in0} - {in1});\n                  }}\n                  '''.format(**fmtspec)\n    # Getting here means typ is signed\n    return 'return nsimd_scalar_adds_{typ}({in0}, ({typ})(-{in1}));'. \\\n           format(**fmtspec)\n\n# -----------------------------------------------------------------------------\n\ndef get_impl(operator, totyp, typ):\n\n    global fmtspec\n\n    fmtspec = {\n      'in0': common.in0,\n      'in1': common.in1,\n      'in2': common.in2,\n      'typ': typ,\n      'totyp': totyp,\n      'typnbits': typ[1:]\n    }\n\n    if operator.name == 'trunc':\n        if typ in common.iutypes:\n            return 'return {in0};'.format(**fmtspec)\n        elif typ == 'f16':\n            c89_code = \\\n            '''f32 buf = nsimd_f16_to_f32({in0});\n               return nsimd_f32_to_f16(buf >= 0.0f ?\n                                       nsimd_scalar_floor_f32(buf) :\n                                       nsimd_scalar_ceil_f32(buf));'''. \\\n                                       format(**fmtspec)\n        else:\n            c89_code = \\\n            '''return {in0} >= 0.0{f} ? nsimd_scalar_floor_{typ}({in0})\n                      : nsimd_scalar_ceil_{typ}({in0});'''. 
\\\n                      format(f='f' if typ == 'f32' else '', **fmtspec)\n        return libm_opn('trunc', 1, typ, True, c89_code)\n    if operator.name == 'abs':\n        if typ == 'f16':\n            return '''f32 tmp = nsimd_f16_to_f32({in0});\n                      return nsimd_f32_to_f16(tmp >= 0.0f ? tmp : -tmp);'''. \\\n                      format(**fmtspec)\n        elif typ in common.utypes:\n            return 'return {in0};'.format(**fmtspec)\n        else:\n            return 'return ({typ})({in0} >= ({typ})0 ? {in0} : -{in0});'. \\\n                   format(**fmtspec)\n    if operator.name in ['min', 'max']:\n        op = '<' if operator.name == 'min' else '>'\n        if typ == 'f16':\n            return '''f32 in0 = nsimd_f16_to_f32({in0});\n                      f32 in1 = nsimd_f16_to_f32({in1});\n                      return nsimd_f32_to_f16(in0 {op} in1 ? in0 : in1);'''. \\\n                      format(op=op, **fmtspec)\n        else:\n            return 'return {in0} {op} {in1} ? {in0} : {in1};'. \\\n                   format(op=op, **fmtspec)\n    if operator.name == 'to_logical':\n        if typ in common.iutypes:\n            return 'return {in0} != ({typ})0;'.format(**fmtspec)\n        else:\n            return '''return nsimd_scalar_reinterpret_u{typnbits}_{typ}(\n                               {in0}) != (u{typnbits})0;'''.format(**fmtspec)\n    if operator.name == 'to_mask':\n        if typ in common.utypes:\n            return 'return ({typ})({in0} ? -1 : 0);'.format(**fmtspec)\n        else:\n            return '''return nsimd_scalar_reinterpret_{typ}_u{typnbits}((\n                                 u{typnbits})({in0} ? -1 : 0));'''. 
\\\n                                 format(**fmtspec)\n    if operator.name == 'round_to_even':\n        return round_to_even(typ)\n    if operator.name in ['floor', 'ceil', 'sqrt']:\n        if typ in common.iutypes and operator.name != 'sqrt':\n            return 'return {in0};'.format(**fmtspec)\n        return libm_opn(operator.name, 1, typ, False, '')\n    if operator.name == 'fma':\n        if typ in common.iutypes:\n            return 'return ({typ})({in0} * {in1} + {in2});'.format(**fmtspec)\n        else:\n            if typ == 'f16':\n                c89_code = 'return nsimd_f32_to_f16(nsimd_f16_to_f32({in0}) ' \\\n                           '* nsimd_f16_to_f32({in1}) ' \\\n                           '+ nsimd_f16_to_f32({in2}));'.format(**fmtspec)\n            else:\n                c89_code = 'return {in0} * {in1} + {in2};'.format(**fmtspec)\n            return libm_opn(operator.name, 3, typ, False, c89_code)\n    if operator.name in ['fnma', 'fms', 'fnms']:\n        neg = '-' if operator.name in ['fnms', 'fnma'] else ''\n        op = '-' if operator.name in ['fms', 'fnms'] else '+'\n        if typ in common.iutypes:\n            return 'return ({typ})(({neg}{in0}) * {in1} {op} {in2});'. \\\n                   format(neg=neg, op=op, **fmtspec)\n        else:\n            typ2 = 'f32' if typ == 'f16' else typ\n            return opnum(\n            'nsimd_scalar_fma_{typ2}({neg}{{in0}}, {{in1}}, {op}{{in2}})'. \\\n            format(typ2=typ2, neg=neg, op=op, **fmtspec), typ)\n    f = 'f' if typ in ['f16', 'f32'] else ''\n    typ2 = 'f32' if typ == 'f16' else typ\n    if operator.src:\n        if typ == 'f16':\n            return \\\n            '''return nsimd_f32_to_f16(\n                        nsimd_sleef_{op_name}_scalar_f32({vas}));'''. \\\n                        format(op_name=operator.name,\n                               vas=', '.join(['nsimd_f16_to_f32({})'. 
\\\n                               format(common.get_arg(i)) \\\n                               for i in range(len(operator.params[1:]))]),\n                               **fmtspec)\n        else:\n            return 'return nsimd_sleef_{op_name}_scalar_{typ}({vas});'. \\\n                   format(op_name=operator.name,\n                          vas=common.get_args(len(operator.params[1:])),\n                          **fmtspec)\n    func = {\n        'orb': lambda: opbit('{in0} | {in1}', typ),\n        'andb': lambda: opbit('{in0} & {in1}', typ),\n        'andnotb': lambda: opbit('{in0} & (~{in1})', typ),\n        'notb': lambda: opbit('~{in0}', typ),\n        'xorb': lambda: opbit('{in0} ^ {in1}', typ),\n        'add': lambda: opnum('{in0} + {in1}', typ),\n        'sub': lambda: opnum('{in0} - {in1}', typ),\n        'mul': lambda: opnum('{in0} * {in1}', typ),\n        'div': lambda: opnum('{in0} / {in1}', typ),\n        'neg': lambda: opnum('-{in0}', typ),\n        'lt': lambda: cmp('{in0} < {in1}', typ),\n        'gt': lambda: cmp('{in0} > {in1}', typ),\n        'le': lambda: cmp('{in0} <= {in1}', typ),\n        'ge': lambda: cmp('{in0} >= {in1}', typ),\n        'ne': lambda: cmp('{in0} != {in1}', typ),\n        'eq': lambda: cmp('{in0} == {in1}', typ),\n        'andl': lambda: 'return {in0} && {in1};'.format(**fmtspec),\n        'orl': lambda: 'return {in0} || {in1};'.format(**fmtspec),\n        'xorl': lambda: 'return {in0} ^ {in1};'.format(**fmtspec),\n        'andnotl': lambda: 'return {in0} && (!{in1});'.format(**fmtspec),\n        'notl': lambda: 'return !{in0};'.format(**fmtspec),\n        'shl': lambda: shift('shl', typ),\n        'shr': lambda: shift('shr', typ),\n        'shra': lambda: shift('shra', typ),\n        'reinterpret': lambda: reinterpret(totyp, typ),\n        'cvt': lambda: cvt(totyp, typ),\n        'adds': lambda: adds(typ),\n        'subs': lambda: subs(typ),\n        'rec': lambda: opnum('1.0{f} / {{in0}}'.format(f=f), typ),\n        
'rec8': lambda: opnum('1.0{f} / {{in0}}'.format(f=f), typ),\n        'rec11': lambda: opnum('1.0{f} / {{in0}}'.format(f=f), typ),\n        'rsqrt': lambda:\n                 opnum('1.0{f} / nsimd_scalar_sqrt_{typ2}({{in0}})'. \\\n                 format(f=f, typ2=typ2), typ),\n        'rsqrt8': lambda:\n                  opnum('1.0{f} / nsimd_scalar_sqrt_{typ2}({{in0}})'. \\\n                  format(f=f, typ2=typ2), typ),\n        'rsqrt11': lambda:\n                   opnum('1.0{f} / nsimd_scalar_sqrt_{typ2}({{in0}})'. \\\n                   format(f=f, typ2=typ2), typ)\n    }\n    return func[operator.name]()\n\n"
  },
  {
    "path": "egg/x86_load_store_deg234.py",
    "content": "# Copyright (c) 2019 Agenium Scale\n#\n# Permission is hereby granted, free of charge, to any person obtaining a copy\n# of this software and associated documentation files (the \"Software\"), to deal\n# in the Software without restriction, including without limitation the rights\n# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell\n# copies of the Software, and to permit persons to whom the Software is\n# furnished to do so, subject to the following conditions:\n#\n# The above copyright notice and this permission notice shall be included in all\n# copies or substantial portions of the Software.\n#\n# THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\n# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\n# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\n# SOFTWARE.\n\nimport platform_x86 as x86\nimport common\n\nsse = ['sse2', 'sse42']\navx = ['avx', 'avx2']\navx512 = ['avx512_knl', 'avx512_skylake']\n\n###############################################################################\n# Helper\n\ndef perm64(var1, var2, ind1, ind2):\n    return '''_mm_castpd_si128(_mm_shuffle_pd(\n                _mm_castsi128_pd({}), _mm_castsi128_pd(\n                  {}), _MM_SHUFFLE2({}, {})))'''.format(var1, var2, ind1, ind2)\n\n###############################################################################\n\ndef get_load_v0v1(simd_ext, typ, align, fmtspec):\n    load = '{pre}load{a}{sufsi}'.format(a='' if align else 'u', **fmtspec)\n    if typ in ['f32', 'f64']:\n        return '''{styp} v0 = {load}(a0);\n                  {styp} v1 = {load}(a0 + {le});'''. 
\\\n                  format(load=load, **fmtspec)\n    else:\n        return '''{styp} v0 = {load}(({styp}*)a0);\n                  {styp} v1 = {load}(({styp}*)a0 + 1);'''. \\\n                  format(load=load, **fmtspec)\n\n###############################################################################\n\ndef load2_sse(simd_ext, typ, align, fmtspec2):\n    fmtspec = fmtspec2.copy()\n    fmtspec['load_v0v1'] = get_load_v0v1('sse', typ, align, fmtspec)\n    if typ in ['i8', 'u8']:\n        if simd_ext == 'sse42':\n            return \\\n            '''nsimd_sse42_v{typ}x2 ret;\n               {load_v0v1}\n               __m128i mask = _mm_set_epi8(15, 13, 11, 9, 7, 5, 3, 1, 14,\n                                           12, 10, 8, 6, 4, 2, 0);\n               __m128i A0 = _mm_shuffle_epi8(v0, mask);\n               __m128i B0 = _mm_shuffle_epi8(v1, mask);\n               ret.v0 = {perm0};\n               ret.v1 = {perm1};\n               return ret;'''. \\\n               format(perm0=perm64('A0', 'B0', '0', '0'),\n                      perm1=perm64('A0', 'B0', '1', '1'), **fmtspec)\n        else:\n            return \\\n            '''nsimd_sse2_v{typ}x2 ret;\n               {load_v0v1}\n               __m128i A1 = _mm_unpacklo_epi8(v0, v1);\n               __m128i B2 = _mm_unpackhi_epi8(v0, v1);\n               __m128i A3 = _mm_unpacklo_epi8(A1, B2);\n               __m128i B4 = _mm_unpackhi_epi8(A1, B2);\n               __m128i A5 = _mm_unpacklo_epi8(A3, B4);\n               __m128i B6 = _mm_unpackhi_epi8(A3, B4);\n               ret.v0 = _mm_unpacklo_epi8(A5, B6);\n               ret.v1 = _mm_unpackhi_epi8(A5, B6);\n               return ret;'''.format(**fmtspec)\n    if typ in ['i16', 'u16']:\n        if simd_ext == 'sse42':\n            return \\\n            '''nsimd_sse42_v{typ}x2 ret;\n               {load_v0v1}\n               __m128i mask = _mm_set_epi8(15, 14, 11, 10, 7, 6, 3, 2,\n                                           13, 12, 9, 8, 5, 4, 1, 
0);\n               __m128i A0 = _mm_shuffle_epi8(v0, mask);\n               __m128i B0 = _mm_shuffle_epi8(v1, mask);\n               ret.v0 = {perm0};\n               ret.v1 = {perm1};\n               return ret;'''. \\\n               format(perm0=perm64('A0', 'B0', '0', '0'),\n                      perm1=perm64('A0', 'B0', '1', '1'), **fmtspec)\n        else:\n            return \\\n            '''nsimd_sse2_v{typ}x2 ret;\n               {load_v0v1}\n               __m128i v2 = _mm_unpacklo_epi16(v0, v1);\n               __m128i v3 = _mm_unpackhi_epi16(v0, v1);\n               __m128i v5 = _mm_unpacklo_epi16(v2, v3);\n               __m128i v6 = _mm_unpackhi_epi16(v2, v3);\n               ret.v0 = _mm_unpacklo_epi16(v5, v6);\n               ret.v1 = _mm_unpackhi_epi16(v5, v6);\n               return ret;'''.format(**fmtspec)\n    if typ in ['i32', 'u32', 'f32']:\n        return '''nsimd_{simd_ext}_v{typ}x2 ret;\n                  {load_v0v1}\n                  {styp} A0 = _mm_unpacklo{suf}(v0, v1);\n                  {styp} B0 = _mm_unpackhi{suf}(v0, v1);\n                  ret.v0 = _mm_unpacklo{suf}(A0, B0);\n                  ret.v1 = _mm_unpackhi{suf}(A0, B0);\n                  return ret;'''.format(**fmtspec)\n    if typ in ['i64', 'u64', 'f64']:\n        return '''nsimd_{simd_ext}_v{typ}x2 ret;\n                  {load_v0v1}\n                  ret.v0 = _mm_unpacklo{suf}(v0, v1);\n                  ret.v1 = _mm_unpackhi{suf}(v0, v1);\n                  return ret;'''.format(**fmtspec)\n\n###############################################################################\n\ndef load2_avx(simd_ext, typ, align, fmtspec2):\n    fmtspec = fmtspec2.copy()\n    fmtspec['exlo_v0'] = x86.extract('avx', typ, x86.LO, 'v0')\n    fmtspec['exhi_v0'] = x86.extract('avx', typ, x86.HI, 'v0')\n    fmtspec['exlo_v1'] = x86.extract('avx', typ, x86.LO, 'v1')\n    fmtspec['exhi_v1'] = x86.extract('avx', typ, x86.HI, 'v1')\n    fmtspec['load_v0v1'] = get_load_v0v1('avx', typ, align, 
fmtspec)\n    fmtspec['a'] = 'a' if align else 'u'\n    if typ in ['i8', 'u8']:\n        if simd_ext == 'avx2':\n            return \\\n            '''nsimd_avx2_v{typ}x2 ret;\n               {load_v0v1}\n\n               __m256i mask = _mm256_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14,\n                                               1, 3, 5, 7, 9, 11, 13, 15,\n                                               0, 2, 4, 6, 8, 10, 12, 14,\n                                               1, 3, 5, 7, 9, 11, 13, 15);\n\n               __m256i A1 = _mm256_shuffle_epi8(v0, mask);\n               __m256i B1 = _mm256_shuffle_epi8(v1, mask);\n\n               __m256i A2 = _mm256_permute4x64_epi64(A1, _MM_SHUFFLE(3,1,2,0));\n               __m256i B2 = _mm256_permute4x64_epi64(B1, _MM_SHUFFLE(3,1,2,0));\n\n               ret.v0 = _mm256_permute2f128_si256(A2, B2, 2 << 4);\n               ret.v1 = _mm256_permute2f128_si256(A2, B2, (3 << 4) | 1);\n               return ret;'''.format(**fmtspec)\n        else:\n            return \\\n            '''nsimd_avx_v{typ}x2 ret;\n               {load_v0v1}\n\n               __m128i v0a = {exlo_v0};\n               __m128i v0b = {exhi_v0};\n               __m128i v1a = {exlo_v1};\n               __m128i v1b = {exhi_v1};\n\n               __m128i mask = _mm_set_epi8(15, 13, 11, 9, 7, 5, 3, 1,\n                                           14, 12, 10, 8, 6, 4, 2, 0);\n\n               __m128i A0a = _mm_shuffle_epi8(v0a, mask);\n               __m128i B0a = _mm_shuffle_epi8(v1a, mask);\n               __m128i A1a = {perm_a0};\n               __m128i B1a = {perm_a1};\n\n               __m128i A0b = _mm_shuffle_epi8(v0b, mask);\n               __m128i B0b = _mm_shuffle_epi8(v1b, mask);\n               __m128i A1b = {perm_b0};\n               __m128i B1b = {perm_b1};\n\n               ret.v0 = {merge_A1};\n               ret.v1 = {merge_B1};\n               return ret;'''. 
\\\n               format(merge_A1=x86.setr('avx', typ, 'A1a', 'A1b'),\n                      merge_B1=x86.setr('avx', typ, 'B1a', 'B1b'),\n                      perm_a0=perm64('A0a', 'B0a', '0', '0'),\n                      perm_a1=perm64('A0a', 'B0a', '1', '1'),\n                      perm_b0=perm64('A0b', 'B0b', '0', '0'),\n                      perm_b1=perm64('A0b', 'B0b', '1', '1'), **fmtspec)\n    if typ in ['i16', 'u16']:\n        if simd_ext == 'avx2':\n            return \\\n            '''nsimd_avx2_v{typ}x2 ret;\n               {load_v0v1}\n\n               __m256i A1 = _mm256_unpacklo_epi16(v0, v1);\n               __m256i B1 = _mm256_unpackhi_epi16(v0, v1);\n               __m256i A2 = _mm256_unpacklo_epi16(A1, B1);\n               __m256i B2 = _mm256_unpackhi_epi16(A1, B1);\n               ret.v0 = _mm256_unpacklo_epi16(A2, B2);\n               ret.v1 = _mm256_unpackhi_epi16(A2, B2);\n               return ret;'''.format(**fmtspec)\n        else:\n            return \\\n            '''nsimd_avx_v{typ}x2 ret;\n               {load_v0v1}\n\n               __m128i Aa = {exlo_v0};\n               __m128i Ba = {exhi_v0};\n               __m128i Ab = {exlo_v1};\n               __m128i Bb = {exhi_v1};\n\n               __m128i mask = _mm_set_epi8(15, 14, 11, 10, 7, 6, 3, 2,\n                                           13, 12, 9, 8, 5, 4, 1, 0);\n\n               __m128i XY0 = _mm_shuffle_epi8(Aa, mask);\n               __m128i XY1 = _mm_shuffle_epi8(Ba, mask);\n               __m128i Xa = {perm0};\n               __m128i Ya = {perm1};\n\n               XY0 = _mm_shuffle_epi8(Ab, mask);\n               XY1 = _mm_shuffle_epi8(Bb, mask);\n               __m128i Xb = {perm0};\n               __m128i Yb = {perm1};\n\n               ret.v0 = {mergeX};\n               ret.v1 = {mergeY};\n\n               return ret;'''. 
\\\n               format(perm0=perm64('XY0', 'XY1', '0', '0'),\n                      perm1=perm64('XY0', 'XY1', '1', '1'),\n                      mergeX=x86.setr('avx', typ, 'Xa', 'Xb'),\n                      mergeY=x86.setr('avx', typ, 'Ya', 'Yb'), **fmtspec)\n    if typ == 'f32':\n        return '''nsimd_{simd_ext}_vf32x2 ret;\n                  {load_v0v1}\n                  __m256 A1 = _mm256_unpacklo_ps(v0, v1);\n                  __m256 B1 = _mm256_unpackhi_ps(v0, v1);\n                  ret.v0 = _mm256_unpacklo_ps(A1, B1);\n                  ret.v1 = _mm256_unpackhi_ps(A1, B1);\n                  return ret;'''.format(**fmtspec)\n    if typ in ['i32', 'u32']:\n        if simd_ext == 'avx2':\n            return \\\n            '''nsimd_avx2_v{typ}x2 ret;\n               {load_v0v1}\n               __m256i A1 = _mm256_unpacklo_epi32(v0, v1);\n               __m256i B1 = _mm256_unpackhi_epi32(v0, v1);\n               ret.v0 = _mm256_unpacklo_epi32(A1, B1);\n               ret.v1 = _mm256_unpackhi_epi32(A1, B1);\n               return ret;'''.format(**fmtspec)\n        else:\n            return \\\n            '''nsimd_avx_v{typ}x2 ret;\n               nsimd_avx_vf32x2 retf32 = nsimd_load2{a}_avx_f32((f32 *){in0});\n               ret.v0 = _mm256_castps_si256(retf32.v0);\n               ret.v1 = _mm256_castps_si256(retf32.v1);\n               return ret;'''.format(**fmtspec)\n    if typ == 'f64':\n        return '''nsimd_{simd_ext}_vf64x2 ret;\n                  {load_v0v1}\n                  ret.v0 = _mm256_unpacklo_pd(v0, v1);\n                  ret.v1 = _mm256_unpackhi_pd(v0, v1);\n                  return ret;'''.format(**fmtspec)\n    if typ in ['i64', 'u64']:\n        if simd_ext == 'avx2':\n            return \\\n            '''nsimd_avx2_v{typ}x2 ret;\n               {load_v0v1}\n               ret.v0 = _mm256_unpacklo_epi64(v0, v1);\n               ret.v1 = _mm256_unpackhi_epi64(v0, v1);\n               return ret;'''.format(**fmtspec)\n        
else:\n            return \\\n             '''nsimd_avx_v{typ}x2 ret;\n                nsimd_avx_vf64x2 retf64 = nsimd_load2{a}_avx_f64((f64 *){in0});\n                ret.v0 = _mm256_castpd_si256(retf64.v0);\n                ret.v1 = _mm256_castpd_si256(retf64.v1);\n                return ret;'''.format(**fmtspec)\n\n###############################################################################\n\ndef load2_avx512(simd_ext, typ, align, fmtspec2):\n    fmtspec = fmtspec2.copy()\n    fmtspec['exlo_v0'] = x86.extract(simd_ext, typ, x86.LO, 'v0')\n    fmtspec['exhi_v0'] = x86.extract(simd_ext, typ, x86.HI, 'v0')\n    fmtspec['exlo_v1'] = x86.extract(simd_ext, typ, x86.LO, 'v1')\n    fmtspec['exhi_v1'] = x86.extract(simd_ext, typ, x86.HI, 'v1')\n    fmtspec['load_v0v1'] = get_load_v0v1(simd_ext, typ, align, fmtspec)\n    if typ in ['i8', 'u8']:\n        return \\\n        '''nsimd_{simd_ext}_v{typ}x2 ret;\n           {load_v0v1}\n\n           __m256i A0 = {exlo_v0};\n           __m256i B0 = {exhi_v0};\n           __m256i C0 = {exlo_v1};\n           __m256i D0 = {exhi_v1};\n\n           __m256i mask = _mm256_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14,\n                                           1, 3, 5, 7, 9, 11, 13, 15,\n                                           0, 2, 4, 6, 8, 10, 12, 14,\n                                           1, 3, 5, 7, 9, 11, 13, 15);\n\n           __m256i A1 = _mm256_shuffle_epi8(A0, mask);\n           __m256i B1 = _mm256_shuffle_epi8(B0, mask);\n           __m256i C1 = _mm256_shuffle_epi8(C0, mask);\n           __m256i D1 = _mm256_shuffle_epi8(D0, mask);\n\n           __m256i A2 = _mm256_permute4x64_epi64(A1, _MM_SHUFFLE(3,1,2,0));\n           __m256i B2 = _mm256_permute4x64_epi64(B1, _MM_SHUFFLE(3,1,2,0));\n           __m256i C2 = _mm256_permute4x64_epi64(C1, _MM_SHUFFLE(3,1,2,0));\n           __m256i D2 = _mm256_permute4x64_epi64(D1, _MM_SHUFFLE(3,1,2,0));\n\n           __m256i A3 = _mm256_permute2f128_si256(A2, B2, 2 << 4);\n           __m256i 
B3 = _mm256_permute2f128_si256(A2, B2, (3 << 4) | 1);\n           __m256i C3 = _mm256_permute2f128_si256(C2, D2, 2 << 4);\n           __m256i D3 = _mm256_permute2f128_si256(C2, D2, (3 << 4) | 1);\n\n           ret.v0 = {mergeAC};\n           ret.v1 = {mergeBD};\n           return ret;'''.format(mergeAC=x86.setr(simd_ext, typ, 'A3', 'C3'),\n                                 mergeBD=x86.setr(simd_ext, typ, 'B3', 'D3'),\n                                 **fmtspec)\n    if typ in ['i16', 'u16']:\n        return \\\n        '''nsimd_{simd_ext}_v{typ}x2 ret;\n           {load_v0v1}\n\n           __m256i A0a = {exlo_v0};\n           __m256i B0a = {exhi_v0};\n           __m256i A0b = {exlo_v1};\n           __m256i B0b = {exhi_v1};\n\n           __m256i A1 = _mm256_unpacklo_epi16(A0a, B0a);\n           __m256i B1 = _mm256_unpackhi_epi16(A0a, B0a);\n           __m256i A2 = _mm256_unpacklo_epi16(A1, B1);\n           __m256i B2 = _mm256_unpackhi_epi16(A1, B1);\n           __m256i A3a = _mm256_unpacklo_epi16(A2, B2);\n           __m256i B3a = _mm256_unpackhi_epi16(A2, B2);\n\n           A1 = _mm256_unpacklo_epi16(A0b, B0b);\n           B1 = _mm256_unpackhi_epi16(A0b, B0b);\n           A2 = _mm256_unpacklo_epi16(A1, B1);\n           B2 = _mm256_unpackhi_epi16(A1, B1);\n           __m256i A3b = _mm256_unpacklo_epi16(A2, B2);\n           __m256i B3b = _mm256_unpackhi_epi16(A2, B2);\n\n           ret.v0 = {mergeA};\n           ret.v1 = {mergeB};\n           return ret;'''.format(mergeA=x86.setr(simd_ext, typ, 'A3a', 'A3b'),\n                                 mergeB=x86.setr(simd_ext, typ, 'B3a', 'B3b'),\n                                 **fmtspec)\n    if typ in ['f32', 'i32', 'u32']:\n        return \\\n        '''nsimd_{simd_ext}_v{typ}x2 ret;\n           {load_v0v1}\n           __m512i mask1 = _mm512_setr_epi32(0, 2, 4, 6, 8, 10, 12, 14,\n                                             16, 18, 20, 22, 24, 26, 28, 30);\n           __m512i mask2 = _mm512_setr_epi32(1, 3, 5, 7, 9, 11, 
13, 15,\n                                             17, 19, 21, 23, 25, 27, 29, 31);\n           ret.v0 = _mm512_permutex2var{suf}(v0, mask1, v1);\n           ret.v1 = _mm512_permutex2var{suf}(v0, mask2, v1);\n           return ret;'''.format(**fmtspec)\n    if typ in ['f64', 'i64', 'u64']:\n        return \\\n        '''nsimd_{simd_ext}_v{typ}x2 ret;\n           {load_v0v1}\n           ret.v0 = _mm512_unpacklo{suf}(v0, v1);\n           ret.v1 = _mm512_unpackhi{suf}(v0, v1);\n           return ret;'''.format(**fmtspec)\n\n###############################################################################\n\ndef store2(simd_ext, typ, align, fmtspec2):\n    fmtspec = fmtspec2.copy()\n    fmtspec['store'] = '{pre}store{a}{sufsi}'.format(a='' if align else 'u',\n                                                     **fmtspec)\n    if typ in ['f32', 'f64']:\n        dest1 = '{in0}'.format(**fmtspec)\n        dest2 = '{in0} + {le}'.format(**fmtspec)\n    else:\n        dest1 = '(__m{nbits}i *){in0}'.format(**fmtspec)\n        dest2 = '(__m{nbits}i *){in0} + 1'.format(**fmtspec)\n    normal = '''{store}({dest1}, {pre}unpacklo{suf}({in1}, {in2}));\n                {store}({dest2}, {pre}unpackhi{suf}({in1}, {in2}));'''. 
\\\n                format(dest1=dest1, dest2=dest2, **fmtspec)\n    if simd_ext in sse:\n        return normal\n    fmtspec['exlo_in1'] = x86.extract(simd_ext, typ, x86.LO, common.in1)\n    fmtspec['exhi_in1'] = x86.extract(simd_ext, typ, x86.HI, common.in1)\n    fmtspec['exlo_in2'] = x86.extract(simd_ext, typ, x86.LO, common.in2)\n    fmtspec['exhi_in2'] = x86.extract(simd_ext, typ, x86.HI, common.in2)\n    fmtspec['normal'] = normal\n    fmtspec['dest1'] = dest1\n    fmtspec['dest2'] = dest2\n    if simd_ext == 'avx2':\n        if typ in ['i8', 'u8']:\n            return \\\n            '''__m256i A1 = _mm256_permute2f128_si256({in1}, {in2}, 2 << 4);\n               __m256i B1 = _mm256_permute2f128_si256(\n                              {in1}, {in2}, (3 << 4) | 1);\n\n               __m256i A2 = _mm256_permute4x64_epi64(A1, _MM_SHUFFLE(3,1,2,0));\n               __m256i B2 = _mm256_permute4x64_epi64(B1, _MM_SHUFFLE(3,1,2,0));\n\n               __m256i mask = _mm256_setr_epi8(0, 8, 1, 9, 2, 10, 3, 11,\n                                               4, 12, 5, 13, 6, 14, 7, 15,\n                                               0, 8, 1, 9, 2, 10, 3, 11,\n                                               4, 12, 5, 13, 6, 14, 7, 15);\n\n               {store}({dest1}, _mm256_shuffle_epi8(A2, mask));\n               {store}({dest2}, _mm256_shuffle_epi8(B2, mask));'''. 
\\\n               format(**fmtspec)\n        if typ in ['i16', 'u16']:\n            return normal\n    if simd_ext == 'avx':\n        if typ in ['i8', 'u8']:\n            return \\\n            '''__m128i v0a = {exlo_in1};\n               __m128i v0b = {exhi_in1};\n               __m128i v1a = {exlo_in2};\n               __m128i v1b = {exhi_in2};\n\n               __m128i A1a = _mm_unpacklo_epi8(v0a, v1a);\n               __m128i B1a = _mm_unpackhi_epi8(v0a, v1a);\n               __m128i A1b = _mm_unpacklo_epi8(v0b, v1b);\n               __m128i B1b = _mm_unpackhi_epi8(v0b, v1b);\n\n               __m256i A1 = {mergeA1};\n               __m256i B1 = {mergeB1};\n\n               {store}({dest1}, A1);\n               {store}({dest2}, B1);'''. \\\n               format(mergeA1=x86.setr('avx', typ, 'A1a', 'A1b'),\n                      mergeB1=x86.setr('avx', typ, 'B1a', 'B1b'),\n                      **fmtspec)\n        if typ in ['i16', 'u16']:\n            return \\\n            '''__m128i Xa = {exlo_in1};\n               __m128i Xb = {exhi_in1};\n               __m128i Ya = {exlo_in2};\n               __m128i Yb = {exhi_in2};\n\n               __m128i A0 = _mm_unpacklo_epi16(Xa, Ya);\n               __m128i B0 = _mm_unpackhi_epi16(Xa, Ya);\n               __m128i A1 = _mm_unpacklo_epi16(Xb, Yb);\n               __m128i B1 = _mm_unpackhi_epi16(Xb, Yb);\n\n               __m256i A = {merge0};\n               __m256i B = {merge1};\n\n               {store}({dest1}, A);\n               {store}({dest2}, B);'''. 
\\\n               format(merge0=x86.setr('avx', typ, 'A0', 'B0'),\n                      merge1=x86.setr('avx', typ, 'A1', 'B1'),\n                      **fmtspec)\n    if (simd_ext in avx and typ in ['f32', 'f64']) or \\\n       simd_ext == 'avx2' and typ in ['i32', 'u32', 'i64', 'u64']:\n        return normal\n    if simd_ext == 'avx' and typ in ['i32', 'u32', 'i64', 'u64']:\n        ftyp = '__m256' if typ in ['i32', 'u32'] else '__m256d'\n        fsuf = 'ps' if typ in ['i32', 'u32'] else 'pd'\n        return '''{ftyp} v0 = _mm256_castsi256_{fsuf}({in1});\n                  {ftyp} v1 = _mm256_castsi256_{fsuf}({in2});\n                  {store}({dest1}, _mm256_cast{fsuf}_si256(\n                          _mm256_unpacklo_{fsuf}(v0, v1)));\n                  {store}({dest2}, _mm256_cast{fsuf}_si256(\n                          _mm256_unpackhi_{fsuf}(v0, v1)));'''. \\\n                          format(ftyp=ftyp, fsuf=fsuf, **fmtspec)\n    if simd_ext in avx512:\n        if typ in ['i8', 'u8']:\n            return \\\n            '''__m256i A1 = {exlo_in1};\n               __m256i B1 = {exhi_in1};\n               __m256i C1 = {exlo_in2};\n               __m256i D1 = {exhi_in2};\n\n               __m256i A2 = _mm256_permute2f128_si256(A1, C1, 2 << 4);\n               __m256i B2 = _mm256_permute2f128_si256(A1, C1, (3 << 4) | 1);\n               __m256i C2 = _mm256_permute2f128_si256(B1, D1, 2 << 4);\n               __m256i D2 = _mm256_permute2f128_si256(B1, D1, (3 << 4) | 1);\n\n               __m256i A3 = _mm256_permute4x64_epi64(A2, _MM_SHUFFLE(3,1,2,0));\n               __m256i B3 = _mm256_permute4x64_epi64(B2, _MM_SHUFFLE(3,1,2,0));\n               __m256i C3 = _mm256_permute4x64_epi64(C2, _MM_SHUFFLE(3,1,2,0));\n               __m256i D3 = _mm256_permute4x64_epi64(D2, _MM_SHUFFLE(3,1,2,0));\n\n               __m256i mask = _mm256_setr_epi8(0, 8, 1, 9, 2, 10, 3, 11,\n                                               4, 12, 5, 13, 6, 14, 7, 15,\n                         
                      0, 8, 1, 9, 2, 10, 3, 11,\n                                               4, 12, 5, 13, 6, 14, 7, 15);\n\n               __m256i A4 = _mm256_shuffle_epi8(A3, mask);\n               __m256i B4 = _mm256_shuffle_epi8(B3, mask);\n               __m256i C4 = _mm256_shuffle_epi8(C3, mask);\n               __m256i D4 = _mm256_shuffle_epi8(D3, mask);\n\n               {store}({dest1}, {mergeAB});\n               {store}({dest2}, {mergeCD});'''. \\\n               format(mergeAB=x86.setr(simd_ext, typ, 'A4', 'B4'),\n                      mergeCD=x86.setr(simd_ext, typ, 'C4', 'D4'),\n                      **fmtspec)\n        if typ in ['i16', 'u16']:\n            return \\\n            '''__m256i A0a = {exlo_in1};\n               __m256i A0b = {exhi_in1};\n               __m256i B0a = {exlo_in2};\n               __m256i B0b = {exhi_in2};\n\n               __m256i A1a = _mm256_unpacklo_epi16(A0a, B0a);\n               __m256i B1a = _mm256_unpackhi_epi16(A0a, B0a);\n               __m256i A1b = _mm256_unpacklo_epi16(A0b, B0b);\n               __m256i B1b = _mm256_unpackhi_epi16(A0b, B0b);\n\n               {store}({dest1}, {mergea});\n               {store}({dest2}, {mergeb});'''.\\\n               format(mergea=x86.setr(simd_ext, typ, 'A1a', 'B1a'),\n                      mergeb=x86.setr(simd_ext, typ, 'A1b', 'B1b'),\n                      **fmtspec)\n        if typ in ['i32', 'f32', 'u32']:\n            return \\\n            '''__m512i mask1 = _mm512_setr_epi32(0, 16, 1, 17, 2, 18, 3, 19,\n                                                 4, 20, 5, 21, 6, 22, 7, 23);\n               __m512i mask2 = _mm512_setr_epi32(8, 24, 9, 25, 10, 26, 11, 27,\n                                                 12, 28, 13, 29, 14, 30, 15,\n                                                 31);\n               {store}({dest1}, _mm512_permutex2var{suf}({in1}, mask1, {in2}));\n               {store}({dest2}, _mm512_permutex2var{suf}(\n                   {in1}, mask2, 
{in2}));'''.format(**fmtspec)\n        if typ in ['i64', 'u64', 'f64']:\n            return \\\n            '''{store}({dest1}, _mm512_unpacklo{suf}({in1}, {in2}));\n               {store}({dest2}, _mm512_unpackhi{suf}({in1}, {in2}));'''. \\\n               format(**fmtspec)\n\n###############################################################################\n\ndef get_load_v0v1v2v3(simd_ext, typ, align, fmtspec):\n    load = '{pre}load{a}{sufsi}'.format(a='' if align else 'u', **fmtspec)\n    if typ in ['f32', 'f64']:\n        return '''{styp} v0 = {load}(a0);\n                  {styp} v1 = {load}(a0 + {le});\n                  {styp} v2 = {load}(a0 + (2 * {le}));\n                  {styp} v3 = {load}(a0 + (3 * {le}));'''. \\\n                  format(load=load, **fmtspec)\n    else:\n        return '''{styp} v0 = {load}(({styp}*)a0);\n                  {styp} v1 = {load}(({styp}*)a0 + 1);\n                  {styp} v2 = {load}(({styp}*)a0 + 2);\n                  {styp} v3 = {load}(({styp}*)a0 + 3);'''. 
\\\n                  format(load=load, **fmtspec)\n\n###############################################################################\n\ndef load4_sse(simd_ext, typ, align, fmtspec2):\n    fmtspec = fmtspec2.copy()\n    fmtspec['load_v0v1v2v3'] = get_load_v0v1v2v3('sse', typ, align, fmtspec)\n    if typ in ['i8', 'u8']:\n        if simd_ext == 'sse42':\n            return \\\n            '''nsimd_sse42_v{typ}x4 ret;\n               {load_v0v1v2v3}\n               __m128i mask = _mm_set_epi8(15, 11, 7, 3, 14, 10, 6, 2,\n                                           13,  9, 5, 1, 12,  8, 4, 0);\n               __m128d A1 = _mm_castsi128_pd(_mm_shuffle_epi8(v0, mask));\n               __m128d B1 = _mm_castsi128_pd(_mm_shuffle_epi8(v1, mask));\n               __m128d C1 = _mm_castsi128_pd(_mm_shuffle_epi8(v2, mask));\n               __m128d D1 = _mm_castsi128_pd(_mm_shuffle_epi8(v3, mask));\n\n               __m128 A2 = _mm_castpd_ps(_mm_shuffle_pd(A1, B1,\n                                                        _MM_SHUFFLE2(0, 0)));\n               __m128 A3 = _mm_castpd_ps(_mm_shuffle_pd(C1, D1,\n                                                        _MM_SHUFFLE2(0, 0)));\n               __m128 C2 = _mm_castpd_ps(_mm_shuffle_pd(A1, B1,\n                                                        _MM_SHUFFLE2(1, 1)));\n               __m128 C3 = _mm_castpd_ps(_mm_shuffle_pd(C1, D1,\n                                                        _MM_SHUFFLE2(1, 1)));\n\n               ret.v0 = _mm_castps_si128(_mm_shuffle_ps(\n                            A2, A3, _MM_SHUFFLE(2, 0, 2, 0)));\n               ret.v1 = _mm_castps_si128(_mm_shuffle_ps(\n                            A2, A3, _MM_SHUFFLE(3, 1, 3, 1)));\n               ret.v2 = _mm_castps_si128(_mm_shuffle_ps(\n                            C2, C3, _MM_SHUFFLE(2, 0, 2, 0)));\n               ret.v3 = _mm_castps_si128(_mm_shuffle_ps(\n                            C2, C3, _MM_SHUFFLE(3, 1, 3, 1)));\n               return 
ret;'''.format(**fmtspec)\n        else:\n            return \\\n            '''nsimd_sse2_v{typ}x4 ret;\n               {load_v0v1v2v3}\n               __m128i A1 = _mm_unpacklo_epi8(v0, v2);\n               __m128i B1 = _mm_unpackhi_epi8(v0, v2);\n               __m128i C1 = _mm_unpacklo_epi8(v1, v3);\n               __m128i D1 = _mm_unpackhi_epi8(v1, v3);\n\n               __m128i A2 = _mm_unpacklo_epi8(A1, C1);\n               __m128i B2 = _mm_unpackhi_epi8(A1, C1);\n               __m128i C2 = _mm_unpacklo_epi8(B1, D1);\n               __m128i D2 = _mm_unpackhi_epi8(B1, D1);\n\n               __m128i A3 = _mm_unpacklo_epi8(A2, C2);\n               __m128i B3 = _mm_unpackhi_epi8(A2, C2);\n               __m128i C3 = _mm_unpacklo_epi8(B2, D2);\n               __m128i D3 = _mm_unpackhi_epi8(B2, D2);\n\n               ret.v0 = _mm_unpacklo_epi8(A3, C3);\n               ret.v1 = _mm_unpackhi_epi8(A3, C3);\n               ret.v2 = _mm_unpacklo_epi8(B3, D3);\n               ret.v3 = _mm_unpackhi_epi8(B3, D3);\n               return ret;'''.format(**fmtspec)\n    if typ in ['i16', 'u16']:\n        return \\\n        '''nsimd_{simd_ext}_v{typ}x4 ret;\n           {load_v0v1v2v3}\n           __m128i E = _mm_unpacklo_epi16(v0,v1);\n           __m128i F = _mm_unpackhi_epi16(v0,v1);\n           __m128i G = _mm_unpacklo_epi16(v2,v3);\n           __m128i H = _mm_unpackhi_epi16(v2,v3);\n\n           __m128i I = _mm_unpacklo_epi16(E,F);\n           __m128i J = _mm_unpackhi_epi16(E,F);\n           __m128i K = _mm_unpacklo_epi16(G,H);\n           __m128i L = _mm_unpackhi_epi16(G,H);\n\n           ret.v0 = _mm_unpacklo_epi64(I,K);\n           ret.v1 = _mm_unpackhi_epi64(I,K);\n           ret.v2 = _mm_unpacklo_epi64(J,L);\n           ret.v3 = _mm_unpackhi_epi64(J,L);\n           return ret;'''.format(**fmtspec)\n    if typ in ['f32', 'i32', 'u32']:\n        return \\\n        '''nsimd_{simd_ext}_v{typ}x4 ret;\n           {load_v0v1v2v3}\n           {styp} A1 = _mm_unpacklo{suf}(v0, 
v2);\n           {styp} B1 = _mm_unpackhi{suf}(v0, v2);\n           {styp} C1 = _mm_unpacklo{suf}(v1, v3);\n           {styp} D1 = _mm_unpackhi{suf}(v1, v3);\n\n           ret.v0 = _mm_unpacklo{suf}(A1, C1);\n           ret.v1 = _mm_unpackhi{suf}(A1, C1);\n           ret.v2 = _mm_unpacklo{suf}(B1, D1);\n           ret.v3 = _mm_unpackhi{suf}(B1, D1);\n\n           return ret;'''.format(**fmtspec)\n    if typ in ['f64', 'i64', 'u64']:\n        return \\\n        '''nsimd_{simd_ext}_v{typ}x4 ret;\n           {load_v0v1v2v3}\n           ret.v0 = _mm_unpacklo{suf}(v0, v2);\n           ret.v1 = _mm_unpackhi{suf}(v0, v2);\n           ret.v2 = _mm_unpacklo{suf}(v1, v3);\n           ret.v3 = _mm_unpackhi{suf}(v1, v3);\n           return ret;'''.format(**fmtspec)\n\n###############################################################################\n\ndef load4_avx(simd_ext, typ, align, fmtspec2):\n    fmtspec = fmtspec2.copy()\n    fmtspec['load_v0v1v2v3'] = get_load_v0v1v2v3('avx', typ, align, fmtspec)\n    fmtspec['exlo_v0'] = x86.extract('avx', typ, x86.LO, 'v0')\n    fmtspec['exhi_v0'] = x86.extract('avx', typ, x86.HI, 'v0')\n    fmtspec['exlo_v1'] = x86.extract('avx', typ, x86.LO, 'v1')\n    fmtspec['exhi_v1'] = x86.extract('avx', typ, x86.HI, 'v1')\n    fmtspec['exlo_v2'] = x86.extract('avx', typ, x86.LO, 'v2')\n    fmtspec['exhi_v2'] = x86.extract('avx', typ, x86.HI, 'v2')\n    fmtspec['exlo_v3'] = x86.extract('avx', typ, x86.LO, 'v3')\n    fmtspec['exhi_v3'] = x86.extract('avx', typ, x86.HI, 'v3')\n    fmtspec['a'] = 'a' if align else 'u'\n    if typ in ['i8', 'u8']:\n        if simd_ext == 'avx2':\n            return \\\n            '''nsimd_avx2_v{typ}x4 ret;\n               {load_v0v1v2v3}\n\n               __m256i mask = _mm256_setr_epi8(0, 4, 8, 12, 1, 5, 9, 13,\n                                               2, 6, 10, 14, 3, 7, 11, 15,\n                                               0, 4, 8, 12, 1, 5, 9, 13, 2,\n                                               6, 
10, 14, 3, 7, 11, 15);\n               __m256i mask2 = _mm256_setr_epi32(0, 4, 1, 5, 2, 6, 3, 7);\n\n               __m256i A1 = _mm256_shuffle_epi8(v0, mask);\n               __m256i B1 = _mm256_shuffle_epi8(v1, mask);\n               __m256i C1 = _mm256_shuffle_epi8(v2, mask);\n               __m256i D1 = _mm256_shuffle_epi8(v3, mask);\n\n               __m256i A2 = _mm256_permutevar8x32_epi32(A1, mask2);\n               __m256i B2 = _mm256_permutevar8x32_epi32(B1, mask2);\n               __m256i C2 = _mm256_permutevar8x32_epi32(C1, mask2);\n               __m256i D2 = _mm256_permutevar8x32_epi32(D1, mask2);\n\n               __m256i A3 = _mm256_permute2x128_si256(A2, C2, 2 << 4);\n               __m256i C3 = _mm256_permute2x128_si256(B2, D2, 2 << 4);\n               __m256i B3 = _mm256_permute2x128_si256(A2, C2, (3 << 4) | 1);\n               __m256i D3 = _mm256_permute2x128_si256(B2, D2, (3 << 4) | 1);\n\n               ret.v0 = _mm256_unpacklo_epi64(A3, C3);\n               ret.v1 = _mm256_unpackhi_epi64(A3, C3);\n               ret.v2 = _mm256_unpacklo_epi64(B3, D3);\n               ret.v3 = _mm256_unpackhi_epi64(B3, D3);\n               return ret;'''.format(**fmtspec)\n        else:\n            return \\\n            '''nsimd_avx_v{typ}x4 ret;\n               {load_v0v1v2v3}\n\n               __m128i Aa = {exlo_v0};\n               __m128i Ba = {exhi_v0};\n               __m128i Ca = {exlo_v1};\n               __m128i Da = {exhi_v1};\n               __m128i Ab = {exlo_v2};\n               __m128i Bb = {exhi_v2};\n               __m128i Cb = {exlo_v3};\n               __m128i Db = {exhi_v3};\n\n               __m128i mask = _mm_set_epi8(15, 11, 7, 3, 14, 10, 6, 2,\n                                           13,  9, 5, 1, 12,  8, 4, 0);\n\n               __m128d A1 = _mm_castsi128_pd(_mm_shuffle_epi8(Aa, mask));\n               __m128d B1 = _mm_castsi128_pd(_mm_shuffle_epi8(Ba, mask));\n               __m128d C1 = _mm_castsi128_pd(_mm_shuffle_epi8(Ca, 
mask));\n               __m128d D1 = _mm_castsi128_pd(_mm_shuffle_epi8(Da, mask));\n\n               __m128 A2 = _mm_castpd_ps(_mm_shuffle_pd(A1, B1,\n                                                        _MM_SHUFFLE2(0, 0)));\n               __m128 A3 = _mm_castpd_ps(_mm_shuffle_pd(C1, D1,\n                                                        _MM_SHUFFLE2(0, 0)));\n               __m128 C2 = _mm_castpd_ps(_mm_shuffle_pd(A1, B1,\n                                                        _MM_SHUFFLE2(1, 1)));\n               __m128 C3 = _mm_castpd_ps(_mm_shuffle_pd(C1, D1,\n                                                        _MM_SHUFFLE2(1, 1)));\n\n               __m128i Wa = _mm_castps_si128(_mm_shuffle_ps(A2, A3,\n                                             _MM_SHUFFLE(2, 0, 2, 0)));\n               __m128i Xa = _mm_castps_si128(_mm_shuffle_ps(A2, A3,\n                                             _MM_SHUFFLE(3, 1, 3, 1)));\n               __m128i Ya = _mm_castps_si128(_mm_shuffle_ps(C2, C3,\n                                             _MM_SHUFFLE(2, 0, 2, 0)));\n               __m128i Za = _mm_castps_si128(_mm_shuffle_ps(C2, C3,\n                                             _MM_SHUFFLE(3, 1, 3, 1)));\n\n               A1 = _mm_castsi128_pd(_mm_shuffle_epi8(Ab, mask));\n               B1 = _mm_castsi128_pd(_mm_shuffle_epi8(Bb, mask));\n               C1 = _mm_castsi128_pd(_mm_shuffle_epi8(Cb, mask));\n               D1 = _mm_castsi128_pd(_mm_shuffle_epi8(Db, mask));\n\n               A2 = _mm_castpd_ps(_mm_shuffle_pd(A1, B1, _MM_SHUFFLE2(0, 0)));\n               A3 = _mm_castpd_ps(_mm_shuffle_pd(C1, D1, _MM_SHUFFLE2(0, 0)));\n               C2 = _mm_castpd_ps(_mm_shuffle_pd(A1, B1, _MM_SHUFFLE2(1, 1)));\n               C3 = _mm_castpd_ps(_mm_shuffle_pd(C1, D1, _MM_SHUFFLE2(1, 1)));\n\n               __m128i Wb = _mm_castps_si128(_mm_shuffle_ps(A2, A3,\n                                             _MM_SHUFFLE(2, 0, 2, 0)));\n               __m128i Xb = 
_mm_castps_si128(_mm_shuffle_ps(A2, A3,\n                                             _MM_SHUFFLE(3, 1, 3, 1)));\n               __m128i Yb = _mm_castps_si128(_mm_shuffle_ps(C2, C3,\n                                             _MM_SHUFFLE(2, 0, 2, 0)));\n               __m128i Zb = _mm_castps_si128(_mm_shuffle_ps(C2, C3,\n                                             _MM_SHUFFLE(3, 1, 3, 1)));\n\n               ret.v0 = {mergeW};\n               ret.v1 = {mergeX};\n               ret.v2 = {mergeY};\n               ret.v3 = {mergeZ};\n\n               return ret;'''.format(mergeW=x86.setr('avx', typ, 'Wa', 'Wb'),\n                                     mergeX=x86.setr('avx', typ, 'Xa', 'Xb'),\n                                     mergeY=x86.setr('avx', typ, 'Ya', 'Yb'),\n                                     mergeZ=x86.setr('avx', typ, 'Za', 'Zb'),\n                                     **fmtspec)\n    if typ in ['i16', 'u16']:\n        if simd_ext == 'avx2':\n            return \\\n            '''nsimd_avx2_v{typ}x4 ret;\n               {load_v0v1v2v3}\n\n               __m256i A1 = _mm256_unpacklo_epi16(v0, v2);\n               __m256i B1 = _mm256_unpackhi_epi16(v0, v2);\n               __m256i C1 = _mm256_unpacklo_epi16(v1, v3);\n               __m256i D1 = _mm256_unpackhi_epi16(v1, v3);\n\n               __m256i A2 = _mm256_unpacklo_epi16(A1, C1);\n               __m256i B2 = _mm256_unpackhi_epi16(A1, C1);\n               __m256i C2 = _mm256_unpacklo_epi16(B1, D1);\n               __m256i D2 = _mm256_unpackhi_epi16(B1, D1);\n\n               ret.v0 = _mm256_unpacklo_epi16(A2, C2);\n               ret.v1 = _mm256_unpackhi_epi16(A2, C2);\n               ret.v2 = _mm256_unpacklo_epi16(B2, D2);\n               ret.v3 = _mm256_unpackhi_epi16(B2, D2);\n\n               return ret;'''.format(**fmtspec)\n        else:\n            return \\\n            '''nsimd_avx_v{typ}x4 ret;\n               {load_v0v1v2v3}\n\n               __m128i Aa = {exlo_v0};\n               
__m128i Ba = {exhi_v0};\n               __m128i Ca = {exlo_v1};\n               __m128i Da = {exhi_v1};\n               __m128i Ab = {exlo_v2};\n               __m128i Bb = {exhi_v2};\n               __m128i Cb = {exlo_v3};\n               __m128i Db = {exhi_v3};\n\n               __m128i mask = _mm_set_epi8(15, 14, 7, 6, 13, 12, 5, 4,\n                                           11, 10, 3, 2,  9,  8, 1, 0);\n               __m128d A1 = _mm_castsi128_pd(_mm_shuffle_epi8(Aa, mask));\n               __m128d B1 = _mm_castsi128_pd(_mm_shuffle_epi8(Ba, mask));\n               __m128d C1 = _mm_castsi128_pd(_mm_shuffle_epi8(Ca, mask));\n               __m128d D1 = _mm_castsi128_pd(_mm_shuffle_epi8(Da, mask));\n\n               __m128 A2 = _mm_castpd_ps(_mm_shuffle_pd(A1, B1,\n                                                        _MM_SHUFFLE2(0, 0)));\n               __m128 A3 = _mm_castpd_ps(_mm_shuffle_pd(C1, D1,\n                                                        _MM_SHUFFLE2(0, 0)));\n               __m128 C2 = _mm_castpd_ps(_mm_shuffle_pd(A1, B1,\n                                                        _MM_SHUFFLE2(1, 1)));\n               __m128 C3 = _mm_castpd_ps(_mm_shuffle_pd(C1, D1,\n                                                        _MM_SHUFFLE2(1, 1)));\n\n               __m128i Wa = _mm_castps_si128(_mm_shuffle_ps(A2, A3,\n                                             _MM_SHUFFLE(2, 0, 2, 0)));\n               __m128i Xa = _mm_castps_si128(_mm_shuffle_ps(A2, A3,\n                                             _MM_SHUFFLE(3, 1, 3, 1)));\n               __m128i Ya = _mm_castps_si128(_mm_shuffle_ps(C2, C3,\n                                             _MM_SHUFFLE(2, 0, 2, 0)));\n               __m128i Za = _mm_castps_si128(_mm_shuffle_ps(C2, C3,\n                                             _MM_SHUFFLE(3, 1, 3, 1)));\n\n               A1 = _mm_castsi128_pd(_mm_shuffle_epi8(Ab, mask));\n               B1 = _mm_castsi128_pd(_mm_shuffle_epi8(Bb, mask));\n    
           C1 = _mm_castsi128_pd(_mm_shuffle_epi8(Cb, mask));\n               D1 = _mm_castsi128_pd(_mm_shuffle_epi8(Db, mask));\n\n               A2 = _mm_castpd_ps(_mm_shuffle_pd(A1, B1, _MM_SHUFFLE2(0, 0)));\n               A3 = _mm_castpd_ps(_mm_shuffle_pd(C1, D1, _MM_SHUFFLE2(0, 0)));\n               C2 = _mm_castpd_ps(_mm_shuffle_pd(A1, B1, _MM_SHUFFLE2(1, 1)));\n               C3 = _mm_castpd_ps(_mm_shuffle_pd(C1, D1, _MM_SHUFFLE2(1, 1)));\n\n               __m128i Wb = _mm_castps_si128(_mm_shuffle_ps(A2, A3,\n                                             _MM_SHUFFLE(2, 0, 2, 0)));\n               __m128i Xb = _mm_castps_si128(_mm_shuffle_ps(A2, A3,\n                                             _MM_SHUFFLE(3, 1, 3, 1)));\n               __m128i Yb = _mm_castps_si128(_mm_shuffle_ps(C2, C3,\n                                             _MM_SHUFFLE(2, 0, 2, 0)));\n               __m128i Zb = _mm_castps_si128(_mm_shuffle_ps(C2, C3,\n                                             _MM_SHUFFLE(3, 1, 3, 1)));\n\n               ret.v0 = {mergeW};\n               ret.v1 = {mergeX};\n               ret.v2 = {mergeY};\n               ret.v3 = {mergeZ};\n\n               return ret;'''.format(mergeW=x86.setr('avx', typ, 'Wa', 'Wb'),\n                                     mergeX=x86.setr('avx', typ, 'Xa', 'Xb'),\n                                     mergeY=x86.setr('avx', typ, 'Ya', 'Yb'),\n                                     mergeZ=x86.setr('avx', typ, 'Za', 'Zb'),\n                                     **fmtspec)\n    if typ == 'f32':\n        return '''nsimd_{simd_ext}_vf32x4 ret;\n                  {load_v0v1v2v3}\n                  __m256 A1 = _mm256_unpacklo_ps(v0, v2);\n                  __m256 B1 = _mm256_unpackhi_ps(v0, v2);\n                  __m256 C1 = _mm256_unpacklo_ps(v1, v3);\n                  __m256 D1 = _mm256_unpackhi_ps(v1, v3);\n\n                  ret.v0 = _mm256_unpacklo_ps(A1, C1);\n                  ret.v1 = _mm256_unpackhi_ps(A1, C1);\n              
    ret.v2 = _mm256_unpacklo_ps(B1, D1);\n                  ret.v3 = _mm256_unpackhi_ps(B1, D1);\n                  return ret;'''.format(**fmtspec)\n    if typ in ['i32', 'u32']:\n        if simd_ext == 'avx2':\n            return \\\n            '''nsimd_avx2_v{typ}x4 ret;\n               {load_v0v1v2v3}\n\n               __m256i A1 = _mm256_unpacklo_epi32(v0, v2);\n               __m256i B1 = _mm256_unpackhi_epi32(v0, v2);\n               __m256i C1 = _mm256_unpacklo_epi32(v1, v3);\n               __m256i D1 = _mm256_unpackhi_epi32(v1, v3);\n\n               ret.v0 = _mm256_unpacklo_epi32(A1, C1);\n               ret.v1 = _mm256_unpackhi_epi32(A1, C1);\n               ret.v2 = _mm256_unpacklo_epi32(B1, D1);\n               ret.v3 = _mm256_unpackhi_epi32(B1, D1);\n\n               return ret;'''.format(**fmtspec)\n        else:\n            return \\\n            '''nsimd_avx_v{typ}x4 ret;\n               nsimd_avx_vf32x4 retf32 = nsimd_load4{a}_avx_f32((f32 *){in0});\n               ret.v0 = _mm256_castps_si256(retf32.v0);\n               ret.v1 = _mm256_castps_si256(retf32.v1);\n               ret.v2 = _mm256_castps_si256(retf32.v2);\n               ret.v3 = _mm256_castps_si256(retf32.v3);\n               return ret;'''.format(**fmtspec)\n    if typ == 'f64':\n        return \\\n        '''nsimd_{simd_ext}_vf64x4 ret;\n           {load_v0v1v2v3}\n\n           __m256d A1 = _mm256_permute2f128_pd(v0, v2, 2 << 4);\n           __m256d B1 = _mm256_permute2f128_pd(v0, v2, (3 << 4) | 1);\n           __m256d C1 = _mm256_permute2f128_pd(v1, v3, 2 << 4);\n           __m256d D1 = _mm256_permute2f128_pd(v1, v3, (3 << 4) | 1);\n\n           ret.v0 = _mm256_unpacklo_pd(A1, C1);\n           ret.v1 = _mm256_unpackhi_pd(A1, C1);\n           ret.v2 = _mm256_unpacklo_pd(B1, D1);\n           ret.v3 = _mm256_unpackhi_pd(B1, D1);\n           return ret;'''.format(**fmtspec)\n    if typ in ['i64', 'u64']:\n        if simd_ext == 'avx2':\n            return \\\n            
'''nsimd_avx2_v{typ}x4 ret;\n               {load_v0v1v2v3}\n\n               __m256i A1 = _mm256_permute2f128_si256(v0, v2, 2 << 4);\n               __m256i B1 = _mm256_permute2f128_si256(v0, v2, (3 << 4) | 1);\n               __m256i C1 = _mm256_permute2f128_si256(v1, v3, 2 << 4);\n               __m256i D1 = _mm256_permute2f128_si256(v1, v3, (3 << 4) | 1);\n\n               ret.v0 = _mm256_unpacklo_epi64(A1, C1);\n               ret.v1 = _mm256_unpackhi_epi64(A1, C1);\n               ret.v2 = _mm256_unpacklo_epi64(B1, D1);\n               ret.v3 = _mm256_unpackhi_epi64(B1, D1);\n\n               return ret;'''.format(**fmtspec)\n        else:\n            return \\\n            '''nsimd_avx_vf64x4 retf64 = nsimd_load4{a}_avx_f64((f64 *){in0});\n               nsimd_avx_v{typ}x4 ret;\n               ret.v0 = _mm256_castpd_si256(retf64.v0);\n               ret.v1 = _mm256_castpd_si256(retf64.v1);\n               ret.v2 = _mm256_castpd_si256(retf64.v2);\n               ret.v3 = _mm256_castpd_si256(retf64.v3);\n               return ret;'''.format(**fmtspec)\n\n###############################################################################\n\ndef load4_avx512(simd_ext, typ, align, fmtspec2):\n    fmtspec = fmtspec2.copy()\n    fmtspec['load_v0v1v2v3'] = get_load_v0v1v2v3(simd_ext, typ, align, fmtspec)\n    fmtspec['exlo_v0'] = x86.extract(simd_ext, typ, x86.LO, 'v0')\n    fmtspec['exhi_v0'] = x86.extract(simd_ext, typ, x86.HI, 'v0')\n    fmtspec['exlo_v1'] = x86.extract(simd_ext, typ, x86.LO, 'v1')\n    fmtspec['exhi_v1'] = x86.extract(simd_ext, typ, x86.HI, 'v1')\n    fmtspec['exlo_v2'] = x86.extract(simd_ext, typ, x86.LO, 'v2')\n    fmtspec['exhi_v2'] = x86.extract(simd_ext, typ, x86.HI, 'v2')\n    fmtspec['exlo_v3'] = x86.extract(simd_ext, typ, x86.LO, 'v3')\n    fmtspec['exhi_v3'] = x86.extract(simd_ext, typ, x86.HI, 'v3')\n    fmtspec['a'] = 'a' if align else 'u'\n    if typ in ['i8', 'u8']:\n        return \\\n        '''nsimd_{simd_ext}_v{typ}x4 ret;\n\n      
     {load_v0v1v2v3}\n\n           __m256i A0a = {exlo_v0};\n           __m256i B0a = {exhi_v0};\n           __m256i C0a = {exlo_v1};\n           __m256i D0a = {exhi_v1};\n           __m256i A0b = {exlo_v2};\n           __m256i B0b = {exhi_v2};\n           __m256i C0b = {exlo_v3};\n           __m256i D0b = {exhi_v3};\n\n           __m256i mask = _mm256_setr_epi8(0, 4, 8, 12, 1, 5, 9, 13,\n                                           2, 6, 10, 14, 3, 7, 11, 15,\n                                           0, 4, 8, 12, 1, 5, 9, 13,\n                                           2, 6, 10, 14, 3, 7, 11, 15);\n           __m256i mask2 = _mm256_setr_epi32(0, 4, 1, 5, 2, 6, 3, 7);\n\n           __m256i A1 = _mm256_shuffle_epi8(A0a, mask);\n           __m256i B1 = _mm256_shuffle_epi8(B0a, mask);\n           __m256i C1 = _mm256_shuffle_epi8(C0a, mask);\n           __m256i D1 = _mm256_shuffle_epi8(D0a, mask);\n\n           __m256i A2 = _mm256_permutevar8x32_epi32(A1, mask2);\n           __m256i B2 = _mm256_permutevar8x32_epi32(B1, mask2);\n           __m256i C2 = _mm256_permutevar8x32_epi32(C1, mask2);\n           __m256i D2 = _mm256_permutevar8x32_epi32(D1, mask2);\n\n           __m256i A3 = _mm256_permute2x128_si256(A2, C2, 2 << 4);\n           __m256i C3 = _mm256_permute2x128_si256(B2, D2, 2 << 4);\n           __m256i B3 = _mm256_permute2x128_si256(A2, C2, (3 << 4) | 1);\n           __m256i D3 = _mm256_permute2x128_si256(B2, D2, (3 << 4) | 1);\n\n           __m256i A4a = _mm256_unpacklo_epi64(A3, C3);\n           __m256i B4a = _mm256_unpackhi_epi64(A3, C3);\n           __m256i C4a = _mm256_unpacklo_epi64(B3, D3);\n           __m256i D4a = _mm256_unpackhi_epi64(B3, D3);\n\n           A1 = _mm256_shuffle_epi8(A0b, mask);\n           B1 = _mm256_shuffle_epi8(B0b, mask);\n           C1 = _mm256_shuffle_epi8(C0b, mask);\n           D1 = _mm256_shuffle_epi8(D0b, mask);\n\n           A2 = _mm256_permutevar8x32_epi32(A1, mask2);\n           B2 = _mm256_permutevar8x32_epi32(B1, 
mask2);\n           C2 = _mm256_permutevar8x32_epi32(C1, mask2);\n           D2 = _mm256_permutevar8x32_epi32(D1, mask2);\n\n           A3 = _mm256_permute2x128_si256(A2, C2, 2 << 4);\n           C3 = _mm256_permute2x128_si256(B2, D2, 2 << 4);\n           B3 = _mm256_permute2x128_si256(A2, C2, (3 << 4) | 1);\n           D3 = _mm256_permute2x128_si256(B2, D2, (3 << 4) | 1);\n\n           __m256i A4b = _mm256_unpacklo_epi64(A3, C3);\n           __m256i B4b = _mm256_unpackhi_epi64(A3, C3);\n           __m256i C4b = _mm256_unpacklo_epi64(B3, D3);\n           __m256i D4b = _mm256_unpackhi_epi64(B3, D3);\n\n           ret.v0 = {mergeA};\n           ret.v1 = {mergeB};\n           ret.v2 = {mergeC};\n           ret.v3 = {mergeD};\n\n           return ret;'''.format(mergeA=x86.setr(simd_ext, typ, 'A4a', 'A4b'),\n                                 mergeB=x86.setr(simd_ext, typ, 'B4a', 'B4b'),\n                                 mergeC=x86.setr(simd_ext, typ, 'C4a', 'C4b'),\n                                 mergeD=x86.setr(simd_ext, typ, 'D4a', 'D4b'),\n                                 **fmtspec)\n    if typ in ['i16', 'u16']:\n        return \\\n        '''nsimd_{simd_ext}_v{typ}x4 ret;\n\n           {load_v0v1v2v3}\n\n           __m256i A0a = {exlo_v0};\n           __m256i B0a = {exhi_v0};\n           __m256i C0a = {exlo_v1};\n           __m256i D0a = {exhi_v1};\n           __m256i A0b = {exlo_v2};\n           __m256i B0b = {exhi_v2};\n           __m256i C0b = {exlo_v3};\n           __m256i D0b = {exhi_v3};\n\n           __m256i A1 = _mm256_unpacklo_epi16(A0a, C0a);\n           __m256i B1 = _mm256_unpackhi_epi16(A0a, C0a);\n           __m256i C1 = _mm256_unpacklo_epi16(B0a, D0a);\n           __m256i D1 = _mm256_unpackhi_epi16(B0a, D0a);\n\n           __m256i A2 = _mm256_unpacklo_epi16(A1, C1);\n           __m256i B2 = _mm256_unpackhi_epi16(A1, C1);\n           __m256i C2 = _mm256_unpacklo_epi16(B1, D1);\n           __m256i D2 = _mm256_unpackhi_epi16(B1, D1);\n\n           
__m256i A3a = _mm256_unpacklo_epi16(A2, C2);\n           __m256i B3a = _mm256_unpackhi_epi16(A2, C2);\n           __m256i C3a = _mm256_unpacklo_epi16(B2, D2);\n           __m256i D3a = _mm256_unpackhi_epi16(B2, D2);\n\n           A1 = _mm256_unpacklo_epi16(A0b, C0b);\n           B1 = _mm256_unpackhi_epi16(A0b, C0b);\n           C1 = _mm256_unpacklo_epi16(B0b, D0b);\n           D1 = _mm256_unpackhi_epi16(B0b, D0b);\n\n           A2 = _mm256_unpacklo_epi16(A1, C1);\n           B2 = _mm256_unpackhi_epi16(A1, C1);\n           C2 = _mm256_unpacklo_epi16(B1, D1);\n           D2 = _mm256_unpackhi_epi16(B1, D1);\n\n           __m256i A3b = _mm256_unpacklo_epi16(A2, C2);\n           __m256i B3b = _mm256_unpackhi_epi16(A2, C2);\n           __m256i C3b = _mm256_unpacklo_epi16(B2, D2);\n           __m256i D3b = _mm256_unpackhi_epi16(B2, D2);\n\n           ret.v0 = {mergeA};\n           ret.v1 = {mergeB};\n           ret.v2 = {mergeC};\n           ret.v3 = {mergeD};\n\n           return ret;'''.format(mergeA=x86.setr(simd_ext, typ, 'A3a', 'A3b'),\n                                 mergeB=x86.setr(simd_ext, typ, 'B3a', 'B3b'),\n                                 mergeC=x86.setr(simd_ext, typ, 'C3a', 'C3b'),\n                                 mergeD=x86.setr(simd_ext, typ, 'D3a', 'D3b'),\n                                 **fmtspec)\n    if typ in ['f32', 'i32', 'u32']:\n        return \\\n        '''nsimd_{simd_ext}_v{typ}x4 ret;\n\n           {load_v0v1v2v3}\n\n           __m512i WXm = _mm512_setr_epi32(0, 4, 8, 12, 16, 20, 24, 28,\n                                           1, 5, 9, 13, 17, 21, 25, 29);\n           __m512i YZm = _mm512_setr_epi32(2, 6, 10, 14, 18, 22, 26, 30,\n                                           3, 7, 11, 15, 19, 23, 27, 31);\n           __m512i Wm = _mm512_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7,\n                                          16, 17, 18, 19, 20, 21, 22, 23);\n           __m512i Xm = _mm512_setr_epi32(8, 9, 10, 11, 12, 13, 14, 15,\n                    
                      24, 25, 26, 27, 28, 29, 30, 31);\n\n           {styp} WXa = _mm512_permutex2var{suf}(v0, WXm, v1);\n           {styp} WXb = _mm512_permutex2var{suf}(v2, WXm, v3);\n           {styp} YZa = _mm512_permutex2var{suf}(v0, YZm, v1);\n           {styp} YZb = _mm512_permutex2var{suf}(v2, YZm, v3);\n\n           ret.v0 = _mm512_permutex2var{suf}(WXa, Wm, WXb);\n           ret.v1 = _mm512_permutex2var{suf}(WXa, Xm, WXb);\n           ret.v2 = _mm512_permutex2var{suf}(YZa, Wm, YZb);\n           ret.v3 = _mm512_permutex2var{suf}(YZa, Xm, YZb);\n\n           return ret;'''.format(**fmtspec)\n    if typ in ['f64', 'i64', 'u64']:\n        return \\\n        '''nsimd_{simd_ext}_v{typ}x4 ret;\n\n           {load_v0v1v2v3}\n\n           {styp} A1 = _mm512_unpacklo{suf}(v0, v1);\n           {styp} B1 = _mm512_unpacklo{suf}(v2, v3);\n           {styp} C1 = _mm512_unpackhi{suf}(v0, v1);\n           {styp} D1 = _mm512_unpackhi{suf}(v2, v3);\n\n           __m512i A_mask = _mm512_set_epi64(13, 9, 12, 8, 5, 1, 4, 0);\n           __m512i B_mask = _mm512_set_epi64(15, 11, 14, 10, 7, 3, 6, 2);\n\n           ret.v0 = _mm512_permutex2var{suf}(A1, A_mask, B1);\n           ret.v1 = _mm512_permutex2var{suf}(C1, A_mask, D1);\n           ret.v2 = _mm512_permutex2var{suf}(A1, B_mask, B1);\n           ret.v3 = _mm512_permutex2var{suf}(C1, B_mask, D1);\n\n           return ret;'''.format(**fmtspec)\n\n###############################################################################\n\ndef store4(simd_ext, typ, align, fmtspec2, v0, v1, v2, v3):\n    fmtspec = fmtspec2.copy()\n    fmtspec['a'] = '' if align else 'u'\n    store = '{pre}store{a}{sufsi}'.format(**fmtspec)\n    fmtspec['store'] = store\n    fmtspec['v0'] = v0\n    fmtspec['v1'] = v1\n    fmtspec['v2'] = v2\n    fmtspec['v3'] = v3\n    if typ in ['f32', 'f64']:\n        return \\\n        '''{store}({in0}, {v0});\n           {store}({in0} + {le}, {v1});\n           {store}({in0} + (2 * {le}), {v2});\n           
{store}({in0} + (3 * {le}), {v3});'''.format(**fmtspec)\n    else:\n        return \\\n        '''{store}(({styp} *){in0}, {v0});\n           {store}(({styp} *){in0} + 1, {v1});\n           {store}(({styp} *){in0} + 2, {v2});\n           {store}(({styp} *){in0} + 3, {v3});'''.format(**fmtspec)\n\n###############################################################################\n\ndef store4_sse(typ, align, fmtspec2):\n    fmtspec = fmtspec2.copy()\n    if typ in ['i8', 'u8']:\n        return \\\n        '''__m128i A5 = _mm_unpacklo_epi8({in1}, {in3});\n           __m128i B5 = _mm_unpackhi_epi8({in1}, {in3});\n           __m128i C5 = _mm_unpacklo_epi8({in2}, {in4});\n           __m128i D5 = _mm_unpackhi_epi8({in2}, {in4});\n\n           __m128i A6 = _mm_unpacklo_epi8(A5, C5);\n           __m128i B6 = _mm_unpackhi_epi8(A5, C5);\n           __m128i C6 = _mm_unpacklo_epi8(B5, D5);\n           __m128i D6 = _mm_unpackhi_epi8(B5, D5);\n\n           {store}'''.format(store=store4('sse', typ, align, fmtspec,\n                                          'A6', 'B6', 'C6', 'D6'), **fmtspec)\n    if typ in ['i16', 'u16']:\n        return \\\n        '''__m128i Q = _mm_unpacklo_epi16({in1}, {in2});\n           __m128i R = _mm_unpackhi_epi16({in1}, {in2});\n           __m128i S = _mm_unpacklo_epi16({in3}, {in4});\n           __m128i T = _mm_unpackhi_epi16({in3}, {in4});\n\n           __m128i U = _mm_unpacklo_epi32(Q, S);\n           __m128i V = _mm_unpackhi_epi32(Q, S);\n           __m128i W = _mm_unpacklo_epi32(R, T);\n           __m128i X = _mm_unpackhi_epi32(R, T);\n\n           {store}'''.format(store=store4('sse', typ, align, fmtspec,\n                                          'U', 'V', 'W', 'X'), **fmtspec)\n    if typ in ['f32', 'i32', 'u32']:\n        return \\\n        '''{styp} A3 = _mm_unpacklo{suf}({in1}, {in3});\n           {styp} B3 = _mm_unpackhi{suf}({in1}, {in3});\n           {styp} C3 = _mm_unpacklo{suf}({in2}, {in4});\n           {styp} D3 = 
_mm_unpackhi{suf}({in2}, {in4});\n\n           {styp} A4 = _mm_unpacklo{suf}(A3, C3);\n           {styp} B4 = _mm_unpackhi{suf}(A3, C3);\n           {styp} C4 = _mm_unpacklo{suf}(B3, D3);\n           {styp} D4 = _mm_unpackhi{suf}(B3, D3);\n\n           {store}'''.format(store=store4('sse', typ, align, fmtspec,\n                                          'A4', 'B4', 'C4', 'D4'), **fmtspec)\n    if typ in ['f64', 'u64', 'i64']:\n        return \\\n        '''{styp} A0 = _mm_unpacklo{suf}({in1}, {in2});\n           {styp} B0 = _mm_unpacklo{suf}({in3}, {in4});\n           {styp} C0 = _mm_unpackhi{suf}({in1}, {in2});\n           {styp} D0 = _mm_unpackhi{suf}({in3}, {in4});\n           {store}'''.format(store=store4('sse', typ, align, fmtspec,\n                                          'A0', 'B0', 'C0', 'D0'), **fmtspec)\n\n###############################################################################\n\ndef store4_avx(simd_ext, typ, align, fmtspec2):\n    fmtspec = fmtspec2.copy()\n    fmtspec['exlo_in1'] = x86.extract('avx', typ, x86.LO, common.in1)\n    fmtspec['exhi_in1'] = x86.extract('avx', typ, x86.HI, common.in1)\n    fmtspec['exlo_in2'] = x86.extract('avx', typ, x86.LO, common.in2)\n    fmtspec['exhi_in2'] = x86.extract('avx', typ, x86.HI, common.in2)\n    fmtspec['exlo_in3'] = x86.extract('avx', typ, x86.LO, common.in3)\n    fmtspec['exhi_in3'] = x86.extract('avx', typ, x86.HI, common.in3)\n    fmtspec['exlo_in4'] = x86.extract('avx', typ, x86.LO, common.in4)\n    fmtspec['exhi_in4'] = x86.extract('avx', typ, x86.HI, common.in4)\n    fmtspec['a'] = 'a' if align else 'u'\n    if typ in ['i8', 'u8']:\n        if simd_ext == 'avx2':\n            return \\\n            '''__m256i A1 = _mm256_unpacklo_epi8({in1}, {in3});\n               __m256i B1 = _mm256_unpackhi_epi8({in1}, {in3});\n               __m256i C1 = _mm256_unpacklo_epi8({in2}, {in4});\n               __m256i D1 = _mm256_unpackhi_epi8({in2}, {in4});\n\n               __m256i A2 = 
_mm256_permute4x64_epi64(A1, _MM_SHUFFLE(3,1,2,0));\n               __m256i B2 = _mm256_permute4x64_epi64(B1, _MM_SHUFFLE(3,1,2,0));\n               __m256i C2 = _mm256_permute4x64_epi64(C1, _MM_SHUFFLE(3,1,2,0));\n               __m256i D2 = _mm256_permute4x64_epi64(D1, _MM_SHUFFLE(3,1,2,0));\n\n               __m256i A = _mm256_unpacklo_epi8(A2, C2);\n               __m256i B = _mm256_unpacklo_epi8(B2, D2);\n               __m256i C = _mm256_unpackhi_epi8(A2, C2);\n               __m256i D = _mm256_unpackhi_epi8(B2, D2);\n\n               {store}'''.format(store=store4('avx', typ, align, fmtspec,\n                                 'A', 'B', 'C', 'D'), **fmtspec)\n        else:\n            return \\\n            '''__m128i Wa = {exlo_in1};\n               __m128i Wb = {exhi_in1};\n               __m128i Xa = {exlo_in2};\n               __m128i Xb = {exhi_in2};\n               __m128i Ya = {exlo_in3};\n               __m128i Yb = {exhi_in3};\n               __m128i Za = {exlo_in4};\n               __m128i Zb = {exhi_in4};\n\n               __m128i AA = _mm_unpacklo_epi8(Wa, Ya);\n               __m128i BB = _mm_unpackhi_epi8(Wa, Ya);\n               __m128i CC = _mm_unpacklo_epi8(Xa, Za);\n               __m128i DD = _mm_unpackhi_epi8(Xa, Za);\n\n               __m128i A0 = _mm_unpacklo_epi8(AA, CC);\n               __m128i B0 = _mm_unpackhi_epi8(AA, CC);\n               __m128i C0 = _mm_unpacklo_epi8(BB, DD);\n               __m128i D0 = _mm_unpackhi_epi8(BB, DD);\n\n               AA = _mm_unpacklo_epi8(Wb, Yb);\n               BB = _mm_unpackhi_epi8(Wb, Yb);\n               CC = _mm_unpacklo_epi8(Xb, Zb);\n               DD = _mm_unpackhi_epi8(Xb, Zb);\n\n               __m128i A1 = _mm_unpacklo_epi8(AA, CC);\n               __m128i B1 = _mm_unpackhi_epi8(AA, CC);\n               __m128i C1 = _mm_unpacklo_epi8(BB, DD);\n               __m128i D1 = _mm_unpackhi_epi8(BB, DD);\n\n               __m256i A = {mergeAB0};\n               __m256i B = {mergeCD0};\n       
        __m256i C = {mergeAB1};\n               __m256i D = {mergeCD1};\n\n               {store}'''.format(mergeAB0=x86.setr('avx', typ, 'A0', 'B0'),\n                                 mergeCD0=x86.setr('avx', typ, 'C0', 'D0'),\n                                 mergeAB1=x86.setr('avx', typ, 'A1', 'B1'),\n                                 mergeCD1=x86.setr('avx', typ, 'C1', 'D1'),\n                                 store=store4('avx', typ, align, fmtspec,\n                                         'A', 'B', 'C', 'D'), **fmtspec)\n    if typ in ['i16', 'u16']:\n        if simd_ext == 'avx2':\n            return \\\n            '''__m256i A3 = _mm256_unpacklo_epi16({in1}, {in3});\n               __m256i B3 = _mm256_unpackhi_epi16({in1}, {in3});\n               __m256i C3 = _mm256_unpacklo_epi16({in2}, {in4});\n               __m256i D3 = _mm256_unpackhi_epi16({in2}, {in4});\n\n               __m256i A = _mm256_unpacklo_epi16(A3, C3);\n               __m256i B = _mm256_unpackhi_epi16(A3, C3);\n               __m256i C = _mm256_unpacklo_epi16(B3, D3);\n               __m256i D = _mm256_unpackhi_epi16(B3, D3);\n\n               {store}'''.format(store=store4('avx', typ, align, fmtspec,\n                                              'A', 'B', 'C', 'D'), **fmtspec)\n        else:\n            return \\\n            '''__m128i Wa = {exlo_in1};\n               __m128i Wb = {exhi_in1};\n               __m128i Xa = {exlo_in2};\n               __m128i Xb = {exhi_in2};\n               __m128i Ya = {exlo_in3};\n               __m128i Yb = {exhi_in3};\n               __m128i Za = {exlo_in4};\n               __m128i Zb = {exhi_in4};\n\n               __m128i AA = _mm_unpacklo_epi16(Wa, Ya);\n               __m128i BB = _mm_unpackhi_epi16(Wa, Ya);\n               __m128i CC = _mm_unpacklo_epi16(Xa, Za);\n               __m128i DD = _mm_unpackhi_epi16(Xa, Za);\n\n               __m128i A0 = _mm_unpacklo_epi16(AA, CC);\n               __m128i B0 = _mm_unpackhi_epi16(AA, CC);\n            
   __m128i C0 = _mm_unpacklo_epi16(BB, DD);\n               __m128i D0 = _mm_unpackhi_epi16(BB, DD);\n\n               AA = _mm_unpacklo_epi16(Wb, Yb);\n               BB = _mm_unpackhi_epi16(Wb, Yb);\n               CC = _mm_unpacklo_epi16(Xb, Zb);\n               DD = _mm_unpackhi_epi16(Xb, Zb);\n\n               __m128i A1 = _mm_unpacklo_epi16(AA, CC);\n               __m128i B1 = _mm_unpackhi_epi16(AA, CC);\n               __m128i C1 = _mm_unpacklo_epi16(BB, DD);\n               __m128i D1 = _mm_unpackhi_epi16(BB, DD);\n\n               __m256i A = {mergeAB0};\n               __m256i B = {mergeCD0};\n               __m256i C = {mergeAB1};\n               __m256i D = {mergeCD1};\n\n               {store}'''.format(mergeAB0=x86.setr('avx', typ, 'A0', 'B0'),\n                                 mergeCD0=x86.setr('avx', typ, 'C0', 'D0'),\n                                 mergeAB1=x86.setr('avx', typ, 'A1', 'B1'),\n                                 mergeCD1=x86.setr('avx', typ, 'C1', 'D1'),\n                                 store=store4('avx', typ, align, fmtspec,\n                                              'A', 'B', 'C', 'D'), **fmtspec)\n    if typ == 'f32':\n        return \\\n        '''__m256 A3 = _mm256_unpacklo_ps({in1}, {in3});\n           __m256 B3 = _mm256_unpackhi_ps({in1}, {in3});\n           __m256 C3 = _mm256_unpacklo_ps({in2}, {in4});\n           __m256 D3 = _mm256_unpackhi_ps({in2}, {in4});\n\n           __m256 A = _mm256_unpacklo_ps(A3, C3);\n           __m256 B = _mm256_unpackhi_ps(A3, C3);\n           __m256 C = _mm256_unpacklo_ps(B3, D3);\n           __m256 D = _mm256_unpackhi_ps(B3, D3);\n\n           {store}'''.format(store=store4('avx', typ, align, fmtspec,\n                                          'A', 'B', 'C', 'D'), **fmtspec)\n    if typ in ['i32', 'u32']:\n        if simd_ext == 'avx2':\n            return \\\n            '''__m256i A3 = _mm256_unpacklo_epi32({in1}, {in3});\n               __m256i B3 = _mm256_unpackhi_epi32({in1}, 
{in3});\n               __m256i C3 = _mm256_unpacklo_epi32({in2}, {in4});\n               __m256i D3 = _mm256_unpackhi_epi32({in2}, {in4});\n\n               __m256i A = _mm256_unpacklo_epi32(A3, C3);\n               __m256i B = _mm256_unpackhi_epi32(A3, C3);\n               __m256i C = _mm256_unpacklo_epi32(B3, D3);\n               __m256i D = _mm256_unpackhi_epi32(B3, D3);\n\n               {store}'''.format(store=store4('avx', typ, align, fmtspec,\n                                 'A', 'B', 'C', 'D'), **fmtspec)\n        else:\n            return \\\n            '''nsimd_store4{a}_avx_f32((f32 *){in0},\n                                       _mm256_castsi256_ps({in1}),\n                                       _mm256_castsi256_ps({in2}),\n                                       _mm256_castsi256_ps({in3}),\n                                       _mm256_castsi256_ps({in4}));'''. \\\n               format(**fmtspec)\n    if typ == 'f64':\n        return \\\n        '''__m256d A3 = _mm256_permute2f128_pd({in1}, {in3}, 2 << 4);\n           __m256d B3 = _mm256_permute2f128_pd({in2}, {in4}, 2 << 4);\n           __m256d C3 = _mm256_permute2f128_pd({in1}, {in3}, (3 << 4) | 1);\n           __m256d D3 = _mm256_permute2f128_pd({in2}, {in4}, (3 << 4) | 1);\n\n           __m256d A = _mm256_unpacklo_pd(A3, B3);\n           __m256d B = _mm256_unpackhi_pd(A3, B3);\n           __m256d C = _mm256_unpacklo_pd(C3, D3);\n           __m256d D = _mm256_unpackhi_pd(C3, D3);\n\n           {store}'''.format(store=store4('avx', typ, align, fmtspec,\n                                          'A', 'B', 'C', 'D'), **fmtspec)\n\n    if typ in ['i64', 'u64']:\n        if simd_ext == 'avx2':\n            return \\\n            '''__m256i A3 = _mm256_permute2f128_si256({in1}, {in3}, 2 << 4);\n               __m256i B3 = _mm256_permute2f128_si256({in2}, {in4}, 2 << 4);\n               __m256i C3 = _mm256_permute2f128_si256(\n                              {in1}, {in3}, (3 << 4) | 1);\n               
__m256i D3 = _mm256_permute2f128_si256(
                              {in2}, {in4}, (3 << 4) | 1);

               __m256i A = _mm256_unpacklo_epi64(A3, B3);
               __m256i B = _mm256_unpackhi_epi64(A3, B3);
               __m256i C = _mm256_unpacklo_epi64(C3, D3);
               __m256i D = _mm256_unpackhi_epi64(C3, D3);

               {store}'''.format(store=store4('avx', typ, align, fmtspec,
                                 'A', 'B', 'C', 'D'), **fmtspec)
        else:
            # Non-AVX2 path: bitcast the integer vectors to f64 and call the
            # already-emitted AVX f64 store4 routine.
            return \
            '''nsimd_store4{a}_avx_f64((f64 *){in0},
                                       _mm256_castsi256_pd({in1}),
                                       _mm256_castsi256_pd({in2}),
                                       _mm256_castsi256_pd({in3}),
                                       _mm256_castsi256_pd({in4}));'''. \
               format(**fmtspec)

###############################################################################

# Emit the C body of a 4-way interleaved store (store4) for AVX-512.
# Returns a C source snippet, as a Python string, that interleaves the four
# input vectors {in1}..{in4} and writes them contiguously at {in0} via
# store4().  `fmtspec2` supplies the format placeholders; it is copied (not
# mutated) and extended with the low/high 256-bit halves of each input
# (x86.extract) plus the aligned/unaligned suffix 'a'/'u'.
def store4_avx512(simd_ext, typ, align, fmtspec2):
    fmtspec = fmtspec2.copy()
    fmtspec['exlo_in1'] = x86.extract(simd_ext, typ, x86.LO, common.in1)
    fmtspec['exhi_in1'] = x86.extract(simd_ext, typ, x86.HI, common.in1)
    fmtspec['exlo_in2'] = x86.extract(simd_ext, typ, x86.LO, common.in2)
    fmtspec['exhi_in2'] = x86.extract(simd_ext, typ, x86.HI, common.in2)
    fmtspec['exlo_in3'] = x86.extract(simd_ext, typ, x86.LO, common.in3)
    fmtspec['exhi_in3'] = x86.extract(simd_ext, typ, x86.HI, common.in3)
    fmtspec['exlo_in4'] = x86.extract(simd_ext, typ, x86.LO, common.in4)
    fmtspec['exhi_in4'] = x86.extract(simd_ext, typ, x86.HI, common.in4)
    fmtspec['a'] = 'a' if align else 'u'
    # 8-bit lanes: interleave each 256-bit half with unpack/permute rounds,
    # then rebuild 512-bit vectors from the halves with x86.setr().
    if typ in ['i8', 'u8']:
        return \
        '''__m256i A0a = {exlo_in1};
           __m256i A0b = {exhi_in1};
           __m256i B0a = {exlo_in2};
           __m256i B0b = {exhi_in2};
           __m256i C0a = {exlo_in3};
           __m256i C0b = {exhi_in3};
           __m256i D0a = {exlo_in4};
           __m256i D0b = {exhi_in4};

           __m256i A1 = _mm256_unpacklo_epi8(A0a, C0a);
           __m256i B1 = _mm256_unpackhi_epi8(A0a, C0a);
           __m256i C1 = _mm256_unpacklo_epi8(B0a, D0a);
           __m256i D1 = _mm256_unpackhi_epi8(B0a, D0a);

           __m256i A2 = _mm256_permute4x64_epi64(A1, _MM_SHUFFLE(3,1,2,0));
           __m256i B2 = _mm256_permute4x64_epi64(B1, _MM_SHUFFLE(3,1,2,0));
           __m256i C2 = _mm256_permute4x64_epi64(C1, _MM_SHUFFLE(3,1,2,0));
           __m256i D2 = _mm256_permute4x64_epi64(D1, _MM_SHUFFLE(3,1,2,0));

           __m256i A3a = _mm256_unpacklo_epi8(A2, C2);
           __m256i B3a = _mm256_unpacklo_epi8(B2, D2);
           __m256i C3a = _mm256_unpackhi_epi8(A2, C2);
           __m256i D3a = _mm256_unpackhi_epi8(B2, D2);

           A1 = _mm256_unpacklo_epi8(A0b, C0b);
           B1 = _mm256_unpackhi_epi8(A0b, C0b);
           C1 = _mm256_unpacklo_epi8(B0b, D0b);
           D1 = _mm256_unpackhi_epi8(B0b, D0b);

           A2 = _mm256_permute4x64_epi64(A1, _MM_SHUFFLE(3,1,2,0));
           B2 = _mm256_permute4x64_epi64(B1, _MM_SHUFFLE(3,1,2,0));
           C2 = _mm256_permute4x64_epi64(C1, _MM_SHUFFLE(3,1,2,0));
           D2 = _mm256_permute4x64_epi64(D1, _MM_SHUFFLE(3,1,2,0));

           __m256i A3b = _mm256_unpacklo_epi8(A2, C2);
           __m256i B3b = _mm256_unpacklo_epi8(B2, D2);
           __m256i C3b = _mm256_unpackhi_epi8(A2, C2);
           __m256i D3b = _mm256_unpackhi_epi8(B2, D2);

           __m512i A = {mergeABa};
           __m512i B = {mergeCDa};
           __m512i C = {mergeABb};
           __m512i D = {mergeCDb};

           {store}'''.format(mergeABa=x86.setr(simd_ext, typ, 'A3a', 'B3a'),
                             mergeCDa=x86.setr(simd_ext, typ, 'C3a', 'D3a'),
                             mergeABb=x86.setr(simd_ext, typ, 'A3b', 'B3b'),
                             mergeCDb=x86.setr(simd_ext, typ, 'C3b', 'D3b'),
                             store=store4(simd_ext, typ, align, fmtspec,
                                          'A', 'B', 'C', 'D'), **fmtspec)
    # 16-bit lanes: same half-by-half unpack scheme, merged with x86.setr().
    if typ in ['i16', 'u16']:
        return \
        '''__m256i A0a = {exlo_in1};
           __m256i A0b = {exhi_in1};
           __m256i B0a = {exlo_in2};
           __m256i B0b = {exhi_in2};
           __m256i C0a = {exlo_in3};
           __m256i C0b = {exhi_in3};
           __m256i D0a = {exlo_in4};
           __m256i D0b = {exhi_in4};

           __m256i A3 = _mm256_unpacklo_epi16(A0a, C0a);
           __m256i B3 = _mm256_unpackhi_epi16(A0a, C0a);
           __m256i C3 = _mm256_unpacklo_epi16(B0a, D0a);
           __m256i D3 = _mm256_unpackhi_epi16(B0a, D0a);

           __m256i A4a = _mm256_unpacklo_epi16(A3, C3);
           __m256i B4a = _mm256_unpackhi_epi16(A3, C3);
           __m256i C4a = _mm256_unpacklo_epi16(B3, D3);
           __m256i D4a = _mm256_unpackhi_epi16(B3, D3);

           A3 = _mm256_unpacklo_epi16(A0b, C0b);
           B3 = _mm256_unpackhi_epi16(A0b, C0b);
           C3 = _mm256_unpacklo_epi16(B0b, D0b);
           D3 = _mm256_unpackhi_epi16(B0b, D0b);

           __m256i A4b = _mm256_unpacklo_epi16(A3, C3);
           __m256i B4b = _mm256_unpackhi_epi16(A3, C3);
           __m256i C4b = _mm256_unpacklo_epi16(B3, D3);
           __m256i D4b = _mm256_unpackhi_epi16(B3, D3);

           __m512i A = {mergeABa};
           __m512i B = {mergeCDa};
           __m512i C = {mergeABb};
           __m512i D = {mergeCDb};

           {store}'''.format(mergeABa=x86.setr(simd_ext, typ, 'A4a', 'B4a'),
                             mergeCDa=x86.setr(simd_ext, typ, 'C4a', 'D4a'),
                             mergeABb=x86.setr(simd_ext, typ, 'A4b', 'B4b'),
                             mergeCDb=x86.setr(simd_ext, typ, 'C4b', 'D4b'),
                             store=store4(simd_ext, typ, align, fmtspec,
                                          'A', 'B', 'C', 'D'), **fmtspec)
    # 32-bit lanes: two rounds of cross-lane _mm512_permutex2var with
    # precomputed 16-lane index vectors m1..m4.
    if typ in ['f32', 'i32', 'u32']:
        return \
        '''__m512i m1 = _mm512_setr_epi32(0, 1, 2, 3, 16, 17, 18, 19,
                                          4, 5, 6, 7, 20, 21, 22, 23);
           __m512i m2 = _mm512_setr_epi32(8, 9, 10, 11, 24, 25, 26, 27,
                                          12, 13, 14, 15, 28, 29, 30, 31);
           __m512i m3 = _mm512_setr_epi32(0, 4, 16, 20, 1, 5, 17, 21,
                                          2, 6, 18, 22, 3, 7, 19, 23);
           __m512i m4 = _mm512_setr_epi32(8, 12, 24, 28, 9, 13, 25, 29,
                                          10, 14, 26, 30, 11, 15, 27, 31);

           {styp} WXa = _mm512_permutex2var{suf}({in1}, m1, {in2});
           {styp} WXb = _mm512_permutex2var{suf}({in1}, m2, {in2});
           {styp} YZa = _mm512_permutex2var{suf}({in3}, m1, {in4});
           {styp} YZb = _mm512_permutex2var{suf}({in3}, m2, {in4});

           {styp} A = _mm512_permutex2var{suf}(WXa, m3, YZa);
           {styp} B = _mm512_permutex2var{suf}(WXa, m4, YZa);
           {styp} C = _mm512_permutex2var{suf}(WXb, m3, YZb);
           {styp} D = _mm512_permutex2var{suf}(WXb, m4, YZb);

           {store}'''.format(store=store4(simd_ext, typ, align, fmtspec,
                                          'A', 'B', 'C', 'D'), **fmtspec)
    # 64-bit lanes: same two-round permutex2var scheme with 8-lane masks.
    if typ in ['f64', 'i64', 'u64']:
        return \
        '''__m512i A_mask = _mm512_setr_epi64(0, 1,  2,  3,  8,  9, 10, 11);
           __m512i B_mask = _mm512_setr_epi64(4, 5,  6,  7, 12, 13, 14, 15);
           __m512i C_mask = _mm512_setr_epi64(0, 4,  8, 12,  1,  5,  9, 13);
           __m512i D_mask = _mm512_setr_epi64(2, 6, 10, 14,  3,  7, 11, 15);

           {styp} A1 = _mm512_permutex2var{suf}({in1}, A_mask, {in2});
           {styp} B1 = _mm512_permutex2var{suf}({in1}, B_mask, {in2});
           {styp} C1 = _mm512_permutex2var{suf}({in3}, A_mask, {in4});
           {styp} D1 = _mm512_permutex2var{suf}({in3}, B_mask, {in4});

           {styp} A = _mm512_permutex2var{suf}(A1, C_mask, C1);
           {styp} B = _mm512_permutex2var{suf}(A1, D_mask, C1);
           {styp} C = _mm512_permutex2var{suf}(B1, C_mask, D1);
           {styp} D = _mm512_permutex2var{suf}(B1, D_mask, D1);

           {store}'''.format(store=store4(simd_ext, typ, align, fmtspec,
                                          'A', 'B', 'C', 'D'), **fmtspec)

###############################################################################

# Emit C declarations loading three consecutive vectors v0, v1, v2 starting
# at a0.  Float types advance the pointer by {le} elements; integer types
# advance by whole vectors through a {styp}* cast.
def get_load_v0v1v2(simd_ext, typ, align, fmtspec):
    load = '{pre}load{a}{sufsi}'.format(a='' if align else 'u', **fmtspec)
    if typ in ['f32', 'f64']:
        return '''{styp} v0 = {load}(a0);
                  {styp} v1 = {load}(a0 + {le});
                  {styp} v2 = {load}(a0 + (2 * {le}));'''. \
                  format(load=load, **fmtspec)
    else:
        return '''{styp} v0 = {load}(({styp}*)a0);
                  {styp} v1 = {load}(({styp}*)a0 + 1);
                  {styp} v2 = {load}(({styp}*)a0 + 2);'''. \
                  format(load=load, **fmtspec)

###############################################################################

# Emit the C body of a 3-way de-interleaving load (load3) for SSE, one
# branch per element type.  The three source vectors are loaded by the
# {load_v0v1v2} snippet produced by get_load_v0v1v2() above.
def load3_sse(simd_ext, typ, align, fmtspec2):
    fmtspec = fmtspec2.copy()
    fmtspec['load_v0v1v2'] = get_load_v0v1v2('sse', typ, align, fmtspec)
    fmtspec['a'] = 'a' if align else 'u'
    if typ in ['i8', 'u8']:
        # SSE4.2: build each output with three _mm_shuffle_epi8 results
        # OR'ed together.
        if simd_ext == 'sse42':
            return \
            '''nsimd_sse42_v{typ}x3 ret;
               {load_v0v1v2}

               __m128i A1_mask = _mm_set_epi8(-1, -1, -1, -1, -1, -1, -1, -1,
                                              -1, -1, 15, 12,  9,  6,  3,  0);
               __m128i A2_mask = _mm_set_epi8(-1, -1, -1, -1, -1, 14, 11,  8,
                                               5,  2, -1, -1, -1, -1, -1, -1);
               __m128i A3_mask = _mm_set_epi8(13, 10,  7,  4,  1, -1, -1, -1,
                                              -1, -1, -1, -1, -1, -1, -1, -1);
               __m128i A4 = _mm_shuffle_epi8(v0, A1_mask);
               __m128i A5 = _mm_shuffle_epi8(v1, A2_mask);
               __m128i A6 = _mm_shuffle_epi8(v2, A3_mask);
               A4 = _mm_or_si128(A4, A5);
               ret.v0 = _mm_or_si128(A4, A6);

               __m128i B1_mask = _mm_set_epi8(-1, -1, -1, -1, -1, -1, -1, -1,
                                              -1, -1, -1, 13, 10,  7,  4,  1);
               __m128i B2_mask = _mm_set_epi8(-1, -1, -1, -1, -1, 15, 12,  9,
                                               6,  3,  0, -1, -1, -1, -1, -1);
               __m128i B3_mask = _mm_set_epi8(14, 11,  8,  5,  2, -1, -1, -1,
                                              -1, -1, -1, -1, -1, -1, -1, -1);
               __m128i B4 = _mm_shuffle_epi8(v0, B1_mask);
               __m128i B5 = _mm_shuffle_epi8(v1, B2_mask);
               __m128i B6 = _mm_shuffle_epi8(v2, B3_mask);
               B4 = _mm_or_si128(B4, B5);
               ret.v1 = _mm_or_si128(B4, B6);

           
    __m128i C1_mask = _mm_set_epi8(-1, -1, -1, -1, -1, -1, -1, -1,
                                              -1, -1, -1, 14, 11,  8,  5,  2);
               __m128i C2_mask = _mm_set_epi8(-1, -1, -1, -1, -1, -1, 13, 10,
                                               7,  4,  1, -1, -1, -1, -1, -1);
               __m128i C3_mask = _mm_set_epi8(15, 12,  9,  6,  3,  0, -1, -1,
                                              -1, -1, -1, -1, -1, -1, -1, -1);
               __m128i C4 = _mm_shuffle_epi8(v0, C1_mask);
               __m128i C5 = _mm_shuffle_epi8(v1, C2_mask);
               __m128i C6 = _mm_shuffle_epi8(v2, C3_mask);
               C4 = _mm_or_si128(C4, C5);
               ret.v2 = _mm_or_si128(C4, C6);

               return ret;'''.format(**fmtspec)
        else:
            # Plain SSE2 fallback: de-interleave with 4 rounds of
            # unpack/shuffle steps.
            return \
            '''nsimd_sse2_v{typ}x3 ret;
               {load_v0v1v2}

               __m128i A0 = v0;
               __m128i B0 = v1;
               __m128i C0 = v2;
               int k;

               for (k = 0; k < 4; ++k) {{
                 __m128d B0_pd = _mm_castsi128_pd(B0);
                 __m128d C0_pd = _mm_castsi128_pd(C0);

                 __m128d B1_pd = _mm_shuffle_pd(B0_pd, B0_pd, 1);
                 __m128d C2_pd = _mm_shuffle_pd(C0_pd, C0_pd, 1);

                 __m128i B1 = _mm_castpd_si128(B1_pd);
                 __m128i C2 = _mm_castpd_si128(C2_pd);

                 __m128i B3 = _mm_unpackhi_epi8(A0, C2);
                 __m128i A4 = _mm_unpacklo_epi8(A0, B1);
                 __m128i C5 = _mm_unpackhi_epi8(B1, C0);
                 A0 = A4;
                 B0 = B3;
                 C0 = C5;
               }}
               ret.v0 = A0;
               ret.v1 = B0;
               ret.v2 = C0;
               return ret;'''.format(**fmtspec)
    # 16-bit lanes: 3 rounds of shuffle/unpack; no simd_ext split here.
    if typ in ['i16', 'u16']:
        return \
        '''nsimd_{simd_ext}_v{typ}x3 ret;

           {load_v0v1v2}

           int k;

           for (k = 0; k < 3; ++k) {{
             __m128d B1_pd = _mm_castsi128_pd(v1);
             __m128d C1_pd = _mm_castsi128_pd(v2);
             __m128d B2_pd = _mm_shuffle_pd(B1_pd, B1_pd, 1);
             __m128d C3_pd = _mm_shuffle_pd(C1_pd, C1_pd, 1);
             __m128i B2 = _mm_castpd_si128(B2_pd);
             __m128i C3 = _mm_castpd_si128(C3_pd);

             __m128i B4 = _mm_unpackhi_epi16(v0, C3);
             __m128i A5 = _mm_unpacklo_epi16(v0, B2);
             __m128i C7 = _mm_unpackhi_epi16(B2, v2);

             v0 = A5;
             v1 = B4;
             v2 = C7;
           }}
           ret.v0 = v0;
           ret.v1 = v1;
           ret.v2 = v2;
           return ret;'''.format(**fmtspec)
    if typ == 'f32':
        return \
        '''nsimd_{simd_ext}_v{typ}x3 ret;
           {load_v0v1v2}

           __m128 A1 = _mm_shuffle_ps(v2, v1, _MM_SHUFFLE(3,2,1,0));
           __m128 B2 = _mm_shuffle_ps(v0, v2, _MM_SHUFFLE(3,2,1,0));
           __m128 C3 = _mm_shuffle_ps(v1, v0, _MM_SHUFFLE(3,2,1,0));

           ret.v0 = _mm_shuffle_ps(v0, A1, _MM_SHUFFLE(1,2,3,0));
           __m128 B5 = _mm_shuffle_ps(B2, v1, _MM_SHUFFLE(0,3,2,1));
           ret.v2 = _mm_shuffle_ps(C3, v2, _MM_SHUFFLE(3,0,1,2));

           ret.v1 = _mm_shuffle_ps(B5, B5, _MM_SHUFFLE(1,2,3,0));

           return ret;'''.format(**fmtspec)
    # i32/u32: bitcast through the f32 implementation.
    if typ in ['i32', 'u32']:
        return \
        '''nsimd_{simd_ext}_v{typ}x3 ret;
           nsimd_{simd_ext}_vf32x3 retf32 =
               nsimd_load3{a}_{simd_ext}_f32((f32 *){in0});
           ret.v0 = _mm_castps_si128(retf32.v0);
           ret.v1 = _mm_castps_si128(retf32.v1);
           ret.v2 = _mm_castps_si128(retf32.v2);
           return ret;'''.format(**fmtspec)
    if typ == 'f64':
        return \
        '''nsimd_{simd_ext}_vf64x3 ret;
           {load_v0v1v2}
           ret.v0 = _mm_shuffle_pd(v0, v1, 2);
           ret.v1 = _mm_shuffle_pd(v0, v2, 1);
           ret.v2 = 
_mm_shuffle_pd(v1, v2, 2);
           return ret;'''.format(**fmtspec)
    # i64/u64: bitcast through the f64 implementation.
    if typ in ['i64', 'u64']:
        return \
        '''nsimd_{simd_ext}_v{typ}x3 ret;
           nsimd_{simd_ext}_vf64x3 retf64 =
               nsimd_load3{a}_{simd_ext}_f64((f64 *){in0});
           ret.v0 = _mm_castpd_si128(retf64.v0);
           ret.v1 = _mm_castpd_si128(retf64.v1);
           ret.v2 = _mm_castpd_si128(retf64.v2);
           return ret;'''.format(**fmtspec)

###############################################################################

# Emit C code storing the three vector variables named v0, v1, v2 into three
# consecutive vector-sized slots at {in0}.  Note 'a' is '' (not 'a') for the
# aligned case: the intrinsic name is e.g. _mm_store_ps vs _mm_storeu_ps.
def store3(simd_ext, typ, align, fmtspec2, v0, v1, v2):
    fmtspec = fmtspec2.copy()
    fmtspec['a'] = '' if align else 'u'
    store = '{pre}store{a}{sufsi}'.format(**fmtspec)
    fmtspec['store'] = store
    fmtspec['v0'] = v0
    fmtspec['v1'] = v1
    fmtspec['v2'] = v2
    if typ in ['f32', 'f64']:
        return \
        '''{store}({in0}, {v0});
           {store}({in0} + {le}, {v1});
           {store}({in0} + (2 * {le}), {v2});'''.format(**fmtspec)
    else:
        return \
        '''{store}(({styp} *){in0}, {v0});
           {store}(({styp} *){in0} + 1, {v1});
           {store}(({styp} *){in0} + 2, {v2});'''.format(**fmtspec)

###############################################################################

# Emit the C body of a 3-way interleaved store (store3) for SSE, one branch
# per element type.  The interleaved vectors are written via store3() above.
def store3_sse(simd_ext, typ, align, fmtspec2):
    fmtspec = fmtspec2.copy()
    fmtspec['a'] = 'a' if align else 'u'
    if typ in ['i8', 'u8']:
        # SSE4.2: build each output with three _mm_shuffle_epi8 results
        # OR'ed together.
        if simd_ext == 'sse42':
            return \
            '''__m128i A1_mask = _mm_set_epi8( 5, -1, -1,  4, -1, -1,  3, -1,
                                              -1,  2, -1, -1,  1, -1, -1,  0);
               __m128i A2_mask = _mm_set_epi8(-1, -1,  4, -1, -1,  3, -1, -1,
                                               2, -1, -1,  1, -1, -1,  0, -1);
               __m128i A3_mask = _mm_set_epi8(-1,  4, -1, -1,  3, -1, -1,  2,
                                              -1, -1,  1, -1, -1,  0, -1, -1);
               __m128i A4 = _mm_shuffle_epi8({in1}, A1_mask);
               __m128i A5 = _mm_shuffle_epi8({in2}, A2_mask);
               __m128i A6 = _mm_shuffle_epi8({in3}, A3_mask);
               A4 = _mm_or_si128(A4, A5);
               A4 = _mm_or_si128(A4, A6);

               __m128i B1_mask = _mm_set_epi8(-1, 10, -1, -1,  9, -1, -1,  8,
                                              -1, -1,  7, -1, -1,  6, -1, -1);
               __m128i B2_mask = _mm_set_epi8(10, -1, -1,  9, -1, -1,  8, -1,
                                              -1,  7, -1, -1,  6, -1, -1,  5);
               __m128i B3_mask = _mm_set_epi8(-1, -1,  9, -1, -1,  8, -1, -1,
                                               7, -1, -1,  6, -1, -1,  5, -1);
               __m128i B4 = _mm_shuffle_epi8({in1}, B1_mask);
               __m128i B5 = _mm_shuffle_epi8({in2}, B2_mask);
               __m128i B6 = _mm_shuffle_epi8({in3}, B3_mask);
               B4 = _mm_or_si128(B4, B5);
               B4 = _mm_or_si128(B4, B6);

               __m128i C1_mask = _mm_set_epi8(-1, -1, 15, -1, -1, 14, -1, -1,
                                              13, -1, -1, 12, -1, -1, 11, -1);
               __m128i C2_mask = _mm_set_epi8(-1, 15, -1, -1, 14, -1, -1, 13,
                                              -1, -1, 12, -1, -1, 11, -1, -1);
               __m128i C3_mask = _mm_set_epi8(15, -1, -1, 14, -1, -1, 13, -1,
                                              -1, 12, -1, -1, 11, -1, -1, 10);
               __m128i C4 = _mm_shuffle_epi8({in1}, C1_mask);
               __m128i C5 = _mm_shuffle_epi8({in2}, C2_mask);
               __m128i C6 = _mm_shuffle_epi8({in3}, C3_mask);
               C4 = _mm_or_si128(C4, C5);
               C4 = _mm_or_si128(C4, C6);

               {store4}'''.format(store4=store3('sse', typ, align, fmtspec,
                                  'A4', 'B4', 'C4'), **fmtspec)
        else:
            # Plain SSE2: interleave with 4 rounds of unpack/shuffle steps.
            return \
            '''__m128i A0 = {in1};
               __m128i B0 = {in2};
               __m128i C0 = {in3};
               int k;

               for (k = 0; k < 4; ++k) {{
                 __m128i A1 = _mm_unpacklo_epi8(A0, B0);
                 __m128i A2 = _mm_unpackhi_epi8(A0, B0);
                 __m128i A3 = _mm_unpacklo_epi8(A1, A2);
                 __m128i A4 = _mm_unpackhi_epi8(A1, A2);
                 __m128i A5 = _mm_unpacklo_epi8(A3, A4);
                 __m128i A6 = _mm_unpackhi_epi8(A3, A4);
                 __m128i A7 = _mm_unpacklo_epi8(A5, A6);
                 __m128i B8 = _mm_unpackhi_epi8(A5, A6);

                 __m128i C9  = _mm_castpd_si128(_mm_shuffle_pd(
                                   _mm_castsi128_pd(C0),
                                   _mm_castsi128_pd(C0), 1));
                 __m128i C10 = _mm_unpacklo_epi8(C0, C9);
                 __m128i C11 = _mm_castpd_si128(_mm_shuffle_pd(
                                   _mm_castsi128_pd(C10),
                                   _mm_castsi128_pd(C10), 1));
                 __m128i C12 = _mm_unpacklo_epi8(C10, C11);
                 __m128i C13 = _mm_castpd_si128(_mm_shuffle_pd(
                                   _mm_castsi128_pd(C12),
                                   _mm_castsi128_pd(C12), 1));
                 __m128i C14 = _mm_unpacklo_epi8(C12, C13);

                 __m128i B15 = _mm_castpd_si128(_mm_shuffle_pd(
                                   _mm_castsi128_pd(C14),
                                   _mm_castsi128_pd(B8), 0));
                 __m128i C16 = _mm_castpd_si128(_mm_shuffle_pd(
                                   _mm_castsi128_pd(B8),
                                   _mm_castsi128_pd(C14), 3));

                 A0 = A7;
                 B0 = B15;
                 C0 = C16;
               }}
               {store0}'''.format(store0=store3('sse', typ, align, fmtspec,
                                  'A0', 'B0', 'C0'), **fmtspec)
    if typ in ['i16', 'u16']:
        # NOTE(review): this function only generates SSE code (the sibling
        # i8/u8 branch above tests 'sse42'), so this 'avx2' test looks
        # unreachable and the _mm_shuffle_epi8 fast path below appears to be
        # dead code -- confirm whether it was meant to be 'sse42'.
        if simd_ext == 'avx2':
            return \
            '''__m128i A0 = {in1};
               __m128i B0 = {in2};
               __m128i C0 = {in3};

               __m128i A1_mask = _mm_set_epi8(-1, -1,  5,  4, -1, -1, -1, -1,
                                               3,  2, -1, -1, -1, -1,  1,  0);
               __m128i A2_mask = _mm_set_epi8( 5,  4, -1, -1, -1, -1,  3,  2,
                                              -1, -1, -1, -1,  1,  0, -1, -1);
               __m128i A3_mask = _mm_set_epi8(-1, -1, -1, -1,  3,  2, -1, -1,
                                              -1, -1,  1,  0, -1, -1, -1, -1);
               __m128i A4 = _mm_shuffle_epi8(A0, A1_mask);
               __m128i A5 = _mm_shuffle_epi8(B0, A2_mask);
               __m128i A6 = _mm_shuffle_epi8(C0, A3_mask);
               A4 = _mm_or_si128(A4, A5);
               A4 = _mm_or_si128(A4, A6);

               __m128i B1_mask = _mm_set_epi8(11, 10, -1, -1, -1, -1,  9,  8,
                                              -1, -1, -1, -1,  7,  6, -1, -1);
               __m128i B2_mask = _mm_set_epi8(-1, -1, -1, -1,  9,  8, -1, -1,
                                              -1, -1,  7,  6, -1, -1, -1, -1);
               __m128i B3_mask = _mm_set_epi8(-1, -1,  9,  8, -1, -1, -1, -1,
                                               7,  6, -1, -1, -1, -1,  5,  4);
               __m128i B4 = _mm_shuffle_epi8(A0, B1_mask);
               __m128i B5 = _mm_shuffle_epi8(B0, B2_mask);
               __m128i B6 = _mm_shuffle_epi8(C0, B3_mask);
               B4 = _mm_or_si128(B4, B5);
               B4 = _mm_or_si128(B4, B6);

               __m128i C1_mask = _mm_set_epi8(-1, -1, -1, -1, 15, 14, -1, -1,
                                              -1, -1, 13, 12, -1, -1, -1, -1);
               __m128i C2_mask = _mm_set_epi8(-1, -1, 15, 14, -1, -1, -1, -1,
                                              13, 12, -1, -1, -1, -1, 11, 10);
               __m128i C3_mask = _mm_set_epi8(15, 14, -1, -1, -1, -1, 13, 12,
                                              -1, -1, -1, -1, 11, 10, -1, -1);
               __m128i C4 = _mm_shuffle_epi8(A0, C1_mask);
               __m128i C5 = _mm_shuffle_epi8(B0, C2_mask);
               __m128i C6 = _mm_shuffle_epi8(C0, C3_mask);
               C4 = _mm_or_si128(C4, C5);
               C4 = _mm_or_si128(C4, C6);

               {store4};'''.format(store4=store3('sse', typ, align, fmtspec,
                                   'A4', 'B4', 'C4'), **fmtspec)
        else:
            return \
            '''__m128i A0 = {in1};
               __m128i B0 = {in2};
               __m128i C0 = {in3};
               int k;

               for (k = 0; k < 3; ++k) {{
                 __m128i A1 = _mm_shufflelo_epi16(A0, _MM_SHUFFLE(3, 1, 2, 0));
                 __m128i A2 = _mm_shufflehi_epi16(A1, _MM_SHUFFLE(3, 1, 2, 0));
                 __m128i B3 = _mm_shufflelo_epi16(B0, _MM_SHUFFLE(3, 1, 2, 0));
                 __m128i B4 = _mm_shufflehi_epi16(B3, _MM_SHUFFLE(3, 1, 2, 0));
                 __m128i C5 = _mm_shufflelo_epi16(C0, _MM_SHUFFLE(3, 1, 2, 0));
                 __m128i C6 = _mm_shufflehi_epi16(C5, _MM_SHUFFLE(3, 1, 2, 0));

                 __m128 A2_ps = _mm_castsi128_ps(A2);
                 __m128 B4_ps = _mm_castsi128_ps(B4);
                 __m128 C6_ps = _mm_castsi128_ps(C6);

                 __m128 A0_ps = _mm_shuffle_ps(A2_ps, B4_ps,
                                               _MM_SHUFFLE(2, 0, 2, 0));
                 __m128 B0_ps = _mm_shuffle_ps(C6_ps, A2_ps,
                                               _MM_SHUFFLE(3, 1, 2, 0));
                 __m128 C0_ps = _mm_shuffle_ps(B4_ps, C6_ps,
                                               _MM_SHUFFLE(3, 1, 3, 1));

                 A0 = _mm_castps_si128(A0_ps);
                 B0 = _mm_castps_si128(B0_ps);
                 C0 = _mm_castps_si128(C0_ps);
               }}

               {store0}'''.format(store0=store3('sse', typ, align, fmtspec,
                                  'A0', 'B0', 'C0'), **fmtspec)
    if typ == 'f32':
        return \
        '''__m128 A1 = _mm_shuffle_ps({in1}, {in2}, _MM_SHUFFLE(2,0,2,0));
           __m128 B2 = _mm_shuffle_ps({in3}, {in1}, _MM_SHUFFLE(3,1,2,0));
           __m128 C3 = _mm_shuffle_ps({in2}, {in3}, _MM_SHUFFLE(3,1,3,1));

           __m128 A4 = _mm_shuffle_ps(A1, B2, _MM_SHUFFLE(2,0,2,0));
           __m128 B5 = _mm_shuffle_ps(C3, A1, _MM_SHUFFLE(3,1,2,0));
           __m128 C6 = _mm_shuffle_ps(B2, C3, _MM_SHUFFLE(3,1,3,1));

           {store};'''. \
           format(store=store3('sse', typ, align, fmtspec, 'A4', 'B5', 'C6'),
                  **fmtspec)
    # i32/u32: bitcast through the f32 implementation.
    if typ in ['i32', 'u32']:
        return \
        '''nsimd_store3{a}_{simd_ext}_f32((f32 *){in0},
                                          _mm_castsi128_ps({in1}),
                                          _mm_castsi128_ps({in2}),
                                          _mm_castsi128_ps({in3}));'''. \
                                          format(**fmtspec)
    if typ == 'f64':
        return \
        '''__m128d A0 = _mm_unpacklo_pd({in1}, {in2});
           __m128d B0 = _mm_shuffle_pd({in3}, {in1}, 2);
           __m128d C0 = _mm_unpackhi_pd({in2}, {in3});
           {store}'''. \
           format(store=store3('sse', typ, align, fmtspec, 'A0', 'B0', 'C0'),
                  **fmtspec)
    # i64/u64: bitcast through the f64 implementation.
    if typ in ['i64', 'u64']:
        return \
        '''nsimd_store3{a}_{simd_ext}_f64((f64 *){in0},
                                          _mm_castsi128_pd({in1}),
                                          _mm_castsi128_pd({in2}),
                                          _mm_castsi128_pd({in3}));'''. 
\\\n                                          format(**fmtspec)\n\n###############################################################################\n\ndef load3_avx(simd_ext, typ, align, fmtspec2):\n    fmtspec = fmtspec2.copy()\n    fmtspec['load_v0v1v2'] = get_load_v0v1v2('avx', typ, align, fmtspec)\n    fmtspec['exlo_v0'] = x86.extract('avx', typ, x86.LO, 'v0')\n    fmtspec['exhi_v0'] = x86.extract('avx', typ, x86.HI, 'v0')\n    fmtspec['exlo_v1'] = x86.extract('avx', typ, x86.LO, 'v1')\n    fmtspec['exhi_v1'] = x86.extract('avx', typ, x86.HI, 'v1')\n    fmtspec['exlo_v2'] = x86.extract('avx', typ, x86.LO, 'v2')\n    fmtspec['exhi_v2'] = x86.extract('avx', typ, x86.HI, 'v2')\n    fmtspec['a'] = 'a' if align else 'u'\n    if typ in ['i8', 'u8']:\n        if simd_ext == 'avx2':\n            return \\\n            '''nsimd_avx2_v{typ}x3 ret;\n               {load_v0v1v2}\n\n               __m256i ARmask = _mm256_setr_epi8( 0,  3,  6,  9, 12, 15, -1, -1,\n                                                 -1, -1, -1, -1, -1, -1, -1, -1,\n                                                 -1, -1, -1, -1, -1, -1,  2,  5,\n                                                  8, 11, 14, -1, -1, -1, -1, -1);\n               __m256i BRmask = _mm256_setr_epi8(-1, -1, -1, -1, -1, -1, -1, -1,\n                                                 -1, -1, -1,  1,  4,  7, 10, 13,\n                                                  0,  3,  6,  9, 12, 15, -1, -1,\n                                                 -1, -1, -1, -1, -1, -1, -1, -1);\n               __m256i CRmask = _mm256_setr_epi8(-1, -1, -1, -1, -1, -1,  2,  5,\n                                                  8, 11, 14, -1, -1, -1, -1, -1,\n                                                 -1, -1, -1, -1, -1, -1, -1, -1,\n                                                 -1, -1, -1,  1,  4,  7, 10, 13);\n\n               __m256i AR = _mm256_shuffle_epi8(v0, ARmask);\n               __m256i BR = _mm256_shuffle_epi8(v1, BRmask);\n 
              __m256i CR = _mm256_shuffle_epi8(v2, CRmask);\n               __m256i DR = _mm256_permute2f128_si256(AR, CR, (2 << 4) | 1);\n\n               __m256i R0 = _mm256_or_si256(AR, BR);\n               __m256i R1 = _mm256_or_si256(BR, CR);\n               __m256i R2 = _mm256_permute2f128_si256(R0, R1, 3 << 4);\n               ret.v0 = _mm256_or_si256(DR, R2);\n\n\n               __m256i AGmask = _mm256_setr_epi8( 1,  4,  7, 10, 13, -1, -1, -1,\n                                                 -1, -1, -1, -1, -1, -1, -1, -1,\n                                                 -1, -1, -1, -1, -1,  0,  3,  6,\n                                                  9, 12, 15, -1, -1, -1, -1, -1);\n               __m256i BGmask = _mm256_setr_epi8(-1, -1, -1, -1, -1, -1, -1, -1,\n                                                 -1, -1, -1,  2,  5,  8, 11, 14,\n                                                  1,  4,  7, 10, 13, -1, -1, -1,\n                                                 -1, -1, -1, -1, -1, -1, -1, -1);\n               __m256i CGmask = _mm256_setr_epi8(-1, -1, -1, -1, -1,  0,  3,  6,\n                                                  9, 12, 15, -1, -1, -1, -1, -1,\n                                                 -1, -1, -1, -1, -1, -1, -1, -1,\n                                                 -1, -1, -1,  2,  5,  8, 11, 14);\n\n               __m256i AG = _mm256_shuffle_epi8(v0, AGmask);\n               __m256i BG = _mm256_shuffle_epi8(v1, BGmask);\n               __m256i CG = _mm256_shuffle_epi8(v2, CGmask);\n               __m256i DG = _mm256_permute2f128_si256(AG, CG, (2 << 4) | 1);\n\n               __m256i G0 = _mm256_or_si256(AG, BG);\n               __m256i G1 = _mm256_or_si256(BG, CG);\n               __m256i G2 = _mm256_permute2f128_si256(G0, G1, 3 << 4);\n               ret.v1 = _mm256_or_si256(DG, G2);\n\n               __m256i ABmask = _mm256_setr_epi8( 2,  5,  8, 11, 14, -1, -1, -1,\n                                                 -1, 
-1, -1, -1, -1, -1, -1, -1,\n                                                 -1, -1, -1, -1, -1,  1,  4,  7,\n                                                 10, 13, -1, -1, -1, -1, -1, -1);\n               __m256i BBmask = _mm256_setr_epi8(-1, -1, -1, -1, -1, -1, -1, -1,\n                                                 -1, -1,  0,  3,  6,  9, 12, 15,\n                                                  2,  5,  8, 11, 14, -1, -1, -1,\n                                                 -1, -1, -1, -1, -1, -1, -1, -1);\n               __m256i CBmask = _mm256_setr_epi8(-1, -1, -1, -1, -1,  1,  4,  7,\n                                                 10, 13, -1, -1, -1, -1, -1, -1,\n                                                 -1, -1, -1, -1, -1, -1, -1, -1,\n                                                 -1, -1,  0,  3,  6,  9, 12, 15);\n\n               __m256i AB = _mm256_shuffle_epi8(v0, ABmask);\n               __m256i BB = _mm256_shuffle_epi8(v1, BBmask);\n               __m256i CB = _mm256_shuffle_epi8(v2, CBmask);\n               __m256i DB = _mm256_permute2f128_si256(AB, CB, (2 << 4) | 1);\n\n               __m256i B0 = _mm256_or_si256(AB, BB);\n               __m256i B1 = _mm256_or_si256(BB, CB);\n               __m256i B2 = _mm256_permute2f128_si256(B0, B1, 3 << 4);\n               ret.v2 = _mm256_or_si256(DB, B2);\n               return ret;'''.format(**fmtspec)\n        else:\n            return \\\n            '''nsimd_avx_v{typ}x3 ret;\n               {load_v0v1v2}\n\n               __m128i Aa = {exlo_v0};\n               __m128i Ba = {exhi_v0};\n               __m128i Ca = {exlo_v1};\n               __m128i Ab = {exhi_v1};\n               __m128i Bb = {exlo_v2};\n               __m128i Cb = {exhi_v2};\n\n               __m128i ARm = _mm_set_epi8(-1, -1, -1, -1, -1, -1, -1, -1,\n                                          -1, -1, 15, 12,  9,  6,  3,  0);\n               __m128i BRm = _mm_set_epi8(-1, -1, -1, -1, -1, 14, 11,  8,\n                       
                    5,  2, -1, -1, -1, -1, -1, -1);\n               __m128i CRm = _mm_set_epi8(13, 10,  7,  4,  1, -1, -1, -1,\n                                          -1, -1, -1, -1, -1, -1, -1, -1);\n               __m128i AR = _mm_shuffle_epi8(Aa, ARm);\n               __m128i BR = _mm_shuffle_epi8(Ba, BRm);\n               __m128i CR = _mm_shuffle_epi8(Ca, CRm);\n               __m128i R0 = _mm_or_si128(AR, BR);\n               R0 = _mm_or_si128(R0, CR);\n\n               AR = _mm_shuffle_epi8(Ab, ARm);\n               BR = _mm_shuffle_epi8(Bb, BRm);\n               CR = _mm_shuffle_epi8(Cb, CRm);\n               __m128i R1 = _mm_or_si128(AR, BR);\n               R1 = _mm_or_si128(R1, CR);\n\n               __m128i AGm = _mm_set_epi8(-1, -1, -1, -1, -1, -1, -1, -1,\n                                          -1, -1, -1, 13, 10,  7,  4,  1);\n               __m128i BGm = _mm_set_epi8(-1, -1, -1, -1, -1, 15, 12,  9,\n                                           6,  3,  0, -1, -1, -1, -1, -1);\n               __m128i CGm = _mm_set_epi8(14, 11,  8,  5,  2, -1, -1, -1,\n                                          -1, -1, -1, -1, -1, -1, -1, -1);\n               __m128i AG = _mm_shuffle_epi8(Aa, AGm);\n               __m128i BG = _mm_shuffle_epi8(Ba, BGm);\n               __m128i CG = _mm_shuffle_epi8(Ca, CGm);\n               __m128i G0 = _mm_or_si128(AG, BG);\n               G0 = _mm_or_si128(G0, CG);\n\n               AG = _mm_shuffle_epi8(Ab, AGm);\n               BG = _mm_shuffle_epi8(Bb, BGm);\n               CG = _mm_shuffle_epi8(Cb, CGm);\n               __m128i G1 = _mm_or_si128(AG, BG);\n               G1 = _mm_or_si128(G1, CG);\n\n               __m128i ABm = _mm_set_epi8(-1, -1, -1, -1, -1, -1, -1, -1,\n                                          -1, -1, -1, 14, 11,  8,  5,  2);\n               __m128i BBm = _mm_set_epi8(-1, -1, -1, -1, -1, -1, 13, 10,\n                                           7,  4,  1, -1, -1, -1, -1, -1);\n               __m128i CBm = 
_mm_set_epi8(15, 12,  9,  6,  3,  0, -1, -1,\n                                          -1, -1, -1, -1, -1, -1, -1, -1);\n               __m128i AB = _mm_shuffle_epi8(Aa, ABm);\n               __m128i BB = _mm_shuffle_epi8(Ba, BBm);\n               __m128i CB = _mm_shuffle_epi8(Ca, CBm);\n               __m128i B0 = _mm_or_si128(AB, BB);\n               B0 = _mm_or_si128(B0, CB);\n\n               AB = _mm_shuffle_epi8(Ab, ABm);\n               BB = _mm_shuffle_epi8(Bb, BBm);\n               CB = _mm_shuffle_epi8(Cb, CBm);\n               __m128i B1 = _mm_or_si128(AB, BB);\n               B1 = _mm_or_si128(B1, CB);\n\n               ret.v0 = {mergeR};\n               ret.v1 = {mergeG};\n               ret.v2 = {mergeB};\n\n               return ret;'''.format(mergeR=x86.setr('avx', typ, 'R0', 'R1'),\n                                     mergeG=x86.setr('avx', typ, 'G0', 'G1'),\n                                     mergeB=x86.setr('avx', typ, 'B0', 'B1'),\n                                     **fmtspec)\n    if typ in ['i16', 'u16']:\n        if simd_ext == 'avx2':\n            return \\\n            '''nsimd_avx2_v{typ}x3 ret;\n               {load_v0v1v2}\n               __m256i ARmask = _mm256_setr_epi8( 0,  1,  6,  7, 12, 13, -1, -1,\n                                                 -1, -1, -1, -1, -1, -1, -1, -1,\n                                                 -1, -1, -1, -1, -1, -1,  2,  3,\n                                                  8,  9, 14, 15, -1, -1, -1, -1);\n               __m256i BRmask = _mm256_setr_epi8(-1, -1, -1, -1, -1, -1, -1, -1,\n                                                 -1, -1, -1, -1,  4,  5, 10, 11,\n                                                  0,  1,  6,  7, 12, 13, -1, -1,\n                                                 -1, -1, -1, -1, -1, -1, -1, -1);\n               __m256i CRmask = _mm256_setr_epi8(-1, -1, -1, -1, -1, -1,  2,  3,\n                                                  8,  9, 14, 15, -1, -1, -1, -1,\n  
                                               -1, -1, -1, -1, -1, -1, -1, -1,\n                                                 -1, -1, -1, -1,  4,  5, 10, 11);\n\n               __m256i AR = _mm256_shuffle_epi8(v0, ARmask);\n               __m256i BR = _mm256_shuffle_epi8(v1, BRmask);\n               __m256i CR = _mm256_shuffle_epi8(v2, CRmask);\n               __m256i DR = _mm256_permute2f128_si256(AR, CR, (2 << 4) | 1);\n\n               __m256i R0 = _mm256_or_si256(AR, BR);\n               __m256i R1 = _mm256_or_si256(BR, CR);\n               __m256i R2 = _mm256_permute2f128_si256(R0, R1, 3 << 4);\n               ret.v0 = _mm256_or_si256(DR, R2);\n\n\n               __m256i AGmask = _mm256_setr_epi8( 2,  3,  8,  9, 14, 15, -1, -1,\n                                                 -1, -1, -1, -1, -1, -1, -1, -1,\n                                                 -1, -1, -1, -1, -1, -1,  4,  5,\n                                                 10, 11, -1, -1, -1, -1, -1, -1);\n               __m256i BGmask = _mm256_setr_epi8(-1, -1, -1, -1, -1, -1, -1, -1,\n                                                 -1, -1,  0,  1,  6,  7, 12, 13,\n                                                  2,  3,  8,  9, 14, 15, -1, -1,\n                                                 -1, -1, -1, -1, -1, -1, -1, -1);\n               __m256i CGmask = _mm256_setr_epi8(-1, -1, -1, -1, -1, -1,  4,  5,\n                                                 10, 11, -1, -1, -1, -1, -1, -1,\n                                                 -1, -1, -1, -1, -1, -1, -1, -1,\n                                                 -1, -1,  0,  1,  6,  7, 12, 13);\n\n               __m256i AG = _mm256_shuffle_epi8(v0, AGmask);\n               __m256i BG = _mm256_shuffle_epi8(v1, BGmask);\n               __m256i CG = _mm256_shuffle_epi8(v2, CGmask);\n               __m256i DG = _mm256_permute2f128_si256(AG, CG, (2 << 4) | 1);\n\n               __m256i G0 = _mm256_or_si256(AG, BG);\n               __m256i G1 
= _mm256_or_si256(BG, CG);\n               __m256i G2 = _mm256_permute2f128_si256(G0, G1, 3 << 4);\n               ret.v1 = _mm256_or_si256(DG, G2);\n\n               __m256i ABmask = _mm256_setr_epi8( 4,  5, 10, 11, -1, -1, -1, -1,\n                                                 -1, -1, -1, -1, -1, -1, -1, -1,\n                                                 -1, -1, -1, -1,  0,  1,  6,  7,\n                                                 12, 13, -1, -1, -1, -1, -1, -1);\n               __m256i BBmask = _mm256_setr_epi8(-1, -1, -1, -1, -1, -1, -1, -1,\n                                                 -1, -1,  2,  3,  8,  9, 14, 15,\n                                                  4,  5, 10, 11, -1, -1, -1, -1,\n                                                 -1, -1, -1, -1, -1, -1, -1, -1);\n               __m256i CBmask = _mm256_setr_epi8(-1, -1, -1, -1,  0,  1,  6,  7,\n                                                 12, 13, -1, -1, -1, -1, -1, -1,\n                                                 -1, -1, -1, -1, -1, -1, -1, -1,\n                                                 -1, -1,  2,  3,  8,  9, 14, 15);\n\n               __m256i AB = _mm256_shuffle_epi8(v0, ABmask);\n               __m256i BB = _mm256_shuffle_epi8(v1, BBmask);\n               __m256i CB = _mm256_shuffle_epi8(v2, CBmask);\n               __m256i DB = _mm256_permute2f128_si256(AB, CB, (2 << 4) | 1);\n\n               __m256i B0 = _mm256_or_si256(AB, BB);\n               __m256i B1 = _mm256_or_si256(BB, CB);\n               __m256i B2 = _mm256_permute2f128_si256(B0, B1, 3 << 4);\n               ret.v2 = _mm256_or_si256(DB, B2);\n               return ret;'''.format(**fmtspec)\n        else:\n            return \\\n            '''nsimd_avx_v{typ}x3 ret;\n               {load_v0v1v2}\n\n               __m128i Aa = {exlo_v0};\n               __m128i Ba = {exhi_v0};\n               __m128i Ca = {exlo_v1};\n               __m128i Ab = {exhi_v1};\n               __m128i Bb = {exlo_v2};\n    
           __m128i Cb = {exhi_v2};\n\n               __m128i ARm = _mm_set_epi8(-1, -1, -1, -1, -1, -1, -1, -1,\n                                          -1, -1, 13, 12,  7,  6,  1,  0);\n               __m128i BRm = _mm_set_epi8(-1, -1, -1, -1, 15, 14,  9,  8,\n                                           3,  2, -1, -1, -1, -1, -1, -1);\n               __m128i CRm = _mm_set_epi8(11, 10,  5,  4, -1, -1, -1, -1,\n                                          -1, -1, -1, -1, -1, -1, -1, -1);\n               __m128i AR = _mm_shuffle_epi8(Aa, ARm);\n               __m128i BR = _mm_shuffle_epi8(Ba, BRm);\n               __m128i CR = _mm_shuffle_epi8(Ca, CRm);\n               __m128i R0 = _mm_or_si128(AR, BR);\n               R0 = _mm_or_si128(R0, CR);\n\n               AR = _mm_shuffle_epi8(Ab, ARm);\n               BR = _mm_shuffle_epi8(Bb, BRm);\n               CR = _mm_shuffle_epi8(Cb, CRm);\n               __m128i R1 = _mm_or_si128(AR, BR);\n               R1 = _mm_or_si128(R1, CR);\n\n               __m128i AGm = _mm_set_epi8(-1, -1, -1, -1, -1, -1, -1, -1,\n                                          -1, -1, 15, 14,  9,  8,  3,  2);\n               __m128i BGm = _mm_set_epi8(-1, -1, -1, -1, -1, -1, 11, 10,\n                                           5,  4, -1, -1, -1, -1, -1, -1);\n               __m128i CGm = _mm_set_epi8(13, 12,  7,  6,  1,  0, -1, -1,\n                                          -1, -1, -1, -1, -1, -1, -1, -1);\n               __m128i AG = _mm_shuffle_epi8(Aa, AGm);\n               __m128i BG = _mm_shuffle_epi8(Ba, BGm);\n               __m128i CG = _mm_shuffle_epi8(Ca, CGm);\n               __m128i G0 = _mm_or_si128(AG, BG);\n               G0 = _mm_or_si128(G0, CG);\n\n               AG = _mm_shuffle_epi8(Ab, AGm);\n               BG = _mm_shuffle_epi8(Bb, BGm);\n               CG = _mm_shuffle_epi8(Cb, CGm);\n               __m128i G1 = _mm_or_si128(AG, BG);\n               G1 = _mm_or_si128(G1, CG);\n\n               __m128i ABm = _mm_set_epi8(-1, 
-1, -1, -1, -1, -1, -1, -1,\n                                          -1, -1, -1, -1, 11, 10,  5,  4);\n               __m128i BBm = _mm_set_epi8(-1, -1, -1, -1, -1, -1, 13, 12,\n                                           7,  6,  1,  0, -1, -1, -1, -1);\n               __m128i CBm = _mm_set_epi8(15, 14,  9,  8,  3,  2, -1, -1,\n                                          -1, -1, -1, -1, -1, -1, -1, -1);\n               __m128i AB = _mm_shuffle_epi8(Aa, ABm);\n               __m128i BB = _mm_shuffle_epi8(Ba, BBm);\n               __m128i CB = _mm_shuffle_epi8(Ca, CBm);\n               __m128i B0 = _mm_or_si128(AB, BB);\n               B0 = _mm_or_si128(B0, CB);\n\n               AB = _mm_shuffle_epi8(Ab, ABm);\n               BB = _mm_shuffle_epi8(Bb, BBm);\n               CB = _mm_shuffle_epi8(Cb, CBm);\n               __m128i B1 = _mm_or_si128(AB, BB);\n               B1 = _mm_or_si128(B1, CB);\n\n               ret.v0 = {mergeR};\n               ret.v1 = {mergeG};\n               ret.v2 = {mergeB};\n               return ret;'''.format(mergeR=x86.setr('avx', typ, 'R0', 'R1'),\n                                     mergeG=x86.setr('avx', typ, 'G0', 'G1'),\n                                     mergeB=x86.setr('avx', typ, 'B0', 'B1'),\n                                     **fmtspec)\n    avx2_template = \\\n    '''nsimd_avx2_v{typ}x3 ret;\n       {load_v0v1v2}\n\n       __m256i RAm = _mm256_setr_epi32( 0,  3,  6, -1, -1, -1, -1, -1);\n       __m256i RBm = _mm256_setr_epi32(-1, -1, -1,  1,  4,  7, -1, -1);\n       __m256i RCm = _mm256_setr_epi32(-1, -1, -1, -1, -1, -1,  2,  5);\n\n       __m256i GAm = _mm256_setr_epi32( 1,  4,  7, -1, -1, -1, -1, -1);\n       __m256i GBm = _mm256_setr_epi32(-1, -1, -1,  2,  5, -1, -1, -1);\n       __m256i GCm = _mm256_setr_epi32(-1, -1, -1, -1, -1,  0,  3,  6);\n\n       __m256i BAm = _mm256_setr_epi32( 2,  5, -1, -1, -1, -1, -1, -1);\n       __m256i BBm = _mm256_setr_epi32(-1, -1,  0,  3,  6, -1, -1, -1);\n       __m256i BCm = 
_mm256_setr_epi32(-1, -1, -1, -1, -1,  1,  4,  7);\n\n       {styp} RA = _mm256_permutevar8x32{suf}(v0, RAm);\n       {styp} RB = _mm256_permutevar8x32{suf}(v1, RBm);\n       {styp} RC = _mm256_permutevar8x32{suf}(v2, RCm);\n\n       {styp} R = _mm256_blend{suf}(RA, RB, 8 + 16 + 32);\n       ret.v0 = _mm256_blend{suf}(R, RC, 64 + 128);\n\n       {styp} GA = _mm256_permutevar8x32{suf}(v0, GAm);\n       {styp} GB = _mm256_permutevar8x32{suf}(v1, GBm);\n       {styp} GC = _mm256_permutevar8x32{suf}(v2, GCm);\n       {styp} G = _mm256_blend{suf}(GA, GB, 8 + 16);\n       ret.v1 = _mm256_blend{suf}(G, GC, 32 + 64 + 128);\n\n       {styp} BA = _mm256_permutevar8x32{suf}(v0, BAm);\n       {styp} BB = _mm256_permutevar8x32{suf}(v1, BBm);\n       {styp} BC = _mm256_permutevar8x32{suf}(v2, BCm);\n       {styp} B = _mm256_blend{suf}(BA, BB, 4 + 8 + 16);\n       ret.v2 = _mm256_blend{suf}(B, BC, 32 + 64 + 128);\n\n       return ret;'''.format(**fmtspec)\n    if typ == 'f32':\n        if simd_ext == 'avx2':\n            return avx2_template\n        else:\n            return \\\n            '''nsimd_avx_v{typ}x3 ret;\n               {load_v0v1v2}\n\n               __m256i RAm = _mm256_setr_epi32( 0,  3, -1, -1, -1, -1,  2, -1);\n               __m256i RBm = _mm256_setr_epi32(-1, -1, -1,  1,  0,  3, -1, -1);\n               __m256i RCm = _mm256_setr_epi32( 0,  0,  2,  0,  1,  1,  1,  1);\n\n               __m256i GAm = _mm256_setr_epi32( 1, -1, -1, -1, -1,  0,  3, -1);\n               __m256i GBm = _mm256_setr_epi32(-1, -1, -1,  2,  5, -1, -1, -1);\n               __m256i GCm = _mm256_setr_epi32(-1,  0,  3, -1, -1, -1, -1,  6);\n\n               __m256i BAm = _mm256_setr_epi32( 2, -1, -1, -1, -1,  1, -1, -1);\n               __m256i BBm = _mm256_setr_epi32(-1, -1,  0,  3,  6, -1, -1, -1);\n               __m256i BCm = _mm256_setr_epi32(-1,  1, -1, -1, -1, -1,  4,  7);\n\n               __m256 RA = _mm256_permutevar_ps(v0, RAm);\n               __m256 RAi = 
_mm256_permute2f128_ps(RA, RA, (2 << 4) | 1);\n               RA = _mm256_blend_ps(RAi, RA, 1 + 2);\n               __m256 RB = _mm256_permutevar_ps(v1, RBm);\n               __m256 RC = _mm256_permutevar_ps(v2, RCm);\n               __m256 RCi = _mm256_permute2f128_ps(RC, RC, 2 << 4);\n               RC = _mm256_blend_ps(RC, RCi, 64);\n               __m256 R = _mm256_blend_ps(RA, RB, 8 + 16 + 32);\n               ret.v0 = _mm256_blend_ps(R, RC, 64 + 128);\n\n               __m256 GA = _mm256_permutevar_ps(v0, GAm);\n               __m256 GAi = _mm256_permute2f128_ps(GA, GA, (2 << 4) | 1);\n               GA = _mm256_blend_ps(GA, GAi, 2 + 4);\n               __m256 GB = _mm256_permutevar_ps(v1, GBm);\n               __m256 GC = _mm256_permutevar_ps(v2, GCm);\n               __m256 GCi = _mm256_permute2f128_ps(GC, GC, 2 << 4);\n               GC = _mm256_blend_ps(GC, GCi, 32 + 64);\n               __m256 G = _mm256_blend_ps(GA, GB, 8 + 16);\n               ret.v1 = _mm256_blend_ps(G, GC, 32 + 64 + 128);\n\n               __m256 BA = _mm256_permutevar_ps(v0, BAm);\n               __m256 BAi = _mm256_permute2f128_ps(BA, BA, (2 << 4) | 1);\n               BA = _mm256_blend_ps(BA, BAi, 2);\n               __m256 BB = _mm256_permutevar_ps(v1, BBm);\n               __m256 BC = _mm256_permutevar_ps(v2, BCm);\n               __m256 BCi = _mm256_permute2f128_ps(BC, BC, 2 << 4);\n               BC = _mm256_blend_ps(BC, BCi, 32);\n               __m256 B = _mm256_blend_ps(BA, BB, 4 + 8 + 16);\n               ret.v2 = _mm256_blend_ps(B, BC, 32 + 64 + 128);\n\n               return ret;'''.format(**fmtspec)\n    if typ in ['i32', 'u32', 'f32']:\n        if simd_ext == 'avx2':\n            return avx2_template\n        else:\n            return \\\n            '''nsimd_avx_v{typ}x3 ret;\n               nsimd_avx_vf32x3 retf32 = nsimd_load3{a}_avx_f32((f32 *){in0});\n               ret.v0 = _mm256_castps_si256(retf32.v0);\n               ret.v1 = _mm256_castps_si256(retf32.v1);\n 
              ret.v2 = _mm256_castps_si256(retf32.v2);\n               return ret;'''.format(**fmtspec)\n    avx2_template = \\\n    '''nsimd_avx2_v{typ}x3 ret;\n       {load_v0v1v2}\n       {styp} A1 = _mm256_permute4x64{suf}(v0, _MM_SHUFFLE(2, 1, 3, 0));\n       {styp} C2 = _mm256_permute4x64{suf}(v2, _MM_SHUFFLE(3, 0, 2, 1));\n       {styp} B3 = _mm256_permute2f128{sufsi}(A1, v1, (2 << 4) | 1);\n       {styp} B4 = _mm256_permute2f128{sufsi}(v1, C2, (2 << 4) | 1);\n       {styp} B5 = _mm256_permute4x64{suf}(B3, _MM_SHUFFLE(3, 1, 2, 0));\n       {styp} B6 = _mm256_permute4x64{suf}(B4, _MM_SHUFFLE(3, 1, 2, 0));\n       ret.v0 = _mm256_permute2f128{sufsi}(A1, B6, 2 << 4);\n       ret.v1 = _mm256_permute2f128{sufsi}(B5, B6, 3 << 4);\n       ret.v2 = _mm256_permute2f128{sufsi}(B5, C2, (3 << 4 ) | 1);\n       return ret;'''.format(**fmtspec)\n    if typ == 'f64':\n        if simd_ext == 'avx2':\n            return avx2_template\n        else:\n            return \\\n            '''nsimd_avx_v{typ}x3 ret;\n               {load_v0v1v2}\n\n               __m256d R1 = _mm256_permute2f128_pd(v0, v2, (2 << 4) | 1);\n               __m256d R2 = _mm256_permute2f128_pd(v0, v1, 3 << 4);\n               ret.v0  = _mm256_blend_pd(R1, R2, 1 + 4);\n\n               __m256d G1 = _mm256_permute2f128_pd(v0, v1, 3 << 4);\n               __m256d G2 = _mm256_permute2f128_pd(v1, v2, 3 << 4);\n               __m256d G  = _mm256_blend_pd(G1, G2, 1 + 4);\n               ret.v1 = _mm256_permute_pd(G, 1 + 4);\n\n               __m256d B1 = _mm256_permute2f128_pd(v0, v2, (2 << 4) | 1);\n               __m256d B2 = _mm256_permute2f128_pd(v1, v2, 3 << 4);\n               ret.v2  = _mm256_blend_pd(B1, B2, 2 + 8);\n\n               return ret;'''.format(**fmtspec)\n    if typ in ['i64', 'u64']:\n        if simd_ext == 'avx2':\n            return avx2_template\n        else:\n            return \\\n            '''nsimd_avx_v{typ}x3 ret;\n               nsimd_avx_vf64x3 retf64 = 
nsimd_load3{a}_avx_f64((f64 *){in0});\n               ret.v0 = _mm256_castpd_si256(retf64.v0);\n               ret.v1 = _mm256_castpd_si256(retf64.v1);\n               ret.v2 = _mm256_castpd_si256(retf64.v2);\n               return ret;'''.format(**fmtspec)\n\n###############################################################################\n\ndef store3_avx(simd_ext, typ, align, fmtspec2):\n    fmtspec = fmtspec2.copy()\n    fmtspec['exlo_in1'] = x86.extract('avx', typ, x86.LO, common.in1)\n    fmtspec['exhi_in1'] = x86.extract('avx', typ, x86.HI, common.in1)\n    fmtspec['exlo_in2'] = x86.extract('avx', typ, x86.LO, common.in2)\n    fmtspec['exhi_in2'] = x86.extract('avx', typ, x86.HI, common.in2)\n    fmtspec['exlo_in3'] = x86.extract('avx', typ, x86.LO, common.in3)\n    fmtspec['exhi_in3'] = x86.extract('avx', typ, x86.HI, common.in3)\n    fmtspec['a'] = 'a' if align else 'u'\n    if typ in ['i8', 'u8']:\n        if simd_ext == 'avx2':\n            return \\\n            '''__m256i RACm = _mm256_setr_epi8( 0, -1, -1,  1, -1, -1,  2, -1,\n                                               -1,  3, -1, -1,  4, -1, -1,  5,\n                                               -1, 27, -1, -1, 28, -1, -1, 29,\n                                               -1, -1, 30, -1, -1, 31, -1, -1);\n               __m256i RBBm = _mm256_setr_epi8(-1, 11, -1, -1, 12, -1, -1, 13,\n                                               -1, -1, 14, -1, -1, 15, -1, -1,\n                                               16, -1, -1, 17, -1, -1, 18, -1,\n                                               -1, 19, -1, -1, 20, -1, -1, 21);\n               __m256i RCAm = _mm256_setr_epi8(-1, -1, 22, -1, -1, 23, -1, -1,\n                                               24, -1, -1, 25, -1, -1, 26, -1,\n                                               -1, -1,  6, -1, -1,  7, -1, -1,\n                                                8, -1, -1,  9, -1, -1, 10, -1);\n\n               __m256i GACm = _mm256_setr_epi8(-1,  0, -1, 
-1,  1, -1, -1,  2,\n                                               -1, -1,  3, -1, -1,  4, -1, -1,\n                                               -1, -1, 27, -1, -1, 28, -1, -1,\n                                               29, -1, -1, 30, -1, -1, 31, -1);\n               __m256i GBBm = _mm256_setr_epi8(-1, -1, 11, -1, -1, 12, -1, -1,\n                                               13, -1, -1, 14, -1, -1, 15, -1,\n                                               -1, 16, -1, -1, 17, -1, -1, 18,\n                                               -1, -1, 19, -1, -1, 20, -1, -1);\n               __m256i GCAm = _mm256_setr_epi8(21, -1, -1, 22, -1, -1, 23, -1,\n                                               -1, 24, -1, -1, 25, -1, -1, 26,\n                                                5, -1, -1,  6, -1, -1,  7, -1,\n                                               -1,  8, -1, -1,  9, -1, -1, 10);\n\n               __m256i BACm = _mm256_setr_epi8(-1, -1,  0, -1, -1,  1, -1, -1,\n                                                2, -1, -1,  3, -1, -1,  4, -1,\n                                               26, -1, -1, 27, -1, -1, 28, -1,\n                                               -1, 29, -1, -1, 30, -1, -1, 31);\n               __m256i BBBm = _mm256_setr_epi8(10, -1, -1, 11, -1, -1, 12, -1,\n                                               -1, 13, -1, -1, 14, -1, -1, 15,\n                                               -1, -1, 16, -1, -1, 17, -1, -1,\n                                               18, -1, -1, 19, -1, -1, 20, -1);\n               __m256i BCAm = _mm256_setr_epi8(-1, 21, -1, -1, 22, -1, -1, 23,\n                                               -1, -1, 24, -1, -1, 25, -1, -1,\n                                               -1,  5, -1, -1,  6, -1, -1,  7,\n                                               -1, -1,  8, -1, -1,  9, -1, -1);\n\n               __m256i RAC = _mm256_shuffle_epi8({in1}, RACm);\n               __m256i GAC = _mm256_shuffle_epi8({in2}, GACm);\n 
              __m256i BAC = _mm256_shuffle_epi8({in3}, BACm);\n\n               __m256i RBB = _mm256_shuffle_epi8({in1}, RBBm);\n               __m256i GBB = _mm256_shuffle_epi8({in2}, GBBm);\n               __m256i BBB = _mm256_shuffle_epi8({in3}, BBBm);\n\n               __m256i RCA = _mm256_shuffle_epi8({in1}, RCAm);\n               __m256i GCA = _mm256_shuffle_epi8({in2}, GCAm);\n               __m256i BCA = _mm256_shuffle_epi8({in3}, BCAm);\n\n               __m256i AC = _mm256_or_si256(RAC, GAC);\n               AC = _mm256_or_si256(AC, BAC);\n\n               __m256i B = _mm256_or_si256(RBB, GBB);\n               B = _mm256_or_si256(B, BBB);\n\n               __m256i CA = _mm256_or_si256(RCA, GCA);\n               CA = _mm256_or_si256(CA, BCA);\n\n               __m256i A = _mm256_permute2f128_si256(AC, CA, 2 << 4);\n               __m256i C = _mm256_permute2f128_si256(AC, CA, (1 << 4) | 3);\n\n               {store}'''.format(store=store3('avx', typ, align, fmtspec,\n                                              'A', 'B', 'C'), **fmtspec)\n        else:\n            return \\\n            '''__m128i Ra = {exlo_in1};\n               __m128i Rb = {exhi_in1};\n               __m128i Ga = {exlo_in2};\n               __m128i Gb = {exhi_in2};\n               __m128i Ba = {exlo_in3};\n               __m128i Bb = {exhi_in3};\n\n               __m128i RAm = _mm_set_epi8( 5, -1, -1,  4, -1, -1,  3, -1,\n                                          -1,  2, -1, -1,  1, -1, -1,  0);\n               __m128i GAm = _mm_set_epi8(-1, -1,  4, -1, -1,  3, -1, -1,\n                                           2, -1, -1,  1, -1, -1,  0, -1);\n               __m128i BAm = _mm_set_epi8(-1,  4, -1, -1,  3, -1, -1,  2,\n                                          -1, -1,  1, -1, -1,  0, -1, -1);\n               __m128i RA = _mm_shuffle_epi8(Ra, RAm);\n               __m128i GA = _mm_shuffle_epi8(Ga, GAm);\n               __m128i BA = _mm_shuffle_epi8(Ba, BAm);\n               __m128i A0 = 
_mm_or_si128(RA, GA);\n               A0 = _mm_or_si128(A0, BA);\n\n               RA = _mm_shuffle_epi8(Rb, RAm);\n               GA = _mm_shuffle_epi8(Gb, GAm);\n               BA = _mm_shuffle_epi8(Bb, BAm);\n               __m128i A1 = _mm_or_si128(RA, GA);\n               A1 = _mm_or_si128(A1, BA);\n\n               __m128i RBm = _mm_set_epi8(-1, 10, -1, -1,  9, -1, -1,  8,\n                                          -1, -1,  7, -1, -1,  6, -1, -1);\n               __m128i GBm = _mm_set_epi8(10, -1, -1,  9, -1, -1,  8, -1,\n                                          -1,  7, -1, -1,  6, -1, -1,  5);\n               __m128i BBm = _mm_set_epi8(-1, -1,  9, -1, -1,  8, -1, -1,\n                                           7, -1, -1,  6, -1, -1,  5, -1);\n               __m128i RB = _mm_shuffle_epi8(Ra, RBm);\n               __m128i GB = _mm_shuffle_epi8(Ga, GBm);\n               __m128i BB = _mm_shuffle_epi8(Ba, BBm);\n               __m128i B0 = _mm_or_si128(RB, GB);\n               B0 = _mm_or_si128(B0, BB);\n\n               RB = _mm_shuffle_epi8(Rb, RBm);\n               GB = _mm_shuffle_epi8(Gb, GBm);\n               BB = _mm_shuffle_epi8(Bb, BBm);\n               __m128i B1 = _mm_or_si128(RB, GB);\n               B1 = _mm_or_si128(B1, BB);\n\n               __m128i RCm = _mm_set_epi8(-1, -1, 15, -1, -1, 14, -1, -1,\n                                          13, -1, -1, 12, -1, -1, 11, -1);\n               __m128i GCm = _mm_set_epi8(-1, 15, -1, -1, 14, -1, -1, 13,\n                                          -1, -1, 12, -1, -1, 11, -1, -1);\n               __m128i BCm = _mm_set_epi8(15, -1, -1, 14, -1, -1, 13, -1,\n                                          -1, 12, -1, -1, 11, -1, -1, 10);\n               __m128i RC = _mm_shuffle_epi8(Ra, RCm);\n               __m128i GC = _mm_shuffle_epi8(Ga, GCm);\n               __m128i BC = _mm_shuffle_epi8(Ba, BCm);\n               __m128i C0 = _mm_or_si128(RC, GC);\n               C0 = _mm_or_si128(C0, BC);\n\n               RC 
= _mm_shuffle_epi8(Rb, RCm);\n               GC = _mm_shuffle_epi8(Gb, GCm);\n               BC = _mm_shuffle_epi8(Bb, BCm);\n               __m128i C1 = _mm_or_si128(RC, GC);\n               C1 = _mm_or_si128(C1, BC);\n\n               __m256i A = {mergeA0B0};\n               __m256i B = {mergeC0A1};\n               __m256i C = {mergeB1C1};\n\n               {store}'''.format(mergeA0B0=x86.setr('avx', typ, 'A0', 'B0'),\n                                 mergeC0A1=x86.setr('avx', typ, 'C0', 'A1'),\n                                 mergeB1C1=x86.setr('avx', typ, 'B1', 'C1'),\n                                 store=store3('avx', typ, align, fmtspec,\n                                              'A', 'B', 'C'), **fmtspec)\n    if typ in ['i16', 'u16']:\n        if simd_ext == 'avx2':\n            return \\\n            '''__m256i RACm = _mm256_setr_epi8( 0,  1, -1, -1, -1, -1,  2,  3,\n                                               -1, -1, -1, -1,  4,  5, -1, -1,\n                                               -1, -1, -1, -1, 12, 13, -1, -1,\n                                               -1, -1, 14, 15, -1, -1, -1, -1);\n               __m256i RBBm = _mm256_setr_epi8(-1, -1, -1, -1, 12, 13, -1, -1,\n                                               -1, -1, 14, 15, -1, -1, -1, -1,\n                                                0,  1, -1, -1, -1, -1,  2,  3,\n                                               -1, -1, -1, -1,  4,  5, -1, -1);\n               __m256i RCAm = _mm256_setr_epi8(-1, -1,  6,  7, -1, -1, -1, -1,\n                                                8,  9, -1, -1, -1, -1, 10, 11,\n                                               -1, -1,  6,  7, -1, -1, -1, -1,\n                                                8,  9, -1, -1, -1, -1, 10, 11);\n\n               __m256i GACm = _mm256_setr_epi8(-1, -1,  0,  1, -1, -1, -1, -1,\n                                                2,  3, -1, -1, -1, -1,  4,  5,\n                                               10, 11, -1, 
-1, -1, -1, 12, 13,\n                                               -1, -1, -1, -1, 14, 15, -1, -1);\n               __m256i GBBm = _mm256_setr_epi8(10, 11, -1, -1, -1, -1, 12, 13,\n                                               -1, -1, -1, -1, 14, 15, -1, -1,\n                                               -1, -1,  0,  1, -1, -1, -1, -1,\n                                                2,  3, -1, -1, -1, -1,  4,  5);\n               __m256i GCAm = _mm256_setr_epi8(-1, -1, -1, -1,  6,  7, -1, -1,\n                                               -1, -1,  8,  9, -1, -1, -1, -1,\n                                               -1, -1, -1, -1,  6,  7, -1, -1,\n                                               -1, -1,  8,  9, -1, -1, -1, -1);\n\n               __m256i BACm = _mm256_setr_epi8(-1, -1, -1, -1,  0,  1, -1, -1,\n                                               -1, -1,  2,  3, -1, -1, -1, -1,\n                                               -1, -1, 10, 11, -1, -1, -1, -1,\n                                               12, 13, -1, -1, -1, -1, 14, 15);\n               __m256i BBBm = _mm256_setr_epi8(-1, -1, 10, 11, -1, -1, -1, -1,\n                                               12, 13, -1, -1, -1, -1, 14, 15,\n                                               -1, -1, -1, -1,  0,  1, -1, -1,\n                                               -1, -1,  2,  3, -1, -1, -1, -1);\n               __m256i BCAm = _mm256_setr_epi8( 4,  5, -1, -1, -1, -1,  6,  7,\n                                               -1, -1, -1, -1,  8,  9, -1, -1,\n                                                4,  5, -1, -1, -1, -1,  6,  7,\n                                                -1, -1, -1, -1,  8,  9, -1, -1);\n\n               __m256i RAC = _mm256_shuffle_epi8({in1}, RACm);\n               __m256i GAC = _mm256_shuffle_epi8({in2}, GACm);\n               __m256i BAC = _mm256_shuffle_epi8({in3}, BACm);\n\n               __m256i RBB = _mm256_shuffle_epi8({in1}, RBBm);\n               __m256i GBB = 
_mm256_shuffle_epi8({in2}, GBBm);\n               __m256i BBB = _mm256_shuffle_epi8({in3}, BBBm);\n\n               __m256i RCA = _mm256_shuffle_epi8({in1}, RCAm);\n               __m256i GCA = _mm256_shuffle_epi8({in2}, GCAm);\n               __m256i BCA = _mm256_shuffle_epi8({in3}, BCAm);\n\n               __m256i AC = _mm256_or_si256(RAC, GAC);\n               AC = _mm256_or_si256(AC, BAC);\n\n               __m256i B = _mm256_or_si256(RBB, GBB);\n               B = _mm256_or_si256(B, BBB);\n\n               __m256i CA = _mm256_or_si256(RCA, GCA);\n               CA = _mm256_or_si256(CA, BCA);\n\n               __m256i A = _mm256_permute2f128_si256(AC, CA, 2 << 4);\n               __m256i C = _mm256_permute2f128_si256(AC, CA, (1 << 4) | 3);\n\n               {store}'''.format(store=store3('avx', typ, align, fmtspec,\n                                              'A', 'B', 'C'), **fmtspec)\n        else:\n            return \\\n            '''__m128i Ra = {exlo_in1};\n               __m128i Rb = {exhi_in1};\n               __m128i Ga = {exlo_in2};\n               __m128i Gb = {exhi_in2};\n               __m128i Ba = {exlo_in3};\n               __m128i Bb = {exhi_in3};\n\n               __m128i RAm = _mm_set_epi8(-1, -1,  5,  4, -1, -1, -1, -1,\n                                           3,  2, -1, -1, -1, -1,  1,  0);\n               __m128i GAm = _mm_set_epi8( 5,  4, -1, -1, -1, -1,  3,  2,\n                                          -1, -1, -1, -1,  1,  0, -1, -1);\n               __m128i BAm = _mm_set_epi8(-1, -1, -1, -1,  3,  2, -1, -1,\n                                          -1, -1,  1,  0, -1, -1, -1, -1);\n               __m128i RA = _mm_shuffle_epi8(Ra, RAm);\n               __m128i GA = _mm_shuffle_epi8(Ga, GAm);\n               __m128i BA = _mm_shuffle_epi8(Ba, BAm);\n               __m128i A0 = _mm_or_si128(RA, GA);\n               A0 = _mm_or_si128(A0, BA);\n\n               RA = _mm_shuffle_epi8(Rb, RAm);\n               GA = _mm_shuffle_epi8(Gb, 
GAm);\n               BA = _mm_shuffle_epi8(Bb, BAm);\n               __m128i A1 = _mm_or_si128(RA, GA);\n               A1 = _mm_or_si128(A1, BA);\n\n               __m128i RBm = _mm_set_epi8(11, 10, -1, -1, -1, -1,  9,  8,\n                                          -1, -1, -1, -1,  7,  6, -1, -1);\n               __m128i GBm = _mm_set_epi8(-1, -1, -1, -1,  9,  8, -1, -1,\n                                          -1, -1,  7,  6, -1, -1, -1, -1);\n               __m128i BBm = _mm_set_epi8(-1, -1,  9,  8, -1, -1, -1, -1,\n                                           7,  6, -1, -1, -1, -1,  5,  4);\n               __m128i RB = _mm_shuffle_epi8(Ra, RBm);\n               __m128i GB = _mm_shuffle_epi8(Ga, GBm);\n               __m128i BB = _mm_shuffle_epi8(Ba, BBm);\n               __m128i B0 = _mm_or_si128(RB, GB);\n               B0 = _mm_or_si128(B0, BB);\n\n               RB = _mm_shuffle_epi8(Rb, RBm);\n               GB = _mm_shuffle_epi8(Gb, GBm);\n               BB = _mm_shuffle_epi8(Bb, BBm);\n               __m128i B1 = _mm_or_si128(RB, GB);\n               B1 = _mm_or_si128(B1, BB);\n\n               __m128i RCm = _mm_set_epi8(-1, -1, -1, -1, 15, 14, -1, -1,\n                                          -1, -1, 13, 12, -1, -1, -1, -1);\n               __m128i GCm = _mm_set_epi8(-1, -1, 15, 14, -1, -1, -1, -1,\n                                          13, 12, -1, -1, -1, -1, 11, 10);\n               __m128i BCm = _mm_set_epi8(15, 14, -1, -1, -1, -1, 13, 12,\n                                          -1, -1, -1, -1, 11, 10, -1, -1);\n               __m128i RC = _mm_shuffle_epi8(Ra, RCm);\n               __m128i GC = _mm_shuffle_epi8(Ga, GCm);\n               __m128i BC = _mm_shuffle_epi8(Ba, BCm);\n               __m128i C0 = _mm_or_si128(RC, GC);\n               C0 = _mm_or_si128(C0, BC);\n\n               RC = _mm_shuffle_epi8(Rb, RCm);\n               GC = _mm_shuffle_epi8(Gb, GCm);\n               BC = _mm_shuffle_epi8(Bb, BCm);\n               __m128i C1 = 
_mm_or_si128(RC, GC);\n               C1 = _mm_or_si128(C1, BC);\n\n               __m256i A = {mergeA0B0};\n               __m256i B = {mergeC0A1};\n               __m256i C = {mergeB1C1};\n\n               {store}'''.format(mergeA0B0=x86.setr('avx', typ, 'A0', 'B0'),\n                                 mergeC0A1=x86.setr('avx', typ, 'C0', 'A1'),\n                                 mergeB1C1=x86.setr('avx', typ, 'B1', 'C1'),\n                                 store=store3('avx', typ, align, fmtspec,\n                                              'A', 'B', 'C'), **fmtspec)\n    avx2_template = \\\n    '''__m256i RAm = _mm256_setr_epi32( 0, -1, -1,  1, -1, -1,  2, -1);\n       __m256i RBm = _mm256_setr_epi32(-1,  3, -1, -1,  4, -1, -1,  5);\n       __m256i RCm = _mm256_setr_epi32(-1, -1,  6, -1, -1,  7, -1, -1);\n\n       __m256i GAm = _mm256_setr_epi32(-1,  0, -1, -1,  1, -1, -1,  2);\n       __m256i GBm = _mm256_setr_epi32(-1, -1,  3, -1, -1,  4, -1, -1);\n       __m256i GCm = _mm256_setr_epi32( 5, -1, -1,  6, -1, -1,  7, -1);\n\n       __m256i BAm = _mm256_setr_epi32(-1, -1,  0, -1, -1,  1, -1, -1);\n       __m256i BBm = _mm256_setr_epi32( 2, -1, -1,  3, -1, -1,  4, -1);\n       __m256i BCm = _mm256_setr_epi32(-1,  5, -1, -1,  6, -1, -1,  7);\n\n       {styp} RA = _mm256_permutevar8x32{suf}({in1}, RAm);\n       {styp} RB = _mm256_permutevar8x32{suf}({in1}, RBm);\n       {styp} RC = _mm256_permutevar8x32{suf}({in1}, RCm);\n\n       {styp} GA = _mm256_permutevar8x32{suf}({in2}, GAm);\n       {styp} GB = _mm256_permutevar8x32{suf}({in2}, GBm);\n       {styp} GC = _mm256_permutevar8x32{suf}({in2}, GCm);\n\n       {styp} BA = _mm256_permutevar8x32{suf}({in3}, BAm);\n       {styp} BB = _mm256_permutevar8x32{suf}({in3}, BBm);\n       {styp} BC = _mm256_permutevar8x32{suf}({in3}, BCm);\n\n       {styp} A = _mm256_blend{suf}(RA, GA, 2 + 16 + 128);\n       A = _mm256_blend{suf}(A, BA, 4 + 32);\n\n       {styp} B = _mm256_blend{suf}(RB, GB, 4 + 32);\n       B = 
_mm256_blend{suf}(B, BB, 1 + 8 + 64);\n\n       {styp} C = _mm256_blend{suf}(RC, GC, 1 + 8 + 64);\n       C = _mm256_blend{suf}(C, BC, 2 + 16 + 128);\n\n       {store}'''.format(store=store3('avx', typ, align, fmtspec,\n                                      'A', 'B', 'C'), **fmtspec)\n    if typ == 'f32':\n        if simd_ext == 'avx2':\n            return avx2_template\n        else:\n            return \\\n            '''__m256i RAm = _mm256_setr_epi32( 0, -1, -1,  1, -1, -1,  2, -1);\n               __m256i RBm = _mm256_setr_epi32(-1,  3, -1, -1,  4, -1, -1,  5);\n               __m256i RCm = _mm256_setr_epi32(-1, -1,  6, -1, -1,  7, -1, -1);\n\n               __m256i GAm = _mm256_setr_epi32(-1,  0, -1, -1,  1, -1, -1,  2);\n               __m256i GBm = _mm256_setr_epi32(-1, -1,  3, -1, -1,  4, -1, -1);\n               __m256i GCm = _mm256_setr_epi32( 5, -1, -1,  6, -1, -1,  7, -1);\n\n               __m256i BAm = _mm256_setr_epi32(-1, -1,  0, -1, -1,  1, -1, -1);\n               __m256i BBm = _mm256_setr_epi32( 2, -1, -1,  3, -1, -1,  4, -1);\n               __m256i BCm = _mm256_setr_epi32(-1,  5, -1, -1,  6, -1, -1,  7);\n\n               __m256 RA = _mm256_permutevar_ps({in1}, RAm);\n               __m256 RB = _mm256_permutevar_ps({in1}, RBm);\n               __m256 RC = _mm256_permutevar_ps({in1}, RCm);\n\n               __m256 GA = _mm256_permutevar_ps({in2}, GAm);\n               __m256 GB = _mm256_permutevar_ps({in2}, GBm);\n               __m256 GC = _mm256_permutevar_ps({in2}, GCm);\n\n               __m256 BA = _mm256_permutevar_ps({in3}, BAm);\n               __m256 BB = _mm256_permutevar_ps({in3}, BBm);\n               __m256 BC = _mm256_permutevar_ps({in3}, BCm);\n\n               __m256 A1 = _mm256_blend_ps(RA, GA, 2 + 16 + 128);\n               A1 = _mm256_blend_ps(A1, BA, 4 + 32);\n\n               __m256 B = _mm256_blend_ps(RB, GB, 4 + 32);\n               B = _mm256_blend_ps(B, BB, 1 + 8 + 64);\n\n               __m256 C1 = _mm256_blend_ps(RC, 
GC, 1 + 8 + 64);\n               C1 = _mm256_blend_ps(C1, BC, 2 + 16 + 128);\n\n               __m256 A = _mm256_permute2f128_ps(A1, C1, 2 << 4);\n               __m256 C = _mm256_permute2f128_ps(A1, C1, (3 << 4) | 1);\n\n               {store}'''.format(avx2_template=avx2_template,\n                                 store=store3('avx', typ, align, fmtspec,\n                                              'A', 'B', 'C'), **fmtspec)\n    if typ in ['i32', 'u32']:\n        if simd_ext == 'avx2':\n            return avx2_template\n        else:\n            return \\\n            '''nsimd_store3{a}_avx_f32((f32 *){in0},\n                                       _mm256_castsi256_ps({in1}),\n                                       _mm256_castsi256_ps({in2}),\n                                       _mm256_castsi256_ps({in3}));'''. \\\n                                       format(**fmtspec)\n    if typ == 'f64':\n        return \\\n        '''__m256d invv1 = _mm256_permute_pd({in2}, 1 + 4);\n           __m256d A1C0 = _mm256_blend_pd({in1}, {in3}, 1 + 4);\n           __m256d A0B1 = _mm256_blend_pd({in1}, invv1, 2 + 8);\n           __m256d B0C1 = _mm256_blend_pd(invv1, {in3}, 2 + 8);\n\n           __m256d A = _mm256_permute2f128_pd(A0B1, A1C0, 2 << 4);\n           __m256d B = _mm256_blend_pd(B0C1, A0B1, 4 + 8);\n           __m256d C = _mm256_permute2f128_pd(A1C0, B0C1, (3 << 4) |  1);\n\n           {store}'''.format(store=store3('avx', typ, align, fmtspec,\n                                          'A', 'B', 'C'), **fmtspec)\n    if typ in ['i64', 'u64']:\n        return \\\n        '''nsimd_store3{a}_{simd_ext}_f64((f64 *){in0},\n                                          _mm256_castsi256_pd({in1}),\n                                          _mm256_castsi256_pd({in2}),\n                                          _mm256_castsi256_pd({in3}));'''. 
\\\n                                          format(**fmtspec)\n\n###############################################################################\n\ndef load3_avx512(simd_ext, typ, align, fmtspec2):\n    fmtspec = fmtspec2.copy()\n    fmtspec['load_v0v1v2'] = get_load_v0v1v2(simd_ext, typ, align, fmtspec)\n    fmtspec['exlo_v0'] = x86.extract(simd_ext, typ, x86.LO, 'v0')\n    fmtspec['exhi_v0'] = x86.extract(simd_ext, typ, x86.HI, 'v0')\n    fmtspec['exlo_v1'] = x86.extract(simd_ext, typ, x86.LO, 'v1')\n    fmtspec['exhi_v1'] = x86.extract(simd_ext, typ, x86.HI, 'v1')\n    fmtspec['exlo_v2'] = x86.extract(simd_ext, typ, x86.LO, 'v2')\n    fmtspec['exhi_v2'] = x86.extract(simd_ext, typ, x86.HI, 'v2')\n    fmtspec['a'] = 'a' if align else 'u'\n    if typ in ['i8', 'u8']:\n        return \\\n        '''nsimd_{simd_ext}_v{typ}x3 ret;\n\n           {load_v0v1v2}\n\n           __m256i A0in = {exlo_v0};\n           __m256i B0in = {exhi_v0};\n           __m256i C0in = {exlo_v1};\n           __m256i A1in = {exhi_v1};\n           __m256i B1in = {exlo_v2};\n           __m256i C1in = {exhi_v2};\n\n\t   __m256i ARmask = _mm256_setr_epi8( 0,  3,  6,  9, 12, 15, -1, -1,\n\t                                     -1, -1, -1, -1, -1, -1, -1, -1,\n                                             -1, -1, -1, -1, -1, -1,  2,  5,\n                                              8, 11, 14, -1, -1, -1, -1, -1);\n           __m256i BRmask = _mm256_setr_epi8(-1, -1, -1, -1, -1, -1, -1, -1,\n                                             -1, -1, -1,  1,  4,  7, 10, 13,\n                                              0,  3,  6,  9, 12, 15, -1, -1,\n                                              -1, -1, -1, -1, -1, -1, -1, -1);\n           __m256i CRmask = _mm256_setr_epi8(-1, -1, -1, -1, -1, -1,  2,  5,\n                                              8, 11, 14, -1, -1, -1, -1, -1,\n                                             -1, -1, -1, -1, -1, -1, -1, -1,\n                                             
-1, -1, -1,  1,  4,  7, 10, 13);\n\n           __m256i AR = _mm256_shuffle_epi8(A0in, ARmask);\n           __m256i BR = _mm256_shuffle_epi8(B0in, BRmask);\n           __m256i CR = _mm256_shuffle_epi8(C0in, CRmask);\n           __m256i DR = _mm256_permute2f128_si256(AR, CR, (2 << 4) | 1);\n\n           __m256i R0 = _mm256_or_si256(AR, BR);\n           __m256i R1 = _mm256_or_si256(BR, CR);\n           __m256i R2 = _mm256_permute2f128_si256(R0, R1, 3 << 4);\n           __m256i R3 = _mm256_or_si256(DR, R2);\n\n           AR = _mm256_shuffle_epi8(A1in, ARmask);\n           BR = _mm256_shuffle_epi8(B1in, BRmask);\n           CR = _mm256_shuffle_epi8(C1in, CRmask);\n           DR = _mm256_permute2f128_si256(AR, CR, (2 << 4) | 1);\n\n           R0 = _mm256_or_si256(AR, BR);\n           R1 = _mm256_or_si256(BR, CR);\n           R2 = _mm256_permute2f128_si256(R0, R1, 3 << 4);\n           __m256i R3b = _mm256_or_si256(DR, R2);\n\n           __m256i AGmask = _mm256_setr_epi8( 1,  4,  7, 10, 13, -1, -1, -1,\n                                             -1, -1, -1, -1, -1, -1, -1, -1,\n                                             -1, -1, -1, -1, -1,  0,  3,  6,\n                                              9, 12, 15, -1, -1, -1, -1, -1);\n           __m256i BGmask = _mm256_setr_epi8(-1, -1, -1, -1, -1, -1, -1, -1,\n                                             -1, -1, -1,  2,  5,  8, 11, 14,\n                                              1,  4,  7, 10, 13, -1, -1, -1,\n                                             -1, -1, -1, -1, -1, -1, -1, -1);\n           __m256i CGmask = _mm256_setr_epi8(-1, -1, -1, -1, -1,  0,  3,  6,\n                                              9, 12, 15, -1, -1, -1, -1, -1,\n                                             -1, -1, -1, -1, -1, -1, -1, -1,\n                                             -1, -1, -1,  2,  5,  8, 11, 14);\n\n           __m256i AG = _mm256_shuffle_epi8(A0in, AGmask);\n           __m256i BG = _mm256_shuffle_epi8(B0in, BGmask);\n      
     __m256i CG = _mm256_shuffle_epi8(C0in, CGmask);\n           __m256i DG = _mm256_permute2f128_si256(AG, CG, (2 << 4) | 1);\n\n           __m256i G0 = _mm256_or_si256(AG, BG);\n           __m256i G1 = _mm256_or_si256(BG, CG);\n           __m256i G2 = _mm256_permute2f128_si256(G0, G1, 3 << 4);\n           __m256i G3 = _mm256_or_si256(DG, G2);\n\n           AG = _mm256_shuffle_epi8(A1in, AGmask);\n           BG = _mm256_shuffle_epi8(B1in, BGmask);\n           CG = _mm256_shuffle_epi8(C1in, CGmask);\n           DG = _mm256_permute2f128_si256(AG, CG, (2 << 4) | 1);\n\n           G0 = _mm256_or_si256(AG, BG);\n           G1 = _mm256_or_si256(BG, CG);\n           G2 = _mm256_permute2f128_si256(G0, G1, 3 << 4);\n           __m256i G3b = _mm256_or_si256(DG, G2);\n\n           __m256i ABmask = _mm256_setr_epi8( 2,  5,  8, 11, 14, -1, -1, -1,\n                                             -1, -1, -1, -1, -1, -1, -1, -1,\n                                             -1, -1, -1, -1, -1,  1,  4,  7,\n                                             10, 13, -1, -1, -1, -1, -1, -1);\n           __m256i BBmask = _mm256_setr_epi8(-1, -1, -1, -1, -1, -1, -1, -1,\n                                             -1, -1,  0,  3,  6,  9, 12, 15,\n                                              2,  5,  8, 11, 14, -1, -1, -1,\n                                             -1, -1, -1, -1, -1, -1, -1, -1);\n           __m256i CBmask = _mm256_setr_epi8(-1, -1, -1, -1, -1,  1,  4,  7,\n                                             10, 13, -1, -1, -1, -1, -1, -1,\n                                             -1, -1, -1, -1, -1, -1, -1, -1,\n                                             -1, -1,  0,  3,  6,  9, 12, 15);\n\n           __m256i AB = _mm256_shuffle_epi8(A0in, ABmask);\n           __m256i BB = _mm256_shuffle_epi8(B0in, BBmask);\n           __m256i CB = _mm256_shuffle_epi8(C0in, CBmask);\n           __m256i DB = _mm256_permute2f128_si256(AB, CB, (2 << 4) | 1);\n\n           __m256i B0 = 
_mm256_or_si256(AB, BB);\n           __m256i B1 = _mm256_or_si256(BB, CB);\n           __m256i B2 = _mm256_permute2f128_si256(B0, B1, 3 << 4);\n           __m256i B3 = _mm256_or_si256(DB, B2);\n\n           AB = _mm256_shuffle_epi8(A1in, ABmask);\n           BB = _mm256_shuffle_epi8(B1in, BBmask);\n           CB = _mm256_shuffle_epi8(C1in, CBmask);\n           DB = _mm256_permute2f128_si256(AB, CB, (2 << 4) | 1);\n\n           B0 = _mm256_or_si256(AB, BB);\n           B1 = _mm256_or_si256(BB, CB);\n           B2 = _mm256_permute2f128_si256(B0, B1, 3 << 4);\n           __m256i B3b = _mm256_or_si256(DB, B2);\n\n           ret.v0 = {mergeR};\n           ret.v1 = {mergeG};\n           ret.v2 = {mergeB};\n\n           return ret;'''. \\\n           format(mergeR=x86.setr(simd_ext, typ, 'R3', 'R3b'),\n                  mergeG=x86.setr(simd_ext, typ, 'G3', 'G3b'),\n                  mergeB=x86.setr(simd_ext, typ, 'B3', 'B3b'),\n                  **fmtspec)\n    if typ in ['i16', 'u16']:\n        return \\\n        '''nsimd_{simd_ext}_v{typ}x3 ret;\n\n           {load_v0v1v2}\n\n           __m256i A0a = {exlo_v0};\n           __m256i B0a = {exhi_v0};\n           __m256i C0a = {exlo_v1};\n           __m256i A0b = {exhi_v1};\n           __m256i B0b = {exlo_v2};\n           __m256i C0b = {exhi_v2};\n\n           __m256i ARmask = _mm256_setr_epi8( 0,  1,  6,  7, 12, 13, -1, -1,\n                                             -1, -1, -1, -1, -1, -1, -1, -1,\n                                             -1, -1, -1, -1, -1, -1,  2,  3,\n                                              8,  9, 14, 15, -1, -1, -1, -1);\n           __m256i BRmask = _mm256_setr_epi8(-1, -1, -1, -1, -1, -1, -1, -1,\n                                             -1, -1, -1, -1,  4,  5, 10, 11,\n                                              0,  1,  6,  7, 12, 13, -1, -1,\n                                             -1, -1, -1, -1, -1, -1, -1, -1);\n           __m256i CRmask = _mm256_setr_epi8(-1, -1, -1, -1, 
-1, -1,  2,  3,\n                                              8,  9, 14, 15, -1, -1, -1, -1,\n                                             -1, -1, -1, -1, -1, -1, -1, -1,\n                                             -1, -1, -1, -1,  4,  5, 10, 11);\n\n           __m256i AR = _mm256_shuffle_epi8(A0a, ARmask);\n           __m256i BR = _mm256_shuffle_epi8(B0a, BRmask);\n           __m256i CR = _mm256_shuffle_epi8(C0a, CRmask);\n           __m256i DR = _mm256_permute2f128_si256(AR, CR, (2 << 4) | 1);\n\n           __m256i R0 = _mm256_or_si256(AR, BR);\n           __m256i R1 = _mm256_or_si256(BR, CR);\n           __m256i R2 = _mm256_permute2f128_si256(R0, R1, 3 << 4);\n           __m256i R3a = _mm256_or_si256(DR, R2);\n\n           AR = _mm256_shuffle_epi8(A0b, ARmask);\n           BR = _mm256_shuffle_epi8(B0b, BRmask);\n           CR = _mm256_shuffle_epi8(C0b, CRmask);\n           DR = _mm256_permute2f128_si256(AR, CR, (2 << 4) | 1);\n\n           R0 = _mm256_or_si256(AR, BR);\n           R1 = _mm256_or_si256(BR, CR);\n           R2 = _mm256_permute2f128_si256(R0, R1, 3 << 4);\n           __m256i R3b = _mm256_or_si256(DR, R2);\n\n           __m256i AGmask = _mm256_setr_epi8( 2,  3,  8,  9, 14, 15, -1, -1,\n                                             -1, -1, -1, -1, -1, -1, -1, -1,\n                                             -1, -1, -1, -1, -1, -1,  4,  5,\n                                             10, 11, -1, -1, -1, -1, -1, -1);\n           __m256i BGmask = _mm256_setr_epi8(-1, -1, -1, -1, -1, -1, -1, -1,\n                                             -1, -1,  0,  1,  6,  7, 12, 13,\n                                              2,  3,  8,  9, 14, 15, -1, -1,\n                                             -1, -1, -1, -1, -1, -1, -1, -1);\n           __m256i CGmask = _mm256_setr_epi8(-1, -1, -1, -1, -1, -1,  4,  5,\n                                             10, 11, -1, -1, -1, -1, -1, -1,\n                                             -1, -1, -1, -1, -1, -1, 
-1, -1,\n                                             -1, -1,  0,  1,  6,  7, 12, 13);\n\n           __m256i AG = _mm256_shuffle_epi8(A0a, AGmask);\n           __m256i BG = _mm256_shuffle_epi8(B0a, BGmask);\n           __m256i CG = _mm256_shuffle_epi8(C0a, CGmask);\n           __m256i DG = _mm256_permute2f128_si256(AG, CG, (2 << 4) | 1);\n\n           __m256i G0 = _mm256_or_si256(AG, BG);\n           __m256i G1 = _mm256_or_si256(BG, CG);\n           __m256i G2 = _mm256_permute2f128_si256(G0, G1, 3 << 4);\n           __m256i G3a = _mm256_or_si256(DG, G2);\n\n           AG = _mm256_shuffle_epi8(A0b, AGmask);\n           BG = _mm256_shuffle_epi8(B0b, BGmask);\n           CG = _mm256_shuffle_epi8(C0b, CGmask);\n           DG = _mm256_permute2f128_si256(AG, CG, (2 << 4) | 1);\n\n           G0 = _mm256_or_si256(AG, BG);\n           G1 = _mm256_or_si256(BG, CG);\n           G2 = _mm256_permute2f128_si256(G0, G1, 3 << 4);\n           __m256i G3b = _mm256_or_si256(DG, G2);\n\n           __m256i ABmask = _mm256_setr_epi8( 4,  5, 10, 11, -1, -1, -1, -1,\n                                             -1, -1, -1, -1, -1, -1, -1, -1,\n                                             -1, -1, -1, -1,  0,  1,  6,  7,\n                                             12, 13, -1, -1, -1, -1, -1, -1);\n           __m256i BBmask = _mm256_setr_epi8(-1, -1, -1, -1, -1, -1, -1, -1,\n                                             -1, -1,  2,  3,  8,  9, 14, 15,\n                                              4,  5, 10, 11, -1, -1, -1, -1,\n                                             -1, -1, -1, -1, -1, -1, -1, -1);\n           __m256i CBmask = _mm256_setr_epi8(-1, -1, -1, -1,  0,  1,  6,  7,\n                                             12, 13, -1, -1, -1, -1, -1, -1,\n                                             -1, -1, -1, -1, -1, -1, -1, -1,\n                                             -1, -1,  2,  3,  8,  9, 14, 15);\n\n           __m256i AB = _mm256_shuffle_epi8(A0a, ABmask);\n           
__m256i BB = _mm256_shuffle_epi8(B0a, BBmask);\n           __m256i CB = _mm256_shuffle_epi8(C0a, CBmask);\n           __m256i DB = _mm256_permute2f128_si256(AB, CB, (2 << 4) | 1);\n\n           __m256i B0 = _mm256_or_si256(AB, BB);\n           __m256i B1 = _mm256_or_si256(BB, CB);\n           __m256i B2 = _mm256_permute2f128_si256(B0, B1, 3 << 4);\n           __m256i B3a = _mm256_or_si256(DB, B2);\n\n           AB = _mm256_shuffle_epi8(A0b, ABmask);\n           BB = _mm256_shuffle_epi8(B0b, BBmask);\n           CB = _mm256_shuffle_epi8(C0b, CBmask);\n           DB = _mm256_permute2f128_si256(AB, CB, (2 << 4) | 1);\n\n           B0 = _mm256_or_si256(AB, BB);\n           B1 = _mm256_or_si256(BB, CB);\n           B2 = _mm256_permute2f128_si256(B0, B1, 3 << 4);\n           __m256i B3b = _mm256_or_si256(DB, B2);\n\n           ret.v0 = {mergeR};\n           ret.v1 = {mergeG};\n           ret.v2 = {mergeB};\n\n           return ret;'''. \\\n           format(mergeR=x86.setr(simd_ext, typ, 'R3a', 'R3b'),\n                  mergeG=x86.setr(simd_ext, typ, 'G3a', 'G3b'),\n                  mergeB=x86.setr(simd_ext, typ, 'B3a', 'B3b'),\n                  **fmtspec)\n    if typ in ['f32', 'i32', 'u32']:\n        return \\\n        '''nsimd_{simd_ext}_v{typ}x3 ret;\n\n           {load_v0v1v2}\n\n           __m512i RABm  = _mm512_setr_epi32( 0,  3,  6,  9, 12, 15, 18, 21,\n                                             24, 27, 30,  0,  0,  0 , 0,  0);\n           __m512i RABCm = _mm512_setr_epi32( 0,  1,  2,  3,  4,  5,  6 , 7,\n                                              8,  9, 10, 17, 20, 23, 26, 29);\n           __m512i GABm  = _mm512_setr_epi32( 1,  4,  7, 10, 13, 16, 19, 22,\n                                             25, 28, 31,  0,  0,  0 , 0,  0);\n           __m512i GABCm = _mm512_setr_epi32( 0,  1,  2,  3,  4,  5,  6 , 7,\n                                              8,  9, 10, 18, 21, 24, 27, 30);\n           __m512i BABm  = _mm512_setr_epi32( 2,  5,  8, 11, 14, 17, 
20, 23,\n                                             26, 29,  0,  0,  0,  0 , 0,  0);\n           __m512i BABCm = _mm512_setr_epi32( 0,  1,  2,  3,  4,  5,  6 , 7,\n                                              8,  9, 16, 19, 22, 25, 28, 31);\n\n           {styp} R = _mm512_permutex2var{suf}(v0, RABm, v1);\n           ret.v0 = _mm512_permutex2var{suf}(R, RABCm, v2);\n           {styp} G = _mm512_permutex2var{suf}(v0, GABm, v1);\n           ret.v1 = _mm512_permutex2var{suf}(G, GABCm, v2);\n           {styp} B = _mm512_permutex2var{suf}(v0, BABm, v1);\n           ret.v2 = _mm512_permutex2var{suf}(B, BABCm, v2);\n\n           return ret;'''.format(**fmtspec)\n    if typ in ['f64', 'i64', 'u64']:\n        return \\\n        '''nsimd_{simd_ext}_v{typ}x3 ret;\n\n           {load_v0v1v2}\n\n           __m512i R_mask0 = _mm512_set_epi64( 0,  0, 15, 12, 9, 6, 3, 0);\n           __m512i R_mask1 = _mm512_set_epi64(13, 10,  5,  4, 3, 2, 1, 0);\n           {styp} A1 = _mm512_permutex2var{suf}(v0, R_mask0, v1);\n           ret.v0 = _mm512_permutex2var{suf}(A1, R_mask1, v2);\n\n           __m512i G_mask0 = _mm512_set_epi64( 0,  0,  0, 13, 10, 7, 4, 1);\n           __m512i G_mask1 = _mm512_set_epi64(14, 11,  8,  4,  3, 2, 1, 0);\n           {styp} B1 = _mm512_permutex2var{suf}(v0, G_mask0, v1);\n           ret.v1 = _mm512_permutex2var{suf}(B1, G_mask1, v2);\n\n           __m512i B_mask0 = _mm512_set_epi64( 0,  0,  0, 14, 11, 8, 5, 2);\n           __m512i B_mask1 = _mm512_set_epi64(15, 12,  9,  4,  3, 2, 1, 0);\n           {styp} C1 = _mm512_permutex2var{suf}(v0, B_mask0, v1);\n           ret.v2 = _mm512_permutex2var{suf}(C1, B_mask1, v2);\n\n           return ret;'''.format(**fmtspec)\n\n###############################################################################\n\ndef store3_avx512(simd_ext, typ, align, fmtspec2):\n    fmtspec = fmtspec2.copy()\n    fmtspec['exlo_in1'] = x86.extract(simd_ext, typ, x86.LO, common.in1)\n    fmtspec['exhi_in1'] = x86.extract(simd_ext, typ, 
x86.HI, common.in1)\n    fmtspec['exlo_in2'] = x86.extract(simd_ext, typ, x86.LO, common.in2)\n    fmtspec['exhi_in2'] = x86.extract(simd_ext, typ, x86.HI, common.in2)\n    fmtspec['exlo_in3'] = x86.extract(simd_ext, typ, x86.LO, common.in3)\n    fmtspec['exhi_in3'] = x86.extract(simd_ext, typ, x86.HI, common.in3)\n    fmtspec['a'] = 'a' if align else 'u'\n    if typ in ['i8', 'u8']:\n        return \\\n        '''__m256i R0 = {exlo_in1};\n           __m256i R1 = {exhi_in1};\n           __m256i G0 = {exlo_in2};\n           __m256i G1 = {exhi_in2};\n           __m256i B0 = {exlo_in3};\n           __m256i B1 = {exhi_in3};\n\n\n           __m256i RACm = _mm256_setr_epi8( 0, -1, -1,  1, -1, -1,  2, -1,\n                                           -1,  3, -1, -1,  4, -1, -1,  5,\n                                           -1, 27, -1, -1, 28, -1, -1, 29,\n                                           -1, -1, 30, -1, -1, 31, -1, -1);\n           __m256i RBBm = _mm256_setr_epi8(-1, 11, -1, -1, 12, -1, -1, 13,\n                                           -1, -1, 14, -1, -1, 15, -1, -1,\n                                           16, -1, -1, 17, -1, -1, 18, -1,\n                                           -1, 19, -1, -1, 20, -1, -1, 21);\n           __m256i RCAm = _mm256_setr_epi8(-1, -1, 22, -1, -1, 23, -1, -1,\n                                           24, -1, -1, 25, -1, -1, 26, -1,\n                                           -1, -1,  6, -1, -1,  7, -1, -1,\n                                            8, -1, -1,  9, -1, -1, 10, -1);\n\n           __m256i GACm = _mm256_setr_epi8(-1,  0, -1, -1,  1, -1, -1,  2,\n                                           -1, -1,  3, -1, -1,  4, -1, -1,\n                                           -1, -1, 27, -1, -1, 28, -1, -1,\n                                           29, -1, -1, 30, -1, -1, 31, -1);\n           __m256i GBBm = _mm256_setr_epi8(-1, -1, 11, -1, -1, 12, -1, -1,\n                                           13, -1, -1, 14, -1, -1, 
15, -1,\n                                           -1, 16, -1, -1, 17, -1, -1, 18,\n                                           -1, -1, 19, -1, -1, 20, -1, -1);\n           __m256i GCAm = _mm256_setr_epi8(21, -1, -1, 22, -1, -1, 23, -1,\n                                           -1, 24, -1, -1, 25, -1, -1, 26,\n                                           05, -1, -1,  6, -1, -1,  7, -1,\n                                           -1,  8, -1, -1,  9, -1, -1, 10);\n\n           __m256i BACm = _mm256_setr_epi8(-1, -1,  0, -1, -1,  1, -1, -1,\n                                            2, -1, -1,  3, -1, -1,  4, -1,\n                                           26, -1, -1, 27, -1, -1, 28, -1,\n                                           -1, 29, -1, -1, 30, -1, -1, 31);\n           __m256i BBBm = _mm256_setr_epi8(10, -1, -1, 11, -1, -1, 12, -1,\n                                           -1, 13, -1, -1, 14, -1, -1, 15,\n                                           -1, -1, 16, -1, -1, 17, -1, -1,\n                                           18, -1, -1, 19, -1, -1, 20, -1);\n           __m256i BCAm = _mm256_setr_epi8(-1, 21, -1, -1, 22, -1, -1, 23,\n                                           -1, -1, 24, -1, -1, 25, -1, -1,\n                                           -1,  5, -1, -1,  6, -1, -1,  7,\n                                           -1, -1,  8, -1, -1,  9, -1, -1);\n\n           __m256i RAC = _mm256_shuffle_epi8(R0, RACm);\n           __m256i GAC = _mm256_shuffle_epi8(G0, GACm);\n           __m256i BAC = _mm256_shuffle_epi8(B0, BACm);\n\n           __m256i AC0 = _mm256_or_si256(RAC, GAC);\n           AC0 = _mm256_or_si256(AC0, BAC);\n\n           __m256i RBB = _mm256_shuffle_epi8(R0, RBBm);\n           __m256i GBB = _mm256_shuffle_epi8(G0, GBBm);\n           __m256i BBB = _mm256_shuffle_epi8(B0, BBBm);\n\n           __m256i BB0 = _mm256_or_si256(RBB, GBB);\n           BB0 = _mm256_or_si256(BB0, BBB);\n\n           __m256i RCA = _mm256_shuffle_epi8(R0, RCAm);\n           
__m256i GCA = _mm256_shuffle_epi8(G0, GCAm);\n           __m256i BCA = _mm256_shuffle_epi8(B0, BCAm);\n\n           __m256i CA0 = _mm256_or_si256(RCA, GCA);\n           CA0 = _mm256_or_si256(CA0, BCA);\n\n           __m256i AA0 = _mm256_permute2f128_si256(AC0, CA0, 2 << 4);\n           __m256i CC0 = _mm256_permute2f128_si256(AC0, CA0, (1 << 4) | 3);\n\n           RAC = _mm256_shuffle_epi8(R1, RACm);\n           GAC = _mm256_shuffle_epi8(G1, GACm);\n           BAC = _mm256_shuffle_epi8(B1, BACm);\n\n           __m256i AC1 = _mm256_or_si256(RAC, GAC);\n           AC1 = _mm256_or_si256(AC1, BAC);\n\n           RBB = _mm256_shuffle_epi8(R1, RBBm);\n           GBB = _mm256_shuffle_epi8(G1, GBBm);\n           BBB = _mm256_shuffle_epi8(B1, BBBm);\n\n           __m256i BB1 = _mm256_or_si256(RBB, GBB);\n           BB1 = _mm256_or_si256(BB1, BBB);\n\n           RCA = _mm256_shuffle_epi8(R1, RCAm);\n           GCA = _mm256_shuffle_epi8(G1, GCAm);\n           BCA = _mm256_shuffle_epi8(B1, BCAm);\n\n           __m256i CA1 = _mm256_or_si256(RCA, GCA);\n           CA1 = _mm256_or_si256(CA1, BCA);\n\n           __m256i AA1 = _mm256_permute2f128_si256(AC1, CA1, 2 << 4);\n           __m256i CC1 = _mm256_permute2f128_si256(AC1, CA1, (1 << 4) | 3);\n\n           __m512i A = {mergeA0B0};\n           __m512i B = {mergeC0A1};\n           __m512i C = {mergeB1C1};\n\n           {store}'''. 
\\\n           format(mergeA0B0=x86.setr(simd_ext, typ, 'AA0', 'BB0'),\n                  mergeC0A1=x86.setr(simd_ext, typ, 'CC0', 'AA1'),\n                  mergeB1C1=x86.setr(simd_ext, typ, 'BB1', 'CC1'),\n                  store=store3(simd_ext, typ, align, fmtspec, 'A', 'B', 'C'),\n                  **fmtspec)\n    if typ in ['i16', 'u16']:\n        return \\\n        '''__m256i R0a = {exlo_in1};\n           __m256i R0b = {exhi_in1};\n           __m256i G0a = {exlo_in2};\n           __m256i G0b = {exhi_in2};\n           __m256i B0a = {exlo_in3};\n           __m256i B0b = {exhi_in3};\n\n           __m256i RACm = _mm256_setr_epi8( 0,  1, -1, -1, -1, -1,  2,  3,\n                                           -1, -1, -1, -1,  4,  5, -1, -1,\n                                           -1, -1, -1, -1, 12, 13, -1, -1,\n                                           -1, -1, 14, 15, -1, -1, -1, -1);\n           __m256i RBBm = _mm256_setr_epi8(-1, -1, -1, -1, 12, 13, -1, -1,\n                                           -1, -1, 14, 15, -1, -1, -1, -1,\n                                            0,  1, -1, -1, -1, -1,  2,  3,\n                                           -1, -1, -1, -1,  4,  5, -1, -1);\n           __m256i RCAm = _mm256_setr_epi8(-1, -1,  6,  7, -1, -1, -1, -1,\n                                            8,  9, -1, -1, -1, -1, 10, 11,\n                                           -1, -1,  6,  7, -1, -1, -1, -1,\n                                            8,  9, -1, -1, -1, -1, 10, 11);\n\n           __m256i GACm = _mm256_setr_epi8(-1, -1,  0,  1, -1, -1, -1, -1,\n                                            2,  3, -1, -1, -1, -1,  4,  5,\n                                           10, 11, -1, -1, -1, -1, 12, 13,\n                                           -1, -1, -1, -1, 14, 15, -1, -1);\n           __m256i GBBm = _mm256_setr_epi8(10, 11, -1, -1, -1, -1, 12, 13,\n                                           -1, -1, -1, -1, 14, 15, -1, -1,\n                             
              -1, -1,  0,  1, -1, -1, -1, -1,\n                                            2,  3, -1, -1, -1, -1,  4,  5);\n           __m256i GCAm = _mm256_setr_epi8(-1, -1, -1, -1,  6,  7, -1, -1,\n                                           -1, -1,  8,  9, -1, -1, -1, -1,\n                                           -1, -1, -1, -1,  6,  7, -1, -1,\n                                           -1, -1,  8,  9, -1, -1, -1, -1);\n\n           __m256i BACm = _mm256_setr_epi8(-1, -1, -1, -1,  0,  1, -1, -1,\n                                           -1, -1,  2,  3, -1, -1, -1, -1,\n                                           -1, -1, 10, 11, -1, -1, -1, -1,\n                                           12, 13, -1, -1, -1, -1, 14, 15);\n           __m256i BBBm = _mm256_setr_epi8(-1, -1, 10, 11, -1, -1, -1, -1,\n                                           12, 13, -1, -1, -1, -1, 14, 15,\n                                           -1, -1, -1, -1,  0,  1, -1, -1,\n                                           -1, -1,  2,  3, -1, -1, -1, -1);\n           __m256i BCAm = _mm256_setr_epi8( 4,  5, -1, -1, -1, -1,  6,  7,\n                                           -1, -1, -1, -1,  8,  9, -1, -1,\n                                            4,  5, -1, -1, -1, -1,  6,  7,\n                                           -1, -1, -1, -1,  8,  9, -1, -1);\n\n           __m256i RAC = _mm256_shuffle_epi8(R0a, RACm);\n           __m256i GAC = _mm256_shuffle_epi8(G0a, GACm);\n           __m256i BAC = _mm256_shuffle_epi8(B0a, BACm);\n\n           __m256i RBB = _mm256_shuffle_epi8(R0a, RBBm);\n           __m256i GBB = _mm256_shuffle_epi8(G0a, GBBm);\n           __m256i BBB = _mm256_shuffle_epi8(B0a, BBBm);\n\n           __m256i RCA = _mm256_shuffle_epi8(R0a, RCAm);\n           __m256i GCA = _mm256_shuffle_epi8(G0a, GCAm);\n           __m256i BCA = _mm256_shuffle_epi8(B0a, BCAm);\n\n           __m256i AC = _mm256_or_si256(RAC, GAC);\n           AC = _mm256_or_si256(AC, BAC);\n\n           __m256i BBa = 
_mm256_or_si256(RBB, GBB);\n           BBa = _mm256_or_si256(BBa, BBB);\n\n           __m256i CA = _mm256_or_si256(RCA, GCA);\n           CA = _mm256_or_si256(CA, BCA);\n\n           __m256i AAa = _mm256_permute2f128_si256(AC, CA, 2 << 4);\n           __m256i CCa = _mm256_permute2f128_si256(AC, CA, (1 << 4) | 3);\n\n           RAC = _mm256_shuffle_epi8(R0b, RACm);\n           GAC = _mm256_shuffle_epi8(G0b, GACm);\n           BAC = _mm256_shuffle_epi8(B0b, BACm);\n\n           RBB = _mm256_shuffle_epi8(R0b, RBBm);\n           GBB = _mm256_shuffle_epi8(G0b, GBBm);\n           BBB = _mm256_shuffle_epi8(B0b, BBBm);\n\n           RCA = _mm256_shuffle_epi8(R0b, RCAm);\n           GCA = _mm256_shuffle_epi8(G0b, GCAm);\n           BCA = _mm256_shuffle_epi8(B0b, BCAm);\n\n           AC = _mm256_or_si256(RAC, GAC);\n           AC = _mm256_or_si256(AC, BAC);\n\n           __m256i BBb = _mm256_or_si256(RBB, GBB);\n           BBb = _mm256_or_si256(BBb, BBB);\n\n           CA = _mm256_or_si256(RCA, GCA);\n           CA = _mm256_or_si256(CA, BCA);\n\n           __m256i AAb = _mm256_permute2f128_si256(AC, CA, 2 << 4);\n           __m256i CCb = _mm256_permute2f128_si256(AC, CA, (1 << 4) | 3);\n\n           __m512i A = {mergeAaBa};\n           __m512i B = {mergeCaAb};\n           __m512i C = {mergeBbCb};\n\n           {store}'''. 
\\\n           format(mergeAaBa=x86.setr(simd_ext, typ, 'AAa', 'BBa'),\n                  mergeCaAb=x86.setr(simd_ext, typ, 'CCa', 'AAb'),\n                  mergeBbCb=x86.setr(simd_ext, typ, 'BBb', 'CCb'),\n                  store=store3(simd_ext, typ, align, fmtspec, 'A', 'B', 'C'),\n                  **fmtspec)\n    if typ in ['f32', 'i32', 'u32']:\n        return \\\n        '''__m512i ARGm  = _mm512_setr_epi32( 0, 16,  0,  1, 17,  0,  2, 18,\n                                              0,  3, 19,  0,  4, 20,  0,  5);\n           __m512i ARGBm = _mm512_setr_epi32( 0,  1, 16,  3,  4, 17,  6,  7,\n                                             18,  9, 10, 19, 12, 13, 20, 15);\n           __m512i BRGm  = _mm512_setr_epi32(21,  0,  6, 22,  0,  7, 23,  0,\n                                              8, 24,  0,  9, 25,  0, 10, 26);\n           __m512i BRGBm = _mm512_setr_epi32( 0, 21,  2,  3, 22,  5,  6, 23,\n                                              8,  9, 24, 11, 12, 25, 14, 15);\n           __m512i CRGm  = _mm512_setr_epi32( 0, 11, 27,  0, 12, 28,  0, 13,\n                                             29,  0, 14, 30,  0, 15, 31,  0);\n           __m512i CRGBm = _mm512_setr_epi32(26,  1,  2, 27,  4,  5, 28,  7,\n                                              8, 29, 10, 11, 30, 13, 14, 31);\n\n           {styp} A = _mm512_permutex2var{suf}({in1}, ARGm, {in2});\n           A = _mm512_permutex2var{suf}(A, ARGBm, {in3});\n           {styp} B = _mm512_permutex2var{suf}({in1}, BRGm, {in2});\n           B = _mm512_permutex2var{suf}(B, BRGBm, {in3});\n           {styp} C = _mm512_permutex2var{suf}({in1}, CRGm, {in2});\n           C = _mm512_permutex2var{suf}(C, CRGBm, {in3});\n\n           {store}'''. 
\\\n           format(store=store3(simd_ext, typ, align, fmtspec, 'A', 'B', 'C'),\n                  **fmtspec)\n    if typ in ['f64', 'i64', 'u64']:\n        return \\\n        '''__m512i A_mask0 = _mm512_set_epi64(10,  2,  0,  9,  1,  0,  8,  0);\n           __m512i A_mask1 = _mm512_set_epi64( 7,  6,  9,  4,  3,  8,  1,  0);\n           {styp} A1 = _mm512_permutex2var{suf}({in1}, A_mask0, {in2});\n           {styp} A2 = _mm512_permutex2var{suf}(A1, A_mask1, {in3});\n\n           __m512i B_mask0 = _mm512_set_epi64( 5,  0, 12,  4,  0, 11,  3,  0);\n           __m512i B_mask1 = _mm512_set_epi64( 7, 12,  5,  4, 11,  2,  1, 10);\n           {styp} B1 = _mm512_permutex2var{suf}({in1}, B_mask0, {in2});\n           {styp} B2 = _mm512_permutex2var{suf}(B1, B_mask1, {in3});\n\n           __m512i C_mask0 = _mm512_set_epi64( 0, 15,  7,  0, 14,  6,  0, 13);\n           __m512i C_mask1 = _mm512_set_epi64(15,  6,  5, 14,  3,  2, 13,  0);\n           {styp} C1 = _mm512_permutex2var{suf}({in1}, C_mask0, {in2});\n           {styp} C2 = _mm512_permutex2var{suf}(C1, C_mask1, {in3});\n\n           {store}'''. \\\n           format(store=store3(simd_ext, typ, align, fmtspec,\n                  'A2', 'B2', 'C2'), **fmtspec)\n"
  },
  {
    "path": "examples/module_fixed_point.cpp",
    "content": "// Copyright (c) 2019 Agenium Scale\n//\n// Permission is hereby granted, free of charge, to any person obtaining a copy\n// of this software and associated documentation files (the \"Software\"), to\n// deal in the Software without restriction, including without limitation the\n// rights to use, copy, modify, merge, publish, distribute, sublicense, and/or\n// sell copies of the Software, and to permit persons to whom the Software is\n// furnished to do so, subject to the following conditions:\n//\n// The above copyright notice and this permission notice shall be included in\n// all copies or substantial portions of the Software.\n//\n// THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\n// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING\n// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS\n// IN THE SOFTWARE.\n\n#include <ctime>\n#include <cstdlib>\n#include <iostream>\n#include <nsimd/modules/fixed_point.hpp>\n\nfloat rand_float() {\n  return 4.0f * ((float) rand() / (float) RAND_MAX) - 2.0f;        \n}\n\nint main() {\n  // We use fixed point numbers with 8 bits of integer part and 8 bits of \n  // decimal part. 
It will use 32 bits integers for internal storage.\n  typedef nsimd::fixed_point::fp_t<8, 8> fp_t;\n  typedef nsimd::fixed_point::pack<fp_t> fp_pack_t;\n  \n  const size_t v_size = nsimd::fixed_point::len(fp_t());\n\n  fp_t *input0 = (fp_t*)malloc(v_size * sizeof(fp_t));\n  fp_t *input1 = (fp_t *)malloc(v_size * sizeof(fp_t));\n  fp_t *res = (fp_t *)malloc(v_size * sizeof(fp_t));\n  \n  // Input and output initializations \n  for(size_t i = 0; i < nsimd::fixed_point::len(fp_t()); i++) {\n    input0[i] = fp_t(rand_float());\n    input1[i] = fp_t(rand_float());\n  }\n  \n  fp_pack_t v0 = nsimd::fixed_point::loadu<fp_pack_t>(input0);\n  fp_pack_t v1 = nsimd::fixed_point::loadu<fp_pack_t>(input1);\n  fp_pack_t vres = nsimd::fixed_point::add(v0, v1);\n  nsimd::fixed_point::storeu(res, vres);\n  \n  for(size_t i = 0; i < nsimd::fixed_point::len(fp_t()); i++) {\n    std::cout << float(input0[i]) << \" | \"\n      << float(input1[i]) << \" | \"\n      << float(res[i]) << \"\\n\";\n  }\n  std::cout << std::endl;\n  \n  return EXIT_SUCCESS;\n}\n"
  },
  {
    "path": "examples/tutorial.cpp",
    "content": "#include <nsimd/nsimd-all.hpp>\n\n#include <string>\n#include <vector>\n#include <iostream>\n\ntemplate <typename T>\nvoid uppercase_scalar(T *dst, const T *src, int n) {\n  for (int i = 0; i < n; i++) {\n    if (src[i] >= 'a' && src[i] <= 'z') {\n      dst[i] = src[i] + ('A' - 'a');\n    } else {\n      dst[i] = src[i];\n    }\n  }\n}\n\ntemplate <typename T>\nvoid uppercase_simd(T *dst, const T *src, int n) {\n  using namespace nsimd;\n  typedef pack<T> p_t;\n  typedef packl<T> pl_t;\n  int l = len<p_t>();\n\n  int i;\n  for (i = 0; i + l <= n; i += l) {\n    p_t text = loadu<p_t>(src + i);\n    pl_t mask = text >= 'a' && text <= 'z';\n    p_t then_pack = text + ('A' - 'a');\n    p_t TEXT = if_else(mask, then_pack, text);\n    storeu(dst + i, TEXT);\n  }\n\n  pl_t mask = mask_for_loop_tail<pl_t>(i, n);\n  p_t text = maskz_loadu(mask, src + i);\n  p_t TEXT = if_else(text >= 'a' && text <= 'z', text + ('A' - 'a'), text);\n  mask_storeu(mask, dst + i, TEXT);\n}\n\nint main(int argc, char **argv) {\n  std::string input;\n\n  for (int i = 1; i < argc; i++) {\n    input += std::string(argv[i]);\n    if (i < argc - 1) {\n      input += std::string(\" \");\n    }\n  }\n\n  std::cout << \"Orignal text         : \" << input << std::endl;\n\n  std::vector<i8> dst_scalar(input.size() + 1);\n  uppercase_scalar(&dst_scalar[0], (i8 *)input.c_str(), (int)input.size());\n  std::cout << \"Scalar uppercase text: \" << &dst_scalar[0] << std::endl;\n\n  std::vector<i8> dst_simd(input.size() + 1);\n  uppercase_simd(&dst_simd[0], (i8 *)input.c_str(), (int)input.size());\n  std::cout << \"NSIMD uppercase text : \" << &dst_simd[0] << std::endl;\n\n  return 0;\n}\n"
  },
  {
    "path": "include/nsimd/c_adv_api.h",
    "content": "/*\n\nCopyright (c) 2021 Agenium Scale\n\nPermission is hereby granted, free of charge, to any person obtaining a copy\nof this software and associated documentation files (the \"Software\"), to deal\nin the Software without restriction, including without limitation the rights\nto use, copy, modify, merge, publish, distribute, sublicense, and/or sell\ncopies of the Software, and to permit persons to whom the Software is\nfurnished to do so, subject to the following conditions:\n\nThe above copyright notice and this permission notice shall be included in all\ncopies or substantial portions of the Software.\n\nTHE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\nIMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\nFITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\nAUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\nLIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\nOUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\nSOFTWARE.\n\n*/\n\n#ifndef NSIMD_C_ADV_API_H\n#define NSIMD_C_ADV_API_H\n\n#include <nsimd/nsimd.h>\n\n#if NSIMD_C >= 2011\n\nNSIMD_INLINE void nsimd_c11_type_unsupported(void) {}\n\n/* ------------------------------------------------------------------------- */\n\n#include <nsimd/c_adv_api_functions.h>\n\n/* ------------------------------------------------------------------------- */\n/* We add by hand parametrized loads/stores. 
*/\n\n/* loads */\n\n#define nsimd_load_aligned(type, ptr) nsimd_loada(type, ptr)\n#define nsimd_load_unaligned(type, ptr) nsimd_loadu(type, ptr)\n\n#define nsimd_load(alignment, type, ptr)                                      \\\n  NSIMD_PP_CAT_2(nsimd_load_, alignment)(type, ptr)\n\n/* stores */\n\n#define nsimd_store_aligned(ptr, vec) nsimd_storea(ptr, vec)\n#define nsimd_store_unaligned(ptr, vec) nsimd_storeu(ptr, vec)\n\n#define nsimd_store(alignment, ptr, vec)                                      \\\n  NSIMD_PP_CAT_2(nsimd_store_, alignment)(ptr, vec)\n\n/* ------------------------------------------------------------------------- */\n/* Generic types */\n\n#define nsimd_pack(type) NSIMD_PP_CAT_2(nsimd_pack_, type)\n#define nsimd_packl(type) NSIMD_PP_CAT_2(nsimd_packl_, type)\n#define nsimd_packx2(type) NSIMD_PP_CAT_2(nsimd_packx2_, type)\n#define nsimd_packx3(type) NSIMD_PP_CAT_2(nsimd_packx3_, type)\n#define nsimd_packx4(type) NSIMD_PP_CAT_2(nsimd_packx4_, type)\n\n#define nsimd_pack_a(type, simd_ext) NSIMD_PP_CAT_3(nsimd_pack_, type)\n#define nsimd_packl_a(type, simd_ext) NSIMD_PP_CAT_3(nsimd_packl_, type)\n#define nsimd_packx2_a(type, simd_ext) NSIMD_PP_CAT_3(nsimd_packx2_, type)\n#define nsimd_packx3_a(type, simd_ext) NSIMD_PP_CAT_3(nsimd_packx3_, type)\n#define nsimd_packx4_a(type, simd_ext) NSIMD_PP_CAT_3(nsimd_packx4_, type)\n\n#endif /* NSIMD_C >= 2011 */\n\n#endif /* NSIMD_C_ADV_API_HPP */\n"
  },
  {
    "path": "include/nsimd/cxx_adv_api.hpp",
    "content": "/*\n\nCopyright (c) 2021 Agenium Scale\n\nPermission is hereby granted, free of charge, to any person obtaining a copy\nof this software and associated documentation files (the \"Software\"), to deal\nin the Software without restriction, including without limitation the rights\nto use, copy, modify, merge, publish, distribute, sublicense, and/or sell\ncopies of the Software, and to permit persons to whom the Software is\nfurnished to do so, subject to the following conditions:\n\nThe above copyright notice and this permission notice shall be included in all\ncopies or substantial portions of the Software.\n\nTHE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\nIMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\nFITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\nAUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\nLIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\nOUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\nSOFTWARE.\n\n*/\n\n#ifndef NSIMD_CXX_ADV_API_HPP\n#define NSIMD_CXX_ADV_API_HPP\n\n#include <nsimd/nsimd.h>\n#include <ostream>\n\n// ----------------------------------------------------------------------------\n\nnamespace nsimd {\n\n// ----------------------------------------------------------------------------\n// \"mimic\" static_assert in C++98\n\ntemplate <bool> struct nsimd_static_assert;\ntemplate <> struct nsimd_static_assert<true> {};\n\n// ----------------------------------------------------------------------------\n// Definition of pack\n\ntemplate <NSIMD_CONCEPT_VALUE_TYPE T, int N = 1,\n          NSIMD_CONCEPT_SIMD_EXT SimdExt = NSIMD_SIMD>\nNSIMD_STRUCT pack;\n\ntemplate <NSIMD_CONCEPT_VALUE_TYPE T, NSIMD_CONCEPT_SIMD_EXT SimdExt>\nNSIMD_STRUCT pack<T, 1, SimdExt> {\n  typedef typename simd_traits<T, SimdExt>::simd_vector simd_vector;\n  typedef T value_type;\n  typedef SimdExt 
simd_ext;\n  static const int unroll = 1;\n  static const int soa_num_packs = 1;\n\n  simd_vector car;\n\n  // Default ctor\n  pack() {}\n\n  // Ctor that splats\n  template <NSIMD_CONCEPT_VALUE_TYPE S> pack(S const &s) {\n    car = set1(T(s), T(), SimdExt());\n  }\n\n  // Ctor taking a SIMD vector\n  pack(simd_vector v) { car = v; }\n\n  // Underlying native SIMD vector getter\n  simd_vector native_register() const { return car; }\n\n  // Arithmetic and assignment operators\n  pack &operator+=(pack const &other);\n  pack &operator-=(pack const &other);\n  pack &operator*=(pack const &other);\n  pack &operator/=(pack const &other);\n  pack &operator|=(pack const &other);\n  pack &operator&=(pack const &other);\n  pack &operator^=(pack const &other);\n  pack &operator<<=(int);\n  pack &operator>>=(int);\n\n  // For std::cout'ing a pack\n  friend std::ostream &operator<<(std::ostream &os, pack const &a0) {\n    T buf[max_len_t<T>::value];\n    storeu(buf, a0.car, T(), SimdExt());\n    os << \"{ \";\n    int n = len(a0);\n    for (int i = 0; i < n; i++) {\n      os << to_biggest(buf[i]);\n      if (i < n - 1) {\n        os << \", \";\n      }\n    }\n    os << \" }\";\n    return os;\n  }\n};\n\ntemplate <NSIMD_CONCEPT_VALUE_TYPE T, int N, NSIMD_CONCEPT_SIMD_EXT SimdExt>\nNSIMD_STRUCT pack {\n  typedef typename simd_traits<T, SimdExt>::simd_vector simd_vector;\n  typedef T value_type;\n  typedef SimdExt simd_ext;\n  static const int unroll = N;\n  static const int soa_num_packs = 1;\n\n  simd_vector car;\n  pack<T, N - 1, SimdExt> cdr;\n\n  // Default ctor\n  pack() {}\n\n  // Ctor that splats\n  template <NSIMD_CONCEPT_VALUE_TYPE S> pack(S const &s) : cdr(s) {\n    car = set1(T(s), T(), SimdExt());\n  }\n\n  // Arithmetic and assignment operators\n  pack &operator+=(pack const &other);\n  pack &operator-=(pack const &other);\n  pack &operator*=(pack const &other);\n  pack &operator/=(pack const &other);\n  pack &operator|=(pack const &other);\n  pack &operator&=(pack 
const &other);\n  pack &operator^=(pack const &other);\n  pack &operator<<=(int);\n  pack &operator>>=(int);\n\n  // For std::cout'ing a pack\n  friend std::ostream &operator<<(std::ostream &os, pack const &a0) {\n    os << pack<T, 1, SimdExt>(a0.car) << \", \" << a0.cdr;\n    return os;\n  }\n};\n\n#if NSIMD_CXX >= 2020\ntemplate <typename T> struct is_pack_t : public std::false_type {};\n\ntemplate <NSIMD_CONCEPT_VALUE_TYPE T, int N, NSIMD_CONCEPT_SIMD_EXT SimdExt>\nstruct is_pack_t<pack<T, N, SimdExt> > : public std::true_type {};\n\ntemplate <typename T> concept is_pack_c = is_pack_t<T>::value;\n#define NSIMD_CONCEPT_PACK nsimd::is_pack_c\n#else\n#define NSIMD_CONCEPT_PACK typename\n#endif\n\n// ----------------------------------------------------------------------------\n// Definition of logical\n\ntemplate <NSIMD_CONCEPT_VALUE_TYPE T, int N = 1,\n          NSIMD_CONCEPT_SIMD_EXT SimdExt = NSIMD_SIMD>\nNSIMD_STRUCT packl;\n\ntemplate <NSIMD_CONCEPT_VALUE_TYPE T, NSIMD_CONCEPT_SIMD_EXT SimdExt>\nNSIMD_STRUCT packl<T, 1, SimdExt> {\n  typedef typename simd_traits<T, SimdExt>::simd_vectorl simd_vectorl;\n  simd_vectorl car;\n  typedef T value_type;\n  typedef SimdExt simd_ext;\n  static const int unroll = 1;\n\n  // Default ctor\n  packl() {}\n\n  // Ctor taking a SIMD vector\n  packl(simd_vectorl v) { car = v; }\n\n  // Ctor that splats\n  template <NSIMD_CONCEPT_VALUE_TYPE_OR_BOOL S> packl(S const &s) {\n    car = set1l(int(s), T(), SimdExt());\n  }\n\n  // Underlying native SIMD vector getter\n  simd_vectorl native_register() const { return car; }\n\n  // For std::cout'ing a packl\n  friend std::ostream &operator<<(std::ostream &os, packl const &a0) {\n    T buf[max_len_t<T>::value];\n    storelu(buf, a0.car, T(), SimdExt());\n    os << \"{ \";\n    int n = len(a0);\n    for (int i = 0; i < n; i++) {\n      os << buf[i];\n      if (i < n - 1) {\n        os << \", \";\n      }\n    }\n    os << \" }\";\n    return os;\n  }\n};\n\ntemplate 
<NSIMD_CONCEPT_VALUE_TYPE T, int N, NSIMD_CONCEPT_SIMD_EXT SimdExt>\nNSIMD_STRUCT packl {\n  typename simd_traits<T, SimdExt>::simd_vectorl car;\n  typedef T value_type;\n  typedef SimdExt simd_ext;\n  static const int unroll = N;\n\n  packl<T, N - 1, SimdExt> cdr;\n\n  // Default ctor\n  packl() {}\n\n  // Ctor that splats\n  template <NSIMD_CONCEPT_VALUE_TYPE S> packl(S const &s) : cdr(s) {\n    car = set1l(int(s), T(), SimdExt());\n  }\n\n  // For std::cout'ing a packl\n  friend std::ostream &operator<<(std::ostream &os, packl const &a0) {\n    os << packl<T, 1, SimdExt>(a0.car) << \", \" << a0.cdr;\n    return os;\n  }\n};\n\n#if NSIMD_CXX >= 2020\ntemplate <typename T> struct is_packl_t : public std::false_type {};\n\ntemplate <NSIMD_CONCEPT_VALUE_TYPE T, int N, NSIMD_CONCEPT_SIMD_EXT SimdExt>\nstruct is_packl_t<packl<T, N, SimdExt> > : public std::true_type {};\n\ntemplate <typename T> concept is_packl_c = is_packl_t<T>::value;\n#define NSIMD_CONCEPT_PACKL nsimd::is_packl_c\n#else\n#define NSIMD_CONCEPT_PACKL typename\n#endif\n\n// ----------------------------------------------------------------------------\n// Definition of SOA of degree 1\n\ntemplate <NSIMD_CONCEPT_VALUE_TYPE T, int N = 1,\n          NSIMD_CONCEPT_SIMD_EXT SimdExt = NSIMD_SIMD>\nNSIMD_STRUCT packx1;\n\ntemplate <NSIMD_CONCEPT_VALUE_TYPE T, NSIMD_CONCEPT_SIMD_EXT SimdExt>\nNSIMD_STRUCT packx1<T, 1, SimdExt> {\n  typedef typename simd_traits<T, SimdExt>::simd_vector simd_vector;\n  typedef T value_type;\n  typedef SimdExt simd_ext;\n  static const int unroll = 1;\n  static const int soa_num_packs = 1;\n\n  pack<T, 1, SimdExt> v0;\n\n  void set_car(simd_vector v0_) {\n    v0.car = v0_;\n  }\n};\n\ntemplate <NSIMD_CONCEPT_VALUE_TYPE T, int N, NSIMD_CONCEPT_SIMD_EXT SimdExt>\nNSIMD_STRUCT packx1 {\n  typedef typename simd_traits<T, SimdExt>::simd_vector simd_vector;\n  typedef T value_type;\n  typedef SimdExt simd_ext;\n  static const int unroll = N;\n  static const int soa_num_packs = 1;\n\n  
pack<T, N, SimdExt> v0;\n\n  void set_car(simd_vector v0_) {\n    v0.car = v0_;\n  }\n\n  void set_cdr(pack<T, N - 1, SimdExt> const &v0_) {\n    v0.cdr = v0_;\n  }\n};\n\n#if NSIMD_CXX >= 2020\ntemplate <typename T> struct is_packx1_t : public std::false_type {};\n\ntemplate <NSIMD_CONCEPT_VALUE_TYPE T, int N, NSIMD_CONCEPT_SIMD_EXT SimdExt>\nstruct is_packx1_t<packx1<T, N, SimdExt> > : public std::true_type {};\n\ntemplate <typename T> concept is_packx1_c = is_packx1_t<T>::value;\n#define NSIMD_CONCEPT_PACKX1 nsimd::is_packx1_c\n#else\n#define NSIMD_CONCEPT_PACKX1 typename\n#endif\n\n// ----------------------------------------------------------------------------\n// Definition of SOA of degree 2\n\ntemplate <NSIMD_CONCEPT_VALUE_TYPE T, int N = 1,\n          NSIMD_CONCEPT_SIMD_EXT SimdExt = NSIMD_SIMD>\nNSIMD_STRUCT packx2;\n\ntemplate <NSIMD_CONCEPT_VALUE_TYPE T, NSIMD_CONCEPT_SIMD_EXT SimdExt>\nNSIMD_STRUCT packx2<T, 1, SimdExt> {\n  typedef typename simd_traits<T, SimdExt>::simd_vector simd_vector;\n  typedef T value_type;\n  typedef SimdExt simd_ext;\n  static const int unroll = 1;\n  static const int soa_num_packs = 2;\n\n  pack<T, 1, SimdExt> v0;\n  pack<T, 1, SimdExt> v1;\n\n  void set_car(simd_vector v0_, simd_vector v1_) {\n    v0.car = v0_;\n    v1.car = v1_;\n  }\n};\n\ntemplate <NSIMD_CONCEPT_VALUE_TYPE T, int N, NSIMD_CONCEPT_SIMD_EXT SimdExt>\nNSIMD_STRUCT packx2 {\n  typedef typename simd_traits<T, SimdExt>::simd_vector simd_vector;\n  typedef T value_type;\n  typedef SimdExt simd_ext;\n  static const int unroll = N;\n  static const int soa_num_packs = 2;\n\n  pack<T, N, SimdExt> v0;\n  pack<T, N, SimdExt> v1;\n\n  void set_car(simd_vector v0_, simd_vector v1_) {\n    v0.car = v0_;\n    v1.car = v1_;\n  }\n\n  void set_cdr(pack<T, N - 1, SimdExt> const &v0_,\n               pack<T, N - 1, SimdExt> const &v1_) {\n    v0.cdr = v0_;\n    v1.cdr = v1_;\n  }\n};\n\n#if NSIMD_CXX >= 2020\ntemplate <typename T> struct is_packx2_t : public std::false_type 
{};\n\ntemplate <NSIMD_CONCEPT_VALUE_TYPE T, int N, NSIMD_CONCEPT_SIMD_EXT SimdExt>\nstruct is_packx2_t<packx2<T, N, SimdExt> > : public std::true_type {};\n\ntemplate <typename T> concept is_packx2_c = is_packx2_t<T>::value;\n#define NSIMD_CONCEPT_PACKX2 nsimd::is_packx2_c\n#else\n#define NSIMD_CONCEPT_PACKX2 typename\n#endif\n\n// ----------------------------------------------------------------------------\n// Definition of SOA of degree 3\n\ntemplate <NSIMD_CONCEPT_VALUE_TYPE T, int N = 1,\n          NSIMD_CONCEPT_SIMD_EXT SimdExt = NSIMD_SIMD>\nNSIMD_STRUCT packx3;\n\ntemplate <NSIMD_CONCEPT_VALUE_TYPE T, NSIMD_CONCEPT_SIMD_EXT SimdExt>\nNSIMD_STRUCT packx3<T, 1, SimdExt> {\n  typedef typename simd_traits<T, SimdExt>::simd_vector simd_vector;\n  typedef T value_type;\n  typedef SimdExt simd_ext;\n  static const int unroll = 1;\n  static const int soa_num_packs = 3;\n\n  pack<T, 1, SimdExt> v0;\n  pack<T, 1, SimdExt> v1;\n  pack<T, 1, SimdExt> v2;\n\n  void set_car(simd_vector v0_, simd_vector v1_, simd_vector v2_) {\n    v0.car = v0_;\n    v1.car = v1_;\n    v2.car = v2_;\n  }\n};\n\ntemplate <NSIMD_CONCEPT_VALUE_TYPE T, int N, NSIMD_CONCEPT_SIMD_EXT SimdExt>\nNSIMD_STRUCT packx3 {\n  typedef typename simd_traits<T, SimdExt>::simd_vector simd_vector;\n  typedef T value_type;\n  typedef SimdExt simd_ext;\n  static const int unroll = N;\n  static const int soa_num_packs = 3;\n\n  pack<T, N, SimdExt> v0;\n  pack<T, N, SimdExt> v1;\n  pack<T, N, SimdExt> v2;\n\n  void set_car(simd_vector v0_, simd_vector v1_, simd_vector v2_) {\n    v0.car = v0_;\n    v1.car = v1_;\n    v2.car = v2_;\n  }\n\n  void set_cdr(pack<T, N - 1, SimdExt> const &v0_,\n               pack<T, N - 1, SimdExt> const &v1_,\n               pack<T, N - 1, SimdExt> const &v2_) {\n    v0.cdr = v0_;\n    v1.cdr = v1_;\n    v2.cdr = v2_;\n  }\n};\n\n#if NSIMD_CXX >= 2020\ntemplate <typename T> struct is_packx3_t : public std::false_type {};\n\ntemplate <NSIMD_CONCEPT_VALUE_TYPE T, int N, 
NSIMD_CONCEPT_SIMD_EXT SimdExt>\nstruct is_packx3_t<packx3<T, N, SimdExt> > : public std::true_type {};\n\ntemplate <typename T> concept is_packx3_c = is_packx3_t<T>::value;\n#define NSIMD_CONCEPT_PACKX3 nsimd::is_packx3_c\n#else\n#define NSIMD_CONCEPT_PACKX3 typename\n#endif\n\n// ----------------------------------------------------------------------------\n// Definition of SOA of degree 4\n\ntemplate <NSIMD_CONCEPT_VALUE_TYPE T, int N = 1,\n          NSIMD_CONCEPT_SIMD_EXT SimdExt = NSIMD_SIMD>\nNSIMD_STRUCT packx4;\n\ntemplate <NSIMD_CONCEPT_VALUE_TYPE T, NSIMD_CONCEPT_SIMD_EXT SimdExt>\nNSIMD_STRUCT packx4<T, 1, SimdExt> {\n  typedef typename simd_traits<T, SimdExt>::simd_vector simd_vector;\n  typedef T value_type;\n  typedef SimdExt simd_ext;\n  static const int unroll = 1;\n  static const int soa_num_packs = 4;\n\n  pack<T, 1, SimdExt> v0;\n  pack<T, 1, SimdExt> v1;\n  pack<T, 1, SimdExt> v2;\n  pack<T, 1, SimdExt> v3;\n\n  void set_car(simd_vector v0_, simd_vector v1_, simd_vector v2_,\n               simd_vector v3_) {\n    v0.car = v0_;\n    v1.car = v1_;\n    v2.car = v2_;\n    v3.car = v3_;\n  }\n};\n\ntemplate <NSIMD_CONCEPT_VALUE_TYPE T, int N, NSIMD_CONCEPT_SIMD_EXT SimdExt>\nNSIMD_STRUCT packx4 {\n  typedef typename simd_traits<T, SimdExt>::simd_vector simd_vector;\n  typedef T value_type;\n  typedef SimdExt simd_ext;\n  static const int unroll = N;\n  static const int soa_num_packs = 4;\n\n  pack<T, N, SimdExt> v0;\n  pack<T, N, SimdExt> v1;\n  pack<T, N, SimdExt> v2;\n  pack<T, N, SimdExt> v3;\n\n  void set_car(simd_vector v0_, simd_vector v1_, simd_vector v2_,\n               simd_vector v3_) {\n    v0.car = v0_;\n    v1.car = v1_;\n    v2.car = v2_;\n    v3.car = v3_;\n  }\n\n  void set_cdr(\n      pack<T, N - 1, SimdExt> const &v0_, pack<T, N - 1, SimdExt> const &v1_,\n      pack<T, N - 1, SimdExt> const &v2_, pack<T, N - 1, SimdExt> const &v3_) {\n    v0.cdr = v0_;\n    v1.cdr = v1_;\n    v2.cdr = v2_;\n    v3.cdr = v3_;\n  }\n};\n\n#if 
NSIMD_CXX >= 2020\ntemplate <typename T> struct is_packx4_t : public std::false_type {};\n\ntemplate <NSIMD_CONCEPT_VALUE_TYPE T, int N, NSIMD_CONCEPT_SIMD_EXT SimdExt>\nstruct is_packx4_t<packx4<T, N, SimdExt> > : public std::true_type {};\n\ntemplate <typename T> concept is_packx4_c = is_packx4_t<T>::value;\n#define NSIMD_CONCEPT_PACKX4 nsimd::is_packx4_c\n#else\n#define NSIMD_CONCEPT_PACKX4 typename\n#endif\n\n// ----------------------------------------------------------------------------\n// A C++20 concept\n\n#if NSIMD_CXX >=2020\ntemplate <typename T>\nconcept any_pack_c = is_pack_c<T> || is_packl_c<T> || is_packx1_c<T> ||\n                     is_packx2_c<T> || is_packx3_c<T> || is_packx4_c<T>;\n#define NSIMD_CONCEPT_ANY_PACK nsimd::any_pack_c\n#else\n#define NSIMD_CONCEPT_ANY_PACK typename\n#endif\n\n// ----------------------------------------------------------------------------\n// The len function cannot be auto-generated\n\ntemplate <NSIMD_CONCEPT_VALUE_TYPE T, int N, NSIMD_CONCEPT_SIMD_EXT SimdExt>\nint len(pack<T, N, SimdExt> const &) {\n  return N * len(T(), SimdExt());\n}\n\ntemplate <NSIMD_CONCEPT_VALUE_TYPE T, int N, NSIMD_CONCEPT_SIMD_EXT SimdExt>\nint len(packl<T, N, SimdExt> const &) {\n  return N * len(T(), SimdExt());\n}\n\ntemplate <NSIMD_CONCEPT_VALUE_TYPE T, int N, NSIMD_CONCEPT_SIMD_EXT SimdExt>\nint len(packx1<T, N, SimdExt> const &) {\n  return N * len(T(), SimdExt());\n}\n\ntemplate <NSIMD_CONCEPT_VALUE_TYPE T, int N, NSIMD_CONCEPT_SIMD_EXT SimdExt>\nint len(packx2<T, N, SimdExt> const &) {\n  return 2 * N * len(T(), SimdExt());\n}\n\ntemplate <NSIMD_CONCEPT_VALUE_TYPE T, int N, NSIMD_CONCEPT_SIMD_EXT SimdExt>\nint len(packx3<T, N, SimdExt> const &) {\n  return 3 * N * len(T(), SimdExt());\n}\n\ntemplate <NSIMD_CONCEPT_VALUE_TYPE T, int N, NSIMD_CONCEPT_SIMD_EXT SimdExt>\nint len(packx4<T, N, SimdExt> const &) {\n  return 4 * N * len(T(), SimdExt());\n}\n\ntemplate <NSIMD_CONCEPT_ANY_PACK Pack> int len() { return len(Pack()); }\n\n// 
----------------------------------------------------------------------------\n// The addv function cannot be auto-generated\n\ntemplate <NSIMD_CONCEPT_VALUE_TYPE T, NSIMD_CONCEPT_SIMD_EXT SimdExt>\nT addv(pack<T, 1, SimdExt> const &a0) {\n  return addv(a0.car, T(), SimdExt());\n}\n\ntemplate <NSIMD_CONCEPT_VALUE_TYPE T, int N, NSIMD_CONCEPT_SIMD_EXT SimdExt>\nT addv(pack<T, N, SimdExt> const &a0) {\n  return addv(a0.car, T(), SimdExt()) + addv(a0.cdr);\n}\n\n// ----------------------------------------------------------------------------\n// The all function cannot be auto-generated\n\ntemplate <NSIMD_CONCEPT_VALUE_TYPE T, NSIMD_CONCEPT_SIMD_EXT SimdExt>\nint all(packl<T, 1, SimdExt> const &a0) {\n  return all(a0.car, T(), SimdExt());\n}\n\ntemplate <NSIMD_CONCEPT_VALUE_TYPE T, int N, NSIMD_CONCEPT_SIMD_EXT SimdExt>\nint all(packl<T, N, SimdExt> const &a0) {\n  return all(a0.car, T(), SimdExt()) && all(a0.cdr);\n}\n\n// ----------------------------------------------------------------------------\n// The any function cannot be auto-generated\n\ntemplate <NSIMD_CONCEPT_VALUE_TYPE T, NSIMD_CONCEPT_SIMD_EXT SimdExt>\nint any(packl<T, 1, SimdExt> const &a0) {\n  return any(a0.car, T(), SimdExt());\n}\n\ntemplate <NSIMD_CONCEPT_VALUE_TYPE T, int N, NSIMD_CONCEPT_SIMD_EXT SimdExt>\nint any(packl<T, N, SimdExt> const &a0) {\n  return any(a0.car, T(), SimdExt()) || any(a0.cdr);\n}\n\n// ----------------------------------------------------------------------------\n// The nbtrue function cannot be auto-generated\n\ntemplate <NSIMD_CONCEPT_VALUE_TYPE T, NSIMD_CONCEPT_SIMD_EXT SimdExt>\nint nbtrue(packl<T, 1, SimdExt> const &a0) {\n  return nbtrue(a0.car, T(), SimdExt());\n}\n\ntemplate <NSIMD_CONCEPT_VALUE_TYPE T, int N, NSIMD_CONCEPT_SIMD_EXT SimdExt>\nint nbtrue(packl<T, N, SimdExt> const &a0) {\n  return nbtrue(a0.car, T(), SimdExt()) + nbtrue(a0.cdr);\n}\n\n// ----------------------------------------------------------------------------\n// Include functions that act on 
packs\n\n} // namespace nsimd\n\n#include <nsimd/cxx_adv_api_functions.hpp>\n\nnamespace nsimd {\n\n// ----------------------------------------------------------------------------\n// Arithmetic and assignment operators\n\n// add\ntemplate <NSIMD_CONCEPT_VALUE_TYPE T, NSIMD_CONCEPT_SIMD_EXT SimdExt>\npack<T, 1, SimdExt> &pack<T, 1, SimdExt>::\noperator+=(pack<T, 1, SimdExt> const &other) {\n  this->car = add(this->car, other.car, T());\n  return *this;\n}\n\ntemplate <NSIMD_CONCEPT_VALUE_TYPE T, int N, NSIMD_CONCEPT_SIMD_EXT SimdExt>\npack<T, N, SimdExt> &pack<T, N, SimdExt>::\noperator+=(pack<T, N, SimdExt> const &other) {\n  this->car = add(this->car, other.car, T());\n  this->cdr += other.cdr;\n  return *this;\n}\n\n// sub\ntemplate <NSIMD_CONCEPT_VALUE_TYPE T, NSIMD_CONCEPT_SIMD_EXT SimdExt>\npack<T, 1, SimdExt> &pack<T, 1, SimdExt>::\noperator-=(pack<T, 1, SimdExt> const &other) {\n  this->car = sub(this->car, other.car, T());\n  return *this;\n}\n\ntemplate <NSIMD_CONCEPT_VALUE_TYPE T, int N, NSIMD_CONCEPT_SIMD_EXT SimdExt>\npack<T, N, SimdExt> &pack<T, N, SimdExt>::\noperator-=(pack<T, N, SimdExt> const &other) {\n  this->car = sub(this->car, other.car, T());\n  this->cdr -= other.cdr;\n  return *this;\n}\n\n// mul\ntemplate <NSIMD_CONCEPT_VALUE_TYPE T, NSIMD_CONCEPT_SIMD_EXT SimdExt>\npack<T, 1, SimdExt> &pack<T, 1, SimdExt>::\noperator*=(pack<T, 1, SimdExt> const &other) {\n  this->car = mul(this->car, other.car, T());\n  return *this;\n}\n\ntemplate <NSIMD_CONCEPT_VALUE_TYPE T, int N, NSIMD_CONCEPT_SIMD_EXT SimdExt>\npack<T, N, SimdExt> &pack<T, N, SimdExt>::\noperator*=(pack<T, N, SimdExt> const &other) {\n  this->car = mul(this->car, other.car, T());\n  this->cdr *= other.cdr;\n  return *this;\n}\n\n// div\ntemplate <NSIMD_CONCEPT_VALUE_TYPE T, NSIMD_CONCEPT_SIMD_EXT SimdExt>\npack<T, 1, SimdExt> &pack<T, 1, SimdExt>::\noperator/=(pack<T, 1, SimdExt> const &other) {\n  this->car = div(this->car, other.car, T());\n  return *this;\n}\n\ntemplate 
<NSIMD_CONCEPT_VALUE_TYPE T, int N, NSIMD_CONCEPT_SIMD_EXT SimdExt>\npack<T, N, SimdExt> &pack<T, N, SimdExt>::\noperator/=(pack<T, N, SimdExt> const &other) {\n  this->car = div(this->car, other.car, T());\n  this->cdr /= other.cdr;\n  return *this;\n}\n\n// orb\ntemplate <NSIMD_CONCEPT_VALUE_TYPE T, NSIMD_CONCEPT_SIMD_EXT SimdExt>\npack<T, 1, SimdExt> &pack<T, 1, SimdExt>::\noperator|=(pack<T, 1, SimdExt> const &other) {\n  this->car = orb(this->car, other.car, T());\n  return *this;\n}\n\ntemplate <NSIMD_CONCEPT_VALUE_TYPE T, int N, NSIMD_CONCEPT_SIMD_EXT SimdExt>\npack<T, N, SimdExt> &pack<T, N, SimdExt>::\noperator|=(pack<T, N, SimdExt> const &other) {\n  this->car = orb(this->car, other.car, T());\n  this->cdr |= other.cdr;\n  return *this;\n}\n\n// andb\ntemplate <NSIMD_CONCEPT_VALUE_TYPE T, NSIMD_CONCEPT_SIMD_EXT SimdExt>\npack<T, 1, SimdExt> &pack<T, 1, SimdExt>::\noperator&=(pack<T, 1, SimdExt> const &other) {\n  this->car = andb(this->car, other.car, T());\n  return *this;\n}\n\ntemplate <NSIMD_CONCEPT_VALUE_TYPE T, int N, NSIMD_CONCEPT_SIMD_EXT SimdExt>\npack<T, N, SimdExt> &pack<T, N, SimdExt>::\noperator&=(pack<T, N, SimdExt> const &other) {\n  this->car = andb(this->car, other.car, T());\n  this->cdr &= other.cdr;\n  return *this;\n}\n\n// xorb\ntemplate <NSIMD_CONCEPT_VALUE_TYPE T, NSIMD_CONCEPT_SIMD_EXT SimdExt>\npack<T, 1, SimdExt> &pack<T, 1, SimdExt>::\noperator^=(pack<T, 1, SimdExt> const &other) {\n  this->car = xorb(this->car, other.car, T());\n  return *this;\n}\n\ntemplate <NSIMD_CONCEPT_VALUE_TYPE T, int N, NSIMD_CONCEPT_SIMD_EXT SimdExt>\npack<T, N, SimdExt> &pack<T, N, SimdExt>::\noperator^=(pack<T, N, SimdExt> const &other) {\n  this->car = xorb(this->car, other.car, T());\n  this->cdr ^= other.cdr;\n  return *this;\n}\n\n// left shift\ntemplate <NSIMD_CONCEPT_VALUE_TYPE T, NSIMD_CONCEPT_SIMD_EXT SimdExt>\npack<T, 1, SimdExt> &pack<T, 1, SimdExt>::operator<<=(int s) {\n  this->car = shl(this->car, s, T());\n  return 
*this;\n}\n\ntemplate <NSIMD_CONCEPT_VALUE_TYPE T, int N, NSIMD_CONCEPT_SIMD_EXT SimdExt>\npack<T, N, SimdExt> &pack<T, N, SimdExt>::operator<<=(int s) {\n  this->car = shl(this->car, s, T());\n  this->cdr <<= s;\n  return *this;\n}\n\n// right shift\ntemplate <NSIMD_CONCEPT_VALUE_TYPE T, NSIMD_CONCEPT_SIMD_EXT SimdExt>\npack<T, 1, SimdExt> &pack<T, 1, SimdExt>::operator>>=(int s) {\n  this->car = shr(this->car, s, T());\n  return *this;\n}\n\ntemplate <NSIMD_CONCEPT_VALUE_TYPE T, int N, NSIMD_CONCEPT_SIMD_EXT SimdExt>\npack<T, N, SimdExt> &pack<T, N, SimdExt>::operator>>=(int s) {\n  this->car = shr(this->car, s, T());\n  this->cdr >>= s;\n  return *this;\n}\n\n// ----------------------------------------------------------------------------\n// The if_else function cannot be auto-generated\n\ntemplate <NSIMD_CONCEPT_VALUE_TYPE L, NSIMD_CONCEPT_VALUE_TYPE T,\n          NSIMD_CONCEPT_SIMD_EXT SimdExt>\nNSIMD_REQUIRES(sizeof_v<L> == sizeof_v<T>)\npack<T, 1, SimdExt>\nif_else(packl<L, 1, SimdExt> const &a0, pack<T, 1, SimdExt> const &a1,\n        pack<T, 1, SimdExt> const &a2) {\n  pack<T, 1, SimdExt> ret;\n  ret.car = if_else(a0.car, a1.car, a2.car, L(), T(), SimdExt());\n  return ret;\n}\n\ntemplate <NSIMD_CONCEPT_VALUE_TYPE L, NSIMD_CONCEPT_VALUE_TYPE T, int N,\n          NSIMD_CONCEPT_SIMD_EXT SimdExt>\nNSIMD_REQUIRES(sizeof_v<L> == sizeof_v<T>)\npack<T, N, SimdExt>\nif_else(packl<L, N, SimdExt> const &a0, pack<T, N, SimdExt> const &a1,\n        pack<T, N, SimdExt> const &a2) {\n  pack<T, N, SimdExt> ret;\n  ret.car = if_else(a0.car, a1.car, a2.car, L(), T(), SimdExt());\n  ret.cdr = if_else(a0.cdr, a1.cdr, a2.cdr);\n  return ret;\n}\n\n// ----------------------------------------------------------------------------\n// Mask loads and stores cannot be auto-generated\n\ntemplate <NSIMD_CONCEPT_VALUE_TYPE L, NSIMD_CONCEPT_VALUE_TYPE T, int N,\n          NSIMD_CONCEPT_SIMD_EXT SimdExt>\nNSIMD_REQUIRES(sizeof_v<L> == sizeof_v<T>)\nvoid mask_storea(packl<L, N, SimdExt> 
const &a0, T *a1,\n                 pack<T, N, SimdExt> const &a2) {\n  mask_storea1(reinterpretl<packl<T, N, SimdExt> >(a0), a1, a2);\n}\n\ntemplate <NSIMD_CONCEPT_VALUE_TYPE L, NSIMD_CONCEPT_VALUE_TYPE T, int N,\n          NSIMD_CONCEPT_SIMD_EXT SimdExt>\nNSIMD_REQUIRES(sizeof_v<L> == sizeof_v<T>)\nvoid mask_storeu(packl<L, N, SimdExt> const &a0, T *a1,\n                 pack<T, N, SimdExt> const &a2) {\n  mask_storeu1(reinterpretl<packl<T, N, SimdExt> >(a0), a1, a2);\n}\n\ntemplate <NSIMD_CONCEPT_VALUE_TYPE L, NSIMD_CONCEPT_VALUE_TYPE T, int N,\n          NSIMD_CONCEPT_SIMD_EXT SimdExt>\nNSIMD_REQUIRES(sizeof_v<L> == sizeof_v<T>)\npack<T, N, SimdExt> maskz_loada(packl<L, N, SimdExt> const &a0, const T *a1) {\n  return maskz_loada1(reinterpretl<packl<T, N, SimdExt> >(a0), a1);\n}\n\ntemplate <NSIMD_CONCEPT_VALUE_TYPE L, NSIMD_CONCEPT_VALUE_TYPE T, int N,\n          NSIMD_CONCEPT_SIMD_EXT SimdExt>\nNSIMD_REQUIRES(sizeof_v<L> == sizeof_v<T>)\npack<T, N, SimdExt> maskz_loadu(packl<L, N, SimdExt> const &a0, const T *a1) {\n  return maskz_loadu1(reinterpretl<packl<T, N, SimdExt> >(a0), a1);\n}\n\ntemplate <NSIMD_CONCEPT_VALUE_TYPE L, NSIMD_CONCEPT_VALUE_TYPE T, int N,\n          NSIMD_CONCEPT_SIMD_EXT SimdExt>\nNSIMD_REQUIRES(sizeof_v<L> == sizeof_v<T>)\npack<T, N, SimdExt> masko_loada(packl<L, N, SimdExt> const &a0, const T *a1,\n                                pack<T, N, SimdExt> const &a2) {\n  return masko_loada1(reinterpretl<packl<T, N, SimdExt> >(a0), a1, a2);\n}\n\ntemplate <NSIMD_CONCEPT_VALUE_TYPE L, NSIMD_CONCEPT_VALUE_TYPE T, int N,\n          NSIMD_CONCEPT_SIMD_EXT SimdExt>\nNSIMD_REQUIRES(sizeof_v<L> == sizeof_v<T>)\npack<T, N, SimdExt> masko_loadu(packl<L, N, SimdExt> const &a0, const T *a1,\n                                pack<T, N, SimdExt> const &a2) {\n  return masko_loadu1(reinterpretl<packl<T, N, SimdExt> >(a0), a1, a2);\n}\n\n// ----------------------------------------------------------------------------\n// Loads/Stores templated on the 
alignment cannot be auto-generated\n\nnamespace detail {\n\ntemplate <NSIMD_CONCEPT_PACKL P> struct loadz_return_t {\n  typedef nsimd::pack<typename P::value_type, P::unroll, typename P::simd_ext>\n      type;\n};\n\ntemplate <NSIMD_CONCEPT_ANY_PACK SimdVector, NSIMD_CONCEPT_ALIGNMENT Alignment>\nstruct load_helper {};\n\ntemplate <NSIMD_CONCEPT_ANY_PACK SimdVector>\nstruct load_helper<SimdVector, aligned> {\n  typedef typename SimdVector::value_type T;\n  typedef typename SimdVector::simd_ext simd_ext;\n  static const int N = SimdVector::unroll;\n\n  static SimdVector load(const T *a0) { return loada<SimdVector>(a0); }\n  static SimdVector loadl(const T *a0) { return loadla<SimdVector>(a0); }\n  static SimdVector load2(const T *a0) { return load2a<SimdVector>(a0); }\n  static SimdVector load3(const T *a0) { return load3a<SimdVector>(a0); }\n  static SimdVector load4(const T *a0) { return load4a<SimdVector>(a0); }\n\n  static SimdVector maskz_load(packl<T, N, simd_ext> const &a0, const T *a1) {\n    return maskz_loada(a0, a1);\n  }\n\n  static pack<T, N, simd_ext> masko_load(packl<T, N, simd_ext> const &a0,\n                                         const T *a1,\n                                         pack<T, N, simd_ext> const &a2) {\n    return masko_loada(a0, a1, a2);\n  }\n};\n\ntemplate <typename SimdVector> struct load_helper<SimdVector, unaligned> {\n  typedef typename SimdVector::value_type T;\n  typedef typename SimdVector::simd_ext simd_ext;\n  static const int N = SimdVector::unroll;\n\n  static SimdVector load(const T *a0) { return loadu<SimdVector>(a0); }\n  static SimdVector loadl(const T *a0) { return loadlu<SimdVector>(a0); }\n  static SimdVector load2(const T *a0) { return load2u<SimdVector>(a0); }\n  static SimdVector load3(const T *a0) { return load3u<SimdVector>(a0); }\n  static SimdVector load4(const T *a0) { return load4u<SimdVector>(a0); }\n\n  static SimdVector maskz_load(packl<T, N, simd_ext> const &a0, const T *a1) {\n    return 
maskz_loadu(a0, a1);\n  }\n\n  static pack<T, N, simd_ext> masko_load(packl<T, N, simd_ext> const &a0,\n                                         const T *a1,\n                                         pack<T, N, simd_ext> const &a2) {\n    return masko_loadu(a0, a1, a2);\n  }\n};\n\ntemplate <NSIMD_CONCEPT_ALIGNMENT Alignment> struct store_helper {};\n\n#define NSIMD_T typename P::value_type\n\ntemplate <> struct store_helper<aligned> {\n  template <NSIMD_CONCEPT_PACK P> static void store(NSIMD_T *a0, P const &a1) {\n    storea(a0, a1);\n  }\n\n  template <NSIMD_CONCEPT_PACKL PL, NSIMD_CONCEPT_PACK P>\n#if NSIMD_CXX >= 2020\n  requires std::is_same_v<typename PL::value_type, typename P::value_type>\n#endif\n  static void mask_store(PL const &a0, NSIMD_T *a1, P const &a2) {\n    mask_storea(a0, a1, a2);\n  }\n\n  template <NSIMD_CONCEPT_PACK P> static void storel(NSIMD_T *a0, P const &a1) {\n    storela(a0, a1);\n  }\n\n  template <NSIMD_CONCEPT_PACK P>\n  static void store2(NSIMD_T *a0, P const &a1, P const &a2) {\n    store2a(a0, a1, a2);\n  }\n\n  template <NSIMD_CONCEPT_PACK P>\n  static void store3(NSIMD_T *a0, P const &a1, P const &a2, P const &a3) {\n    store3a(a0, a1, a2, a3);\n  }\n\n  template <NSIMD_CONCEPT_PACK P>\n  static void store4(NSIMD_T *a0, P const &a1, P const &a2, P const &a3,\n                     P const &a4) {\n    store4a(a0, a1, a2, a3, a4);\n  }\n};\n\ntemplate <> struct store_helper<unaligned> {\n  template <NSIMD_CONCEPT_PACK P> static void store(NSIMD_T *a0, P const &a1) {\n    storeu(a0, a1);\n  }\n\n  template <NSIMD_CONCEPT_PACKL PL, NSIMD_CONCEPT_PACK P>\n#if NSIMD_CXX >= 2020\n  requires std::is_same_v<typename PL::value_type, typename P::value_type>\n#endif\n  static void mask_store(PL const &a0, NSIMD_T *a1, P const &a2) {\n    mask_storeu(a0, a1, a2);\n  }\n\n  template <NSIMD_CONCEPT_PACK P> static void storel(NSIMD_T *a0, P const &a1) {\n    storelu(a0, a1);\n  }\n\n  template <NSIMD_CONCEPT_PACK P>\n  static void 
store2(NSIMD_T *a0, P const &a1, P const &a2) {\n    store2u(a0, a1, a2);\n  }\n\n  template <NSIMD_CONCEPT_PACK P>\n  static void store3(NSIMD_T *a0, P const &a1, P const &a2, P const &a3) {\n    store3u(a0, a1, a2, a3);\n  }\n\n  template <NSIMD_CONCEPT_PACK P>\n  static void store4(NSIMD_T *a0, P const &a1, P const &a2, P const &a3,\n                     P const &a4) {\n    store4u(a0, a1, a2, a3, a4);\n  }\n};\n\n#undef NSIMD_T\n\n} // namespace detail\n\ntemplate <NSIMD_CONCEPT_PACK SimdVector, NSIMD_CONCEPT_ALIGNMENT Alignment>\nSimdVector load(const typename SimdVector::value_type *ptr) {\n  return detail::load_helper<SimdVector, Alignment>::load(ptr);\n}\n\ntemplate <NSIMD_CONCEPT_ALIGNMENT Alignment, NSIMD_CONCEPT_PACKL Packl>\npack<typename Packl::value_type, Packl::unroll, typename Packl::simd_ext>\nmaskz_load(Packl const &pl, const typename Packl::value_type *ptr) {\n  return detail::load_helper<pack<typename Packl::value_type, Packl::unroll,\n                                  typename Packl::simd_ext>,\n                             Alignment>::maskz_load(pl, ptr);\n}\n\ntemplate <NSIMD_CONCEPT_ALIGNMENT Alignment, NSIMD_CONCEPT_PACKL Packl,\n          NSIMD_CONCEPT_PACK Pack>\nPack masko_load(Packl const &pl, const typename Pack::value_type *ptr,\n                Pack const &p) {\n  return detail::load_helper<Pack, Alignment>::masko_load(pl, ptr, p);\n}\n\ntemplate <NSIMD_CONCEPT_PACK SimdVector, NSIMD_CONCEPT_ALIGNMENT Alignment>\nSimdVector loadl(const typename SimdVector::value_type *ptr) {\n  return detail::load_helper<SimdVector, Alignment>::loadl(ptr);\n}\n\ntemplate <NSIMD_CONCEPT_PACKX2 SimdVector, NSIMD_CONCEPT_ALIGNMENT Alignment>\nSimdVector load2(const typename SimdVector::value_type *ptr) {\n  return detail::load_helper<SimdVector, Alignment>::load2(ptr);\n}\n\ntemplate <NSIMD_CONCEPT_PACKX3 SimdVector, NSIMD_CONCEPT_ALIGNMENT Alignment>\nSimdVector load3(const typename SimdVector::value_type *ptr) {\n  return 
detail::load_helper<SimdVector, Alignment>::load3(ptr);\n}\n\ntemplate <NSIMD_CONCEPT_PACKX4 SimdVector, NSIMD_CONCEPT_ALIGNMENT Alignment>\nSimdVector load4(const typename SimdVector::value_type *ptr) {\n  return detail::load_helper<SimdVector, Alignment>::load4(ptr);\n}\n\ntemplate <NSIMD_CONCEPT_ALIGNMENT Alignment, NSIMD_CONCEPT_PACK Pack>\nvoid store(typename Pack::value_type *ptr, Pack const &p) {\n  detail::store_helper<Alignment>::store(ptr, p);\n}\n\ntemplate <NSIMD_CONCEPT_ALIGNMENT Alignment, NSIMD_CONCEPT_PACKL Packl,\n          NSIMD_CONCEPT_PACK Pack>\nvoid mask_store(Packl const &pl, typename Pack::value_type *ptr,\n                Pack const &p) {\n  detail::store_helper<Alignment>::mask_store(pl, ptr, p);\n}\n\ntemplate <NSIMD_CONCEPT_ALIGNMENT Alignment, NSIMD_CONCEPT_PACKL Packl>\nvoid storel(typename Packl::value_type *ptr, Packl const &pl) {\n  return detail::store_helper<Alignment>::storel(ptr, pl);\n}\n\ntemplate <NSIMD_CONCEPT_ALIGNMENT Alignment, NSIMD_CONCEPT_PACK Pack>\nvoid store2(typename Pack::value_type *ptr, Pack const &p1, Pack const &p2) {\n  return detail::store_helper<Alignment>::store2(ptr, p1, p2);\n}\n\ntemplate <NSIMD_CONCEPT_ALIGNMENT Alignment, NSIMD_CONCEPT_PACK Pack>\nvoid store3(typename Pack::value_type *ptr, Pack const &p1, Pack const &p2,\n            Pack const &p3) {\n  return detail::store_helper<Alignment>::store3(ptr, p1, p2, p3);\n}\n\ntemplate <NSIMD_CONCEPT_ALIGNMENT Alignment, NSIMD_CONCEPT_PACK Pack>\nvoid store4(typename Pack::value_type *ptr, Pack const &p1, Pack const &p2,\n            Pack const &p3, Pack const &p4) {\n  return detail::store_helper<Alignment>::store4(ptr, p1, p2, p3, p4);\n}\n\n// ----------------------------------------------------------------------------\n\ntemplate <typename T> T native_register(T a) { return a; }\n\ntemplate <NSIMD_CONCEPT_VALUE_TYPE T, NSIMD_CONCEPT_SIMD_EXT SimdExt>\ntypename pack<T, 1, SimdExt>::simd_vector\nnative_register(pack<T, 1, SimdExt> const &a) {\n  
return a.car;\n}\n\n// ----------------------------------------------------------------------------\n// get_pack\n\ntemplate <NSIMD_CONCEPT_VALUE_TYPE T, int N, NSIMD_CONCEPT_SIMD_EXT SimdExt,\n          template <typename, int, typename> class packx, int Ix>\nstruct get_pack_helper {};\n\n// ----------------------------------------------------------------------------\n// get_pack_helper - packx1\n\ntemplate <NSIMD_CONCEPT_VALUE_TYPE T, int N, NSIMD_CONCEPT_SIMD_EXT SimdExt,\n          int Ix>\nstruct get_pack_helper<T, N, SimdExt, packx1, Ix> {};\n\ntemplate <NSIMD_CONCEPT_VALUE_TYPE T, int N, NSIMD_CONCEPT_SIMD_EXT SimdExt>\nstruct get_pack_helper<T, N, SimdExt, packx1, 0> {\n  const nsimd::pack<T, N, SimdExt> &\n  operator()(const packx1<T, N, SimdExt> &packx_) const {\n    return packx_.v0;\n  }\n};\n\n// ----------------------------------------------------------------------------\n// get_pack_helper - packx2\n\ntemplate <NSIMD_CONCEPT_VALUE_TYPE T, int N, NSIMD_CONCEPT_SIMD_EXT SimdExt,\n          int Ix>\nstruct get_pack_helper<T, N, SimdExt, packx2, Ix> {};\n\ntemplate <NSIMD_CONCEPT_VALUE_TYPE T, int N, NSIMD_CONCEPT_SIMD_EXT SimdExt>\nstruct get_pack_helper<T, N, SimdExt, packx2, 0> {\n  const nsimd::pack<T, N, SimdExt> &\n  operator()(const packx2<T, N, SimdExt> &packx_) const {\n    return packx_.v0;\n  }\n};\n\ntemplate <NSIMD_CONCEPT_VALUE_TYPE T, int N, NSIMD_CONCEPT_SIMD_EXT SimdExt>\nstruct get_pack_helper<T, N, SimdExt, packx2, 1> {\n  const nsimd::pack<T, N, SimdExt> &\n  operator()(const packx2<T, N, SimdExt> &packx_) const {\n    return packx_.v1;\n  }\n};\n\n// ----------------------------------------------------------------------------\n// get_pack_helper - packx3\n\ntemplate <NSIMD_CONCEPT_VALUE_TYPE T, int N, NSIMD_CONCEPT_SIMD_EXT SimdExt,\n          int Ix>\nstruct get_pack_helper<T, N, SimdExt, packx3, Ix> {};\n\ntemplate <NSIMD_CONCEPT_VALUE_TYPE T, int N, NSIMD_CONCEPT_SIMD_EXT SimdExt>\nstruct get_pack_helper<T, N, SimdExt, packx3, 0> 
{\n  const nsimd::pack<T, N, SimdExt> &\n  operator()(const packx3<T, N, SimdExt> &packx_) const {\n    return packx_.v0;\n  }\n};\n\ntemplate <NSIMD_CONCEPT_VALUE_TYPE T, int N, NSIMD_CONCEPT_SIMD_EXT SimdExt>\nstruct get_pack_helper<T, N, SimdExt, packx3, 1> {\n  const nsimd::pack<T, N, SimdExt> &\n  operator()(const packx3<T, N, SimdExt> &packx_) const {\n    return packx_.v1;\n  }\n};\n\ntemplate <NSIMD_CONCEPT_VALUE_TYPE T, int N, NSIMD_CONCEPT_SIMD_EXT SimdExt>\nstruct get_pack_helper<T, N, SimdExt, packx3, 2> {\n  const nsimd::pack<T, N, SimdExt> &\n  operator()(const packx3<T, N, SimdExt> &packx_) const {\n    return packx_.v2;\n  }\n};\n\n// ----------------------------------------------------------------------------\n// get_pack_helper - packx4\n\ntemplate <NSIMD_CONCEPT_VALUE_TYPE T, int N, NSIMD_CONCEPT_SIMD_EXT SimdExt,\n          int Ix>\nstruct get_pack_helper<T, N, SimdExt, packx4, Ix> {};\n\ntemplate <NSIMD_CONCEPT_VALUE_TYPE T, int N, NSIMD_CONCEPT_SIMD_EXT SimdExt>\nstruct get_pack_helper<T, N, SimdExt, packx4, 0> {\n  const nsimd::pack<T, N, SimdExt> &\n  operator()(const packx4<T, N, SimdExt> &packx_) const {\n    return packx_.v0;\n  }\n};\n\ntemplate <NSIMD_CONCEPT_VALUE_TYPE T, int N, NSIMD_CONCEPT_SIMD_EXT SimdExt>\nstruct get_pack_helper<T, N, SimdExt, packx4, 1> {\n  const nsimd::pack<T, N, SimdExt> &\n  operator()(const packx4<T, N, SimdExt> &packx_) const {\n    return packx_.v1;\n  }\n};\n\ntemplate <NSIMD_CONCEPT_VALUE_TYPE T, int N, NSIMD_CONCEPT_SIMD_EXT SimdExt>\nstruct get_pack_helper<T, N, SimdExt, packx4, 2> {\n  const nsimd::pack<T, N, SimdExt> &\n  operator()(const packx4<T, N, SimdExt> &packx_) const {\n    return packx_.v2;\n  }\n};\n\ntemplate <NSIMD_CONCEPT_VALUE_TYPE T, int N, NSIMD_CONCEPT_SIMD_EXT SimdExt>\nstruct get_pack_helper<T, N, SimdExt, packx4, 3> {\n  const nsimd::pack<T, N, SimdExt> &\n  operator()(const packx4<T, N, SimdExt> &packx_) const {\n    return packx_.v3;\n  }\n};\n\n// 
----------------------------------------------------------------------------\n// get_pack\n// get_pack for packx[Y]<T, 1..N, SimdExt> with Y = 1\n\ntemplate <int Ix, NSIMD_CONCEPT_VALUE_TYPE T, int N,\n          NSIMD_CONCEPT_SIMD_EXT SimdExt>\npack<T, N, SimdExt> get_pack(const pack<T, N, SimdExt> &pack_) {\n  nsimd_static_assert<0 == Ix>();\n  return pack_;\n}\n\n// ----------------------------------------------------------------------------\n// get_pack\n// get_pack for packx[Y]<T, 1..N, SimdExt> with Y in {2, 3, 4}\n\ntemplate <int Ix, NSIMD_CONCEPT_VALUE_TYPE T, int N,\n          NSIMD_CONCEPT_SIMD_EXT SimdExt,\n          template <typename, int, typename> class packx>\npack<T, N, SimdExt> get_pack(const packx<T, N, SimdExt> &packx_) {\n  return get_pack_helper<T, N, SimdExt, packx, Ix>()(packx_);\n}\n\n// ----------------------------------------------------------------------------\n// to_pack_trait\n\ntemplate <class _packx> struct to_pack_trait {};\n\ntemplate <NSIMD_CONCEPT_VALUE_TYPE T, int N, NSIMD_CONCEPT_SIMD_EXT SimdExt,\n          template <typename, int, typename> class _packx>\nstruct to_pack_trait<_packx<T, N, SimdExt> > {\n  typedef pack<T, _packx<T, N, SimdExt>::soa_num_packs * N, SimdExt>\n      value_type;\n};\n\n// ----------------------------------------------------------------------------\n// to_pack\n// to_pack for packx[Y]<T, 1..N, SimdExt> with Y = 1\n\ntemplate <NSIMD_CONCEPT_VALUE_TYPE T, NSIMD_CONCEPT_SIMD_EXT SimdExt>\npack<T, 1, SimdExt> to_pack(const pack<T, 1, SimdExt> &pack_) {\n  return pack_;\n}\n\ntemplate <NSIMD_CONCEPT_VALUE_TYPE T, int N, NSIMD_CONCEPT_SIMD_EXT SimdExt>\npack<T, N, SimdExt> to_pack(const pack<T, N, SimdExt> &pack_) {\n  return pack_;\n}\n\n// ----------------------------------------------------------------------------\n// to_pack\n// to_pack for packx[Y]<T, N = 1, SimdExt> with Y in {2, 3, 4}\n\ntemplate <NSIMD_CONCEPT_VALUE_TYPE T, NSIMD_CONCEPT_SIMD_EXT SimdExt>\npack<T, 1, SimdExt> to_pack(const packx1<T, 
1, SimdExt> &packx_) {\n\n  nsimd::pack<T, 1, SimdExt> pack_;\n  pack_.car = packx_.v0.car;\n\n  return pack_;\n}\n\ntemplate <NSIMD_CONCEPT_VALUE_TYPE T, NSIMD_CONCEPT_SIMD_EXT SimdExt>\npack<T, 2, SimdExt> to_pack(const packx2<T, 1, SimdExt> &packx_) {\n\n  nsimd::pack<T, 2, SimdExt> pack_;\n  pack_.car = packx_.v0.car;\n  pack_.cdr.car = packx_.v1.car;\n\n  return pack_;\n}\n\ntemplate <NSIMD_CONCEPT_VALUE_TYPE T, NSIMD_CONCEPT_SIMD_EXT SimdExt>\npack<T, 3, SimdExt> to_pack(const packx3<T, 1, SimdExt> &packx_) {\n\n  nsimd::pack<T, 3, SimdExt> pack_;\n  pack_.car = packx_.v0.car;\n  pack_.cdr.car = packx_.v1.car;\n  pack_.cdr.cdr.car = packx_.v2.car;\n  return pack_;\n}\n\ntemplate <NSIMD_CONCEPT_VALUE_TYPE T, NSIMD_CONCEPT_SIMD_EXT SimdExt>\npack<T, 4, SimdExt> to_pack(const packx4<T, 1, SimdExt> &packx_) {\n\n  nsimd::pack<T, 4, SimdExt> pack_;\n  pack_.car = packx_.v0.car;\n  pack_.cdr.car = packx_.v1.car;\n  pack_.cdr.cdr.car = packx_.v2.car;\n  pack_.cdr.cdr.cdr.car = packx_.v3.car;\n\n  return pack_;\n}\n\n// ----------------------------------------------------------------------------\n// to_pack for packx[Y]<T, (N > 1), SimdExt> with Y in {2, 3, 4}\n\n// Advance\ntemplate <NSIMD_CONCEPT_VALUE_TYPE T, int from_pack_init_N,\n          int from_pack_unroll_ix, int to_pack_unroll_ix,\n          int which_from_pack_ix, NSIMD_CONCEPT_SIMD_EXT SimdExt,\n          template <typename, int, typename> class packx>\nstruct to_pack_recurs_helper {\n  static pack<T, to_pack_unroll_ix, SimdExt>\n  to_pack(const packx<T, from_pack_init_N, SimdExt> &from_packx,\n          const pack<T, from_pack_unroll_ix, SimdExt> &from_pack) {\n    pack<T, to_pack_unroll_ix, SimdExt> to_pack_;\n    to_pack_.car = from_pack.car;\n    to_pack_.cdr =\n        to_pack_recurs_helper<T, from_pack_init_N, from_pack_unroll_ix - 1,\n                              to_pack_unroll_ix - 1, which_from_pack_ix,\n                              SimdExt, packx>::to_pack(from_packx,\n                        
                               from_pack.cdr);\n    return to_pack_;\n  }\n};\n\n// Base case\n// Base case condition: to_pack_unroll_ix == 1\ntemplate <NSIMD_CONCEPT_VALUE_TYPE T, int from_pack_init_N,\n          int which_from_pack_ix, NSIMD_CONCEPT_SIMD_EXT SimdExt,\n          template <typename, int, typename> class packx>\nstruct to_pack_recurs_helper<T, from_pack_init_N, 1 /* from_pack_unroll_ix */,\n                             1 /* to_pack_unroll_ix */, which_from_pack_ix,\n                             SimdExt, packx> {\n  static pack<T, 1, SimdExt>\n  to_pack(const packx<T, from_pack_init_N, SimdExt> &from_packx,\n          const pack<T, 1, SimdExt> &from_pack) {\n    (void)from_packx;\n    pack<T, 1, SimdExt> to_pack_;\n    to_pack_.car = from_pack.car; // simd_vector\n    return to_pack_;\n  }\n};\n\n// Switch: from_packx[i] --> from_packx[i+1]\n// Switch condition: from_pack_unroll_ix == 1 && to_pack_unroll_ix > 1\ntemplate <NSIMD_CONCEPT_VALUE_TYPE T, int from_pack_init_N, int to_pack_unroll_ix,\n          int which_from_pack_ix, NSIMD_CONCEPT_SIMD_EXT SimdExt,\n          template <typename, int, typename> class packx>\nstruct to_pack_recurs_helper<T, from_pack_init_N, 1 /* from_pack_unroll_ix */,\n                             to_pack_unroll_ix, which_from_pack_ix, SimdExt,\n                             packx> {\n  static pack<T, to_pack_unroll_ix, SimdExt>\n  to_pack(const packx<T, from_pack_init_N, SimdExt> &from_packx,\n          const pack<T, 1, SimdExt> &from_pack) {\n\n    pack<T, to_pack_unroll_ix, SimdExt> to_pack_;\n    to_pack_.car = from_pack.car; // simd_vector\n\n    // get next pack\n    to_pack_.cdr = to_pack_recurs_helper<\n        T, from_pack_init_N, from_pack_init_N, to_pack_unroll_ix - 1,\n        which_from_pack_ix + 1, SimdExt,\n        packx>::to_pack(from_packx,\n                        get_pack<which_from_pack_ix + 1>(from_packx));\n    return to_pack_;\n  }\n};\n\ntemplate <NSIMD_CONCEPT_VALUE_TYPE T, int N, 
NSIMD_CONCEPT_SIMD_EXT SimdExt,\n          template <typename, int, typename> class packx>\ntypename to_pack_trait<packx<T, N, SimdExt> >::value_type\nto_pack(const packx<T, N, SimdExt> &from_packx) {\n  static const int to_pack_unroll_ix = packx<T, N, SimdExt>::soa_num_packs * N;\n  pack<T, to_pack_unroll_ix, SimdExt> to_pack_;\n  to_pack_.car = from_packx.v0.car; // simd_vector\n  to_pack_.cdr = to_pack_recurs_helper<\n      T, N /* from_pack_init_N*/, N - 1 /* from_pack_unroll_ix */,\n      to_pack_unroll_ix - 1 /* to_pack_unroll_ix */,\n      0 /* which_from_pack_ix */, SimdExt, packx>::to_pack(from_packx,\n                                                           from_packx.v0.cdr);\n  return to_pack_;\n}\n\n// ----------------------------------------------------------------------------\n// to_pack_interleave\n\ntemplate <NSIMD_CONCEPT_VALUE_TYPE T, NSIMD_CONCEPT_SIMD_EXT SimdExt>\npack<T, 1, SimdExt> to_pack_interleave(const pack<T, 1, SimdExt> &pack_) {\n  return pack_;\n}\n\ntemplate <NSIMD_CONCEPT_VALUE_TYPE T, int N, NSIMD_CONCEPT_SIMD_EXT SimdExt>\npack<T, N, SimdExt> to_pack_interleave(const pack<T, N, SimdExt> &pack_) {\n  return pack_;\n}\n\n// ----------------------------------------------------------------------------\n\ntemplate <NSIMD_CONCEPT_VALUE_TYPE T, NSIMD_CONCEPT_SIMD_EXT SimdExt>\npack<T, 1, SimdExt> to_pack_interleave(const packx1<T, 1, SimdExt> &packx1_) {\n  pack<T, 1, SimdExt> pack_1;\n  pack_1.car = packx1_.v0.car;\n  pack_1.cdr = packx1_.v0.cdr;\n  return pack_1;\n}\n\ntemplate <NSIMD_CONCEPT_VALUE_TYPE T, int N, NSIMD_CONCEPT_SIMD_EXT SimdExt>\npack<T, N, SimdExt>\nto_pack_interleave(const packx1<T, N, SimdExt> &packx1_N) {\n  pack<T, N, SimdExt> pack_1;\n  pack_1.car = packx1_N.v0.car;\n  pack_1.cdr = packx1_N.v0.cdr;\n  return pack_1;\n}\n\n// ----------------------------------------------------------------------------\n\ntemplate <NSIMD_CONCEPT_VALUE_TYPE T, NSIMD_CONCEPT_SIMD_EXT SimdExt>\npack<T, 2, SimdExt> 
to_pack_interleave(const packx2<T, 1, SimdExt> &packx2_) {\n\n  nsimd::pack<T, 2, SimdExt> pack_2;\n  pack_2.car = packx2_.v0.car;\n  pack_2.cdr.car = packx2_.v1.car;\n\n  return pack_2;\n}\n\ntemplate <NSIMD_CONCEPT_VALUE_TYPE T, int N, NSIMD_CONCEPT_SIMD_EXT SimdExt>\npack<T, 2 * N, SimdExt>\nto_pack_interleave(const packx2<T, N, SimdExt> &packx2_N) {\n\n  pack<T, 2 * N, SimdExt> pack_2xN;\n  pack_2xN.car = packx2_N.v0.car;\n  pack_2xN.cdr.car = packx2_N.v1.car;\n\n  packx2<T, N - 1, SimdExt> packx2_n_1;\n  packx2_n_1.v0 = packx2_N.v0.cdr;\n  packx2_n_1.v1 = packx2_N.v1.cdr;\n\n  pack_2xN.cdr.cdr = to_pack_interleave(packx2_n_1);\n\n  return pack_2xN;\n}\n\n// ----------------------------------------------------------------------------\n\ntemplate <NSIMD_CONCEPT_VALUE_TYPE T, NSIMD_CONCEPT_SIMD_EXT SimdExt>\npack<T, 3, SimdExt> to_pack_interleave(const packx3<T, 1, SimdExt> &packx3_) {\n\n  nsimd::pack<T, 3, SimdExt> pack_3;\n  pack_3.car = packx3_.v0.car;\n  pack_3.cdr.car = packx3_.v1.car;\n  pack_3.cdr.cdr.car = packx3_.v2.car;\n\n  return pack_3;\n}\n\ntemplate <NSIMD_CONCEPT_VALUE_TYPE T, int N, NSIMD_CONCEPT_SIMD_EXT SimdExt>\npack<T, 3 * N, SimdExt>\nto_pack_interleave(const packx3<T, N, SimdExt> &packx3_n) {\n\n  pack<T, 3 * N, SimdExt> pack_3xn;\n  pack_3xn.car = packx3_n.v0.car;\n  pack_3xn.cdr.car = packx3_n.v1.car;\n  pack_3xn.cdr.cdr.car = packx3_n.v2.car;\n\n  packx3<T, N - 1, SimdExt> packx3_n_1;\n  packx3_n_1.v0 = packx3_n.v0.cdr;\n  packx3_n_1.v1 = packx3_n.v1.cdr;\n  packx3_n_1.v2 = packx3_n.v2.cdr;\n\n  pack_3xn.cdr.cdr.cdr = to_pack_interleave(packx3_n_1);\n\n  return pack_3xn;\n}\n\n// ----------------------------------------------------------------------------\n\ntemplate <NSIMD_CONCEPT_VALUE_TYPE T, NSIMD_CONCEPT_SIMD_EXT SimdExt>\npack<T, 4, SimdExt> to_pack_interleave(const packx4<T, 1, SimdExt> &packx4_) {\n\n  nsimd::pack<T, 4, SimdExt> pack_4;\n  pack_4.car = packx4_.v0.car;\n  pack_4.cdr.car = packx4_.v1.car;\n  pack_4.cdr.cdr.car = 
packx4_.v2.car;\n  pack_4.cdr.cdr.cdr.car = packx4_.v3.car;\n\n  return pack_4;\n}\n\ntemplate <NSIMD_CONCEPT_VALUE_TYPE T, int N, NSIMD_CONCEPT_SIMD_EXT SimdExt>\npack<T, 4 * N, SimdExt>\nto_pack_interleave(const packx4<T, N, SimdExt> &packx4_n) {\n\n  pack<T, 4 * N, SimdExt> pack_4xn;\n  pack_4xn.car = packx4_n.v0.car;\n  pack_4xn.cdr.car = packx4_n.v1.car;\n  pack_4xn.cdr.cdr.car = packx4_n.v2.car;\n  pack_4xn.cdr.cdr.cdr.car = packx4_n.v3.car;\n\n  packx4<T, N - 1, SimdExt> packx4_n_1;\n  packx4_n_1.v0 = packx4_n.v0.cdr;\n  packx4_n_1.v1 = packx4_n.v1.cdr;\n  packx4_n_1.v2 = packx4_n.v2.cdr;\n  packx4_n_1.v3 = packx4_n.v3.cdr;\n\n  pack_4xn.cdr.cdr.cdr.cdr = to_pack_interleave(packx4_n_1);\n\n  return pack_4xn;\n}\n\n} // namespace nsimd\n\n#endif\n"
  },
  {
    "path": "include/nsimd/cxx_adv_api_aliases.hpp",
    "content": "/*\n\nCopyright (c) 2021 Agenium Scale\n\nPermission is hereby granted, free of charge, to any person obtaining a copy\nof this software and associated documentation files (the \"Software\"), to deal\nin the Software without restriction, including without limitation the rights\nto use, copy, modify, merge, publish, distribute, sublicense, and/or sell\ncopies of the Software, and to permit persons to whom the Software is\nfurnished to do so, subject to the following conditions:\n\nThe above copyright notice and this permission notice shall be included in all\ncopies or substantial portions of the Software.\n\nTHE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\nIMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\nFITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\nAUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\nLIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\nOUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\nSOFTWARE.\n\n*/\n\n#ifndef NSIMD_CXX_ADV_API_ALIASES_HPP\n#define NSIMD_CXX_ADV_API_ALIASES_HPP\n\n#include <nsimd/cxx_adv_api.hpp>\n\nnamespace nsimd {\n\n/* ------------------------------------------------------------------------- */\n\ntemplate <NSIMD_CONCEPT_VALUE_TYPE T, int N, NSIMD_CONCEPT_SIMD_EXT SimdExt>\npack<T, N, SimdExt> fabs(pack<T, N, SimdExt> const &a0) {\n  return abs(a0);\n}\n\n/* ------------------------------------------------------------------------- */\n\ntemplate <NSIMD_CONCEPT_VALUE_TYPE T, int N, NSIMD_CONCEPT_SIMD_EXT SimdExt>\npack<T, N, SimdExt> fmin(pack<T, N, SimdExt> const &a0,\n                         pack<T, N, SimdExt> const &a1) {\n  return min(a0, a1);\n}\n\n/* ------------------------------------------------------------------------- */\n\ntemplate <NSIMD_CONCEPT_VALUE_TYPE T, int N, NSIMD_CONCEPT_SIMD_EXT SimdExt>\npack<T, N, SimdExt> fmax(pack<T, N, 
SimdExt> const &a0,\n                         pack<T, N, SimdExt> const &a1) {\n  return max(a0, a1);\n}\n\n/* ------------------------------------------------------------------------- */\n\n} // namespace nsimd\n\n#endif\n"
  },
  {
    "path": "include/nsimd/modules/fixed_point.hpp",
    "content": "/*\n\nCopyright (c) 2019 Agenium Scale\n\nPermission is hereby granted, free of charge, to any person obtaining a copy\nof this software and associated documentation files (the \"Software\"), to deal\nin the Software without restriction, including without limitation the rights\nto use, copy, modify, merge, publish, distribute, sublicense, and/or sell\ncopies of the Software, and to permit persons to whom the Software is\nfurnished to do so, subject to the following conditions:\n\nThe above copyright notice and this permission notice shall be included in all\ncopies or substantial portions of the Software.\n\nTHE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\nIMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\nFITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\nAUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\nLIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\nOUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\nSOFTWARE.\n\n*/\n\n#ifndef NSIMD_MODULES_FIXED_POINT_HPP\n#define NSIMD_MODULES_FIXED_POINT_HPP\n\n#include <nsimd/nsimd.h>\n\n#include \"nsimd/modules/fixed_point/fixed.hpp\"\n#include \"nsimd/modules/fixed_point/simd.hpp\"\n#include \"nsimd/modules/fixed_point/simd_math.hpp\"\n\nnamespace nsimd {\nnamespace fixed_point {\n\n// -----------------------------------------------------------------------------\n// ------------------------ Types definitions and len --------------------------\n// -----------------------------------------------------------------------------\n\ntemplate <typename T> NSIMD_STRUCT pack;\n\ntemplate <typename T> int len(const T &) { return fpsimd_n(T()); }\n\ntemplate <typename T> int len(const nsimd::fixed_point::pack<T> &) {\n  return fpsimd_n(fpsimd_t<T::lf, T::rt>());\n}\n\ntemplate <typename T> NSIMD_STRUCT pack {\n  static const u8 lf = T::lf;\n  static const u8 rt = 
T::rt;\n  typedef fp_t<lf, rt> value_type;\n  fpsimd_t<lf, rt> val;\n\n  friend std::ostream &operator<<(std::ostream &os, pack<T> &a0) {\n    T *buf = new T[nsimd::fixed_point::len(a0)];\n    nsimd::fixed_point::simd_storeu( buf , a0.val );\n    os << \"{ \";\n    int n = nsimd::fixed_point::len(a0);\n    for (int i = 0; i < n; i++) {\n      os << buf[i];\n      if (i < n - 1) {\n        os << \", \";\n      }\n    }\n    os << \" }\";\n    delete[] buf;\n    return os;\n  }\n};\n\ntemplate <typename T> NSIMD_STRUCT packl {\n  static const u8 lf = T::lf;\n  static const u8 rt = T::rt;\n  typedef typename fp_t<lf, rt>::logical_type value_type;\n  fpsimdl_t<lf, rt> val;\n};\n\n// -----------------------------------------------------------------------------\n// ------------------- Basic arithmetic operators ------------------------------\n// -----------------------------------------------------------------------------\n\ntemplate <typename T>\nNSIMD_INLINE pack<T> add(const pack<T> &a0, const pack<T> &a1) {\n  pack<T> res;\n  res.val = simd_add<T::lf, T::rt>(a0.val, a1.val);\n  return res;\n}\n\ntemplate <typename T>\nNSIMD_INLINE pack<T> operator+(const pack<T> &a0, const pack<T> &a1) {\n  return add( a0 , a1 );\n}\n\ntemplate <typename T>\nNSIMD_INLINE pack<T> sub(const pack<T> &a0, const pack<T> &a1) {\n  pack<T> res;\n  res.val = simd_sub<T::lf, T::rt>(a0.val, a1.val);\n  return res;\n}\n\ntemplate <typename T>\nNSIMD_INLINE pack<T> operator-(const pack<T> &a0, const pack<T> &a1) {\n  return sub( a0 , a1 );\n}\n\ntemplate <typename T>\nNSIMD_INLINE pack<T> mul(const pack<T> &a0, const pack<T> &a1) {\n  pack<T> res;\n  res.val = simd_mul<T::lf, T::rt>(a0.val, a1.val);\n  return res;\n}\n\ntemplate <typename T>\nNSIMD_INLINE pack<T> operator*(const pack<T> &a0, const pack<T> &a1) {\n  return mul( a0 , a1 );\n}\n\ntemplate <typename T>\nNSIMD_INLINE pack<T> div(const pack<T> &a0, const pack<T> &a1) {\n  pack<T> res;\n  res.val = simd_div<T::lf, T::rt>(a0.val, 
a1.val);\n  return res;\n}\n\ntemplate <typename T>\nNSIMD_INLINE pack<T> operator/(const pack<T> &a0, const pack<T> &a1) {\n  return div( a0 , a1 );\n}\n\ntemplate <typename T>\nNSIMD_INLINE pack<T> fma(const pack<T> &a0, const pack<T> &a1,\n                         const pack<T> &a2) {\n  pack<T> res;\n  res.val = simd_fma<T::lf, T::rt>(a0.val, a1.val, a2.val);\n  return res;\n}\n\ntemplate <typename T>\nNSIMD_INLINE pack<T> min(const pack<T> &a0, const pack<T> &a1) {\n  pack<T> res;\n  res.val = simd_min(a0.val, a1.val);\n  return res;\n}\n\ntemplate <typename T>\nNSIMD_INLINE pack<T> max(const pack<T> &a0, const pack<T> &a1) {\n  pack<T> res;\n  res.val = simd_max(a0.val, a1.val);\n  return res;\n}\n\n// -----------------------------------------------------------------------------\n// ------------------- Comparison operators ------------------------------------\n// -----------------------------------------------------------------------------\n\ntemplate <typename T>\nNSIMD_INLINE packl<T> eq(const pack<T> &a0, const pack<T> &a1) {\n  packl<T> res;\n  res.val = simd_eq(a0.val, a1.val);\n  return res;\n}\n\ntemplate <typename T>\nNSIMD_INLINE pack<T> operator==(const pack<T> &a0, const pack<T> &a1) {\n  return eq( a0 , a1 );\n}\n\ntemplate <typename T>\nNSIMD_INLINE packl<T> ne(const pack<T> &a0, const pack<T> &a1) {\n  packl<T> res;\n  res.val = simd_ne(a0.val, a1.val);\n  return res;\n}\n\ntemplate <typename T>\nNSIMD_INLINE pack<T> operator!=(const pack<T> &a0, const pack<T> &a1) {\n  return ne( a0 , a1 );\n}\n\ntemplate <typename T>\nNSIMD_INLINE packl<T> le(const pack<T> &a0, const pack<T> &a1) {\n  packl<T> res;\n  res.val = simd_le(a0.val, a1.val);\n  return res;\n}\n\ntemplate <typename T>\nNSIMD_INLINE pack<T> operator<=(const pack<T> &a0, const pack<T> &a1) {\n  return le( a0 , a1 );\n}\n\ntemplate <typename T>\nNSIMD_INLINE packl<T> lt(const pack<T> &a0, const pack<T> &a1) {\n  packl<T> res;\n  res.val = simd_lt(a0.val, a1.val);\n  return 
res;\n}\n\ntemplate <typename T>\nNSIMD_INLINE packl<T> operator<(const pack<T> &a0, const pack<T> &a1) {\n  return lt( a0 , a1 );\n}\n\ntemplate <typename T>\nNSIMD_INLINE packl<T> ge(const pack<T> &a0, const pack<T> &a1) {\n  packl<T> res;\n  res.val = simd_ge(a0.val, a1.val);\n  return res;\n}\n\ntemplate <typename T>\nNSIMD_INLINE packl<T> operator>=(const pack<T> &a0, const pack<T> &a1) {\n  return ge( a0 , a1 );\n}\n\ntemplate <typename T>\nNSIMD_INLINE packl<T> gt(const pack<T> &a0, const pack<T> &a1) {\n  packl<T> res;\n  res.val = simd_gt(a0.val, a1.val);\n  return res;\n}\n\ntemplate <typename T>\nNSIMD_INLINE packl<T> operator>(const pack<T> &a0, const pack<T> &a1) {\n  return gt( a0 , a1 );\n}\n\ntemplate <typename T>\nNSIMD_INLINE pack<T> if_else1(const packl<T> &a0, const pack<T> &a1,\n                              const pack<T> &a2) {\n  pack<T> res;\n  res.val = simd_if_else1(a0.val, a1.val, a2.val);\n  return res;\n}\n\n// -----------------------------------------------------------------------------\n// ------------------- Bitwise operators  --------------------------------------\n// -----------------------------------------------------------------------------\n\ntemplate <typename T>\nNSIMD_INLINE pack<T> andb(const pack<T> &a0, const pack<T> &a1) {\n  pack<T> res;\n  res.val = simd_andb(a0.val, a1.val);\n  return res;\n}\n\ntemplate <typename T>\nNSIMD_INLINE packl<T> andl(const packl<T> &a0, const packl<T> &a1) {\n  packl<T> res;\n  res.val = simd_andl(a0.val, a1.val);\n  return res;\n}\n\ntemplate <typename T>\nNSIMD_INLINE pack<T> andnotb(const pack<T> &a0, const pack<T> &a1) {\n  pack<T> res;\n  res.val = simd_andnotb(a0.val, a1.val);\n  return res;\n}\n\ntemplate <typename T>\nNSIMD_INLINE packl<T> andnotl(const packl<T> &a0, const packl<T> &a1) {\n  packl<T> res;\n  res.val = simd_andnotl(a0.val, a1.val);\n  return res;\n}\n\ntemplate <typename T> NSIMD_INLINE pack<T> notb(pack<T> a0) {\n  pack<T> res;\n  res.val = simd_notb(a0.val);\n  return 
res;\n}\n\ntemplate <typename T> NSIMD_INLINE packl<T> notl(packl<T> a0) {\n  packl<T> res;\n  res.val = simd_notl(a0.val);\n  return res;\n}\n\ntemplate <typename T>\nNSIMD_INLINE pack<T> orb(const pack<T> &a0, const pack<T> &a1) {\n  pack<T> res;\n  res.val = simd_orb(a0.val, a1.val);\n  return res;\n}\n\ntemplate <typename T>\nNSIMD_INLINE packl<T> orl(const packl<T> &a0, const packl<T> &a1) {\n  packl<T> res;\n  res.val = simd_orl(a0.val, a1.val);\n  return res;\n}\n\ntemplate <typename T>\nNSIMD_INLINE pack<T> xorb(const pack<T> &a0, const pack<T> &a1) {\n  pack<T> res;\n  res.val = simd_xorb(a0.val, a1.val);\n  return res;\n}\n\ntemplate <typename T>\nNSIMD_INLINE packl<T> xorl(const packl<T> &a0, const packl<T> &a1) {\n  packl<T> res;\n  res.val = simd_xorl(a0.val, a1.val);\n  return res;\n}\n\n// -----------------------------------------------------------------------------\n// ------------------- Math functions ------------------------------------------\n// -----------------------------------------------------------------------------\n\ntemplate <typename T> NSIMD_INLINE pack<T> abs(pack<T> a0) {\n  pack<T> res;\n  res.val = simd_abs(a0.val);\n  return res;\n}\n\ntemplate <typename T> NSIMD_INLINE pack<T> rec(pack<T> a0) {\n  pack<T> res;\n  res.val = simd_rec(a0.val);\n  return res;\n}\n\n// -----------------------------------------------------------------------------\n// -------------------- Load functions -----------------------------------------\n// -----------------------------------------------------------------------------\n\ntemplate <typename T> NSIMD_INLINE T set1(typename T::value_type a0) {\n  T res;\n  res.val = simd_set1<T::lf, T::rt>(a0);\n  return res;\n}\n\ntemplate <typename T> NSIMD_INLINE T loadu(typename T::value_type *p) {\n  T res;\n  res.val = simd_loadu<T::lf, T::rt>(p);\n  return res;\n}\n\ntemplate <typename T> NSIMD_INLINE T loada(typename T::value_type *p) {\n  T res;\n  res.val = simd_loada<T::lf, T::rt>(p);\n  return 
res;\n}\n\ntemplate <typename T> NSIMD_INLINE T loadlu(typename T::value_type *p) {\n  T res;\n  res.val = simd_loadlu<T::lf, T::rt>(p);\n  return res;\n}\n\ntemplate <typename T> NSIMD_INLINE T loadla(typename T::value_type *p) {\n  T res;\n  res.val = simd_loadla<T::lf, T::rt>(p);\n  return res;\n}\n\n// -----------------------------------------------------------------------------\n// -------------------- Store functions ----------------------------------------\n// -----------------------------------------------------------------------------\n\ntemplate <typename T>\nNSIMD_INLINE void storeu(typename T::value_type *p, T v) {\n  simd_storeu<T::lf, T::rt>(p, v.val);\n}\n\ntemplate <typename T>\nNSIMD_INLINE void storea(typename T::value_type *p, T v) {\n  simd_storea<T::lf, T::rt>(p, v.val);\n}\n\ntemplate <typename T>\nNSIMD_INLINE void storelu(typename T::value_type *p, T v) {\n  simd_storelu<T::lf, T::rt>(p, v.val);\n}\n\ntemplate <typename T>\nNSIMD_INLINE void storela(typename T::value_type *p, T v) {\n  simd_storela<T::lf, T::rt>(p, v.val);\n}\n\n} // namespace fixed_point\n\n} // namespace nsimd\n\n#endif\n"
  },
  {
    "path": "include/nsimd/modules/memory_management.hpp",
    "content": "/*\n\nCopyright (c) 2021 Agenium Scale\n\nPermission is hereby granted, free of charge, to any person obtaining a copy\nof this software and associated documentation files (the \"Software\"), to deal\nin the Software without restriction, including without limitation the rights\nto use, copy, modify, merge, publish, distribute, sublicense, and/or sell\ncopies of the Software, and to permit persons to whom the Software is\nfurnished to do so, subject to the following conditions:\n\nThe above copyright notice and this permission notice shall be included in all\ncopies or substantial portions of the Software.\n\nTHE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\nIMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\nFITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\nAUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\nLIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\nOUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\nSOFTWARE.\n\n*/\n\n#ifndef NSIMD_MODULES_MEMORY_MANAGEMENT_HPP\n#define NSIMD_MODULES_MEMORY_MANAGEMENT_HPP\n\n#include <cstdlib>\n#include <cstring>\n#include <iostream>\n#include <nsimd/nsimd.h>\n\nnamespace nsimd {\n\n// ----------------------------------------------------------------------------\n// CUDA\n\n#if defined(NSIMD_CUDA)\n\ntemplate <typename T> T *device_malloc(size_t sz) {\n  void *ret;\n  if (cudaMalloc(&ret, sz * sizeof(T)) != cudaSuccess) {\n    return NULL;\n  }\n  return (T *)ret;\n}\n\ntemplate <typename T> T *device_calloc(size_t sz) {\n  void *ret;\n  if (cudaMalloc(&ret, sz * sizeof(T)) != cudaSuccess) {\n    return NULL;\n  }\n  if (cudaMemset((void *)ret, 0, sz * sizeof(T)) != cudaSuccess) {\n    cudaFree(ret);\n    return NULL;\n  }\n  return (T *)ret;\n}\n\ntemplate <typename T> void device_free(T *ptr) { cudaFree((void *)ptr); }\n\ntemplate <typename T>\nvoid 
copy_to_device(T *device_ptr, T *host_ptr, size_t sz) {\n  cudaMemcpy((void *)device_ptr, (void *)host_ptr, sz * sizeof(T),\n             cudaMemcpyHostToDevice);\n}\n\ntemplate <typename T>\nvoid copy_to_host(T *host_ptr, T *device_ptr, size_t sz) {\n  cudaMemcpy((void *)host_ptr, (void *)device_ptr, sz * sizeof(T),\n             cudaMemcpyDeviceToHost);\n}\n\n#define nsimd_fill_dev_mem_func(func_name, expr)                              \\\n  template <typename T>                                                       \\\n  __global__ void kernel_##func_name##_(T *ptr, int n) {                      \\\n    int i = threadIdx.x + blockIdx.x * blockDim.x;                            \\\n    if (i < n) {                                                              \\\n      ptr[i] = (T)(expr);                                                     \\\n    }                                                                         \\\n  }                                                                           \\\n                                                                              \\\n  template <typename T> void func_name(T *ptr, size_t sz) {                   \\\n    kernel_##func_name##_<<<(unsigned int)((sz + 127) / 128), 128>>>(         \\\n        ptr, int(sz));                                                        \\\n  }\n\n// ----------------------------------------------------------------------------\n// ROCm\n\n#elif defined(NSIMD_ROCM)\n\ntemplate <typename T> T *device_malloc(size_t sz) {\n  void *ret;\n  if (hipMalloc(&ret, sz * sizeof(T)) != hipSuccess) {\n    return NULL;\n  }\n  return (T *)ret;\n}\n\ntemplate <typename T> T *device_calloc(size_t sz) {\n  void *ret;\n  if (hipMalloc(&ret, sz * sizeof(T)) != hipSuccess) {\n    return NULL;\n  }\n  if (hipMemset((void *)ret, 0, sz * sizeof(T)) != hipSuccess) {\n    hipFree(ret);\n    return NULL;\n  }\n  return (T *)ret;\n}\n\ntemplate <typename T> void device_free(T *ptr) { hipFree((void *)ptr); 
}\n\ntemplate <typename T>\nvoid copy_to_device(T *device_ptr, T *host_ptr, size_t sz) {\n  hipMemcpy((void *)device_ptr, (void *)host_ptr, sz * sizeof(T),\n            hipMemcpyHostToDevice);\n}\n\ntemplate <typename T> void copy_to_host(T *host_ptr, T *device_ptr, size_t sz) {\n  hipMemcpy((void *)host_ptr, (void *)device_ptr, sz * sizeof(T),\n            hipMemcpyDeviceToHost);\n}\n\n#define nsimd_fill_dev_mem_func(func_name, expr)                              \\\n  template <typename T>                                                       \\\n  __global__ void kernel_##func_name##_(T *ptr, size_t n) {                   \\\n    size_t i = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;                \\\n    if (i < n) {                                                              \\\n      ptr[i] = (T)(expr);                                                     \\\n    }                                                                         \\\n  }                                                                           \\\n                                                                              \\\n  template <typename T> void func_name(T *ptr, size_t sz) {                   \\\n    hipLaunchKernelGGL((kernel_##func_name##_<T>),                            \\\n                       (size_t)((sz + 127) / 128), 128, 0, NULL, ptr,         \\\n                       (size_t)sz);                                           \\\n  }\n\n// ----------------------------------------------------------------------------\n// oneAPI\n\n#elif defined(NSIMD_ONEAPI)\n\ntemplate <typename T> T *device_malloc(const size_t sz) {\n  return sycl::malloc_device<T>(sz, nsimd::oneapi::default_queue());\n}\n\ntemplate <typename T> T *device_calloc(const size_t sz) {\n  sycl::queue q = nsimd::oneapi::default_queue();\n  T *const ret = sycl::malloc_device<T>(sz, q);\n  if (ret == NULL) {\n    return NULL;\n  }\n  q.memset((void *)ret, 0, sz * sizeof(T)).wait_and_throw();\n  return 
ret;\n}\n\ntemplate <typename T> void device_free(T *const ptr) {\n  sycl::queue q = nsimd::oneapi::default_queue();\n  sycl::free(ptr, q);\n}\n\ntemplate <typename T>\nvoid copy_to_device(T *const device_ptr, const T *const host_ptr,\n                    const size_t sz) {\n  sycl::queue q = nsimd::oneapi::default_queue();\n  q.memcpy((void *)device_ptr, (const void *)host_ptr, sz * sizeof(T))\n      .wait_and_throw();\n}\n\ntemplate <typename T>\nvoid copy_to_host(T *const host_ptr, const T *const device_ptr, size_t sz) {\n  sycl::queue q = nsimd::oneapi::default_queue();\n  q.memcpy((void *)host_ptr, (const void *)device_ptr, sz * sizeof(T))\n      .wait_and_throw();\n}\n\n#define nsimd_fill_dev_mem_func(func_name, expr)                              \\\n  template <typename T>                                                       \\\n  void kernel_##func_name##_(T *const ptr, const size_t sz,                   \\\n                             sycl::nd_item<1> item) {                         \\\n    const size_t i = item.get_global_id().get(0);                             \\\n    if (i < sz) {                                                             \\\n      ptr[i] = nsimd::to<T>(expr);                                            \\\n    }                                                                         \\\n  }                                                                           \\\n                                                                              \\\n  template <typename T> void func_name(T *const ptr, const size_t sz) {       \\\n    const size_t total_num_threads =                                          \\\n        nsimd::compute_total_num_threads(sz, THREADS_PER_BLOCK);              \\\n    sycl::queue q = nsimd::oneapi::default_queue();                           \\\n    q.parallel_for(sycl::nd_range<1>(sycl::range<1>(total_num_threads),       \\\n                                     sycl::range<1>(THREADS_PER_BLOCK)),      \\\n   
                [=](sycl::nd_item<1> item) {                               \\\n                     kernel_##func_name##_(ptr, sz, item);                    \\\n                   })                                                         \\\n        .wait_and_throw();                                                    \\\n  }\n\n// ----------------------------------------------------------------------------\n// CPU\n\n#else\n\ntemplate <typename T> T *device_malloc(size_t sz) {\n  return (T *)malloc(sz * sizeof(T));\n}\n\ntemplate <typename T> T *device_calloc(size_t sz) {\n  return (T *)calloc(sz * sizeof(T), 1);\n}\n\ntemplate <typename T> void device_free(T *ptr) { free((void *)ptr); }\n\ntemplate <typename T>\nvoid copy_to_device(T *device_ptr, T *host_ptr, size_t sz) {\n  memcpy((void *)device_ptr, (void *)host_ptr, sz * sizeof(T));\n}\n\ntemplate <typename T>\nvoid copy_to_host(T *host_ptr, T *device_ptr, size_t sz) {\n  memcpy((void *)host_ptr, (void *)device_ptr, sz * sizeof(T));\n}\n\n#define nsimd_fill_dev_mem_func(func_name, expr)                              \\\n  template <typename T> void func_name(T *ptr, size_t sz) {                   \\\n    for (size_t i = 0; i < sz; i++) {                                         \\\n      ptr[i] = nsimd::to<T>(expr);                                            \\\n    }                                                                         \\\n  }\n\n#endif\n\n// ----------------------------------------------------------------------------\n// Pair of pointers\n\ntemplate <typename T>\nstruct paired_pointers_t {\n  T *device_ptr, *host_ptr;\n  size_t sz;\n};\n\ntemplate <typename T> paired_pointers_t<T> pair_malloc(size_t sz) {\n  paired_pointers_t<T> ret;\n  ret.sz = 0;\n#if defined(NSIMD_CUDA) || defined(NSIMD_ROCM) || defined(NSIMD_ONEAPI)\n  ret.device_ptr = device_malloc<T>(sz);\n  if (ret.device_ptr == NULL) {\n    ret.host_ptr = NULL;\n    return ret;\n  }\n  ret.host_ptr = (T *)malloc(sz * sizeof(T));\n  if 
(ret.host_ptr == NULL) {\n    device_free(ret.device_ptr);\n    ret.device_ptr = NULL;\n    return ret;\n  }\n#else\n  ret.device_ptr = device_malloc<T>(sz);\n  ret.host_ptr = ret.device_ptr;\n#endif\n  ret.sz = sz;\n  return ret;\n}\n\ntemplate <typename T> paired_pointers_t<T> pair_malloc_or_exit(size_t sz) {\n  paired_pointers_t<T> ret = pair_malloc<T>(sz);\n  if (ret.device_ptr == NULL) {\n    std::cerr << __FILE__ << \":\" << __LINE__ << \": error cannot malloc \" << sz\n              << \" bytes\" << std::endl;\n    exit(EXIT_FAILURE);\n  }\n  return ret;\n}\n\ntemplate <typename T> paired_pointers_t<T> pair_calloc(size_t sz) {\n  paired_pointers_t<T> ret;\n  ret.sz = 0;\n#if defined(NSIMD_CUDA) || defined(NSIMD_ROCM) || defined(NSIMD_ONEAPI)\n  ret.device_ptr = device_calloc<T>(sz);\n  if (ret.device_ptr == NULL) {\n    ret.host_ptr = NULL;\n    return ret;\n  }\n  ret.host_ptr = (T *)calloc(sz, sizeof(T));\n  if (ret.host_ptr == NULL) {\n    device_free(ret.device_ptr);\n    ret.device_ptr = NULL;\n    return ret;\n  }\n#else\n  ret.device_ptr = device_calloc<T>(sz);\n  ret.host_ptr = ret.device_ptr;\n#endif\n  ret.sz = sz;\n  return ret;\n}\n\ntemplate <typename T> paired_pointers_t<T> pair_calloc_or_exit(size_t sz) {\n  paired_pointers_t<T> ret = pair_calloc<T>(sz);\n  if (ret.device_ptr == NULL) {\n    std::cerr << __FILE__ << \":\" << __LINE__ << \": error cannot calloc \" << sz\n              << \" bytes\" << std::endl;\n    exit(EXIT_FAILURE);\n  }\n  return ret;\n}\n\ntemplate <typename T> void pair_free(paired_pointers_t<T> p) {\n#if defined(NSIMD_CUDA) || defined(NSIMD_ROCM) || defined(NSIMD_ONEAPI)\n  device_free(p.device_ptr);\n  free((void *)p.host_ptr);\n#else\n  free((void *)p.host_ptr);\n#endif\n}\n\ntemplate <typename T> void copy_to_device(paired_pointers_t<T> p) {\n#if defined(NSIMD_CUDA) || defined(NSIMD_ROCM) || defined(NSIMD_ONEAPI)\n  copy_to_device(p.device_ptr, p.host_ptr, p.sz);\n#else\n  (void)p;\n#endif\n}\n\ntemplate <typename T> void 
copy_to_host(paired_pointers_t<T> p) {\n#if defined(NSIMD_CUDA) || defined(NSIMD_ROCM) || defined(NSIMD_ONEAPI)\n  copy_to_host(p.host_ptr, p.device_ptr, p.sz);\n#else\n  (void)p;\n#endif\n}\n\n} // namespace nsimd\n\n#endif\n"
  },
  {
    "path": "include/nsimd/modules/spmd.hpp",
    "content": "/*\n\nCopyright (c) 2020 Agenium Scale\n\nPermission is hereby granted, free of charge, to any person obtaining a copy\nof this software and associated documentation files (the \"Software\"), to deal\nin the Software without restriction, including without limitation the rights\nto use, copy, modify, merge, publish, distribute, sublicense, and/or sell\ncopies of the Software, and to permit persons to whom the Software is\nfurnished to do so, subject to the following conditions:\n\nThe above copyright notice and this permission notice shall be included in all\ncopies or substantial portions of the Software.\n\nTHE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\nIMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\nFITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\nAUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\nLIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\nOUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\nSOFTWARE.\n\n*/\n\n#ifndef NSIMD_MODULES_SPMD_HPP\n#define NSIMD_MODULES_SPMD_HPP\n\n#include <nsimd/nsimd-all.hpp>\n\n#include <cassert>\n#include <vector>\n#include <cstring>\n\nnamespace spmd {\n\n#if NSIMD_CXX < 2011 || NSIMD_C < 1999\n  #define NSIMD_VARIADIC_MACROS_IS_EXTENSION\n#endif\n\n#ifdef NSIMD_VARIADIC_MACROS_IS_EXTENSION\n  #if defined(NSIMD_IS_GCC)\n    /* Not emitting the warning -Wvariadic-macros is not possible with\n       GCC <= 12. It is a bug. A workaround is to tell GCC to consider this\n       header file as a system header file so that all warnings are not\n       emitted. 
This is not satisfying but necessary for the moment.\n       */\n    #pragma GCC system_header\n    #pragma GCC diagnostic push\n    #pragma GCC diagnostic ignored \"-Wvariadic-macros\"\n  #elif defined(NSIMD_IS_CLANG)\n    #pragma clang diagnostic push\n    #pragma clang diagnostic ignored \"-Wvariadic-macros\"\n  #endif\n#endif\n\n// ----------------------------------------------------------------------------\n// GPUs: CUDA, ROCm or oneAPI\n\n#if defined(NSIMD_CUDA) || defined(NSIMD_ROCM) || defined(NSIMD_ONEAPI)\n\n#if defined(NSIMD_CUDA)\n\n// 1d kernel definition\n#define spmd_kernel_1d(name, ...)                                             \\\n  template <int spmd_ScalarBits_> __global__ void name(__VA_ARGS__, int n) {  \\\n    int spmd_i_ = threadIdx.x + blockIdx.x * blockDim.x;                      \\\n    if (spmd_i_ < n) {\n\n// templated kernel definition\n#define spmd_tmpl_kernel_1d(name, template_argument, ...)                     \\\n  template <typename template_argument, int spmd_ScalarBits_>                 \\\n  __global__ void name(__VA_ARGS__, int n) {                                  \\\n    int spmd_i_ = threadIdx.x + blockIdx.x * blockDim.x;                      \\\n    if (spmd_i_ < n) {\n\n#elif defined(NSIMD_ROCM)\n\n// 1d kernel definition\n#define spmd_kernel_1d(name, ...)                                             \\\n  template <int spmd_ScalarBits_>                                             \\\n  __global__ void name(__VA_ARGS__, size_t n) {                               \\\n    size_t spmd_i_ = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;          \\\n    if (spmd_i_ < n) {\n\n// templated kernel definition\n#define spmd_tmpl_kernel_1d(name, template_argument, ...)                     
\\\n  template <typename template_argument, int spmd_ScalarBits_>                 \\\n  __global__ void name(__VA_ARGS__, size_t n) {                               \\\n    size_t spmd_i_ = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;          \\\n    if (spmd_i_ < n) {\n\n#else\n\n// 1d kernel definition\n#define spmd_kernel_1d(name, ...)                                             \\\n  template <int spmd_ScalarBits_>                                             \\\n  inline void name(__VA_ARGS__, const size_t n, sycl::nd_item<1> item) {      \\\n    size_t spmd_i_ = item.get_global_id().get(0);                             \\\n    if (spmd_i_ < n) {\n\n// templated kernel definition\n#define spmd_tmpl_kernel_1d(name, template_argument, ...)                     \\\n  template <typename template_argument, int spmd_ScalarBits_>                 \\\n  inline void name(__VA_ARGS__, const size_t n, sycl::nd_item<1> item) {      \\\n    size_t spmd_i_ = item.get_global_id().get(0);                             \\\n    if (spmd_i_ < n) {\n\n#endif\n\n#define spmd_kernel_end                                                       \\\n  }                                                                           \\\n  }\n\n#if defined(NSIMD_CUDA) || defined(NSIMD_ROCM)\n\n// device function\n#define spmd_dev_func(type_name, ...)                                         \\\n  template <int spmd_ScalarBits_> __device__ type_name(__VA_ARGS__) {\n\n// templated device function\n#define spmd_tmpl_dev_func(type_name, template_argument, ...)                 \\\n  template <typename template_argument, int spmd_ScalarBits_>                 \\\n  __device__ type_name(__VA_ARGS__) {\n\n#else\n\n// device function\n#define spmd_dev_func(type_name, ...)                                         \\\n  template <int spmd_ScalarBits_> type_name(__VA_ARGS__) {\n\n// templated device function\n#define spmd_tmpl_dev_func(type_name, template_argument, ...)                 
\\\n  template <typename template_argument, int spmd_ScalarBits_>                 \\\n  type_name(__VA_ARGS__) {\n\n#endif\n\n#define spmd_dev_func_end }\n\n// call spmd_dev_function\n#define spmd_call_dev_func(name, ...) name<spmd_ScalarBits_>(__VA_ARGS__)\n\n// call templated spmd_dev_function\n#define spmd_call_tmpl_dev_func(name, template_argument, ...)                 \\\n  name<template_argument, spmd_ScalarBits_>(__VA_ARGS__)\n\n#if defined(NSIMD_CUDA)\n\n// launch 1d kernel CUDA\n#define spmd_launch_kernel_1d(name, spmd_scalar_bits_, threads_per_block, n,  \\\n                              ...)                                            \\\n  name<spmd_scalar_bits_>                                                     \\\n      <<<(unsigned int)nsimd_kernel_param(n, threads_per_block),              \\\n         (unsigned int)(threads_per_block)>>>(__VA_ARGS__, (int)n)\n\n#elif defined(NSIMD_ROCM)\n\n// launch 1d kernel ROCm\n#define spmd_launch_kernel_1d(name, spmd_scalar_bits_, threads_per_block, n,  \\\n                              ...)                                            \\\n  hipLaunchKernelGGL((name<spmd_scalar_bits_>),                               \\\n                     (size_t)nsimd_kernel_param(n, threads_per_block),        \\\n                     (size_t)(threads_per_block), 0, NULL, __VA_ARGS__,       \\\n                     (size_t)n)\n\n#else\n\n// launch 1d kernel oneAPI\n#define spmd_launch_kernel_1d(name, spmd_scalar_bits_, threads_per_block, n,  \\\n                              ...)                                            
\\\n  size_t total_num_threads =                                                  \\\n      (size_t)nsimd_kernel_param(n, threads_per_block);                       \\\n  sycl::queue q = nsimd::oneapi::default_queue();                             \\\n  q.parallel_for(sycl::nd_range<1>(sycl::range<1>(total_num_threads),         \\\n                                   sycl::range<1>(threads_per_block)),        \\\n                 [=](sycl::nd_item<1> item) {                                 \\\n                   name<spmd_scalar_bits_>(__VA_ARGS__, (size_t)n, item);     \\\n                 })                                                           \\\n      .wait_and_throw();\n\n#endif\n\n// supported types (generic)\ntemplate <int ScalarBits> struct type_t {};\n\n// supported types (scalar)\ntemplate <> struct type_t<8> {\n  typedef i8 itype;\n  typedef u8 utype;\n  typedef bool btype;\n};\n\ntemplate <> struct type_t<16> {\n  typedef i16 itype;\n  typedef u16 utype;\n  typedef f16 ftype;\n  typedef bool btype;\n};\n\ntemplate <> struct type_t<32> {\n  typedef i32 itype;\n  typedef u32 utype;\n  typedef f32 ftype;\n  typedef bool btype;\n};\n\ntemplate <> struct type_t<64> {\n  typedef i64 itype;\n  typedef u64 utype;\n  typedef f64 ftype;\n  typedef bool btype;\n};\n\n// supported types (generic)\n#define k_int typename spmd::type_t<spmd_ScalarBits_>::itype\n#define k_uint typename spmd::type_t<spmd_ScalarBits_>::utype\n#define k_float typename spmd::type_t<spmd_ScalarBits_>::ftype\n#define k_bool typename spmd::type_t<spmd_ScalarBits_>::btype\n\n// loads and stores (generic)\n#define k_store(base_addr, value)                                             \\\n  do {                                                                        \\\n    base_addr[spmd_i_] = value;                                               \\\n  } while (0)\n\n#define k_unmasked_store(base_addr, value) k_store(base_addr, value)\n#define k_load(base_addr) base_addr[spmd_i_]\n#define 
k_unmasked_load(base_addr) k_load(base_addr)\n\n// f32 <--> f16 conversions\n#if defined(NSIMD_CUDA) || defined(NSIMD_ROCM)\n#define k_f32_to_f16(a) __float2half(a)\n#define k_f16_to_f32(a) __half2float(a)\n#else\n#define k_f32_to_f16(a) f16(a)\n#define k_f16_to_f32(a) static_cast<f32>(a)\n#endif\n\n// assignment statement\n#define k_set(var, value)                                                     \\\n  do {                                                                        \\\n    var = value;                                                              \\\n  } while (0)\n\n#define k_unmasked_set(var, value) k_set(var, value)\n\n// while statement (k_while)\n#define k_while(cond) while (cond) {\n#define k_endwhile }\n\n// break statement (k_break)\n#define k_break break\n\n// continue statement (k_continue)\n#define k_continue continue\n\n// endwhile statement (k_endwhile)\n#define k_endwhile }\n\n// if statement (k_if)\n#define k_if(cond) if (cond) {\n\n// elseif statement (k_elseif)\n#define k_elseif(cond)                                                        \\\n  }                                                                           \\\n  else if (cond) {\n\n// else statement (k_else)\n#define k_else                                                                \\\n  }                                                                           \\\n  else {\n\n// endif statement (k_endif)\n#define k_endif }\n\n// ----------------------------------------------------------------------------\n// SIMD and SCALAR: dispatch between the two is done on a type\n\n#else\n\n// helpers\ntemplate <typename T, int N> nsimd::pack<T, N> to_pack(T a) {\n  return nsimd::pack<T, N>(a);\n}\n\ntemplate <typename T, int N, typename SimdExt>\nnsimd::pack<T, N, SimdExt> to_pack(nsimd::pack<T, N, SimdExt> const &a) {\n  return a;\n}\n\ntemplate <typename T, int N> nsimd::packl<T, N> to_packl(bool a) {\n  return nsimd::packl<T, N>(int(a));\n}\n\ntemplate <typename T, int N, 
typename Pack>\nnsimd::packl<T, N> to_packl(Pack const &a) {\n  return nsimd::reinterpretl<nsimd::packl<T, N> >(a);\n}\n\ntemplate <typename T> struct base_type { typedef T type; };\n\ntemplate <typename T, int N, typename SimdExt>\nstruct base_type<nsimd::pack<T, N, SimdExt> > {\n  typedef T type;\n};\n\ntemplate <typename T, int N, typename SimdExt>\nstruct base_type<nsimd::packl<T, N, SimdExt> > {\n  typedef T type;\n};\n\n// type indicating SIMD or scalar kernel\nstruct KernelScalar {};\nstruct KernelSIMD {};\n\n// common to all function: mainly to avoid warnings\n#define spmd_func_begin_                                                      \\\n  (void)spmd_i_;                                                              \\\n  (void)spmd_mask_;                                                           \\\n  k_bool spmd_off_lanes_return_(false);                                       \\\n  (void)spmd_off_lanes_return_;                                               \\\n  k_bool spmd_off_lanes_break_(false);                                        \\\n  (void)spmd_off_lanes_break_;                                                \\\n  k_bool spmd_off_lanes_continue_(false);                                     \\\n  (void)spmd_off_lanes_continue_;\n\n// 1d kernel definition\n#define spmd_kernel_1d(name, ...)                                             \\\n  template <typename spmd_KernelType_, int spmd_ScalarBits_, int spmd_N_,     \\\n            typename spmd_MaskType_>                                          \\\n  void name(nsimd_nat spmd_i_, spmd_MaskType_ spmd_mask_, __VA_ARGS__) {      \\\n    spmd_func_begin_\n\n// templated kernel definition\n#define spmd_tmpl_kernel_1d(name, template_argument, ...)                     
\\\n  template <typename template_argument, typename spmd_KernelType_,            \\\n            int spmd_ScalarBits_, int spmd_N_, typename spmd_MaskType_>       \\\n  void name(nsimd_nat spmd_i_, spmd_MaskType_ spmd_mask_, __VA_ARGS__) {      \\\n    spmd_func_begin_\n\n#define spmd_kernel_end }\n\n// device function\n#define spmd_dev_func(type_name, ...)                                         \\\n  template <typename spmd_KernelType_, int spmd_ScalarBits_, int spmd_N_,     \\\n            typename spmd_MaskType_>                                          \\\n  type_name(nsimd_nat spmd_i_, spmd_MaskType_ spmd_mask_, __VA_ARGS__) {      \\\n    spmd_func_begin_\n\n// templated device function\n#define spmd_tmpl_dev_func(type_name, template_argument, ...)                 \\\n  template <typename template_argument, typename spmd_KernelType_,            \\\n            int spmd_ScalarBits_, int spmd_N_, typename spmd_MaskType_>       \\\n  type_name(nsimd_nat spmd_i_, spmd_MaskType_ spmd_mask_, __VA_ARGS__) {      \\\n    spmd_func_begin_\n\n#define spmd_dev_func_end }\n\n// call spmd_dev_function\n#define spmd_call_dev_func(name, ...)                                         \\\n  name<spmd_KernelType_, spmd_ScalarBits_, spmd_N_>(spmd_i_, spmd_mask_,      \\\n                                                    __VA_ARGS__)\n\n// call templated spmd_dev_function\n#define spmd_call_tmpl_dev_func(name, template_argument, ...)                 \\\n  name<template_argument, spmd_KernelType_, spmd_ScalarBits_, spmd_N_>(       \\\n      spmd_i_, spmd_mask_, __VA_ARGS__)\n\n// launch 1d kernel\n#define spmd_launch_kernel_1d(name, spmd_scalar_bits_, spmd_unroll_, spmd_n_, \\\n                              ...)                                            
\\\n  {                                                                           \\\n    spmd::type_t<spmd::KernelSIMD, spmd_scalar_bits_, spmd_unroll_>::btype    \\\n        spmd_mask_(true);                                                     \\\n    nsimd_nat spmd_i_;                                                        \\\n    nsimd_nat len =                                                           \\\n        nsimd::len(spmd::type_t<spmd::KernelSIMD, spmd_scalar_bits_,          \\\n                                spmd_unroll_>::itype());                      \\\n    for (spmd_i_ = 0; spmd_i_ + len <= spmd_n_; spmd_i_ += len) {             \\\n      name<spmd::KernelSIMD, spmd_scalar_bits_, spmd_unroll_>(                \\\n          spmd_i_, spmd_mask_, __VA_ARGS__);                                  \\\n    }                                                                         \\\n    for (; spmd_i_ < spmd_n_; spmd_i_++) {                                    \\\n      name<spmd::KernelScalar, spmd_scalar_bits_, spmd_unroll_>(              \\\n          spmd_i_, true, __VA_ARGS__);                                        \\\n    }                                                                         \\\n  }\n\n// launch 1d templated kernel\n#define spmd_launch_tmpl_kernel_1d(                                      \\\n    name, template_argument, spmd_scalar_bits_, spmd_unroll_, spmd_n_, ...)   
\\\n  {                                                                           \\\n    typename spmd::type_t<spmd::KernelSIMD, spmd_scalar_bits_,                \\\n                          spmd_unroll_>::btype spmd_mask_(true);              \\\n    nsimd_nat spmd_i_;                                                        \\\n    nsimd_nat len =                                                           \\\n        nsimd::len(typename spmd::type_t<spmd::KernelSIMD, spmd_scalar_bits_, \\\n                                         spmd_unroll_>::itype());             \\\n    for (spmd_i_ = 0; spmd_i_ + len <= spmd_n_; spmd_i_ += len) {             \\\n      name<template_argument, spmd::KernelSIMD, spmd_scalar_bits_,            \\\n           spmd_unroll_>(spmd_i_, spmd_mask_, __VA_ARGS__);                   \\\n    }                                                                         \\\n    for (; spmd_i_ < spmd_n_; spmd_i_++) {                                    \\\n      name<template_argument, spmd::KernelScalar, spmd_scalar_bits_,          \\\n           spmd_unroll_>(spmd_i_, true, __VA_ARGS__);                         \\\n    }                                                                         \\\n  }\n\n// supported types (generic)\ntemplate <typename KernelType, int ScalarBits, int N> struct type_t {};\n\n// supported types (scalar)\ntemplate <int N> struct type_t<KernelScalar, 8, N> {\n  typedef i8 itype;\n  typedef u8 utype;\n  typedef bool btype;\n};\n\ntemplate <int N> struct type_t<KernelScalar, 16, N> {\n  typedef i16 itype;\n  typedef u16 utype;\n  typedef f16 ftype;\n  typedef bool btype;\n};\n\ntemplate <int N> struct type_t<KernelScalar, 32, N> {\n  typedef i32 itype;\n  typedef u32 utype;\n  typedef f32 ftype;\n  typedef bool btype;\n};\n\ntemplate <int N> struct type_t<KernelScalar, 64, N> {\n  typedef i64 itype;\n  typedef u64 utype;\n  typedef f64 ftype;\n  typedef bool btype;\n};\n\n// supported types (SIMD)\ntemplate <int N> 
struct type_t<KernelSIMD, 8, N> {\n  typedef nsimd::pack<i8, N> itype;\n  typedef nsimd::pack<u8, N> utype;\n  typedef nsimd::packl<i8, N> btype;\n};\n\ntemplate <int N> struct type_t<KernelSIMD, 16, N> {\n  typedef nsimd::pack<i16, N> itype;\n  typedef nsimd::pack<u16, N> utype;\n  typedef nsimd::pack<f16, N> ftype;\n  typedef nsimd::packl<i16, N> btype;\n};\n\ntemplate <int N> struct type_t<KernelSIMD, 32, N> {\n  typedef nsimd::pack<i32, N> itype;\n  typedef nsimd::pack<u32, N> utype;\n  typedef nsimd::pack<f32, N> ftype;\n  typedef nsimd::packl<i32, N> btype;\n};\n\ntemplate <int N> struct type_t<KernelSIMD, 64, N> {\n  typedef nsimd::pack<i64, N> itype;\n  typedef nsimd::pack<u64, N> utype;\n  typedef nsimd::pack<f64, N> ftype;\n  typedef nsimd::packl<i64, N> btype;\n};\n\n// supported types (generic)\n#define k_int                                                                 \\\n  typename spmd::type_t<spmd_KernelType_, spmd_ScalarBits_, spmd_N_>::itype\n#define k_uint                                                                \\\n  typename spmd::type_t<spmd_KernelType_, spmd_ScalarBits_, spmd_N_>::utype\n#define k_float                                                               \\\n  typename spmd::type_t<spmd_KernelType_, spmd_ScalarBits_, spmd_N_>::ftype\n#define k_bool                                                                \\\n  typename spmd::type_t<spmd_KernelType_, spmd_ScalarBits_, spmd_N_>::btype\n\n// loads and stores (generic)\ntemplate <typename KernelType> struct store_helper {};\ntemplate <typename KernelType> struct load_helper {};\n#define k_store(base_addr, value)                                             \\\n  spmd::store_helper<spmd_KernelType_>::impl(spmd_mask_, &base_addr[spmd_i_], \\\n                                             value)\n#define k_unmasked_store(base_addr, value)                                    \\\n  spmd::store_helper<spmd_KernelType_>::unmasked_impl(&base_addr[spmd_i_],    \\\n                    
                                  value)\n\n#define k_load(base_addr)                                                     \\\n  spmd::load_helper<spmd_KernelType_>::impl(spmd_mask_, &base_addr[spmd_i_])\n#define k_unmasked_load(base_addr)                                            \\\n  spmd::load_helper<spmd_KernelType_>::template unmasked_impl<spmd_N_>(       \\\n      &base_addr[spmd_i_])\n\n// loads and stores (scalar)\ntemplate <> struct store_helper<KernelScalar> {\n  template <typename T, typename S>\n  static void impl(bool mask, T *addr, S value) {\n    if (mask) {\n      *addr = nsimd::to<T>(value);\n    }\n  }\n\n  template <typename T, typename S>\n  static void unmasked_impl(T *addr, S value) {\n    *addr = nsimd::to<T>(value);\n  }\n};\n\ntemplate <> struct load_helper<KernelScalar> {\n  template <typename T> static T impl(bool mask, T *addr) {\n    if (mask) {\n      return *addr;\n    } else {\n      return nsimd::to<T>(0);\n    }\n  }\n\n  template <int N, typename T> static T unmasked_impl(T *addr) {\n    return *addr;\n  }\n};\n\ntemplate <> struct store_helper<KernelSIMD> {\n  template <typename T, typename S, int N, typename SimdExt>\n  static void impl(nsimd::packl<T, N, SimdExt> const &mask, S *addr,\n                   nsimd::pack<S, N, SimdExt> const &value) {\n    nsimd::mask_storeu(mask, addr, value);\n  }\n\n  template <typename T, typename S, typename U, int N, typename SimdExt>\n  static void impl(nsimd::packl<T, N, SimdExt> const &mask, S *addr,\n                   U value) {\n    nsimd::mask_storeu(mask, addr,\n                       nsimd::pack<S, N, SimdExt>(nsimd::to<S>(value)));\n  }\n\n  template <typename T, int N, typename SimdExt>\n  static void unmasked_impl(T *addr, nsimd::pack<T, N, SimdExt> const &value) {\n    nsimd::storeu(addr, value);\n  }\n\n  template <typename T, typename S, int N, typename SimdExt>\n  static void unmasked_impl(T *addr, S value) {\n    nsimd::storeu(addr, nsimd::pack<T, N, 
SimdExt>(nsimd::to<T>(value)));\n  }\n};\n\ntemplate <> struct load_helper<KernelSIMD> {\n  template <typename T, typename S, int N, typename SimdExt>\n  static nsimd::pack<S, N, SimdExt>\n  impl(nsimd::packl<T, N, SimdExt> const &mask, S *addr) {\n    return nsimd::maskz_loadu(mask, addr);\n  }\n\n  template <int N, typename T>\n  static nsimd::pack<T, N> unmasked_impl(T *addr) {\n    return nsimd::loadu<nsimd::pack<T, N> >(addr);\n  }\n};\n\n// f32 <--> f16 conversions\n#define k_f32_to_f16(a) nsimd_f32_to_f16(a)\n#define k_f16_to_f32(a) nsimd_f16_to_f32(a)\n\n// Clear lanes\ntemplate <typename T, typename S, int N, typename SimdExt>\nnsimd::packl<T, N, SimdExt>\nclear_lanes(nsimd::packl<T, N, SimdExt> const &mask,\n            nsimd::packl<S, N, SimdExt> const &lanes) {\n  return nsimd::andnotl(mask, lanes);\n}\n\ninline bool clear_lanes(bool mask, bool lanes) { return lanes ? false : mask; }\n\n// assignment statement\ntemplate <typename T, typename S> void k_set_(bool mask, T &var, S value) {\n  if (mask) {\n    var = nsimd::to<T>(value);\n  }\n}\n\ntemplate <typename T, typename S, int N, typename SimdExt, typename U>\nvoid k_set_(nsimd::packl<T, N, SimdExt> const &mask,\n            nsimd::pack<S, N, SimdExt> &var, U value) {\n  var = nsimd::if_else(mask, nsimd::pack<S, N, SimdExt>(S(value)), var);\n}\n\ntemplate <typename T, typename S, int N, typename SimdExt>\nvoid k_set_(nsimd::packl<T, N, SimdExt> const &mask,\n            nsimd::pack<S, N, SimdExt> &var,\n            nsimd::pack<S, N, SimdExt> const &value) {\n  var = nsimd::if_else(mask, value, var);\n}\n\ntemplate <typename T, typename S, int N, typename SimdExt, typename U>\nvoid k_set_(nsimd::packl<T, N, SimdExt> const &mask,\n            nsimd::packl<S, N, SimdExt> &var, U value) {\n  var = nsimd::reinterpretl<nsimd::packl<S, N, SimdExt> >(\n      mask && nsimd::pack<S, N, SimdExt>(int(value)));\n}\n\ntemplate <typename T, typename S, int N, typename SimdExt, typename U>\nvoid 
k_set_(nsimd::packl<T, N, SimdExt> const &mask,\n            nsimd::packl<S, N, SimdExt> &var,\n            nsimd::packl<U, N, SimdExt> const &value) {\n  var = nsimd::reinterpretl<nsimd::packl<S, N, SimdExt> >(mask && value);\n}\n\n#define k_set(var, value) spmd::k_set_(spmd_mask_, var, value)\n#define k_unmasked_set(var, value)                                            \\\n  do {                                                                        \\\n    var = value;                                                              \\\n  } while (0)\n\ntemplate <typename T, int N, typename SimdExt>\nbool any(nsimd::packl<T, N, SimdExt> const a) {\n  return nsimd::any(a);\n}\n\ntemplate <typename KernelType, int ScalarBits, int N, typename Packl>\ntypename type_t<KernelType, ScalarBits, N>::btype to_k_bool_(Packl const &a) {\n  return nsimd::reinterpretl<\n      typename type_t<KernelType, ScalarBits, N>::btype>(a);\n}\n\ntemplate <typename KernelType, int ScalarBits, int N>\ninline bool to_k_bool_(bool a) {\n  return a;\n}\n\n#define k_to_bool(a)                                                          \\\n  spmd::to_k_bool_<spmd_KernelType_, spmd_ScalarBits_, spmd_N_>(a)\n\ninline bool any(bool a) { return a; }\n\n// while statement (k_while)\n#define k_while(cond)                                                         \\\n  {                                                                           \\\n    k_bool spmd_middle_mask_ = spmd_mask_;                                    \\\n    k_bool spmd_off_lanes_break_(false);                                      \\\n    (void)spmd_off_lanes_break_;                                              \\\n    k_bool spmd_off_lanes_continue_(false);                                   \\\n    (void)spmd_off_lanes_continue_;                                           \\\n    {                                                                         \\\n      while (spmd::any(cond)) {                                               
\\\n        k_bool spmd_cond_ =                                                   \\\n            spmd::to_k_bool_<spmd_KernelType_, spmd_ScalarBits_, spmd_N_>(    \\\n                cond);                                                        \\\n        {                                                                     \\\n          k_bool spmd_mask_ = spmd_cond_ && spmd_middle_mask_;                \\\n          spmd_mask_ = spmd::clear_lanes(spmd_mask_, spmd_off_lanes_break_);  \\\n          spmd_mask_ = spmd::clear_lanes(spmd_mask_, spmd_off_lanes_return_);\n\n// break statement (k_break)\n#define k_break                                                               \\\n  spmd_off_lanes_break_ = spmd_off_lanes_break_ || spmd_mask_;                \\\n  spmd_mask_ = false;\n\n// continue statement (k_continue)\n#define k_continue                                                            \\\n  spmd_off_lanes_continue_ = spmd_off_lanes_continue_ || spmd_mask_;          \\\n  spmd_mask_ = false;\n\n// endwhile statement (k_endwhile)\n#define k_endwhile                                                            \\\n  }                                                                           \\\n  }                                                                           \\\n  }                                                                           \\\n  }                                                                           \\\n  spmd_mask_ = spmd::clear_lanes(spmd_mask_, spmd_off_lanes_return_);\n\n// return statement (k_return)\n#define k_return                                                              \\\n  spmd_off_lanes_return_ = spmd_off_lanes_return_ || spmd_mask_;              \\\n  spmd_mask_ = false;\n\n// if statement (k_if)\n#define k_if(cond)                                                            \\\n  {                                                                           \\\n    k_bool spmd_cond_ =                            
                           \\\n        spmd::to_k_bool_<spmd_KernelType_, spmd_ScalarBits_, spmd_N_>(cond);  \\\n    k_bool spmd_middle_mask_ = spmd_mask_;                                    \\\n    {                                                                         \\\n      k_bool spmd_mask_ = spmd_cond_ && spmd_middle_mask_;\n\n// elseif statement (k_elseif)\n#define k_elseif(cond)                                                        \\\n  }                                                                           \\\n  spmd_mask_ = spmd::clear_lanes(spmd_mask_, spmd_off_lanes_return_);         \\\n  spmd_mask_ = spmd::clear_lanes(spmd_mask_, spmd_off_lanes_break_);          \\\n  spmd_mask_ = spmd::clear_lanes(spmd_mask_, spmd_off_lanes_continue_);       \\\n  spmd_middle_mask_ = spmd::clear_lanes(spmd_middle_mask_, spmd_cond_);       \\\n  spmd_cond_ =                                                                \\\n      spmd::to_k_bool_<spmd_KernelType_, spmd_ScalarBits_, spmd_N_>(cond);    \\\n  {                                                                           \\\n    k_bool spmd_mask_ = spmd_cond_ && spmd_middle_mask_;\n\n// else statement (k_else)\n#define k_else                                                                \\\n  }                                                                           \\\n  spmd_mask_ = spmd::clear_lanes(spmd_mask_, spmd_off_lanes_return_);         \\\n  spmd_mask_ = spmd::clear_lanes(spmd_mask_, spmd_off_lanes_break_);          \\\n  spmd_mask_ = spmd::clear_lanes(spmd_mask_, spmd_off_lanes_continue_);       \\\n  spmd_middle_mask_ = spmd::clear_lanes(spmd_middle_mask_, spmd_cond_);       \\\n  {                                                                           \\\n    k_bool spmd_mask_ = spmd_middle_mask_;\n\n// endif statement (k_endif)\n#define k_endif                                                               \\\n  }                                                                    
       \\\n  }                                                                           \\\n  spmd_mask_ = spmd::clear_lanes(spmd_mask_, spmd_off_lanes_return_);         \\\n  spmd_mask_ = spmd::clear_lanes(spmd_mask_, spmd_off_lanes_break_);          \\\n  spmd_mask_ = spmd::clear_lanes(spmd_mask_, spmd_off_lanes_continue_);\n\n// ----------------------------------------------------------------------------\n\n#endif\n\n#ifdef NSIMD_VARIADIC_MACROS_IS_EXTENSION\n  #if defined(NSIMD_IS_GCC)\n    #pragma GCC diagnostic pop\n  #elif defined(NSIMD_IS_CLANG)\n    #pragma clang diagnostic pop\n  #endif\n#endif\n\n} // namespace spmd\n\n#include <nsimd/modules/spmd/functions.hpp>\n\n#endif\n"
  },
  {
    "path": "include/nsimd/modules/tet1d.hpp",
    "content": "/*\n\nCopyright (c) 2021 Agenium Scale\n\nPermission is hereby granted, free of charge, to any person obtaining a copy\nof this software and associated documentation files (the \"Software\"), to deal\nin the Software without restriction, including without limitation the rights\nto use, copy, modify, merge, publish, distribute, sublicense, and/or sell\ncopies of the Software, and to permit persons to whom the Software is\nfurnished to do so, subject to the following conditions:\n\nThe above copyright notice and this permission notice shall be included in all\ncopies or substantial portions of the Software.\n\nTHE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\nIMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\nFITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\nAUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\nLIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\nOUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\nSOFTWARE.\n\n*/\n\n#ifndef NSIMD_MODULES_TET1D_HPP\n#define NSIMD_MODULES_TET1D_HPP\n\n#include <nsimd/nsimd-all.hpp>\n\n#include <cassert>\n#include <vector>\n#include <cstring>\n\nnamespace tet1d {\n\n// ----------------------------------------------------------------------------\n// general definitions\n\nstruct none_t {};\n\ntemplate <typename Op, typename Left, typename Right, typename Extra>\nstruct node {};\n\nconst nsimd::nat end = nsimd::nat(-1);\n\n// ----------------------------------------------------------------------------\n// Error management\n\n#if defined(NSIMD_CUDA)\n#define nsimd_cuda_assert(ans) tet1d::gpuCheck((ans), __FILE__, __LINE__)\ninline void gpuCheck(cudaError_t code, const char *file, int line) {\n  if (code != cudaSuccess) {\n    fprintf(stderr, \"NSIMD Internal error:\\n\\ttet1d Error: %s %s %d\\n\",\n        cudaGetErrorString(code), file, line);\n    
exit(code);\n  }\n}\n#endif\n\n// ----------------------------------------------------------------------------\n// supported kernels\n\n#if defined(NSIMD_CUDA)\n\n// CUDA component wise kernel\ntemplate <typename T, typename Expr>\n__global__ void gpu_kernel_component_wise(T *dst, Expr const expr,\n                                          nsimd::nat n) {\n  int i = threadIdx.x + blockIdx.x * blockDim.x;\n  if (i < n) {\n    dst[i] = expr.gpu_get(i);\n  }\n}\n\n// CUDA component wise kernel with masked output\ntemplate <typename T, typename Mask, typename Expr>\n__global__ void gpu_kernel_component_wise_mask(T *dst, Mask const mask,\n                                               Expr const expr,\n                                               nsimd::nat n) {\n  int i = threadIdx.x + blockIdx.x * blockDim.x;\n  if (i < n && mask.gpu_get(i)) {\n    dst[i] = expr.gpu_get(i);\n  }\n}\n\n#elif defined(NSIMD_ROCM)\n\n// ROCM component wise kernel\ntemplate <typename T, typename Expr>\n__global__ void gpu_kernel_component_wise(T *dst, Expr const expr,\n                                          nsimd::nat n) {\n  int i = int(hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x);\n  if (i < n) {\n    dst[i] = expr.gpu_get(i);\n  }\n}\n\n// ROCM component wise kernel with masked output\ntemplate <typename T, typename Mask, typename Expr>\n__global__ void gpu_kernel_component_wise_mask(T *dst, Mask const mask,\n                                               Expr const expr,\n                                               nsimd::nat n) {\n  int i = int(hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x);\n  if (i < n && mask.gpu_get(i)) {\n    dst[i] = expr.gpu_get(i);\n  }\n}\n\n#elif defined(NSIMD_ONEAPI)\n\n// oneAPI component wise kernel\ntemplate <typename T, typename Expr>\nvoid oneapi_kernel_component_wise(T *dst, Expr const expr,\n                                  nsimd::nat n, sycl::nd_item<1> item) {\n  const int i = static_cast<int>(item.get_global_id().get(0));\n  if (i < 
n) {\n    dst[i] = expr.gpu_get(i);\n  }\n}\n\n// oneAPI component wise kernel with masked output\ntemplate <typename T, typename Mask, typename Expr>\nvoid oneapi_kernel_component_wise_mask(T *dst, Mask const mask,\n                                               Expr const expr,\n                                               nsimd::nat n,\n\t\t\t\t\t       sycl::nd_item<1> item) {\n\n  nsimd::nat i = static_cast<nsimd::nat>(item.get_global_id().get(0));\n  if (i < n && mask.gpu_get(i)) {\n    dst[i] = expr.gpu_get(i);\n  }\n}\n\n#else\n\n// CPU component wise kernel\ntemplate <typename Pack, typename T, typename Expr>\nvoid cpu_kernel_component_wise(T *dst, Expr const &expr, nsimd::nat n) {\n  nsimd::nat i;\n  int len = nsimd::len(Pack());\n  for (i = 0; i + len < n; i += len) {\n    nsimd::storeu(&dst[i], expr.template simd_get<Pack>(i));\n  }\n  for (; i < n; i++) {\n    dst[i] = expr.scalar_get(i);\n  }\n}\n\n// CPU component wise kernel with masked output\ntemplate <typename Pack, typename T, typename Mask, typename Expr>\nvoid cpu_kernel_component_wise_mask(T *dst, Mask const &mask, Expr const &expr,\n                                    nsimd::nat n) {\n  nsimd::nat i;\n  int len = nsimd::len(Pack());\n  for (i = 0; i + len < n; i += len) {\n    nsimd::storeu(&dst[i], nsimd::if_else(mask.template simd_get<Pack>(i),\n                                          expr.template simd_get<Pack>(i),\n                                          nsimd::loadu<Pack>(&dst[i])));\n  }\n  for (; i < n; i++) {\n    if (mask.scalar_get(i)) {\n      dst[i] = expr.scalar_get(i);\n    }\n  }\n}\n\n#endif\n\n// ----------------------------------------------------------------------------\n// helper for computing sizes of 1D vectors\n\nnsimd::nat compute_size(nsimd::nat sz1, nsimd::nat sz2) {\n  assert(sz1 >= 0 || sz2 >= 0);\n  assert((sz1 < 0 && sz2 >= 0) || (sz1 >= 0 && sz2 < 0) || (sz1 == sz2));\n  if (sz1 < 0) {\n    return sz2;\n  } else {\n    return sz1;\n  }\n}\n\nnsimd::nat 
compute_size(nsimd::nat sz1, nsimd::nat sz2, nsimd::nat sz3) {\n  return compute_size(compute_size(sz1, sz2), sz3);\n}\n\n// ----------------------------------------------------------------------------\n// meta for building a pack from another ignoring the base type\n\ntemplate <typename T, typename Pack> struct to_pack_t {\n  static const int unroll = Pack::unroll;\n  typedef typename Pack::simd_ext simd_ext;\n  typedef nsimd::pack<T, unroll, simd_ext> type;\n};\n\ntemplate <typename T, int Unroll, typename SimdExt, typename Pack>\nstruct to_pack_t<nsimd::pack<T, Unroll, SimdExt>, Pack> {\n  static const int unroll = Pack::unroll;\n  typedef typename Pack::simd_ext simd_ext;\n  typedef nsimd::pack<T, unroll, simd_ext> type;\n};\n\ntemplate <typename T, typename Pack> struct to_packl_t {\n  static const int unroll = Pack::unroll;\n  typedef typename Pack::simd_ext simd_ext;\n  typedef nsimd::packl<T, unroll, simd_ext> type;\n};\n\ntemplate <typename T, int Unroll, typename SimdExt, typename Pack>\nstruct to_packl_t<nsimd::pack<T, Unroll, SimdExt>, Pack> {\n  static const int unroll = Pack::unroll;\n  typedef typename Pack::simd_ext simd_ext;\n  typedef nsimd::packl<T, unroll, simd_ext> type;\n};\n\n// ----------------------------------------------------------------------------\n// scalar node\n\nstruct scalar_t {};\n\ntemplate <typename T> struct node<scalar_t, none_t, none_t, T> {\n  typedef T in_type;\n  typedef T out_type;\n  T value;\n\n#if defined(NSIMD_CUDA) || defined(NSIMD_ROCM)\n  __device__ T gpu_get(nsimd::nat) const { return value; }\n#elif defined(NSIMD_ONEAPI)\n  T gpu_get(nsimd::nat) const { return value; }\n#else\n  T scalar_get(nsimd::nat) const { return value; }\n  template <typename Pack>\n  typename to_pack_t<T, Pack>::type simd_get(nsimd::nat) const {\n    typedef typename to_pack_t<T, Pack>::type pack;\n    return pack(value);\n  }\n#endif\n\n  nsimd::nat size() const { return -1; }\n};\n\n// 
----------------------------------------------------------------------------\n// build a node from a scalar and a node\n\ntemplate <typename T> struct to_node_t {\n  typedef node<scalar_t, none_t, none_t, T> type;\n\n  static type impl(T n) {\n    type ret;\n    ret.value = n;\n    return ret;\n  }\n};\n\ntemplate <typename Op, typename Left, typename Right, typename Extra>\nstruct to_node_t<node<Op, Left, Right, Extra> > {\n  typedef node<Op, Left, Right, Extra> type;\n  static type impl(type node) { return node; }\n};\n\ntemplate <typename T> typename to_node_t<T>::type to_node(T n) {\n  return to_node_t<T>::impl(n);\n}\n\n// ----------------------------------------------------------------------------\n// convert literal to one NSIMD base type\n\ntemplate <typename T> struct literal_to {\n  template <typename S> static T impl(S a) { return T(a); }\n};\n\ntemplate <> struct literal_to<f16> {\n  template <typename S> static f16 impl(S a) {\n    return nsimd_f32_to_f16(f32(a));\n  }\n};\n\n// ----------------------------------------------------------------------------\n// input node\n\nstruct in_t {};\n\n#define TET1D_IN(T) tet1d::node<tet1d::in_t, tet1d::none_t, tet1d::none_t, T>\n\ntemplate <typename T> struct node<in_t, none_t, none_t, T> {\n  const T *data;\n  nsimd::nat sz;\n  typedef T in_type;\n  typedef T out_type;\n\n#if defined(NSIMD_CUDA) || defined(NSIMD_ROCM)\n  __device__ T gpu_get(nsimd::nat i) const { return data[i]; }\n#elif defined(NSIMD_ONEAPI)\n  T gpu_get(nsimd::nat i) const { return data[i]; }\n#else\n  T scalar_get(nsimd::nat i) const { return data[i]; }\n  template <typename Pack>\n  typename to_pack_t<T, Pack>::type simd_get(nsimd::nat i) const {\n    typedef typename to_pack_t<T, Pack>::type pack;\n    return nsimd::loadu<pack>(&data[i]);\n  }\n#endif\n\n  nsimd::nat size() const { return sz; }\n\n  template <typename I0, typename I1>\n  node<in_t, none_t, none_t, T> operator()(I0 i0_, I1 i1_) const {\n    node<in_t, none_t, none_t, T> 
ret;\n    nsimd::nat i0 = nsimd::nat(i0_);\n    nsimd::nat i1 = nsimd::nat(i1_);\n    i0 = i0 >= 0 ? i0 : sz + i0;\n    i1 = i1 >= 0 ? i1 : sz + i1;\n    assert(0 <= i0 && i0 < i1 && i1 < sz);\n    ret.data = &data[i0];\n    ret.sz = i1 - i0 + 1;\n    return ret;\n  }\n};\n\n// return an input node from a pointer\ntemplate <typename T, typename I>\ninline node<in_t, none_t, none_t, T> in(const T *data, I sz) {\n  node<in_t, none_t, none_t, T> ret;\n  ret.data = data;\n  ret.sz = nsimd::nat(sz);\n  return ret;\n}\n\n// ----------------------------------------------------------------------------\n// output with condition node: I(I > 50) = ...\n\nstruct mask_out_t {};\n\ntemplate <typename Mask, typename Pack>\nstruct node<mask_out_t, Mask, none_t, Pack> {\n  typedef typename Pack::value_type T;\n  T *data;\n  nsimd::nat threads_per_block;\n  void *stream;\n  Mask mask;\n\n  template <typename Op, typename Left, typename Right, typename Extra>\n  node<mask_out_t, Mask, none_t, Pack>\n  operator=(node<Op, Left, Right, Extra> const &expr) {\n#if defined(NSIMD_CUDA) || defined(NSIMD_ROCM) || defined(NSIMD_ONEAPI)\n    nsimd::nat expr_size = compute_size(mask.size(), expr.size());\n    nsimd::nat nt = threads_per_block < 0 ? 128 : threads_per_block;\n    nsimd::nat param = nsimd_kernel_param(expr_size, nt);\n    assert(nt > 0 && nt <= UINT_MAX);\n    assert(param > 0 && param <= UINT_MAX);\n#if defined(NSIMD_CUDA)\n    cudaStream_t s = (stream == NULL ? NULL : *(cudaStream_t *)stream);\n\n    // clang-format off\n    gpu_kernel_component_wise_mask<<<(unsigned int)(param), (unsigned int)(nt),\n                                     0, s>>>\n                                     (data, mask, expr, expr_size);\n    // clang-format on\n#elif defined(NSIMD_ROCM)\n    hipStream_t s = stream == NULL ? 
NULL : *(hipStream_t *)stream;\n    hipLaunchKernelGGL(gpu_kernel_component_wise_mask, (unsigned int)(param),\n                       (unsigned int)(nt), 0, s, data, mask, expr,\n                       expr_size);\n#else\n    sycl::queue q = nsimd::oneapi::default_queue();\n    q.parallel_for(sycl::nd_range<1>(sycl::range<1>((size_t)param),\n                                     sycl::range<1>((size_t)nt)),\n                   [=, *this](sycl::nd_item<1> item) {\n                     oneapi_kernel_component_wise_mask(data, mask, expr,\n                                                       expr_size, item);\n                   })\n        .wait_and_throw();\n\n#endif\n#else\n    cpu_kernel_component_wise_mask<Pack>(\n        data, mask, expr, compute_size(mask.size(), expr.size()));\n#endif\n    return *this;\n  }\n\n  template <typename S> node<mask_out_t, Mask, none_t, Pack> operator=(S a) {\n    return operator=(to_node(literal_to<T>::impl(a)));\n  }\n};\n\n// ----------------------------------------------------------------------------\n// output node\n\nstruct out_t {};\n\n#define TET1D_OUT(T)                                                          \\\n  tet1d::node<tet1d::out_t, tet1d::none_t, tet1d::none_t, nsimd::pack<T> >\n\n#define TET1D_OUT_EX(T, N, SimdExt)                                           \\\n  tet1d::node<tet1d::out_t, tet1d::none_t, tet1d::none_t,                     \\\n              nsimd::pack<T, N, SimdExt> >\n\ntemplate <typename Pack> struct node<out_t, none_t, none_t, Pack> {\n  typedef typename Pack::value_type T;\n  T *data;\n  nsimd::nat threads_per_block;\n  void *stream;\n\n  template <typename Mask>\n  node<mask_out_t, Mask, none_t, Pack> operator()(Mask mask) const {\n    node<mask_out_t, Mask, none_t, Pack> ret;\n    ret.data = data;\n    ret.mask = mask;\n    ret.threads_per_block = threads_per_block;\n    ret.stream = stream;\n    return ret;\n  }\n\n  template <typename Op, typename Left, typename Right, typename Extra>\n  
node<out_t, none_t, none_t, Pack>\n  operator=(node<Op, Left, Right, Extra> const &expr) {\n#if defined(NSIMD_CUDA) || defined(NSIMD_ROCM) || defined(NSIMD_ONEAPI)\n    nsimd::nat nt = threads_per_block < 0 ? 128 : threads_per_block;\n    nsimd::nat param = nsimd_kernel_param(expr.size(), nt);\n    assert(nt > 0 && nt <= UINT_MAX);\n    assert(param > 0 && param <= UINT_MAX);\n#if defined(NSIMD_CUDA)\n    cudaStream_t s = stream == NULL ? NULL : *(cudaStream_t *)stream;\n\n    // clang-format off\n    gpu_kernel_component_wise<<<(unsigned int)(param), (unsigned int)(nt),\n                                0, s>>>(data, expr, expr.size());\n    // clang-format on\n\n#elif defined(NSIMD_ROCM)\n    hipStream_t s = stream == NULL ? NULL : *(hipStream_t *)stream;\n    hipLaunchKernelGGL(\n        (gpu_kernel_component_wise<T, node<Op, Left, Right, Extra> >),\n        (unsigned int)(param), (unsigned int)(nt), 0, s, data, expr,\n        expr.size());\n#else\n    sycl::queue q = nsimd::oneapi::default_queue();\n    q.parallel_for(\n         sycl::nd_range<1>(sycl::range<1>((size_t)param),\n                                          sycl::range<1>((size_t)nt)),\n         [=, *this](sycl::nd_item<1> item) {\n           oneapi_kernel_component_wise(data, expr, expr.size(), item);\n         })\n        .wait_and_throw();\n#endif\n#else\n    cpu_kernel_component_wise<Pack>(data, expr, expr.size());\n#endif\n    return *this;\n  }\n};\n\n// return an output node from a pointer\ntemplate <typename T>\nnode<out_t, none_t, none_t, nsimd::pack<T> > out(T *data) {\n  node<out_t, none_t, none_t, nsimd::pack<T> > ret;\n  ret.data = data;\n  ret.threads_per_block = 128;\n  ret.stream = NULL;\n  return ret;\n}\n\ntemplate <typename T, typename Pack>\nnode<out_t, none_t, none_t, Pack> out(T *data, int threads_per_block,\n                                      void *stream) {\n  node<out_t, none_t, none_t, Pack> ret;\n  ret.data = data;\n  ret.threads_per_block = threads_per_block;\n  
ret.stream = stream;\n  return ret;\n}\n\n// ----------------------------------------------------------------------------\n\n} // namespace tet1d\n\n#include <nsimd/modules/tet1d/functions.hpp>\n\n#endif\n"
  },
  {
    "path": "include/nsimd/nsimd-all.h",
    "content": "/*\n\nCopyright (c) 2021 Agenium Scale\n\nPermission is hereby granted, free of charge, to any person obtaining a copy\nof this software and associated documentation files (the \"Software\"), to deal\nin the Software without restriction, including without limitation the rights\nto use, copy, modify, merge, publish, distribute, sublicense, and/or sell\ncopies of the Software, and to permit persons to whom the Software is\nfurnished to do so, subject to the following conditions:\n\nThe above copyright notice and this permission notice shall be included in all\ncopies or substantial portions of the Software.\n\nTHE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\nIMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\nFITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\nAUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\nLIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\nOUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\nSOFTWARE.\n\n*/\n\n#ifndef NSIMD_ALL_H\n#define NSIMD_ALL_H\n\n#include <nsimd/nsimd.h>\n#include <nsimd/c_adv_api.h>\n\n#endif\n"
  },
  {
    "path": "include/nsimd/nsimd-all.hpp",
    "content": "/*\n\nCopyright (c) 2021 Agenium Scale\n\nPermission is hereby granted, free of charge, to any person obtaining a copy\nof this software and associated documentation files (the \"Software\"), to deal\nin the Software without restriction, including without limitation the rights\nto use, copy, modify, merge, publish, distribute, sublicense, and/or sell\ncopies of the Software, and to permit persons to whom the Software is\nfurnished to do so, subject to the following conditions:\n\nThe above copyright notice and this permission notice shall be included in all\ncopies or substantial portions of the Software.\n\nTHE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\nIMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\nFITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\nAUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\nLIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\nOUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\nSOFTWARE.\n\n*/\n\n#ifndef NSIMD_ALL_HPP\n#define NSIMD_ALL_HPP\n\n#include <nsimd/nsimd.h>\n#include <nsimd/cxx_adv_api.hpp>\n#include <nsimd/cxx_adv_api_aliases.hpp>\n#include <nsimd/friendly_but_not_optimized.hpp>\n\n#endif\n"
  },
  {
    "path": "include/nsimd/nsimd.h",
    "content": "/*\n\nCopyright (c) 2021 Agenium Scale\n\nPermission is hereby granted, free of charge, to any person obtaining a copy\nof this software and associated documentation files (the \"Software\"), to deal\nin the Software without restriction, including without limitation the rights\nto use, copy, modify, merge, publish, distribute, sublicense, and/or sell\ncopies of the Software, and to permit persons to whom the Software is\nfurnished to do so, subject to the following conditions:\n\nThe above copyright notice and this permission notice shall be included in all\ncopies or substantial portions of the Software.\n\nTHE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\nIMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\nFITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\nAUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\nLIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\nOUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\nSOFTWARE.\n\n*/\n\n#ifndef NSIMD_H\n#define NSIMD_H\n\n/* clang-format off */\n\n/* ------------------------------------------------------------------------- */\n/* Compiler detection (order matters https://stackoverflow.com/a/28166605)   */\n\n/* Detect host compiler */\n#if defined(_MSC_VER)\n  #define NSIMD_IS_MSVC\n#elif defined(__ibmxl_version__)\n  #define NSIMD_IS_XLC\n#elif defined(__FCC_version__)\n  #define NSIMD_IS_FCC\n#elif defined(__INTEL_COMPILER)\n  #define NSIMD_IS_ICC\n#elif defined(__clang__)\n  #define NSIMD_IS_CLANG\n#elif defined(__GNUC__) || defined(__GNUG__)\n  #define NSIMD_IS_GCC\n#endif\n\n/* Detect device compiler, if any */\n#if defined(__HIPCC__)\n  #define NSIMD_IS_HIPCC\n#elif defined(__INTEL_CLANG_COMPILER) || defined(__INTEL_LLVM_COMPILER)\n  #define NSIMD_IS_DPCPP\n#elif defined(__NVCC__)\n  #define NSIMD_IS_NVCC\n#endif\n\n/* 
------------------------------------------------------------------------- */\n/* C standard detection */\n\n#ifdef NSIMD_IS_MSVC\n  #define NSIMD_C 1999\n#else\n  #ifdef __STDC_VERSION__\n    #if __STDC_VERSION__ == 199901L\n      #define NSIMD_C 1999\n    #elif __STDC_VERSION__ >= 201112L\n      #define NSIMD_C 2011\n    #else\n      #define NSIMD_C 1989\n    #endif\n  #else\n    #define NSIMD_C 1989\n  #endif\n#endif\n\n/* ------------------------------------------------------------------------- */\n/* C++ standard detection */\n\n#ifdef NSIMD_IS_MSVC\n  #ifdef _MSVC_LANG\n    #define NSIMD__cplusplus _MSVC_LANG\n  #else\n    #define NSIMD__cplusplus __cplusplus\n  #endif\n#else\n  #ifdef __cplusplus\n    #define NSIMD__cplusplus __cplusplus\n  #else\n    #define NSIMD__cplusplus 0\n  #endif\n#endif\n\n#if NSIMD__cplusplus > 0 && NSIMD__cplusplus < 201103L\n  #define NSIMD_CXX 1998\n#elif NSIMD__cplusplus >= 201103L && NSIMD__cplusplus < 201402L\n  #define NSIMD_CXX 2011\n#elif NSIMD__cplusplus >= 201402L && NSIMD__cplusplus < 201703L\n  #define NSIMD_CXX 2014\n#elif NSIMD__cplusplus == 201703L\n  #define NSIMD_CXX 2017\n#elif NSIMD__cplusplus >= 201704L\n  #define NSIMD_CXX 2020\n#else\n  #define NSIMD_CXX 0\n#endif\n\n#if NSIMD_CXX >= 2020\n  #include <concepts>\n  #include <cstddef>\n#endif\n\n/* ------------------------------------------------------------------------- */\n/* Use of long long for GCC even in C89 and C++98. Note that for some reason */\n/* the use of the __extension__ keyword does not prevent warning so we deal  */\n/* with them now. We keep the __extension__ keyword in case.                 
*/\n\n#if NSIMD_CXX < 2011 && NSIMD_C < 1999\n  #define NSIMD_LONGLONG_IS_EXTENSION\n#endif\n\n#ifdef NSIMD_LONGLONG_IS_EXTENSION\n  #if defined(NSIMD_IS_GCC)\n    #pragma GCC diagnostic push\n    #pragma GCC diagnostic ignored \"-Wlong-long\"\n  #elif defined(NSIMD_IS_CLANG)\n    #pragma clang diagnostic push\n    #pragma clang diagnostic ignored \"-Wlong-long\"\n  #endif\n#endif\n\ntypedef long long nsimd_longlong;\ntypedef unsigned long long nsimd_ulonglong;\n\n#if NSIMD_CXX > 0\nnamespace nsimd {\n  typedef long long longlong;\n  typedef unsigned long long ulonglong;\n} // namespace nsimd\n#endif\n\n#ifdef __UINT64_TYPE__\n  typedef __UINT64_TYPE__ nsimd_uint64_type;\n#endif\n\n#ifdef __INT64_TYPE__\n  typedef __INT64_TYPE__ nsimd_int64_type;\n#endif\n\n#ifdef NSIMD_LONGLONG_IS_EXTENSION\n  #if defined(NSIMD_IS_GCC)\n    #pragma GCC diagnostic pop\n  #elif defined(NSIMD_IS_CLANG)\n    #pragma clang diagnostic pop\n  #endif\n#endif\n\n/* ------------------------------------------------------------------------- */\n/* Register size detection */\n\n#if defined(__x86_64) || defined(__x86_64__) || defined(__amd64__) ||         \\\n    defined(__amd64) || defined(_M_AMD64) || defined(__aarch64__) ||          \\\n    defined(_M_ARM64) || defined(__PPC64__)\n  #define NSIMD_WORD_SIZE 64\n#else\n  #define NSIMD_WORD_SIZE 32\n#endif\n\n/* ------------------------------------------------------------------------- */\n/* Architecture detection */\n\n#if defined(i386) || defined(__i386__) || defined(__i486__) ||                \\\n    defined(__i586__) || defined(__i686__) || defined(__i386) ||              \\\n    defined(_M_IX86) || defined(_X86_) || defined(__THW_INTEL__) ||           \\\n    defined(__I86__) || defined(__INTEL__) || defined(__x86_64) ||            \\\n    defined(__x86_64__) || defined(__amd64__) || defined(__amd64) ||          \\\n    defined(_M_X64)\n  #define NSIMD_X86\n#elif defined(__arm__) || defined(__arm64) || defined(__thumb__) ||           \\\n 
   defined(__TARGET_ARCH_ARM) || defined(__TARGET_ARCH_THUMB) ||             \\\n    defined(_M_ARM) || defined(_M_ARM64) || defined(__arch64__)\n  #define NSIMD_ARM\n#elif defined(__ppc__) || defined(__powerpc__) || defined(__PPC__)\n  #define NSIMD_POWERPC\n#else\n  #define NSIMD_GENERIC\n#endif\n\n/* ------------------------------------------------------------------------- */\n/* Microsoft DLL specifics */\n\n#ifdef NSIMD_IS_MSVC\n  #define NSIMD_DLLEXPORT __declspec(dllexport)\n  #define NSIMD_DLLIMPORT __declspec(dllimport)\n#else\n  #define NSIMD_DLLEXPORT\n  #define NSIMD_DLLIMPORT extern\n#endif\n\n/* ------------------------------------------------------------------------- */\n/* DLL specifics when inside/outside the library */\n\n#ifdef NSIMD_INSIDE\n  #define NSIMD_DLLSPEC NSIMD_DLLEXPORT\n#else\n  #define NSIMD_DLLSPEC NSIMD_DLLIMPORT\n#endif\n\n/* ------------------------------------------------------------------------- */\n/* Vector calling convention: https://devblogs.microsoft.com/cppblog\n                                  /introducing-vector-calling-convention/ */\n\n#if defined(NSIMD_IS_MSVC) && NSIMD_WORD_SIZE == 32\n  #define NSIMD_VECTORCALL __vectorcall\n#else\n  #define NSIMD_VECTORCALL\n#endif\n\n/* ------------------------------------------------------------------------- */\n/* inline in nsimd is ONLY useful for linkage */\n\n#if NSIMD_CXX > 0 || NSIMD_C > 1989\n  #if NSIMD_C > 0 && defined(NSIMD_IS_MSVC)\n    #define NSIMD_INLINE static __inline\n  #else\n    #define NSIMD_INLINE static inline\n  #endif\n#else\n  #if defined(NSIMD_IS_GCC) || defined(NSIMD_IS_CLANG)\n    #define NSIMD_INLINE __extension__ static __inline\n  #else\n    #define NSIMD_INLINE\n  #endif\n#endif\n\n/* ------------------------------------------------------------------------- */\n/* Pre-processor */\n\n#define NSIMD_PP_CAT_2_e(a, b) a##b\n#define NSIMD_PP_CAT_2(a, b) NSIMD_PP_CAT_2_e(a, b)\n\n#define NSIMD_PP_CAT_3_e(a, b, c) a##b##c\n#define NSIMD_PP_CAT_3(a, b, 
c) NSIMD_PP_CAT_3_e(a, b, c)\n\n#define NSIMD_PP_CAT_4_e(a, b, c, d) a##b##c##d\n#define NSIMD_PP_CAT_4(a, b, c, d) NSIMD_PP_CAT_4_e(a, b, c, d)\n\n#define NSIMD_PP_CAT_5_e(a, b, c, d, e) a##b##c##d##e\n#define NSIMD_PP_CAT_5(a, b, c, d, e) NSIMD_PP_CAT_5_e(a, b, c, d, e)\n\n#define NSIMD_PP_CAT_6_e(a, b, c, d, e, f) a##b##c##d##e##f\n#define NSIMD_PP_CAT_6(a, b, c, d, e, f) NSIMD_PP_CAT_6_e(a, b, c, d, e, f)\n\n#define NSIMD_PP_EXPAND_e(a) a\n#define NSIMD_PP_EXPAND(a) NSIMD_PP_EXPAND_e(a)\n\n/* ------------------------------------------------------------------------- */\n/* Detect architecture/SIMD */\n\n#if defined(CPU) && !defined(NSIMD_CPU)\n  #define NSIMD_CPU\n#endif\n\n/* Intel */\n\n#if defined(SSE2) && !defined(NSIMD_SSE2)\n  #define NSIMD_SSE2\n#endif\n\n#if defined(SSE42) && !defined(NSIMD_SSE42)\n  #define NSIMD_SSE42\n#endif\n\n#if defined(AVX) && !defined(NSIMD_AVX)\n  #define NSIMD_AVX\n#endif\n\n#if defined(AVX2) && !defined(NSIMD_AVX2)\n  #define NSIMD_AVX2\n#endif\n\n#if defined(AVX512_KNL) && !defined(NSIMD_AVX512_KNL)\n  #define NSIMD_AVX512_KNL\n#endif\n\n#if defined(AVX512_SKYLAKE) && !defined(NSIMD_AVX512_SKYLAKE)\n  #define NSIMD_AVX512_SKYLAKE\n#endif\n\n#if defined(FP16) && !defined(NSIMD_FP16)\n  #define NSIMD_FP16\n#endif\n\n#if defined(FMA) && !defined(NSIMD_FMA)\n  #define NSIMD_FMA\n#endif\n\n/* ARM */\n\n#if defined(NEON128) && !defined(NSIMD_NEON128)\n  #define NSIMD_NEON128\n#endif\n\n#if defined(AARCH64) && !defined(NSIMD_AARCH64)\n  #define NSIMD_AARCH64\n#endif\n\n#if defined(SVE) && !defined(NSIMD_SVE)\n  #define NSIMD_SVE\n  #define NSIMD_SVE_FAMILY\n#endif\n\n#if defined(SVE128) && !defined(NSIMD_SVE128)\n  #define NSIMD_SVE128\n  #define NSIMD_SVE_FAMILY\n#endif\n\n#if defined(SVE256) && !defined(NSIMD_SVE256)\n  #define NSIMD_SVE256\n  #define NSIMD_SVE_FAMILY\n#endif\n\n#if defined(SVE512) && !defined(NSIMD_SVE512)\n  #define NSIMD_SVE512\n  #define NSIMD_SVE_FAMILY\n#endif\n\n#if defined(SVE1024) && 
!defined(NSIMD_SVE1024)\n  #define NSIMD_SVE1024\n  #define NSIMD_SVE_FAMILY\n#endif\n\n#if defined(SVE2048) && !defined(NSIMD_SVE2048)\n  #define NSIMD_SVE2048\n  #define NSIMD_SVE_FAMILY\n#endif\n\n/* PPC */\n\n#if (defined(VMX) || defined(ALTIVEC)) && !defined(NSIMD_VMX)\n#define NSIMD_VMX\n#endif\n\n#if defined(VSX) && !defined(NSIMD_VSX)\n#define NSIMD_VSX\n#endif\n\n/* CUDA */\n\n#if defined(CUDA) && !defined(NSIMD_CUDA)\n  #define NSIMD_CUDA\n#endif\n\n/* ROCm */\n\n#if defined(ROCM) && !defined(NSIMD_ROCM)\n  #define NSIMD_ROCM\n#endif\n\n/* oneAPI */\n\n#if defined(ONEAPI) && !defined(NSIMD_ONEAPI)\n  #define NSIMD_ONEAPI\n  /* undef ONEAPI is needed because ONEAPI is used as a namespace in DPC++:\n     sycl::ONEAPI */\n  #ifdef ONEAPI\n    #undef ONEAPI\n  #endif\n#endif\n\n/* ------------------------------------------------------------------------- */\n/* Set NSIMD_SIMD and NSIMD_PLATFORM macro, include the correct header. */\n\n#if defined(NSIMD_SSE2)\n\n  #define NSIMD_PLATFORM x86\n  #define NSIMD_SIMD sse2\n  #include <emmintrin.h>\n  #if defined(NSIMD_FMA) || defined(NSIMD_FP16)\n    #include <immintrin.h>\n  #endif\n  /* For some reason MSVC <= 2015 has intrinsics defined in another header */\n  #ifdef NSIMD_IS_MSVC\n    #include <intrin.h>\n  #endif\n\n  #if NSIMD_CXX > 0\n    namespace nsimd {\n      struct cpu {};\n      struct sse2 {};\n      #if NSIMD_CXX >= 2020\n        template <typename T>\n        concept simd_ext_c = std::is_same_v<T, nsimd::cpu> ||\n                             std::is_same_v<T, nsimd::sse2>;\n        #define NSIMD_LIST_SIMD_EXT cpu, sse2\n      #endif\n    } // namespace nsimd\n  #endif\n\n#elif defined(NSIMD_SSE42)\n\n  #define NSIMD_PLATFORM x86\n  #define NSIMD_SIMD sse42\n  #include <nmmintrin.h>\n  #if defined(NSIMD_FMA) || defined(NSIMD_FP16)\n    #include <immintrin.h>\n  #endif\n  /* For some reason MSVC <= 2015 has intrinsics defined in another header */\n  #ifdef NSIMD_IS_MSVC\n    #include <intrin.h>\n  
#endif\n\n  #if NSIMD_CXX > 0\n    namespace nsimd {\n      struct cpu {};\n      struct sse2 {};\n      struct sse42 {};\n      #if NSIMD_CXX >= 2020\n        template <typename T>\n        concept simd_ext_c = std::is_same_v<T, nsimd::cpu> ||\n                             std::is_same_v<T, nsimd::sse2> ||\n                             std::is_same_v<T, nsimd::sse42>;\n        #define NSIMD_LIST_SIMD_EXT cpu, sse2, sse42\n      #endif\n    } // namespace nsimd\n  #endif\n\n#elif defined(NSIMD_AVX)\n\n  #define NSIMD_PLATFORM x86\n  #define NSIMD_SIMD avx\n  #include <immintrin.h>\n  /* For some reason MSVC <= 2015 has intrinsics defined in another header */\n  #ifdef NSIMD_IS_MSVC\n    #include <intrin.h>\n  #endif\n\n  #if NSIMD_CXX > 0\n    namespace nsimd {\n      struct cpu {};\n      struct sse2 {};\n      struct sse42 {};\n      struct avx {};\n      #if NSIMD_CXX >= 2020\n        template <typename T>\n        concept simd_ext_c = std::is_same_v<T, nsimd::cpu> ||\n                             std::is_same_v<T, nsimd::sse2> ||\n                             std::is_same_v<T, nsimd::sse42> ||\n                             std::is_same_v<T, nsimd::avx>;\n        #define NSIMD_LIST_SIMD_EXT cpu, sse2, sse42, avx\n      #endif\n    } // namespace nsimd\n  #endif\n\n#elif defined(NSIMD_AVX2)\n\n  #define NSIMD_PLATFORM x86\n  #define NSIMD_SIMD avx2\n  #include <immintrin.h>\n  /* For some reason MSVC <= 2015 has intrinsics defined in another header */\n  #ifdef NSIMD_IS_MSVC\n    #include <intrin.h>\n  #endif\n\n  #if NSIMD_CXX > 0\n    namespace nsimd {\n      struct cpu {};\n      struct sse2 {};\n      struct sse42 {};\n      struct avx {};\n      struct avx2 {};\n      #if NSIMD_CXX >= 2020\n        template <typename T>\n        concept simd_ext_c = std::is_same_v<T, nsimd::cpu> ||\n                             std::is_same_v<T, nsimd::sse2> ||\n                             std::is_same_v<T, nsimd::sse42> ||\n                             std::is_same_v<T, 
nsimd::avx> ||\n                             std::is_same_v<T, nsimd::avx2>;\n        #define NSIMD_LIST_SIMD_EXT cpu, sse2, sse42, avx, avx2\n      #endif\n    } // namespace nsimd\n  #endif\n\n#elif defined(NSIMD_AVX512_KNL)\n\n  #define NSIMD_PLATFORM x86\n  #define NSIMD_SIMD avx512_knl\n  #include <immintrin.h>\n\n  #if NSIMD_CXX > 0\n    namespace nsimd {\n      struct cpu {};\n      struct sse2 {};\n      struct sse42 {};\n      struct avx {};\n      struct avx2 {};\n      struct avx512_knl {};\n      #if NSIMD_CXX >= 2020\n        template <typename T>\n        concept simd_ext_c = std::is_same_v<T, nsimd::cpu> ||\n                             std::is_same_v<T, nsimd::sse2> ||\n                             std::is_same_v<T, nsimd::sse42> ||\n                             std::is_same_v<T, nsimd::avx> ||\n                             std::is_same_v<T, nsimd::avx2> ||\n                             std::is_same_v<T, nsimd::avx512_knl>;\n        #define NSIMD_LIST_SIMD_EXT cpu, sse2, sse42, avx, avx2, avx512_knl\n      #endif\n    } // namespace nsimd\n  #endif\n\n#elif defined(NSIMD_AVX512_SKYLAKE)\n\n  #define NSIMD_PLATFORM x86\n  #define NSIMD_SIMD avx512_skylake\n  #include <immintrin.h>\n\n  #if NSIMD_CXX > 0\n    namespace nsimd {\n      struct cpu {};\n      struct sse2 {};\n      struct sse42 {};\n      struct avx {};\n      struct avx2 {};\n      struct avx512_skylake {};\n      #if NSIMD_CXX >= 2020\n        template <typename T>\n        concept simd_ext_c = std::is_same_v<T, nsimd::cpu> ||\n                             std::is_same_v<T, nsimd::sse2> ||\n                             std::is_same_v<T, nsimd::sse42> ||\n                             std::is_same_v<T, nsimd::avx> ||\n                             std::is_same_v<T, nsimd::avx2> ||\n                             std::is_same_v<T, nsimd::avx512_skylake>;\n        #define NSIMD_LIST_SIMD_EXT cpu, sse2, sse42, avx, avx2, avx512_skylake\n      #endif\n    } // namespace nsimd\n  #endif\n\n#elif 
defined(NSIMD_NEON128)\n\n  #define NSIMD_PLATFORM arm\n  #define NSIMD_SIMD neon128\n  #include <arm_neon.h>\n\n  #if NSIMD_CXX > 0\n    namespace nsimd {\n      struct cpu {};\n      struct neon128 {};\n      #if NSIMD_CXX >= 2020\n        template <typename T>\n        concept simd_ext_c = std::is_same_v<T, nsimd::cpu> ||\n                             std::is_same_v<T, nsimd::neon128>;\n        #define NSIMD_LIST_SIMD_EXT cpu, neon128\n      #endif\n    } // namespace nsimd\n  #endif\n\n#elif defined(NSIMD_AARCH64)\n\n  #define NSIMD_PLATFORM arm\n  #define NSIMD_SIMD aarch64\n  #include <arm_neon.h>\n\n  #if NSIMD_CXX > 0\n    namespace nsimd {\n      struct cpu {};\n      struct aarch64 {};\n      #if NSIMD_CXX >= 2020\n        template <typename T>\n        concept simd_ext_c = std::is_same_v<T, nsimd::cpu> ||\n                             std::is_same_v<T, nsimd::aarch64>;\n        #define NSIMD_LIST_SIMD_EXT cpu, aarch64\n      #endif\n    } // namespace nsimd\n  #endif\n\n#elif defined(NSIMD_SVE)\n\n  #define NSIMD_PLATFORM arm\n  #define NSIMD_SIMD sve\n  #include <arm_neon.h>\n  #include <arm_sve.h>\n\n  #if NSIMD_CXX > 0\n    namespace nsimd {\n      struct cpu {};\n      struct aarch64 {};\n      struct sve {};\n      #if NSIMD_CXX >= 2020\n        template <typename T>\n        concept simd_ext_c = std::is_same_v<T, nsimd::cpu> ||\n                             std::is_same_v<T, nsimd::aarch64> ||\n                             std::is_same_v<T, nsimd::sve>;\n        #define NSIMD_LIST_SIMD_EXT cpu, aarch64, sve\n      #endif\n    } // namespace nsimd\n  #endif\n\n#elif defined(NSIMD_SVE128)\n\n  #define NSIMD_PLATFORM arm\n  #define NSIMD_SIMD sve128\n  #include <arm_neon.h>\n  #include <arm_sve.h>\n\n  #if NSIMD_CXX > 0\n    namespace nsimd {\n      struct cpu {};\n      struct aarch64 {};\n      struct sve128 {};\n      #if NSIMD_CXX >= 2020\n        template <typename T>\n        concept simd_ext_c = std::is_same_v<T, nsimd::cpu> ||\n                
             std::is_same_v<T, nsimd::aarch64> ||\n                             std::is_same_v<T, nsimd::sve128>;\n        #define NSIMD_LIST_SIMD_EXT cpu, aarch64, sve128\n      #endif\n    } // namespace nsimd\n  #endif\n\n#elif defined(NSIMD_SVE256)\n\n  #define NSIMD_PLATFORM arm\n  #define NSIMD_SIMD sve256\n  #include <arm_neon.h>\n  #include <arm_sve.h>\n\n  #if NSIMD_CXX > 0\n    namespace nsimd {\n      struct cpu {};\n      struct aarch64 {};\n      struct sve256 {};\n      #if NSIMD_CXX >= 2020\n        template <typename T>\n        concept simd_ext_c = std::is_same_v<T, nsimd::cpu> ||\n                             std::is_same_v<T, nsimd::aarch64> ||\n                             std::is_same_v<T, nsimd::sve256>;\n        #define NSIMD_LIST_SIMD_EXT cpu, aarch64, sve256\n      #endif\n    } // namespace nsimd\n  #endif\n\n#elif defined(NSIMD_SVE512)\n\n  #define NSIMD_PLATFORM arm\n  #define NSIMD_SIMD sve512\n  #include <arm_neon.h>\n  #include <arm_sve.h>\n\n  #if NSIMD_CXX > 0\n    namespace nsimd {\n      struct cpu {};\n      struct aarch64 {};\n      struct sve512 {};\n      #if NSIMD_CXX >= 2020\n        template <typename T>\n        concept simd_ext_c = std::is_same_v<T, nsimd::cpu> ||\n                             std::is_same_v<T, nsimd::aarch64> ||\n                             std::is_same_v<T, nsimd::sve512>;\n        #define NSIMD_LIST_SIMD_EXT cpu, aarch64, sve512\n      #endif\n    } // namespace nsimd\n  #endif\n\n#elif defined(NSIMD_SVE1024)\n\n  #define NSIMD_PLATFORM arm\n  #define NSIMD_SIMD sve1024\n  #include <arm_neon.h>\n  #include <arm_sve.h>\n\n  #if NSIMD_CXX > 0\n    namespace nsimd {\n      struct cpu {};\n      struct aarch64 {};\n      struct sve1024 {};\n      #if NSIMD_CXX >= 2020\n        template <typename T>\n        concept simd_ext_c = std::is_same_v<T, nsimd::cpu> ||\n                             std::is_same_v<T, nsimd::aarch64> ||\n                             std::is_same_v<T, nsimd::sve1024>;\n        
#define NSIMD_LIST_SIMD_EXT cpu, aarch64, sve1024\n      #endif\n    } // namespace nsimd\n  #endif\n\n#elif defined(NSIMD_SVE2048)\n\n  #define NSIMD_PLATFORM arm\n  #define NSIMD_SIMD sve2048\n  #include <arm_neon.h>\n  #include <arm_sve.h>\n\n  #if NSIMD_CXX > 0\n    namespace nsimd {\n      struct cpu {};\n      struct aarch64 {};\n      struct sve2048 {};\n      #if NSIMD_CXX >= 2020\n        template <typename T>\n        concept simd_ext_c = std::is_same_v<T, nsimd::cpu> ||\n                             std::is_same_v<T, nsimd::aarch64> ||\n                             std::is_same_v<T, nsimd::sve2048>;\n        #define NSIMD_LIST_SIMD_EXT cpu, aarch64, sve2048\n      #endif\n    } // namespace nsimd\n  #endif\n\n#elif defined(NSIMD_VMX)\n\n  #define NSIMD_PLATFORM ppc\n  #define NSIMD_SIMD vmx\n\n  #ifdef NSIMD_IS_CLANG\n    /* New version of clang are spamming useless warning comming from their */\n    /* altivec.h file */\n    #pragma clang diagnostic ignored \"-Wc11-extensions\"\n    #pragma clang diagnostic ignored \"-Wc++11-long-long\"\n  #endif\n\n  #include <altivec.h>\n\n  #ifdef bool\n    #undef bool\n  #endif\n  #ifdef pixel\n    #undef pixel\n  #endif\n  #ifdef vector\n    #undef vector\n  #endif\n\n  #if NSIMD_CXX > 0\n    namespace nsimd {\n      struct cpu {};\n      struct vmx {};\n      #if NSIMD_CXX >= 2020\n        template <typename T>\n        concept simd_ext_c = std::is_same_v<T, nsimd::cpu> ||\n                             std::is_same_v<T, nsimd::vmx>;\n        #define NSIMD_LIST_SIMD_EXT cpu, vmx\n      #endif\n    } // namespace nsimd\n  #endif\n\n#elif defined(NSIMD_VSX)\n\n  #define NSIMD_PLATFORM ppc\n  #define NSIMD_SIMD vsx\n\n  #ifdef NSIMD_IS_CLANG\n    /* New version of clang are spamming useless warning comming from their */\n    /* altivec.h file */\n    #pragma clang diagnostic ignored \"-Wc11-extensions\"\n    #pragma clang diagnostic ignored \"-Wc++11-long-long\"\n  #endif\n\n  #include <altivec.h>\n\n  #ifdef bool\n   
 #undef bool\n  #endif\n  #ifdef pixel\n    #undef pixel\n  #endif\n  #ifdef vector\n    #undef vector\n  #endif\n\n  #if NSIMD_CXX > 0\n    namespace nsimd {\n      struct cpu {};\n      struct vmx {};\n      struct vsx {};\n      #if NSIMD_CXX >= 2020\n        template <typename T>\n        concept simd_ext_c = std::is_same_v<T, nsimd::cpu> ||\n                             std::is_same_v<T, nsimd::vmx> ||\n                             std::is_same_v<T, nsimd::vsx>;\n        #define NSIMD_LIST_SIMD_EXT cpu, vmx, vsx\n      #endif\n    } // namespace nsimd\n  #endif\n\n#else\n\n  #ifdef NSIMD_CUDA\n    #if defined(NSIMD_IS_GCC)\n      #pragma GCC diagnostic push\n      #pragma GCC diagnostic ignored \"-Wunused-function\"\n    #elif defined(NSIMD_IS_CLANG)\n      #pragma clang diagnostic push\n      #pragma clang diagnostic ignored \"-Wunused-function\"\n    #endif\n    #include <cuda_fp16.h>\n    #if defined(NSIMD_IS_GCC)\n      #pragma GCC diagnostic pop\n    #elif defined(NSIMD_IS_CLANG)\n      #pragma clang diagnostic pop\n    #endif\n  #endif\n\n  #ifdef NSIMD_ROCM\n    #include <hip/hip_fp16.h>\n    #include <hip/hip_runtime.h>\n  #endif\n\n  #if defined(NSIMD_ONEAPI) && NSIMD_CXX > 0\n    #include <CL/sycl.hpp>\n\n    extern \"C\" {\n\n    NSIMD_DLLSPEC void *nsimd_oneapi_default_queue();\n\n    } // extern \"C\"\n\n    namespace nsimd {\n    namespace oneapi {\n\n    NSIMD_INLINE sycl::queue &default_queue() {\n      return *(sycl::queue *)nsimd_oneapi_default_queue();\n    }\n\n    } // namespace oneapi\n    } // namespace nsimd\n  #endif\n\n  #define NSIMD_SIMD cpu\n  #define NSIMD_PLATFORM cpu\n\n  #ifdef NSIMD_IS_MSVC\n    #include <intrin.h>\n  #endif\n\n  #if NSIMD_CXX > 0\n    namespace nsimd {\n      struct cpu {};\n      #if NSIMD_CXX >= 2020\n        template <typename T>\n        concept simd_ext_c = std::is_same_v<T, nsimd::cpu>;\n        #define NSIMD_LIST_SIMD_EXT cpu\n      #endif\n    } // namespace nsimd\n  #endif\n\n#endif\n\n#if NSIMD_CXX >= 
2020\n  #define NSIMD_CONCEPT_SIMD_EXT nsimd::simd_ext_c\n#else\n  #define NSIMD_CONCEPT_SIMD_EXT typename\n#endif\n\n/* ------------------------------------------------------------------------- */\n/* For ARM SVE we need a special struct */\n\n#ifdef NSIMD_SVE\n  #define NSIMD_STRUCT __sizeless_struct\n#else\n  #define NSIMD_STRUCT struct\n#endif\n\n/* ------------------------------------------------------------------------- */\n/* Shorter typedefs for integers and their limits */\n\n#if NSIMD_CXX > 0\n  #include <climits>\n#else\n  #include <limits.h>\n#endif\n\n#if defined(NSIMD_ONEAPI)\n  typedef sycl::cl_char    i8;\n  typedef sycl::cl_uchar   u8;\n  typedef sycl::cl_short   i16;\n  typedef sycl::cl_ushort  u16;\n  typedef sycl::cl_int     i32;\n  typedef sycl::cl_uint    u32;\n  typedef sycl::cl_long    i64;\n  typedef sycl::cl_ulong   u64;\n#elif defined(NSIMD_IS_MSVC)\n  typedef unsigned __int8  u8;\n  typedef signed   __int8  i8;\n  typedef unsigned __int16 u16;\n  typedef signed   __int16 i16;\n  typedef unsigned __int32 u32;\n  typedef signed   __int32 i32;\n  typedef unsigned __int64 u64;\n  typedef signed   __int64 i64;\n#else\n  typedef unsigned char  u8;\n  typedef signed   char  i8;\n  typedef unsigned short u16;\n  typedef signed   short i16;\n  #ifdef __UINT32_TYPE__\n    typedef __UINT32_TYPE__ u32;\n  #else\n    #if defined(NSIMD_NEON128) && __ARM_ARCH <= 6\n      typedef unsigned long u32;\n    #else\n      typedef unsigned int  u32;\n    #endif\n  #endif\n  #ifdef __INT32_TYPE__\n    typedef __INT32_TYPE__  i32;\n  #else\n    #if defined(NSIMD_NEON128) && __ARM_ARCH <= 6\n      typedef signed long   i32;\n    #else\n      typedef signed int    i32;\n    #endif\n  #endif\n  #if defined(NSIMD_VMX) || defined(NSIMD_VSX)\n    typedef nsimd_ulonglong u64;\n    typedef nsimd_longlong  i64;\n  #elif NSIMD_WORD_SIZE == 64\n    #ifdef __UINT64_TYPE__\n      typedef nsimd_uint64_type u64;\n    #else\n      typedef unsigned long     u64;\n    #endif\n    
#ifdef __INT64_TYPE__\n      typedef nsimd_int64_type  i64;\n    #else\n      typedef signed long       i64;\n    #endif\n  #else\n    #if defined(NSIMD_IS_GCC) || defined(NSIMD_IS_CLANG)\n      typedef nsimd_ulonglong u64;\n      typedef nsimd_longlong i64;\n    #else\n      typedef unsigned long long u64;\n      typedef signed long long   i64;\n    #endif\n  #endif\n#endif\n\n#define NSIMD_U8_MIN ((u8)0)\n#define NSIMD_U8_MAX UCHAR_MAX\n#define NSIMD_I8_MIN SCHAR_MIN\n#define NSIMD_I8_MAX SCHAR_MAX\n#define NSIMD_U16_MIN ((u16)0)\n#define NSIMD_U16_MAX USHRT_MAX\n#define NSIMD_I16_MIN SHRT_MIN\n#define NSIMD_I16_MAX SHRT_MAX\n#define NSIMD_U32_MIN ((u32)0)\n#define NSIMD_U32_MAX UINT_MAX\n#define NSIMD_I32_MIN INT_MIN\n#define NSIMD_I32_MAX INT_MAX\n\n#ifdef NSIMD_IS_MSVC\n  #define NSIMD_U64_MIN ((u64)0)\n  #define NSIMD_U64_MAX ULLONG_MAX\n  #define NSIMD_I64_MIN LLONG_MIN\n  #define NSIMD_I64_MAX LLONG_MAX\n#else\n  #if NSIMD_WORD_SIZE == 64\n    #define NSIMD_U64_MIN ((u64)0)\n    #define NSIMD_U64_MAX ULONG_MAX\n    #define NSIMD_I64_MIN LONG_MIN\n    #define NSIMD_I64_MAX LONG_MAX\n  #else\n    #define NSIMD_U64_MIN ((u64)0)\n    #define NSIMD_U64_MAX (~((u64)0))\n    #define NSIMD_I64_MIN ((i64)1 << 63)\n    #define NSIMD_I64_MAX (~((i64)1 << 63))\n  #endif\n#endif\n\n/* ------------------------------------------------------------------------- */\n/* Shorter typedefs for floatting point types */\n\n#if ((defined(NSIMD_NEON128) || defined(NSIMD_AARCH64)) &&                    \\\n     defined(NSIMD_FP16)) || defined(NSIMD_SVE_FAMILY)\n  #define NSIMD_ARM_FP16\n#endif\n\n#ifdef NSIMD_ARM_FP16\n  typedef __fp16 f16;\n  #define NSIMD_NATIVE_FP16\n#elif defined(NSIMD_CUDA) || defined(NSIMD_ROCM)\n  typedef __half f16;\n  #define NSIMD_NATIVE_FP16\n#elif defined(NSIMD_ONEAPI)\n  typedef sycl::half f16;\n  #define NSIMD_NATIVE_FP16\n#else\n  typedef struct { u16 u; } f16;\n#endif\n\n#if defined(NSIMD_ONEAPI)\n  typedef sycl::cl_float f32;\n  typedef 
sycl::cl_double f64;\n#else\n  typedef float  f32;\n  typedef double f64;\n#endif\n\n/* ------------------------------------------------------------------------- */\n/* Native register size (for now only 32 and 64 bits) types */\n\n#if NSIMD_WORD_SIZE == 64\n  typedef i64 nsimd_nat;\n#else\n  typedef i32 nsimd_nat;\n#endif\n\n#if NSIMD_CXX > 0\nnamespace nsimd {\ntypedef nsimd_nat nat;\n} // namespace nsimd\n#endif\n\n/* ------------------------------------------------------------------------- */\n/* C++ traits for base types */\n\n#if NSIMD_CXX > 0\n\nnamespace nsimd {\n\n// Some C++20 concepts first\n\n#if NSIMD_CXX >= 2020\n  template <typename T> concept simd_value_type_c =\n      std::is_same_v<T, u8> || std::is_same_v<T, i8> ||\n      std::is_same_v<T, u16> || std::is_same_v<T, i16> ||\n      std::is_same_v<T, u32> || std::is_same_v<T, i32> ||\n      std::is_same_v<T, u64> || std::is_same_v<T, i64> ||\n      std::is_same_v<T, f16> || std::is_same_v<T, f32> ||\n      std::is_same_v<T, f64>;\n  #define NSIMD_CONCEPT_VALUE_TYPE nsimd::simd_value_type_c\n\n  template <typename T> concept simd_value_type_or_bool_c =\n      simd_value_type_c<T> || std::is_same_v<T, bool>;\n  #define NSIMD_CONCEPT_VALUE_TYPE_OR_BOOL nsimd::simd_value_type_or_bool_c\n\n  // We need our own sizeof because of f16 which can be 4 bytes (i.e. 
a\n  // float) on systems where there is no support for native f16.\n  template <typename T> struct sizeof_t {\n    static const size_t value = sizeof(T);\n  };\n  template <> struct sizeof_t<f16> { static const size_t value = 2; };\n\n  template <typename T> const size_t sizeof_v = sizeof_t<T>::value;\n\n  #define NSIMD_REQUIRES(cond) requires(cond)\n#else\n  #define NSIMD_CONCEPT_VALUE_TYPE typename\n  #define NSIMD_CONCEPT_VALUE_TYPE_OR_BOOL typename\n  #define NSIMD_REQUIRES(cond)\n#endif\n\ntemplate <NSIMD_CONCEPT_VALUE_TYPE T> struct traits {};\n\n// 8-bits\n\ntemplate <> struct traits<i8> {\n  typedef i8 itype;\n  typedef u8 utype;\n};\n\ntemplate <> struct traits<u8> {\n  typedef i8 itype;\n  typedef u8 utype;\n};\n\n// 16-bits\n\ntemplate <> struct traits<i16> {\n  typedef i16 itype;\n  typedef u16 utype;\n  typedef f16 ftype;\n};\n\ntemplate <> struct traits<u16> {\n  typedef i16 itype;\n  typedef u16 utype;\n  typedef f16 ftype;\n};\n\ntemplate <> struct traits<f16> {\n  typedef i16 itype;\n  typedef u16 utype;\n  typedef f16 ftype;\n};\n\n// 32-bits\n\ntemplate <> struct traits<i32> {\n  typedef i32 itype;\n  typedef u32 utype;\n  typedef f32 ftype;\n};\n\ntemplate <> struct traits<u32> {\n  typedef i32 itype;\n  typedef u32 utype;\n  typedef f32 ftype;\n};\n\ntemplate <> struct traits<f32> {\n  typedef i32 itype;\n  typedef u32 utype;\n  typedef f32 ftype;\n};\n\n// 64-bits\n\ntemplate <> struct traits<i64> {\n  typedef i64 itype;\n  typedef u64 utype;\n  typedef f64 ftype;\n};\n\ntemplate <> struct traits<u64> {\n  typedef i64 itype;\n  typedef u64 utype;\n  typedef f64 ftype;\n};\n\ntemplate <> struct traits<f64> {\n  typedef i64 itype;\n  typedef u64 utype;\n  typedef f64 ftype;\n};\n\n} // namespace nsimd\n\n#endif\n\n/* ------------------------------------------------------------------------- */\n/* Set if denormalized float are set to 0                                    */\n\n#ifdef NSIMD_NEON128\n  #define NSIMD_DNZ_FLUSH_TO_ZERO\n#endif\n\n/* 
clang-format on */\n\n/* ------------------------------------------------------------------------- */\n/* POPCNT: GCC and Clang have intrinsics */\n\nNSIMD_INLINE int nsimd_popcnt32_(u32 a) {\n#if defined(NSIMD_IS_GCC) || defined(NSIMD_IS_CLANG)\n  return __builtin_popcount(a);\n#elif defined(NSIMD_IS_MSVC)\n  return (int)__popcnt(a);\n#else\n  int i, ret = 0;\n  for (i = 0; i < 32; i++) {\n    ret += (int)((a >> i) & 1);\n  }\n  return ret;\n#endif\n}\n\nNSIMD_INLINE int nsimd_popcnt64_(u64 a) {\n#if defined(NSIMD_IS_GCC) || defined(NSIMD_IS_CLANG)\n#if __SIZEOF_LONG__ == 4\n  return __builtin_popcountl((u32)(a & 0xFFFFFFFF)) +\n         __builtin_popcountl((u32)(a >> 32));\n#else\n  return __builtin_popcountl(a);\n#endif\n#elif defined(NSIMD_IS_MSVC)\n  #if NSIMD_WORD_SIZE == 64\n    return (int)__popcnt64(a);\n  #else\n    return (int)__popcnt((u32)(a & 0xFFFFFFFF)) +\n           (int)__popcnt((u32)(a >> 32));\n  #endif\n#else\n  int i, ret = 0;\n  for (i = 0; i < 64; i++) {\n    ret += (int)((a >> i) & 1);\n  }\n  return ret;\n#endif\n}\n\n/* ------------------------------------------------------------------------- */\n/* Macro to automatically include function depending on detected\n   platform/SIMD */\n\n#define NSIMD_AUTO_INCLUDE(path) <nsimd/NSIMD_PLATFORM/NSIMD_SIMD/path>\n\n/* ------------------------------------------------------------------------- */\n/* Standard includes */\n\n/* clang-format off */\n\n#if NSIMD_CXX > 0\n  #include <cerrno>\n  #include <cstdlib>\n#else\n  #include <errno.h>\n  #include <stdlib.h>\n#endif\n\n/* clang-format on */\n\n/* ------------------------------------------------------------------------- */\n/* Now includes detected SIMD types */\n\n#if NSIMD_CXX > 0\n\nnamespace nsimd {\ntemplate <NSIMD_CONCEPT_VALUE_TYPE T, NSIMD_CONCEPT_SIMD_EXT SimdExt>\nstruct simd_traits {};\n} // namespace nsimd\n\n// Those are for writing shorter code\n#define NSIMD_NSV(T, SIMD_EXT)                                                \\\n  
typename nsimd::simd_traits<T, SIMD_EXT>::simd_vector\n#define NSIMD_NSVX2(T, SIMD_EXT)                                              \\\n  typename nsimd::simd_traits<T, SIMD_EXT>::simd_vectorx2\n#define NSIMD_NSVX3(T, SIMD_EXT)                                              \\\n  typename nsimd::simd_traits<T, SIMD_EXT>::simd_vectorx3\n#define NSIMD_NSVX4(T, SIMD_EXT)                                              \\\n  typename nsimd::simd_traits<T, SIMD_EXT>::simd_vectorx4\n#define NSIMD_NSVL(L, SIMD_EXT)                                               \\\n  typename nsimd::simd_traits<L, SIMD_EXT>::simd_vectorl\n\n#endif\n\n#include NSIMD_AUTO_INCLUDE(types.h)\n\n/* ------------------------------------------------------------------------- */\n/* Macro/typedefs for SIMD infos */\n\n#define vec(T) NSIMD_PP_CAT_4(nsimd_, NSIMD_SIMD, _v, T)\n#define vecl(T) NSIMD_PP_CAT_4(nsimd_, NSIMD_SIMD, _vl, T)\n\n#define vecx2(T) NSIMD_PP_CAT_5(nsimd_, NSIMD_SIMD, _v, T, x2)\n#define vecx3(T) NSIMD_PP_CAT_5(nsimd_, NSIMD_SIMD, _v, T, x3)\n#define vecx4(T) NSIMD_PP_CAT_5(nsimd_, NSIMD_SIMD, _v, T, x4)\n\ntypedef vec(i8) vi8;\ntypedef vec(u8) vu8;\ntypedef vec(i16) vi16;\ntypedef vec(u16) vu16;\ntypedef vec(i32) vi32;\ntypedef vec(u32) vu32;\ntypedef vec(i64) vi64;\ntypedef vec(u64) vu64;\ntypedef vec(f16) vf16;\ntypedef vec(f32) vf32;\ntypedef vec(f64) vf64;\n\ntypedef vecx2(i8) vi8x2;\ntypedef vecx2(u8) vu8x2;\ntypedef vecx2(i16) vi16x2;\ntypedef vecx2(u16) vu16x2;\ntypedef vecx2(i32) vi32x2;\ntypedef vecx2(u32) vu32x2;\ntypedef vecx2(i64) vi64x2;\ntypedef vecx2(u64) vu64x2;\ntypedef vecx2(f16) vf16x2;\ntypedef vecx2(f32) vf32x2;\ntypedef vecx2(f64) vf64x2;\n\ntypedef vecx3(i8) vi8x3;\ntypedef vecx3(u8) vu8x3;\ntypedef vecx3(i16) vi16x3;\ntypedef vecx3(u16) vu16x3;\ntypedef vecx3(i32) vi32x3;\ntypedef vecx3(u32) vu32x3;\ntypedef vecx3(i64) vi64x3;\ntypedef vecx3(u64) vu64x3;\ntypedef vecx3(f16) vf16x3;\ntypedef vecx3(f32) vf32x3;\ntypedef vecx3(f64) vf64x3;\n\ntypedef vecx4(i8) 
vi8x4;\ntypedef vecx4(u8) vu8x4;\ntypedef vecx4(i16) vi16x4;\ntypedef vecx4(u16) vu16x4;\ntypedef vecx4(i32) vi32x4;\ntypedef vecx4(u32) vu32x4;\ntypedef vecx4(i64) vi64x4;\ntypedef vecx4(u64) vu64x4;\ntypedef vecx4(f16) vf16x4;\ntypedef vecx4(f32) vf32x4;\ntypedef vecx4(f64) vf64x4;\n\ntypedef vecl(i8) vli8;\ntypedef vecl(u8) vlu8;\ntypedef vecl(i16) vli16;\ntypedef vecl(u16) vlu16;\ntypedef vecl(i32) vli32;\ntypedef vecl(u32) vlu32;\ntypedef vecl(i64) vli64;\ntypedef vecl(u64) vlu64;\ntypedef vecl(f16) vlf16;\ntypedef vecl(f32) vlf32;\ntypedef vecl(f64) vlf64;\n\n#define vec_a(T, simd_ext) NSIMD_PP_CAT_4(nsimd_, simd_ext, _v, T)\n#define vecl_a(T, simd_ext) NSIMD_PP_CAT_4(nsimd_, simd_ext, _vl, T)\n\n#if NSIMD_CXX > 0\n\nnamespace nsimd {\n\n/* Alignment tags */\nstruct aligned {};\nstruct unaligned {};\n\n#if NSIMD_CXX >= 2020\ntemplate <typename T>\nconcept alignment_c = std::is_same_v<T, aligned> ||\n                      std::is_same_v<T, unaligned>;\n#define NSIMD_CONCEPT_ALIGNMENT nsimd::alignment_c\n#else\n#define NSIMD_CONCEPT_ALIGNMENT typename\n#endif\n\n#if NSIMD_CXX >= 2011\n\ntemplate <NSIMD_CONCEPT_VALUE_TYPE T>\nusing simd_vector = typename simd_traits<T, NSIMD_SIMD>::simd_vector;\n\ntemplate <NSIMD_CONCEPT_VALUE_TYPE T>\nusing simd_vectorl = typename simd_traits<T, NSIMD_SIMD>::simd_vectorl;\n\n#endif\n\n} // namespace nsimd\n\n#endif\n\n/* clang-format off */\n\n#if defined(NSIMD_X86)\n  #define NSIMD_MAX_ALIGNMENT 64\n#elif defined(NSIMD_ARM)\n  #define NSIMD_MAX_ALIGNMENT 256\n#elif defined(NSIMD_POWERPC)\n  #define NSIMD_MAX_ALIGNMENT 64\n#else\n  #define NSIMD_MAX_ALIGNMENT 16\n#endif\n\n/* TODO: provide C++14 alignment constpexxr */\n\n/* clang-format on */\n\n#define NSIMD_NB_REGISTERS NSIMD_PP_CAT_3(NSIMD_, NSIMD_SIMD, _NB_REGISTERS)\n\n#define NSIMD_MAX_LEN_BIT 2048\n\n#define NSIMD_MAX_LEN_i8 (NSIMD_MAX_LEN_BIT / 8)\n#define NSIMD_MAX_LEN_u8 (NSIMD_MAX_LEN_BIT / 8)\n#define NSIMD_MAX_LEN_i16 (NSIMD_MAX_LEN_BIT / 16)\n#define 
NSIMD_MAX_LEN_u16 (NSIMD_MAX_LEN_BIT / 16)\n#define NSIMD_MAX_LEN_f16 (NSIMD_MAX_LEN_BIT / 16)\n#define NSIMD_MAX_LEN_i32 (NSIMD_MAX_LEN_BIT / 32)\n#define NSIMD_MAX_LEN_u32 (NSIMD_MAX_LEN_BIT / 32)\n#define NSIMD_MAX_LEN_f32 (NSIMD_MAX_LEN_BIT / 32)\n#define NSIMD_MAX_LEN_i64 (NSIMD_MAX_LEN_BIT / 64)\n#define NSIMD_MAX_LEN_u64 (NSIMD_MAX_LEN_BIT / 64)\n#define NSIMD_MAX_LEN_f64 (NSIMD_MAX_LEN_BIT / 64)\n\n#define NSIMD_MAX_LEN_e(typ) NSIMD_MAX_LEN_##typ\n#define NSIMD_MAX_LEN(typ) NSIMD_MAX_LEN_e(typ)\n\n#if NSIMD_CXX > 0\nnamespace nsimd {\n\ntemplate <NSIMD_CONCEPT_VALUE_TYPE T> struct max_len_t {};\n\ntemplate <> struct max_len_t<i8> {\n  static const int value = NSIMD_MAX_LEN_BIT / 8;\n};\ntemplate <> struct max_len_t<u8> {\n  static const int value = NSIMD_MAX_LEN_BIT / 8;\n};\ntemplate <> struct max_len_t<i16> {\n  static const int value = NSIMD_MAX_LEN_BIT / 16;\n};\ntemplate <> struct max_len_t<u16> {\n  static const int value = NSIMD_MAX_LEN_BIT / 16;\n};\ntemplate <> struct max_len_t<f16> {\n  static const int value = NSIMD_MAX_LEN_BIT / 16;\n};\ntemplate <> struct max_len_t<i32> {\n  static const int value = NSIMD_MAX_LEN_BIT / 32;\n};\ntemplate <> struct max_len_t<u32> {\n  static const int value = NSIMD_MAX_LEN_BIT / 32;\n};\ntemplate <> struct max_len_t<f32> {\n  static const int value = NSIMD_MAX_LEN_BIT / 32;\n};\ntemplate <> struct max_len_t<i64> {\n  static const int value = NSIMD_MAX_LEN_BIT / 64;\n};\ntemplate <> struct max_len_t<u64> {\n  static const int value = NSIMD_MAX_LEN_BIT / 64;\n};\ntemplate <> struct max_len_t<f64> {\n  static const int value = NSIMD_MAX_LEN_BIT / 64;\n};\n\n#if NSIMD_CXX >= 2014\ntemplate <NSIMD_CONCEPT_VALUE_TYPE T>\nconstexpr int max_len = max_len_t<T>::value;\n#endif\n\n} // namespace nsimd\n#endif\n\n/* ------------------------------------------------------------------------- */\n/* Memory functions */\n\n/* clang-format off */\n\n#if NSIMD_CXX > 0\n  #include <cstddef>\n  #include <new>\n  #include 
<vector>\n#endif\n\n/* clang-format on */\n\n/* ------------------------------------------------------------------------- */\n\n#if NSIMD_CXX > 0\nextern \"C\" {\n#endif\n\nNSIMD_DLLSPEC void *nsimd_aligned_alloc(nsimd_nat);\nNSIMD_DLLSPEC void nsimd_aligned_free(void *);\n\n#if NSIMD_CXX > 0\n} // extern \"C\"\n#endif\n\n/* ------------------------------------------------------------------------- */\n/* C++ templated functions */\n\n#if NSIMD_CXX > 0\nnamespace nsimd {\n\nNSIMD_INLINE void *aligned_alloc(nsimd_nat n) {\n  return nsimd_aligned_alloc(n);\n}\n\nNSIMD_INLINE void aligned_free(void *ptr) {\n  nsimd_aligned_free(ptr);\n}\n\ntemplate <NSIMD_CONCEPT_VALUE_TYPE T> T *aligned_alloc_for(nsimd_nat n) {\n  return (T *)aligned_alloc(n * (nsimd_nat)sizeof(T));\n}\n\ntemplate <NSIMD_CONCEPT_VALUE_TYPE T> void aligned_free_for(void *ptr) {\n  return aligned_free((T *)ptr);\n}\n\n} // namespace nsimd\n#endif\n\n/* ------------------------------------------------------------------------- */\n/* C++ <11 allocator */\n\n#if NSIMD_CXX > 0 && NSIMD_CXX < 2011\nnamespace nsimd {\n\ntemplate <typename T> class allocator {\npublic:\n  typedef T value_type;\n  typedef value_type *pointer;\n  typedef const value_type *const_pointer;\n  typedef value_type &reference;\n  typedef const value_type &const_reference;\n  typedef std::size_t size_type;\n  typedef std::ptrdiff_t difference_type;\n\npublic:\n  template <typename U> struct rebind { typedef allocator<U> other; };\n\npublic:\n  allocator() {}\n  ~allocator() {}\n  allocator(allocator const &) {}\n\n  template <typename U> inline explicit allocator(allocator<U> const &) {}\n\n  pointer address(reference r) { return &r; }\n  const_pointer address(const_reference r) { return &r; }\n\n  pointer allocate(size_type n) {\n    return reinterpret_cast<pointer>(aligned_alloc_for<T>((nsimd_nat)n));\n  }\n\n  pointer allocate(size_type n, const void *) { return allocate(n); }\n\n  void deallocate(pointer p, size_type) { 
aligned_free_for<T>(p); }\n\n  size_type max_size() const { return size_type(-1) / sizeof(T); }\n\n  void construct(pointer p, const T &t) { new (p) T(t); }\n  void destroy(pointer p) { p->~T(); }\n\n  bool operator==(allocator const &) { return true; }\n  bool operator!=(allocator const &a) { return !operator==(a); }\n};\n\n} // namespace nsimd\n#endif\n\n/* ------------------------------------------------------------------------- */\n/* C++ >=11 allocator */\n\n#if NSIMD_CXX >= 2011\nnamespace nsimd {\n\ntemplate <NSIMD_CONCEPT_VALUE_TYPE T> struct allocator {\n  using value_type = T;\n\n  allocator() = default;\n\n  template <typename S> allocator(allocator<S> const &) {}\n\n  T *allocate(std::size_t n) {\n    if (n > std::size_t(-1) / sizeof(T)) {\n      throw std::bad_alloc();\n    }\n    T *ptr = aligned_alloc_for<T>((nsimd_nat)n);\n    if (ptr != NULL) {\n      return ptr;\n    }\n    throw std::bad_alloc();\n  }\n\n  void deallocate(T *ptr, std::size_t) { nsimd::aligned_free(ptr); }\n};\n\ntemplate <class T, class S>\nbool operator==(allocator<T> const &, allocator<S> const &) {\n  return true;\n}\n\ntemplate <class T, class S>\nbool operator!=(allocator<T> const &, allocator<S> const &) {\n  return false;\n}\n\n} // namespace nsimd\n#endif\n\n/* ------------------------------------------------------------------------- */\n/* scoped allocator */\n\n#if NSIMD_CXX > 0\nnamespace nsimd {\n\ntemplate <NSIMD_CONCEPT_VALUE_TYPE T> struct scoped_aligned_mem_for {\n  std::vector<T, nsimd::allocator<T> > data;\n\n  template <typename I>\n#if NSIMD_CXX >= 2020\n  requires std::integral<I>\n#endif\n  scoped_aligned_mem_for(I n) {\n    data.resize(size_t(n));\n  }\n\n  const T *get() const { return &data[0]; }\n  T *get() { return &data[0]; }\n};\n\n} // namespace nsimd\n#endif\n\n/* ------------------------------------------------------------------------- */\n/* Conversion functions f16 <---> f32 for C but only when compiling with a   */\n/* host compiler. 
Otherwise we must have C++ linkage as fp16 types are       */\n/* defined as C++ classes . */\n\n#if NSIMD_CXX > 0 && !defined(NSIMD_CUDA) && !defined(NSIMD_ROCM)\n  #define NSIMD_C_LINKAGE_FOR_F16\n#endif\n\n#ifdef NSIMD_C_LINKAGE_FOR_F16\nextern \"C\" {\n#endif\n\nNSIMD_DLLSPEC u16 nsimd_f32_to_u16(f32);\nNSIMD_DLLSPEC f32 nsimd_u16_to_f32(u16);\n\n#ifdef NSIMD_ARM_FP16\nNSIMD_INLINE f16 nsimd_f32_to_f16(f32 a) { return (f16)a; }\nNSIMD_INLINE f32 nsimd_f16_to_f32(f16 a) { return (f32)a; }\n#elif (defined(NSIMD_CUDA) && __CUDACC_VER_MAJOR__ >= 10) ||                  \\\n    defined(NSIMD_ROCM)\ninline f16 nsimd_f32_to_f16(f32 a) { return __float2half(a); }\ninline f32 nsimd_f16_to_f32(f16 a) { return __half2float(a); }\n#elif defined(NSIMD_CUDA) && __CUDACC_VER_MAJOR__ < 10\ninline f16 nsimd_f32_to_f16(f32 a) {\n  u16 ret = nsimd_f32_to_u16(a);\n  return *(__half *)&ret;\n}\ninline f32 nsimd_f16_to_f32(f16 a) { return nsimd_u16_to_f32(*(u16 *)&a); }\n#elif defined(NSIMD_ONEAPI)\ninline f16 nsimd_f32_to_f16(f32 a) { return static_cast<sycl::half>(a); }\ninline f32 nsimd_f16_to_f32(f16 a) { return static_cast<float>(a); }\n#else\nNSIMD_DLLSPEC f16 nsimd_f32_to_f16(f32);\nNSIMD_DLLSPEC f32 nsimd_f16_to_f32(f16);\n#endif\n\n#ifdef NSIMD_C_LINKAGE_FOR_F16\n} // extern \"C\"\n#endif\n\n/* ------------------------------------------------------------------------- */\n/* Conversion functions f16 <---> f32 for C++ */\n\n#if NSIMD_CXX > 0\nnamespace nsimd {\nNSIMD_DLLSPEC u16 f32_to_u16(f32);\nNSIMD_DLLSPEC f32 u16_to_f32(u16);\n#ifdef NSIMD_ARM_FP16\nNSIMD_INLINE f16 f32_to_f16(f32 a) { return (f16)a; }\nNSIMD_INLINE f32 f16_to_f32(f16 a) { return (f32)a; }\n#else\nNSIMD_DLLSPEC f16 f32_to_f16(f32);\nNSIMD_DLLSPEC f32 f16_to_f32(f16);\n#endif\n} // namespace nsimd\n#endif\n\n/* ------------------------------------------------------------------------- */\n/* Helper to print scalar values, converts to bigger type */\n\nNSIMD_INLINE u64 nsimd_to_biggest_u8(u8 a) { return 
(u64)a; }\nNSIMD_INLINE u64 nsimd_to_biggest_u16(u16 a) { return (u64)a; }\nNSIMD_INLINE u64 nsimd_to_biggest_u32(u32 a) { return (u64)a; }\nNSIMD_INLINE u64 nsimd_to_biggest_u64(u64 a) { return a; }\nNSIMD_INLINE i64 nsimd_to_biggest_i8(i8 a) { return (i64)a; }\nNSIMD_INLINE i64 nsimd_to_biggest_i16(i16 a) { return (i64)a; }\nNSIMD_INLINE i64 nsimd_to_biggest_i32(i32 a) { return (i64)a; }\nNSIMD_INLINE i64 nsimd_to_biggest_i64(i64 a) { return a; }\nNSIMD_INLINE f64 nsimd_to_biggest_f16(f16 a) {\n  return (f64)nsimd_f16_to_f32(a);\n}\nNSIMD_INLINE f64 nsimd_to_biggest_f32(f32 a) { return (f64)a; }\nNSIMD_INLINE f64 nsimd_to_biggest_f64(f64 a) { return a; }\n\n#if NSIMD_CXX > 0\nnamespace nsimd {\nNSIMD_INLINE u64 to_biggest(u8 a) { return nsimd_to_biggest_u8(a); }\nNSIMD_INLINE u64 to_biggest(u16 a) { return nsimd_to_biggest_u16(a); }\nNSIMD_INLINE u64 to_biggest(u32 a) { return nsimd_to_biggest_u32(a); }\nNSIMD_INLINE u64 to_biggest(u64 a) { return nsimd_to_biggest_u64(a); }\nNSIMD_INLINE i64 to_biggest(i8 a) { return nsimd_to_biggest_i8(a); }\nNSIMD_INLINE i64 to_biggest(i16 a) { return nsimd_to_biggest_i16(a); }\nNSIMD_INLINE i64 to_biggest(i32 a) { return nsimd_to_biggest_i32(a); }\nNSIMD_INLINE i64 to_biggest(i64 a) { return nsimd_to_biggest_i64(a); }\nNSIMD_INLINE f64 to_biggest(f16 a) { return nsimd_to_biggest_f16(a); }\nNSIMD_INLINE f64 to_biggest(f32 a) { return nsimd_to_biggest_f32(a); }\nNSIMD_INLINE f64 to_biggest(f64 a) { return nsimd_to_biggest_f64(a); }\n} // namespace nsimd\n#endif\n\n/* ------------------------------------------------------------------------- */\n/* General conversion for C++ */\n\n#if NSIMD_CXX > 0\nnamespace nsimd {\n\ntemplate <NSIMD_CONCEPT_VALUE_TYPE T, NSIMD_CONCEPT_VALUE_TYPE S>\nstruct to_helper {\n  static T to(T, S value) { return (T)value; }\n};\n\ntemplate <> struct to_helper<f16, f16> {\n  static f16 to(f16, f16 value) { return value; }\n};\n\ntemplate <NSIMD_CONCEPT_VALUE_TYPE S> struct to_helper<f16, S> {\n  static 
f16 to(f16, S value) { return nsimd_f32_to_f16((f32)value); }\n};\n\ntemplate <NSIMD_CONCEPT_VALUE_TYPE T> struct to_helper<T, f16> {\n  static T to(T, f16 value) { return (T)nsimd_f16_to_f32(value); }\n};\n\ntemplate <NSIMD_CONCEPT_VALUE_TYPE T, NSIMD_CONCEPT_VALUE_TYPE S>\nT to(S value) {\n  return to_helper<T, S>::to(T(), value);\n}\n\n} // namespace nsimd\n#endif\n\n/* ------------------------------------------------------------------------- */\n/* SIMD-related functions */\n\n/* clang-format off */\n\n#if defined(NSIMD_IS_MSVC)\n  /* We do not want MSVC to warn us about unary minus on an unsigned type.\n     It is well defined in standards: unsigned arithmetic is done modulo\n     2^n. */\n  #pragma warning(push)\n  #pragma warning(disable : 4146)\n#elif defined(NSIMD_IS_CLANG) && NSIMD_CXX < 2011\n  /* When compiling with Clang with C++98 or C++03, some Intel intrinsics are\n     implemented as macros which contain long long but long long are not\n     standard. To get rid of a lot of warning we push the corresponding\n     warning here. */\n  #pragma clang diagnostic push\n  #pragma clang diagnostic ignored \"-Wc++11-long-long\"\n#elif defined(NSIMD_IS_GCC) && defined(NSIMD_SVE_FAMILY)\n  /* Using SVE intrinsics svundef_XXX() is supposed to silence the\n     -Wuninitialized warnings but it does not with GCC 10.0 up to GCC 10.2\n     so we silence the warning manually for now. */\n  #pragma GCC diagnostic push\n  #pragma GCC diagnostic ignored \"-Wuninitialized\"\n#elif defined(NSIMD_IS_GCC) && NSIMD_CXX > 0 && \\\n      (defined(NSIMD_VMX) || defined(NSIMD_VSX))\n  /* When compiling POWERPC intrinsics inside C++ code with GCC we get tons of\n     -Wunused-but-set-parameter. This is a GCC bug. For now we slience the\n     warnings here. 
*/\n  #pragma GCC diagnostic push\n  #pragma GCC diagnostic ignored \"-Wunused-but-set-parameter\"\n  #pragma GCC diagnostic ignored \"-Wunused-but-set-variable\"\n#endif\n\n#include <nsimd/functions.h>\n\n#if defined(NSIMD_IS_MSVC)\n  #pragma warning(pop)\n#elif defined(NSIMD_IS_CLANG) && NSIMD_CXX < 2011\n  #pragma clang diagnostic pop\n#elif defined(NSIMD_IS_GCC) && defined(NSIMD_SVE_FAMILY)\n  #pragma GCC diagnostic pop\n#elif defined(NSIMD_IS_GCC) && NSIMD_CXX > 0 && \\\n      (defined(NSIMD_VMX) || defined(NSIMD_VSX))\n  #pragma GCC diagnostic pop\n#endif\n\n/* clang-format on */\n\n/* ------------------------------------------------------------------------- */\n/* If_else cannot be auto-generated */\n\n#define vif_else(a0, a1, a2, typel, type)                                     \\\n  NSIMD_PP_CAT_4(nsimd_if_else1_, NSIMD_SIMD, _, type)                        \\\n  (NSIMD_PP_CAT_6(nsimd_vreinterpretl_, NSIMD_SIMD, _, type, _, typel)(a0),   \\\n   a1, a2)\n\n#define vif_else_e(a0, a1, a2, typel, type, simd_ext)                         \\\n  NSIMD_PP_CAT_4(nsimd_if_else1_, simd_ext, _, type)                          \\\n  (NSIMD_PP_CAT_6(nsimd_vreinterpretl_, simd_ext, _, type, _, typel)(a0), a1, \\\n   a2)\n\n#if NSIMD_CXX > 0\nnamespace nsimd {\n\ntemplate <NSIMD_CONCEPT_VALUE_TYPE L, NSIMD_CONCEPT_VALUE_TYPE T>\nNSIMD_REQUIRES(sizeof_v<L> == sizeof_v<T>)\nNSIMD_NSV(T, NSIMD_SIMD)\nif_else(NSIMD_NSVL(L, NSIMD_SIMD) a0, NSIMD_NSV(T, NSIMD_SIMD) a1,\n        NSIMD_NSV(T, NSIMD_SIMD) a2, L, T) {\n  return if_else1(reinterpretl(a0, L(), T(), NSIMD_SIMD()), a1, a2, T(),\n                  NSIMD_SIMD());\n}\n\ntemplate <NSIMD_CONCEPT_VALUE_TYPE L, NSIMD_CONCEPT_VALUE_TYPE T,\n          NSIMD_CONCEPT_SIMD_EXT SimdExt>\nNSIMD_REQUIRES(sizeof_v<L> == sizeof_v<T>)\nNSIMD_NSV(T, SimdExt)\nif_else(NSIMD_NSVL(L, SimdExt) a0, NSIMD_NSV(T, SimdExt) a1,\n        NSIMD_NSV(T, SimdExt) a2, L, T, SimdExt) {\n  return if_else1(reinterpretl(a0, L(), T(), SimdExt()), a1, a2, 
T(),\n                  SimdExt());\n}\n\n} // namespace nsimd\n#endif\n\n/* ------------------------------------------------------------------------- */\n/* Loads/stores can be parametrized/templated by the alignment */\n\n#define NSIMD_ALIGNED a\n#define NSIMD_UNALIGNED u\n\n#define vload(a0, type, alignment)                                            \\\n  NSIMD_PP_CAT_6(nsimd_load, alignment, _, NSIMD_SIMD, _, type)(a0)\n\n#define vload_e(a0, type, simd_ext, alignment)                                \\\n  NSIMD_PP_CAT_6(nsimd_load, alignment, _, simd_ext, _, type)(a0)\n\n#define vload2(a0, type, alignment)                                           \\\n  NSIMD_PP_CAT_6(nsimd_load2, alignment, _, NSIMD_SIMD, _, type)(a0)\n\n#define vload2_e(a0, type, simd_ext, alignment)                               \\\n  NSIMD_PP_CAT_6(nsimd_load2, alignment, _, simd_ext, _, type)(a0)\n\n#define vload3(a0, type, alignment)                                           \\\n  NSIMD_PP_CAT_6(nsimd_load3, alignment, _, NSIMD_SIMD, _, type)(a0)\n\n#define vload3_e(a0, type, simd_ext, alignment)                               \\\n  NSIMD_PP_CAT_6(nsimd_load3, alignment, _, simd_ext, _, type)(a0)\n\n#define vload4(a0, type, alignment)                                           \\\n  NSIMD_PP_CAT_6(nsimd_load4, alignment, _, NSIMD_SIMD, _, type)(a0)\n\n#define vload4_e(a0, type, simd_ext, alignment)                               \\\n  NSIMD_PP_CAT_6(nsimd_load4, alignment, _, simd_ext, _, type)(a0)\n\n#define vloadl(a0, type, alignment)                                           \\\n  NSIMD_PP_CAT_6(nsimd_loadl, alignment, _, NSIMD_SIMD, _, type)(a0)\n\n#define vloadl_e(a0, type, simd_ext, alignment)                               \\\n  NSIMD_PP_CAT_6(nsimd_loadl, alignment, _, simd_ext, _, type)(a0)\n\n#define vstore(a0, a1, type, alignment)                                       \\\n  NSIMD_PP_CAT_6(nsimd_store, alignment, _, NSIMD_SIMD, _, type)(a0, a1)\n\n#define vstore_e(a0, a1, type, 
simd_ext, alignment)                           \\\n  NSIMD_PP_CAT_6(nsimd_store, alignment, _, simd_ext, _, type)(a0, a1)\n\n#define vstore2(a0, a1, a2, type, alignment)                                  \\\n  NSIMD_PP_CAT_6(nsimd_store2, alignment, _, NSIMD_SIMD, _, type)(a0, a1, a2)\n\n#define vstore2_e(a0, a1, a2, type, simd_ext, alignment)                      \\\n  NSIMD_PP_CAT_6(nsimd_store2, alignment, _, simd_ext, _, type)(a0, a1, a2)\n\n#define vstore3(a0, a1, a2, a3, type, alignment)                              \\\n  NSIMD_PP_CAT_6(nsimd_store3, alignment, _, NSIMD_SIMD, _, type)             \\\n  (a0, a1, a2, a3)\n\n#define vstore3_e(a0, a1, a2, a3, type, simd_ext, alignment)                  \\\n  NSIMD_PP_CAT_6(nsimd_store3, alignment, _, simd_ext, _, type)(a0, a1, a2, a3)\n\n#define vstore4(a0, a1, a2, a3, a4, type, alignment)                          \\\n  NSIMD_PP_CAT_6(nsimd_store4, alignment, _, NSIMD_SIMD, _, type)             \\\n  (a0, a1, a2, a3, a4)\n\n#define vstore4_e(a0, a1, a2, a3, a4, type, simd_ext, alignment)              \\\n  NSIMD_PP_CAT_6(nsimd_store4, alignment, _, simd_ext, _, type)               \\\n  (a0, a1, a2, a3, a4)\n\n#define vstorel(a0, a1, type, alignment)                                      \\\n  NSIMD_PP_CAT_6(nsimd_storel, alignment, _, NSIMD_SIMD, _, type)(a0, a1)\n\n#define vstorel_e(a0, a1, type, simd_ext, alignment)                          \\\n  NSIMD_PP_CAT_6(nsimd_storel, alignment, _, simd_ext, _, type)(a0, a1)\n\n#if NSIMD_CXX > 0\nnamespace nsimd {\n\ntemplate <NSIMD_CONCEPT_VALUE_TYPE T>\nNSIMD_NSV(T, NSIMD_SIMD)\nload(const T *ptr, T, aligned) {\n  return loada(ptr, T(), NSIMD_SIMD());\n}\n\ntemplate <NSIMD_CONCEPT_VALUE_TYPE T>\nNSIMD_NSV(T, NSIMD_SIMD)\nload(const T *ptr, T, unaligned) {\n  return loadu(ptr, T(), NSIMD_SIMD());\n}\n\ntemplate <NSIMD_CONCEPT_VALUE_TYPE T, NSIMD_CONCEPT_SIMD_EXT SimdExt>\nNSIMD_NSV(T, SimdExt)\nload(const T *ptr, T, SimdExt, aligned) {\n  return loada(ptr, T(), 
SimdExt());\n}\n\ntemplate <NSIMD_CONCEPT_VALUE_TYPE T, NSIMD_CONCEPT_SIMD_EXT SimdExt>\nNSIMD_NSV(T, SimdExt)\nload(const T *ptr, T, SimdExt, unaligned) {\n  return loadu(ptr, T(), SimdExt());\n}\n\ntemplate <NSIMD_CONCEPT_VALUE_TYPE T>\nNSIMD_NSVX2(T, NSIMD_SIMD)\nload2(const T *ptr, T, aligned) {\n  return load2a(ptr, T(), NSIMD_SIMD());\n}\n\ntemplate <NSIMD_CONCEPT_VALUE_TYPE T>\nNSIMD_NSVX2(T, NSIMD_SIMD)\nload2(const T *ptr, T, unaligned) {\n  return load2u(ptr, T(), NSIMD_SIMD());\n}\n\ntemplate <NSIMD_CONCEPT_VALUE_TYPE T, NSIMD_CONCEPT_SIMD_EXT SimdExt>\nNSIMD_NSVX2(T, SimdExt)\nload2(const T *ptr, T, SimdExt, aligned) {\n  return load2a(ptr, T(), SimdExt());\n}\n\ntemplate <NSIMD_CONCEPT_VALUE_TYPE T, NSIMD_CONCEPT_SIMD_EXT SimdExt>\nNSIMD_NSVX2(T, SimdExt)\nload2(const T *ptr, T, SimdExt, unaligned) {\n  return load2u(ptr, T(), SimdExt());\n}\n\ntemplate <NSIMD_CONCEPT_VALUE_TYPE T>\nNSIMD_NSVX3(T, NSIMD_SIMD)\nload3(const T *ptr, T, aligned) {\n  return load3a(ptr, T(), NSIMD_SIMD());\n}\n\ntemplate <NSIMD_CONCEPT_VALUE_TYPE T>\nNSIMD_NSVX3(T, NSIMD_SIMD)\nload3(const T *ptr, T, unaligned) {\n  return load3u(ptr, T(), NSIMD_SIMD());\n}\n\ntemplate <NSIMD_CONCEPT_VALUE_TYPE T, NSIMD_CONCEPT_SIMD_EXT SimdExt>\nNSIMD_NSVX3(T, SimdExt)\nload3(const T *ptr, T, SimdExt, aligned) {\n  return load3a(ptr, T(), SimdExt());\n}\n\ntemplate <NSIMD_CONCEPT_VALUE_TYPE T, NSIMD_CONCEPT_SIMD_EXT SimdExt>\nNSIMD_NSVX3(T, SimdExt)\nload3(const T *ptr, T, SimdExt, unaligned) {\n  return load3u(ptr, T(), SimdExt());\n}\n\ntemplate <NSIMD_CONCEPT_VALUE_TYPE T>\nNSIMD_NSVX4(T, NSIMD_SIMD)\nload4(const T *ptr, T, aligned) {\n  return load4a(ptr, T(), NSIMD_SIMD());\n}\n\ntemplate <NSIMD_CONCEPT_VALUE_TYPE T>\nNSIMD_NSVX4(T, NSIMD_SIMD)\nload4(const T *ptr, T, unaligned) {\n  return load4u(ptr, T(), NSIMD_SIMD());\n}\n\ntemplate <NSIMD_CONCEPT_VALUE_TYPE T, NSIMD_CONCEPT_SIMD_EXT SimdExt>\nNSIMD_NSVX4(T, SimdExt)\nload4(const T *ptr, T, SimdExt, aligned) {\n  return 
load4a(ptr, T(), SimdExt());\n}\n\ntemplate <NSIMD_CONCEPT_VALUE_TYPE T, NSIMD_CONCEPT_SIMD_EXT SimdExt>\nNSIMD_NSVX4(T, SimdExt)\nload4(const T *ptr, T, SimdExt, unaligned) {\n  return load4u(ptr, T(), SimdExt());\n}\n\ntemplate <NSIMD_CONCEPT_VALUE_TYPE T>\nNSIMD_NSVL(T, NSIMD_SIMD)\nloadl(const T *ptr, T, aligned) {\n  return loadla(ptr, T(), NSIMD_SIMD());\n}\n\ntemplate <NSIMD_CONCEPT_VALUE_TYPE T>\nNSIMD_NSVL(T, NSIMD_SIMD)\nloadl(const T *ptr, T, unaligned) {\n  return loadlu(ptr, T(), NSIMD_SIMD());\n}\n\ntemplate <NSIMD_CONCEPT_VALUE_TYPE T, NSIMD_CONCEPT_SIMD_EXT SimdExt>\nNSIMD_NSVL(T, SimdExt)\nloadl(const T *ptr, T, SimdExt, aligned) {\n  return loadla(ptr, T(), SimdExt());\n}\n\ntemplate <NSIMD_CONCEPT_VALUE_TYPE T, NSIMD_CONCEPT_SIMD_EXT SimdExt>\nNSIMD_NSVL(T, SimdExt)\nloadl(const T *ptr, T, SimdExt, unaligned) {\n  return loadlu(ptr, T(), SimdExt());\n}\n\ntemplate <NSIMD_CONCEPT_VALUE_TYPE T>\nvoid store(T *ptr, NSIMD_NSV(T, NSIMD_SIMD) a1, T, aligned) {\n  storea(ptr, a1, T(), NSIMD_SIMD());\n}\n\ntemplate <NSIMD_CONCEPT_VALUE_TYPE T>\nvoid store(T *ptr, NSIMD_NSV(T, NSIMD_SIMD) a1, T, unaligned) {\n  storeu(ptr, a1, T(), NSIMD_SIMD());\n}\n\ntemplate <NSIMD_CONCEPT_VALUE_TYPE T, NSIMD_CONCEPT_SIMD_EXT SimdExt>\nvoid store(T *ptr, NSIMD_NSV(T, SimdExt) a1, T, SimdExt, aligned) {\n  storea(ptr, a1, T(), SimdExt());\n}\n\ntemplate <NSIMD_CONCEPT_VALUE_TYPE T, NSIMD_CONCEPT_SIMD_EXT SimdExt>\nvoid store(T *ptr, NSIMD_NSV(T, SimdExt) a1, T, SimdExt, unaligned) {\n  storeu(ptr, a1, T(), SimdExt());\n}\n\ntemplate <NSIMD_CONCEPT_VALUE_TYPE T>\nvoid store2(T *ptr, NSIMD_NSV(T, NSIMD_SIMD) a1, NSIMD_NSV(T, NSIMD_SIMD) a2,\n            T, aligned) {\n  store2a(ptr, a1, a2, T(), NSIMD_SIMD());\n}\n\ntemplate <NSIMD_CONCEPT_VALUE_TYPE T>\nvoid store2(T *ptr, NSIMD_NSV(T, NSIMD_SIMD) a1, NSIMD_NSV(T, NSIMD_SIMD) a2,\n            T, unaligned) {\n  store2u(ptr, a1, a2, T(), NSIMD_SIMD());\n}\n\ntemplate <NSIMD_CONCEPT_VALUE_TYPE T, 
NSIMD_CONCEPT_SIMD_EXT SimdExt>\nvoid store2(T *ptr, NSIMD_NSV(T, SimdExt) a1, NSIMD_NSV(T, SimdExt) a2, T,\n            SimdExt, aligned) {\n  store2a(ptr, a1, a2, T(), SimdExt());\n}\n\ntemplate <NSIMD_CONCEPT_VALUE_TYPE T, NSIMD_CONCEPT_SIMD_EXT SimdExt>\nvoid store2(T *ptr, NSIMD_NSV(T, SimdExt) a1, NSIMD_NSV(T, SimdExt) a2, T,\n            SimdExt, unaligned) {\n  store2u(ptr, a1, a2, T(), SimdExt());\n}\n\ntemplate <NSIMD_CONCEPT_VALUE_TYPE T>\nvoid store3(T *ptr, NSIMD_NSV(T, NSIMD_SIMD) a1, NSIMD_NSV(T, NSIMD_SIMD) a2,\n            NSIMD_NSV(T, NSIMD_SIMD) a3, T, aligned) {\n  store3a(ptr, a1, a2, a3, T(), NSIMD_SIMD());\n}\n\ntemplate <NSIMD_CONCEPT_VALUE_TYPE T>\nvoid store3(T *ptr, NSIMD_NSV(T, NSIMD_SIMD) a1, NSIMD_NSV(T, NSIMD_SIMD) a2,\n            NSIMD_NSV(T, NSIMD_SIMD) a3, T, unaligned) {\n  store3u(ptr, a1, a2, a3, T(), NSIMD_SIMD());\n}\n\ntemplate <NSIMD_CONCEPT_VALUE_TYPE T, NSIMD_CONCEPT_SIMD_EXT SimdExt>\nvoid store3(T *ptr, NSIMD_NSV(T, SimdExt) a1, NSIMD_NSV(T, SimdExt) a2,\n            NSIMD_NSV(T, SimdExt) a3, T, SimdExt, aligned) {\n  store3a(ptr, a1, a2, a3, T(), SimdExt());\n}\n\ntemplate <NSIMD_CONCEPT_VALUE_TYPE T, NSIMD_CONCEPT_SIMD_EXT SimdExt>\nvoid store3(T *ptr, NSIMD_NSV(T, SimdExt) a1, NSIMD_NSV(T, SimdExt) a2,\n            NSIMD_NSV(T, SimdExt) a3, T, SimdExt, unaligned) {\n  store3u(ptr, a1, a2, a3, T(), SimdExt());\n}\n\ntemplate <NSIMD_CONCEPT_VALUE_TYPE T>\nvoid store4(T *ptr, NSIMD_NSV(T, NSIMD_SIMD) a1, NSIMD_NSV(T, NSIMD_SIMD) a2,\n            NSIMD_NSV(T, NSIMD_SIMD) a3, NSIMD_NSV(T, NSIMD_SIMD) a4, T,\n            aligned) {\n  store4a(ptr, a1, a2, a3, a4, T(), NSIMD_SIMD());\n}\n\ntemplate <NSIMD_CONCEPT_VALUE_TYPE T>\nvoid store4(T *ptr, NSIMD_NSV(T, NSIMD_SIMD) a1, NSIMD_NSV(T, NSIMD_SIMD) a2,\n            NSIMD_NSV(T, NSIMD_SIMD) a3, NSIMD_NSV(T, NSIMD_SIMD) a4, T,\n            unaligned) {\n  store4u(ptr, a1, a2, a3, a4, T(), NSIMD_SIMD());\n}\n\ntemplate <NSIMD_CONCEPT_VALUE_TYPE T, NSIMD_CONCEPT_SIMD_EXT 
SimdExt>\nvoid store4(T *ptr, NSIMD_NSV(T, SimdExt) a1, NSIMD_NSV(T, SimdExt) a2,\n            NSIMD_NSV(T, SimdExt) a3, NSIMD_NSV(T, SimdExt) a4, T, SimdExt,\n            aligned) {\n  store4a(ptr, a1, a2, a3, a4, T(), SimdExt());\n}\n\ntemplate <NSIMD_CONCEPT_VALUE_TYPE T, NSIMD_CONCEPT_SIMD_EXT SimdExt>\nvoid store4(T *ptr, NSIMD_NSV(T, SimdExt) a1, NSIMD_NSV(T, SimdExt) a2,\n            NSIMD_NSV(T, SimdExt) a3, NSIMD_NSV(T, SimdExt) a4, T, SimdExt,\n            unaligned) {\n  store4u(ptr, a1, a2, a3, a4, T(), SimdExt());\n}\n\ntemplate <NSIMD_CONCEPT_VALUE_TYPE T>\nvoid storel(T *ptr, NSIMD_NSV(T, NSIMD_SIMD) a1, T, aligned) {\n  storela(ptr, a1, T(), NSIMD_SIMD());\n}\n\ntemplate <NSIMD_CONCEPT_VALUE_TYPE T>\nvoid storel(T *ptr, NSIMD_NSV(T, NSIMD_SIMD) a1, T, unaligned) {\n  storelu(ptr, a1, T(), NSIMD_SIMD());\n}\n\ntemplate <NSIMD_CONCEPT_VALUE_TYPE T, NSIMD_CONCEPT_SIMD_EXT SimdExt>\nvoid storel(T *ptr, NSIMD_NSV(T, SimdExt) a1, T, SimdExt, aligned) {\n  storela(ptr, a1, T(), SimdExt());\n}\n\ntemplate <NSIMD_CONCEPT_VALUE_TYPE T, NSIMD_CONCEPT_SIMD_EXT SimdExt>\nvoid storel(T *ptr, NSIMD_NSV(T, SimdExt) a1, T, SimdExt, unaligned) {\n  storelu(ptr, a1, T(), SimdExt());\n}\n\n} // namespace nsimd\n#endif\n\n/* ------------------------------------------------------------------------- */\n/* Scalar utilisties */\n\n#include <nsimd/scalar_utilities.h>\n\n/* ------------------------------------------------------------------------- */\n/* Some undefs */\n\n#if NSIMD_CXX > 0\n#undef NSIMD_NSV\n#undef NSIMD_NSVX2\n#undef NSIMD_NSVX3\n#undef NSIMD_NSVX4\n#undef NSIMD_NSVL\n#endif\n\n/* ------------------------------------------------------------------------- */\n/* isnan, isnormal and isinf functions */\n\nNSIMD_INLINE int nsimd_isnan_f16(f16 a) {\n  /* We assume IEEE representation for f16's */\n  u16 b = nsimd_scalar_reinterpret_u16_f16(a);\n  if ((((((u32)b) >> 10) & 0x1F) == 0x1F) && ((((u32)b) << 6) != 0u)) {\n    return 1;\n  } else {\n    return 0;\n  
}\n}\n\nNSIMD_INLINE int nsimd_isnan_f32(f32 a) {\n  /* We assume IEEE representation for f32's */\n  u32 b = nsimd_scalar_reinterpret_u32_f32(a);\n  if ((((b >> 23) & 0xFF) == 0xFF) && ((b << 9) != 0u)) {\n    return 1;\n  } else {\n    return 0;\n  }\n}\n\nNSIMD_INLINE int nsimd_isnan_f64(f64 a) {\n  /* We assume IEEE representation for f64's */\n  u64 b = nsimd_scalar_reinterpret_u64_f64(a);\n  if ((((b >> 52) & 0x7FF) == 0x7FF) && ((b << 12) != 0u)) {\n    return 1;\n  } else {\n    return 0;\n  }\n}\n\nNSIMD_INLINE int nsimd_isinf_f16(f16 a) {\n  /* We assume IEEE representation for f16's */\n  u16 b = nsimd_scalar_reinterpret_u16_f16(a);\n  if ((((((u32)b) >> 10) & 0x1F) == 0x1F) && ((((u32)b) << 6) == 0u)) {\n    return 1;\n  } else {\n    return 0;\n  }\n}\n\nNSIMD_INLINE int nsimd_isinf_f32(f32 a) {\n  /* We assume IEEE representation for f32's */\n  u32 b = nsimd_scalar_reinterpret_u32_f32(a);\n  if ((((b >> 23) & 0xFF) == 0xFF) && ((b << 9) == 0u)) {\n    return 1;\n  } else {\n    return 0;\n  }\n}\n\nNSIMD_INLINE int nsimd_isinf_f64(f64 a) {\n  /* We assume IEEE representation for f64's */\n  u64 b = nsimd_scalar_reinterpret_u64_f64(a);\n  if ((((b >> 52) & 0x7FF) == 0x7FF) && ((b << 12) == 0u)) {\n    return 1;\n  } else {\n    return 0;\n  }\n}\n\nNSIMD_INLINE int nsimd_isnormal_f16(f16 a) {\n  /* We assume IEEE representation for f16's */\n  u16 b = nsimd_scalar_reinterpret_u16_f16(a);\n  if (!((((((u32)b) >> 10) & 0x1F) == 0u) && ((((u32)b) << 6) != 0u))) {\n    return 1;\n  } else {\n    return 0;\n  }\n}\n\nNSIMD_INLINE int nsimd_isnormal_f32(f32 a) {\n  /* We assume IEEE representation for f32's */\n  u32 b = nsimd_scalar_reinterpret_u32_f32(a);\n  if (!((((b >> 23) & 0xFF) == 0u) && ((b << 9) != 0u))) {\n    return 1;\n  } else {\n    return 0;\n  }\n}\n\nNSIMD_INLINE int nsimd_isnormal_f64(f64 a) {\n  /* We assume IEEE representation for f64's */\n  u64 b = nsimd_scalar_reinterpret_u64_f64(a);\n  if (!((((b >> 52) & 0x7FF) == 0u) && ((b << 12) != 
0u))) {\n    return 1;\n  } else {\n    return 0;\n  }\n}\n\n#if NSIMD_CXX > 0\nnamespace nsimd {\nNSIMD_INLINE int isnan(f16 a) { return nsimd_isnan_f16(a); }\nNSIMD_INLINE int isnan(f32 a) { return nsimd_isnan_f32(a); }\nNSIMD_INLINE int isnan(f64 a) { return nsimd_isnan_f64(a); }\nNSIMD_INLINE int isinf(f16 a) { return nsimd_isinf_f16(a); }\nNSIMD_INLINE int isinf(f32 a) { return nsimd_isinf_f32(a); }\nNSIMD_INLINE int isinf(f64 a) { return nsimd_isinf_f64(a); }\nNSIMD_INLINE int isnormal(f16 a) { return nsimd_isnormal_f16(a); }\nNSIMD_INLINE int isnormal(f32 a) { return nsimd_isnormal_f32(a); }\nNSIMD_INLINE int isnormal(f64 a) { return nsimd_isnormal_f64(a); }\n} // namespace nsimd\n#endif\n\n/* ------------------------------------------------------------------------- */\n/* Difference in log UFP, returns an nat, see documentation for more infos   */\n\n#if NSIMD_CXX > 0\nextern \"C\" {\n#endif\n\nNSIMD_DLLSPEC int nsimd_ufp_f16(f16, f16);\nNSIMD_DLLSPEC int nsimd_ufp_f32(f32, f32);\nNSIMD_DLLSPEC int nsimd_ufp_f64(f64, f64);\n\n#if NSIMD_CXX > 0\n} // extern \"C\"\n#endif\n\n#if NSIMD_CXX > 0\nnamespace nsimd {\nNSIMD_INLINE int ufp(f16 a, f16 b) { return nsimd_ufp_f16(a, b); }\nNSIMD_INLINE int ufp(f32 a, f32 b) { return nsimd_ufp_f32(a, b); }\nNSIMD_INLINE int ufp(f64 a, f64 b) { return nsimd_ufp_f64(a, b); }\n} // namespace nsimd\n#endif\n\n/* ------------------------------------------------------------------------- */\n/* Get last kernel parameter */\n\n#if NSIMD_CXX > 0\nextern \"C\" {\n#endif\n\nNSIMD_DLLSPEC nsimd_nat nsimd_kernel_param(nsimd_nat, nsimd_nat);\n\n#if NSIMD_CXX > 0\n} // extern \"C\"\n#endif\n\n/* ------------------------------------------------------------------------- */\n\n#endif\n"
  },
  {
    "path": "scripts/FindNSIMD.cmake",
    "content": "# MIT License\n#\n# Copyright (c) 2020 Agenium Scale\n#\n# Permission is hereby granted, free of charge, to any person obtaining a copy\n# of this software and associated documentation files (the \"Software\"), to deal\n# in the Software without restriction, including without limitation the rights\n# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell\n# copies of the Software, and to permit persons to whom the Software is\n# furnished to do so, subject to the following conditions:\n#\n# The above copyright notice and this permission notice shall be included in all\n# copies or substantial portions of the Software.\n#\n# THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\n# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\n# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\n# SOFTWARE.\n#\n#.rst:\n# FindNSIMD\n# ---------\n#\n# Find the NSIMD library, Agenium Scale's vectorization library.\n#\n# Result variables\n# ^^^^^^^^^^^^^^^^\n#\n# This module will set the following variables in your project:\n#\n# ``NSIMD_INCLUDE_DIRS``\n#   where to find nsimd.h, etc.\n# ``NSIMD_LIBRARY_DIRS``\n#   where to find the library to link against to use NSIMD.\n# ``NSIMD_LIBRARIES``\n#   the library to link against to use NSIMD.\n# ``NSIMD_FOUND``\n#   If false, do not try to use NSIMD.\n\nif (NOT NSIMD_FOUND AND NOT DEFINED NSIMD_LIBRARIES)\n  list(LENGTH NSIMD_FIND_COMPONENTS l)\n  if (\"${l}\" STREQUAL \"0\")\n    find_library(NSIMD_LIBRARIES NAMES nsimd_cpu\n                                       nsimd_sse2\n                                       nsimd_sse42\n                                       nsimd_avx\n                         
              nsimd_avx2\n                                       nsimd_avx512_knl\n                                       nsimd_avx512_skylake\n                                       nsimd_neon128\n                                       nsimd_aarch64\n                                       nsimd_sve\n                                       nsimd_sve128\n                                       nsimd_sve256\n                                       nsimd_sve512\n                                       nsimd_sve1024\n                                       nsimd_sve2048\n                                       nsimd_cuda\n                                       nsimd_rocm)\n  elseif(\"${l}\" STREQUAL \"1\")\n    list(GET NSIMD_FIND_COMPONENTS 0 simd_ext)\n    find_library(NSIMD_LIBRARIES NAMES nsimd_${simd_ext})\n  else()\n    if (NOT NSIMD_FIND_QUIETLY)\n      message(FATAL_ERROR \"cannot handle several components\")\n    endif()\n  endif()\nendif()\n\nif (NOT NSIMD_FOUND AND NOT DEFINED NSIMD_INCLUDE_DIRS)\n  find_path(NSIMD_INCLUDE_DIRS NAMES nsimd/nsimd.h)\nendif()\n\nif (NOT \"${NSIMD_INCLUDE_DIRS}\" STREQUAL \"NSIMD_INCLUDE_DIRS-NOTFOUND\" AND\n    NOT \"${NSIMD_LIBRARIES}\" STREQUAL \"NSIMD_LIBRARIES-NOTFOUND\")\n  get_filename_component(NSIMD_LIBRARY_DIRS ${NSIMD_LIBRARIES} DIRECTORY)\n  if (NOT NSIMD_FIND_QUIETLY)\n    message(STATUS \"[include dir = ${NSIMD_INCLUDE_DIRS}]\"\n                   \" [library = ${NSIMD_LIBRARIES}]\")\n  endif()\n  set(NSIMD_FOUND TRUE)\nelse()\n  if (NOT NSIMD_FIND_QUIETLY)\n    if (NOT DEFINED NSIMD_INCLUDE_DIRS)\n      set(msg \"[cannot determine include dir]\")\n    else()\n      set(msg \"[include dir = ${NSIMD_INCLUDE_DIRS}]\")\n    endif()\n    if (NOT DEFINED NSIMD_LIBRARIES)\n      set(msg \"${msg} [cannot determine library dir]\")\n    else()\n      set(msg \"${msg} [library = ${NSIMD_LIBRARIES}]\")\n    endif()\n    if (NSIMD_FIND_REQUIRED)\n      message(FATAL_ERROR \"${msg}\")\n    else()\n      message(STATUS \"${msg}\")\n  
  endif()\n  endif()\n  set(NSIMD_FOUND FALSE)\nendif()\n\n"
  },
  {
    "path": "scripts/aarch64-linux-gnu-clang++.sh",
    "content": "#!/bin/bash\n\nclang++ --target=aarch64-linux-gnu \"$@\"\n"
  },
  {
    "path": "scripts/aarch64-linux-gnu-clang.sh",
    "content": "#!/bin/bash\n\nclang --target=aarch64-linux-gnu \"$@\"\n"
  },
  {
    "path": "scripts/build-tests.bat",
    "content": "@echo off\n\nREM Copyright (c) 2020 Agenium Scale\nREM\nREM Permission is hereby granted, free of charge, to any person obtaining a copy\nREM of this software and associated documentation files (the \"Software\"), to deal\nREM in the Software without restriction, including without limitation the rights\nREM to use, copy, modify, merge, publish, distribute, sublicense, and/or sell\nREM copies of the Software, and to permit persons to whom the Software is\nREM furnished to do so, subject to the following conditions:\nREM\nREM The above copyright notice and this permission notice shall be included in all\nREM copies or substantial portions of the Software.\nREM\nREM THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\nREM IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\nREM FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\nREM AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\nREM LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\nREM OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\nREM SOFTWARE.\n\nREM ###########################################################################\n\nsetlocal EnableDelayedExpansion\npushd \"%~dp0\"\n\nREM ###########################################################################\n\nset BUILD_BAT=\"%CD%\\build.bat\"\nset HATCH_PY=\"%CD%\\..\\egg\\hatch.py\"\nset NSCONFIG=\"%CD%\\..\\nstools\\bin\\nsconfig.exe\"\nset BUILD_ROOT=\"%CD%\\..\"\n\nREM ###########################################################################\nREM Run build.bat\n\ncall %BUILD_BAT% %*\nif errorlevel 1 goto end_nok\n\nREM ###########################################################################\nREM Generate NSIMD\n\npython %HATCH_PY% -tf\nif errorlevel 1 goto end_nok\n\nREM ###########################################################################\nREM Build tests (checks on command line 
arguments has been done by build.bat)\n\nset SIMD_EXTS_ARG=%2\nset SIMD_EXTS=%SIMD_EXTS_ARG:/=,%\nif \"%3\" == \"\" (\n  set COMPILER_ARG=msvc\n) else (\n  set COMPILER_ARG=%4\n)\nset COMPILERS=%COMPILER_ARG:/=,%\n\nfor %%g in (%COMPILERS%) do (\n  for %%h in (%SIMD_EXTS%) do (\n    set BUILD_DIR=%BUILD_ROOT%\\build-%%h-%%g\n    if exist !BUILD_DIR! rd /Q /S !BUILD_DIR!\n    md !BUILD_DIR!\n    pushd !BUILD_DIR!\n      %NSCONFIG% .. -Dsimd=%%h -suite=%%g\n      if exist %BUILD_ROOT%\\targets.txt (\n        set \"TS= \"\n        for /F %%k in ('type %BUILD_ROOT%\\targets.txt') do (\n          ninja -t targets all | findstr /R \"^tests\" | findstr /R \"%%k\" ^\n                   >_targets.txt\n          for /F %%l in ('type _targets.txt') do (\n            set TMP1=%%l\n            set T=!TMP1::=!\n            set TS=!TS! !T!\n          )\n        )\n      ) else (\n        set TS=tests\n      )\n      echo *** !TS!\n      ninja !TS!\n    popd\n  )\n)\n\nREM ###########################################################################\n\n:end_ok\npopd\nendlocal\nexit /B 0\n\n:end_nok\npopd\nendlocal\nexit /B 1\n"
  },
  {
    "path": "scripts/build-tests.sh",
    "content": "#!/bin/bash\n# Copyright (c) 2021 Agenium Scale\n#\n# Permission is hereby granted, free of charge, to any person obtaining a copy\n# of this software and associated documentation files (the \"Software\"), to deal\n# in the Software without restriction, including without limitation the rights\n# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell\n# copies of the Software, and to permit persons to whom the Software is\n# furnished to do so, subject to the following conditions:\n#\n# The above copyright notice and this permission notice shall be included in all\n# copies or substantial portions of the Software.\n#\n# THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\n# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\n# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\n# SOFTWARE.\n\n###############################################################################\n\ncd `dirname $0`\n#set -x\nset -e\n\n###############################################################################\n# Init\n\nBUILD_SH=\"${PWD}/build.sh\"\nHATCH_PY=\"${PWD}/../egg/hatch.py\"\nBUILD_ROOT=\"${PWD}/..\"\n\n###############################################################################\n# Generate NSIMD tests\n\npython3 --version 1>/dev/null 2>/dev/null && \\\n  python3 \"${HATCH_PY}\" -tf || \\\n  python \"${HATCH_PY}\" -tf\n\n###############################################################################\n# Run build.sh\n\nbash \"${BUILD_SH}\" \"$@\" || exit 1\n\n###############################################################################\n# Parse command line arguments (check has been done by build.sh)\n\nSIMD_EXTS=`echo \"${2}\" | sed -e 
's,/, ,g'`\n\nif [ \"${3}\" == \"\" ]; then\n  COMPILER_ARG=\"gcc\"\nelse\n  COMPILER_ARG=\"${4}\"\nfi\nCOMPILERS=`echo ${COMPILER_ARG} | sed 's,/, ,g'`\n\n###############################################################################\n# Build tests\n\nfor compiler in ${COMPILERS}; do\n  for simd_ext in ${SIMD_EXTS}; do\n    BUILD_DIR=\"${BUILD_ROOT}/build-${simd_ext}-${compiler}\"\n    if [ -e \"${BUILD_ROOT}/targets.txt\" ]; then\n      GLOBS=`cat ${BUILD_ROOT}/targets.txt | tr '\\n' '|' | sed 's/|$//g'`\n      TARGETS=`(cd ${BUILD_DIR} && ninja -t targets all | grep -E '^tests.') \\\n               | sed 's/:.*//g' | grep -E \"(${GLOBS})\" | tr '\\n' ' '`\n    else\n      TARGETS=\"tests\"\n    fi\n    (cd \"${BUILD_DIR}\" && ninja ${TARGETS})\n  done\ndone\n"
  },
  {
    "path": "scripts/build.bat",
    "content": "@echo off\n\nREM Copyright (c) 2020 Agenium Scale\nREM\nREM Permission is hereby granted, free of charge, to any person obtaining a copy\nREM of this software and associated documentation files (the \"Software\"), to deal\nREM in the Software without restriction, including without limitation the rights\nREM to use, copy, modify, merge, publish, distribute, sublicense, and/or sell\nREM copies of the Software, and to permit persons to whom the Software is\nREM furnished to do so, subject to the following conditions:\nREM\nREM The above copyright notice and this permission notice shall be included in all\nREM copies or substantial portions of the Software.\nREM\nREM THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\nREM IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\nREM FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\nREM AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\nREM LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\nREM OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\nREM SOFTWARE.\n\nREM ###########################################################################\n\nsetlocal EnableDelayedExpansion\npushd \"%~dp0\"\n\nREM ###########################################################################\nREM Init\n\nset SETUP_BAT=\"%CD%\\setup.bat\"\nset NSCONFIG=\"%CD%\\..\\nstools\\bin\\nsconfig.exe\"\nset HATCH_PY=\"%CD%\\..\\egg\\hatch.py\"\nset BUILD_ROOT=\"%CD%\\..\"\n\nREM ###########################################################################\nREM Run setup\n\ncall %SETUP_BAT%\nif errorlevel 1 goto end_nok\n\nREM ###########################################################################\nREM Generate NSIMD\n\npython %HATCH_PY% -lf\nif errorlevel 1 goto end_nok\n\nREM ###########################################################################\nREM Check/parse command line 
arguments\n\nif \"%1\" == \"\" (\n  echo %0: usage: %0 for simd_ext1/.../simd_ext2 [with compiler1/.../compiler2]\n  goto end_nok\n)\n\nif not \"%1\" == \"for\" (\n  echo ERROR: expected 'for' as first argument\n  goto end_nok\n)\n\nif \"%2\" == \"\" (\n  echo \"ERROR: no SIMD extension given\"\n  goto end_nok\n)\n\nset SIMD_EXTS_ARG=%2\nset SIMD_EXTS=%SIMD_EXTS_ARG:/=,%\n\nif \"%3\" == \"\" (\n  set COMPILER_ARG=msvc\n) else ( if \"%3\" == \"with\" (\n  if \"%4\" == \"\" (\n    echo \"ERROR: no compiler given after with\"\n    goto end_nok\n  )\n  set COMPILER_ARG=%4\n) else (\n  echo ERROR: expected 'with' as fourth argument\n  goto end_nok\n) )\n\nset COMPILERS=%COMPILER_ARG:/=,%\n\nREM ###########################################################################\nREM Build NSIMD : one build directory per SIMD extension per compiler\n\nfor %%g in (%COMPILERS%) do (\n  for %%h in (%SIMD_EXTS%) do (\n    set BUILD_DIR=%BUILD_ROOT%\\build-%%h-%%g\n    if exist !BUILD_DIR! rd /Q /S !BUILD_DIR!\n    md !BUILD_DIR!\n    pushd !BUILD_DIR!\n      %NSCONFIG% .. -Dsimd=%%h -suite=%%g\n      ninja\n    popd\n  )\n)\n\nREM ###########################################################################\n\n:end_ok\npopd\nendlocal\nexit /B 0\n\n:end_nok\npopd\nendlocal\nexit /B 1\n"
  },
  {
    "path": "scripts/build.sh",
    "content": "#!/bin/bash\n# Copyright (c) 2021 Agenium Scale\n#\n# Permission is hereby granted, free of charge, to any person obtaining a copy\n# of this software and associated documentation files (the \"Software\"), to deal\n# in the Software without restriction, including without limitation the rights\n# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell\n# copies of the Software, and to permit persons to whom the Software is\n# furnished to do so, subject to the following conditions:\n#\n# The above copyright notice and this permission notice shall be included in all\n# copies or substantial portions of the Software.\n#\n# THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\n# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\n# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\n# SOFTWARE.\n\n###############################################################################\n\ncd `dirname $0`\nset -x\nset -e\n\n###############################################################################\n# Init\n\nSETUP_SH=\"${PWD}/setup.sh\"\nNSCONFIG=\"${PWD}/../nstools/nsconfig/nsconfig\"\nHATCH_PY=\"${PWD}/../egg/hatch.py\"\nBUILD_ROOT=\"${PWD}/..\"\n\n###############################################################################\n# Run setup\n\nbash \"${SETUP_SH}\"\n\n###############################################################################\n# Generate NSIMD\n\npython3 --version 1>/dev/null 2>/dev/null && \\\n  python3 \"${HATCH_PY}\" -lf || \\\n  python \"${HATCH_PY}\" -lf\n\n###############################################################################\n# Check/parse command line arguments\n\nif [ \"${1}\" == \"\" ]; then\n  echo 
\"$0: usage: $0 for simd_ext1/.../simd_ext2 [with compiler1/.../compiler2]\"\n  exit 0\nfi\n\nif [ \"${1}\" != \"for\" ]; then\n  echo \"ERROR: expected 'for' as first argument\"\n  exit 1\nfi\n\nif [ \"${2}\" == \"\" ]; then\n  echo \"ERROR: no SIMD extension given after 'for'\"\n  exit 1\nfi\nSIMD_EXTS=`echo \"${2}\" | sed -e 's,/, ,g'`\n\nif [ \"${3}\" == \"\" ]; then\n  COMPILER_ARG=\"gcc\"\nelif [ \"${3}\" == \"with\" ]; then\n  if [ \"${4}\" == \"\" ]; then\n    echo \"ERROR: no compiler given after 'with'\"\n    exit 1\n  fi\n  COMPILER_ARG=\"${4}\"\nelse\n  echo \"ERROR: expected 'with' as fourth argument\"\n  exit 1\nfi\nCOMPILERS=`echo ${COMPILER_ARG} | sed 's,/, ,g'`\n\n###############################################################################\n# Build NSIMD : one build directory per SIMD extension per compiler\n\nfor compiler in ${COMPILERS}; do\n  for simd_ext in ${SIMD_EXTS}; do\n    BUILD_DIR=\"${BUILD_ROOT}/build-${simd_ext}-${compiler}\"\n    rm -rf \"${BUILD_DIR}\"\n    mkdir -p \"${BUILD_DIR}\"\n    (cd \"${BUILD_DIR}\" && \\\n        \"${NSCONFIG}\" .. -Dsimd=${simd_ext} -suite=${compiler})\n    (cd \"${BUILD_DIR}\" && ninja)\n  done\ndone\n"
  },
  {
    "path": "scripts/ci-clang.txt",
    "content": "camelot.numscale.com (sse2-sse42-clang)\n- bash scripts/build-tests.sh for sse2/sse42 with clang\n- cd build-sse2-clang\n- ../nstools/bin/nstest -j80\n- cd ../build-sse42-clang\n- ../nstools/bin/nstest -j80\n\ngaunes.numscale.com (avx-avx2-clang)\n- bash scripts/build-tests.sh for avx/avx2 with clang\n- cd build-avx-clang\n- ../nstools/bin/nstest -j80\n- cd ../build-avx2-clang\n- ../nstools/bin/nstest -j80\n\ncaradigan.numscale.com (aarch64-clang-1)\n- bash scripts/setup.sh\n- python3 egg/hatch.py -ltf\n- mkdir build-aarch64-clang\n- cd build-aarch64-clang\n- ../nstools/bin/nsconfig .. -Dsimd=aarch64 -comp=clang\n- ninja tests.c99 tests.cpp98 tests.cpp11\n- ../nstools/bin/nstest -j80\n\ncarahes.numscale.com (aarch64-clang-2)\n- bash scripts/setup.sh\n- python3 egg/hatch.py -ltf\n- mkdir build-aarch64-clang\n- cd build-aarch64-clang\n- ../nstools/bin/nsconfig .. -Dsimd=aarch64 -comp=clang\n- ninja tests.c99 tests.cpp98 tests.cpp11\n- ../nstools/bin/nstest -j80\n\ncamlann.numscale.com (aarch64-clang-3)\n- bash scripts/setup.sh\n- python3 egg/hatch.py -ltf\n- mkdir build-aarch64-clang\n- cd build-aarch64-clang\n- ../nstools/bin/nsconfig .. -Dsimd=aarch64 -comp=clang\n- ninja tests.c99 tests.cpp98 tests.cpp11\n- ../nstools/bin/nstest -j80\n\n"
  },
  {
    "path": "scripts/ci-scale.txt",
    "content": "camelot.hpc.scale <sse2-sse42-gcc> {/home/gquintin}\n- mkdir cmake-build-sse2\n- cd cmake-build-sse2\n- cmake .. -Dsimd=sse2\n- make -j10\n- cd ..\n- mkdir cmake-build-sse42\n- cd cmake-build-sse42\n- cmake .. -Dsimd=sse42\n- make -j10\n- cd ..\n- bash scripts/build-tests.sh for sse2/sse42 with gcc\n- cd build-sse2-gcc\n- ../nstools/bin/nstest -j80\n- cd ../build-sse42-gcc\n- ../nstools/bin/nstest -j80\n\nglastonbury.hpc.scale <avx512_skylake-gcc> {/home/gquintin}\n- source /etc/profile.d/modules.sh\n- module load cmake/3.1.0\n- mkdir cmake-build-avx512_skylake\n- cd cmake-build-avx512_skylake\n- cmake .. -Dsimd=avx512_skylake\n- make -j10\n- cd ..\n- bash scripts/build-tests.sh for avx512_skylake with gcc\n- cd build-avx512_skylake-gcc\n- ../nstools/bin/nstest -j40\n\ncarduel.hpc.scale <avx512_knl-gcc> {/home/gquintin}\n- source /etc/profile.d/profile.sh\n- module load cmake/3.1.0\n- mkdir cmake-build-avx512_knl\n- cd cmake-build-avx512_knl\n- cmake .. -Dsimd=avx512_knl\n- make -j10\n- cd ..\n- bash scripts/build-tests.sh for avx512_knl with gcc\n- cd build-avx512_knl-gcc\n- ../nstools/bin/nstest -j80\n\ngaunes.hpc.scale <avx-avx2-armel-gcc> {/home/gquintin}\n- mkdir cmake-build-avx\n- cd cmake-build-avx\n- cmake .. -Dsimd=avx\n- make -j10\n- cd ..\n- mkdir cmake-build-avx2\n- cd cmake-build-avx2\n- cmake .. -Dsimd=avx2\n- make -j10\n- cd ..\n- bash scripts/build-tests.sh for avx/avx2 with gcc\n- cd build-avx-gcc\n- ../nstools/bin/nstest -j80\n- cd ../build-avx2-gcc\n- ../nstools/bin/nstest -j80\n- cd ..\n- mkdir cmake-build-armel\n- cd cmake-build-armel\n- cmake .. -Dsimd=neon128 -DCMAKE_CXX_COMPILER=arm-linux-gnueabi-gcc\n- make -j10\n- cd ..\n- mkdir build-neon128-gcc\n- cd build-neon128-gcc\n- ../nstools/bin/nsconfig .. 
-Dsimd=neon128 -comp=cc,gcc,arm-linux-gnueabi-gcc,5,armel -comp=c++,gcc,arm-linux-gnueabi-g++,5,armel\n- ninja tests\n- ../nstools/bin/nstest -j80 --prefix=\"qemu-arm\"\n\nlogres.hpc.scale <cpu-gcc-cuda-nvcc> {/home/gquintin}\n- mkdir cmake-build-cpu\n- cd cmake-build-cpu\n- cmake .. -Dsimd=cpu\n- make -j10\n- cd ..\n- bash scripts/build-tests.sh for cpu with gcc\n- cd build-cpu-gcc\n- ../nstools/bin/nstest -j80\n- export PATH=${PATH}:/usr/local/cuda/bin\n- export LD_LIBRARY_PATH=/usr/local/cuda/lib64\n- mkdir ../build-cuda-nvcc\n- cd ../build-cuda-nvcc\n- ../nstools/bin/nsconfig .. -Dsimd=cuda -Dcuda_arch_flags=-msm_75 -suite=cuda\n- ninja tests\n- ../nstools/bin/nstest -j20\n\nbowden.hpc.scale <rocm-cpp20-cmakefind> {/home/gquintin}\n- bash scripts/build-tests.sh for rocm with rocm\n- cd build-rocm-rocm\n- ../nstools/bin/nstest -j80\n- cd ..\n- mkdir build-cpp20\n- source /etc/profile.d/profile.sh\n- module load gcc/10.2.0\n- cd build-cpp20\n- ../nstools/bin/nsconfig .. -Dsimd=sse42 -suite=gcc\n- ninja tests.cpp20\n- ../nstools/bin/nstest -j80\n- cd ..\n- bash tests/FindNSIMD.cmake.sh\n\ncaradigan.hpc.scale <armhf-aarch64-gcc> {/home/gquintin}\n- mkdir cmake-build-aarch64\n- cd cmake-build-aarch64\n- cmake .. -Dsimd=aarch64\n- make -j10\n- cd ..\n- bash scripts/build-tests.sh for aarch64 with gcc\n- cd build-aarch64-gcc\n- ../nstools/bin/nstest -j80\n- cd ..\n- mkdir cmake-build-neon128\n- cd cmake-build-neon128\n- cmake .. -Dsimd=neon128 -DCMAKE_CXX_COMPILER=arm-linux-gnueabihf-gcc -DNSIMD_ARM32_IS_ARMEL=OFF\n- make -j10\n- cd ..\n- mkdir build-neon128-gcc\n- cd build-neon128-gcc\n- ../nstools/bin/nsconfig .. -Dsimd=neon128 -comp=cc,gcc,arm-linux-gnueabihf-gcc,5,armhf -comp=c++,gcc,arm-linux-gnueabihf-g++,5,armhf\n- ninja tests\n- ../nstools/bin/nstest -j80\n\ncarahes.hpc.scale <sve128-gcc> {/home/gquintin}\n- source /etc/profile.d/profile.sh\n- module load gcc/10.2.0\n- mkdir cmake-build-sve128\n- cd cmake-build-sve128\n- cmake .. 
-Dsimd=sve128\n- make -j10\n- cd ..\n- bash scripts/build-tests.sh for sve128 with gcc\n- cd build-sve128-gcc\n- module load qemu/4.2.0\n- ../nstools/bin/nstest -j80 --prefix=\"qemu-aarch64 -cpu max,sve-max-vq=1\"\n\nWIN.gorre2 <msvc15_32-avx2-msvc19_64> {/home/gquintin} [\"C:\\Program Files (x86)\\Microsoft Visual Studio\\2019\\Professional\\VC\\Auxiliary\\Build\\vcvars64.bat\"]\n- setlocal\n- call \"C:\\Program Files (x86)\\Microsoft Visual Studio 14.0\\VC\\vcvarsall.bat\" x86\n- set PATH=%PATH%;C:\\Program Files (x86)\\CMake\\bin\n- md cmake-build32-sse2\n- cd cmake-build32-sse2\n- cmake .. -Dsimd=sse2 -DCMAKE_CXX_COMPILER=cl -G \"NMake Makefiles\"\n- nmake\n- cd ..\n- md cmake-build32-sse42\n- cd cmake-build32-sse42\n- cmake .. -Dsimd=sse42 -DCMAKE_CXX_COMPILER=cl -G \"NMake Makefiles\"\n- nmake\n- cd ..\n- md cmake-build32-avx\n- cd cmake-build32-avx\n- cmake .. -Dsimd=avx -DCMAKE_CXX_COMPILER=cl -G \"NMake Makefiles\"\n- nmake\n- cd ..\n- md cmake-build32-avx2\n- cd cmake-build32-avx2\n- cmake .. -Dsimd=avx2 -DCMAKE_CXX_COMPILER=cl -G \"NMake Makefiles\"\n- nmake\n- cd ..\n- call scripts\\build for sse2/sse42/avx/avx2 with msvc\n- endlocal\n- setlocal\n- call \"C:\\Program Files (x86)\\Microsoft Visual Studio\\2019\\Professional\\VC\\Auxiliary\\Build\\vcvars64.bat\"\n- md cmake-build64-sse2\n- cd cmake-build64-sse2\n- cmake .. -Dsimd=sse2 -DCMAKE_CXX_COMPILER=cl -G \"NMake Makefiles\"\n- nmake\n- cd ..\n- md cmake-build64-sse42\n- cd cmake-build64-sse42\n- cmake .. -Dsimd=sse42 -DCMAKE_CXX_COMPILER=cl -G \"NMake Makefiles\"\n- nmake\n- cd ..\n- md cmake-build64-avx\n- cd cmake-build64-avx\n- cmake .. -Dsimd=avx -DCMAKE_CXX_COMPILER=cl -G \"NMake Makefiles\"\n- nmake\n- cd ..\n- md cmake-build64-avx2\n- cd cmake-build64-avx2\n- cmake .. 
-Dsimd=avx2 -DCMAKE_CXX_COMPILER=cl -G \"NMake Makefiles\"\n- nmake\n- cd ..\n- call scripts\\build-tests for avx2 with msvc\n- cd build-avx2-msvc\n- ..\\nstools\\bin\\nstest -j60\n- endlocal\n\ncouillere <aarch64-macos> {/Users/gquintin}\n- export PATH=${PATH}:/opt/homebrew/bin\n- python3 egg/hatch.py -ltf\n- bash scripts/setup.sh\n- mkdir build-aarch64-xcode\n- cd build-aarch64-xcode\n- ../nstools/bin/nsconfig .. -Dsimd=aarch64 -suite=llvm -Dmpfr=\"-I/opt/homebrew/include -L/opt/homebrew/lib -lmpfr\"\n- ninja\n- ninja tests\n- ../nstools/bin/nstest -j16\n\n"
  },
  {
    "path": "scripts/ci-test.txt",
    "content": "couillere <aarch64-macos> {/Users/gquintin}\n- export PATH=${PATH}:/opt/homebrew/bin\n- python3 egg/hatch.py -ltf\n- bash scripts/setup.sh\n- mkdir build-aarch64-xcode\n- cd build-aarch64-xcode\n- ../nstools/bin/nsconfig .. -Dsimd=aarch64 -suite=llvm -Dmpfr=\"-I/opt/homebrew/include -L/opt/homebrew/lib -lmpfr\"\n- ninja\n- ninja tests\n- ../nstools/bin/nstest -j16\n\n"
  },
  {
    "path": "scripts/ci.sh",
    "content": "#!/bin/sh\n# Copyright (c) 2021 Agenium Scale\n#\n# Permission is hereby granted, free of charge, to any person obtaining a copy\n# of this software and associated documentation files (the \"Software\"), to deal\n# in the Software without restriction, including without limitation the rights\n# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell\n# copies of the Software, and to permit persons to whom the Software is\n# furnished to do so, subject to the following conditions:\n#\n# The above copyright notice and this permission notice shall be included in all\n# copies or substantial portions of the Software.\n#\n# THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\n# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\n# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\n# SOFTWARE.\n\n###############################################################################\n# Argument parsing\n\nif [ \"$2\" == \"\" ]; then\n  echo \"ERROR: usage: $0 JOBS_FILE NSTOOLS_CHECKOUT_LAST_COMMIT\"\n  exit 1\nfi\n\nJOBS_FILE=\"`realpath $1`\"\nNSTOOLS_CHECKOUT_LAST_COMMIT=\"$2\"\n\ncd `dirname $0`\n#set -x\nset -e\n\n###############################################################################\n# Init\n\nSSH=\"ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null \\\n         -o LogLevel=error\"\nSCP=\"scp -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null \\\n         -o LogLevel=error\"\nGIT_URL=`git remote get-url origin`\nGIT_BRANCH=`git rev-parse --abbrev-ref HEAD`\nTMP_DIR=\"${PWD}/../_ci\"\nONE_LINER_C=\"${PWD}/../scripts/one-liner.c\"\nSSHJOB_C=\"${PWD}/../nstools/sshjob/sshjob.c\"\n\n# Empty tmp directory\nif [ 
-f \"${JOBS_FILE}\" ]; then\n  rm -rf \"${TMP_DIR}\"\n  mkdir -p \"${TMP_DIR}\"\nfi\n\n###############################################################################\n# Build jobs scripts\n\nif [ -f \"${JOBS_FILE}\" ]; then\n\n  CURRENT_JOB=\"\"\n  DESC=\"\"\n  REMOTE_HOST=\"Linux\"\n  \n  while read -r line; do\n  \n    # Empty lines\n    if [ \"`echo ${line} | sed 's/[ \\t]*//g'`\" == \"\" ]; then\n      continue\n    fi\n\n    # Comments\n    if [ \"`echo ${line} | cut -c 1`\" == \"#\" ]; then\n      continue\n    fi\n  \n    if [ \"`echo ${line} | cut -c 1`\" == \"-\" ]; then\n      echo \"`echo ${line} | cut -c 2- | sed 's/^  *//g'`\" >>\"${CURRENT_JOB}\"\n      echo >>\"${CURRENT_JOB}\"\n    else\n      ADDR=`echo ${line} | sed -e 's/<.*//g' -e 's/  *//g'`\n      DESC=`echo ${line} | sed -e 's/.*<//g' -e 's/>.*//g'`\n      REMOTE_DIR=`echo ${line} | sed -e 's/.*{//g' -e 's/}.*//g'`\n      EXTRA=`echo ${line} | sed -e 's/.*\\[//g' -e 's/].*//g'`\n      REMOTE_HOST=`echo ${ADDR} | head -c 4`\n      echo ${REMOTE_DIR} >\"${TMP_DIR}/${ADDR}--${DESC}.work.dir\"\n      if [ \"${REMOTE_HOST}\" == \"WIN.\" ]; then\n        CURRENT_JOB=\"${TMP_DIR}/${ADDR}--${DESC}.bat\" # <-- this must be before\n        ADDR=\"`echo ${ADDR} | tail -c +5`\"            # <-- this\n        REMOTE_HOST=\"Windows\"\n        cat >\"${CURRENT_JOB}\" <<-EOF\n\t@echo off\n\n\tsetlocal\n\tpushd \"%~dp0\"\n\t\n\tset NSTOOLS_CHECKOUT_LAST_COMMIT=\"${NSTOOLS_CHECKOUT_LAST_COMMIT}\"\n\n\tif exist ci-nsimd-${DESC} rd /Q /S ci-nsimd-${DESC}\n\tgit clone ${GIT_URL} ci-nsimd-${DESC}\n\tgit -C ci-nsimd-${DESC} checkout ${GIT_BRANCH}\n\n\tpushd ci-nsimd-${DESC}\n\n\tREM ----------------------------------------------------------------\n\tREM User commands from here\n\n\tEOF\n        # On Windows we need a native compiler. On Linux we have cc in the\n        # PATH but on Windows we have nothing. We need a MSVC but there is\n        # no easy way to find one. So we parse what is between [...] 
which\n        # contains the path to the vcvarsall.bat script to load the compiler\n        cat >\"${TMP_DIR}/${ADDR}--${DESC}-native-cl\" <<-EOF\n\t@echo off\n\t\n\tsetlocal\n\tcall ${EXTRA}\n\tcl %*\n\texit /B %ERRORLEVEL%\n\tEOF\n      else\n        CURRENT_JOB=\"${TMP_DIR}/${ADDR}--${DESC}.sh\"\n        REMOTE_HOST=\"Linux\"\n        cat >\"${CURRENT_JOB}\" <<-EOF\n\t#!/bin/sh\n\t\n\tcd \\`dirname \\$0\\`\n\tset -e\n\n        export NSTOOLS_CHECKOUT_LAST_COMMIT=\"${NSTOOLS_CHECKOUT_LAST_COMMIT}\"\n\n\trm -rf ci-nsimd-${DESC}\n\tgit clone ${GIT_URL} ci-nsimd-${DESC}\n\tgit -C ci-nsimd-${DESC} checkout ${GIT_BRANCH}\n\n\tcd ci-nsimd-${DESC}\n\n\t# ------------------------------------------------------------------\n\t# User commands from here\n\n\tEOF\n      fi\n    fi\n\n  done <\"${JOBS_FILE}\"\n\nfi\n\n###############################################################################\n# Launch jobs\n\nif [ -f \"${JOBS_FILE}\" ]; then\n\n  echo \"-- NSIMD CI\"\n  echo \"-- \"\n  echo \"-- Initialization:\"\n  \n  for job in `find ${TMP_DIR} -iregex '.*\\.\\(bat\\|sh\\)'`; do\n    ADDR=`basename ${job} | \\\n          sed -e 's/\\.sh$//g' -e 's/\\.bat$//g' -e 's/--.*//g'`\n    DESC=`basename ${job} | \\\n          sed -e 's/\\.sh$//g' -e 's/\\.bat$//g' -e 's/.*--//g'`\n    REMOTE_DIR=\"`cat ${TMP_DIR}/${ADDR}--${DESC}.work.dir`\"\n    W_REMOTE_DIR=\"`echo ${REMOTE_DIR} | tr / \\\\\\\\\\\\`\"\n    REMOTE_HOST=`echo ${ADDR} | head -c 4`\n    if [ \"${REMOTE_HOST}\" == \"WIN.\" ]; then\n      REMOTE_HOST=\"Windows\"\n      ADDR=\"`echo ${ADDR} | tail -c +5`\"\n    else\n      REMOTE_HOST=\"Linux\"\n    fi\n    echo \"-- Found new job: ${DESC}\"\n    echo \"--   Remote machine will be: ${ADDR}\"\n    if [ \"${REMOTE_HOST}\" == \"Windows\" ]; then\n      echo \"--   Working directory will be: ${W_REMOTE_DIR}\"\n      ${SSH} ${ADDR} if not exist ${W_REMOTE_DIR} md ${W_REMOTE_DIR}\n    else\n      echo \"--   Working directory will be: ${REMOTE_DIR}\"\n      ${SSH} 
${ADDR} mkdir -p ${REMOTE_DIR}\n    fi\n    echo \"--   Launching commands\"\n    if [ \"${REMOTE_HOST}\" == \"Windows\" ]; then\n      ${SCP} ${job} ${ADDR}:${W_REMOTE_DIR}\n      ${SCP} ${ONE_LINER_C} ${ADDR}:${W_REMOTE_DIR}\n      ${SCP} ${SSHJOB_C} ${ADDR}:${W_REMOTE_DIR}\n      ${SCP} ${TMP_DIR}/${ADDR}--${DESC}-native-cl \\\n             ${ADDR}:${W_REMOTE_DIR}\\\\native-cl.bat\n      ${SSH} ${ADDR} \"cd ${W_REMOTE_DIR} & \\\n                      native-cl /Ox /W3 /D_CRT_SECURE_NO_WARNINGS one-liner.c\"\n      ${SSH} ${ADDR} \"cd ${W_REMOTE_DIR} & \\\n                      native-cl /Ox /W3 /D_CRT_SECURE_NO_WARNINGS sshjob.c\"\n      ${SSH} ${ADDR} \"cd ${W_REMOTE_DIR} & \\\n                      sshjob run \\\"`basename ${job}` 2>&1 | \\\n                             one-liner ci-nsimd-${DESC}-output.txt \\\n                                       ci-nsimd-${DESC}-one-liner.txt\\\"\" \\\n             | sed 's/\\r//g' >${TMP_DIR}/ci-nsimd-${DESC}-pid.txt\n    else\n      ${SCP} ${job} ${ADDR}:${REMOTE_DIR}\n      ${SCP} ${ONE_LINER_C} ${ADDR}:${REMOTE_DIR}\n      ${SCP} ${SSHJOB_C} ${ADDR}:${REMOTE_DIR}\n      ${SSH} ${ADDR} \"cd ${REMOTE_DIR} && cc -O2 one-liner.c -o one-liner\"\n      ${SSH} ${ADDR} \"cd ${REMOTE_DIR} && cc -O2 sshjob.c -o sshjob\"\n      ${SSH} ${ADDR} \"cd ${REMOTE_DIR} && \\\n                      ./sshjob run \\\"bash `basename ${job}` 2>&1 | \\\n                               ./one-liner ci-nsimd-${DESC}-output.txt \\\n                                           ci-nsimd-${DESC}-one-liner.txt\\\"\" \\\n             >${TMP_DIR}/ci-nsimd-${DESC}-pid.txt\n    fi\n  done\n\n  sleep 2\n\nfi\n\n###############################################################################\n# Build associative arrays\n\nREMOTE_HOST_A=\"\"\nADDR_A=\"\"\nDESC_A=\"\"\nONE_LINER_A=\"\"\nKILL_COMMAND_A=\"\"\nLOG_A=\"\"\nN=0\n\nfor job in `find ${TMP_DIR} -iregex '.*\\.\\(bat\\|sh\\)'`; do\n  ADDR=`basename ${job} | \\\n        sed -e 's/\\.sh$//g' -e 
's/\\.bat$//g' -e 's/--.*//g'`\n  DESC=`basename ${job} | \\\n        sed -e 's/\\.sh$//g' -e 's/\\.bat$//g' -e 's/.*--//g'`\n  REMOTE_DIR=\"`cat ${TMP_DIR}/${ADDR}--${DESC}.work.dir`\"\n  W_REMOTE_DIR=\"`echo ${REMOTE_DIR} | tr / \\\\\\\\\\\\`\"\n  LOG=\"${REMOTE_DIR}/ci-nsimd-${DESC}-output.txt\"\n  REMOTE_HOST=\"`echo ${ADDR} | head -c 4`\"\n  PID=\"`sed -e 's/\\r//g' ${TMP_DIR}/ci-nsimd-${DESC}-pid.txt`\"\n  if [ \"${REMOTE_HOST}\" == \"WIN.\" ]; then\n    REMOTE_HOST=\"Windows\"\n    ADDR=\"`echo ${ADDR} | tail -c +5`\"\n    ONE_LINER=\"${W_REMOTE_DIR}\\\\ci-nsimd-${DESC}-one-liner.txt\"\n    KILL_COMMAND=\"${W_REMOTE_DIR}\\\\sshjob kill ${PID}\"\n  else\n    REMOTE_HOST=\"Linux\"\n    ONE_LINER=\"${REMOTE_DIR}/ci-nsimd-${DESC}-one-liner.txt\"\n    KILL_COMMAND=\"${REMOTE_DIR}/sshjob kill ${PID}\"\n  fi\n\n  ADDR_A=\"${ADDR_A}${ADDR}:\"\n  DESC_A=\"${DESC_A}${DESC}:\"\n  ONE_LINER_A=\"${ONE_LINER_A}${ONE_LINER}:\"\n  KILL_COMMAND_A=\"${KILL_COMMAND_A}${KILL_COMMAND}:\"\n  LOG_A=\"${LOG_A}${LOG}:\"\n  REMOTE_HOST_A=\"${REMOTE_HOST_A}${REMOTE_HOST}:\"\n  N=`expr ${N} + 1`\ndone\n\nget_a() {\n  echo ${1} | cut -f${2} -d':'\n}\n\n###############################################################################\n# Monitor jobs (main event loop)\n\nif [ -d \"${JOBS_FILE}\" ]; then\n  TMP_DIR=\"${JOBS_FILE}\"\nfi\n\ntrap \"stty echo icanon; exit 0\" SIGINT\nstty -echo -icanon\nclear\nkey=\"\"\nselected=1\n\necho2() {\n printf \"%-${COLUMNS}s\" \" \"\n printf \"\\r\"\n echo \"${1}\"\n}\n\nwhile true; do\n  if [ \"${selected}\" -gt \"${N}\" ]; then\n    selected=${N}\n  fi\n  if [ \"${selected}\" -lt \"1\" ]; then\n    selected=1\n  fi\n\n  # Display part\n  tput cup 0 0\n  key=\"\"\n  echo2\n  echo2 \"[q] quit         [D] download outputs and quit  [T] kill all jobs\"\n  echo2 \"[j] select next  [k] select previous            [t] kill selected job\"\n  echo2 \"                 [d] see selected job log\"\n  echo2\n  for i in `seq 1 ${N}`; do\n    (\n      ADDR=`get_a 
${ADDR_A} ${i}`\n      ONE_LINER=`get_a ${ONE_LINER_A} ${i}`\n      REMOTE_HOST=`get_a ${REMOTE_HOST_A} ${i}`\n      if [ \"${REMOTE_HOST}\" == \"Windows\" ]; then\n        STATUS=`${SSH} ${ADDR} \"if exist ${ONE_LINER} type ${ONE_LINER}\" \\\n                       || true`\n      else\n        STATUS=`${SSH} ${ADDR} \"[ -f ${ONE_LINER} ] && cat ${ONE_LINER}\" \\\n                       || true`\n      fi\n      echo ${STATUS} >${TMP_DIR}/one-liner-${i}.txt\n    ) </dev/null &\n    read -t 0.01 -n 1 key || true\n    if [ \"${key}\" != \"\" ]; then\n      break\n    fi\n  done\n  if [ \"${key}\" == \"\" ]; then\n    wait\n    for i in `seq 1 ${N}`; do\n      ADDR=`get_a ${ADDR_A} ${i}`\n      DESC=`get_a ${DESC_A} ${i}`\n      #if [ \"${REMOTE_HOST}\" == \"Windows\" ]; then\n      #  STATUS=`${SSH} ${ADDR} \"if exist ${ONE_LINER} type ${ONE_LINER}\" \\\n      #                 </dev/null || true`\n      #else\n      #  STATUS=`${SSH} ${ADDR} \"[ -f ${ONE_LINER} ] && cat ${ONE_LINER}\" \\\n      #                 </dev/null || true`\n      #fi\n      STATUS=`cat ${TMP_DIR}/one-liner-${i}.txt`\n      if [ \"${i}\" == \"${selected}\" ]; then\n        echo2 \"++++  ${i}: ${ADDR}, ${DESC}  ++++\"\n      else\n        echo2 \"${i}: ${ADDR}, ${DESC}\"\n      fi\n      W=`expr ${COLUMNS} - 4`\n      echo2 \"    `echo ${STATUS} | cut -c 1-${W}`\"\n      echo2\n      read -t 0.01 -n 1 key || true\n      if [ \"${key}\" != \"\" ]; then\n        break\n      fi\n    done\n  fi\n\n  # Keyboard input part\n  if [ \"${key}\" == \"\" ]; then\n    read -t 1 -n 1 key || true\n  fi\n  if [ \"${key}\" == \"\" ]; then\n    continue\n  fi\n  if [ \"${key}\" == \"q\" ]; then\n    break\n  fi\n  if [ \"${key}\" == \"j\" ]; then\n    selected=`expr ${selected} + 1`\n    continue\n  fi\n  if [ \"${key}\" == \"k\" ]; then\n    selected=`expr ${selected} - 1`\n    continue\n  fi\n  if [ \"`echo ${key} | grep [123456789]`\" != \"\" ]; then\n    selected=${key}\n    continue\n  fi\n  if [ 
\"${key}\" == \"t\" ]; then\n    ADDR=`get_a ${ADDR_A} ${selected}`\n    KILL_COMMAND=`get_a ${KILL_COMMAND_A} ${selected}`\n    ${SSH} ${ADDR} ${KILL_COMMAND}\n    clear\n    continue\n  fi\n  if [ \"${key}\" == \"T\" ]; then\n    clear\n    echo\n    echo \"Terminating every job...\"\n    echo\n    for i in `seq 1 ${N}`; do\n      ADDR=`get_a ${ADDR_A} ${i}`\n      KILL_COMMAND=`get_a ${KILL_COMMAND_A} ${i}`\n      ${SSH} ${ADDR} ${KILL_COMMAND}\n    done\n    echo\n    echo \"...done\"\n    echo\n    break\n  fi\n  if [ \"${key}\" == \"d\" ]; then\n    ADDR=`get_a ${ADDR_A} ${selected}`\n    LOG=`get_a ${LOG_A} ${selected}`\n    echo DEBUG: ${SCP} ${ADDR}:${LOG} ${TMP_DIR}/log.txt\n    ${SCP} ${ADDR}:${LOG} ${TMP_DIR}/log.txt\n    less ${TMP_DIR}/log.txt\n    clear\n    continue\n  fi\n  if [ \"${key}\" == \"D\" ]; then\n    clear\n    echo\n    echo \"Downloading every log...\"\n    echo\n    for i in `seq 1 ${N}`; do\n      ADDR=`get_a ${ADDR_A} ${i}`\n      LOG=`get_a ${LOG_A} ${i}`\n      ${SCP} ${ADDR}:${LOG} ${TMP_DIR}\n    done\n    echo\n    echo \"...done\"\n    echo\n    break\n  fi\ndone\n\nstty echo icanon\nexit 0\n"
  },
  {
    "path": "scripts/compile-gmp-mpfr-for-wasm.sh",
    "content": "#!/bin/sh\n# Copyright (c) 2021 Agenium Scale\n#\n# Permission is hereby granted, free of charge, to any person obtaining a copy\n# of this software and associated documentation files (the \"Software\"), to deal\n# in the Software without restriction, including without limitation the rights\n# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell\n# copies of the Software, and to permit persons to whom the Software is\n# furnished to do so, subject to the following conditions:\n#\n# The above copyright notice and this permission notice shall be included in all\n# copies or substantial portions of the Software.\n#\n# THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\n# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\n# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\n# SOFTWARE.\n\ncd `dirname $0`\n\nset -e\nset -x\n\nPREFIX=\"${PWD}/../_wasm-deps\"\nrm -rf ${PREFIX}\nmkdir -p ${PREFIX}\nJ=`nproc || echo 10`\n\n# -----------------------------------------------------------------------------\n# GMP first\n\nif [ \"$1\" != \"\" ]; then\n  VER=$1\nelse\n  VER=6.2.1\nfi\n\nURL=https://gmplib.org/download/gmp/gmp-${VER}.tar.xz\n\ncurl -L ${URL} -o gmp.tar.xz\ntar xf gmp.tar.xz\n(cd gmp-${VER} && \\\n emconfigure ./configure --disable-assembly \\\n                         --host none \\\n                         --enable-cxx \\\n                         --prefix=${PREFIX} && \\\n make -j${J} &&\n make install)\n\n# -----------------------------------------------------------------------------\n# MPFR first\n\nif [ \"$1\" != \"\" ]; then\n  VER=$1\nelse\n  
VER=4.1.0\nfi\n\nURL=https://www.mpfr.org/mpfr-current/mpfr-${VER}.tar.gz\n\ncurl -L ${URL} -o mpfr.tar.xz\ntar xf mpfr.tar.xz\n(cd mpfr-${VER} && \\\n emconfigure ./configure --disable-assembly \\\n                         --host none \\\n                         --with-gmp=${PREFIX} \\\n                         --prefix=${PREFIX} && \\\n make -j${J} &&\n make install)\n\n# -----------------------------------------------------------------------------\n# Echo nsconfig parameters to compile for WebAssembly\n\necho\necho\necho\necho\necho \"+---------------------------------------------------------+\"\necho\necho \"Compilation of MPFR + GMP is ok.\"\necho \"Invocation of nsconfig to compile for WebAssembly\"\necho\necho \"CPU emulation:\"\necho\necho \"../nstools/bin/nsconfig .. -Dsimd=cpu \\\\\"\necho \"    -Dmpfr=\\\"-I${PREFIX}/include -L${PREFIX}/lib -lmpfr -lgmp\\\"\"\necho\necho \"WASM SIMD128:\"\necho\necho \"../nstools/bin/nsconfig .. -Dsimd=wasm_simd128 \\\\\"\necho \"    -Dmpfr=\\\"-I${PREFIX}/include -L${PREFIX}/lib -lmpfr -lgmp\\\"\"\necho\necho \"+---------------------------------------------------------+\"\necho\n"
  },
  {
    "path": "scripts/gen_github_doc.sh",
    "content": "#!/bin/sh\n# Copyright (c) 2019 Agenium Scale\n#\n# Permission is hereby granted, free of charge, to any person obtaining a copy\n# of this software and associated documentation files (the \"Software\"), to deal\n# in the Software without restriction, including without limitation the rights\n# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell\n# copies of the Software, and to permit persons to whom the Software is\n# furnished to do so, subject to the following conditions:\n#\n# The above copyright notice and this permission notice shall be included in all\n# copies or substantial portions of the Software.\n#\n# THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\n# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\n# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\n# SOFTWARE.\n\n###############################################################################\n\ncd `dirname $0`\nset -x\nset -e\n\n###############################################################################\n# Init\n\nGH_PAGES_DIR=\"${PWD}/gh-pages\"\nHTML_DOC_DIR=\"${PWD}/../doc/html\"\nEGG_DIR=\"${PWD}/../egg\"\n\n###############################################################################\n# Generates HTML documentation\n\nrm -f \"${HTML_DOC_DIR}/*.html\"\npython3 \"${EGG_DIR}/hatch.py\" -fd\n\n###############################################################################\n# Put all HTML files into the gh-pages branch of NSIMD\n\nrm -rf \"${GH_PAGES_DIR}\"\ngit clone git@github.com:agenium-scale/nsimd.git \"${GH_PAGES_DIR}\"\ngit -C \"${GH_PAGES_DIR}\" checkout gh-pages\ngit -C \"${GH_PAGES_DIR}\" rm -f '*.html'\ncp ${HTML_DOC_DIR}/*.html 
${GH_PAGES_DIR}\nmkdir -p ${GH_PAGES_DIR}/assets\ncp ${HTML_DOC_DIR}/assets/*.js ${GH_PAGES_DIR}/assets\ngit -C \"${GH_PAGES_DIR}\" add '*.html'\ngit -C \"${GH_PAGES_DIR}\" add 'assets/*.js'\ngit -C \"${GH_PAGES_DIR}\" commit -m \"Documentation auto-generated on `date`\"\ngit -C \"${GH_PAGES_DIR}\" push\nrm -rf \"${GH_PAGES_DIR}\"\n"
  },
  {
    "path": "scripts/hipcc.sh",
    "content": "#!/bin/bash\n\n/opt/rocm/bin/hipcc -D__HIPCC__ -D__hcc_major__=3 -D__hcc_minor__=10 \"$@\"\n"
  },
  {
    "path": "scripts/init-benches-deps.sh",
    "content": "#!/bin/sh\n\n## The top-level dir\nROOT_DIR=\"$( git rev-parse --show-toplevel )\"\n\n## Where all the deps are gonna be installed\nINSTALL_DIR=\"${ROOT_DIR}/_install\"\n\nget() {\n    URL=\"$1\"\n    DEST=\"$2\"\n    ## Shift to consume all remaining arguments for cmake\n    shift; shift;\n    ## Get the repo\n    git clone ${URL} ${DEST}\n    ## Prepare build\n    cd ${DEST}\n    mkdir -p build\n    cd build\n    ## Make sure to install in the INSTALL_DIR dir\n    cmake .. -DCMAKE_INSTALL_PREFIX=\"${INSTALL_DIR}\" \"$@\"\n    cmake --build . --target install \n}\n\n## Prepare deps dir + Cleanup\nrm -rf ${INSTALL_DIR}\nmkdir -p _deps\nmkdir -p _install/lib _install/include\n\n## MIPP\ngit clone https://github.com/aff3ct/MIPP.git _deps/mipp\ncp -rfv _deps/mipp/src/* ${INSTALL_DIR}/include\n\n## Sleef\nget https://github.com/shibatch/sleef.git _deps/sleef -DBUILD_TESTS=OFF -DBUILD_DFT=OFF\n\n## Benchmark\nget https://github.com/google/benchmark _deps/benchmark -DBENCHMARK_ENABLE_TESTING=OFF\n"
  },
  {
    "path": "scripts/local-ci-rerun.ini",
    "content": "# -----------------------------------------------------------------------------\n# Intel CPU/SIMD\n\n[sse2,sse42,avx,avx2]\n\nNSTEST -jNPROC\n\n[avx512_knl]\n\nmodule load sde/8.69.1-2021-07-18\nNSTEST --prefix=\"sde64 -knl --\" -jNPROC\n\n[avx512_skylake]\n\nmodule load sde/8.69.1-2021-07-18\nNSTEST --prefix=\"sde64 -skx --\" -jNPROC\n\n# -----------------------------------------------------------------------------\n# Arm\n\n[aarch64]\n\nmodule load qemu/6.1.0\nNSTEST --prefix=\"qemu-aarch64\" -jNPROC\n\n[sve128]\n\nmodule load clang/13.0.0\nmodule load qemu/6.1.0\nNSTEST --prefix=\"qemu-aarch64 -cpu max,sve-max-vq=1\" -jNPROC\n\n[armel]\n\nmodule load qemu/6.1.0\nNSTEST --prefix=\"qemu-arm\" -jNPROC\n\n[armhf]\n\nmodule load qemu/6.1.0\nNSTEST --prefix=\"qemu-arm\" -jNPROC\n\n# -----------------------------------------------------------------------------\n# PowerPC\n\n[vmx]\n\nmodule load clang/13.0.0\nmodule load qemu/6.1.0\nNSTEST --prefix=\"qemu-ppc64le -cpu power8\" -jNPROC\n\n[vsx]\n\nmodule load clang/13.0.0\nmodule load qemu/6.1.0\nNSTEST --prefix=\"qemu-ppc64le -cpu power8\" -jNPROC\n\n# -----------------------------------------------------------------------------\n# Intel oneAPI\n\n[oneapi]\n\nsource /opt/intel/oneapi/setvars.sh\nNSTEST -jNPROC\n\n"
  },
  {
    "path": "scripts/local-ci.ini",
    "content": "# -----------------------------------------------------------------------------\n# Intel CPU/SIMD\n\n[sse2,sse42,avx,avx2]\n\nNSCONFIG -Dsimd=SIMD_EXT -suite=gcc SRC_DIR\nninja TARGET\nNSTEST -jNPROC\n\n[avx512_knl]\n\nNSCONFIG -Dsimd=SIMD_EXT -suite=gcc SRC_DIR\nninja TARGET\nmodule load sde/8.69.1-2021-07-18\nNSTEST --prefix=\"sde64 -knl --\" -jNPROC\n\n[avx512_skylake]\n\nNSCONFIG -Dsimd=SIMD_EXT -suite=gcc SRC_DIR\nninja TARGET\nmodule load sde/8.69.1-2021-07-18\nNSTEST --prefix=\"sde64 -skx --\" -jNPROC\n\n# -----------------------------------------------------------------------------\n# Arm\n\n[aarch64]\n\nmodule load clang/13.0.0\nNSCONFIG -Dsimd=SIMD_EXT \\\n         -comp=cc,clang,SRC_DIR/scripts/aarch64-linux-gnu-clang.sh,13,aarch64 \\\n         -comp=c++,clang,SRC_DIR/scripts/aarch64-linux-gnu-clang++.sh,13,aarch64 \\\n         SRC_DIR\nninja TARGET\nmodule load qemu/6.1.0\nNSTEST --prefix=\"qemu-aarch64\" -jNPROC\n\n[sve128]\n\nmodule load aarch64-linux-gnu/11.2.0\nNSCONFIG -Dsimd=SIMD_EXT \\\n         -comp=cc,gcc,aarch64-linux-gnu-gcc,11,aarch64 \\\n         -comp=c++,gcc,aarch64-linux-gnu-g++,11,aarch64 SRC_DIR\nninja TARGET\nmodule load qemu/6.1.0\nNSTEST --prefix=\"qemu-aarch64 -cpu max,sve-max-vq=1\" -jNPROC\n\n[armel]\n\nNSCONFIG -Dsimd=neon128 -comp=cc,gcc,arm-linux-gnueabi-gcc,6,armel \\\n                        -comp=c++,gcc,arm-linux-gnueabi-g++,6,armel SRC_DIR\nninja TARGET\nmodule load qemu/6.1.0\nNSTEST --prefix=\"qemu-arm\" -jNPROC\n\n[armhf]\n\nNSCONFIG -Dsimd=neon128 -comp=cc,gcc,arm-linux-gnueabihf-gcc,6,armhf \\\n                        -comp=c++,gcc,arm-linux-gnueabihf-g++,6,armhf SRC_DIR\nninja TARGET\nmodule load qemu/6.1.0\nNSTEST --prefix=\"qemu-arm\" -jNPROC\n\n# -----------------------------------------------------------------------------\n# PowerPC\n\n[vmx]\n\nmodule load clang/13.0.0\nNSCONFIG -Dsimd=vmx \\\n         -comp=cc,clang,SRC_DIR/scripts/powerpc64le-linux-gnu-clang.sh,7,ppc64el \\\n         
-comp=c++,clang,SRC_DIR/scripts/powerpc64le-linux-gnu-clang++.sh,7,ppc64el \\\n         SRC_DIR\nninja TARGET\nmodule load qemu/6.1.0\nNSTEST --prefix=\"qemu-ppc64le -cpu power8\" -jNPROC\n\n[vsx]\n\nmodule load clang/13.0.0\nNSCONFIG -Dsimd=vsx \\\n         -comp=cc,clang,SRC_DIR/scripts/powerpc64le-linux-gnu-clang.sh,7,ppc64el \\\n         -comp=c++,clang,SRC_DIR/scripts/powerpc64le-linux-gnu-clang++.sh,7,ppc64el \\\n         SRC_DIR\nninja TARGET\nmodule load qemu/6.1.0\nNSTEST --prefix=\"qemu-ppc64le -cpu power8\" -jNPROC\n\n# -----------------------------------------------------------------------------\n# Intel oneAPI\n\n[oneapi]\n\nsource /opt/intel/oneapi/setvars.sh\nNSCONFIG -Dsimd=SIMD_EXT -suite=oneapi SRC_DIR\nninja TARGET\nNSTEST -jNPROC\n\n# -----------------------------------------------------------------------------\n# NVIDIA CUDA (cannot be emulated, or at least I don't know how)\n\n[cuda]\n\nNSCONFIG -Dsimd=SIMD_EXT -suite=cuda SRC_DIR\nninja TARGET\n\n# -----------------------------------------------------------------------------\n# AMD HIP/ROCm (can be emulated with HIP-CPU) but as of now (2021/10/07) the\n# library is marked as \"Please note the library is being actively developed,\n# and is known to be incomplet; it might also be incorrekt and there could be a\n# few bad bugs lurking.\" so that I will wait for a first release.\n\n[rocm]\n\nNSCONFIG -Dsimd=SIMD_EXT -suite=rocm SRC_DIR\nninja TARGET\n\n"
  },
  {
    "path": "scripts/local-ci.sh",
    "content": "#!/bin/sh\n\n# -----------------------------------------------------------------------------\n# Init\n\nINPUT=\"`realpath ${1}`\"\nTARGET=\"${2}\"\ncd `dirname $0`\nROOT=\"${PWD}/../build-local-ci\"\nmkdir -p \"${ROOT}\"\nNPROC=`nproc`\nif [ \"${TARGET}\" == \"\" ]; then\n  TARGET=\"tests\"\nfi\n\n# -----------------------------------------------------------------------------\n# Make sure we have generated nsimd\n\npython3 \"${PWD}/../egg/hatch.py\" -ltf\n\n# -----------------------------------------------------------------------------\n# Make sure we have the latest commit for nsconfig\n\nNSCONFIG=\"${PWD}/../nstools/nsconfig/nsconfig\"\nNSTEST=\"${PWD}/../nstools/nsconfig/nstest\"\n\n[ -e \"${NSCONFIG}\" ] || ( export NSTOOLS_CHECKOUT_LAST_COMMIT=1 && \\\n                          bash \"${PWD}/../scripts/setup.sh\" )\n[ -e \"${NSTEST}\" ] || ( export NSTOOLS_CHECKOUT_LAST_COMMIT=1 && \\\n                        bash \"${PWD}/../scripts/setup.sh\" )\n\n# -----------------------------------------------------------------------------\n# Parse input file\n\nSIMD_EXTS=\"\"\n\nwhile read -r line; do\n\n  # Empty lines\n  if [ \"`echo ${line} | sed 's/[ \\t]*//g'`\" == \"\" ]; then\n    continue\n  fi\n\n  # Comments\n  if [ \"`echo ${line} | cut -c 1`\" == \"#\" ]; then\n    continue\n  fi\n\n  # New architectures\n  if [ \"`echo ${line} | cut -c 1`\" == \"[\" ]; then\n    SIMD_EXTS=\"`echo ${line} | sed -e 's/[][,]/ /g'`\"\n    for s in ${SIMD_EXTS}; do\n      echo '#!/bin/bash' >\"${ROOT}/run-${s}.sh\"\n      echo >>\"${ROOT}/run-${s}.sh\"\n      echo 'cd `dirname $0`' >>\"${ROOT}/run-${s}.sh\"\n      echo \"mkdir -p ${s}\" >>\"${ROOT}/run-${s}.sh\"\n      echo \"cd ${s}\" >>\"${ROOT}/run-${s}.sh\"\n      echo >>\"${ROOT}/run-${s}.sh\"\n    done\n    continue\n  fi\n\n  # Standard line (part of a script)\n  if [ \"${SIMD_EXTS}\" != \"\" ]; then\n    for s in ${SIMD_EXTS}; do\n      echo ${line} | sed -e \"s,SIMD_EXT,${s},g\" \\\n                       
  -e \"s,SRC_DIR,${PWD}/..,g\" \\\n                         -e \"s,NSCONFIG,${NSCONFIG},g\" \\\n                         -e \"s,NSTEST,${NSTEST},g\" \\\n                         -e \"s,NPROC,${NPROC},g\" \\\n                         -e \"s,TARGET,${TARGET},g\" \\\n                         >>\"${ROOT}/run-${s}.sh\"\n    done\n  fi\n \ndone <\"${INPUT}\"\n\n# -----------------------------------------------------------------------------\n# Compile all tests\n\nfor i in ${ROOT}/*.sh; do\n  ( bash ${i} || true ) | tee ${i}.log\ndone\n"
  },
  {
    "path": "scripts/one-liner.c",
    "content": "/*\n\nCopyright (c) 2020 Agenium Scale\n\nPermission is hereby granted, free of charge, to any person obtaining a copy\nof this software and associated documentation files (the \"Software\"), to deal\nin the Software without restriction, including without limitation the rights\nto use, copy, modify, merge, publish, distribute, sublicense, and/or sell\ncopies of the Software, and to permit persons to whom the Software is\nfurnished to do so, subject to the following conditions:\n\nThe above copyright notice and this permission notice shall be included in all\ncopies or substantial portions of the Software.\n\nTHE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\nIMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\nFITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\nAUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\nLIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\nOUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\nSOFTWARE.\n\n*/\n\n/* ------------------------------------------------------------------------- */\n\n/*\n\nThis program needs to be as portable as possible as it is intended for\nWindows hosts with an unknown version of Visual Studio. 
It is compiled\nbefore running the tests of NSIMD.\n\nIts purpose is to read stdin and put all into an accumulator file and from\ntime to time (approximatively every second) put a line of text into another\nfile.\n\n*/\n\n#include <stdlib.h>\n#include <stdio.h>\n#include <time.h>\n#include <errno.h>\n#include <string.h>\n\n#define DO(cmd, error_code, goto_label_on_error)                              \\\n  do {                                                                        \\\n    errno = 0;                                                                \\\n    if ((cmd) == error_code) {                                                \\\n      fprintf(stderr, \"%s: error: \" #cmd \": %s\\n\", argv[0], strerror(errno)); \\\n      ret = -1;                                                               \\\n      goto goto_label_on_error;                                               \\\n    }                                                                         \\\n  } while (0)\n\nint main(int argc, char **argv) {\n  FILE *acc, *one = NULL;\n  char *buf;\n  int ret = 0;\n  size_t n = 1024;\n  time_t tick;\n\n  if (argc != 3) {\n    fprintf(stderr, \"%s: ERROR: usage: one-liner acc.txt one-liner.txt\",\n            argv[0]);\n    return -1;\n  }\n\n  DO(acc = fopen(argv[1], \"wb\"), NULL, end);\n  DO(buf = malloc(n), NULL, free_acc);\n\n  tick = time(NULL);\n  for (;;) {\n    time_t t;\n    size_t i = 0;\n    int end_of_file = 0;\n\n    for (;;) {\n      int code = fgetc(stdin);\n      if (code == EOF || code == '\\n') {\n        buf[i] = '\\n';\n        buf[i + 1] = 0;\n        end_of_file = (code == EOF);\n        break;\n      }\n      buf[i] = (char)code;\n      if (i >= n - 2) {\n        n = n * 2;\n        DO(buf = realloc(buf, n), NULL, free_buf);\n      }\n      i++;\n    }\n\n    DO(fputs(buf, acc), EOF, free_buf);\n    DO(fflush(acc), EOF, free_buf);\n    t = time(NULL);\n    if (t - tick >= 1) {\n      DO(one = fopen(argv[2], \"wb\"), NULL, 
free_buf);\n      DO(fputs(buf, one), EOF, free_one);\n      DO(fflush(one), EOF, free_one);\n      DO(fclose(one), EOF, free_one);\n      one = NULL;\n      tick = t;\n    }\n\n    if (end_of_file) {\n      break;\n    }\n  }\n\n  DO(one = fopen(argv[2], \"wb\"), NULL, free_buf);\n  DO(fputs(\"Finished\", one), EOF, free_one);\n  DO(fflush(one), EOF, free_one);\n\nfree_one:\n  if (one != NULL && fclose(one) == EOF) {\n    fprintf(stderr, \"%s: NOTE: error on closing '%s': %s\\n\", argv[0], argv[2],\n            strerror(errno));\n  }\n\nfree_buf:\n  free(buf);\n\nfree_acc:\n  if (fclose(acc) == EOF) {\n    fprintf(stderr, \"%s: NOTE: error on closing '%s': %s\\n\", argv[0], argv[1],\n            strerror(errno));\n  }\n\nend:\n  return ret;\n}\n"
  },
  {
    "path": "scripts/powerpc64le-linux-gnu-clang++.sh",
    "content": "#!/bin/bash\n\nclang++ --target=powerpc64le-linux-gnu \\\n        -I/usr/powerpc64le-linux-gnu/include/c++/8/powerpc64le-linux-gnu \"$@\"\n"
  },
  {
    "path": "scripts/powerpc64le-linux-gnu-clang.sh",
    "content": "#!/bin/bash\n\nclang --target=powerpc64le-linux-gnu \\\n      -I/usr/powerpc64le-linux-gnu/include/c++/8/powerpc64le-linux-gnu \"$@\"\n"
  },
  {
    "path": "scripts/setup.bat",
    "content": "@echo off\n\nREM Copyright (c) 2020 Agenium Scale\nREM\nREM Permission is hereby granted, free of charge, to any person obtaining a copy\nREM of this software and associated documentation files (the \"Software\"), to deal\nREM in the Software without restriction, including without limitation the rights\nREM to use, copy, modify, merge, publish, distribute, sublicense, and/or sell\nREM copies of the Software, and to permit persons to whom the Software is\nREM furnished to do so, subject to the following conditions:\nREM\nREM The above copyright notice and this permission notice shall be included in all\nREM copies or substantial portions of the Software.\nREM\nREM THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\nREM IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\nREM FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\nREM AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\nREM LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\nREM OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\nREM SOFTWARE.\n\nREM ###########################################################################\n\nsetlocal EnableDelayedExpansion\npushd \"%~dp0\"\n\nREM ###########################################################################\nREM Init\n\nset NSTOOLS_DIR=\"%CD%\\..\\nstools\"\n\nREM ###########################################################################\nREM Pull nstools\n\nif exist \"%NSTOOLS_DIR%\\README.md\" (\n  pushd %NSTOOLS_DIR%\n  git pull || cd .\n  popd\n) else (\n  if exist \"..\\.git\" (\n    git remote get-url origin >_tmp-nsimd-url.txt\n    set /P NSIMD_URL=<_tmp-nsimd-url.txt\n    set NSTOOLS_URL=!NSIMD_URL:nsimd=nstools!\n    del /F /Q _tmp-nsimd-url.txt\n    pushd \"..\"\n    git clone !NSTOOLS_URL! 
nstools\n    popd\n  ) else (\n    pushd \"..\"\n    git clone \"https://github.com/agenium-scale/nstools.git\" nstools\n    popd\n  )\n)\n\nif \"%NSTOOLS_CHECKOUT_LAST_COMMIT%\" == \"\" (\n  git -C %NSTOOLS_DIR% checkout v3.0\n) else (\n  git -C %NSTOOLS_DIR% checkout master\n)\n\nREM ###########################################################################\nREM Create bin directory\n\nif not exist %NSTOOLS_DIR%\\bin (\n  md %NSTOOLS_DIR%\\bin\n)\n\nREM ###########################################################################\nREM Build nsconfig (if not already built)\n\npushd %NSTOOLS_DIR%\\nsconfig\nnmake /F Makefile.win nsconfig.exe\nnmake /F Makefile.win nstest.exe\ncopy /Y \"nsconfig.exe\" %NSTOOLS_DIR%\\bin\ncopy /Y \"nstest.exe\" %NSTOOLS_DIR%\\bin\npopd\n\npopd\nendlocal\nexit /B 0\n"
  },
  {
    "path": "scripts/setup.sh",
    "content": "#!/bin/bash\n# Copyright (c) 2021 Agenium Scale\n#\n# Permission is hereby granted, free of charge, to any person obtaining a copy\n# of this software and associated documentation files (the \"Software\"), to deal\n# in the Software without restriction, including without limitation the rights\n# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell\n# copies of the Software, and to permit persons to whom the Software is\n# furnished to do so, subject to the following conditions:\n#\n# The above copyright notice and this permission notice shall be included in all\n# copies or substantial portions of the Software.\n#\n# THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\n# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\n# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\n# SOFTWARE.\n\n###############################################################################\n\ncd `dirname $0`\nset -x\nset -e\n\n###############################################################################\n# Init\n\nNSTOOLS_DIR=\"${PWD}/../nstools\"\n\n###############################################################################\n# Build nsconfig (if not already built)\n\n[ -d \"${NSTOOLS_DIR}\" ] || \\\n    ( cd \"${PWD}/..\" && \\\n      ( [ -d .git ] \\\n        && ( git clone `git remote get-url origin | sed s/nsimd/nstools/g` ) \\\n        || ( git clone \"https://github.com/agenium-scale/nstools.git\" ) ) )\n\nif [ \"${NSTOOLS_CHECKOUT_LAST_COMMIT}\" == \"\" ]; then\n  git -C \"${NSTOOLS_DIR}\" checkout v3.0\nelse\n  git -C \"${NSTOOLS_DIR}\" checkout master\n  git -C \"${NSTOOLS_DIR}\" pull\nfi\n\n( cd \"${NSTOOLS_DIR}/nsconfig\" && \\\n  
make -B -j8 -f Makefile.nix nsconfig && \\\n  make -B -j8 -f Makefile.nix nstest )\n"
  },
  {
    "path": "src/dd.h",
    "content": "//   Copyright Naoki Shibata and contributors 2010 - 2020.\n// Distributed under the Boost Software License, Version 1.0.\n//    (See accompanying file LICENSE.txt or copy at\n//          http://www.boost.org/LICENSE_1_0.txt)\n\n#if !(defined(ENABLE_SVE) || defined(ENABLE_SVENOFMA))\ntypedef struct {\n  vdouble x, y;\n} vdouble2;\n\nstatic vdouble  vd2getx_vd_vd2(vdouble2 v) { return v.x; }\nstatic vdouble  vd2gety_vd_vd2(vdouble2 v) { return v.y; }\nstatic vdouble2 vd2setxy_vd2_vd_vd(vdouble x, vdouble y)  { vdouble2 v; v.x = x; v.y = y; return v; }\nstatic vdouble2 vd2setx_vd2_vd2_vd(vdouble2 v, vdouble d) { v.x = d; return v; }\nstatic vdouble2 vd2sety_vd2_vd2_vd(vdouble2 v, vdouble d) { v.y = d; return v; }\n#endif\n\nstatic INLINE CONST VECTOR_CC vdouble vupper_vd_vd(vdouble d) {\n  return vreinterpret_vd_vm(vand_vm_vm_vm(vreinterpret_vm_vd(d), vcast_vm_i_i(0xffffffff, 0xf8000000)));\n}\n\nstatic INLINE CONST VECTOR_CC vdouble2 vcast_vd2_vd_vd(vdouble h, vdouble l) {\n  return vd2setxy_vd2_vd_vd(h, l);\n}\n\nstatic INLINE CONST VECTOR_CC vdouble2 vcast_vd2_d_d(double h, double l) {\n  return vd2setxy_vd2_vd_vd(vcast_vd_d(h), vcast_vd_d(l));\n}\n\nstatic INLINE CONST VECTOR_CC vdouble2 vsel_vd2_vo_vd2_vd2(vopmask m, vdouble2 x, vdouble2 y) {\n  return vd2setxy_vd2_vd_vd(vsel_vd_vo_vd_vd(m, vd2getx_vd_vd2(x), vd2getx_vd_vd2(y)),\n\t\t\t    vsel_vd_vo_vd_vd(m, vd2gety_vd_vd2(x), vd2gety_vd_vd2(y)));\n}\n\nstatic INLINE CONST VECTOR_CC vdouble2 vsel_vd2_vo_d_d_d_d(vopmask o, double x1, double y1, double x0, double y0) {\n  return vd2setxy_vd2_vd_vd(vsel_vd_vo_d_d(o, x1, x0),\n\t\t\t    vsel_vd_vo_d_d(o, y1, y0));\n}\n\nstatic INLINE CONST VECTOR_CC vdouble vadd_vd_3vd(vdouble v0, vdouble v1, vdouble v2) {\n  return vadd_vd_vd_vd(vadd_vd_vd_vd(v0, v1), v2);\n}\n\nstatic INLINE CONST VECTOR_CC vdouble vadd_vd_4vd(vdouble v0, vdouble v1, vdouble v2, vdouble v3) {\n  return vadd_vd_3vd(vadd_vd_vd_vd(v0, v1), v2, v3);\n}\n\nstatic INLINE CONST VECTOR_CC 
vdouble vadd_vd_5vd(vdouble v0, vdouble v1, vdouble v2, vdouble v3, vdouble v4) {\n  return vadd_vd_4vd(vadd_vd_vd_vd(v0, v1), v2, v3, v4);\n}\n\nstatic INLINE CONST VECTOR_CC vdouble vadd_vd_6vd(vdouble v0, vdouble v1, vdouble v2, vdouble v3, vdouble v4, vdouble v5) {\n  return vadd_vd_5vd(vadd_vd_vd_vd(v0, v1), v2, v3, v4, v5);\n}\n\nstatic INLINE CONST VECTOR_CC vdouble vadd_vd_7vd(vdouble v0, vdouble v1, vdouble v2, vdouble v3, vdouble v4, vdouble v5, vdouble v6) {\n  return vadd_vd_6vd(vadd_vd_vd_vd(v0, v1), v2, v3, v4, v5, v6);\n}\n\nstatic INLINE CONST VECTOR_CC vdouble vsub_vd_3vd(vdouble v0, vdouble v1, vdouble v2) {\n  return vsub_vd_vd_vd(vsub_vd_vd_vd(v0, v1), v2);\n}\n\nstatic INLINE CONST VECTOR_CC vdouble vsub_vd_4vd(vdouble v0, vdouble v1, vdouble v2, vdouble v3) {\n  return vsub_vd_3vd(vsub_vd_vd_vd(v0, v1), v2, v3);\n}\n\nstatic INLINE CONST VECTOR_CC vdouble vsub_vd_5vd(vdouble v0, vdouble v1, vdouble v2, vdouble v3, vdouble v4) {\n  return vsub_vd_4vd(vsub_vd_vd_vd(v0, v1), v2, v3, v4);\n}\n\nstatic INLINE CONST VECTOR_CC vdouble vsub_vd_6vd(vdouble v0, vdouble v1, vdouble v2, vdouble v3, vdouble v4, vdouble v5) {\n  return vsub_vd_5vd(vsub_vd_vd_vd(v0, v1), v2, v3, v4, v5);\n}\n\n//\n\nstatic INLINE CONST VECTOR_CC vdouble2 ddneg_vd2_vd2(vdouble2 x) {\n  return vcast_vd2_vd_vd(vneg_vd_vd(vd2getx_vd_vd2(x)), vneg_vd_vd(vd2gety_vd_vd2(x)));\n}\n\nstatic INLINE CONST VECTOR_CC vdouble2 ddabs_vd2_vd2(vdouble2 x) {\n  return vcast_vd2_vd_vd(vabs_vd_vd(vd2getx_vd_vd2(x)),\n\t\t\t vreinterpret_vd_vm(vxor_vm_vm_vm(vreinterpret_vm_vd(vd2gety_vd_vd2(x)),\n\t\t\t\t\t\t\t  vand_vm_vm_vm(vreinterpret_vm_vd(vd2getx_vd_vd2(x)),\n\t\t\t\t\t\t\t\t\tvreinterpret_vm_vd(vcast_vd_d(-0.0))))));\n}\n\nstatic INLINE CONST VECTOR_CC vdouble2 ddnormalize_vd2_vd2(vdouble2 t) {\n  vdouble s = vadd_vd_vd_vd(vd2getx_vd_vd2(t), vd2gety_vd_vd2(t));\n  return vd2setxy_vd2_vd_vd(s, vadd_vd_vd_vd(vsub_vd_vd_vd(vd2getx_vd_vd2(t), s), vd2gety_vd_vd2(t)));\n}\n\nstatic INLINE CONST 
VECTOR_CC vdouble2 ddscale_vd2_vd2_vd(vdouble2 d, vdouble s) {\n  return vd2setxy_vd2_vd_vd(vmul_vd_vd_vd(vd2getx_vd_vd2(d), s), vmul_vd_vd_vd(vd2gety_vd_vd2(d), s));\n}\n\nstatic INLINE CONST VECTOR_CC vdouble2 ddadd_vd2_vd_vd(vdouble x, vdouble y) {\n  vdouble s = vadd_vd_vd_vd(x, y);\n  return vd2setxy_vd2_vd_vd(s, vadd_vd_vd_vd(vsub_vd_vd_vd(x, s), y));\n}\n\nstatic INLINE CONST VECTOR_CC vdouble2 ddadd2_vd2_vd_vd(vdouble x, vdouble y) {\n  vdouble s = vadd_vd_vd_vd(x, y);\n  vdouble v = vsub_vd_vd_vd(s, x);\n  return vd2setxy_vd2_vd_vd(s, vadd_vd_vd_vd(vsub_vd_vd_vd(x, vsub_vd_vd_vd(s, v)), vsub_vd_vd_vd(y, v)));\n}\n\nstatic INLINE CONST VECTOR_CC vdouble2 ddadd_vd2_vd2_vd(vdouble2 x, vdouble y) {\n  vdouble s = vadd_vd_vd_vd(vd2getx_vd_vd2(x), y);\n  return vd2setxy_vd2_vd_vd(s, vadd_vd_3vd(vsub_vd_vd_vd(vd2getx_vd_vd2(x), s), y, vd2gety_vd_vd2(x)));\n}\n\nstatic INLINE CONST VECTOR_CC vdouble2 ddsub_vd2_vd2_vd(vdouble2 x, vdouble y) {\n  vdouble s = vsub_vd_vd_vd(vd2getx_vd_vd2(x), y);\n  return vd2setxy_vd2_vd_vd(s, vadd_vd_vd_vd(vsub_vd_vd_vd(vsub_vd_vd_vd(vd2getx_vd_vd2(x), s), y), vd2gety_vd_vd2(x)));\n}\n\nstatic INLINE CONST VECTOR_CC vdouble2 ddadd2_vd2_vd2_vd(vdouble2 x, vdouble y) {\n  vdouble s = vadd_vd_vd_vd(vd2getx_vd_vd2(x), y);\n  vdouble v = vsub_vd_vd_vd(s, vd2getx_vd_vd2(x));\n  vdouble w = vadd_vd_vd_vd(vsub_vd_vd_vd(vd2getx_vd_vd2(x), vsub_vd_vd_vd(s, v)), vsub_vd_vd_vd(y, v));\n  return vd2setxy_vd2_vd_vd(s, vadd_vd_vd_vd(w, vd2gety_vd_vd2(x)));\n}\n\nstatic INLINE CONST VECTOR_CC vdouble2 ddadd_vd2_vd_vd2(vdouble x, vdouble2 y) {\n  vdouble s = vadd_vd_vd_vd(x, vd2getx_vd_vd2(y));\n  return vd2setxy_vd2_vd_vd(s, vadd_vd_3vd(vsub_vd_vd_vd(x, s), vd2getx_vd_vd2(y), vd2gety_vd_vd2(y)));\n}\n\nstatic INLINE CONST VECTOR_CC vdouble2 ddadd2_vd2_vd_vd2(vdouble x, vdouble2 y) {\n  vdouble s = vadd_vd_vd_vd(x, vd2getx_vd_vd2(y));\n  vdouble v = vsub_vd_vd_vd(s, x);\n  return vd2setxy_vd2_vd_vd(s, vadd_vd_vd_vd(vadd_vd_vd_vd(vsub_vd_vd_vd(x, 
vsub_vd_vd_vd(s, v)),\n\t\t\t\t\t\t\t   vsub_vd_vd_vd(vd2getx_vd_vd2(y), v)), vd2gety_vd_vd2(y)));\n}\n\nstatic INLINE CONST VECTOR_CC vdouble2 ddadd_vd2_vd2_vd2(vdouble2 x, vdouble2 y) {\n  // |x| >= |y|\n\n  vdouble s = vadd_vd_vd_vd(vd2getx_vd_vd2(x), vd2getx_vd_vd2(y));\n  return vd2setxy_vd2_vd_vd(s, vadd_vd_4vd(vsub_vd_vd_vd(vd2getx_vd_vd2(x), s), vd2getx_vd_vd2(y), vd2gety_vd_vd2(x), vd2gety_vd_vd2(y)));\n}\n\nstatic INLINE CONST VECTOR_CC vdouble2 ddadd2_vd2_vd2_vd2(vdouble2 x, vdouble2 y) {\n  vdouble s = vadd_vd_vd_vd(vd2getx_vd_vd2(x), vd2getx_vd_vd2(y));\n  vdouble v = vsub_vd_vd_vd(s, vd2getx_vd_vd2(x));\n  vdouble t = vadd_vd_vd_vd(vsub_vd_vd_vd(vd2getx_vd_vd2(x), vsub_vd_vd_vd(s, v)), vsub_vd_vd_vd(vd2getx_vd_vd2(y), v));\n  return vd2setxy_vd2_vd_vd(s, vadd_vd_vd_vd(t, vadd_vd_vd_vd(vd2gety_vd_vd2(x), vd2gety_vd_vd2(y))));\n}\n\nstatic INLINE CONST VECTOR_CC vdouble2 ddsub_vd2_vd_vd(vdouble x, vdouble y) {\n  // |x| >= |y|\n\n  vdouble s = vsub_vd_vd_vd(x, y);\n  return vd2setxy_vd2_vd_vd(s, vsub_vd_vd_vd(vsub_vd_vd_vd(x, s), y));\n}\n\nstatic INLINE CONST VECTOR_CC vdouble2 ddsub_vd2_vd2_vd2(vdouble2 x, vdouble2 y) {\n  // |x| >= |y|\n\n  vdouble s = vsub_vd_vd_vd(vd2getx_vd_vd2(x), vd2getx_vd_vd2(y));\n  vdouble t = vsub_vd_vd_vd(vd2getx_vd_vd2(x), s);\n  t = vsub_vd_vd_vd(t, vd2getx_vd_vd2(y));\n  t = vadd_vd_vd_vd(t, vd2gety_vd_vd2(x));\n  return vd2setxy_vd2_vd_vd(s, vsub_vd_vd_vd(t, vd2gety_vd_vd2(y)));\n}\n\n#ifdef ENABLE_FMA_DP\nstatic INLINE CONST VECTOR_CC vdouble2 dddiv_vd2_vd2_vd2(vdouble2 n, vdouble2 d) {\n  vdouble t = vrec_vd_vd(vd2getx_vd_vd2(d));\n  vdouble s = vmul_vd_vd_vd(vd2getx_vd_vd2(n), t);\n  vdouble u = vfmapn_vd_vd_vd_vd(t, vd2getx_vd_vd2(n), s);\n  vdouble v = vfmanp_vd_vd_vd_vd(vd2gety_vd_vd2(d), t, vfmanp_vd_vd_vd_vd(vd2getx_vd_vd2(d), t, vcast_vd_d(1)));\n  return vd2setxy_vd2_vd_vd(s, vfma_vd_vd_vd_vd(s, v, vfma_vd_vd_vd_vd(vd2gety_vd_vd2(n), t, u)));\n}\n\nstatic INLINE CONST VECTOR_CC vdouble2 
ddmul_vd2_vd_vd(vdouble x, vdouble y) {\n  vdouble s = vmul_vd_vd_vd(x, y);\n  return vd2setxy_vd2_vd_vd(s, vfmapn_vd_vd_vd_vd(x, y, s));\n}\n\nstatic INLINE CONST VECTOR_CC vdouble2 ddsqu_vd2_vd2(vdouble2 x) {\n  vdouble s = vmul_vd_vd_vd(vd2getx_vd_vd2(x), vd2getx_vd_vd2(x));\n  return vd2setxy_vd2_vd_vd(s, vfma_vd_vd_vd_vd(vadd_vd_vd_vd(vd2getx_vd_vd2(x), vd2getx_vd_vd2(x)), vd2gety_vd_vd2(x), vfmapn_vd_vd_vd_vd(vd2getx_vd_vd2(x), vd2getx_vd_vd2(x), s)));\n}\n\nstatic INLINE CONST VECTOR_CC vdouble2 ddmul_vd2_vd2_vd2(vdouble2 x, vdouble2 y) {\n  vdouble s = vmul_vd_vd_vd(vd2getx_vd_vd2(x), vd2getx_vd_vd2(y));\n  return vd2setxy_vd2_vd_vd(s, vfma_vd_vd_vd_vd(vd2getx_vd_vd2(x), vd2gety_vd_vd2(y), vfma_vd_vd_vd_vd(vd2gety_vd_vd2(x), vd2getx_vd_vd2(y), vfmapn_vd_vd_vd_vd(vd2getx_vd_vd2(x), vd2getx_vd_vd2(y), s))));\n}\n\nstatic INLINE CONST VECTOR_CC vdouble ddmul_vd_vd2_vd2(vdouble2 x, vdouble2 y) {\n  return vfma_vd_vd_vd_vd(vd2getx_vd_vd2(x), vd2getx_vd_vd2(y), vfma_vd_vd_vd_vd(vd2gety_vd_vd2(x), vd2getx_vd_vd2(y), vmul_vd_vd_vd(vd2getx_vd_vd2(x), vd2gety_vd_vd2(y))));\n}\n\nstatic INLINE CONST VECTOR_CC vdouble ddsqu_vd_vd2(vdouble2 x) {\n  return vfma_vd_vd_vd_vd(vd2getx_vd_vd2(x), vd2getx_vd_vd2(x), vadd_vd_vd_vd(vmul_vd_vd_vd(vd2getx_vd_vd2(x), vd2gety_vd_vd2(x)), vmul_vd_vd_vd(vd2getx_vd_vd2(x), vd2gety_vd_vd2(x))));\n}\n\nstatic INLINE CONST VECTOR_CC vdouble2 ddmul_vd2_vd2_vd(vdouble2 x, vdouble y) {\n  vdouble s = vmul_vd_vd_vd(vd2getx_vd_vd2(x), y);\n  return vd2setxy_vd2_vd_vd(s, vfma_vd_vd_vd_vd(vd2gety_vd_vd2(x), y, vfmapn_vd_vd_vd_vd(vd2getx_vd_vd2(x), y, s)));\n}\n\nstatic INLINE CONST VECTOR_CC vdouble2 ddrec_vd2_vd(vdouble d) {\n  vdouble s = vrec_vd_vd(d);\n  return vd2setxy_vd2_vd_vd(s, vmul_vd_vd_vd(s, vfmanp_vd_vd_vd_vd(d, s, vcast_vd_d(1))));\n}\n\nstatic INLINE CONST VECTOR_CC vdouble2 ddrec_vd2_vd2(vdouble2 d) {\n  vdouble s = vrec_vd_vd(vd2getx_vd_vd2(d));\n  return vd2setxy_vd2_vd_vd(s, vmul_vd_vd_vd(s, 
vfmanp_vd_vd_vd_vd(vd2gety_vd_vd2(d), s, vfmanp_vd_vd_vd_vd(vd2getx_vd_vd2(d), s, vcast_vd_d(1)))));\n}\n#else\nstatic INLINE CONST VECTOR_CC vdouble2 dddiv_vd2_vd2_vd2(vdouble2 n, vdouble2 d) {\n  vdouble t = vrec_vd_vd(vd2getx_vd_vd2(d));\n  vdouble dh  = vupper_vd_vd(vd2getx_vd_vd2(d)), dl  = vsub_vd_vd_vd(vd2getx_vd_vd2(d),  dh);\n  vdouble th  = vupper_vd_vd(t  ), tl  = vsub_vd_vd_vd(t  ,  th);\n  vdouble nhh = vupper_vd_vd(vd2getx_vd_vd2(n)), nhl = vsub_vd_vd_vd(vd2getx_vd_vd2(n), nhh);\n\n  vdouble s = vmul_vd_vd_vd(vd2getx_vd_vd2(n), t);\n\n  vdouble u = vadd_vd_5vd(vsub_vd_vd_vd(vmul_vd_vd_vd(nhh, th), s), vmul_vd_vd_vd(nhh, tl), vmul_vd_vd_vd(nhl, th), vmul_vd_vd_vd(nhl, tl),\n\t\t    vmul_vd_vd_vd(s, vsub_vd_5vd(vcast_vd_d(1), vmul_vd_vd_vd(dh, th), vmul_vd_vd_vd(dh, tl), vmul_vd_vd_vd(dl, th), vmul_vd_vd_vd(dl, tl))));\n\n  return vd2setxy_vd2_vd_vd(s, vmla_vd_vd_vd_vd(t, vsub_vd_vd_vd(vd2gety_vd_vd2(n), vmul_vd_vd_vd(s, vd2gety_vd_vd2(d))), u));\n}\n\nstatic INLINE CONST VECTOR_CC vdouble2 ddmul_vd2_vd_vd(vdouble x, vdouble y) {\n  vdouble xh = vupper_vd_vd(x), xl = vsub_vd_vd_vd(x, xh);\n  vdouble yh = vupper_vd_vd(y), yl = vsub_vd_vd_vd(y, yh);\n\n  vdouble s = vmul_vd_vd_vd(x, y);\n  return vd2setxy_vd2_vd_vd(s, vadd_vd_5vd(vmul_vd_vd_vd(xh, yh), vneg_vd_vd(s), vmul_vd_vd_vd(xl, yh), vmul_vd_vd_vd(xh, yl), vmul_vd_vd_vd(xl, yl)));\n}\n\nstatic INLINE CONST VECTOR_CC vdouble2 ddmul_vd2_vd2_vd(vdouble2 x, vdouble y) {\n  vdouble xh = vupper_vd_vd(vd2getx_vd_vd2(x)), xl = vsub_vd_vd_vd(vd2getx_vd_vd2(x), xh);\n  vdouble yh = vupper_vd_vd(y  ), yl = vsub_vd_vd_vd(y  , yh);\n\n  vdouble s = vmul_vd_vd_vd(vd2getx_vd_vd2(x), y);\n  return vd2setxy_vd2_vd_vd(s, vadd_vd_6vd(vmul_vd_vd_vd(xh, yh), vneg_vd_vd(s), vmul_vd_vd_vd(xl, yh), vmul_vd_vd_vd(xh, yl), vmul_vd_vd_vd(xl, yl), vmul_vd_vd_vd(vd2gety_vd_vd2(x), y)));\n}\n\nstatic INLINE CONST VECTOR_CC vdouble2 ddmul_vd2_vd2_vd2(vdouble2 x, vdouble2 y) {\n  vdouble xh = vupper_vd_vd(vd2getx_vd_vd2(x)), xl = 
vsub_vd_vd_vd(vd2getx_vd_vd2(x), xh);\n  vdouble yh = vupper_vd_vd(vd2getx_vd_vd2(y)), yl = vsub_vd_vd_vd(vd2getx_vd_vd2(y), yh);\n\n  vdouble s = vmul_vd_vd_vd(vd2getx_vd_vd2(x), vd2getx_vd_vd2(y));\n  return vd2setxy_vd2_vd_vd(s, vadd_vd_7vd(vmul_vd_vd_vd(xh, yh), vneg_vd_vd(s), vmul_vd_vd_vd(xl, yh), vmul_vd_vd_vd(xh, yl), vmul_vd_vd_vd(xl, yl), vmul_vd_vd_vd(vd2getx_vd_vd2(x), vd2gety_vd_vd2(y)), vmul_vd_vd_vd(vd2gety_vd_vd2(x), vd2getx_vd_vd2(y))));\n}\n\nstatic INLINE CONST VECTOR_CC vdouble ddmul_vd_vd2_vd2(vdouble2 x, vdouble2 y) {\n  vdouble xh = vupper_vd_vd(vd2getx_vd_vd2(x)), xl = vsub_vd_vd_vd(vd2getx_vd_vd2(x), xh);\n  vdouble yh = vupper_vd_vd(vd2getx_vd_vd2(y)), yl = vsub_vd_vd_vd(vd2getx_vd_vd2(y), yh);\n\n  return vadd_vd_6vd(vmul_vd_vd_vd(vd2gety_vd_vd2(x), yh), vmul_vd_vd_vd(xh, vd2gety_vd_vd2(y)), vmul_vd_vd_vd(xl, yl), vmul_vd_vd_vd(xh, yl), vmul_vd_vd_vd(xl, yh), vmul_vd_vd_vd(xh, yh));\n}\n\nstatic INLINE CONST VECTOR_CC vdouble2 ddsqu_vd2_vd2(vdouble2 x) {\n  vdouble xh = vupper_vd_vd(vd2getx_vd_vd2(x)), xl = vsub_vd_vd_vd(vd2getx_vd_vd2(x), xh);\n\n  vdouble s = vmul_vd_vd_vd(vd2getx_vd_vd2(x), vd2getx_vd_vd2(x));\n  return vd2setxy_vd2_vd_vd(s, vadd_vd_5vd(vmul_vd_vd_vd(xh, xh), vneg_vd_vd(s), vmul_vd_vd_vd(vadd_vd_vd_vd(xh, xh), xl), vmul_vd_vd_vd(xl, xl), vmul_vd_vd_vd(vd2getx_vd_vd2(x), vadd_vd_vd_vd(vd2gety_vd_vd2(x), vd2gety_vd_vd2(x)))));\n}\n\nstatic INLINE CONST VECTOR_CC vdouble ddsqu_vd_vd2(vdouble2 x) {\n  vdouble xh = vupper_vd_vd(vd2getx_vd_vd2(x)), xl = vsub_vd_vd_vd(vd2getx_vd_vd2(x), xh);\n\n  return vadd_vd_5vd(vmul_vd_vd_vd(xh, vd2gety_vd_vd2(x)), vmul_vd_vd_vd(xh, vd2gety_vd_vd2(x)), vmul_vd_vd_vd(xl, xl), vadd_vd_vd_vd(vmul_vd_vd_vd(xh, xl), vmul_vd_vd_vd(xh, xl)), vmul_vd_vd_vd(xh, xh));\n}\n\nstatic INLINE CONST VECTOR_CC vdouble2 ddrec_vd2_vd(vdouble d) {\n  vdouble t = vrec_vd_vd(d);\n  vdouble dh = vupper_vd_vd(d), dl = vsub_vd_vd_vd(d, dh);\n  vdouble th = vupper_vd_vd(t), tl = vsub_vd_vd_vd(t, th);\n\n  return 
vd2setxy_vd2_vd_vd(t, vmul_vd_vd_vd(t, vsub_vd_5vd(vcast_vd_d(1), vmul_vd_vd_vd(dh, th), vmul_vd_vd_vd(dh, tl), vmul_vd_vd_vd(dl, th), vmul_vd_vd_vd(dl, tl))));\n}\n\nstatic INLINE CONST VECTOR_CC vdouble2 ddrec_vd2_vd2(vdouble2 d) {\n  vdouble t = vrec_vd_vd(vd2getx_vd_vd2(d));\n  vdouble dh = vupper_vd_vd(vd2getx_vd_vd2(d)), dl = vsub_vd_vd_vd(vd2getx_vd_vd2(d), dh);\n  vdouble th = vupper_vd_vd(t  ), tl = vsub_vd_vd_vd(t  , th);\n\n  return vd2setxy_vd2_vd_vd(t, vmul_vd_vd_vd(t, vsub_vd_6vd(vcast_vd_d(1), vmul_vd_vd_vd(dh, th), vmul_vd_vd_vd(dh, tl), vmul_vd_vd_vd(dl, th), vmul_vd_vd_vd(dl, tl), vmul_vd_vd_vd(vd2gety_vd_vd2(d), t))));\n}\n#endif\n\nstatic INLINE CONST VECTOR_CC vdouble2 ddsqrt_vd2_vd2(vdouble2 d) {\n  vdouble t = vsqrt_vd_vd(vadd_vd_vd_vd(vd2getx_vd_vd2(d), vd2gety_vd_vd2(d)));\n  return ddscale_vd2_vd2_vd(ddmul_vd2_vd2_vd2(ddadd2_vd2_vd2_vd2(d, ddmul_vd2_vd_vd(t, t)), ddrec_vd2_vd(t)), vcast_vd_d(0.5));\n}\n\nstatic INLINE CONST VECTOR_CC vdouble2 ddsqrt_vd2_vd(vdouble d) {\n  vdouble t = vsqrt_vd_vd(d);\n  return ddscale_vd2_vd2_vd(ddmul_vd2_vd2_vd2(ddadd2_vd2_vd_vd2(d, ddmul_vd2_vd_vd(t, t)), ddrec_vd2_vd(t)), vcast_vd_d(0.5));\n}\n"
  },
  {
    "path": "src/df.h",
    "content": "//   Copyright Naoki Shibata and contributors 2010 - 2020.\n// Distributed under the Boost Software License, Version 1.0.\n//    (See accompanying file LICENSE.txt or copy at\n//          http://www.boost.org/LICENSE_1_0.txt)\n\n#if !(defined(ENABLE_SVE) || defined(ENABLE_SVENOFMA))\ntypedef struct {\n  vfloat x, y;\n} vfloat2;\n\nstatic vfloat  vf2getx_vf_vf2(vfloat2 v) { return v.x; }\nstatic vfloat  vf2gety_vf_vf2(vfloat2 v) { return v.y; }\nstatic vfloat2 vf2setxy_vf2_vf_vf(vfloat x, vfloat y)  { vfloat2 v; v.x = x; v.y = y; return v; }\nstatic vfloat2 vf2setx_vf2_vf2_vf(vfloat2 v, vfloat d) { v.x = d; return v; }\nstatic vfloat2 vf2sety_vf2_vf2_vf(vfloat2 v, vfloat d) { v.y = d; return v; }\n#endif\n\nstatic INLINE CONST VECTOR_CC vfloat vupper_vf_vf(vfloat d) {\n  return vreinterpret_vf_vi2(vand_vi2_vi2_vi2(vreinterpret_vi2_vf(d), vcast_vi2_i(0xfffff000)));\n}\n\nstatic INLINE CONST VECTOR_CC vfloat2 vcast_vf2_vf_vf(vfloat h, vfloat l) {\n  return vf2setxy_vf2_vf_vf(h, l);\n}\n\nstatic INLINE CONST VECTOR_CC vfloat2 vcast_vf2_f_f(float h, float l) {\n  return vf2setxy_vf2_vf_vf(vcast_vf_f(h), vcast_vf_f(l));\n}\n\nstatic INLINE CONST VECTOR_CC vfloat2 vcast_vf2_d(double d) {\n  return vf2setxy_vf2_vf_vf(vcast_vf_f(d), vcast_vf_f(d - (float)d));\n}\n\nstatic INLINE CONST VECTOR_CC vfloat2 vsel_vf2_vo_vf2_vf2(vopmask m, vfloat2 x, vfloat2 y) {\n  return vf2setxy_vf2_vf_vf(vsel_vf_vo_vf_vf(m, vf2getx_vf_vf2(x), vf2getx_vf_vf2(y)), vsel_vf_vo_vf_vf(m, vf2gety_vf_vf2(x), vf2gety_vf_vf2(y)));\n}\n\nstatic INLINE CONST VECTOR_CC vfloat2 vsel_vf2_vo_f_f_f_f(vopmask o, float x1, float y1, float x0, float y0) {\n  return vf2setxy_vf2_vf_vf(vsel_vf_vo_f_f(o, x1, x0), vsel_vf_vo_f_f(o, y1, y0));\n}\n\nstatic INLINE CONST VECTOR_CC vfloat2 vsel_vf2_vo_vo_d_d_d(vopmask o0, vopmask o1, double d0, double d1, double d2) {\n  return vsel_vf2_vo_vf2_vf2(o0, vcast_vf2_d(d0), vsel_vf2_vo_vf2_vf2(o1, vcast_vf2_d(d1), vcast_vf2_d(d2)));\n}\n\nstatic INLINE CONST 
VECTOR_CC vfloat2 vsel_vf2_vo_vo_vo_d_d_d_d(vopmask o0, vopmask o1, vopmask o2, double d0, double d1, double d2, double d3) {\n  return vsel_vf2_vo_vf2_vf2(o0, vcast_vf2_d(d0), vsel_vf2_vo_vf2_vf2(o1, vcast_vf2_d(d1), vsel_vf2_vo_vf2_vf2(o2, vcast_vf2_d(d2), vcast_vf2_d(d3))));\n}\n\nstatic INLINE CONST VECTOR_CC vfloat2 vabs_vf2_vf2(vfloat2 x) {\n  return vcast_vf2_vf_vf(vreinterpret_vf_vm(vxor_vm_vm_vm(vand_vm_vm_vm(vreinterpret_vm_vf(vcast_vf_f(-0.0)), vreinterpret_vm_vf(vf2getx_vf_vf2(x))), vreinterpret_vm_vf(vf2getx_vf_vf2(x)))),\n\t\t\t vreinterpret_vf_vm(vxor_vm_vm_vm(vand_vm_vm_vm(vreinterpret_vm_vf(vcast_vf_f(-0.0)), vreinterpret_vm_vf(vf2getx_vf_vf2(x))), vreinterpret_vm_vf(vf2gety_vf_vf2(x)))));\n}\n\nstatic INLINE CONST VECTOR_CC vfloat vadd_vf_3vf(vfloat v0, vfloat v1, vfloat v2) {\n  return vadd_vf_vf_vf(vadd_vf_vf_vf(v0, v1), v2);\n}\n\nstatic INLINE CONST VECTOR_CC vfloat vadd_vf_4vf(vfloat v0, vfloat v1, vfloat v2, vfloat v3) {\n  return vadd_vf_3vf(vadd_vf_vf_vf(v0, v1), v2, v3);\n}\n\nstatic INLINE CONST VECTOR_CC vfloat vadd_vf_5vf(vfloat v0, vfloat v1, vfloat v2, vfloat v3, vfloat v4) {\n  return vadd_vf_4vf(vadd_vf_vf_vf(v0, v1), v2, v3, v4);\n}\n\nstatic INLINE CONST VECTOR_CC vfloat vadd_vf_6vf(vfloat v0, vfloat v1, vfloat v2, vfloat v3, vfloat v4, vfloat v5) {\n  return vadd_vf_5vf(vadd_vf_vf_vf(v0, v1), v2, v3, v4, v5);\n}\n\nstatic INLINE CONST VECTOR_CC vfloat vadd_vf_7vf(vfloat v0, vfloat v1, vfloat v2, vfloat v3, vfloat v4, vfloat v5, vfloat v6) {\n  return vadd_vf_6vf(vadd_vf_vf_vf(v0, v1), v2, v3, v4, v5, v6);\n}\n\nstatic INLINE CONST VECTOR_CC vfloat vsub_vf_3vf(vfloat v0, vfloat v1, vfloat v2) {\n  return vsub_vf_vf_vf(vsub_vf_vf_vf(v0, v1), v2);\n}\n\nstatic INLINE CONST VECTOR_CC vfloat vsub_vf_4vf(vfloat v0, vfloat v1, vfloat v2, vfloat v3) {\n  return vsub_vf_3vf(vsub_vf_vf_vf(v0, v1), v2, v3);\n}\n\nstatic INLINE CONST VECTOR_CC vfloat vsub_vf_5vf(vfloat v0, vfloat v1, vfloat v2, vfloat v3, vfloat v4) {\n  return 
vsub_vf_4vf(vsub_vf_vf_vf(v0, v1), v2, v3, v4);\n}\n\n//\n\nstatic INLINE CONST VECTOR_CC vfloat2 dfneg_vf2_vf2(vfloat2 x) {\n  return vcast_vf2_vf_vf(vneg_vf_vf(vf2getx_vf_vf2(x)), vneg_vf_vf(vf2gety_vf_vf2(x)));\n}\n\nstatic INLINE CONST VECTOR_CC vfloat2 dfabs_vf2_vf2(vfloat2 x) {\n  return vcast_vf2_vf_vf(vabs_vf_vf(vf2getx_vf_vf2(x)),\n\t\t\t vreinterpret_vf_vm(vxor_vm_vm_vm(vreinterpret_vm_vf(vf2gety_vf_vf2(x)), vand_vm_vm_vm(vreinterpret_vm_vf(vf2getx_vf_vf2(x)), vreinterpret_vm_vf(vcast_vf_f(-0.0f))))));\n}\n\nstatic INLINE CONST VECTOR_CC vfloat2 dfnormalize_vf2_vf2(vfloat2 t) {\n  vfloat s = vadd_vf_vf_vf(vf2getx_vf_vf2(t), vf2gety_vf_vf2(t));\n  return vf2setxy_vf2_vf_vf(s, vadd_vf_vf_vf(vsub_vf_vf_vf(vf2getx_vf_vf2(t), s), vf2gety_vf_vf2(t)));\n}\n\nstatic INLINE CONST VECTOR_CC vfloat2 dfscale_vf2_vf2_vf(vfloat2 d, vfloat s) {\n  return vf2setxy_vf2_vf_vf(vmul_vf_vf_vf(vf2getx_vf_vf2(d), s), vmul_vf_vf_vf(vf2gety_vf_vf2(d), s));\n}\n\nstatic INLINE CONST VECTOR_CC vfloat2 dfadd_vf2_vf_vf(vfloat x, vfloat y) {\n  vfloat s = vadd_vf_vf_vf(x, y);\n  return vf2setxy_vf2_vf_vf(s, vadd_vf_vf_vf(vsub_vf_vf_vf(x, s), y));\n}\n\nstatic INLINE CONST VECTOR_CC vfloat2 dfadd2_vf2_vf_vf(vfloat x, vfloat y) {\n  vfloat s = vadd_vf_vf_vf(x, y);\n  vfloat v = vsub_vf_vf_vf(s, x);\n  return vf2setxy_vf2_vf_vf(s, vadd_vf_vf_vf(vsub_vf_vf_vf(x, vsub_vf_vf_vf(s, v)), vsub_vf_vf_vf(y, v)));\n}\n\nstatic INLINE CONST VECTOR_CC vfloat2 dfadd2_vf2_vf_vf2(vfloat x, vfloat2 y) {\n  vfloat s = vadd_vf_vf_vf(x, vf2getx_vf_vf2(y));\n  vfloat v = vsub_vf_vf_vf(s, x);\n  return vf2setxy_vf2_vf_vf(s, vadd_vf_vf_vf(vadd_vf_vf_vf(vsub_vf_vf_vf(x, vsub_vf_vf_vf(s, v)), vsub_vf_vf_vf(vf2getx_vf_vf2(y), v)), vf2gety_vf_vf2(y)));\n\n}\n\nstatic INLINE CONST VECTOR_CC vfloat2 dfadd_vf2_vf2_vf(vfloat2 x, vfloat y) {\n  vfloat s = vadd_vf_vf_vf(vf2getx_vf_vf2(x), y);\n  return vf2setxy_vf2_vf_vf(s, vadd_vf_3vf(vsub_vf_vf_vf(vf2getx_vf_vf2(x), s), y, vf2gety_vf_vf2(x)));\n}\n\nstatic INLINE 
CONST VECTOR_CC vfloat2 dfsub_vf2_vf2_vf(vfloat2 x, vfloat y) {\n  vfloat s = vsub_vf_vf_vf(vf2getx_vf_vf2(x), y);\n  return vf2setxy_vf2_vf_vf(s, vadd_vf_vf_vf(vsub_vf_vf_vf(vsub_vf_vf_vf(vf2getx_vf_vf2(x), s), y), vf2gety_vf_vf2(x)));\n}\n\nstatic INLINE CONST VECTOR_CC vfloat2 dfadd2_vf2_vf2_vf(vfloat2 x, vfloat y) {\n  vfloat s = vadd_vf_vf_vf(vf2getx_vf_vf2(x), y);\n  vfloat v = vsub_vf_vf_vf(s, vf2getx_vf_vf2(x));\n  vfloat t = vadd_vf_vf_vf(vsub_vf_vf_vf(vf2getx_vf_vf2(x), vsub_vf_vf_vf(s, v)), vsub_vf_vf_vf(y, v));\n  return vf2setxy_vf2_vf_vf(s, vadd_vf_vf_vf(t, vf2gety_vf_vf2(x)));\n}\n\nstatic INLINE CONST VECTOR_CC vfloat2 dfadd_vf2_vf_vf2(vfloat x, vfloat2 y) {\n  vfloat s = vadd_vf_vf_vf(x, vf2getx_vf_vf2(y));\n  return vf2setxy_vf2_vf_vf(s, vadd_vf_3vf(vsub_vf_vf_vf(x, s), vf2getx_vf_vf2(y), vf2gety_vf_vf2(y)));\n}\n\nstatic INLINE CONST VECTOR_CC vfloat2 dfadd_vf2_vf2_vf2(vfloat2 x, vfloat2 y) {\n  // |x| >= |y|\n\n  vfloat s = vadd_vf_vf_vf(vf2getx_vf_vf2(x), vf2getx_vf_vf2(y));\n  return vf2setxy_vf2_vf_vf(s, vadd_vf_4vf(vsub_vf_vf_vf(vf2getx_vf_vf2(x), s), vf2getx_vf_vf2(y), vf2gety_vf_vf2(x), vf2gety_vf_vf2(y)));\n}\n\nstatic INLINE CONST VECTOR_CC vfloat2 dfadd2_vf2_vf2_vf2(vfloat2 x, vfloat2 y) {\n  vfloat s = vadd_vf_vf_vf(vf2getx_vf_vf2(x), vf2getx_vf_vf2(y));\n  vfloat v = vsub_vf_vf_vf(s, vf2getx_vf_vf2(x));\n  vfloat t = vadd_vf_vf_vf(vsub_vf_vf_vf(vf2getx_vf_vf2(x), vsub_vf_vf_vf(s, v)), vsub_vf_vf_vf(vf2getx_vf_vf2(y), v));\n  return vf2setxy_vf2_vf_vf(s, vadd_vf_vf_vf(t, vadd_vf_vf_vf(vf2gety_vf_vf2(x), vf2gety_vf_vf2(y))));\n}\n\nstatic INLINE CONST VECTOR_CC vfloat2 dfsub_vf2_vf_vf(vfloat x, vfloat y) {\n  // |x| >= |y|\n\n  vfloat s = vsub_vf_vf_vf(x, y);\n  return vf2setxy_vf2_vf_vf(s, vsub_vf_vf_vf(vsub_vf_vf_vf(x, s), y));\n}\n\nstatic INLINE CONST VECTOR_CC vfloat2 dfsub_vf2_vf2_vf2(vfloat2 x, vfloat2 y) {\n  // |x| >= |y|\n\n  vfloat s = vsub_vf_vf_vf(vf2getx_vf_vf2(x), vf2getx_vf_vf2(y));\n  vfloat t = 
vsub_vf_vf_vf(vf2getx_vf_vf2(x), s);\n  t = vsub_vf_vf_vf(t, vf2getx_vf_vf2(y));\n  t = vadd_vf_vf_vf(t, vf2gety_vf_vf2(x));\n  return vf2setxy_vf2_vf_vf(s, vsub_vf_vf_vf(t, vf2gety_vf_vf2(y)));\n}\n\n#ifdef ENABLE_FMA_SP\nstatic INLINE CONST VECTOR_CC vfloat2 dfdiv_vf2_vf2_vf2(vfloat2 n, vfloat2 d) {\n  vfloat t = vrec_vf_vf(vf2getx_vf_vf2(d));\n  vfloat s = vmul_vf_vf_vf(vf2getx_vf_vf2(n), t);\n  vfloat u = vfmapn_vf_vf_vf_vf(t, vf2getx_vf_vf2(n), s);\n  vfloat v = vfmanp_vf_vf_vf_vf(vf2gety_vf_vf2(d), t, vfmanp_vf_vf_vf_vf(vf2getx_vf_vf2(d), t, vcast_vf_f(1)));\n  return vf2setxy_vf2_vf_vf(s, vfma_vf_vf_vf_vf(s, v, vfma_vf_vf_vf_vf(vf2gety_vf_vf2(n), t, u)));\n}\n\nstatic INLINE CONST VECTOR_CC vfloat2 dfmul_vf2_vf_vf(vfloat x, vfloat y) {\n  vfloat s = vmul_vf_vf_vf(x, y);\n  return vf2setxy_vf2_vf_vf(s, vfmapn_vf_vf_vf_vf(x, y, s));\n}\n\nstatic INLINE CONST VECTOR_CC vfloat2 dfsqu_vf2_vf2(vfloat2 x) {\n  vfloat s = vmul_vf_vf_vf(vf2getx_vf_vf2(x), vf2getx_vf_vf2(x));\n  return vf2setxy_vf2_vf_vf(s, vfma_vf_vf_vf_vf(vadd_vf_vf_vf(vf2getx_vf_vf2(x), vf2getx_vf_vf2(x)), vf2gety_vf_vf2(x), vfmapn_vf_vf_vf_vf(vf2getx_vf_vf2(x), vf2getx_vf_vf2(x), s)));\n}\n\nstatic INLINE CONST VECTOR_CC vfloat dfsqu_vf_vf2(vfloat2 x) {\n  return vfma_vf_vf_vf_vf(vf2getx_vf_vf2(x), vf2getx_vf_vf2(x), vadd_vf_vf_vf(vmul_vf_vf_vf(vf2getx_vf_vf2(x), vf2gety_vf_vf2(x)), vmul_vf_vf_vf(vf2getx_vf_vf2(x), vf2gety_vf_vf2(x))));\n}\n\nstatic INLINE CONST VECTOR_CC vfloat2 dfmul_vf2_vf2_vf2(vfloat2 x, vfloat2 y) {\n  vfloat s = vmul_vf_vf_vf(vf2getx_vf_vf2(x), vf2getx_vf_vf2(y));\n  return vf2setxy_vf2_vf_vf(s, vfma_vf_vf_vf_vf(vf2getx_vf_vf2(x), vf2gety_vf_vf2(y), vfma_vf_vf_vf_vf(vf2gety_vf_vf2(x), vf2getx_vf_vf2(y), vfmapn_vf_vf_vf_vf(vf2getx_vf_vf2(x), vf2getx_vf_vf2(y), s))));\n}\n\nstatic INLINE CONST VECTOR_CC vfloat dfmul_vf_vf2_vf2(vfloat2 x, vfloat2 y) {\n  return vfma_vf_vf_vf_vf(vf2getx_vf_vf2(x), vf2getx_vf_vf2(y), vfma_vf_vf_vf_vf(vf2gety_vf_vf2(x), vf2getx_vf_vf2(y), 
vmul_vf_vf_vf(vf2getx_vf_vf2(x), vf2gety_vf_vf2(y))));\n}\n\nstatic INLINE CONST VECTOR_CC vfloat2 dfmul_vf2_vf2_vf(vfloat2 x, vfloat y) {\n  vfloat s = vmul_vf_vf_vf(vf2getx_vf_vf2(x), y);\n  return vf2setxy_vf2_vf_vf(s, vfma_vf_vf_vf_vf(vf2gety_vf_vf2(x), y, vfmapn_vf_vf_vf_vf(vf2getx_vf_vf2(x), y, s)));\n}\n\nstatic INLINE CONST VECTOR_CC vfloat2 dfrec_vf2_vf(vfloat d) {\n  vfloat s = vrec_vf_vf(d);\n  return vf2setxy_vf2_vf_vf(s, vmul_vf_vf_vf(s, vfmanp_vf_vf_vf_vf(d, s, vcast_vf_f(1))));\n}\n\nstatic INLINE CONST VECTOR_CC vfloat2 dfrec_vf2_vf2(vfloat2 d) {\n  vfloat s = vrec_vf_vf(vf2getx_vf_vf2(d));\n  return vf2setxy_vf2_vf_vf(s, vmul_vf_vf_vf(s, vfmanp_vf_vf_vf_vf(vf2gety_vf_vf2(d), s, vfmanp_vf_vf_vf_vf(vf2getx_vf_vf2(d), s, vcast_vf_f(1)))));\n}\n#else\nstatic INLINE CONST VECTOR_CC vfloat2 dfdiv_vf2_vf2_vf2(vfloat2 n, vfloat2 d) {\n  vfloat t = vrec_vf_vf(vf2getx_vf_vf2(d));\n  vfloat dh  = vupper_vf_vf(vf2getx_vf_vf2(d)), dl  = vsub_vf_vf_vf(vf2getx_vf_vf2(d),  dh);\n  vfloat th  = vupper_vf_vf(t  ), tl  = vsub_vf_vf_vf(t  ,  th);\n  vfloat nhh = vupper_vf_vf(vf2getx_vf_vf2(n)), nhl = vsub_vf_vf_vf(vf2getx_vf_vf2(n), nhh);\n\n  vfloat s = vmul_vf_vf_vf(vf2getx_vf_vf2(n), t);\n\n  vfloat u, w;\n  w = vcast_vf_f(-1);\n  w = vmla_vf_vf_vf_vf(dh, th, w);\n  w = vmla_vf_vf_vf_vf(dh, tl, w);\n  w = vmla_vf_vf_vf_vf(dl, th, w);\n  w = vmla_vf_vf_vf_vf(dl, tl, w);\n  w = vneg_vf_vf(w);\n\n  u = vmla_vf_vf_vf_vf(nhh, th, vneg_vf_vf(s));\n  u = vmla_vf_vf_vf_vf(nhh, tl, u);\n  u = vmla_vf_vf_vf_vf(nhl, th, u);\n  u = vmla_vf_vf_vf_vf(nhl, tl, u);\n  u = vmla_vf_vf_vf_vf(s, w, u);\n\n  return vf2setxy_vf2_vf_vf(s, vmla_vf_vf_vf_vf(t, vsub_vf_vf_vf(vf2gety_vf_vf2(n), vmul_vf_vf_vf(s, vf2gety_vf_vf2(d))), u));\n}\n\nstatic INLINE CONST VECTOR_CC vfloat2 dfmul_vf2_vf_vf(vfloat x, vfloat y) {\n  vfloat xh = vupper_vf_vf(x), xl = vsub_vf_vf_vf(x, xh);\n  vfloat yh = vupper_vf_vf(y), yl = vsub_vf_vf_vf(y, yh);\n\n  vfloat s = vmul_vf_vf_vf(x, y), t;\n\n  t = 
vmla_vf_vf_vf_vf(xh, yh, vneg_vf_vf(s));\n  t = vmla_vf_vf_vf_vf(xl, yh, t);\n  t = vmla_vf_vf_vf_vf(xh, yl, t);\n  t = vmla_vf_vf_vf_vf(xl, yl, t);\n\n  return vf2setxy_vf2_vf_vf(s, t);\n}\n\nstatic INLINE CONST VECTOR_CC vfloat2 dfmul_vf2_vf2_vf(vfloat2 x, vfloat y) {\n  vfloat xh = vupper_vf_vf(vf2getx_vf_vf2(x)), xl = vsub_vf_vf_vf(vf2getx_vf_vf2(x), xh);\n  vfloat yh = vupper_vf_vf(y  ), yl = vsub_vf_vf_vf(y  , yh);\n\n  vfloat s = vmul_vf_vf_vf(vf2getx_vf_vf2(x), y), t;\n\n  t = vmla_vf_vf_vf_vf(xh, yh, vneg_vf_vf(s));\n  t = vmla_vf_vf_vf_vf(xl, yh, t);\n  t = vmla_vf_vf_vf_vf(xh, yl, t);\n  t = vmla_vf_vf_vf_vf(xl, yl, t);\n  t = vmla_vf_vf_vf_vf(vf2gety_vf_vf2(x), y, t);\n\n  return vf2setxy_vf2_vf_vf(s, t);\n}\n\nstatic INLINE CONST VECTOR_CC vfloat2 dfmul_vf2_vf2_vf2(vfloat2 x, vfloat2 y) {\n  vfloat xh = vupper_vf_vf(vf2getx_vf_vf2(x)), xl = vsub_vf_vf_vf(vf2getx_vf_vf2(x), xh);\n  vfloat yh = vupper_vf_vf(vf2getx_vf_vf2(y)), yl = vsub_vf_vf_vf(vf2getx_vf_vf2(y), yh);\n\n  vfloat s = vmul_vf_vf_vf(vf2getx_vf_vf2(x), vf2getx_vf_vf2(y)), t;\n\n  t = vmla_vf_vf_vf_vf(xh, yh, vneg_vf_vf(s));\n  t = vmla_vf_vf_vf_vf(xl, yh, t);\n  t = vmla_vf_vf_vf_vf(xh, yl, t);\n  t = vmla_vf_vf_vf_vf(xl, yl, t);\n  t = vmla_vf_vf_vf_vf(vf2getx_vf_vf2(x), vf2gety_vf_vf2(y), t);\n  t = vmla_vf_vf_vf_vf(vf2gety_vf_vf2(x), vf2getx_vf_vf2(y), t);\n\n  return vf2setxy_vf2_vf_vf(s, t);\n}\n\nstatic INLINE CONST VECTOR_CC vfloat dfmul_vf_vf2_vf2(vfloat2 x, vfloat2 y) {\n  vfloat xh = vupper_vf_vf(vf2getx_vf_vf2(x)), xl = vsub_vf_vf_vf(vf2getx_vf_vf2(x), xh);\n  vfloat yh = vupper_vf_vf(vf2getx_vf_vf2(y)), yl = vsub_vf_vf_vf(vf2getx_vf_vf2(y), yh);\n\n  return vadd_vf_6vf(vmul_vf_vf_vf(vf2gety_vf_vf2(x), yh), vmul_vf_vf_vf(xh, vf2gety_vf_vf2(y)), vmul_vf_vf_vf(xl, yl), vmul_vf_vf_vf(xh, yl), vmul_vf_vf_vf(xl, yh), vmul_vf_vf_vf(xh, yh));\n}\n\nstatic INLINE CONST VECTOR_CC vfloat2 dfsqu_vf2_vf2(vfloat2 x) {\n  vfloat xh = vupper_vf_vf(vf2getx_vf_vf2(x)), xl = 
vsub_vf_vf_vf(vf2getx_vf_vf2(x), xh);\n\n  vfloat s = vmul_vf_vf_vf(vf2getx_vf_vf2(x), vf2getx_vf_vf2(x)), t;\n\n  t = vmla_vf_vf_vf_vf(xh, xh, vneg_vf_vf(s));\n  t = vmla_vf_vf_vf_vf(vadd_vf_vf_vf(xh, xh), xl, t);\n  t = vmla_vf_vf_vf_vf(xl, xl, t);\n  t = vmla_vf_vf_vf_vf(vf2getx_vf_vf2(x), vadd_vf_vf_vf(vf2gety_vf_vf2(x), vf2gety_vf_vf2(x)), t);\n\n  return vf2setxy_vf2_vf_vf(s, t);\n}\n\nstatic INLINE CONST VECTOR_CC vfloat dfsqu_vf_vf2(vfloat2 x) {\n  vfloat xh = vupper_vf_vf(vf2getx_vf_vf2(x)), xl = vsub_vf_vf_vf(vf2getx_vf_vf2(x), xh);\n\n  return vadd_vf_5vf(vmul_vf_vf_vf(xh, vf2gety_vf_vf2(x)), vmul_vf_vf_vf(xh, vf2gety_vf_vf2(x)), vmul_vf_vf_vf(xl, xl), vadd_vf_vf_vf(vmul_vf_vf_vf(xh, xl), vmul_vf_vf_vf(xh, xl)), vmul_vf_vf_vf(xh, xh));\n}\n\nstatic INLINE CONST VECTOR_CC vfloat2 dfrec_vf2_vf(vfloat d) {\n  vfloat t = vrec_vf_vf(d);\n  vfloat dh = vupper_vf_vf(d), dl = vsub_vf_vf_vf(d, dh);\n  vfloat th = vupper_vf_vf(t), tl = vsub_vf_vf_vf(t, th);\n\n  vfloat u = vcast_vf_f(-1);\n  u = vmla_vf_vf_vf_vf(dh, th, u);\n  u = vmla_vf_vf_vf_vf(dh, tl, u);\n  u = vmla_vf_vf_vf_vf(dl, th, u);\n  u = vmla_vf_vf_vf_vf(dl, tl, u);\n\n  return vf2setxy_vf2_vf_vf(t, vmul_vf_vf_vf(vneg_vf_vf(t), u));\n}\n\nstatic INLINE CONST VECTOR_CC vfloat2 dfrec_vf2_vf2(vfloat2 d) {\n  vfloat t = vrec_vf_vf(vf2getx_vf_vf2(d));\n  vfloat dh = vupper_vf_vf(vf2getx_vf_vf2(d)), dl = vsub_vf_vf_vf(vf2getx_vf_vf2(d), dh);\n  vfloat th = vupper_vf_vf(t  ), tl = vsub_vf_vf_vf(t  , th);\n\n  vfloat u = vcast_vf_f(-1);\n  u = vmla_vf_vf_vf_vf(dh, th, u);\n  u = vmla_vf_vf_vf_vf(dh, tl, u);\n  u = vmla_vf_vf_vf_vf(dl, th, u);\n  u = vmla_vf_vf_vf_vf(dl, tl, u);\n  u = vmla_vf_vf_vf_vf(vf2gety_vf_vf2(d), t, u);\n\n  return vf2setxy_vf2_vf_vf(t, vmul_vf_vf_vf(vneg_vf_vf(t), u));\n}\n#endif\n\nstatic INLINE CONST VECTOR_CC vfloat2 dfsqrt_vf2_vf2(vfloat2 d) {\n#ifdef ENABLE_RECSQRT_SP\n  vfloat x = vrecsqrt_vf_vf(vadd_vf_vf_vf(vf2getx_vf_vf2(d), vf2gety_vf_vf2(d)));\n  vfloat2 r = 
dfmul_vf2_vf2_vf(d, x);\n  return dfscale_vf2_vf2_vf(dfmul_vf2_vf2_vf2(r, dfadd2_vf2_vf2_vf(dfmul_vf2_vf2_vf(r, x), vcast_vf_f(-3.0))), vcast_vf_f(-0.5));\n#else\n  vfloat t = vsqrt_vf_vf(vadd_vf_vf_vf(vf2getx_vf_vf2(d), vf2gety_vf_vf2(d)));\n  return dfscale_vf2_vf2_vf(dfmul_vf2_vf2_vf2(dfadd2_vf2_vf2_vf2(d, dfmul_vf2_vf_vf(t, t)), dfrec_vf2_vf(t)), vcast_vf_f(0.5));\n#endif\n}\n\nstatic INLINE CONST VECTOR_CC vfloat2 dfsqrt_vf2_vf(vfloat d) {\n  vfloat t = vsqrt_vf_vf(d);\n  return dfscale_vf2_vf2_vf(dfmul_vf2_vf2_vf2(dfadd2_vf2_vf_vf2(d, dfmul_vf2_vf_vf(t, t)), dfrec_vf2_vf(t)), vcast_vf_f(0.5f));\n}\n"
  },
  {
    "path": "src/estrin.h",
    "content": "//   Copyright Naoki Shibata and contributors 2010 - 2020.\n// Distributed under the Boost Software License, Version 1.0.\n//    (See accompanying file LICENSE.txt or copy at\n//          http://www.boost.org/LICENSE_1_0.txt)\n\n// These are macros for evaluating polynomials using Estrin's method\n\n#define POLY2(x, c1, c0) MLA(x, C2V(c1), C2V(c0))\n#define POLY3(x, x2, c2, c1, c0) MLA(x2, C2V(c2), MLA(x, C2V(c1), C2V(c0)))\n#define POLY4(x, x2, c3, c2, c1, c0) MLA(x2, MLA(x, C2V(c3), C2V(c2)), MLA(x, C2V(c1), C2V(c0)))\n#define POLY5(x, x2, x4, c4, c3, c2, c1, c0) MLA(x4, C2V(c4), POLY4(x, x2, c3, c2, c1, c0))\n#define POLY6(x, x2, x4, c5, c4, c3, c2, c1, c0) MLA(x4, POLY2(x, c5, c4), POLY4(x, x2, c3, c2, c1, c0))\n#define POLY7(x, x2, x4, c6, c5, c4, c3, c2, c1, c0) MLA(x4, POLY3(x, x2, c6, c5, c4), POLY4(x, x2, c3, c2, c1, c0))\n#define POLY8(x, x2, x4, c7, c6, c5, c4, c3, c2, c1, c0) MLA(x4, POLY4(x, x2, c7, c6, c5, c4), POLY4(x, x2, c3, c2, c1, c0))\n#define POLY9(x, x2, x4, x8, c8, c7, c6, c5, c4, c3, c2, c1, c0)\\\n  MLA(x8, C2V(c8), POLY8(x, x2, x4, c7, c6, c5, c4, c3, c2, c1, c0))\n#define POLY10(x, x2, x4, x8, c9, c8, c7, c6, c5, c4, c3, c2, c1, c0)\\\n  MLA(x8, POLY2(x, c9, c8), POLY8(x, x2, x4, c7, c6, c5, c4, c3, c2, c1, c0))\n#define POLY11(x, x2, x4, x8, ca, c9, c8, c7, c6, c5, c4, c3, c2, c1, c0)\\\n  MLA(x8, POLY3(x, x2, ca, c9, c8), POLY8(x, x2, x4, c7, c6, c5, c4, c3, c2, c1, c0))\n#define POLY12(x, x2, x4, x8, cb, ca, c9, c8, c7, c6, c5, c4, c3, c2, c1, c0)\\\n  MLA(x8, POLY4(x, x2, cb, ca, c9, c8), POLY8(x, x2, x4, c7, c6, c5, c4, c3, c2, c1, c0))\n#define POLY13(x, x2, x4, x8, cc, cb, ca, c9, c8, c7, c6, c5, c4, c3, c2, c1, c0)\\\n  MLA(x8, POLY5(x, x2, x4, cc, cb, ca, c9, c8), POLY8(x, x2, x4, c7, c6, c5, c4, c3, c2, c1, c0))\n#define POLY14(x, x2, x4, x8, cd, cc, cb, ca, c9, c8, c7, c6, c5, c4, c3, c2, c1, c0)\\\n  MLA(x8, POLY6(x, x2, x4, cd, cc, cb, ca, c9, c8), POLY8(x, x2, x4, c7, c6, c5, c4, c3, c2, c1, c0))\n#define 
POLY15(x, x2, x4, x8, ce, cd, cc, cb, ca, c9, c8, c7, c6, c5, c4, c3, c2, c1, c0)\\\n  MLA(x8, POLY7(x, x2, x4, ce, cd, cc, cb, ca, c9, c8), POLY8(x, x2, x4, c7, c6, c5, c4, c3, c2, c1, c0))\n#define POLY16(x, x2, x4, x8, cf, ce, cd, cc, cb, ca, c9, c8, c7, c6, c5, c4, c3, c2, c1, c0)\\\n  MLA(x8, POLY8(x, x2, x4, cf, ce, cd, cc, cb, ca, c9, c8), POLY8(x, x2, x4, c7, c6, c5, c4, c3, c2, c1, c0))\n#define POLY17(x, x2, x4, x8, x16, d0, cf, ce, cd, cc, cb, ca, c9, c8, c7, c6, c5, c4, c3, c2, c1, c0)\\\n  MLA(x16, C2V(d0), POLY16(x, x2, x4, x8, cf, ce, cd, cc, cb, ca, c9, c8, c7, c6, c5, c4, c3, c2, c1, c0))\n#define POLY18(x, x2, x4, x8, x16, d1, d0, cf, ce, cd, cc, cb, ca, c9, c8, c7, c6, c5, c4, c3, c2, c1, c0)\\\n  MLA(x16, POLY2(x, d1, d0), POLY16(x, x2, x4, x8, cf, ce, cd, cc, cb, ca, c9, c8, c7, c6, c5, c4, c3, c2, c1, c0))\n#define POLY19(x, x2, x4, x8, x16, d2, d1, d0, cf, ce, cd, cc, cb, ca, c9, c8, c7, c6, c5, c4, c3, c2, c1, c0)\\\n  MLA(x16, POLY3(x, x2, d2, d1, d0), POLY16(x, x2, x4, x8, cf, ce, cd, cc, cb, ca, c9, c8, c7, c6, c5, c4, c3, c2, c1, c0))\n"
  },
  {
    "path": "src/fp16.cpp",
    "content": "/*\n\nCopyright (c) 2021 Agenium Scale\n\nPermission is hereby granted, free of charge, to any person obtaining a copy\nof this software and associated documentation files (the \"Software\"), to deal\nin the Software without restriction, including without limitation the rights\nto use, copy, modify, merge, publish, distribute, sublicense, and/or sell\ncopies of the Software, and to permit persons to whom the Software is\nfurnished to do so, subject to the following conditions:\n\nThe above copyright notice and this permission notice shall be included in all\ncopies or substantial portions of the Software.\n\nTHE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\nIMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\nFITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\nAUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\nLIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\nOUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\nSOFTWARE.\n\n*/\n\n/*\n\nWe follow IEEE754-2008 for FP16 (= binary16) storage.\nHowever IEEE754 compliance is not guaranteed by C/C++ standards\nand therefore we propose two modes:\n\n- IEEE754 mode with NaNs, INFs, ... 
(this is the default)\n- non IEEE754 mode compatible with only C89 (no NaNs, INFs...)\n\nFP16 format\n-----------\n\n    +---+--------+--------------+\n    | S | E EEEE | MM MMMM MMMM |\n    +---+--------+--------------+\n     15  14   10   9          0\n\nFP16 interpretation\n-------------------\n\nS = sign bit\nE = exponent bits (offset is 15), emin = -14, emax = 15\nM = mantissa bits\n\nE == 0 and M != 0 => subnormal => (-1)^S x 2^(-14) x (0 + 2^(-10) x T)\n32 > E >  0       =>    normal => (-1)^S x 2^(E - 15) x (1 + 2^(-10) x T)\n\nFP32 format\n-----------\n\n    +---+-----------+------------------------------+\n    | S | EEEE EEEE | MMM MMMM MMMM MMMM MMMM MMMM |\n    +---+-----------+------------------------------+\n     31  30      23  22                          0\n\nFP32 interpretation\n-------------------\n\nS = sign bit\nE = exponent bits (offset is 127), emin = -126, emax = 127\nM = mantissa bits\n\nE == 0 and M != 0 => subnormal => (-1)^S x 2^(-126) x (0 + 2^(-23) x T)\n256 > E > 0       =>    normal => (-1)^S x 2^(E - 127) x (1 + 2^(-23) x T)\n\nIn both cases we treat subnormal numbers as zeros. Moreover the\nimplementation below was written so that it can easily be SIMD'ed.\n\n*/\n\n#define NSIMD_INSIDE\n#include <nsimd/nsimd.h>\n\n#ifdef NSIMD_NO_IEEE754\n  #include <cmath>\n#endif\n#include <algorithm>\n\n#ifdef NSIMD_C_LINKAGE_FOR_F16\nextern \"C\" {\n#endif\n\n// ----------------------------------------------------------------------------\n// Convert a FP16 as an u16 to a float\n\nNSIMD_DLLEXPORT float nsimd_u16_to_f32(u16 a) {\n#ifdef NSIMD_NO_IEEE754\n  float sign;\n  int exponent, mantissa;\n\n  sign = (a >> 15) == 1 ? 
-1.0f : 1.0f;\n  exponent = (a >> 10) & 0x1F;\n  mantissa = (float)(a & 0x3FF);\n\n  if (exponent == 0) {\n    return std::ldexp(sign * mantissa, -24);\n  } else {\n    return std::ldexp(sign * (0x400 | mantissa), exponent - 25);\n  }\n#else\n  u32 sign, mantissa, exponent;\n\n  sign = a & 0x8000;\n  exponent = (a >> 10) & 0x1F;\n  mantissa = (a & 0x3FF);\n\n  if (exponent == 31) {\n    /* We have a NaN of an INF. */\n    exponent = 255;\n    /* Force the first bit of the mantissa to 1 to be compatible with the way\n     * Intel convert f16 to f32 */\n    if (mantissa != 0) {\n      //mantissa |= 0x200;\n    }\n  } else if (exponent == 0 && mantissa == 0) {\n    /* Nothing to do */\n  } else if (exponent == 0) {\n    u32 mask = mantissa;\n    /* Find the most significant bit of the mantissa (could use a better\n     * algorithm) */\n    int i = -1;\n    do {\n      ++i;\n      mask <<= 1;\n    } while ((mask & 0x400) == 0);\n\n    /* Update the mantissa and the exponent */\n    mantissa = (mask & 0x3ff);\n    exponent += (u32)(112 - i);\n  } else {\n    /* the exponent must be recomputed -15 + 127 */\n    exponent += 112;\n  }\n\n  /* We then rebuild the float */\n  return nsimd_scalar_reinterpret_f32_u32(\n      (sign << 16) | (((u32)exponent) << 23) | (mantissa << 13));\n#endif\n}\n\n// ----------------------------------------------------------------------------\n// Convert a FP16 to a float\n\n#ifndef NSIMD_NATIVE_FP16\nNSIMD_DLLEXPORT f32 nsimd_f16_to_f32(f16 a) { return nsimd_u16_to_f32(a.u); }\n#endif\n\n// ----------------------------------------------------------------------------\n// Convert a float to a FP16 as an u16\n\nNSIMD_DLLEXPORT u16 nsimd_f32_to_u16(f32 a) {\n#ifdef NSIMD_NO_IEEE754\n  double frac;\n  int exponent;\n  u32 sign, mantissa;\n\n  /* Get mantissa (= fractional part) and exponent. */\n  frac = std::frexp(a, &exponent);\n\n  /* Get sign and make sure frac is positive. 
*/\n  if (frac < 0) {\n    sign = 1u;\n    frac = -frac;\n  } else {\n    sign = 0u;\n  }\n\n  /* Add 1 to the exponent to have the IEEE exponent: The mantissa here\n     lives in [0.5, 1) whereas for IEEE it must live in [1, 2). */\n  exponent++;\n\n  if (exponent < -14) {\n    /* We have a too small number, returns zero */\n    return (u16)(sign << 15);\n  } else if (exponent > 15) {\n    /* We have a too big number, return INF */\n    return (u16)((sign << 15) | 0x7C00);\n  } else {\n    /* We have a normal number. Get the mantissa:\n       frac lives in [0.5, 1) and is of the form 0.1XXXXXXX, therefore\n       to get the mantissa frac must be multiplied by 2^11 = 2048. Then\n       it will be of the form 1XX XXXX XXXX.XXXXX, so we have to get rid\n       of the leading bit. */\n    mantissa = (u32)(frac * 2048.0) & 0x3FF;\n    return (u16)((sign << 15) | ((u32)(exponent + 15) << 10) | mantissa);\n  }\n#else\n  u32 sign, mantissa;\n  int exponent;\n\n  u32 in_u = nsimd_scalar_reinterpret_u32_f32(a);\n  sign = in_u & 0x80000000;\n  exponent = (int)((in_u >> 23) & 0xFF);\n  mantissa = (in_u & 0x7FFFFF);\n\n  if (exponent == 255 && mantissa != 0) {\n    /* NaN */\n    return (u16)(0xffff);\n  }\n\n  const f32 biggest_f16 = nsimd_scalar_reinterpret_f32_u32(0x477ff000);\n  if (a >= biggest_f16 || a <= -biggest_f16) {\n    /* Number is too big to be representable in half => return infinity */\n    return (u16)(sign >> 16 | 0x1f << 10);\n  }\n\n  const f32 smallest_f16 = nsimd_scalar_reinterpret_f32_u32(0x33000000);\n  if (a <= smallest_f16 && a >= -smallest_f16) {\n    /* Number is too small to be representable in half => return ±0 */\n    return (u16)(sign >> 16);\n  }\n\n  /* For FP32 exponent bias is 127, compute the real exponent. 
*/\n  exponent -= 127;\n\n  /* Following algorithm taken from:\n   * https://fgiesen.wordpress.com/2012/03/28/half-to-float-done-quic/ */\n  const f32 denormal_f16 = nsimd_scalar_reinterpret_f32_u32(0x38800000);\n  if (a < denormal_f16 && a > -denormal_f16) {\n    /* Denormalized half */\n    const u32 magic_u = ((127 - 15) + (23 - 10) + 1) << 23;\n    const f32 magic_f = nsimd_scalar_reinterpret_f32_u32(magic_u);\n\n    u32 in_u = nsimd_scalar_reinterpret_u32_f32(a);\n    in_u &= ~0x80000000u;\n    f32 in_f = nsimd_scalar_reinterpret_f32_u32(in_u);\n    in_f += magic_f;\n    in_u = nsimd_scalar_reinterpret_u32_f32(in_f);\n    in_u -= magic_u;\n\n    return (u16)((sign >> 16) | in_u);\n  }\n\n  /* Normal half */\n  in_u &= ~0x80000000U;\n  u32 mant_odd = (in_u >> 13) & 1;\n  in_u += ((u32)(15 - 127) << 23) + 0xfffU;\n  in_u += mant_odd;\n\n  return (u16)((sign >> 16) | (in_u >> 13));\n#endif\n}\n\n// ----------------------------------------------------------------------------\n// Convert a float to a FP16\n\n#ifndef NSIMD_NATIVE_FP16\nNSIMD_DLLEXPORT f16 nsimd_f32_to_f16(f32 a) {\n  f16 ret;\n  ret.u = nsimd_f32_to_u16(a);\n  return ret;\n}\n#endif\n\n// ----------------------------------------------------------------------------\n\n#ifdef NSIMD_C_LINKAGE_FOR_F16\n} // extern \"C\"\n#endif\n\n// ----------------------------------------------------------------------------\n// C++ versions in namespace nsimd\n\nnamespace nsimd {\n\nNSIMD_DLLEXPORT u16 f32_to_u16(f32 a) { return nsimd_f32_to_u16(a); }\nNSIMD_DLLEXPORT f32 u16_to_f32(u16 a) { return nsimd_u16_to_f32(a); }\n#ifndef NSIMD_NATIVE_FP16\nNSIMD_DLLEXPORT f16 f32_to_f16(f32 a) { return nsimd_f32_to_f16(a); }\nNSIMD_DLLEXPORT f32 f16_to_f32(f16 a) { return nsimd_f16_to_f32(a); }\n#endif\n\n} // namespace nsimd\n"
  },
  {
    "path": "src/gpu.cpp",
    "content": "/*\n\nCopyright (c) 2021 Agenium Scale\n\nPermission is hereby granted, free of charge, to any person obtaining a copy\nof this software and associated documentation files (the \"Software\"), to deal\nin the Software without restriction, including without limitation the rights\nto use, copy, modify, merge, publish, distribute, sublicense, and/or sell\ncopies of the Software, and to permit persons to whom the Software is\nfurnished to do so, subject to the following conditions:\n\nThe above copyright notice and this permission notice shall be included in all\ncopies or substantial portions of the Software.\n\nTHE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\nIMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\nFITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\nAUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\nLIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\nOUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\nSOFTWARE.\n\n*/\n\n#define NSIMD_INSIDE\n#include <nsimd/nsimd.h>\n\n#if defined(NSIMD_ONEAPI) && NSIMD_CXX > 0\n\n// ----------------------------------------------------------------------------\n// oneAPI\n\n// NSIMD error handler\nnamespace nsimd {\nnamespace oneapi {\ntemplate <typename Exception = sycl::exception>\nstruct sycl_async_error_handler {\n  void operator()(const sycl::exception_list &elist) {\n    for (const auto &exc : elist) {\n      try {\n        std::rethrow_exception(exc);\n      } catch (const Exception &exc) {\n        fprintf(stderr, \"NSIMD Internal error:\\n\\tError: %s %s %d\\n\",\n                exc.what(), __FILE__, __LINE__);\n        exit(EXIT_FAILURE);\n      }\n    }\n  }\n};\n} // namespace oneapi\n} // namespace nsimd\n\nextern \"C\" {\n\n// Singleton to get default oneAPI queue\nNSIMD_DLLSPEC void *nsimd_oneapi_default_queue() {\n  static sycl::queue 
ret(sycl::default_selector{},\n                         nsimd::oneapi::sycl_async_error_handler<>{});\n  return (void *)&ret;\n}\n\nNSIMD_DLLSPEC nsimd_nat nsimd_kernel_param(nsimd_nat nb_items,\n                                           nsimd_nat block_size) {\n  return block_size * ((nb_items + block_size - 1) / block_size);\n}\n\n} // extern \"C\"\n\n#elif defined(NSIMD_CUDA) || defined(NSIMD_ROCM)\n\n// ----------------------------------------------------------------------------\n// CUDA/ROCm\n\nNSIMD_DLLSPEC nsimd_nat nsimd_kernel_param(nsimd_nat nb_items,\n                                           nsimd_nat block_size) {\n  return (nb_items + block_size - 1) / block_size;\n}\n\n#else\n\n// ----------------------------------------------------------------------------\n// CPU/SIMD\n\nNSIMD_DLLSPEC nsimd_nat nsimd_kernel_param(nsimd_nat nb_items,\n                                           nsimd_nat block_size) {\n  return nb_items / block_size;\n}\n\n// ----------------------------------------------------------------------------\n\n#endif\n"
  },
  {
    "path": "src/helperadvsimd.h",
    "content": "/*********************************************************************/\n/*          Copyright ARM Ltd. 2010 - 2019.                          */\n/* Distributed under the Boost Software License, Version 1.0.        */\n/*    (See accompanying file LICENSE.txt or copy at                  */\n/*          http://www.boost.org/LICENSE_1_0.txt)                    */\n/*********************************************************************/\n\n#ifndef __ARM_NEON\n#error Please specify advsimd flags.\n#endif\n\n#if !defined(SLEEF_GENHEADER)\n#include <arm_neon.h>\n#include <stdint.h>\n\n#include \"misc.h\"\n#endif // #if !defined(SLEEF_GENHEADER)\n\n#define ENABLE_DP\n//@#define ENABLE_DP\n#define LOG2VECTLENDP 1\n//@#define LOG2VECTLENDP 1\n#define VECTLENDP (1 << LOG2VECTLENDP)\n//@#define VECTLENDP (1 << LOG2VECTLENDP)\n\n#define ENABLE_SP\n//@#define ENABLE_SP\n#define LOG2VECTLENSP 2\n//@#define LOG2VECTLENSP 2\n#define VECTLENSP (1 << LOG2VECTLENSP)\n//@#define VECTLENSP (1 << LOG2VECTLENSP)\n\n#if CONFIG == 1\n#define ENABLE_FMA_DP\n//@#define ENABLE_FMA_DP\n#define ENABLE_FMA_SP\n//@#define ENABLE_FMA_SP\n#endif\n\n#define FULL_FP_ROUNDING\n//@#define FULL_FP_ROUNDING\n#define ACCURATE_SQRT\n//@#define ACCURATE_SQRT\n\n#define ISANAME \"AArch64 AdvSIMD\"\n\n// Mask definition\ntypedef uint32x4_t vmask;\ntypedef uint32x4_t vopmask;\n\n// Single precision definitions\ntypedef float32x4_t vfloat;\ntypedef int32x4_t vint2;\n\n// Double precision definitions\ntypedef float64x2_t vdouble;\ntypedef int32x2_t vint;\n\ntypedef struct {\n  vmask x, y;\n} vmask2;\n\n#define DFTPRIORITY 10\n\nstatic INLINE int vavailability_i(int name) { return 3; }\nstatic INLINE void vprefetch_v_p(const void *ptr) { }\n\nstatic INLINE VECTOR_CC int vtestallones_i_vo32(vopmask g) {\n  uint32x2_t x0 = vand_u32(vget_low_u32(g), vget_high_u32(g));\n  uint32x2_t x1 = vpmin_u32(x0, x0);\n  return vget_lane_u32(x1, 0);\n}\n\nstatic INLINE VECTOR_CC int vtestallones_i_vo64(vopmask g) 
{\n  uint32x2_t x0 = vand_u32(vget_low_u32(g), vget_high_u32(g));\n  uint32x2_t x1 = vpmin_u32(x0, x0);\n  return vget_lane_u32(x1, 0);\n}\n\n// Vector load / store\nstatic INLINE VECTOR_CC vdouble vload_vd_p(const double *ptr) { return vld1q_f64(ptr); }\nstatic INLINE VECTOR_CC vdouble vloadu_vd_p(const double *ptr) { return vld1q_f64(ptr); }\nstatic INLINE VECTOR_CC void vstore_v_p_vd(double *ptr, vdouble v) { vst1q_f64(ptr, v); }\nstatic INLINE VECTOR_CC void vstoreu_v_p_vd(double *ptr, vdouble v) { vst1q_f64(ptr, v); }\nstatic INLINE VECTOR_CC vfloat vload_vf_p(const float *ptr) { return vld1q_f32(ptr); }\nstatic INLINE VECTOR_CC vfloat vloadu_vf_p(const float *ptr) { return vld1q_f32(ptr); }\nstatic INLINE VECTOR_CC void vstore_v_p_vf(float *ptr, vfloat v) { vst1q_f32(ptr, v); }\nstatic INLINE VECTOR_CC void vstoreu_v_p_vf(float *ptr, vfloat v) { vst1q_f32(ptr, v); }\nstatic INLINE VECTOR_CC vint2 vloadu_vi2_p(int32_t *p) { return vld1q_s32(p); }\nstatic INLINE VECTOR_CC void vstoreu_v_p_vi2(int32_t *p, vint2 v) { vst1q_s32(p, v); }\nstatic INLINE VECTOR_CC vint vloadu_vi_p(int32_t *p) { return vld1_s32(p); }\nstatic INLINE VECTOR_CC void vstoreu_v_p_vi(int32_t *p, vint v) { vst1_s32(p, v); }\n\nstatic INLINE VECTOR_CC vdouble vgather_vd_p_vi(const double *ptr, vint vi) {\n  return ((vdouble) { ptr[vget_lane_s32(vi, 0)], ptr[vget_lane_s32(vi, 1)]} );\n}\n\nstatic INLINE VECTOR_CC vfloat vgather_vf_p_vi2(const float *ptr, vint2 vi2) {\n  return ((vfloat) {\n      ptr[vgetq_lane_s32(vi2, 0)],\n      ptr[vgetq_lane_s32(vi2, 1)],\n      ptr[vgetq_lane_s32(vi2, 2)],\n      ptr[vgetq_lane_s32(vi2, 3)]\n    });\n}\n\n// Basic logical operations for mask\nstatic INLINE VECTOR_CC vmask vand_vm_vm_vm(vmask x, vmask y) { return vandq_u32(x, y); }\nstatic INLINE VECTOR_CC vmask vandnot_vm_vm_vm(vmask x, vmask y) {\n  return vbicq_u32(y, x);\n}\nstatic INLINE VECTOR_CC vmask vor_vm_vm_vm(vmask x, vmask y) { return vorrq_u32(x, y); }\nstatic INLINE VECTOR_CC vmask 
vxor_vm_vm_vm(vmask x, vmask y) { return veorq_u32(x, y); }\n\n// Mask <--> single precision reinterpret\nstatic INLINE VECTOR_CC vmask vreinterpret_vm_vf(vfloat vf) {\n  return vreinterpretq_u32_f32(vf);\n}\nstatic INLINE VECTOR_CC vfloat vreinterpret_vf_vm(vmask vm) {\n  return vreinterpretq_f32_u32(vm);\n}\nstatic INLINE VECTOR_CC vint2 vcast_vi2_vm(vmask vm) { return vreinterpretq_s32_u32(vm); }\nstatic INLINE VECTOR_CC vmask vcast_vm_vi2(vint2 vi) { return vreinterpretq_u32_s32(vi); }\n\n// Mask <--> double precision reinterpret\nstatic INLINE VECTOR_CC vmask vreinterpret_vm_vd(vdouble vd) {\n  return vreinterpretq_u32_f64(vd);\n}\nstatic INLINE VECTOR_CC vdouble vreinterpret_vd_vm(vmask vm) {\n  return vreinterpretq_f64_u32(vm);\n}\nstatic INLINE VECTOR_CC vfloat vreinterpret_vf_vi2(vint2 vm) {\n  return vreinterpretq_f32_s32(vm);\n}\nstatic INLINE VECTOR_CC vint2 vreinterpret_vi2_vf(vfloat vf) {\n  return vreinterpretq_s32_f32(vf);\n}\nstatic INLINE VECTOR_CC vint2 vreinterpret_vi2_vd(vdouble vd) {\n  return vreinterpretq_s32_f64(vd);\n}\n\n/****************************************/\n/* Single precision FP operations */\n/****************************************/\n// Broadcast\nstatic INLINE VECTOR_CC vfloat vcast_vf_f(float f) { return vdupq_n_f32(f); }\n\n// Add, Sub, Mul\nstatic INLINE VECTOR_CC vfloat vadd_vf_vf_vf(vfloat x, vfloat y) {\n  return vaddq_f32(x, y);\n}\nstatic INLINE VECTOR_CC vfloat vsub_vf_vf_vf(vfloat x, vfloat y) {\n  return vsubq_f32(x, y);\n}\nstatic INLINE VECTOR_CC vfloat vmul_vf_vf_vf(vfloat x, vfloat y) {\n  return vmulq_f32(x, y);\n}\n\n// |x|, -x\nstatic INLINE VECTOR_CC vfloat vabs_vf_vf(vfloat f) { return vabsq_f32(f); }\nstatic INLINE VECTOR_CC vfloat vneg_vf_vf(vfloat f) { return vnegq_f32(f); }\n\n#if CONFIG == 1\n// Multiply accumulate: z = z + x * y\nstatic INLINE VECTOR_CC vfloat vmla_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) {\n  return vfmaq_f32(z, x, y);\n}\n// Multiply subtract: z = z - x * y\nstatic INLINE VECTOR_CC 
vfloat vmlanp_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) {\n  return vfmsq_f32(z, x, y);\n}\n// Multiply subtract: z = x * y - z\nstatic INLINE VECTOR_CC vfloat vmlapn_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) {\n  return vneg_vf_vf(vfmsq_f32(z, x, y));\n}\n#else\nstatic INLINE VECTOR_CC vfloat vmla_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return vadd_vf_vf_vf(vmul_vf_vf_vf(x, y), z); }\nstatic INLINE VECTOR_CC vfloat vmlanp_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return vsub_vf_vf_vf(z, vmul_vf_vf_vf(x, y)); }\nstatic INLINE VECTOR_CC vfloat vmlapn_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return vsub_vf_vf_vf(vmul_vf_vf_vf(x, y), z); }\n#endif\n\nstatic INLINE VECTOR_CC vfloat vfma_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { // z + x * y\n  return vfmaq_f32(z, x, y);\n}\n\nstatic INLINE VECTOR_CC vfloat vfmanp_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { // z - x * y\n  return vfmsq_f32(z, x, y);\n}\n\nstatic INLINE VECTOR_CC vfloat vfmapn_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { // x * y - z\n  return vneg_vf_vf(vfmanp_vf_vf_vf_vf(x, y, z));\n}\n\n// Reciprocal 1/x, Division, Square root\nstatic INLINE VECTOR_CC vfloat vdiv_vf_vf_vf(vfloat n, vfloat d) {\n#ifndef ENABLE_ALTDIV\n  return vdivq_f32(n, d);\n#else\n  // Finite numbers (including denormal) only, gives mostly correctly rounded result\n  float32x4_t t, u, x, y;\n  uint32x4_t i0, i1;\n  i0 = vandq_u32(vreinterpretq_u32_f32(n), vdupq_n_u32(0x7c000000));\n  i1 = vandq_u32(vreinterpretq_u32_f32(d), vdupq_n_u32(0x7c000000));\n  i0 = vsubq_u32(vdupq_n_u32(0x7d000000), vshrq_n_u32(vaddq_u32(i0, i1), 1));\n  t = vreinterpretq_f32_u32(i0);\n  y = vmulq_f32(d, t);\n  x = vmulq_f32(n, t);\n  t = vrecpeq_f32(y);\n  t = vmulq_f32(t, vrecpsq_f32(y, t));\n  t = vmulq_f32(t, vrecpsq_f32(y, t));\n  u = vmulq_f32(x, t);\n  u = vfmaq_f32(u, vfmsq_f32(x, y, u), t);\n  return u;\n#endif\n}\nstatic INLINE VECTOR_CC vfloat vrec_vf_vf(vfloat d) {\n#ifndef ENABLE_ALTDIV\n  return vdiv_vf_vf_vf(vcast_vf_f(1.0f), 
d);\n#else\n  return vbslq_f32(vceqq_f32(vabs_vf_vf(d), vcast_vf_f(SLEEF_INFINITYf)),\n\t\t   vcast_vf_f(0), vdiv_vf_vf_vf(vcast_vf_f(1.0f), d));\n#endif\n}\n\nstatic INLINE VECTOR_CC vfloat vsqrt_vf_vf(vfloat d) {\n#ifndef ENABLE_ALTSQRT\n  return vsqrtq_f32(d);\n#else\n  // Gives correctly rounded result for all input range\n  vfloat w, x, y, z;\n\n  y = vrsqrteq_f32(d);\n  x = vmul_vf_vf_vf(d, y);         w = vmul_vf_vf_vf(vcast_vf_f(0.5), y);\n  y = vfmanp_vf_vf_vf_vf(x, w, vcast_vf_f(0.5));\n  x = vfma_vf_vf_vf_vf(x, y, x);   w = vfma_vf_vf_vf_vf(w, y, w);\n\n  y = vfmanp_vf_vf_vf_vf(x, w, vcast_vf_f(1.5));  w = vadd_vf_vf_vf(w, w);\n  w = vmul_vf_vf_vf(w, y);\n  x = vmul_vf_vf_vf(w, d);\n  y = vfmapn_vf_vf_vf_vf(w, d, x); z = vfmanp_vf_vf_vf_vf(w, x, vcast_vf_f(1));\n  z = vfmanp_vf_vf_vf_vf(w, y, z); w = vmul_vf_vf_vf(vcast_vf_f(0.5), x);\n  w = vfma_vf_vf_vf_vf(w, z, y);\n  w = vadd_vf_vf_vf(w, x);\n\n  return vbslq_f32(vorrq_u32(vceqq_f32(d, vcast_vf_f(0)),\n\t\t\t     vceqq_f32(d, vcast_vf_f(SLEEF_INFINITYf))), d, w);\n#endif\n}\n\n// max, min\nstatic INLINE VECTOR_CC vfloat vmax_vf_vf_vf(vfloat x, vfloat y) {\n  return vmaxq_f32(x, y);\n}\nstatic INLINE VECTOR_CC vfloat vmin_vf_vf_vf(vfloat x, vfloat y) {\n  return vminq_f32(x, y);\n}\n\n// Comparisons\nstatic INLINE VECTOR_CC vmask veq_vm_vf_vf(vfloat x, vfloat y) { return vceqq_f32(x, y); }\nstatic INLINE VECTOR_CC vmask vneq_vm_vf_vf(vfloat x, vfloat y) {\n  return vmvnq_u32(vceqq_f32(x, y));\n}\nstatic INLINE VECTOR_CC vmask vlt_vm_vf_vf(vfloat x, vfloat y) { return vcltq_f32(x, y); }\nstatic INLINE VECTOR_CC vmask vle_vm_vf_vf(vfloat x, vfloat y) { return vcleq_f32(x, y); }\nstatic INLINE VECTOR_CC vmask vgt_vm_vf_vf(vfloat x, vfloat y) { return vcgtq_f32(x, y); }\nstatic INLINE VECTOR_CC vmask vge_vm_vf_vf(vfloat x, vfloat y) { return vcgeq_f32(x, y); }\n\n// Conditional select\nstatic INLINE VECTOR_CC vfloat vsel_vf_vm_vf_vf(vmask mask, vfloat x, vfloat y) {\n  return vbslq_f32(mask, x, 
y);\n}\n\n// int <--> float conversions\nstatic INLINE VECTOR_CC vint2 vtruncate_vi2_vf(vfloat vf) { return vcvtq_s32_f32(vf); }\nstatic INLINE VECTOR_CC vfloat vcast_vf_vi2(vint2 vi) { return vcvtq_f32_s32(vi); }\nstatic INLINE VECTOR_CC vint2 vcast_vi2_i(int i) { return vdupq_n_s32(i); }\nstatic INLINE VECTOR_CC vint2 vrint_vi2_vf(vfloat d) {\n  return vcvtq_s32_f32(vrndnq_f32(d));\n}\n\n/***************************************/\n/* Single precision integer operations */\n/***************************************/\n\n// Add, Sub, Neg (-x)\nstatic INLINE VECTOR_CC vint2 vadd_vi2_vi2_vi2(vint2 x, vint2 y) {\n  return vaddq_s32(x, y);\n}\nstatic INLINE VECTOR_CC vint2 vsub_vi2_vi2_vi2(vint2 x, vint2 y) {\n  return vsubq_s32(x, y);\n}\nstatic INLINE VECTOR_CC vint2 vneg_vi2_vi2(vint2 e) { return vnegq_s32(e); }\n\n// Logical operations\nstatic INLINE VECTOR_CC vint2 vand_vi2_vi2_vi2(vint2 x, vint2 y) {\n  return vandq_s32(x, y);\n}\nstatic INLINE VECTOR_CC vint2 vandnot_vi2_vi2_vi2(vint2 x, vint2 y) {\n  return vbicq_s32(y, x);\n}\nstatic INLINE VECTOR_CC vint2 vor_vi2_vi2_vi2(vint2 x, vint2 y) {\n  return vorrq_s32(x, y);\n}\nstatic INLINE VECTOR_CC vint2 vxor_vi2_vi2_vi2(vint2 x, vint2 y) {\n  return veorq_s32(x, y);\n}\n\n// Shifts\n#define vsll_vi2_vi2_i(x, c) vshlq_n_s32(x, c)\n//@#define vsll_vi2_vi2_i(x, c) vshlq_n_s32(x, c)\n#define vsrl_vi2_vi2_i(x, c)                                                   \\\n  vreinterpretq_s32_u32(vshrq_n_u32(vreinterpretq_u32_s32(x), c))\n//@#define vsrl_vi2_vi2_i(x, c) vreinterpretq_s32_u32(vshrq_n_u32(vreinterpretq_u32_s32(x), c))\n\n#define vsra_vi2_vi2_i(x, c) vshrq_n_s32(x, c)\n//@#define vsra_vi2_vi2_i(x, c) vshrq_n_s32(x, c)\n#define vsra_vi_vi_i(x, c) vshr_n_s32(x, c)\n//@#define vsra_vi_vi_i(x, c) vshr_n_s32(x, c)\n#define vsll_vi_vi_i(x, c) vshl_n_s32(x, c)\n//@#define vsll_vi_vi_i(x, c) vshl_n_s32(x, c)\n#define vsrl_vi_vi_i(x, c)                                                     \\\n  
vreinterpret_s32_u32(vshr_n_u32(vreinterpret_u32_s32(x), c))\n//@#define vsrl_vi_vi_i(x, c) vreinterpret_s32_u32(vshr_n_u32(vreinterpret_u32_s32(x), c))\n\n// Comparison returning masks\nstatic INLINE VECTOR_CC vmask veq_vm_vi2_vi2(vint2 x, vint2 y) { return vceqq_s32(x, y); }\nstatic INLINE VECTOR_CC vmask vgt_vm_vi2_vi2(vint2 x, vint2 y) { return vcgeq_s32(x, y); }\n// Comparison returning integers\nstatic INLINE VECTOR_CC vint2 vgt_vi2_vi2_vi2(vint2 x, vint2 y) {\n  return vreinterpretq_s32_u32(vcgeq_s32(x, y));\n}\nstatic INLINE VECTOR_CC vint2 veq_vi2_vi2_vi2(vint2 x, vint2 y) {\n  return vreinterpretq_s32_u32(vceqq_s32(x, y));\n}\n\n// Conditional select\nstatic INLINE VECTOR_CC vint2 vsel_vi2_vm_vi2_vi2(vmask m, vint2 x, vint2 y) {\n  return vbslq_s32(m, x, y);\n}\n\n/* -------------------------------------------------------------------------- */\n/* -------------------------------------------------------------------------- */\n/* -------------------------------------------------------------------------- */\n/* -------------------------------------------------------------------------- */\n\n/****************************************/\n/* Double precision FP operations */\n/****************************************/\n// Broadcast\nstatic INLINE VECTOR_CC vdouble vcast_vd_d(double f) { return vdupq_n_f64(f); }\n\n// Add, Sub, Mul\nstatic INLINE VECTOR_CC vdouble vadd_vd_vd_vd(vdouble x, vdouble y) {\n  return vaddq_f64(x, y);\n}\nstatic INLINE VECTOR_CC vdouble vsub_vd_vd_vd(vdouble x, vdouble y) {\n  return vsubq_f64(x, y);\n}\nstatic INLINE VECTOR_CC vdouble vmul_vd_vd_vd(vdouble x, vdouble y) {\n  return vmulq_f64(x, y);\n}\n\n// |x|, -x\nstatic INLINE VECTOR_CC vdouble vabs_vd_vd(vdouble f) { return vabsq_f64(f); }\nstatic INLINE VECTOR_CC vdouble vneg_vd_vd(vdouble f) { return vnegq_f64(f); }\n\n// max, min\nstatic INLINE VECTOR_CC vdouble vmax_vd_vd_vd(vdouble x, vdouble y) {\n  return vmaxq_f64(x, y);\n}\nstatic INLINE VECTOR_CC vdouble 
vmin_vd_vd_vd(vdouble x, vdouble y) {\n  return vminq_f64(x, y);\n}\n\n#if CONFIG == 1\n// Multiply accumulate: z = z + x * y\nstatic INLINE VECTOR_CC vdouble vmla_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) {\n  return vfmaq_f64(z, x, y);\n}\n\nstatic INLINE VECTOR_CC vdouble vmlanp_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) {\n  return vfmsq_f64(z, x, y);\n}\n\n//[z = x * y - z]\nstatic INLINE VECTOR_CC vdouble vmlapn_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) {\n  return vneg_vd_vd(vfmsq_f64(z, x, y));\n}\n#else\nstatic INLINE VECTOR_CC vdouble vmla_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return vadd_vd_vd_vd(vmul_vd_vd_vd(x, y), z); }\nstatic INLINE VECTOR_CC vdouble vmlapn_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return vsub_vd_vd_vd(vmul_vd_vd_vd(x, y), z); }\n#endif\n\nstatic INLINE VECTOR_CC vdouble vfma_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { // z + x * y\n  return vfmaq_f64(z, x, y);\n}\n\nstatic INLINE VECTOR_CC vdouble vfmanp_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { // z - x * y\n  return vfmsq_f64(z, x, y);\n}\n\nstatic INLINE VECTOR_CC vdouble vfmapn_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { // x * y - z\n  return vneg_vd_vd(vfmanp_vd_vd_vd_vd(x, y, z));\n}\n\n// Reciprocal 1/x, Division, Square root\nstatic INLINE VECTOR_CC vdouble vdiv_vd_vd_vd(vdouble n, vdouble d) {\n#ifndef ENABLE_ALTDIV\n  return vdivq_f64(n, d);\n#else\n  // Finite numbers (including denormal) only, gives mostly correctly rounded result\n  float64x2_t t, u, x, y;\n  uint64x2_t i0, i1;\n  i0 = vandq_u64(vreinterpretq_u64_f64(n), vdupq_n_u64(0x7fc0000000000000L));\n  i1 = vandq_u64(vreinterpretq_u64_f64(d), vdupq_n_u64(0x7fc0000000000000L));\n  i0 = vsubq_u64(vdupq_n_u64(0x7fd0000000000000L), vshrq_n_u64(vaddq_u64(i0, i1), 1));\n  t = vreinterpretq_f64_u64(i0);\n  y = vmulq_f64(d, t);\n  x = vmulq_f64(n, t);\n  t = vrecpeq_f64(y);\n  t = vmulq_f64(t, vrecpsq_f64(y, t));\n  t = vmulq_f64(t, vrecpsq_f64(y, t));\n  t = vmulq_f64(t, vrecpsq_f64(y, 
t));\n  u = vmulq_f64(x, t);\n  u = vfmaq_f64(u, vfmsq_f64(x, y, u), t);\n  return u;\n#endif\n}\nstatic INLINE VECTOR_CC vdouble vrec_vd_vd(vdouble d) {\n#ifndef ENABLE_ALTDIV\n  return vdiv_vd_vd_vd(vcast_vd_d(1.0f), d);\n#else\n  return vbslq_f64(vceqq_f64(vabs_vd_vd(d), vcast_vd_d(SLEEF_INFINITY)),\n\t\t   vcast_vd_d(0), vdiv_vd_vd_vd(vcast_vd_d(1.0f), d));\n#endif\n}\n\nstatic INLINE VECTOR_CC vdouble vsqrt_vd_vd(vdouble d) {\n#ifndef ENABLE_ALTSQRT\n  return vsqrtq_f64(d);\n#else\n  // Gives correctly rounded result for all input range\n  vdouble w, x, y, z;\n\n  y = vrsqrteq_f64(d);\n  x = vmul_vd_vd_vd(d, y);         w = vmul_vd_vd_vd(vcast_vd_d(0.5), y);\n  y = vfmanp_vd_vd_vd_vd(x, w, vcast_vd_d(0.5));\n  x = vfma_vd_vd_vd_vd(x, y, x);   w = vfma_vd_vd_vd_vd(w, y, w);\n  y = vfmanp_vd_vd_vd_vd(x, w, vcast_vd_d(0.5));\n  x = vfma_vd_vd_vd_vd(x, y, x);   w = vfma_vd_vd_vd_vd(w, y, w);\n\n  y = vfmanp_vd_vd_vd_vd(x, w, vcast_vd_d(1.5));  w = vadd_vd_vd_vd(w, w);\n  w = vmul_vd_vd_vd(w, y);\n  x = vmul_vd_vd_vd(w, d);\n  y = vfmapn_vd_vd_vd_vd(w, d, x); z = vfmanp_vd_vd_vd_vd(w, x, vcast_vd_d(1));\n  z = vfmanp_vd_vd_vd_vd(w, y, z); w = vmul_vd_vd_vd(vcast_vd_d(0.5), x);\n  w = vfma_vd_vd_vd_vd(w, z, y);\n  w = vadd_vd_vd_vd(w, x);\n\n  return vbslq_f64(vorrq_u64(vceqq_f64(d, vcast_vd_d(0)),\n\t\t\t     vceqq_f64(d, vcast_vd_d(SLEEF_INFINITY))), d, w);\n#endif\n}\n\n/* Comparisons */\nstatic INLINE VECTOR_CC vopmask veq_vo_vd_vd(vdouble x, vdouble y) {\n  return vreinterpretq_u32_u64(vceqq_f64(x, y));\n}\nstatic INLINE VECTOR_CC vopmask vneq_vo_vd_vd(vdouble x, vdouble y) {\n  return vmvnq_u32(vreinterpretq_u32_u64(vceqq_f64(x, y)));\n}\nstatic INLINE VECTOR_CC vopmask vlt_vo_vd_vd(vdouble x, vdouble y) {\n  return vreinterpretq_u32_u64(vcltq_f64(x, y));\n}\nstatic INLINE VECTOR_CC vopmask vgt_vo_vd_vd(vdouble x, vdouble y) {\n  return vreinterpretq_u32_u64(vcgtq_f64(x, y));\n}\nstatic INLINE VECTOR_CC vopmask vle_vo_vd_vd(vdouble x, vdouble y) {\n  return 
vreinterpretq_u32_u64(vcleq_f64(x, y));\n}\nstatic INLINE VECTOR_CC vopmask vge_vo_vd_vd(vdouble x, vdouble y) {\n  return vreinterpretq_u32_u64(vcgeq_f64(x, y));\n}\n\n// Conditional select\nstatic INLINE VECTOR_CC vdouble vsel_vd_vo_vd_vd(vopmask mask, vdouble x, vdouble y) {\n  return vbslq_f64(vreinterpretq_u64_u32(mask), x, y);\n}\n\n#if 1\nstatic INLINE CONST VECTOR_CC vdouble vsel_vd_vo_d_d(vopmask o, double v1, double v0) {\n  return vsel_vd_vo_vd_vd(o, vcast_vd_d(v1), vcast_vd_d(v0));\n}\n\nstatic INLINE VECTOR_CC vdouble vsel_vd_vo_vo_d_d_d(vopmask o0, vopmask o1, double d0, double d1, double d2) {\n  return vsel_vd_vo_vd_vd(o0, vcast_vd_d(d0), vsel_vd_vo_d_d(o1, d1, d2));\n}\n\nstatic INLINE VECTOR_CC vdouble vsel_vd_vo_vo_vo_d_d_d_d(vopmask o0, vopmask o1, vopmask o2, double d0, double d1, double d2, double d3) {\n  return vsel_vd_vo_vd_vd(o0, vcast_vd_d(d0), vsel_vd_vo_vd_vd(o1, vcast_vd_d(d1), vsel_vd_vo_d_d(o2, d2, d3)));\n}\n#else\n// This implementation is slower on the current CPU models (as of May 2017.)\n// I(Naoki Shibata) expect that on future CPU models with hardware similar to Super Shuffle Engine, this implementation will be faster.\nstatic INLINE CONST VECTOR_CC vdouble vsel_vd_vo_d_d(vopmask o, double d0, double d1) {\n  uint8x16_t idx = vbslq_u8(vreinterpretq_u8_u32(o), (uint8x16_t) { 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7 },\n\t\t\t    (uint8x16_t) { 8, 9, 10, 11, 12, 13, 14, 15, 8, 9, 10, 11, 12, 13, 14, 15 });\n  \n  uint8x16_t tab = (uint8x16_t) (float64x2_t) { d0, d1 };\n  return (vdouble) vqtbl1q_u8(tab, idx);\n}\n\nstatic INLINE VECTOR_CC vdouble vsel_vd_vo_vo_vo_d_d_d_d(vopmask o0, vopmask o1, vopmask o2, double d0, double d1, double d2, double d3) {\n  uint8x16_t idx = vbslq_u8(vreinterpretq_u8_u32(o0), (uint8x16_t) { 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7 },\n\t\t\t    vbslq_u8(vreinterpretq_u8_u32(o1), (uint8x16_t) { 8, 9, 10, 11, 12, 13, 14, 15, 8, 9, 10, 11, 12, 13, 14, 15 },\n\t\t\t\t     
vbslq_u8(vreinterpretq_u8_u32(o2), (uint8x16_t) { 16, 17, 18, 19, 20, 21, 22, 23, 16, 17, 18, 19, 20, 21, 22, 23 },\n\t\t\t\t\t      (uint8x16_t) { 24, 25, 26, 27, 28, 29, 30, 31, 24, 25, 26, 27, 28, 29, 30, 31 })));\n  \n  uint8x16x2_t tab = { { (uint8x16_t) (float64x2_t) { d0, d1 }, (uint8x16_t) (float64x2_t) { d2, d3 } } }; \n  return (vdouble) vqtbl2q_u8(tab, idx);\n}\n\nstatic INLINE VECTOR_CC vdouble vsel_vd_vo_vo_d_d_d(vopmask o0, vopmask o1, double d0, double d1, double d2) {\n  return vsel_vd_vo_vo_vo_d_d_d_d(o0, o1, o1, d0, d1, d2, d2);\n}\n#endif\n\nstatic INLINE VECTOR_CC vdouble vrint_vd_vd(vdouble d) { return vrndnq_f64(d); }\nstatic INLINE VECTOR_CC vfloat vrint_vf_vf(vfloat d) { return vrndnq_f32(d); }\n\n/****************************************/\n/* int <--> float conversions           */\n/****************************************/\nstatic INLINE VECTOR_CC vint vtruncate_vi_vd(vdouble vf) {\n  return vmovn_s64(vcvtq_s64_f64(vf));\n}\nstatic INLINE VECTOR_CC vdouble vcast_vd_vi(vint vi) {\n  return vcvtq_f64_s64(vmovl_s32(vi));\n}\nstatic INLINE VECTOR_CC vint vcast_vi_i(int i) { return vdup_n_s32(i); }\nstatic INLINE VECTOR_CC vint vrint_vi_vd(vdouble d) {\n  return vqmovn_s64(vcvtq_s64_f64(vrndnq_f64(d)));\n}\n\n/***************************************/\n/* Integer operations */\n/***************************************/\n\n// Add, Sub, Neg (-x)\nstatic INLINE VECTOR_CC vint vadd_vi_vi_vi(vint x, vint y) { return vadd_s32(x, y); }\nstatic INLINE VECTOR_CC vint vsub_vi_vi_vi(vint x, vint y) { return vsub_s32(x, y); }\nstatic INLINE VECTOR_CC vint vneg_vi_vi(vint e) { return vneg_s32(e); }\n\n// Logical operations\nstatic INLINE VECTOR_CC vint vand_vi_vi_vi(vint x, vint y) { return vand_s32(x, y); }\nstatic INLINE VECTOR_CC vint vandnot_vi_vi_vi(vint x, vint y) { return vbic_s32(y, x); }\nstatic INLINE VECTOR_CC vint vor_vi_vi_vi(vint x, vint y) { return vorr_s32(x, y); }\nstatic INLINE VECTOR_CC vint vxor_vi_vi_vi(vint x, vint y) { return 
veor_s32(x, y); }\n\n// Comparison returning masks\nstatic INLINE VECTOR_CC vopmask veq_vo_vi_vi(vint x, vint y) {\n  return vcombine_u32(vceq_s32(x, y), vdup_n_u32(0));\n}\n\n// Conditional select\nstatic INLINE VECTOR_CC vint vsel_vi_vm_vi_vi(vmask m, vint x, vint y) {\n  return vbsl_s32(vget_low_u32(m), x, y);\n}\n\n/***************************************/\n/* Predicates                          */\n/***************************************/\nstatic INLINE VECTOR_CC vopmask visinf_vo_vd(vdouble d) {\n  const float64x2_t inf = vdupq_n_f64(SLEEF_INFINITY);\n  const float64x2_t neg_inf = vdupq_n_f64(-SLEEF_INFINITY);\n  uint64x2_t cmp = vorrq_u64(vceqq_f64(d, inf), vceqq_f64(d, neg_inf));\n  return vreinterpretq_u32_u64(cmp);\n}\n\nstatic INLINE VECTOR_CC vopmask visnan_vo_vd(vdouble d) {\n  return vmvnq_u32(vreinterpretq_u32_u64(vceqq_f64(d, d)));\n}\n\nstatic INLINE VECTOR_CC vopmask vispinf_vo_vd(vdouble d) {\n  return vreinterpretq_u32_u64(vceqq_f64(d, vdupq_n_f64(SLEEF_INFINITY)));\n}\n\nstatic INLINE VECTOR_CC vopmask visminf_vo_vd(vdouble d) {\n  return vreinterpretq_u32_u64(vceqq_f64(d, vdupq_n_f64(-SLEEF_INFINITY)));\n}\n\nstatic INLINE VECTOR_CC vfloat vsel_vf_vo_vf_vf(vopmask mask, vfloat x, vfloat y) {\n  return vbslq_f32(mask, x, y);\n}\n\nstatic INLINE CONST VECTOR_CC vfloat vsel_vf_vo_f_f(vopmask o, float v1, float v0) {\n  return vsel_vf_vo_vf_vf(o, vcast_vf_f(v1), vcast_vf_f(v0));\n}\n\nstatic INLINE VECTOR_CC vfloat vsel_vf_vo_vo_f_f_f(vopmask o0, vopmask o1, float d0, float d1, float d2) {\n  return vsel_vf_vo_vf_vf(o0, vcast_vf_f(d0), vsel_vf_vo_f_f(o1, d1, d2));\n}\n\nstatic INLINE VECTOR_CC vfloat vsel_vf_vo_vo_vo_f_f_f_f(vopmask o0, vopmask o1, vopmask o2, float d0, float d1, float d2, float d3) {\n  return vsel_vf_vo_vf_vf(o0, vcast_vf_f(d0), vsel_vf_vo_vf_vf(o1, vcast_vf_f(d1), vsel_vf_vo_f_f(o2, d2, d3)));\n}\n\nstatic INLINE VECTOR_CC vopmask veq_vo_vf_vf(vfloat x, vfloat y) {\n  return vceqq_f32(x, y);\n}\nstatic INLINE VECTOR_CC vopmask 
vneq_vo_vf_vf(vfloat x, vfloat y) {\n  return vmvnq_u32(vceqq_f32(x, y));\n}\nstatic INLINE VECTOR_CC vopmask vlt_vo_vf_vf(vfloat x, vfloat y) {\n  return vcltq_f32(x, y);\n}\nstatic INLINE VECTOR_CC vopmask vle_vo_vf_vf(vfloat x, vfloat y) {\n  return vcleq_f32(x, y);\n}\nstatic INLINE VECTOR_CC vopmask vgt_vo_vf_vf(vfloat x, vfloat y) {\n  return vcgtq_f32(x, y);\n}\nstatic INLINE VECTOR_CC vopmask vge_vo_vf_vf(vfloat x, vfloat y) {\n  return vcgeq_f32(x, y);\n}\n\nstatic INLINE VECTOR_CC vopmask veq_vo_vi2_vi2(vint2 x, vint2 y) {\n  return vceqq_s32(x, y);\n}\nstatic INLINE VECTOR_CC vopmask vgt_vo_vi2_vi2(vint2 x, vint2 y) {\n  return vcgtq_s32(x, y);\n}\nstatic INLINE VECTOR_CC vopmask vgt_vo_vi_vi(vint x, vint y) {\n  return vcombine_u32(vcgt_s32(x, y), vdup_n_u32(0));\n}\nstatic INLINE VECTOR_CC vopmask visinf_vo_vf(vfloat d) {\n  return veq_vo_vf_vf(vabs_vf_vf(d), vcast_vf_f(SLEEF_INFINITYf));\n}\nstatic INLINE VECTOR_CC vopmask vispinf_vo_vf(vfloat d) {\n  return veq_vo_vf_vf(d, vcast_vf_f(SLEEF_INFINITYf));\n}\nstatic INLINE VECTOR_CC vopmask visminf_vo_vf(vfloat d) {\n  return veq_vo_vf_vf(d, vcast_vf_f(-SLEEF_INFINITYf));\n}\nstatic INLINE VECTOR_CC vopmask visnan_vo_vf(vfloat d) { return vneq_vo_vf_vf(d, d); }\n\nstatic INLINE VECTOR_CC vopmask vcast_vo32_vo64(vopmask m) {\n  return vuzpq_u32(m, m).val[0];\n}\nstatic INLINE VECTOR_CC vopmask vcast_vo64_vo32(vopmask m) {\n  return vzipq_u32(m, m).val[0];\n}\n\nstatic INLINE VECTOR_CC vopmask vand_vo_vo_vo(vopmask x, vopmask y) {\n  return vandq_u32(x, y);\n}\nstatic INLINE VECTOR_CC vopmask vandnot_vo_vo_vo(vopmask x, vopmask y) {\n  return vbicq_u32(y, x);\n}\nstatic INLINE VECTOR_CC vopmask vor_vo_vo_vo(vopmask x, vopmask y) {\n  return vorrq_u32(x, y);\n}\nstatic INLINE VECTOR_CC vopmask vxor_vo_vo_vo(vopmask x, vopmask y) {\n  return veorq_u32(x, y);\n}\n\nstatic INLINE VECTOR_CC vint2 vsel_vi2_vo_vi2_vi2(vopmask m, vint2 x, vint2 y) {\n  return vbslq_s32(m, x, y);\n}\nstatic INLINE VECTOR_CC vint2 
vand_vi2_vo_vi2(vopmask x, vint2 y) {\n  return vandq_s32(vreinterpretq_s32_u32(x), y);\n}\nstatic INLINE VECTOR_CC vint2 vandnot_vi2_vo_vi2(vopmask x, vint2 y) {\n  return vbicq_s32(y, vreinterpretq_s32_u32(x));\n}\nstatic INLINE VECTOR_CC vint vandnot_vi_vo_vi(vopmask x, vint y) {\n  return vbic_s32(y, vget_low_s32(vreinterpretq_s32_u32(x)));\n}\nstatic INLINE VECTOR_CC vmask vand_vm_vo32_vm(vopmask x, vmask y) {\n  return vandq_u32(x, y);\n}\nstatic INLINE VECTOR_CC vmask vand_vm_vo64_vm(vopmask x, vmask y) {\n  return vandq_u32(x, y);\n}\nstatic INLINE VECTOR_CC vmask vandnot_vm_vo32_vm(vopmask x, vmask y) {\n  return vbicq_u32(y, x);\n}\nstatic INLINE VECTOR_CC vmask vandnot_vm_vo64_vm(vopmask x, vmask y) {\n  return vbicq_u32(y, x);\n}\nstatic INLINE VECTOR_CC vmask vor_vm_vo32_vm(vopmask x, vmask y) {\n  return vorrq_u32(x, y);\n}\nstatic INLINE VECTOR_CC vmask vor_vm_vo64_vm(vopmask x, vmask y) {\n  return vorrq_u32(x, y);\n}\nstatic INLINE VECTOR_CC vmask vxor_vm_vo32_vm(vopmask x, vmask y) {\n  return veorq_u32(x, y);\n}\n\nstatic INLINE VECTOR_CC vfloat vtruncate_vf_vf(vfloat vd) { return vrndq_f32(vd); }\n\nstatic INLINE VECTOR_CC vmask vcast_vm_i_i(int i0, int i1) {\n  return vreinterpretq_u32_u64(vdupq_n_u64((0xffffffff & (uint64_t)i1) | (((uint64_t)i0) << 32)));\n}\n\nstatic INLINE VECTOR_CC vopmask veq64_vo_vm_vm(vmask x, vmask y) {\n  return vreinterpretq_u32_u64(vceqq_s64(vreinterpretq_s64_u32(x), vreinterpretq_s64_u32(y)));\n}\n\nstatic INLINE VECTOR_CC vmask vadd64_vm_vm_vm(vmask x, vmask y) {\n  return vreinterpretq_u32_s64(vaddq_s64(vreinterpretq_s64_u32(x), vreinterpretq_s64_u32(y)));\n}\n\nstatic INLINE VECTOR_CC vint vsel_vi_vo_vi_vi(vopmask m, vint x, vint y) {\n  return vbsl_s32(vget_low_u32(m), x, y);\n}\n\n// Logical operations\nstatic INLINE VECTOR_CC vint vand_vi_vo_vi(vopmask x, vint y) {\n  return vand_s32(vreinterpret_s32_u32(vget_low_u32(x)), y);\n}\n\nstatic INLINE VECTOR_CC vint2 vcastu_vi2_vi(vint vi) {\n  return 
vreinterpretq_s32_u32(vrev64q_u32(vreinterpretq_u32_u64(vmovl_u32(vreinterpret_u32_s32(vi)))));\n}\nstatic INLINE VECTOR_CC vint vcastu_vi_vi2(vint2 vi2) {\n  return vreinterpret_s32_u32(vmovn_u64(vreinterpretq_u64_u32(vrev64q_u32(vreinterpretq_u32_s32(vi2)))));\n}\nstatic INLINE VECTOR_CC vdouble vreinterpret_vd_vi2(vint2 vi) {\n  return vreinterpretq_f64_s32(vi);\n}\nstatic INLINE VECTOR_CC vdouble vtruncate_vd_vd(vdouble vd) { return vrndq_f64(vd); }\n\n//\n\n#define PNMASK ((vdouble) { +0.0, -0.0 })\n#define NPMASK ((vdouble) { -0.0, +0.0 })\n#define PNMASKf ((vfloat) { +0.0f, -0.0f, +0.0f, -0.0f })\n#define NPMASKf ((vfloat) { -0.0f, +0.0f, -0.0f, +0.0f })\n\nstatic INLINE VECTOR_CC vdouble vposneg_vd_vd(vdouble d) { return vreinterpret_vd_vm(vxor_vm_vm_vm(vreinterpret_vm_vd(d), vreinterpret_vm_vd(PNMASK))); }\nstatic INLINE VECTOR_CC vdouble vnegpos_vd_vd(vdouble d) { return vreinterpret_vd_vm(vxor_vm_vm_vm(vreinterpret_vm_vd(d), vreinterpret_vm_vd(NPMASK))); }\nstatic INLINE VECTOR_CC vfloat vposneg_vf_vf(vfloat d) { return (vfloat)vxor_vm_vm_vm((vmask)d, (vmask)PNMASKf); }\nstatic INLINE VECTOR_CC vfloat vnegpos_vf_vf(vfloat d) { return (vfloat)vxor_vm_vm_vm((vmask)d, (vmask)NPMASKf); }\n\nstatic INLINE VECTOR_CC vdouble vsubadd_vd_vd_vd(vdouble x, vdouble y) { return vadd_vd_vd_vd(x, vnegpos_vd_vd(y)); }\nstatic INLINE VECTOR_CC vfloat vsubadd_vf_vf_vf(vfloat d0, vfloat d1) { return vadd_vf_vf_vf(d0, vnegpos_vf_vf(d1)); }\nstatic INLINE VECTOR_CC vdouble vmlsubadd_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return vsubadd_vd_vd_vd(vmul_vd_vd_vd(x, y), z); }\nstatic INLINE VECTOR_CC vfloat vmlsubadd_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return vsubadd_vf_vf_vf(vmul_vf_vf_vf(x, y), z); }\n\nstatic INLINE VECTOR_CC vdouble vrev21_vd_vd(vdouble d0) { return (float64x2_t)vcombine_u64(vget_high_u64((uint64x2_t)d0), vget_low_u64((uint64x2_t)d0)); }\nstatic INLINE VECTOR_CC vdouble vreva2_vd_vd(vdouble vd) { return vd; }\n\nstatic INLINE VECTOR_CC void 
vstream_v_p_vd(double *ptr, vdouble v) { vstore_v_p_vd(ptr, v); }\nstatic INLINE VECTOR_CC void vscatter2_v_p_i_i_vd(double *ptr, int offset, int step, vdouble v) { vstore_v_p_vd((double *)(&ptr[2*offset]), v); }\nstatic INLINE VECTOR_CC void vsscatter2_v_p_i_i_vd(double *ptr, int offset, int step, vdouble v) { vstore_v_p_vd((double *)(&ptr[2*offset]), v); }\n\nstatic INLINE VECTOR_CC vfloat vrev21_vf_vf(vfloat d0) { return vrev64q_f32(d0); }\nstatic INLINE VECTOR_CC vfloat vreva2_vf_vf(vfloat d0) { return vcombine_f32(vget_high_f32(d0), vget_low_f32(d0)); }\nstatic INLINE VECTOR_CC vint2 vrev21_vi2_vi2(vint2 i) { return vreinterpret_vi2_vf(vrev21_vf_vf(vreinterpret_vf_vi2(i))); }\n\nstatic INLINE VECTOR_CC void vstream_v_p_vf(float *ptr, vfloat v) { vstore_v_p_vf(ptr, v); }\n\nstatic INLINE VECTOR_CC void vscatter2_v_p_i_i_vf(float *ptr, int offset, int step, vfloat v) {\n  vst1_f32((float *)(ptr+(offset + step * 0)*2), vget_low_f32(v));\n  vst1_f32((float *)(ptr+(offset + step * 1)*2), vget_high_f32(v));\n}\n\nstatic INLINE VECTOR_CC void vsscatter2_v_p_i_i_vf(float *ptr, int offset, int step, vfloat v) {\n  vst1_f32((float *)(ptr+(offset + step * 0)*2), vget_low_f32(v));\n  vst1_f32((float *)(ptr+(offset + step * 1)*2), vget_high_f32(v));\n}\n\n//\n\nstatic INLINE vmask2 vinterleave_vm2_vm2(vmask2 v) {\n  return (vmask2) {\n    vreinterpretq_u32_u64(vtrn1q_u64(vreinterpretq_u64_u32(v.x), vreinterpretq_u64_u32(v.y))),\n    vreinterpretq_u32_u64(vtrn2q_u64(vreinterpretq_u64_u32(v.x), vreinterpretq_u64_u32(v.y))) };\n}\n\nstatic INLINE vmask2 vuninterleave_vm2_vm2(vmask2 v) {\n  return (vmask2) {\n    vreinterpretq_u32_u64(vtrn1q_u64(vreinterpretq_u64_u32(v.x), vreinterpretq_u64_u32(v.y))),\n    vreinterpretq_u32_u64(vtrn2q_u64(vreinterpretq_u64_u32(v.x), vreinterpretq_u64_u32(v.y))) };\n}\n\nstatic INLINE vint vuninterleave_vi_vi(vint v) { return v; }\nstatic INLINE vdouble vinterleave_vd_vd(vdouble vd) { return vd; }\nstatic INLINE vdouble 
vuninterleave_vd_vd(vdouble vd) { return vd; }\nstatic INLINE vmask vinterleave_vm_vm(vmask vm) { return vm; }\nstatic INLINE vmask vuninterleave_vm_vm(vmask vm) { return vm; }\n\nstatic vmask2 vloadu_vm2_p(void *p) {\n  vmask2 vm2;\n  memcpy(&vm2, p, VECTLENDP * 16);\n  return vm2;\n}\n\n#if !defined(SLEEF_GENHEADER)\ntypedef Sleef_quad2 vargquad;\n\nstatic INLINE vmask2 vcast_vm2_aq(vargquad aq) {\n  return vinterleave_vm2_vm2(vloadu_vm2_p(&aq));\n}\n\nstatic INLINE vargquad vcast_aq_vm2(vmask2 vm2) {\n  vm2 = vuninterleave_vm2_vm2(vm2);\n  vargquad aq;\n  memcpy(&aq, &vm2, VECTLENDP * 16);\n  return aq;\n}\n#endif // #if !defined(SLEEF_GENHEADER)\n\nstatic INLINE int vtestallzeros_i_vo64(vopmask g) {\n  uint32x2_t x0 = vorr_u32(vget_low_u32(g), vget_high_u32(g));\n  uint32x2_t x1 = vpmax_u32(x0, x0);\n  return ~vget_lane_u32(x1, 0);\n}\n\nstatic INLINE vmask vsel_vm_vo64_vm_vm(vopmask m, vmask x, vmask y) { return vbslq_u32(m, x, y); }\n\nstatic INLINE vmask vsub64_vm_vm_vm(vmask x, vmask y) {\n  return vreinterpretq_u32_s64(vsubq_s64(vreinterpretq_s64_u32(x), vreinterpretq_s64_u32(y)));\n}\n\nstatic INLINE vmask vneg64_vm_vm(vmask x) {\n  return vreinterpretq_u32_s64(vnegq_s64(vreinterpretq_s64_u32(x)));\n}\n\nstatic INLINE vopmask vgt64_vo_vm_vm(vmask x, vmask y) {\n  return vreinterpretq_u32_u64(vcgtq_s64(vreinterpretq_s64_u32(x), vreinterpretq_s64_u32(y)));\n}\n\n#define vsll64_vm_vm_i(x, c) vreinterpretq_u32_u64(vshlq_n_u64(vreinterpretq_u64_u32(x), c))\n//@#define vsll64_vm_vm_i(x, c) vreinterpretq_u32_u64(vshlq_n_u64(vreinterpretq_u64_u32(x), c))\n#define vsrl64_vm_vm_i(x, c) vreinterpretq_u32_u64(vshrq_n_u64(vreinterpretq_u64_u32(x), c))\n//@#define vsrl64_vm_vm_i(x, c) vreinterpretq_u32_u64(vshrq_n_u64(vreinterpretq_u64_u32(x), c))\n\nstatic INLINE vmask vcast_vm_vi(vint vi) {\n  vmask m = vreinterpretq_u32_u64(vmovl_u32(vreinterpret_u32_s32(vi)));\n  return 
vor_vm_vm_vm(vcast_vm_vi2(vcastu_vi2_vi(vreinterpret_s32_u32(vget_low_u32(vgt_vo_vi_vi(vcast_vi_i(0), vi))))), m);\n}\nstatic INLINE vint vcast_vi_vm(vmask vm) { return vreinterpret_s32_u32(vmovn_u64(vreinterpretq_u64_u32(vm))); }\n"
  },
  {
    "path": "src/helperavx.h",
    "content": "//   Copyright Naoki Shibata and contributors 2010 - 2020.\n// Distributed under the Boost Software License, Version 1.0.\n//    (See accompanying file LICENSE.txt or copy at\n//          http://www.boost.org/LICENSE_1_0.txt)\n\n#if CONFIG == 1\n\n#if !defined(__AVX__) && !defined(SLEEF_GENHEADER)\n#error Please specify -mavx.\n#endif\n\n#elif CONFIG == 4\n\n#if (!defined(__AVX__) || !defined(__FMA4__)) && !defined(SLEEF_GENHEADER)\n#error Please specify -mavx and -mfma4.\n#endif\n\n#else\n#error CONFIG macro invalid or not defined\n#endif\n\n#define ENABLE_DP\n//@#define ENABLE_DP\n#define LOG2VECTLENDP 2\n//@#define LOG2VECTLENDP 2\n#define VECTLENDP (1 << LOG2VECTLENDP)\n//@#define VECTLENDP (1 << LOG2VECTLENDP)\n\n#define ENABLE_SP\n//@#define ENABLE_SP\n#define LOG2VECTLENSP (LOG2VECTLENDP+1)\n//@#define LOG2VECTLENSP (LOG2VECTLENDP+1)\n#define VECTLENSP (1 << LOG2VECTLENSP)\n//@#define VECTLENSP (1 << LOG2VECTLENSP)\n\n#define FULL_FP_ROUNDING\n//@#define FULL_FP_ROUNDING\n#define ACCURATE_SQRT\n//@#define ACCURATE_SQRT\n\n#if !defined(SLEEF_GENHEADER)\n#if defined(_MSC_VER)\n#include <intrin.h>\n#else\n#include <x86intrin.h>\n#endif\n\n#include <stdint.h>\n#include \"misc.h\"\n#endif // #if !defined(SLEEF_GENHEADER)\n\ntypedef __m256i vmask;\ntypedef __m256i vopmask;\n\ntypedef __m256d vdouble;\ntypedef __m128i vint;\n\ntypedef __m256 vfloat;\ntypedef struct { __m128i x, y; } vint2;\n\ntypedef struct {\n  vmask x, y;\n} vmask2;\n\n//\n\n#if !defined(SLEEF_GENHEADER)\n\n#ifndef __SLEEF_H__\nstatic inline\n                       void Sleef_x86CpuID(int32_t out[4], uint32_t eax,\n                                           uint32_t ecx) {\n                         /* We don't care for cpuid detection */\n                         out[0] = 0xFFFFFFFF;\n                         out[1] = 0xFFFFFFFF;\n                         out[2] = 0xFFFFFFFF;\n                         out[3] = 0xFFFFFFFF;\n                       }\n                       
#endif\n\nstatic INLINE int cpuSupportsAVX() {\n    int32_t reg[4];\n    Sleef_x86CpuID(reg, 1, 0);\n    return (reg[2] & (1 << 28)) != 0;\n}\n\nstatic INLINE int cpuSupportsFMA4() {\n    int32_t reg[4];\n    Sleef_x86CpuID(reg, 0x80000001, 0);\n    return (reg[2] & (1 << 16)) != 0;\n}\n\n#if CONFIG == 4 && defined(__AVX__) && defined(__FMA4__)\nstatic INLINE int vavailability_i(int name) {\n  //int d = __builtin_cpu_supports(\"avx\") && __builtin_cpu_supports(\"fma4\");\n  int d = cpuSupportsAVX() && cpuSupportsFMA4();\n  return d ? 3 : 0;\n}\n\n//typedef vint2 vint2_fma4;\n\n#define ENABLE_FMA_DP\n#define ENABLE_FMA_SP\n\n#define ISANAME \"AVX + AMD FMA4\"\n#define DFTPRIORITY 21\n#else\nstatic INLINE int vavailability_i(int name) {\n  int d = cpuSupportsAVX();\n  return d ? 3 : 0;\n}\n//typedef vint2 vint2_avx;\n\n#define ISANAME \"AVX\"\n#define DFTPRIORITY 20\n#endif\n\n#endif // #if !defined(SLEEF_GENHEADER)\n\nstatic INLINE void vprefetch_v_p(const void *ptr) { _mm_prefetch(ptr, _MM_HINT_T0); }\n\nstatic INLINE int vtestallones_i_vo32(vopmask g) {\n  return _mm_test_all_ones(_mm_and_si128(_mm256_extractf128_si256(g, 0), _mm256_extractf128_si256(g, 1)));\n}\n\nstatic INLINE int vtestallones_i_vo64(vopmask g) {\n  return _mm_test_all_ones(_mm_and_si128(_mm256_extractf128_si256(g, 0), _mm256_extractf128_si256(g, 1)));\n}\n\n//\n\nstatic INLINE vdouble vcast_vd_d(double d) { return _mm256_set1_pd(d); }\nstatic INLINE vmask vreinterpret_vm_vd(vdouble vd) { return _mm256_castpd_si256(vd); }\nstatic INLINE vdouble vreinterpret_vd_vm(vmask vm) { return _mm256_castsi256_pd(vm);  }\nstatic INLINE vint2 vreinterpret_vi2_vd(vdouble vd) {\n  vint2 r;\n  r.x = _mm256_castsi256_si128(vreinterpret_vm_vd(vd));\n  r.y = _mm256_extractf128_si256(vreinterpret_vm_vd(vd), 1);\n  return r;\n}\nstatic INLINE vdouble vreinterpret_vd_vi2(vint2 vi) {\n  vmask m = _mm256_castsi128_si256(vi.x);\n  m = _mm256_insertf128_si256(m, vi.y, 1);\n  return 
vreinterpret_vd_vm(m);\n}\n\n//\n\nstatic vint2 vloadu_vi2_p(int32_t *p) {\n  vint2 r;\n  r.x = _mm_loadu_si128((__m128i *) p     );\n  r.y = _mm_loadu_si128((__m128i *)(p + 4));\n  return r;\n}\n\nstatic void vstoreu_v_p_vi2(int32_t *p, vint2 v) {\n  _mm_storeu_si128((__m128i *) p     , v.x);\n  _mm_storeu_si128((__m128i *)(p + 4), v.y);  \n}\n\nstatic vint vloadu_vi_p(int32_t *p) { return _mm_loadu_si128((__m128i *)p); }\nstatic void vstoreu_v_p_vi(int32_t *p, vint v) { _mm_storeu_si128((__m128i *)p, v); }\n\n//\n\nstatic INLINE vmask vand_vm_vm_vm(vmask x, vmask y) { return vreinterpret_vm_vd(_mm256_and_pd(vreinterpret_vd_vm(x), vreinterpret_vd_vm(y))); }\nstatic INLINE vmask vandnot_vm_vm_vm(vmask x, vmask y) { return vreinterpret_vm_vd(_mm256_andnot_pd(vreinterpret_vd_vm(x), vreinterpret_vd_vm(y))); }\nstatic INLINE vmask vor_vm_vm_vm(vmask x, vmask y) { return vreinterpret_vm_vd(_mm256_or_pd(vreinterpret_vd_vm(x), vreinterpret_vd_vm(y))); }\nstatic INLINE vmask vxor_vm_vm_vm(vmask x, vmask y) { return vreinterpret_vm_vd(_mm256_xor_pd(vreinterpret_vd_vm(x), vreinterpret_vd_vm(y))); }\n\nstatic INLINE vopmask vand_vo_vo_vo(vopmask x, vopmask y) { return vreinterpret_vm_vd(_mm256_and_pd(vreinterpret_vd_vm(x), vreinterpret_vd_vm(y))); }\nstatic INLINE vopmask vandnot_vo_vo_vo(vopmask x, vopmask y) { return vreinterpret_vm_vd(_mm256_andnot_pd(vreinterpret_vd_vm(x), vreinterpret_vd_vm(y))); }\nstatic INLINE vopmask vor_vo_vo_vo(vopmask x, vopmask y) { return vreinterpret_vm_vd(_mm256_or_pd(vreinterpret_vd_vm(x), vreinterpret_vd_vm(y))); }\nstatic INLINE vopmask vxor_vo_vo_vo(vopmask x, vopmask y) { return vreinterpret_vm_vd(_mm256_xor_pd(vreinterpret_vd_vm(x), vreinterpret_vd_vm(y))); }\n\nstatic INLINE vmask vand_vm_vo64_vm(vopmask x, vmask y) { return vreinterpret_vm_vd(_mm256_and_pd(vreinterpret_vd_vm(x), vreinterpret_vd_vm(y))); }\nstatic INLINE vmask vandnot_vm_vo64_vm(vopmask x, vmask y) { return vreinterpret_vm_vd(_mm256_andnot_pd(vreinterpret_vd_vm(x), 
vreinterpret_vd_vm(y))); }\nstatic INLINE vmask vor_vm_vo64_vm(vopmask x, vmask y) { return vreinterpret_vm_vd(_mm256_or_pd(vreinterpret_vd_vm(x), vreinterpret_vd_vm(y))); }\nstatic INLINE vmask vxor_vm_vo64_vm(vopmask x, vmask y) { return vreinterpret_vm_vd(_mm256_xor_pd(vreinterpret_vd_vm(x), vreinterpret_vd_vm(y))); }\n\nstatic INLINE vmask vand_vm_vo32_vm(vopmask x, vmask y) { return vreinterpret_vm_vd(_mm256_and_pd(vreinterpret_vd_vm(x), vreinterpret_vd_vm(y))); }\nstatic INLINE vmask vandnot_vm_vo32_vm(vopmask x, vmask y) { return vreinterpret_vm_vd(_mm256_andnot_pd(vreinterpret_vd_vm(x), vreinterpret_vd_vm(y))); }\nstatic INLINE vmask vor_vm_vo32_vm(vopmask x, vmask y) { return vreinterpret_vm_vd(_mm256_or_pd(vreinterpret_vd_vm(x), vreinterpret_vd_vm(y))); }\nstatic INLINE vmask vxor_vm_vo32_vm(vopmask x, vmask y) { return vreinterpret_vm_vd(_mm256_xor_pd(vreinterpret_vd_vm(x), vreinterpret_vd_vm(y))); }\n\nstatic INLINE vopmask vcast_vo32_vo64(vopmask o) {\n  return _mm256_castsi128_si256(_mm256_cvtpd_epi32(_mm256_and_pd(vreinterpret_vd_vm(o), _mm256_set1_pd(-1.0))));\n}\n\nstatic INLINE vopmask vcast_vo64_vo32(vopmask o) {\n  return vreinterpret_vm_vd(_mm256_cmp_pd(_mm256_cvtepi32_pd(_mm256_castsi256_si128(o)), _mm256_set1_pd(-1.0), _CMP_EQ_OQ));\n}\n\n//\n\nstatic INLINE vint vrint_vi_vd(vdouble vd) { return _mm256_cvtpd_epi32(vd); }\nstatic INLINE vint vtruncate_vi_vd(vdouble vd) { return _mm256_cvttpd_epi32(vd); }\nstatic INLINE vdouble vrint_vd_vd(vdouble vd) { return _mm256_round_pd(vd, _MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC); }\nstatic INLINE vdouble vtruncate_vd_vd(vdouble vd) { return _mm256_round_pd(vd, _MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC); }\nstatic INLINE vfloat vrint_vf_vf(vfloat vd) { return _mm256_round_ps(vd, _MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC); }\nstatic INLINE vfloat vtruncate_vf_vf(vfloat vf) { return _mm256_round_ps(vf, _MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC); }\nstatic INLINE vdouble vcast_vd_vi(vint vi) { return 
_mm256_cvtepi32_pd(vi); }\nstatic INLINE vint vcast_vi_i(int i) { return _mm_set1_epi32(i); }\nstatic INLINE vint2 vcastu_vi2_vi(vint vi) {\n  vint2 r;\n  r.x = _mm_and_si128(_mm_shuffle_epi32(vi, 0x40), _mm_set_epi32(-1, 0, -1, 0));\n  r.y = _mm_and_si128(_mm_shuffle_epi32(vi, 0xc8), _mm_set_epi32(-1, 0, -1, 0));\n  return r;\n}\n\nstatic INLINE vint vcastu_vi_vi2(vint2 vi) {\n  return _mm_or_si128(_mm_and_si128(_mm_shuffle_epi32(vi.x, 0x0d), _mm_set_epi32( 0,  0, -1, -1)),\n\t\t      _mm_and_si128(_mm_shuffle_epi32(vi.y, 0xd0), _mm_set_epi32(-1, -1,  0,  0)));\n}\n\nstatic INLINE vmask vcast_vm_i_i(int i0, int i1) {\n  return _mm256_set_epi32(i0, i1, i0, i1, i0, i1, i0, i1);\n}\n\nstatic INLINE vopmask veq64_vo_vm_vm(vmask x, vmask y) {\n  return vreinterpret_vm_vd(_mm256_cmp_pd(vreinterpret_vd_vm(vxor_vm_vm_vm(vxor_vm_vm_vm(x, y), vreinterpret_vm_vd(_mm256_set1_pd(1.0)))), _mm256_set1_pd(1.0), _CMP_EQ_OQ));\n}\n\n//\n\nstatic INLINE vdouble vadd_vd_vd_vd(vdouble x, vdouble y) { return _mm256_add_pd(x, y); }\nstatic INLINE vdouble vsub_vd_vd_vd(vdouble x, vdouble y) { return _mm256_sub_pd(x, y); }\nstatic INLINE vdouble vmul_vd_vd_vd(vdouble x, vdouble y) { return _mm256_mul_pd(x, y); }\nstatic INLINE vdouble vdiv_vd_vd_vd(vdouble x, vdouble y) { return _mm256_div_pd(x, y); }\nstatic INLINE vdouble vrec_vd_vd(vdouble x) { return _mm256_div_pd(_mm256_set1_pd(1), x); }\nstatic INLINE vdouble vsqrt_vd_vd(vdouble x) { return _mm256_sqrt_pd(x); }\nstatic INLINE vdouble vabs_vd_vd(vdouble d) { return _mm256_andnot_pd(_mm256_set1_pd(-0.0), d); }\nstatic INLINE vdouble vneg_vd_vd(vdouble d) { return _mm256_xor_pd(_mm256_set1_pd(-0.0), d); }\nstatic INLINE vdouble vmax_vd_vd_vd(vdouble x, vdouble y) { return _mm256_max_pd(x, y); }\nstatic INLINE vdouble vmin_vd_vd_vd(vdouble x, vdouble y) { return _mm256_min_pd(x, y); }\n\n#if CONFIG == 1\nstatic INLINE vdouble vmla_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return vadd_vd_vd_vd(vmul_vd_vd_vd(x, y), z); }\nstatic 
INLINE vdouble vmlapn_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return vsub_vd_vd_vd(vmul_vd_vd_vd(x, y), z); }\nstatic INLINE vdouble vmlanp_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return vsub_vd_vd_vd(z, vmul_vd_vd_vd(x, y)); }\n#else\nstatic INLINE vdouble vmla_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return _mm256_macc_pd(x, y, z); }\nstatic INLINE vdouble vmlapn_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return _mm256_msub_pd(x, y, z); }\nstatic INLINE vdouble vmlanp_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return _mm256_nmacc_pd(x, y, z); }\nstatic INLINE vdouble vfma_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return _mm256_macc_pd(x, y, z); }\nstatic INLINE vdouble vfmapp_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return _mm256_macc_pd(x, y, z); }\nstatic INLINE vdouble vfmapn_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return _mm256_msub_pd(x, y, z); }\nstatic INLINE vdouble vfmanp_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return _mm256_nmacc_pd(x, y, z); }\nstatic INLINE vdouble vfmann_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return _mm256_nmsub_pd(x, y, z); }\n#endif\n\nstatic INLINE vopmask veq_vo_vd_vd(vdouble x, vdouble y) { return vreinterpret_vm_vd(_mm256_cmp_pd(x, y, _CMP_EQ_OQ)); }\nstatic INLINE vopmask vneq_vo_vd_vd(vdouble x, vdouble y) { return vreinterpret_vm_vd(_mm256_cmp_pd(x, y, _CMP_NEQ_UQ)); }\nstatic INLINE vopmask vlt_vo_vd_vd(vdouble x, vdouble y) { return vreinterpret_vm_vd(_mm256_cmp_pd(x, y, _CMP_LT_OQ)); }\nstatic INLINE vopmask vle_vo_vd_vd(vdouble x, vdouble y) { return vreinterpret_vm_vd(_mm256_cmp_pd(x, y, _CMP_LE_OQ)); }\nstatic INLINE vopmask vgt_vo_vd_vd(vdouble x, vdouble y) { return vreinterpret_vm_vd(_mm256_cmp_pd(x, y, _CMP_GT_OQ)); }\nstatic INLINE vopmask vge_vo_vd_vd(vdouble x, vdouble y) { return vreinterpret_vm_vd(_mm256_cmp_pd(x, y, _CMP_GE_OQ)); }\n\n//\n\nstatic INLINE vint vadd_vi_vi_vi(vint x, vint y) { return _mm_add_epi32(x, y); }\nstatic INLINE vint 
vsub_vi_vi_vi(vint x, vint y) { return _mm_sub_epi32(x, y); }\nstatic INLINE vint vneg_vi_vi(vint e) { return vsub_vi_vi_vi(vcast_vi_i(0), e); }\n\nstatic INLINE vint vand_vi_vi_vi(vint x, vint y) { return _mm_and_si128(x, y); }\nstatic INLINE vint vandnot_vi_vi_vi(vint x, vint y) { return _mm_andnot_si128(x, y); }\nstatic INLINE vint vor_vi_vi_vi(vint x, vint y) { return _mm_or_si128(x, y); }\nstatic INLINE vint vxor_vi_vi_vi(vint x, vint y) { return _mm_xor_si128(x, y); }\n\nstatic INLINE vint vandnot_vi_vo_vi(vopmask m, vint y) { return _mm_andnot_si128(_mm256_castsi256_si128(m), y); }\nstatic INLINE vint vand_vi_vo_vi(vopmask m, vint y) { return _mm_and_si128(_mm256_castsi256_si128(m), y); }\n\nstatic INLINE vint vsll_vi_vi_i(vint x, int c) { return _mm_slli_epi32(x, c); }\nstatic INLINE vint vsrl_vi_vi_i(vint x, int c) { return _mm_srli_epi32(x, c); }\nstatic INLINE vint vsra_vi_vi_i(vint x, int c) { return _mm_srai_epi32(x, c); }\n\nstatic INLINE vint veq_vi_vi_vi(vint x, vint y) { return _mm_cmpeq_epi32(x, y); }\nstatic INLINE vint vgt_vi_vi_vi(vint x, vint y) { return _mm_cmpgt_epi32(x, y); }\n\nstatic INLINE vopmask veq_vo_vi_vi(vint x, vint y) { return _mm256_castsi128_si256(_mm_cmpeq_epi32(x, y)); }\nstatic INLINE vopmask vgt_vo_vi_vi(vint x, vint y) { return _mm256_castsi128_si256(_mm_cmpgt_epi32(x, y)); }\n\nstatic INLINE vint vsel_vi_vo_vi_vi(vopmask o, vint x, vint y) { return _mm_blendv_epi8(y, x, _mm256_castsi256_si128(o)); }\n\nstatic INLINE vdouble vsel_vd_vo_vd_vd(vopmask o, vdouble x, vdouble y) { return _mm256_blendv_pd(y, x, _mm256_castsi256_pd(o)); }\n\nstatic INLINE CONST vdouble vsel_vd_vo_d_d(vopmask o, double v1, double v0) {\n  return vsel_vd_vo_vd_vd(o, vcast_vd_d(v1), vcast_vd_d(v0));\n}\n\nstatic INLINE vdouble vsel_vd_vo_vo_d_d_d(vopmask o0, vopmask o1, double d0, double d1, double d2) {\n  return vsel_vd_vo_vd_vd(o0, vcast_vd_d(d0), vsel_vd_vo_d_d(o1, d1, d2));\n}\n\nstatic INLINE vdouble vsel_vd_vo_vo_vo_d_d_d_d(vopmask o0, 
vopmask o1, vopmask o2, double d0, double d1, double d2, double d3) {\n  return vsel_vd_vo_vd_vd(o0, vcast_vd_d(d0), vsel_vd_vo_vd_vd(o1, vcast_vd_d(d1), vsel_vd_vo_d_d(o2, d2, d3)));\n}\n\nstatic INLINE vopmask visinf_vo_vd(vdouble d) {\n  return vreinterpret_vm_vd(_mm256_cmp_pd(vabs_vd_vd(d), _mm256_set1_pd(SLEEF_INFINITY), _CMP_EQ_OQ));\n}\n\nstatic INLINE vopmask vispinf_vo_vd(vdouble d) {\n  return vreinterpret_vm_vd(_mm256_cmp_pd(d, _mm256_set1_pd(SLEEF_INFINITY), _CMP_EQ_OQ));\n}\n\nstatic INLINE vopmask visminf_vo_vd(vdouble d) {\n  return vreinterpret_vm_vd(_mm256_cmp_pd(d, _mm256_set1_pd(-SLEEF_INFINITY), _CMP_EQ_OQ));\n}\n\nstatic INLINE vopmask visnan_vo_vd(vdouble d) {\n  return vreinterpret_vm_vd(_mm256_cmp_pd(d, d, _CMP_NEQ_UQ));\n}\n\nstatic INLINE vdouble vload_vd_p(const double *ptr) { return _mm256_load_pd(ptr); }\nstatic INLINE vdouble vloadu_vd_p(const double *ptr) { return _mm256_loadu_pd(ptr); }\n\nstatic INLINE void vstore_v_p_vd(double *ptr, vdouble v) { _mm256_store_pd(ptr, v); }\nstatic INLINE void vstoreu_v_p_vd(double *ptr, vdouble v) { _mm256_storeu_pd(ptr, v); }\n\nstatic INLINE vdouble vgather_vd_p_vi(const double *ptr, vint vi) {\n  int a[VECTLENDP];\n  vstoreu_v_p_vi(a, vi);\n  return _mm256_set_pd(ptr[a[3]], ptr[a[2]], ptr[a[1]], ptr[a[0]]);\n}\n\n#if defined(_MSC_VER)\n// This function is needed when debugging on MSVC.\nstatic INLINE double vcast_d_vd(vdouble v) {\n  double a[VECTLENDP];\n  vstoreu_v_p_vd(a, v);\n  return a[0];\n}\n#endif\n\n//\n\nstatic INLINE vint2 vcast_vi2_vm(vmask vm) {\n  vint2 r;\n  r.x = _mm256_castsi256_si128(vm);\n  r.y = _mm256_extractf128_si256(vm, 1);\n  return r;\n}\n\nstatic INLINE vmask vcast_vm_vi2(vint2 vi) {\n  vmask m = _mm256_castsi128_si256(vi.x);\n  m = _mm256_insertf128_si256(m, vi.y, 1);\n  return m;\n}\n\nstatic INLINE vint2 vrint_vi2_vf(vfloat vf) { return vcast_vi2_vm(_mm256_cvtps_epi32(vf)); }\nstatic INLINE vint2 vtruncate_vi2_vf(vfloat vf) { return 
vcast_vi2_vm(_mm256_cvttps_epi32(vf)); }\nstatic INLINE vfloat vcast_vf_vi2(vint2 vi) { return _mm256_cvtepi32_ps(vcast_vm_vi2(vi)); }\nstatic INLINE vfloat vcast_vf_f(float f) { return _mm256_set1_ps(f); }\nstatic INLINE vint2 vcast_vi2_i(int i) { vint2 r; r.x = r.y = _mm_set1_epi32(i); return r; }\nstatic INLINE vmask vreinterpret_vm_vf(vfloat vf) { return _mm256_castps_si256(vf); }\nstatic INLINE vfloat vreinterpret_vf_vm(vmask vm) { return _mm256_castsi256_ps(vm); }\n\nstatic INLINE vfloat vreinterpret_vf_vi2(vint2 vi) { return vreinterpret_vf_vm(vcast_vm_vi2(vi)); }\nstatic INLINE vint2 vreinterpret_vi2_vf(vfloat vf) { return vcast_vi2_vm(vreinterpret_vm_vf(vf)); }\n\nstatic INLINE vfloat vadd_vf_vf_vf(vfloat x, vfloat y) { return _mm256_add_ps(x, y); }\nstatic INLINE vfloat vsub_vf_vf_vf(vfloat x, vfloat y) { return _mm256_sub_ps(x, y); }\nstatic INLINE vfloat vmul_vf_vf_vf(vfloat x, vfloat y) { return _mm256_mul_ps(x, y); }\nstatic INLINE vfloat vdiv_vf_vf_vf(vfloat x, vfloat y) { return _mm256_div_ps(x, y); }\nstatic INLINE vfloat vrec_vf_vf(vfloat x) { return vdiv_vf_vf_vf(vcast_vf_f(1.0f), x); }\nstatic INLINE vfloat vsqrt_vf_vf(vfloat x) { return _mm256_sqrt_ps(x); }\nstatic INLINE vfloat vabs_vf_vf(vfloat f) { return vreinterpret_vf_vm(vandnot_vm_vm_vm(vreinterpret_vm_vf(vcast_vf_f(-0.0f)), vreinterpret_vm_vf(f))); }\nstatic INLINE vfloat vneg_vf_vf(vfloat d) { return vreinterpret_vf_vm(vxor_vm_vm_vm(vreinterpret_vm_vf(vcast_vf_f(-0.0f)), vreinterpret_vm_vf(d))); }\nstatic INLINE vfloat vmax_vf_vf_vf(vfloat x, vfloat y) { return _mm256_max_ps(x, y); }\nstatic INLINE vfloat vmin_vf_vf_vf(vfloat x, vfloat y) { return _mm256_min_ps(x, y); }\n\n#if CONFIG == 1\nstatic INLINE vfloat vmla_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return vadd_vf_vf_vf(vmul_vf_vf_vf(x, y), z); }\nstatic INLINE vfloat vmlanp_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return vsub_vf_vf_vf(z, vmul_vf_vf_vf(x, y)); }\nstatic INLINE vfloat vmlapn_vf_vf_vf_vf(vfloat x, vfloat y, 
vfloat z) { return vsub_vf_vf_vf(vmul_vf_vf_vf(x, y), z); }\n#else\nstatic INLINE vfloat vmla_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return _mm256_macc_ps(x, y, z); }\nstatic INLINE vfloat vmlanp_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return _mm256_nmacc_ps(x, y, z); }\nstatic INLINE vfloat vmlapn_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return _mm256_msub_ps(x, y, z); }\nstatic INLINE vfloat vfma_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return _mm256_macc_ps(x, y, z); }\nstatic INLINE vfloat vfmapp_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return _mm256_macc_ps(x, y, z); }\nstatic INLINE vfloat vfmapn_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return _mm256_msub_ps(x, y, z); }\nstatic INLINE vfloat vfmanp_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return _mm256_nmacc_ps(x, y, z); }\nstatic INLINE vfloat vfmann_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return _mm256_nmsub_ps(x, y, z); }\n#endif\n\nstatic INLINE vopmask veq_vo_vf_vf(vfloat x, vfloat y) { return vreinterpret_vm_vf(_mm256_cmp_ps(x, y, _CMP_EQ_OQ)); }\nstatic INLINE vopmask vneq_vo_vf_vf(vfloat x, vfloat y) { return vreinterpret_vm_vf(_mm256_cmp_ps(x, y, _CMP_NEQ_UQ)); }\nstatic INLINE vopmask vlt_vo_vf_vf(vfloat x, vfloat y) { return vreinterpret_vm_vf(_mm256_cmp_ps(x, y, _CMP_LT_OQ)); }\nstatic INLINE vopmask vle_vo_vf_vf(vfloat x, vfloat y) { return vreinterpret_vm_vf(_mm256_cmp_ps(x, y, _CMP_LE_OQ)); }\nstatic INLINE vopmask vgt_vo_vf_vf(vfloat x, vfloat y) { return vreinterpret_vm_vf(_mm256_cmp_ps(x, y, _CMP_GT_OQ)); }\nstatic INLINE vopmask vge_vo_vf_vf(vfloat x, vfloat y) { return vreinterpret_vm_vf(_mm256_cmp_ps(x, y, _CMP_GE_OQ)); }\n\nstatic INLINE vint2 vadd_vi2_vi2_vi2(vint2 x, vint2 y) {\n  vint2 vi = { _mm_add_epi32(x.x, y.x), _mm_add_epi32(x.y, y.y) };\n  return vi;\n}\n\nstatic INLINE vint2 vsub_vi2_vi2_vi2(vint2 x, vint2 y) {\n  vint2 vi = { _mm_sub_epi32(x.x, y.x), _mm_sub_epi32(x.y, y.y) };\n  return vi;\n}\n\nstatic INLINE vint2 vneg_vi2_vi2(vint2 e) {\n  vint2 
vi = { _mm_sub_epi32(_mm_set1_epi32(0), e.x), _mm_sub_epi32(_mm_set1_epi32(0), e.y) };\n  return vi;\n}\n\nstatic INLINE vint2 vand_vi2_vi2_vi2(vint2 x, vint2 y) {\n  vint2 vi = { _mm_and_si128(x.x, y.x), _mm_and_si128(x.y, y.y) };\n  return vi;\n}\n\nstatic INLINE vint2 vandnot_vi2_vi2_vi2(vint2 x, vint2 y) {\n  vint2 vi = { _mm_andnot_si128(x.x, y.x), _mm_andnot_si128(x.y, y.y) };\n  return vi;\n}\n\nstatic INLINE vint2 vor_vi2_vi2_vi2(vint2 x, vint2 y) {\n  vint2 vi = { _mm_or_si128(x.x, y.x), _mm_or_si128(x.y, y.y) };\n  return vi;\n}\n\nstatic INLINE vint2 vxor_vi2_vi2_vi2(vint2 x, vint2 y) {\n  vint2 vi = { _mm_xor_si128(x.x, y.x), _mm_xor_si128(x.y, y.y) };\n  return vi;\n}\n\nstatic INLINE vint2 vand_vi2_vo_vi2(vopmask x, vint2 y) { return vand_vi2_vi2_vi2(vcast_vi2_vm(x), y); }\nstatic INLINE vint2 vandnot_vi2_vo_vi2(vopmask x, vint2 y) { return vandnot_vi2_vi2_vi2(vcast_vi2_vm(x), y); }\n\nstatic INLINE vint2 vsll_vi2_vi2_i(vint2 x, int c) {\n  vint2 vi = { _mm_slli_epi32(x.x, c), _mm_slli_epi32(x.y, c) };\n  return vi;\n}\n\nstatic INLINE vint2 vsrl_vi2_vi2_i(vint2 x, int c) {\n  vint2 vi = { _mm_srli_epi32(x.x, c), _mm_srli_epi32(x.y, c) };\n  return vi;\n}\n\nstatic INLINE vint2 vsra_vi2_vi2_i(vint2 x, int c) {\n  vint2 vi = { _mm_srai_epi32(x.x, c), _mm_srai_epi32(x.y, c) };\n  return vi;\n}\n\nstatic INLINE vopmask veq_vo_vi2_vi2(vint2 x, vint2 y) {\n  vint2 r;\n  r.x = _mm_cmpeq_epi32(x.x, y.x);\n  r.y = _mm_cmpeq_epi32(x.y, y.y);\n  return vcast_vm_vi2(r);\n}\n\nstatic INLINE vopmask vgt_vo_vi2_vi2(vint2 x, vint2 y) {\n  vint2 r;\n  r.x = _mm_cmpgt_epi32(x.x, y.x);\n  r.y = _mm_cmpgt_epi32(x.y, y.y);\n  return vcast_vm_vi2(r);\n}\n\nstatic INLINE vint2 veq_vi2_vi2_vi2(vint2 x, vint2 y) {\n  vint2 r;\n  r.x = _mm_cmpeq_epi32(x.x, y.x);\n  r.y = _mm_cmpeq_epi32(x.y, y.y);\n  return r;\n}\n\nstatic INLINE vint2 vgt_vi2_vi2_vi2(vint2 x, vint2 y) {\n  vint2 r;\n  r.x = _mm_cmpgt_epi32(x.x, y.x);\n  r.y = _mm_cmpgt_epi32(x.y, y.y);\n  return 
r;\n}\n\nstatic INLINE vint2 vsel_vi2_vo_vi2_vi2(vopmask m, vint2 x, vint2 y) {\n  vint2 n = vcast_vi2_vm(m);\n  vint2 r = { _mm_blendv_epi8(y.x, x.x, n.x), _mm_blendv_epi8(y.y, x.y, n.y) };\n  return r;\n}\n\nstatic INLINE vmask vadd64_vm_vm_vm(vmask x, vmask y) {\n  vint2 ix = vcast_vi2_vm(x), iy = vcast_vi2_vm(y), iz;\n  iz.x = _mm_add_epi64(ix.x, iy.x);\n  iz.y = _mm_add_epi64(ix.y, iy.y);\n  return vcast_vm_vi2(iz);\n}\n\nstatic INLINE vfloat vsel_vf_vo_vf_vf(vopmask o, vfloat x, vfloat y) { return _mm256_blendv_ps(y, x, _mm256_castsi256_ps(o)); }\n\nstatic INLINE CONST vfloat vsel_vf_vo_f_f(vopmask o, float v1, float v0) {\n  return vsel_vf_vo_vf_vf(o, vcast_vf_f(v1), vcast_vf_f(v0));\n}\n\nstatic INLINE vfloat vsel_vf_vo_vo_f_f_f(vopmask o0, vopmask o1, float d0, float d1, float d2) {\n  return vsel_vf_vo_vf_vf(o0, vcast_vf_f(d0), vsel_vf_vo_f_f(o1, d1, d2));\n}\n\nstatic INLINE vfloat vsel_vf_vo_vo_vo_f_f_f_f(vopmask o0, vopmask o1, vopmask o2, float d0, float d1, float d2, float d3) {\n  return vsel_vf_vo_vf_vf(o0, vcast_vf_f(d0), vsel_vf_vo_vf_vf(o1, vcast_vf_f(d1), vsel_vf_vo_f_f(o2, d2, d3)));\n}\n\nstatic INLINE vopmask visinf_vo_vf(vfloat d) { return veq_vo_vf_vf(vabs_vf_vf(d), vcast_vf_f(SLEEF_INFINITYf)); }\nstatic INLINE vopmask vispinf_vo_vf(vfloat d) { return veq_vo_vf_vf(d, vcast_vf_f(SLEEF_INFINITYf)); }\nstatic INLINE vopmask visminf_vo_vf(vfloat d) { return veq_vo_vf_vf(d, vcast_vf_f(-SLEEF_INFINITYf)); }\nstatic INLINE vopmask visnan_vo_vf(vfloat d) { return vneq_vo_vf_vf(d, d); }\n\n//\n\nstatic INLINE vfloat vload_vf_p(const float *ptr) { return _mm256_load_ps(ptr); }\nstatic INLINE vfloat vloadu_vf_p(const float *ptr) { return _mm256_loadu_ps(ptr); }\n\nstatic INLINE void vstore_v_p_vf(float *ptr, vfloat v) { _mm256_store_ps(ptr, v); }\nstatic INLINE void vstoreu_v_p_vf(float *ptr, vfloat v) { _mm256_storeu_ps(ptr, v); }\n\nstatic INLINE vfloat vgather_vf_p_vi2(const float *ptr, vint2 vi2) {\n  int a[VECTLENSP];\n  vstoreu_v_p_vi2(a, 
vi2);\n  return _mm256_set_ps(ptr[a[7]], ptr[a[6]], ptr[a[5]], ptr[a[4]],\n\t\t       ptr[a[3]], ptr[a[2]], ptr[a[1]], ptr[a[0]]);\n}\n\n#ifdef _MSC_VER\n// This function is needed when debugging on MSVC.\nstatic INLINE float vcast_f_vf(vfloat v) {\n  float a[VECTLENSP];\n  vstoreu_v_p_vf(a, v);\n  return a[0];\n}\n#endif\n//\n\n#define PNMASK ((vdouble) { +0.0, -0.0, +0.0, -0.0 })\n#define NPMASK ((vdouble) { -0.0, +0.0, -0.0, +0.0 })\n#define PNMASKf ((vfloat) { +0.0f, -0.0f, +0.0f, -0.0f, +0.0f, -0.0f, +0.0f, -0.0f })\n#define NPMASKf ((vfloat) { -0.0f, +0.0f, -0.0f, +0.0f, -0.0f, +0.0f, -0.0f, +0.0f })\n\nstatic INLINE vdouble vposneg_vd_vd(vdouble d) { return vreinterpret_vd_vm(vxor_vm_vm_vm(vreinterpret_vm_vd(d), vreinterpret_vm_vd(PNMASK))); }\nstatic INLINE vdouble vnegpos_vd_vd(vdouble d) { return vreinterpret_vd_vm(vxor_vm_vm_vm(vreinterpret_vm_vd(d), vreinterpret_vm_vd(NPMASK))); }\nstatic INLINE vfloat vposneg_vf_vf(vfloat d) { return vreinterpret_vf_vm(vxor_vm_vm_vm(vreinterpret_vm_vf(d), vreinterpret_vm_vf(PNMASKf))); }\nstatic INLINE vfloat vnegpos_vf_vf(vfloat d) { return vreinterpret_vf_vm(vxor_vm_vm_vm(vreinterpret_vm_vf(d), vreinterpret_vm_vf(NPMASKf))); }\n\nstatic INLINE vdouble vsubadd_vd_vd_vd(vdouble x, vdouble y) { return _mm256_addsub_pd(x, y); }\nstatic INLINE vfloat vsubadd_vf_vf_vf(vfloat x, vfloat y) { return _mm256_addsub_ps(x, y); }\n\n#if CONFIG == 1\nstatic INLINE vdouble vmlsubadd_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return vsubadd_vd_vd_vd(vmul_vd_vd_vd(x, y), z); }\nstatic INLINE vfloat vmlsubadd_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return vsubadd_vf_vf_vf(vmul_vf_vf_vf(x, y), z); }\n#else\nstatic INLINE vdouble vmlsubadd_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return vmla_vd_vd_vd_vd(x, y, vnegpos_vd_vd(z)); }\nstatic INLINE vfloat vmlsubadd_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return vmla_vf_vf_vf_vf(x, y, vnegpos_vf_vf(z)); }\n#endif\n\n\nstatic INLINE vdouble vrev21_vd_vd(vdouble d0) { return  
_mm256_shuffle_pd(d0, d0, (0 << 3) | (1 << 2) | (0 << 1) | (1 << 0)); }\nstatic INLINE vdouble vreva2_vd_vd(vdouble d0) { d0 = _mm256_permute2f128_pd(d0, d0, 1); return _mm256_shuffle_pd(d0, d0, (1 << 3) | (0 << 2) | (1 << 1) | (0 << 0)); }\n\nstatic INLINE void vstream_v_p_vd(double *ptr, vdouble v) { _mm256_stream_pd(ptr, v); }\nstatic INLINE void vscatter2_v_p_i_i_vd(double *ptr, int offset, int step, vdouble v) {\n  _mm_store_pd(&ptr[(offset + step * 0)*2], _mm256_extractf128_pd(v, 0));\n  _mm_store_pd(&ptr[(offset + step * 1)*2], _mm256_extractf128_pd(v, 1));\n}\n\nstatic INLINE void vsscatter2_v_p_i_i_vd(double *ptr, int offset, int step, vdouble v) {\n  _mm_stream_pd(&ptr[(offset + step * 0)*2], _mm256_extractf128_pd(v, 0));\n  _mm_stream_pd(&ptr[(offset + step * 1)*2], _mm256_extractf128_pd(v, 1));\n}\n\n//\n\nstatic INLINE vfloat vrev21_vf_vf(vfloat d0) { return _mm256_shuffle_ps(d0, d0, (2 << 6) | (3 << 4) | (0 << 2) | (1 << 0)); }\nstatic INLINE vfloat vreva2_vf_vf(vfloat d0) { d0 = _mm256_permute2f128_ps(d0, d0, 1); return _mm256_shuffle_ps(d0, d0, (1 << 6) | (0 << 4) | (3 << 2) | (2 << 0)); }\nstatic INLINE vint2 vrev21_vi2_vi2(vint2 i) { return vreinterpret_vi2_vf(vrev21_vf_vf(vreinterpret_vf_vi2(i))); }\n\nstatic INLINE void vstream_v_p_vf(float *ptr, vfloat v) { _mm256_stream_ps(ptr, v); }\n\nstatic INLINE void vscatter2_v_p_i_i_vf(float *ptr, int offset, int step, vfloat v) {\n  _mm_storel_pd((double *)(ptr+(offset + step * 0)*2), _mm_castsi128_pd(_mm_castps_si128(_mm256_extractf128_ps(v, 0))));\n  _mm_storeh_pd((double *)(ptr+(offset + step * 1)*2), _mm_castsi128_pd(_mm_castps_si128(_mm256_extractf128_ps(v, 0))));\n  _mm_storel_pd((double *)(ptr+(offset + step * 2)*2), _mm_castsi128_pd(_mm_castps_si128(_mm256_extractf128_ps(v, 1))));\n  _mm_storeh_pd((double *)(ptr+(offset + step * 3)*2), _mm_castsi128_pd(_mm_castps_si128(_mm256_extractf128_ps(v, 1))));\n}\n\nstatic INLINE void vsscatter2_v_p_i_i_vf(float *ptr, int offset, int step, vfloat v) { 
vscatter2_v_p_i_i_vf(ptr, offset, step, v); }\n\n//\n\nstatic INLINE vmask2 vinterleave_vm2_vm2(vmask2 v) {\n  return (vmask2) {\n    vreinterpret_vm_vd(_mm256_unpacklo_pd(vreinterpret_vd_vm(v.x), vreinterpret_vd_vm(v.y))),\n      vreinterpret_vm_vd(_mm256_unpackhi_pd(vreinterpret_vd_vm(v.x), vreinterpret_vd_vm(v.y))) };\n}\n\nstatic INLINE vmask2 vuninterleave_vm2_vm2(vmask2 v) {\n  return (vmask2) {\n    vreinterpret_vm_vd(_mm256_unpacklo_pd(vreinterpret_vd_vm(v.x), vreinterpret_vd_vm(v.y))),\n      vreinterpret_vm_vd(_mm256_unpackhi_pd(vreinterpret_vd_vm(v.x), vreinterpret_vd_vm(v.y))) };\n}\n\nstatic INLINE vint vuninterleave_vi_vi(vint v) {\n  return _mm_shuffle_epi32(v, (0 << 0) | (2 << 2) | (1 << 4) | (3 << 6));\n}\n\nstatic INLINE vdouble vinterleave_vd_vd(vdouble vd) {\n  double tmp[4];\n  vstoreu_v_p_vd(tmp, vd);\n  double t = tmp[1]; tmp[1] = tmp[2]; tmp[2] = t;\n  return vloadu_vd_p(tmp);\n}\n\nstatic INLINE vdouble vuninterleave_vd_vd(vdouble vd) {\n  double tmp[4];\n  vstoreu_v_p_vd(tmp, vd);\n  double t = tmp[1]; tmp[1] = tmp[2]; tmp[2] = t;\n  return vloadu_vd_p(tmp);\n}\n\nstatic INLINE vmask vinterleave_vm_vm(vmask vm) {\n  double tmp[4];\n  vstoreu_v_p_vd(tmp, vreinterpret_vd_vm(vm));\n  double t = tmp[1]; tmp[1] = tmp[2]; tmp[2] = t;\n  return vreinterpret_vm_vd(vloadu_vd_p(tmp));\n}\n\nstatic INLINE vmask vuninterleave_vm_vm(vmask vm) {\n  double tmp[4];\n  vstoreu_v_p_vd(tmp, vreinterpret_vd_vm(vm));\n  double t = tmp[1]; tmp[1] = tmp[2]; tmp[2] = t;\n  return vreinterpret_vm_vd(vloadu_vd_p(tmp));\n}\n\nstatic vmask2 vloadu_vm2_p(void *p) {\n  vmask2 vm2;\n  memcpy(&vm2, p, VECTLENDP * 16);\n  return vm2;\n}\n\n#if !defined(SLEEF_GENHEADER)\ntypedef Sleef_quad4 vargquad;\n\nstatic INLINE vmask2 vcast_vm2_aq(vargquad aq) {\n  return vinterleave_vm2_vm2(vloadu_vm2_p(&aq));\n}\n\nstatic INLINE vargquad vcast_aq_vm2(vmask2 vm2) {\n  vm2 = vuninterleave_vm2_vm2(vm2);\n  vargquad aq;\n  memcpy(&aq, &vm2, VECTLENDP * 16);\n  return aq;\n}\n#endif // 
#if !defined(SLEEF_GENHEADER)\n\nstatic INLINE int vtestallzeros_i_vo64(vopmask g) {\n  return _mm_movemask_epi8(_mm_or_si128(_mm256_extractf128_si256(g, 0), _mm256_extractf128_si256(g, 1))) == 0;\n}\n\nstatic INLINE vmask vsel_vm_vo64_vm_vm(vopmask o, vmask x, vmask y) {\n  return vreinterpret_vm_vd(_mm256_blendv_pd(vreinterpret_vd_vm(y), vreinterpret_vd_vm(x), vreinterpret_vd_vm(o)));\n}\n\nstatic INLINE vmask vsub64_vm_vm_vm(vmask x, vmask y) {\n  __m128i xh = _mm256_extractf128_si256(x, 1), xl = _mm256_extractf128_si256(x, 0);\n  __m128i yh = _mm256_extractf128_si256(y, 1), yl = _mm256_extractf128_si256(y, 0);\n  vmask r = _mm256_castsi128_si256(_mm_sub_epi64(xl, yl));\n  return _mm256_insertf128_si256(r, _mm_sub_epi64(xh, yh), 1);\n}\n\nstatic INLINE vmask vneg64_vm_vm(vmask x) { return vsub64_vm_vm_vm(vcast_vm_i_i(0, 0), x); }\nstatic INLINE vopmask vgt64_vo_vm_vm(vmask x, vmask y) {\n  __m128i xh = _mm256_extractf128_si256(x, 1), xl = _mm256_extractf128_si256(x, 0);\n  __m128i yh = _mm256_extractf128_si256(y, 1), yl = _mm256_extractf128_si256(y, 0);\n  vmask r = _mm256_castsi128_si256(_mm_cmpgt_epi64(xl, yl));\n  return _mm256_insertf128_si256(r, _mm_cmpgt_epi64(xh, yh), 1);\n}\n\n#define vsll64_vm_vm_i(x, c) \\\n  _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_slli_epi64(_mm256_extractf128_si256(x, 0), c)), \\\n\t\t\t  _mm_slli_epi64(_mm256_extractf128_si256(x, 1), c), 1)\n#define vsrl64_vm_vm_i(x, c) \\\n  _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_srli_epi64(_mm256_extractf128_si256(x, 0), c)), \\\n\t\t\t  _mm_srli_epi64(_mm256_extractf128_si256(x, 1), c), 1)\n\n//@#define vsll64_vm_vm_i(x, c) _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_slli_epi64(_mm256_extractf128_si256(x, 0), c)), _mm_slli_epi64(_mm256_extractf128_si256(x, 1), c), 1)\n//@#define vsrl64_vm_vm_i(x, c) _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_srli_epi64(_mm256_extractf128_si256(x, 0), c)), _mm_srli_epi64(_mm256_extractf128_si256(x, 1), c), 1)\n\nstatic INLINE 
vmask vcast_vm_vi(vint vi) {\n  vint vi0 = _mm_and_si128(_mm_shuffle_epi32(vi, (1 << 4) | (1 << 6)), _mm_set_epi32(0, -1, 0, -1));\n  vint vi1 = _mm_and_si128(_mm_shuffle_epi32(vi, (2 << 0) | (2 << 2) | (3 << 4) | (3 << 6)), _mm_set_epi32(0, -1, 0, -1));\n  vmask m = _mm256_insertf128_si256(_mm256_castsi128_si256(vi0), vi1, 1);\n  return vor_vm_vm_vm(vcast_vm_vi2(vcastu_vi2_vi(vand_vi_vo_vi(vgt_vo_vi_vi(vcast_vi_i(0), vi), vcast_vi_i(-1)))), m);\n}\nstatic INLINE vint vcast_vi_vm(vmask vm) {\n  return _mm_or_si128(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(_mm256_castsi256_si128(vm)), _mm_set1_ps(0), 0x08)),\n  \t\t      _mm_castps_si128(_mm_shuffle_ps(_mm_set1_ps(0), _mm_castsi128_ps(_mm256_extractf128_si256(vm, 1)), 0x80)));\n}\n"
  },
  {
    "path": "src/helperavx2.h",
    "content": "//   Copyright Naoki Shibata and contributors 2010 - 2020.\n// Distributed under the Boost Software License, Version 1.0.\n//    (See accompanying file LICENSE.txt or copy at\n//          http://www.boost.org/LICENSE_1_0.txt)\n\n#if CONFIG == 1\n\n#if !defined(__AVX2__) && !defined(SLEEF_GENHEADER)\n#error Please specify -mavx2.\n#endif\n\n#else\n#error CONFIG macro invalid or not defined\n#endif\n\n#define ENABLE_DP\n//@#define ENABLE_DP\n#define LOG2VECTLENDP 2\n//@#define LOG2VECTLENDP 2\n#define VECTLENDP (1 << LOG2VECTLENDP)\n//@#define VECTLENDP (1 << LOG2VECTLENDP)\n#define ENABLE_FMA_DP\n//@#define ENABLE_FMA_DP\n\n#define ENABLE_SP\n//@#define ENABLE_SP\n#define LOG2VECTLENSP (LOG2VECTLENDP+1)\n//@#define LOG2VECTLENSP (LOG2VECTLENDP+1)\n#define VECTLENSP (1 << LOG2VECTLENSP)\n//@#define VECTLENSP (1 << LOG2VECTLENSP)\n#define ENABLE_FMA_SP\n//@#define ENABLE_FMA_SP\n\n#define FULL_FP_ROUNDING\n//@#define FULL_FP_ROUNDING\n#define ACCURATE_SQRT\n//@#define ACCURATE_SQRT\n\n#if !defined(SLEEF_GENHEADER)\n#if defined(_MSC_VER)\n#include <intrin.h>\n#else\n#include <x86intrin.h>\n#endif\n\n#include <stdint.h>\n#include \"misc.h\"\n#endif // #if !defined(SLEEF_GENHEADER)\n\ntypedef __m256i vmask;\ntypedef __m256i vopmask;\n\ntypedef __m256d vdouble;\ntypedef __m128i vint;\n\ntypedef __m256 vfloat;\ntypedef __m256i vint2;\n\ntypedef struct {\n  vmask x, y;\n} vmask2;\n\n//\n\n#if !defined(SLEEF_GENHEADER)\n\n#ifndef __SLEEF_H__\nstatic inline\n                       void Sleef_x86CpuID(int32_t out[4], uint32_t eax,\n                                           uint32_t ecx) {\n                         /* We don't care for cpuid detection */\n                         out[0] = 0xFFFFFFFF;\n                         out[1] = 0xFFFFFFFF;\n                         out[2] = 0xFFFFFFFF;\n                         out[3] = 0xFFFFFFFF;\n                       }\n                       #endif\n\nstatic INLINE int cpuSupportsAVX2() {\n    int32_t reg[4];\n    
Sleef_x86CpuID(reg, 7, 0);\n    return (reg[1] & (1 << 5)) != 0;\n}\n\nstatic INLINE int cpuSupportsFMA() {\n    int32_t reg[4];\n    Sleef_x86CpuID(reg, 1, 0);\n    return (reg[2] & (1 << 12)) != 0;\n}\n\n#if CONFIG == 1 && defined(__AVX2__)\nstatic INLINE int vavailability_i(int name) {\n  int d = cpuSupportsAVX2() && cpuSupportsFMA();\n  return d ? 3 : 0;\n}\n#define ISANAME \"AVX2\"\n#define DFTPRIORITY 25\n#endif\n\n#endif // #if !defined(SLEEF_GENHEADER)\n\nstatic INLINE void vprefetch_v_p(const void *ptr) { _mm_prefetch(ptr, _MM_HINT_T0); }\n\nstatic INLINE int vtestallones_i_vo32(vopmask g) {\n  return _mm_test_all_ones(_mm_and_si128(_mm256_extractf128_si256(g, 0), _mm256_extractf128_si256(g, 1)));\n}\n\nstatic INLINE int vtestallones_i_vo64(vopmask g) {\n  return _mm_test_all_ones(_mm_and_si128(_mm256_extractf128_si256(g, 0), _mm256_extractf128_si256(g, 1)));\n}\n\n//\n\nstatic INLINE vdouble vcast_vd_d(double d) { return _mm256_set1_pd(d); }\nstatic INLINE vmask vreinterpret_vm_vd(vdouble vd) { return _mm256_castpd_si256(vd); }\nstatic INLINE vdouble vreinterpret_vd_vm(vmask vm) { return _mm256_castsi256_pd(vm);  }\nstatic INLINE vint2 vreinterpret_vi2_vd(vdouble vd) { return _mm256_castpd_si256(vd); }\nstatic INLINE vdouble vreinterpret_vd_vi2(vint2 vi) { return _mm256_castsi256_pd(vi); }\n\n//\n\nstatic vint2 vloadu_vi2_p(int32_t *p) { return _mm256_loadu_si256((__m256i const *)p); }\nstatic void vstoreu_v_p_vi2(int32_t *p, vint2 v) { _mm256_storeu_si256((__m256i *)p, v); }\nstatic vint vloadu_vi_p(int32_t *p) { return _mm_loadu_si128((__m128i *)p); }\nstatic void vstoreu_v_p_vi(int32_t *p, vint v) { _mm_storeu_si128((__m128i *)p, v); }\n\n//\n\nstatic INLINE vmask vand_vm_vm_vm(vmask x, vmask y) { return vreinterpret_vm_vd(_mm256_and_pd(vreinterpret_vd_vm(x), vreinterpret_vd_vm(y))); }\nstatic INLINE vmask vandnot_vm_vm_vm(vmask x, vmask y) { return vreinterpret_vm_vd(_mm256_andnot_pd(vreinterpret_vd_vm(x), vreinterpret_vd_vm(y))); }\nstatic INLINE 
vmask vor_vm_vm_vm(vmask x, vmask y) { return vreinterpret_vm_vd(_mm256_or_pd(vreinterpret_vd_vm(x), vreinterpret_vd_vm(y))); }\nstatic INLINE vmask vxor_vm_vm_vm(vmask x, vmask y) { return vreinterpret_vm_vd(_mm256_xor_pd(vreinterpret_vd_vm(x), vreinterpret_vd_vm(y))); }\n\nstatic INLINE vopmask vand_vo_vo_vo(vopmask x, vopmask y) { return vreinterpret_vm_vd(_mm256_and_pd(vreinterpret_vd_vm(x), vreinterpret_vd_vm(y))); }\nstatic INLINE vopmask vandnot_vo_vo_vo(vopmask x, vopmask y) { return vreinterpret_vm_vd(_mm256_andnot_pd(vreinterpret_vd_vm(x), vreinterpret_vd_vm(y))); }\nstatic INLINE vopmask vor_vo_vo_vo(vopmask x, vopmask y) { return vreinterpret_vm_vd(_mm256_or_pd(vreinterpret_vd_vm(x), vreinterpret_vd_vm(y))); }\nstatic INLINE vopmask vxor_vo_vo_vo(vopmask x, vopmask y) { return vreinterpret_vm_vd(_mm256_xor_pd(vreinterpret_vd_vm(x), vreinterpret_vd_vm(y))); }\n\nstatic INLINE vmask vand_vm_vo64_vm(vopmask x, vmask y) { return vreinterpret_vm_vd(_mm256_and_pd(vreinterpret_vd_vm(x), vreinterpret_vd_vm(y))); }\nstatic INLINE vmask vandnot_vm_vo64_vm(vopmask x, vmask y) { return vreinterpret_vm_vd(_mm256_andnot_pd(vreinterpret_vd_vm(x), vreinterpret_vd_vm(y))); }\nstatic INLINE vmask vor_vm_vo64_vm(vopmask x, vmask y) { return vreinterpret_vm_vd(_mm256_or_pd(vreinterpret_vd_vm(x), vreinterpret_vd_vm(y))); }\nstatic INLINE vmask vxor_vm_vo64_vm(vopmask x, vmask y) { return vreinterpret_vm_vd(_mm256_xor_pd(vreinterpret_vd_vm(x), vreinterpret_vd_vm(y))); }\n\nstatic INLINE vmask vand_vm_vo32_vm(vopmask x, vmask y) { return vreinterpret_vm_vd(_mm256_and_pd(vreinterpret_vd_vm(x), vreinterpret_vd_vm(y))); }\nstatic INLINE vmask vandnot_vm_vo32_vm(vopmask x, vmask y) { return vreinterpret_vm_vd(_mm256_andnot_pd(vreinterpret_vd_vm(x), vreinterpret_vd_vm(y))); }\nstatic INLINE vmask vor_vm_vo32_vm(vopmask x, vmask y) { return vreinterpret_vm_vd(_mm256_or_pd(vreinterpret_vd_vm(x), vreinterpret_vd_vm(y))); }\nstatic INLINE vmask vxor_vm_vo32_vm(vopmask x, vmask y) { 
return vreinterpret_vm_vd(_mm256_xor_pd(vreinterpret_vd_vm(x), vreinterpret_vd_vm(y))); }\n\nstatic INLINE vopmask vcast_vo32_vo64(vopmask o) {\n  return _mm256_permutevar8x32_epi32(o, _mm256_set_epi32(0, 0, 0, 0, 6, 4, 2, 0));\n}\n\nstatic INLINE vopmask vcast_vo64_vo32(vopmask o) {\n  return _mm256_permutevar8x32_epi32(o, _mm256_set_epi32(3, 3, 2, 2, 1, 1, 0, 0));\n}\n\n//\n\nstatic INLINE vint vrint_vi_vd(vdouble vd) { return _mm256_cvtpd_epi32(vd); }\nstatic INLINE vint vtruncate_vi_vd(vdouble vd) { return _mm256_cvttpd_epi32(vd); }\nstatic INLINE vdouble vrint_vd_vd(vdouble vd) { return _mm256_round_pd(vd, _MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC); }\nstatic INLINE vfloat vrint_vf_vf(vfloat vd) { return _mm256_round_ps(vd, _MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC); }\nstatic INLINE vdouble vtruncate_vd_vd(vdouble vd) { return _mm256_round_pd(vd, _MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC); }\nstatic INLINE vfloat vtruncate_vf_vf(vfloat vf) { return _mm256_round_ps(vf, _MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC); }\nstatic INLINE vdouble vcast_vd_vi(vint vi) { return _mm256_cvtepi32_pd(vi); }\nstatic INLINE vint vcast_vi_i(int i) { return _mm_set1_epi32(i); }\n\nstatic INLINE vint2 vcastu_vi2_vi(vint vi) {\n  return _mm256_slli_epi64(_mm256_cvtepi32_epi64(vi), 32);\n}\n\nstatic INLINE vint vcastu_vi_vi2(vint2 vi) {\n  return _mm_or_si128(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(_mm256_castsi256_si128(vi)), _mm_set1_ps(0), 0x0d)),\n  \t\t      _mm_castps_si128(_mm_shuffle_ps(_mm_set1_ps(0), _mm_castsi128_ps(_mm256_extractf128_si256(vi, 1)), 0xd0)));\n}\n\nstatic INLINE vmask vcast_vm_i_i(int i0, int i1) {\n  return _mm256_set_epi32(i0, i1, i0, i1, i0, i1, i0, i1);\n}\n\nstatic INLINE vopmask veq64_vo_vm_vm(vmask x, vmask y) { return _mm256_cmpeq_epi64(x, y); }\nstatic INLINE vmask vadd64_vm_vm_vm(vmask x, vmask y) { return _mm256_add_epi64(x, y); }\n\n//\n\nstatic INLINE vdouble vadd_vd_vd_vd(vdouble x, vdouble y) { return _mm256_add_pd(x, y); }\nstatic 
INLINE vdouble vsub_vd_vd_vd(vdouble x, vdouble y) { return _mm256_sub_pd(x, y); }\nstatic INLINE vdouble vmul_vd_vd_vd(vdouble x, vdouble y) { return _mm256_mul_pd(x, y); }\nstatic INLINE vdouble vdiv_vd_vd_vd(vdouble x, vdouble y) { return _mm256_div_pd(x, y); }\nstatic INLINE vdouble vrec_vd_vd(vdouble x) { return _mm256_div_pd(_mm256_set1_pd(1), x); }\nstatic INLINE vdouble vsqrt_vd_vd(vdouble x) { return _mm256_sqrt_pd(x); }\nstatic INLINE vdouble vabs_vd_vd(vdouble d) { return _mm256_andnot_pd(_mm256_set1_pd(-0.0), d); }\nstatic INLINE vdouble vneg_vd_vd(vdouble d) { return _mm256_xor_pd(_mm256_set1_pd(-0.0), d); }\nstatic INLINE vdouble vmla_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return _mm256_fmadd_pd(x, y, z); }\nstatic INLINE vdouble vmlapn_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return _mm256_fmsub_pd(x, y, z); }\nstatic INLINE vdouble vmlanp_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return _mm256_fnmadd_pd(x, y, z); }\nstatic INLINE vdouble vmax_vd_vd_vd(vdouble x, vdouble y) { return _mm256_max_pd(x, y); }\nstatic INLINE vdouble vmin_vd_vd_vd(vdouble x, vdouble y) { return _mm256_min_pd(x, y); }\n\nstatic INLINE vdouble vfma_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return _mm256_fmadd_pd(x, y, z); }\nstatic INLINE vdouble vfmapp_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return _mm256_fmadd_pd(x, y, z); }\nstatic INLINE vdouble vfmapn_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return _mm256_fmsub_pd(x, y, z); }\nstatic INLINE vdouble vfmanp_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return _mm256_fnmadd_pd(x, y, z); }\nstatic INLINE vdouble vfmann_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return _mm256_fnmsub_pd(x, y, z); }\n\nstatic INLINE vopmask veq_vo_vd_vd(vdouble x, vdouble y) { return vreinterpret_vm_vd(_mm256_cmp_pd(x, y, _CMP_EQ_OQ)); }\nstatic INLINE vopmask vneq_vo_vd_vd(vdouble x, vdouble y) { return vreinterpret_vm_vd(_mm256_cmp_pd(x, y, _CMP_NEQ_UQ)); }\nstatic INLINE vopmask vlt_vo_vd_vd(vdouble 
x, vdouble y) { return vreinterpret_vm_vd(_mm256_cmp_pd(x, y, _CMP_LT_OQ)); }\nstatic INLINE vopmask vle_vo_vd_vd(vdouble x, vdouble y) { return vreinterpret_vm_vd(_mm256_cmp_pd(x, y, _CMP_LE_OQ)); }\nstatic INLINE vopmask vgt_vo_vd_vd(vdouble x, vdouble y) { return vreinterpret_vm_vd(_mm256_cmp_pd(x, y, _CMP_GT_OQ)); }\nstatic INLINE vopmask vge_vo_vd_vd(vdouble x, vdouble y) { return vreinterpret_vm_vd(_mm256_cmp_pd(x, y, _CMP_GE_OQ)); }\n\n//\n\nstatic INLINE vint vadd_vi_vi_vi(vint x, vint y) { return _mm_add_epi32(x, y); }\nstatic INLINE vint vsub_vi_vi_vi(vint x, vint y) { return _mm_sub_epi32(x, y); }\nstatic INLINE vint vneg_vi_vi(vint e) { return vsub_vi_vi_vi(vcast_vi_i(0), e); }\n\nstatic INLINE vint vand_vi_vi_vi(vint x, vint y) { return _mm_and_si128(x, y); }\nstatic INLINE vint vandnot_vi_vi_vi(vint x, vint y) { return _mm_andnot_si128(x, y); }\nstatic INLINE vint vor_vi_vi_vi(vint x, vint y) { return _mm_or_si128(x, y); }\nstatic INLINE vint vxor_vi_vi_vi(vint x, vint y) { return _mm_xor_si128(x, y); }\n\nstatic INLINE vint vandnot_vi_vo_vi(vopmask m, vint y) { return _mm_andnot_si128(_mm256_castsi256_si128(m), y); }\nstatic INLINE vint vand_vi_vo_vi(vopmask m, vint y) { return _mm_and_si128(_mm256_castsi256_si128(m), y); }\n\nstatic INLINE vint vsll_vi_vi_i(vint x, int c) { return _mm_slli_epi32(x, c); }\nstatic INLINE vint vsrl_vi_vi_i(vint x, int c) { return _mm_srli_epi32(x, c); }\nstatic INLINE vint vsra_vi_vi_i(vint x, int c) { return _mm_srai_epi32(x, c); }\n\nstatic INLINE vint veq_vi_vi_vi(vint x, vint y) { return _mm_cmpeq_epi32(x, y); }\nstatic INLINE vint vgt_vi_vi_vi(vint x, vint y) { return _mm_cmpgt_epi32(x, y); }\n\nstatic INLINE vopmask veq_vo_vi_vi(vint x, vint y) { return _mm256_castsi128_si256(_mm_cmpeq_epi32(x, y)); }\nstatic INLINE vopmask vgt_vo_vi_vi(vint x, vint y) { return _mm256_castsi128_si256(_mm_cmpgt_epi32(x, y)); }\n\nstatic INLINE vint vsel_vi_vo_vi_vi(vopmask m, vint x, vint y) { return _mm_blendv_epi8(y, x, 
_mm256_castsi256_si128(m)); }\n\nstatic INLINE vdouble vsel_vd_vo_vd_vd(vopmask o, vdouble x, vdouble y) { return _mm256_blendv_pd(y, x, _mm256_castsi256_pd(o)); }\nstatic INLINE vdouble vsel_vd_vo_d_d(vopmask o, double v1, double v0) { return _mm256_permutevar_pd(_mm256_set_pd(v1, v0, v1, v0), o); }\n\nstatic INLINE vdouble vsel_vd_vo_vo_vo_d_d_d_d(vopmask o0, vopmask o1, vopmask o2, double d0, double d1, double d2, double d3) {\n  __m256i v = _mm256_castpd_si256(vsel_vd_vo_vd_vd(o0, _mm256_castsi256_pd(_mm256_set_epi32(1, 0, 1, 0, 1, 0, 1, 0)),\n\t\t\t\t\t\t   vsel_vd_vo_vd_vd(o1, _mm256_castsi256_pd(_mm256_set_epi32(3, 2, 3, 2, 3, 2, 3, 2)),\n\t\t\t\t\t\t\t\t    vsel_vd_vo_vd_vd(o2, _mm256_castsi256_pd(_mm256_set_epi32(5, 4, 5, 4, 5, 4, 5, 4)),\n\t\t\t\t\t\t\t\t\t\t     _mm256_castsi256_pd(_mm256_set_epi32(7, 6, 7, 6, 7, 6, 7, 6))))));\n  return _mm256_castsi256_pd(_mm256_permutevar8x32_epi32(_mm256_castpd_si256(_mm256_set_pd(d3, d2, d1, d0)), v));\n}\n\nstatic INLINE vdouble vsel_vd_vo_vo_d_d_d(vopmask o0, vopmask o1, double d0, double d1, double d2) {\n  return vsel_vd_vo_vo_vo_d_d_d_d(o0, o1, o1, d0, d1, d2, d2);\n}\n\nstatic INLINE vopmask visinf_vo_vd(vdouble d) {\n  return vreinterpret_vm_vd(_mm256_cmp_pd(vabs_vd_vd(d), _mm256_set1_pd(SLEEF_INFINITY), _CMP_EQ_OQ));\n}\n\nstatic INLINE vopmask vispinf_vo_vd(vdouble d) {\n  return vreinterpret_vm_vd(_mm256_cmp_pd(d, _mm256_set1_pd(SLEEF_INFINITY), _CMP_EQ_OQ));\n}\n\nstatic INLINE vopmask visminf_vo_vd(vdouble d) {\n  return vreinterpret_vm_vd(_mm256_cmp_pd(d, _mm256_set1_pd(-SLEEF_INFINITY), _CMP_EQ_OQ));\n}\n\nstatic INLINE vopmask visnan_vo_vd(vdouble d) {\n  return vreinterpret_vm_vd(_mm256_cmp_pd(d, d, _CMP_NEQ_UQ));\n}\n\n#if defined(_MSC_VER)\n// This function is needed when debugging on MSVC.\nstatic INLINE double vcast_d_vd(vdouble v) {\n  double s[4];\n  _mm256_storeu_pd(s, v);\n  return s[0];\n}\n#endif\n\nstatic INLINE vdouble vload_vd_p(const double *ptr) { return _mm256_load_pd(ptr); }\nstatic 
INLINE vdouble vloadu_vd_p(const double *ptr) { return _mm256_loadu_pd(ptr); }\n\nstatic INLINE void vstore_v_p_vd(double *ptr, vdouble v) { _mm256_store_pd(ptr, v); }\nstatic INLINE void vstoreu_v_p_vd(double *ptr, vdouble v) { _mm256_storeu_pd(ptr, v); }\n\nstatic INLINE vdouble vgather_vd_p_vi(const double *ptr, vint vi) { return _mm256_i32gather_pd(ptr, vi, 8); }\n\n//\n\nstatic INLINE vint2 vcast_vi2_vm(vmask vm) { return vm; }\nstatic INLINE vmask vcast_vm_vi2(vint2 vi) { return vi; }\n\nstatic INLINE vint2 vrint_vi2_vf(vfloat vf) { return vcast_vi2_vm(_mm256_cvtps_epi32(vf)); }\nstatic INLINE vint2 vtruncate_vi2_vf(vfloat vf) { return vcast_vi2_vm(_mm256_cvttps_epi32(vf)); }\nstatic INLINE vfloat vcast_vf_vi2(vint2 vi) { return _mm256_cvtepi32_ps(vcast_vm_vi2(vi)); }\nstatic INLINE vfloat vcast_vf_f(float f) { return _mm256_set1_ps(f); }\nstatic INLINE vint2 vcast_vi2_i(int i) { return _mm256_set1_epi32(i); }\nstatic INLINE vmask vreinterpret_vm_vf(vfloat vf) { return _mm256_castps_si256(vf); }\nstatic INLINE vfloat vreinterpret_vf_vm(vmask vm) { return _mm256_castsi256_ps(vm); }\n\nstatic INLINE vfloat vreinterpret_vf_vi2(vint2 vi) { return vreinterpret_vf_vm(vcast_vm_vi2(vi)); }\nstatic INLINE vint2 vreinterpret_vi2_vf(vfloat vf) { return vcast_vi2_vm(vreinterpret_vm_vf(vf)); }\n\nstatic INLINE vfloat vadd_vf_vf_vf(vfloat x, vfloat y) { return _mm256_add_ps(x, y); }\nstatic INLINE vfloat vsub_vf_vf_vf(vfloat x, vfloat y) { return _mm256_sub_ps(x, y); }\nstatic INLINE vfloat vmul_vf_vf_vf(vfloat x, vfloat y) { return _mm256_mul_ps(x, y); }\nstatic INLINE vfloat vdiv_vf_vf_vf(vfloat x, vfloat y) { return _mm256_div_ps(x, y); }\nstatic INLINE vfloat vrec_vf_vf(vfloat x) { return vdiv_vf_vf_vf(vcast_vf_f(1.0f), x); }\nstatic INLINE vfloat vsqrt_vf_vf(vfloat x) { return _mm256_sqrt_ps(x); }\nstatic INLINE vfloat vabs_vf_vf(vfloat f) { return vreinterpret_vf_vm(vandnot_vm_vm_vm(vreinterpret_vm_vf(vcast_vf_f(-0.0f)), vreinterpret_vm_vf(f))); }\nstatic INLINE 
vfloat vneg_vf_vf(vfloat d) { return vreinterpret_vf_vm(vxor_vm_vm_vm(vreinterpret_vm_vf(vcast_vf_f(-0.0f)), vreinterpret_vm_vf(d))); }\nstatic INLINE vfloat vmla_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return _mm256_fmadd_ps(x, y, z); }\nstatic INLINE vfloat vmlapn_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return _mm256_fmsub_ps(x, y, z); }\nstatic INLINE vfloat vmlanp_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return _mm256_fnmadd_ps(x, y, z); }\nstatic INLINE vfloat vmax_vf_vf_vf(vfloat x, vfloat y) { return _mm256_max_ps(x, y); }\nstatic INLINE vfloat vmin_vf_vf_vf(vfloat x, vfloat y) { return _mm256_min_ps(x, y); }\n\nstatic INLINE vfloat vfma_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return _mm256_fmadd_ps(x, y, z); }\nstatic INLINE vfloat vfmapp_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return _mm256_fmadd_ps(x, y, z); }\nstatic INLINE vfloat vfmapn_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return _mm256_fmsub_ps(x, y, z); }\nstatic INLINE vfloat vfmanp_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return _mm256_fnmadd_ps(x, y, z); }\nstatic INLINE vfloat vfmann_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return _mm256_fnmsub_ps(x, y, z); }\n\nstatic INLINE vopmask veq_vo_vf_vf(vfloat x, vfloat y) { return vreinterpret_vm_vf(_mm256_cmp_ps(x, y, _CMP_EQ_OQ)); }\nstatic INLINE vopmask vneq_vo_vf_vf(vfloat x, vfloat y) { return vreinterpret_vm_vf(_mm256_cmp_ps(x, y, _CMP_NEQ_UQ)); }\nstatic INLINE vopmask vlt_vo_vf_vf(vfloat x, vfloat y) { return vreinterpret_vm_vf(_mm256_cmp_ps(x, y, _CMP_LT_OQ)); }\nstatic INLINE vopmask vle_vo_vf_vf(vfloat x, vfloat y) { return vreinterpret_vm_vf(_mm256_cmp_ps(x, y, _CMP_LE_OQ)); }\nstatic INLINE vopmask vgt_vo_vf_vf(vfloat x, vfloat y) { return vreinterpret_vm_vf(_mm256_cmp_ps(x, y, _CMP_GT_OQ)); }\nstatic INLINE vopmask vge_vo_vf_vf(vfloat x, vfloat y) { return vreinterpret_vm_vf(_mm256_cmp_ps(x, y, _CMP_GE_OQ)); }\n\nstatic INLINE vint2 vadd_vi2_vi2_vi2(vint2 x, vint2 y) { return _mm256_add_epi32(x, y); 
}\nstatic INLINE vint2 vsub_vi2_vi2_vi2(vint2 x, vint2 y) { return _mm256_sub_epi32(x, y); }\nstatic INLINE vint2 vneg_vi2_vi2(vint2 e) { return vsub_vi2_vi2_vi2(vcast_vi2_i(0), e); }\n\nstatic INLINE vint2 vand_vi2_vi2_vi2(vint2 x, vint2 y) { return _mm256_and_si256(x, y); }\nstatic INLINE vint2 vandnot_vi2_vi2_vi2(vint2 x, vint2 y) { return _mm256_andnot_si256(x, y); }\nstatic INLINE vint2 vor_vi2_vi2_vi2(vint2 x, vint2 y) { return _mm256_or_si256(x, y); }\nstatic INLINE vint2 vxor_vi2_vi2_vi2(vint2 x, vint2 y) { return _mm256_xor_si256(x, y); }\n\nstatic INLINE vint2 vand_vi2_vo_vi2(vopmask x, vint2 y) { return vand_vi2_vi2_vi2(vcast_vi2_vm(x), y); }\nstatic INLINE vint2 vandnot_vi2_vo_vi2(vopmask x, vint2 y) { return vandnot_vi2_vi2_vi2(vcast_vi2_vm(x), y); }\n\nstatic INLINE vint2 vsll_vi2_vi2_i(vint2 x, int c) { return _mm256_slli_epi32(x, c); }\nstatic INLINE vint2 vsrl_vi2_vi2_i(vint2 x, int c) { return _mm256_srli_epi32(x, c); }\nstatic INLINE vint2 vsra_vi2_vi2_i(vint2 x, int c) { return _mm256_srai_epi32(x, c); }\n\nstatic INLINE vopmask veq_vo_vi2_vi2(vint2 x, vint2 y) { return _mm256_cmpeq_epi32(x, y); }\nstatic INLINE vopmask vgt_vo_vi2_vi2(vint2 x, vint2 y) { return _mm256_cmpgt_epi32(x, y); }\nstatic INLINE vint2 veq_vi2_vi2_vi2(vint2 x, vint2 y) { return _mm256_cmpeq_epi32(x, y); }\nstatic INLINE vint2 vgt_vi2_vi2_vi2(vint2 x, vint2 y) { return _mm256_cmpgt_epi32(x, y); }\n\nstatic INLINE vint2 vsel_vi2_vo_vi2_vi2(vopmask m, vint2 x, vint2 y) {\n  return _mm256_blendv_epi8(y, x, m);\n}\n\nstatic INLINE vfloat vsel_vf_vo_vf_vf(vopmask o, vfloat x, vfloat y) { return _mm256_blendv_ps(y, x, _mm256_castsi256_ps(o)); }\n\n// At this point, the following three functions are implemented in a generic way,\n// but I will try target-specific optimization later on.\nstatic INLINE CONST vfloat vsel_vf_vo_f_f(vopmask o, float v1, float v0) {\n  return vsel_vf_vo_vf_vf(o, vcast_vf_f(v1), vcast_vf_f(v0));\n}\n\nstatic INLINE vfloat vsel_vf_vo_vo_f_f_f(vopmask o0, 
vopmask o1, float d0, float d1, float d2) {\n  return vsel_vf_vo_vf_vf(o0, vcast_vf_f(d0), vsel_vf_vo_f_f(o1, d1, d2));\n}\n\nstatic INLINE vfloat vsel_vf_vo_vo_vo_f_f_f_f(vopmask o0, vopmask o1, vopmask o2, float d0, float d1, float d2, float d3) {\n  return vsel_vf_vo_vf_vf(o0, vcast_vf_f(d0), vsel_vf_vo_vf_vf(o1, vcast_vf_f(d1), vsel_vf_vo_f_f(o2, d2, d3)));\n}\n\nstatic INLINE vopmask visinf_vo_vf(vfloat d) { return veq_vo_vf_vf(vabs_vf_vf(d), vcast_vf_f(SLEEF_INFINITYf)); }\nstatic INLINE vopmask vispinf_vo_vf(vfloat d) { return veq_vo_vf_vf(d, vcast_vf_f(SLEEF_INFINITYf)); }\nstatic INLINE vopmask visminf_vo_vf(vfloat d) { return veq_vo_vf_vf(d, vcast_vf_f(-SLEEF_INFINITYf)); }\nstatic INLINE vopmask visnan_vo_vf(vfloat d) { return vneq_vo_vf_vf(d, d); }\n\n#ifdef _MSC_VER\n// This function is needed when debugging on MSVC.\nstatic INLINE float vcast_f_vf(vfloat v) {\n  float s[8];\n  _mm256_storeu_ps(s, v);\n  return s[0];\n}\n#endif\n\nstatic INLINE vfloat vload_vf_p(const float *ptr) { return _mm256_load_ps(ptr); }\nstatic INLINE vfloat vloadu_vf_p(const float *ptr) { return _mm256_loadu_ps(ptr); }\n\nstatic INLINE void vstore_v_p_vf(float *ptr, vfloat v) { _mm256_store_ps(ptr, v); }\nstatic INLINE void vstoreu_v_p_vf(float *ptr, vfloat v) { _mm256_storeu_ps(ptr, v); }\n\nstatic INLINE vfloat vgather_vf_p_vi2(const float *ptr, vint2 vi2) { return _mm256_i32gather_ps(ptr, vi2, 4); }\n\n//\n\n#define PNMASK ((vdouble) { +0.0, -0.0, +0.0, -0.0 })\n#define NPMASK ((vdouble) { -0.0, +0.0, -0.0, +0.0 })\n#define PNMASKf ((vfloat) { +0.0f, -0.0f, +0.0f, -0.0f, +0.0f, -0.0f, +0.0f, -0.0f })\n#define NPMASKf ((vfloat) { -0.0f, +0.0f, -0.0f, +0.0f, -0.0f, +0.0f, -0.0f, +0.0f })\n\nstatic INLINE vdouble vposneg_vd_vd(vdouble d) { return vreinterpret_vd_vm(vxor_vm_vm_vm(vreinterpret_vm_vd(d), vreinterpret_vm_vd(PNMASK))); }\nstatic INLINE vdouble vnegpos_vd_vd(vdouble d) { return vreinterpret_vd_vm(vxor_vm_vm_vm(vreinterpret_vm_vd(d), vreinterpret_vm_vd(NPMASK))); 
}\nstatic INLINE vfloat vposneg_vf_vf(vfloat d) { return vreinterpret_vf_vm(vxor_vm_vm_vm(vreinterpret_vm_vf(d), vreinterpret_vm_vf(PNMASKf))); }\nstatic INLINE vfloat vnegpos_vf_vf(vfloat d) { return vreinterpret_vf_vm(vxor_vm_vm_vm(vreinterpret_vm_vf(d), vreinterpret_vm_vf(NPMASKf))); }\n\nstatic INLINE vdouble vsubadd_vd_vd_vd(vdouble x, vdouble y) { return _mm256_addsub_pd(x, y); }\nstatic INLINE vfloat vsubadd_vf_vf_vf(vfloat x, vfloat y) { return _mm256_addsub_ps(x, y); }\n\nstatic INLINE vdouble vmlsubadd_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return vmla_vd_vd_vd_vd(x, y, vnegpos_vd_vd(z)); }\nstatic INLINE vfloat vmlsubadd_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return vmla_vf_vf_vf_vf(x, y, vnegpos_vf_vf(z)); }\n\nstatic INLINE vdouble vrev21_vd_vd(vdouble d0) { return  _mm256_shuffle_pd(d0, d0, (0 << 3) | (1 << 2) | (0 << 1) | (1 << 0)); }\nstatic INLINE vdouble vreva2_vd_vd(vdouble d0) { d0 = _mm256_permute2f128_pd(d0, d0, 1); return _mm256_shuffle_pd(d0, d0, (1 << 3) | (0 << 2) | (1 << 1) | (0 << 0)); }\n\nstatic INLINE void vstream_v_p_vd(double *ptr, vdouble v) { _mm256_stream_pd(ptr, v); }\nstatic INLINE void vscatter2_v_p_i_i_vd(double *ptr, int offset, int step, vdouble v) {\n  _mm_store_pd(&ptr[(offset + step * 0)*2], _mm256_extractf128_pd(v, 0));\n  _mm_store_pd(&ptr[(offset + step * 1)*2], _mm256_extractf128_pd(v, 1));\n}\n\nstatic INLINE void vsscatter2_v_p_i_i_vd(double *ptr, int offset, int step, vdouble v) {\n  _mm_stream_pd(&ptr[(offset + step * 0)*2], _mm256_extractf128_pd(v, 0));\n  _mm_stream_pd(&ptr[(offset + step * 1)*2], _mm256_extractf128_pd(v, 1));\n}\n\n//\n\nstatic INLINE vfloat vrev21_vf_vf(vfloat d0) { return _mm256_shuffle_ps(d0, d0, (2 << 6) | (3 << 4) | (0 << 2) | (1 << 0)); }\nstatic INLINE vfloat vreva2_vf_vf(vfloat d0) { d0 = _mm256_permute2f128_ps(d0, d0, 1); return _mm256_shuffle_ps(d0, d0, (1 << 6) | (0 << 4) | (3 << 2) | (2 << 0)); }\nstatic INLINE vint2 vrev21_vi2_vi2(vint2 i) { return 
vreinterpret_vi2_vf(vrev21_vf_vf(vreinterpret_vf_vi2(i))); }\n\nstatic INLINE void vstream_v_p_vf(float *ptr, vfloat v) { _mm256_stream_ps(ptr, v); }\n\nstatic INLINE void vscatter2_v_p_i_i_vf(float *ptr, int offset, int step, vfloat v) {\n  _mm_storel_pd((double *)(ptr+(offset + step * 0)*2), _mm_castsi128_pd(_mm_castps_si128(_mm256_extractf128_ps(v, 0))));\n  _mm_storeh_pd((double *)(ptr+(offset + step * 1)*2), _mm_castsi128_pd(_mm_castps_si128(_mm256_extractf128_ps(v, 0))));\n  _mm_storel_pd((double *)(ptr+(offset + step * 2)*2), _mm_castsi128_pd(_mm_castps_si128(_mm256_extractf128_ps(v, 1))));\n  _mm_storeh_pd((double *)(ptr+(offset + step * 3)*2), _mm_castsi128_pd(_mm_castps_si128(_mm256_extractf128_ps(v, 1))));\n}\n\nstatic INLINE void vsscatter2_v_p_i_i_vf(float *ptr, int offset, int step, vfloat v) { vscatter2_v_p_i_i_vf(ptr, offset, step, v); }\n\n//\n\nstatic INLINE vmask2 vinterleave_vm2_vm2(vmask2 v) {\n  return (vmask2) { _mm256_unpacklo_epi64(v.x, v.y), _mm256_unpackhi_epi64(v.x, v.y) };\n}\n\nstatic INLINE vmask2 vuninterleave_vm2_vm2(vmask2 v) {\n  return (vmask2) { _mm256_unpacklo_epi64(v.x, v.y), _mm256_unpackhi_epi64(v.x, v.y) };\n}\n\nstatic INLINE vint vuninterleave_vi_vi(vint v) {\n  return _mm_shuffle_epi32(v, (0 << 0) | (2 << 2) | (1 << 4) | (3 << 6));\n}\n\nstatic INLINE vdouble vinterleave_vd_vd(vdouble vd) {\n  return vreinterpret_vd_vm(_mm256_permute4x64_epi64(vreinterpret_vm_vd(vd), (3 << 6) | (1 << 4) | (2 << 2) | (0 << 0)));\n}\n\nstatic INLINE vdouble vuninterleave_vd_vd(vdouble vd) {\n  return vreinterpret_vd_vm(_mm256_permute4x64_epi64(vreinterpret_vm_vd(vd), (3 << 6) | (1 << 4) | (2 << 2) | (0 << 0)));\n}\n\nstatic INLINE vmask vinterleave_vm_vm(vmask vm) {\n  return _mm256_permute4x64_epi64(vm, (3 << 6) | (1 << 4) | (2 << 2) | (0 << 0));\n}\n\nstatic INLINE vmask vuninterleave_vm_vm(vmask vm) {\n  return _mm256_permute4x64_epi64(vm, (3 << 6) | (1 << 4) | (2 << 2) | (0 << 0));\n}\n\nstatic vmask2 vloadu_vm2_p(void *p) {\n  vmask2 
vm2;\n  memcpy(&vm2, p, VECTLENDP * 16);\n  return vm2;\n}\n\n#if !defined(SLEEF_GENHEADER)\ntypedef Sleef_quad4 vargquad;\n\nstatic INLINE vmask2 vcast_vm2_aq(vargquad aq) {\n  return vinterleave_vm2_vm2(vloadu_vm2_p(&aq));\n}\n\nstatic INLINE vargquad vcast_aq_vm2(vmask2 vm2) {\n  vm2 = vuninterleave_vm2_vm2(vm2);\n  vargquad aq;\n  memcpy(&aq, &vm2, VECTLENDP * 16);\n  return aq;\n}\n#endif // #if !defined(SLEEF_GENHEADER)\n\nstatic INLINE int vtestallzeros_i_vo64(vopmask g) {\n  return _mm_movemask_epi8(_mm_or_si128(_mm256_extractf128_si256(g, 0), _mm256_extractf128_si256(g, 1))) == 0;\n}\n\nstatic INLINE vmask vsel_vm_vo64_vm_vm(vopmask o, vmask x, vmask y) { return _mm256_blendv_epi8(y, x, o); }\n\nstatic INLINE vmask vsub64_vm_vm_vm(vmask x, vmask y) { return _mm256_sub_epi64(x, y); }\nstatic INLINE vmask vneg64_vm_vm(vmask x) { return _mm256_sub_epi64(vcast_vm_i_i(0, 0), x); }\nstatic INLINE vopmask vgt64_vo_vm_vm(vmask x, vmask y) { return _mm256_cmpgt_epi64(x, y); } // signed compare\n\n#define vsll64_vm_vm_i(x, c) _mm256_slli_epi64(x, c)\n#define vsrl64_vm_vm_i(x, c) _mm256_srli_epi64(x, c)\n//@#define vsll64_vm_vm_i(x, c) _mm256_slli_epi64(x, c)\n//@#define vsrl64_vm_vm_i(x, c) _mm256_srli_epi64(x, c)\n\nstatic INLINE vmask vcast_vm_vi(vint vi) { return _mm256_cvtepi32_epi64(vi); }\nstatic INLINE vint vcast_vi_vm(vmask vm) {\n  return _mm_or_si128(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(_mm256_castsi256_si128(vm)), _mm_set1_ps(0), 0x08)),\n  \t\t      _mm_castps_si128(_mm_shuffle_ps(_mm_set1_ps(0), _mm_castsi128_ps(_mm256_extractf128_si256(vm, 1)), 0x80)));\n}\n"
  },
  {
    "path": "src/helperavx512f.h",
    "content": "//   Copyright Naoki Shibata and contributors 2010 - 2020.\n// Distributed under the Boost Software License, Version 1.0.\n//    (See accompanying file LICENSE.txt or copy at\n//          http://www.boost.org/LICENSE_1_0.txt)\n\n#if CONFIG == 1 || CONFIG == 2\n\n#if !defined(__AVX512F__) && !defined(SLEEF_GENHEADER)\n#error Please specify -mavx512f.\n#endif\n\n#else\n#error CONFIG macro invalid or not defined\n#endif\n\n#define ENABLE_DP\n//@#define ENABLE_DP\n#define LOG2VECTLENDP 3\n//@#define LOG2VECTLENDP 3\n#define VECTLENDP (1 << LOG2VECTLENDP)\n//@#define VECTLENDP (1 << LOG2VECTLENDP)\n\n#define ENABLE_SP\n//@#define ENABLE_SP\n#define LOG2VECTLENSP (LOG2VECTLENDP+1)\n//@#define LOG2VECTLENSP (LOG2VECTLENDP+1)\n#define VECTLENSP (1 << LOG2VECTLENSP)\n//@#define VECTLENSP (1 << LOG2VECTLENSP)\n\n#if CONFIG == 1\n#define ENABLE_FMA_DP\n//@#define ENABLE_FMA_DP\n#define ENABLE_FMA_SP\n//@#define ENABLE_FMA_SP\n#endif\n\n#define FULL_FP_ROUNDING\n//@#define FULL_FP_ROUNDING\n#define ACCURATE_SQRT\n//@#define ACCURATE_SQRT\n\n#if !defined(SLEEF_GENHEADER)\n#if defined(_MSC_VER)\n#include <intrin.h>\n#else\n#include <x86intrin.h>\n#endif\n\n#include <stdint.h>\n#include \"misc.h\"\n#endif // #if !defined(SLEEF_GENHEADER)\n\ntypedef __m512i vmask;\ntypedef __mmask16 vopmask;\n\ntypedef __m512d vdouble;\ntypedef __m256i vint;\n\ntypedef __m512 vfloat;\ntypedef __m512i vint2;\n\ntypedef struct {\n  vmask x, y;\n} vmask2;\n\n//\n\n#if !defined(SLEEF_GENHEADER)\n\n#ifndef __SLEEF_H__\nstatic inline\n                       void Sleef_x86CpuID(int32_t out[4], uint32_t eax,\n                                           uint32_t ecx) {\n                         /* We don't care for cpuid detection */\n                         out[0] = 0xFFFFFFFF;\n                         out[1] = 0xFFFFFFFF;\n                         out[2] = 0xFFFFFFFF;\n                         out[3] = 0xFFFFFFFF;\n                       }\n                       #endif\n\nstatic INLINE 
int cpuSupportsAVX512F() {\n    int32_t reg[4];\n    Sleef_x86CpuID(reg, 7, 0);\n    return (reg[1] & (1 << 16)) != 0;\n}\n\n#if CONFIG == 1 && defined(__AVX512F__)\nstatic INLINE int vavailability_i(int name) {\n  int d = cpuSupportsAVX512F();\n  return d ? 3 : 0;\n}\n#define ISANAME \"AVX512F\"\n#define DFTPRIORITY 30\n#endif\n\n#if CONFIG == 2 && defined(__AVX512F__)\nstatic INLINE int vavailability_i(int name) {\n  int d = cpuSupportsAVX512F();\n  return d ? 3 : 0;\n}\n#define ISANAME \"AVX512FNOFMA\"\n#define DFTPRIORITY 0\n#endif\n\n#endif // #if !defined(SLEEF_GENHEADER)\n\nstatic INLINE void vprefetch_v_p(const void *ptr) { _mm_prefetch(ptr, _MM_HINT_T0); }\n\n#ifdef __INTEL_COMPILER\nstatic INLINE int vtestallones_i_vo64(vopmask g) { return _mm512_mask2int(g) == 0xff; }\nstatic INLINE int vtestallones_i_vo32(vopmask g) { return _mm512_mask2int(g) == 0xffff; }\n#else\nstatic INLINE int vtestallones_i_vo64(vopmask g) { return g == 0xff; }\nstatic INLINE int vtestallones_i_vo32(vopmask g) { return g == 0xffff; }\n#endif\n\n//\n\nstatic vint2 vloadu_vi2_p(int32_t *p) { return _mm512_loadu_si512((__m512i const *)p); }\nstatic void vstoreu_v_p_vi2(int32_t *p, vint2 v) { _mm512_storeu_si512((__m512i *)p, v); }\nstatic vint vloadu_vi_p(int32_t *p) { return _mm256_loadu_si256((__m256i const *)p); }\nstatic void vstoreu_v_p_vi(int32_t *p, vint v) { _mm256_storeu_si256((__m256i *)p, v); }\n\n//\n\nstatic INLINE vmask vand_vm_vm_vm(vmask x, vmask y) { return _mm512_and_si512(x, y); }\nstatic INLINE vmask vandnot_vm_vm_vm(vmask x, vmask y) { return _mm512_andnot_si512(x, y); }\nstatic INLINE vmask vor_vm_vm_vm(vmask x, vmask y) { return _mm512_or_si512(x, y); }\nstatic INLINE vmask vxor_vm_vm_vm(vmask x, vmask y) { return _mm512_xor_si512(x, y); }\n\nstatic INLINE vopmask vand_vo_vo_vo(vopmask x, vopmask y) { return _mm512_kand(x, y); }\nstatic INLINE vopmask vandnot_vo_vo_vo(vopmask x, vopmask y) { return _mm512_kandn(x, y); }\nstatic INLINE vopmask 
vor_vo_vo_vo(vopmask x, vopmask y) { return _mm512_kor(x, y); }\nstatic INLINE vopmask vxor_vo_vo_vo(vopmask x, vopmask y) { return _mm512_kxor(x, y); }\n\nstatic INLINE vmask vand_vm_vo64_vm(vopmask o, vmask m) { return _mm512_mask_and_epi64(_mm512_set1_epi32(0), o, m, m); }\nstatic INLINE vmask vandnot_vm_vo64_vm(vopmask o, vmask m) { return _mm512_mask_and_epi64(m, o, _mm512_set1_epi32(0), _mm512_set1_epi32(0)); }\nstatic INLINE vmask vor_vm_vo64_vm(vopmask o, vmask m) { return _mm512_mask_or_epi64(m, o, _mm512_set1_epi32(-1), _mm512_set1_epi32(-1)); }\n\nstatic INLINE vmask vand_vm_vo32_vm(vopmask o, vmask m) { return _mm512_mask_and_epi32(_mm512_set1_epi32(0), o, m, m); }\nstatic INLINE vmask vandnot_vm_vo32_vm(vopmask o, vmask m) { return _mm512_mask_and_epi32(m, o, _mm512_set1_epi32(0), _mm512_set1_epi32(0)); }\nstatic INLINE vmask vor_vm_vo32_vm(vopmask o, vmask m) { return _mm512_mask_or_epi32(m, o, _mm512_set1_epi32(-1), _mm512_set1_epi32(-1)); }\n\nstatic INLINE vopmask vcast_vo32_vo64(vopmask o) { return o; }\nstatic INLINE vopmask vcast_vo64_vo32(vopmask o) { return o; }\n\n//\n\nstatic INLINE vint vrint_vi_vd(vdouble vd) {\n  return _mm512_cvt_roundpd_epi32(vd, _MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC);\n}\n\nstatic INLINE vint vtruncate_vi_vd(vdouble vd) {\n  return _mm512_cvt_roundpd_epi32(vd, _MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC);\n}\n\nstatic INLINE vdouble vcast_vd_vi(vint vi) { return _mm512_cvtepi32_pd(vi); }\nstatic INLINE vint vcast_vi_i(int i) { return _mm256_set1_epi32(i); }\n\nstatic INLINE vdouble vtruncate_vd_vd(vdouble vd) {\n  return _mm512_roundscale_pd(vd, _MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC);\n}\n\nstatic INLINE vdouble vrint_vd_vd(vdouble vd) {\n  return _mm512_roundscale_pd(vd, _MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC);\n}\n\nstatic INLINE vint2 vcastu_vi2_vi(vint vi) {\n  return _mm512_maskz_permutexvar_epi32(0xaaaa, _mm512_set_epi32(7, 7, 6, 6, 5, 5, 4, 4, 3, 3, 2, 2, 1, 1, 0, 0), 
_mm512_castsi256_si512(vi));\n}\n\nstatic INLINE vint vcastu_vi_vi2(vint2 vi) {\n  return _mm512_castsi512_si256(_mm512_maskz_permutexvar_epi32(0x00ff, _mm512_set_epi32(0, 0, 0, 0, 0, 0, 0, 0, 15, 13, 11, 9, 7, 5, 3, 1), vi));\n}\n\nstatic INLINE vmask vcast_vm_i_i(int i0, int i1) { return _mm512_set_epi32(i0, i1, i0, i1, i0, i1, i0, i1, i0, i1, i0, i1, i0, i1, i0, i1); }\n\nstatic INLINE vopmask veq64_vo_vm_vm(vmask x, vmask y) { return _mm512_cmp_epi64_mask(x, y, _MM_CMPINT_EQ); }\nstatic INLINE vmask vadd64_vm_vm_vm(vmask x, vmask y) { return _mm512_add_epi64(x, y); }\n\n//\n\nstatic INLINE vdouble vcast_vd_d(double d) { return _mm512_set1_pd(d); }\nstatic INLINE vmask vreinterpret_vm_vd(vdouble vd) { return _mm512_castpd_si512(vd); }\nstatic INLINE vdouble vreinterpret_vd_vm(vmask vm) { return _mm512_castsi512_pd(vm); }\nstatic INLINE vint2 vreinterpret_vi2_vd(vdouble vd) { return _mm512_castpd_si512(vd); }\nstatic INLINE vdouble vreinterpret_vd_vi2(vint2 vi) { return _mm512_castsi512_pd(vi); }\n\nstatic INLINE vdouble vadd_vd_vd_vd(vdouble x, vdouble y) { return _mm512_add_pd(x, y); }\nstatic INLINE vdouble vsub_vd_vd_vd(vdouble x, vdouble y) { return _mm512_sub_pd(x, y); }\nstatic INLINE vdouble vmul_vd_vd_vd(vdouble x, vdouble y) { return _mm512_mul_pd(x, y); }\nstatic INLINE vdouble vdiv_vd_vd_vd(vdouble x, vdouble y) { return _mm512_div_pd(x, y); }\nstatic INLINE vdouble vrec_vd_vd(vdouble x) { return _mm512_div_pd(_mm512_set1_pd(1), x); }\nstatic INLINE vdouble vsqrt_vd_vd(vdouble x) { return _mm512_sqrt_pd(x); }\nstatic INLINE vdouble vabs_vd_vd(vdouble d) { return vreinterpret_vd_vm(_mm512_andnot_si512(vreinterpret_vm_vd(_mm512_set1_pd(-0.0)), vreinterpret_vm_vd(d))); }\nstatic INLINE vdouble vneg_vd_vd(vdouble d) { return vreinterpret_vd_vm(_mm512_xor_si512(vreinterpret_vm_vd(_mm512_set1_pd(-0.0)), vreinterpret_vm_vd(d))); }\nstatic INLINE vdouble vmax_vd_vd_vd(vdouble x, vdouble y) { return _mm512_max_pd(x, y); }\nstatic INLINE vdouble 
vmin_vd_vd_vd(vdouble x, vdouble y) { return _mm512_min_pd(x, y); }\n\n#if CONFIG == 1\nstatic INLINE vdouble vmla_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return _mm512_fmadd_pd(x, y, z); }\nstatic INLINE vdouble vmlapn_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return _mm512_fmsub_pd(x, y, z); }\nstatic INLINE vdouble vmlanp_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return _mm512_fnmadd_pd(x, y, z); }\n#else\nstatic INLINE vdouble vmla_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return vadd_vd_vd_vd(vmul_vd_vd_vd(x, y), z); }\nstatic INLINE vdouble vmlapn_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return vsub_vd_vd_vd(vmul_vd_vd_vd(x, y), z); }\n#endif\n\nstatic INLINE vdouble vfma_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return _mm512_fmadd_pd(x, y, z); }\nstatic INLINE vdouble vfmapp_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return _mm512_fmadd_pd(x, y, z); }\nstatic INLINE vdouble vfmapn_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return _mm512_fmsub_pd(x, y, z); }\nstatic INLINE vdouble vfmanp_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return _mm512_fnmadd_pd(x, y, z); }\nstatic INLINE vdouble vfmann_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return _mm512_fnmsub_pd(x, y, z); }\n\nstatic INLINE vopmask veq_vo_vd_vd(vdouble x, vdouble y) { return _mm512_cmp_pd_mask(x, y, _CMP_EQ_OQ); }\nstatic INLINE vopmask vneq_vo_vd_vd(vdouble x, vdouble y) { return _mm512_cmp_pd_mask(x, y, _CMP_NEQ_UQ); }\nstatic INLINE vopmask vlt_vo_vd_vd(vdouble x, vdouble y) { return _mm512_cmp_pd_mask(x, y, _CMP_LT_OQ); }\nstatic INLINE vopmask vle_vo_vd_vd(vdouble x, vdouble y) { return _mm512_cmp_pd_mask(x, y, _CMP_LE_OQ); }\nstatic INLINE vopmask vgt_vo_vd_vd(vdouble x, vdouble y) { return _mm512_cmp_pd_mask(x, y, _CMP_GT_OQ); }\nstatic INLINE vopmask vge_vo_vd_vd(vdouble x, vdouble y) { return _mm512_cmp_pd_mask(x, y, _CMP_GE_OQ); }\n\n//\n\nstatic INLINE vint vadd_vi_vi_vi(vint x, vint y) { return _mm256_add_epi32(x, y); }\nstatic 
INLINE vint vsub_vi_vi_vi(vint x, vint y) { return _mm256_sub_epi32(x, y); }\nstatic INLINE vint vneg_vi_vi(vint e) { return vsub_vi_vi_vi(vcast_vi_i(0), e); }\n\nstatic INLINE vint vand_vi_vi_vi(vint x, vint y) { return _mm256_and_si256(x, y); }\nstatic INLINE vint vandnot_vi_vi_vi(vint x, vint y) { return _mm256_andnot_si256(x, y); }\n\nstatic INLINE vint vandnot_vi_vo_vi(vopmask o, vint y) {\n  return _mm512_castsi512_si256(_mm512_mask_and_epi32(_mm512_castsi256_si512(y), o, _mm512_set1_epi32(0), _mm512_set1_epi32(0)));\n}\nstatic INLINE vint vand_vi_vo_vi(vopmask o, vint y) {\n  return _mm512_castsi512_si256(_mm512_mask_and_epi32(_mm512_set1_epi32(0), o, _mm512_castsi256_si512(y), _mm512_castsi256_si512(y)));\n}\n\nstatic INLINE vint vor_vi_vi_vi(vint x, vint y) { return _mm256_or_si256(x, y); }\nstatic INLINE vint vxor_vi_vi_vi(vint x, vint y) { return _mm256_xor_si256(x, y); }\n#define vsll_vi_vi_i(x, c) _mm256_slli_epi32(x, c)\n#define vsrl_vi_vi_i(x, c) _mm256_srli_epi32(x, c)\n#define vsra_vi_vi_i(x, c) _mm256_srai_epi32(x, c)\n//@#define vsll_vi_vi_i(x, c) _mm256_slli_epi32(x, c)\n//@#define vsrl_vi_vi_i(x, c) _mm256_srli_epi32(x, c)\n//@#define vsra_vi_vi_i(x, c) _mm256_srai_epi32(x, c)\n\nstatic INLINE vint veq_vi_vi_vi(vint x, vint y) { return _mm256_cmpeq_epi32(x, y); }\nstatic INLINE vint vgt_vi_vi_vi(vint x, vint y) { return _mm256_cmpgt_epi32(x, y); }\n\nstatic INLINE vopmask veq_vo_vi_vi(vint x, vint y) {\n  return _mm512_cmp_epi32_mask(_mm512_castsi256_si512(x), _mm512_castsi256_si512(y), _MM_CMPINT_EQ);\n}\nstatic INLINE vopmask vgt_vo_vi_vi(vint x, vint y) {\n  return _mm512_cmp_epi32_mask(_mm512_castsi256_si512(y), _mm512_castsi256_si512(x), _MM_CMPINT_LT);\n}\n\nstatic INLINE vdouble vsel_vd_vo_vd_vd(vopmask mask, vdouble x, vdouble y) {\n  return _mm512_mask_blend_pd(mask, y, x);\n}\n\nstatic INLINE CONST vdouble vsel_vd_vo_d_d(vopmask o, double v1, double v0) {\n  return vsel_vd_vo_vd_vd(o, vcast_vd_d(v1), vcast_vd_d(v0));\n}\n\n#if 1\n// 
Probably this is faster\nstatic INLINE vdouble vsel_vd_vo_vo_vo_d_d_d_d(vopmask o0, vopmask o1, vopmask o2, double d0, double d1, double d2, double d3) {\n  __m512i v = _mm512_castpd_si512(vsel_vd_vo_vd_vd(o0, _mm512_castsi512_pd(_mm512_set_epi64(0, 0, 0, 0, 0, 0, 0, 0)),\n\t\t\t\t\t\t   vsel_vd_vo_vd_vd(o1, _mm512_castsi512_pd(_mm512_set_epi64(1, 1, 1, 1, 1, 1, 1, 1)),\n\t\t\t\t\t\t\t\t    vsel_vd_vo_vd_vd(o2, _mm512_castsi512_pd(_mm512_set_epi64(2, 2, 2, 2, 2, 2, 2, 2)),\n\t\t\t\t\t\t\t\t\t\t     _mm512_castsi512_pd(_mm512_set_epi64(3, 3, 3, 3, 3, 3, 3, 3))))));\n  return _mm512_permutexvar_pd(v, _mm512_castpd256_pd512(_mm256_set_pd(d3, d2, d1, d0)));\n}\n\nstatic INLINE vdouble vsel_vd_vo_vo_d_d_d(vopmask o0, vopmask o1, double d0, double d1, double d2) {\n  return vsel_vd_vo_vo_vo_d_d_d_d(o0, o1, o1, d0, d1, d2, d2);\n}\n#else\nstatic INLINE vdouble vsel_vd_vo_vo_d_d_d(vopmask o0, vopmask o1, double d0, double d1, double d2) {\n  return vsel_vd_vo_vd_vd(o0, vcast_vd_d(d0), vsel_vd_vo_d_d(o1, d1, d2));\n}\n\nstatic INLINE vdouble vsel_vd_vo_vo_vo_d_d_d_d(vopmask o0, vopmask o1, vopmask o2, double d0, double d1, double d2, double d3) {\n  return vsel_vd_vo_vd_vd(o0, vcast_vd_d(d0), vsel_vd_vo_vd_vd(o1, vcast_vd_d(d1), vsel_vd_vo_d_d(o2, d2, d3)));\n}\n#endif\n\nstatic INLINE vopmask visinf_vo_vd(vdouble d) {\n  return _mm512_cmp_pd_mask(vabs_vd_vd(d), _mm512_set1_pd(SLEEF_INFINITY), _CMP_EQ_OQ);\n}\n\nstatic INLINE vopmask vispinf_vo_vd(vdouble d) {\n  return _mm512_cmp_pd_mask(d, _mm512_set1_pd(SLEEF_INFINITY), _CMP_EQ_OQ);\n}\n\nstatic INLINE vopmask visminf_vo_vd(vdouble d) {\n  return _mm512_cmp_pd_mask(d, _mm512_set1_pd(-SLEEF_INFINITY), _CMP_EQ_OQ);\n}\n\nstatic INLINE vopmask visnan_vo_vd(vdouble d) {\n  return _mm512_cmp_pd_mask(d, d, _CMP_NEQ_UQ);\n}\n\nstatic INLINE vint vilogbk_vi_vd(vdouble d) { return vrint_vi_vd(_mm512_getexp_pd(d)); }\n\n// vilogb2k_vi_vd is similar to vilogbk_vi_vd, but the argument has to\n// be a normalized FP value.\nstatic 
INLINE vint vilogb2k_vi_vd(vdouble d) { return vrint_vi_vd(_mm512_getexp_pd(d)); }\n\nstatic INLINE vdouble vgetexp_vd_vd(vdouble d) { return _mm512_getexp_pd(d); }\nstatic INLINE vfloat vgetexp_vf_vf(vfloat d) { return _mm512_getexp_ps(d); }\n\nstatic INLINE vdouble vgetmant_vd_vd(vdouble d) { return _mm512_getmant_pd(d, _MM_MANT_NORM_p75_1p5, _MM_MANT_SIGN_nan); }\nstatic INLINE vfloat vgetmant_vf_vf(vfloat d) { return _mm512_getmant_ps(d, _MM_MANT_NORM_p75_1p5, _MM_MANT_SIGN_nan); }\n\n#define vfixup_vd_vd_vd_vi2_i(a, b, c, imm) _mm512_fixupimm_pd((a), (b), (c), (imm))\n#define vfixup_vf_vf_vf_vi2_i(a, b, c, imm) _mm512_fixupimm_ps((a), (b), (c), (imm))\n//@#define vfixup_vd_vd_vd_vi2_i(a, b, c, imm) _mm512_fixupimm_pd((a), (b), (c), (imm))\n//@#define vfixup_vf_vf_vf_vi2_i(a, b, c, imm) _mm512_fixupimm_ps((a), (b), (c), (imm))\n\n#if defined(_MSC_VER)\n// This function is needed when debugging on MSVC.\nstatic INLINE double vcast_d_vd(vdouble v) {\n  double s[VECTLENDP];\n  _mm512_storeu_pd(s, v);\n  return s[0];\n}\n#endif\n\nstatic INLINE vdouble vload_vd_p(const double *ptr) { return _mm512_load_pd(ptr); }\nstatic INLINE vdouble vloadu_vd_p(const double *ptr) { return _mm512_loadu_pd(ptr); }\n\nstatic INLINE void vstore_v_p_vd(double *ptr, vdouble v) { _mm512_store_pd(ptr, v); }\nstatic INLINE void vstoreu_v_p_vd(double *ptr, vdouble v) { _mm512_storeu_pd(ptr, v); }\n\nstatic INLINE vdouble vgather_vd_p_vi(const double *ptr, vint vi) { return _mm512_i32gather_pd(vi, ptr, 8); }\n\n//\n\nstatic INLINE vint vsel_vi_vo_vi_vi(vopmask m, vint x, vint y) {\n  return _mm512_castsi512_si256(_mm512_mask_blend_epi32(m, _mm512_castsi256_si512(y), _mm512_castsi256_si512(x)));\n}\n\n//\n\nstatic INLINE vmask vreinterpret_vm_vf(vfloat vf) { return _mm512_castps_si512(vf); }\nstatic INLINE vfloat vreinterpret_vf_vm(vmask vm) { return _mm512_castsi512_ps(vm); }\nstatic INLINE vfloat vreinterpret_vf_vi2(vint2 vi) { return _mm512_castsi512_ps(vi); }\nstatic INLINE vint2 
vreinterpret_vi2_vf(vfloat vf) { return _mm512_castps_si512(vf); }\n\nstatic INLINE vdouble vreinterpret_vd_vf(vfloat vf) { return _mm512_castps_pd(vf); }\nstatic INLINE vfloat vreinterpret_vf_vd(vdouble vd) { return _mm512_castpd_ps(vd); }\n\nstatic INLINE vint2 vcast_vi2_vm(vmask vm) { return vm; }\nstatic INLINE vmask vcast_vm_vi2(vint2 vi) { return vi; }\n \nstatic INLINE vfloat vcast_vf_vi2(vint2 vi) { return _mm512_cvtepi32_ps(vcast_vm_vi2(vi)); }\nstatic INLINE vfloat vcast_vf_f(float f) { return _mm512_set1_ps(f); }\nstatic INLINE vint2 vcast_vi2_i(int i) { return _mm512_set1_epi32(i); }\nstatic INLINE vint2 vrint_vi2_vf(vfloat vf) { return vcast_vi2_vm(_mm512_cvtps_epi32(vf)); }\nstatic INLINE vint2 vtruncate_vi2_vf(vfloat vf) { return vcast_vi2_vm(_mm512_cvttps_epi32(vf)); }\n\nstatic INLINE vfloat vtruncate_vf_vf(vfloat vd) {\n  return _mm512_roundscale_ps(vd, _MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC);\n}\n \nstatic INLINE vfloat vrint_vf_vf(vfloat vd) {\n  return _mm512_roundscale_ps(vd, _MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC);\n}\n\nstatic INLINE vfloat vadd_vf_vf_vf(vfloat x, vfloat y) { return _mm512_add_ps(x, y); }\nstatic INLINE vfloat vsub_vf_vf_vf(vfloat x, vfloat y) { return _mm512_sub_ps(x, y); }\nstatic INLINE vfloat vmul_vf_vf_vf(vfloat x, vfloat y) { return _mm512_mul_ps(x, y); }\nstatic INLINE vfloat vdiv_vf_vf_vf(vfloat x, vfloat y) { return _mm512_div_ps(x, y); }\nstatic INLINE vfloat vrec_vf_vf(vfloat x) { return vdiv_vf_vf_vf(vcast_vf_f(1.0f), x); }\nstatic INLINE vfloat vsqrt_vf_vf(vfloat x) { return _mm512_sqrt_ps(x); }\nstatic INLINE vfloat vabs_vf_vf(vfloat f) { return vreinterpret_vf_vm(vandnot_vm_vm_vm(vreinterpret_vm_vf(vcast_vf_f(-0.0f)), vreinterpret_vm_vf(f))); }\nstatic INLINE vfloat vneg_vf_vf(vfloat d) { return vreinterpret_vf_vm(vxor_vm_vm_vm(vreinterpret_vm_vf(vcast_vf_f(-0.0f)), vreinterpret_vm_vf(d))); }\nstatic INLINE vfloat vmax_vf_vf_vf(vfloat x, vfloat y) { return _mm512_max_ps(x, y); }\nstatic INLINE vfloat 
vmin_vf_vf_vf(vfloat x, vfloat y) { return _mm512_min_ps(x, y); }\n\n#if CONFIG == 1\nstatic INLINE vfloat vmla_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return _mm512_fmadd_ps(x, y, z); }\nstatic INLINE vfloat vmlapn_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return _mm512_fmsub_ps(x, y, z); }\nstatic INLINE vfloat vmlanp_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return _mm512_fnmadd_ps(x, y, z); }\n#else\nstatic INLINE vfloat vmla_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return vadd_vf_vf_vf(vmul_vf_vf_vf(x, y), z); }\nstatic INLINE vfloat vmlanp_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return vsub_vf_vf_vf(z, vmul_vf_vf_vf(x, y)); }\nstatic INLINE vfloat vmlapn_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return vsub_vf_vf_vf(vmul_vf_vf_vf(x, y), z); }\n#endif\n\nstatic INLINE vfloat vfma_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return _mm512_fmadd_ps(x, y, z); }\nstatic INLINE vfloat vfmapp_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return _mm512_fmadd_ps(x, y, z); }\nstatic INLINE vfloat vfmapn_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return _mm512_fmsub_ps(x, y, z); }\nstatic INLINE vfloat vfmanp_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return _mm512_fnmadd_ps(x, y, z); }\nstatic INLINE vfloat vfmann_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return _mm512_fnmsub_ps(x, y, z); }\n\nstatic INLINE vopmask veq_vo_vf_vf(vfloat x, vfloat y) { return _mm512_cmp_ps_mask(x, y, _CMP_EQ_OQ); }\nstatic INLINE vopmask vneq_vo_vf_vf(vfloat x, vfloat y) { return _mm512_cmp_ps_mask(x, y, _CMP_NEQ_UQ); }\nstatic INLINE vopmask vlt_vo_vf_vf(vfloat x, vfloat y) { return _mm512_cmp_ps_mask(x, y, _CMP_LT_OQ); }\nstatic INLINE vopmask vle_vo_vf_vf(vfloat x, vfloat y) { return _mm512_cmp_ps_mask(x, y, _CMP_LE_OQ); }\nstatic INLINE vopmask vgt_vo_vf_vf(vfloat x, vfloat y) { return _mm512_cmp_ps_mask(x, y, _CMP_GT_OQ); }\nstatic INLINE vopmask vge_vo_vf_vf(vfloat x, vfloat y) { return _mm512_cmp_ps_mask(x, y, _CMP_GE_OQ); }\n\nstatic INLINE vint2 
vadd_vi2_vi2_vi2(vint2 x, vint2 y) { return _mm512_add_epi32(x, y); }\nstatic INLINE vint2 vsub_vi2_vi2_vi2(vint2 x, vint2 y) { return _mm512_sub_epi32(x, y); }\nstatic INLINE vint2 vneg_vi2_vi2(vint2 e) { return vsub_vi2_vi2_vi2(vcast_vi2_i(0), e); }\nstatic INLINE vint2 vand_vi2_vi2_vi2(vint2 x, vint2 y) { return _mm512_and_si512(x, y); }\nstatic INLINE vint2 vandnot_vi2_vi2_vi2(vint2 x, vint2 y) { return _mm512_andnot_si512(x, y); }\nstatic INLINE vint2 vor_vi2_vi2_vi2(vint2 x, vint2 y) { return _mm512_or_si512(x, y); }\nstatic INLINE vint2 vxor_vi2_vi2_vi2(vint2 x, vint2 y) { return _mm512_xor_si512(x, y); }\n\nstatic INLINE vint2 vand_vi2_vo_vi2(vopmask o, vint2 m) {\n  return _mm512_mask_and_epi32(_mm512_set1_epi32(0), o, m, m);\n}\n\nstatic INLINE vint2 vandnot_vi2_vo_vi2(vopmask o, vint2 m) {\n  return _mm512_mask_and_epi32(m, o, _mm512_set1_epi32(0), _mm512_set1_epi32(0));\n}\n\n#define vsll_vi2_vi2_i(x, c) _mm512_slli_epi32(x, c)\n#define vsrl_vi2_vi2_i(x, c) _mm512_srli_epi32(x, c)\n#define vsra_vi2_vi2_i(x, c) _mm512_srai_epi32(x, c)\n//@#define vsll_vi2_vi2_i(x, c) _mm512_slli_epi32(x, c)\n//@#define vsrl_vi2_vi2_i(x, c) _mm512_srli_epi32(x, c)\n//@#define vsra_vi2_vi2_i(x, c) _mm512_srai_epi32(x, c)\nstatic INLINE vopmask veq_vo_vi2_vi2(vint2 x, vint2 y) { return _mm512_cmpeq_epi32_mask(x, y); }\nstatic INLINE vopmask vgt_vo_vi2_vi2(vint2 x, vint2 y) { return _mm512_cmpgt_epi32_mask(x, y); }\n\nstatic INLINE vint2 veq_vi2_vi2_vi2(vint2 x, vint2 y) {\n  __mmask16 m = _mm512_cmp_epi32_mask(x, y, _MM_CMPINT_EQ);\n  return _mm512_mask_and_epi32(_mm512_set1_epi32(0), m, _mm512_set1_epi32(-1), _mm512_set1_epi32(-1));\n}\nstatic INLINE vint2 vgt_vi2_vi2_vi2(vint2 x, vint2 y) {\n  __mmask16 m = _mm512_cmp_epi32_mask(y, x, _MM_CMPINT_LT);\n  return _mm512_mask_and_epi32(_mm512_set1_epi32(0), m, _mm512_set1_epi32(-1), _mm512_set1_epi32(-1));\n}\n\nstatic INLINE vint2 vsel_vi2_vo_vi2_vi2(vopmask m, vint2 x, vint2 y) {\n  return _mm512_mask_blend_epi32(m, y, 
x);\n}\n\nstatic INLINE vfloat vsel_vf_vo_vf_vf(vopmask m, vfloat x, vfloat y) {\n  return _mm512_mask_blend_ps(m, y, x);\n}\n\n// At this point, the following three functions are implemented in a generic way,\n// but I will try target-specific optimization later on.\nstatic INLINE CONST vfloat vsel_vf_vo_f_f(vopmask o, float v1, float v0) {\n  return vsel_vf_vo_vf_vf(o, vcast_vf_f(v1), vcast_vf_f(v0));\n}\n\nstatic INLINE vfloat vsel_vf_vo_vo_f_f_f(vopmask o0, vopmask o1, float d0, float d1, float d2) {\n  return vsel_vf_vo_vf_vf(o0, vcast_vf_f(d0), vsel_vf_vo_f_f(o1, d1, d2));\n}\n\nstatic INLINE vfloat vsel_vf_vo_vo_vo_f_f_f_f(vopmask o0, vopmask o1, vopmask o2, float d0, float d1, float d2, float d3) {\n  return vsel_vf_vo_vf_vf(o0, vcast_vf_f(d0), vsel_vf_vo_vf_vf(o1, vcast_vf_f(d1), vsel_vf_vo_f_f(o2, d2, d3)));\n}\n\nstatic INLINE vopmask visinf_vo_vf(vfloat d) { return veq_vo_vf_vf(vabs_vf_vf(d), vcast_vf_f(SLEEF_INFINITYf)); }\nstatic INLINE vopmask vispinf_vo_vf(vfloat d) { return veq_vo_vf_vf(d, vcast_vf_f(SLEEF_INFINITYf)); }\nstatic INLINE vopmask visminf_vo_vf(vfloat d) { return veq_vo_vf_vf(d, vcast_vf_f(-SLEEF_INFINITYf)); }\nstatic INLINE vopmask visnan_vo_vf(vfloat d) { return vneq_vo_vf_vf(d, d); }\n\nstatic INLINE vint2 vilogbk_vi2_vf(vfloat d) { return vrint_vi2_vf(_mm512_getexp_ps(d)); }\nstatic INLINE vint2 vilogb2k_vi2_vf(vfloat d) { return vrint_vi2_vf(_mm512_getexp_ps(d)); }\n\n#ifdef _MSC_VER\n// This function is needed when debugging on MSVC.\nstatic INLINE float vcast_f_vf(vfloat v) {\n  float s[VECTLENSP];\n  _mm512_storeu_ps(s, v);\n  return s[0];\n}\n#endif\n\nstatic INLINE vfloat vload_vf_p(const float *ptr) { return _mm512_load_ps(ptr); }\nstatic INLINE vfloat vloadu_vf_p(const float *ptr) { return _mm512_loadu_ps(ptr); }\n\nstatic INLINE void vstore_v_p_vf(float *ptr, vfloat v) { _mm512_store_ps(ptr, v); }\nstatic INLINE void vstoreu_v_p_vf(float *ptr, vfloat v) { _mm512_storeu_ps(ptr, v); }\n\nstatic INLINE vfloat 
vgather_vf_p_vi2(const float *ptr, vint2 vi2) { return _mm512_i32gather_ps(vi2, ptr, 4); }\n\n//\n\nstatic INLINE vdouble vposneg_vd_vd(vdouble d) {\n  return vreinterpret_vd_vm(_mm512_mask_xor_epi32(vreinterpret_vm_vd(d), 0xcccc, vreinterpret_vm_vd(d), vreinterpret_vm_vd(_mm512_set1_pd(-0.0))));\n}\nstatic INLINE vdouble vnegpos_vd_vd(vdouble d) {\n  return vreinterpret_vd_vm(_mm512_mask_xor_epi32(vreinterpret_vm_vd(d), 0x3333, vreinterpret_vm_vd(d), vreinterpret_vm_vd(_mm512_set1_pd(-0.0))));\n}\nstatic INLINE vfloat vposneg_vf_vf(vfloat d) {\n  return vreinterpret_vf_vm(_mm512_mask_xor_epi32(vreinterpret_vm_vf(d), 0xaaaa, vreinterpret_vm_vf(d), vreinterpret_vm_vf(_mm512_set1_ps(-0.0f))));\n}\nstatic INLINE vfloat vnegpos_vf_vf(vfloat d) {\n  return vreinterpret_vf_vm(_mm512_mask_xor_epi32(vreinterpret_vm_vf(d), 0x5555, vreinterpret_vm_vf(d), vreinterpret_vm_vf(_mm512_set1_ps(-0.0f))));\n}\n\nstatic INLINE vdouble vsubadd_vd_vd_vd(vdouble x, vdouble y) { return vadd_vd_vd_vd(x, vnegpos_vd_vd(y)); }\nstatic INLINE vfloat vsubadd_vf_vf_vf(vfloat x, vfloat y) { return vadd_vf_vf_vf(x, vnegpos_vf_vf(y)); }\n\nstatic INLINE vdouble vmlsubadd_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return _mm512_fmaddsub_pd(x, y, z); }\nstatic INLINE vfloat vmlsubadd_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return _mm512_fmaddsub_ps(x, y, z); }\n\nstatic INLINE vdouble vrev21_vd_vd(vdouble vd) { return _mm512_permute_pd(vd, 0x55); }\n\nstatic INLINE vdouble vreva2_vd_vd(vdouble vd) {\n  return vreinterpret_vd_vm(_mm512_permutexvar_epi32(_mm512_set_epi32(3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12), vreinterpret_vm_vd(vd)));\n}\n\nstatic INLINE void vstream_v_p_vd(double *ptr, vdouble v) { _mm512_stream_pd(ptr, v); }\n\nstatic INLINE void vscatter2_v_p_i_i_vd(double *ptr, int offset, int step, vdouble v) {\n  _mm_store_pd(&ptr[(offset + step * 0)*2], _mm_castps_pd(_mm512_extractf32x4_ps(vreinterpret_vf_vd(v), 0)));\n  _mm_store_pd(&ptr[(offset + step * 1)*2], 
_mm_castps_pd(_mm512_extractf32x4_ps(vreinterpret_vf_vd(v), 1)));\n  _mm_store_pd(&ptr[(offset + step * 2)*2], _mm_castps_pd(_mm512_extractf32x4_ps(vreinterpret_vf_vd(v), 2)));\n  _mm_store_pd(&ptr[(offset + step * 3)*2], _mm_castps_pd(_mm512_extractf32x4_ps(vreinterpret_vf_vd(v), 3)));\n}\n\nstatic INLINE void vsscatter2_v_p_i_i_vd(double *ptr, int offset, int step, vdouble v) {\n  _mm_stream_pd(&ptr[(offset + step * 0)*2], _mm_castps_pd(_mm512_extractf32x4_ps(vreinterpret_vf_vd(v), 0)));\n  _mm_stream_pd(&ptr[(offset + step * 1)*2], _mm_castps_pd(_mm512_extractf32x4_ps(vreinterpret_vf_vd(v), 1)));\n  _mm_stream_pd(&ptr[(offset + step * 2)*2], _mm_castps_pd(_mm512_extractf32x4_ps(vreinterpret_vf_vd(v), 2)));\n  _mm_stream_pd(&ptr[(offset + step * 3)*2], _mm_castps_pd(_mm512_extractf32x4_ps(vreinterpret_vf_vd(v), 3)));\n}\n\n//\n\nstatic INLINE vfloat vrev21_vf_vf(vfloat vf) { return _mm512_permute_ps(vf, 0xb1); }\nstatic INLINE vint2 vrev21_vi2_vi2(vint2 i) { return vreinterpret_vi2_vf(vrev21_vf_vf(vreinterpret_vf_vi2(i))); }\n\nstatic INLINE vfloat vreva2_vf_vf(vfloat vf) {\n  return vreinterpret_vf_vm(_mm512_permutexvar_epi32(_mm512_set_epi32(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14), vreinterpret_vm_vf(vf)));\n}\n\nstatic INLINE void vstream_v_p_vf(float *ptr, vfloat v) { _mm512_stream_ps(ptr, v); }\n\nstatic INLINE void vscatter2_v_p_i_i_vf(float *ptr, int offset, int step, vfloat v) {\n  _mm_storel_pd((double *)(ptr+(offset + step * 0)*2), _mm_castps_pd(_mm512_extractf32x4_ps(v, 0)));\n  _mm_storeh_pd((double *)(ptr+(offset + step * 1)*2), _mm_castps_pd(_mm512_extractf32x4_ps(v, 0)));\n  _mm_storel_pd((double *)(ptr+(offset + step * 2)*2), _mm_castps_pd(_mm512_extractf32x4_ps(v, 1)));\n  _mm_storeh_pd((double *)(ptr+(offset + step * 3)*2), _mm_castps_pd(_mm512_extractf32x4_ps(v, 1)));\n  _mm_storel_pd((double *)(ptr+(offset + step * 4)*2), _mm_castps_pd(_mm512_extractf32x4_ps(v, 2)));\n  _mm_storeh_pd((double *)(ptr+(offset + step * 5)*2), 
_mm_castps_pd(_mm512_extractf32x4_ps(v, 2)));\n  _mm_storel_pd((double *)(ptr+(offset + step * 6)*2), _mm_castps_pd(_mm512_extractf32x4_ps(v, 3)));\n  _mm_storeh_pd((double *)(ptr+(offset + step * 7)*2), _mm_castps_pd(_mm512_extractf32x4_ps(v, 3)));\n}\n\nstatic INLINE void vsscatter2_v_p_i_i_vf(float *ptr, int offset, int step, vfloat v) { vscatter2_v_p_i_i_vf(ptr, offset, step, v); }\n\n//\n\nstatic INLINE vmask2 vinterleave_vm2_vm2(vmask2 v) {\n  return (vmask2) { _mm512_unpacklo_epi64(v.x, v.y), _mm512_unpackhi_epi64(v.x, v.y) };\n}\n\nstatic INLINE vmask2 vuninterleave_vm2_vm2(vmask2 v) {\n  return (vmask2) { _mm512_unpacklo_epi64(v.x, v.y), _mm512_unpackhi_epi64(v.x, v.y) };\n}\n\nstatic INLINE vint vuninterleave_vi_vi(vint v) {\n  return _mm256_permutevar8x32_epi32(v, _mm256_set_epi32(7, 5, 3, 1, 6, 4, 2, 0));\n}\n\nstatic INLINE vdouble vinterleave_vd_vd(vdouble vd) {\n  return vreinterpret_vd_vm(_mm512_permutexvar_epi32(_mm512_set_epi32(15, 14, 7, 6, 13, 12, 5, 4, 11, 10, 3, 2, 9, 8, 1, 0), vreinterpret_vm_vd(vd)));\n}\n\nstatic INLINE vdouble vuninterleave_vd_vd(vdouble vd) {\n  return vreinterpret_vd_vm(_mm512_permutexvar_epi32(_mm512_set_epi32(15, 14, 11, 10, 7, 6, 3, 2, 13, 12, 9, 8, 5, 4, 1, 0), vreinterpret_vm_vd(vd)));\n}\n\nstatic INLINE vmask vinterleave_vm_vm(vmask vm) {\n  return _mm512_permutexvar_epi32(_mm512_set_epi32(15, 14, 7, 6, 13, 12, 5, 4, 11, 10, 3, 2, 9, 8, 1, 0), vm);\n}\n\nstatic INLINE vmask vuninterleave_vm_vm(vmask vm) {\n  return _mm512_permutexvar_epi32(_mm512_set_epi32(15, 14, 11, 10, 7, 6, 3, 2, 13, 12, 9, 8, 5, 4, 1, 0), vm);\n}\n\nstatic vmask2 vloadu_vm2_p(void *p) {\n  vmask2 vm2;\n  memcpy(&vm2, p, VECTLENDP * 16);\n  return vm2;\n}\n\n#if !defined(SLEEF_GENHEADER)\ntypedef Sleef_quad8 vargquad;\n\nstatic INLINE vmask2 vcast_vm2_aq(vargquad aq) {\n  return vinterleave_vm2_vm2(vloadu_vm2_p(&aq));\n}\n\nstatic INLINE vargquad vcast_aq_vm2(vmask2 vm2) {\n  vm2 = vuninterleave_vm2_vm2(vm2);\n  vargquad aq;\n  memcpy(&aq, 
&vm2, VECTLENDP * 16);\n  return aq;\n}\n#endif // #if !defined(SLEEF_GENHEADER)\n\n#ifdef __INTEL_COMPILER\nstatic INLINE int vtestallzeros_i_vo64(vopmask g) { return _mm512_mask2int(g) == 0; }\n#else\nstatic INLINE int vtestallzeros_i_vo64(vopmask g) { return g == 0; }\n#endif\n\nstatic INLINE vmask vsel_vm_vo64_vm_vm(vopmask m, vmask x, vmask y) { return _mm512_mask_blend_epi64(m, y, x); }\n\nstatic INLINE vmask vsub64_vm_vm_vm(vmask x, vmask y) { return _mm512_sub_epi64(x, y); }\nstatic INLINE vmask vneg64_vm_vm(vmask x) { return _mm512_sub_epi64(vcast_vm_i_i(0, 0), x); }\nstatic INLINE vopmask vgt64_vo_vm_vm(vmask x, vmask y) { return _mm512_cmp_epi64_mask(y, x, _MM_CMPINT_LT); } // signed compare\n\n#define vsll64_vm_vm_i(x, c) _mm512_slli_epi64(x, c)\n#define vsrl64_vm_vm_i(x, c) _mm512_srli_epi64(x, c)\n//@#define vsll64_vm_vm_i(x, c) _mm512_slli_epi64(x, c)\n//@#define vsrl64_vm_vm_i(x, c) _mm512_srli_epi64(x, c)\n\nstatic INLINE vmask vcast_vm_vi(vint vi) {\n  return _mm512_cvtepi32_epi64(vi);\n}\nstatic INLINE vint vcast_vi_vm(vmask vm) {\n  return _mm512_cvtepi64_epi32(vm);\n}\n"
  },
  {
    "path": "src/helperneon32.h",
    "content": "//   Copyright Naoki Shibata and contributors 2010 - 2020.\n// Distributed under the Boost Software License, Version 1.0.\n//    (See accompanying file LICENSE.txt or copy at\n//          http://www.boost.org/LICENSE_1_0.txt)\n\n#ifndef __ARM_NEON\n#error Please specify -mfpu=neon.\n#endif\n\n#ifdef __aarch64__\n#warning This implementation is for AARCH32.\n#endif\n\n#define ENABLE_SP\n//@#define ENABLE_SP\n#define LOG2VECTLENSP 2\n//@#define LOG2VECTLENSP 2\n#define VECTLENSP (1 << LOG2VECTLENSP)\n//@#define VECTLENSP (1 << LOG2VECTLENSP)\n\n#if CONFIG == 4\n#define ISANAME \"AARCH32 NEON-VFPV4\"\n#define ENABLE_FMA_SP\n//@#define ENABLE_FMA_SP\n#else\n#define ISANAME \"AARCH32 NEON\"\n#endif\n#define DFTPRIORITY 10\n\n#define ENABLE_RECSQRT_SP\n//@#define ENABLE_RECSQRT_SP\n\n#include <arm_neon.h>\n#include <stdint.h>\n\n#include \"misc.h\"\n\ntypedef uint32x4_t vmask;\ntypedef uint32x4_t vopmask;\n\n//typedef int32x4_t vint;\n\ntypedef float32x4_t vfloat;\ntypedef int32x4_t vint2;\n\n//\n\nstatic INLINE void vprefetch_v_p(const void *ptr) { }\n\nstatic INLINE int vtestallones_i_vo32(vopmask g) {\n  uint32x2_t x0 = vand_u32(vget_low_u32(g), vget_high_u32(g));\n  uint32x2_t x1 = vpmin_u32(x0, x0);\n  return vget_lane_u32(x1, 0);\n}\n\nstatic vfloat vloaduf(float *p) { return vld1q_f32(p); }\nstatic void vstoreuf(float *p, vfloat v) { vst1q_f32(p, v); }\n\nstatic vint2 vloadu_vi2_p(int32_t *p) { return vld1q_s32(p); }\nstatic void vstoreu_v_p_vi2(int32_t *p, vint2 v) { vst1q_s32(p, v); }\n\n//\n\nstatic INLINE vmask vand_vm_vm_vm(vmask x, vmask y) { return vandq_u32(x, y); }\nstatic INLINE vmask vandnot_vm_vm_vm(vmask x, vmask y) { return vbicq_u32(y, x); }\nstatic INLINE vmask vor_vm_vm_vm(vmask x, vmask y) { return vorrq_u32(x, y); }\nstatic INLINE vmask vxor_vm_vm_vm(vmask x, vmask y) { return veorq_u32(x, y); }\n\nstatic INLINE vopmask vand_vo_vo_vo(vopmask x, vopmask y) { return vandq_u32(x, y); }\nstatic INLINE vopmask vandnot_vo_vo_vo(vopmask 
x, vopmask y) { return vbicq_u32(y, x); }\nstatic INLINE vopmask vor_vo_vo_vo(vopmask x, vopmask y) { return vorrq_u32(x, y); }\nstatic INLINE vopmask vxor_vo_vo_vo(vopmask x, vopmask y) { return veorq_u32(x, y); }\n\nstatic INLINE vmask vand_vm_vo64_vm(vopmask x, vmask y) { return vandq_u32(x, y); }\nstatic INLINE vmask vandnot_vm_vo64_vm(vopmask x, vmask y) { return vbicq_u32(y, x); }\nstatic INLINE vmask vor_vm_vo64_vm(vopmask x, vmask y) { return vorrq_u32(x, y); }\nstatic INLINE vmask vxor_vm_vo64_vm(vopmask x, vmask y) { return veorq_u32(x, y); }\n\nstatic INLINE vmask vand_vm_vo32_vm(vopmask x, vmask y) { return vandq_u32(x, y); }\nstatic INLINE vmask vandnot_vm_vo32_vm(vopmask x, vmask y) { return vbicq_u32(y, x); }\nstatic INLINE vmask vor_vm_vo32_vm(vopmask x, vmask y) { return vorrq_u32(x, y); }\nstatic INLINE vmask vxor_vm_vo32_vm(vopmask x, vmask y) { return veorq_u32(x, y); }\n\nstatic INLINE vopmask vcast_vo32_vo64(vopmask m) { return vuzpq_u32(m, m).val[0]; }\nstatic INLINE vopmask vcast_vo64_vo32(vopmask m) { return vzipq_u32(m, m).val[0]; }\n\n//\n\nstatic INLINE vmask vcast_vm_i_i(int i0, int i1) { return (vmask)vdupq_n_u64((uint64_t)i0 | (((uint64_t)i1) << 32)); }\nstatic INLINE vopmask veq64_vo_vm_vm(vmask x, vmask y) {\n  uint32x4_t t = vceqq_u32(x, y);\n  return vandq_u32(t, vrev64q_u32(t));\n}\n\n//\n\nstatic INLINE vint2 vcast_vi2_vm(vmask vm) { return (vint2)vm; }\nstatic INLINE vmask vcast_vm_vi2(vint2 vi) { return (vmask)vi; }\nstatic INLINE vint2 vrint_vi2_vf(vfloat d) {\n  return vcvtq_s32_f32(vaddq_f32(d, (float32x4_t)vorrq_u32(vandq_u32((uint32x4_t)d, (uint32x4_t)vdupq_n_f32(-0.0f)), (uint32x4_t)vdupq_n_f32(0.5f))));\n}\nstatic INLINE vint2 vtruncate_vi2_vf(vfloat vf) { return vcvtq_s32_f32(vf); }\nstatic INLINE vfloat vcast_vf_vi2(vint2 vi) { return vcvtq_f32_s32(vi); }\n\nstatic INLINE vfloat vtruncate_vf_vf(vfloat vd) { return vcast_vf_vi2(vtruncate_vi2_vf(vd)); }\nstatic INLINE vfloat vrint_vf_vf(vfloat vd) { return 
vcast_vf_vi2(vrint_vi2_vf(vd)); }\n\nstatic INLINE vfloat vcast_vf_f(float f) { return vdupq_n_f32(f); }\nstatic INLINE vint2 vcast_vi2_i(int i) { return vdupq_n_s32(i); }\nstatic INLINE vmask vreinterpret_vm_vf(vfloat vf) { return (vmask)vf; }\nstatic INLINE vfloat vreinterpret_vf_vm(vmask vm) { return (vfloat)vm; }\nstatic INLINE vfloat vreinterpret_vf_vi2(vint2 vm) { return (vfloat)vm; }\nstatic INLINE vint2 vreinterpret_vi2_vf(vfloat vf) { return (vint2)vf; }\n\nstatic INLINE vfloat vadd_vf_vf_vf(vfloat x, vfloat y) { return vaddq_f32(x, y); }\nstatic INLINE vfloat vsub_vf_vf_vf(vfloat x, vfloat y) { return vsubq_f32(x, y); }\nstatic INLINE vfloat vmul_vf_vf_vf(vfloat x, vfloat y) { return vmulq_f32(x, y); }\n\nstatic INLINE vfloat vabs_vf_vf(vfloat f) { return vabsq_f32(f); }\nstatic INLINE vfloat vneg_vf_vf(vfloat f) { return vnegq_f32(f); }\n#if CONFIG == 4\nstatic INLINE vfloat vmla_vf_vf_vf_vf  (vfloat x, vfloat y, vfloat z) { return vfmaq_f32(z, x, y); }\nstatic INLINE vfloat vmlanp_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return vfmsq_f32(z, x, y); }\nstatic INLINE vfloat vfma_vf_vf_vf_vf  (vfloat x, vfloat y, vfloat z) { return vfmaq_f32(z, x, y); }\nstatic INLINE vfloat vfmanp_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return vfmsq_f32(z, x, y); }\nstatic INLINE vfloat vfmapn_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return vneg_vf_vf(vfmanp_vf_vf_vf_vf(x, y, z)); }\nstatic INLINE vfloat vmlapn_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return vneg_vf_vf(vfmanp_vf_vf_vf_vf(x, y, z)); }\n\nstatic INLINE vfloat vdiv_vf_vf_vf(vfloat x, vfloat y) {\n  float32x4_t t = vrecpeq_f32(y), u;\n  t = vmulq_f32(t, vrecpsq_f32(y, t));\n  t = vfmaq_f32(t, vfmsq_f32(vdupq_n_f32(1.0f), y, t), t);\n  u = vmulq_f32(x, t);\n  return vfmaq_f32(u, vfmsq_f32(x, y, u), t);\n}\n\nstatic INLINE vfloat vsqrt_vf_vf(vfloat d) {\n  float32x4_t x = vrsqrteq_f32(d);\n  x = vmulq_f32(x, vrsqrtsq_f32(d, vmulq_f32(x, x)));\n  x = vmulq_f32(x, vrsqrtsq_f32(d, vmulq_f32(x, x)));\n  
float32x4_t u = vmulq_f32(x, d);\n  u = vfmaq_f32(u, vfmsq_f32(d, u, u), vmulq_f32(x, vdupq_n_f32(0.5)));\n  return vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(u), vceqq_f32(d, vdupq_n_f32(0.0f))));\n}\n\nstatic INLINE vfloat vrec_vf_vf(vfloat y) {\n  float32x4_t t = vrecpeq_f32(y), u;\n  t = vmulq_f32(t, vrecpsq_f32(y, t));\n  t = vfmaq_f32(t, vfmsq_f32(vdupq_n_f32(1.0f), y, t), t);\n  return vfmaq_f32(t, vfmsq_f32(vdupq_n_f32(1.0f), y, t), t);\n}\n\nstatic INLINE vfloat vrecsqrt_vf_vf(vfloat d) {\n  float32x4_t x = vrsqrteq_f32(d);\n  x = vmulq_f32(x, vrsqrtsq_f32(d, vmulq_f32(x, x)));\n  return vfmaq_f32(x, vfmsq_f32(vdupq_n_f32(1), x, vmulq_f32(x, d)), vmulq_f32(x, vdupq_n_f32(0.5)));\n}\n#else // #if CONFIG == 4\nstatic INLINE vfloat vmla_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return vmlaq_f32(z, x, y); }\nstatic INLINE vfloat vmlanp_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return vmlsq_f32(z, x, y); }\nstatic INLINE vfloat vmlapn_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return vneg_vf_vf(vmlsq_f32(z, x, y)); }\n\nstatic INLINE vfloat vdiv_vf_vf_vf(vfloat n, vfloat d) {\n  float32x4_t x = vrecpeq_f32(d);\n  x = vmulq_f32(x, vrecpsq_f32(d, x));\n  float32x4_t t = vmulq_f32(n, x);\n  return vmlsq_f32(vaddq_f32(t, t), vmulq_f32(t, x), d);\n}\n\nstatic INLINE vfloat vsqrt_vf_vf(vfloat d) {\n  float32x4_t x = vrsqrteq_f32(d);\n  x = vmulq_f32(x, vrsqrtsq_f32(d, vmulq_f32(x, x)));\n  float32x4_t u = vmulq_f32(x, d);\n  u = vmlaq_f32(u, vmlsq_f32(d, u, u), vmulq_f32(x, vdupq_n_f32(0.5)));\n  return vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(u), vceqq_f32(d, vdupq_n_f32(0.0f))));\n}\n\nstatic INLINE vfloat vrec_vf_vf(vfloat d) {\n  float32x4_t x = vrecpeq_f32(d);\n  x = vmulq_f32(x, vrecpsq_f32(d, x));\n  return vmlsq_f32(vaddq_f32(x, x), vmulq_f32(x, x), d);\n}\n\nstatic INLINE vfloat vrecsqrt_vf_vf(vfloat d) {\n  float32x4_t x = vrsqrteq_f32(d);\n  x = vmulq_f32(x, vrsqrtsq_f32(d, vmulq_f32(x, x)));\n  return vmlaq_f32(x, 
vmlsq_f32(vdupq_n_f32(1), x, vmulq_f32(x, d)), vmulq_f32(x, vdupq_n_f32(0.5)));\n}\n#endif // #if CONFIG == 4\nstatic INLINE vfloat vmax_vf_vf_vf(vfloat x, vfloat y) { return vmaxq_f32(x, y); }\nstatic INLINE vfloat vmin_vf_vf_vf(vfloat x, vfloat y) { return vminq_f32(x, y); }\n\nstatic INLINE vopmask veq_vo_vf_vf(vfloat x, vfloat y) { return vceqq_f32(x, y); }\nstatic INLINE vopmask vneq_vo_vf_vf(vfloat x, vfloat y) { return vmvnq_u32(vceqq_f32(x, y)); }\nstatic INLINE vopmask vlt_vo_vf_vf(vfloat x, vfloat y) { return vcltq_f32(x, y); }\nstatic INLINE vopmask vle_vo_vf_vf(vfloat x, vfloat y) { return vcleq_f32(x, y); }\nstatic INLINE vopmask vgt_vo_vf_vf(vfloat x, vfloat y) { return vcgtq_f32(x, y); }\nstatic INLINE vopmask vge_vo_vf_vf(vfloat x, vfloat y) { return vcgeq_f32(x, y); }\n\nstatic INLINE vint2 vadd_vi2_vi2_vi2(vint2 x, vint2 y) { return vaddq_s32(x, y); }\nstatic INLINE vint2 vsub_vi2_vi2_vi2(vint2 x, vint2 y) { return vsubq_s32(x, y); }\nstatic INLINE vint2 vneg_vi2_vi2(vint2 e) { return vnegq_s32(e); }\n\nstatic INLINE vint2 vand_vi2_vi2_vi2(vint2 x, vint2 y) { return vandq_s32(x, y); }\nstatic INLINE vint2 vandnot_vi2_vi2_vi2(vint2 x, vint2 y) { return vbicq_s32(y, x); }\nstatic INLINE vint2 vor_vi2_vi2_vi2(vint2 x, vint2 y) { return vorrq_s32(x, y); }\nstatic INLINE vint2 vxor_vi2_vi2_vi2(vint2 x, vint2 y) { return veorq_s32(x, y); }\n\nstatic INLINE vint2 vand_vi2_vo_vi2(vopmask x, vint2 y) { return (vint2)vandq_u32(x, (vopmask)y); }\nstatic INLINE vint2 vandnot_vi2_vo_vi2(vopmask x, vint2 y) { return (vint2)vbicq_u32((vopmask)y, x); }\n\n#define vsll_vi2_vi2_i(x, c) vshlq_n_s32(x, c)\n#define vsrl_vi2_vi2_i(x, c) vreinterpretq_s32_u32(vshrq_n_u32(vreinterpretq_u32_s32(x), c))\n#define vsra_vi2_vi2_i(x, c) vshrq_n_s32(x, c)\n//@#define vsll_vi2_vi2_i(x, c) vshlq_n_s32(x, c)\n//@#define vsrl_vi2_vi2_i(x, c) vreinterpretq_s32_u32(vshrq_n_u32(vreinterpretq_u32_s32(x), c))\n//@#define vsra_vi2_vi2_i(x, c) vshrq_n_s32(x, c)\n\nstatic INLINE vopmask 
veq_vo_vi2_vi2(vint2 x, vint2 y) { return vceqq_s32(x, y); }\nstatic INLINE vopmask vgt_vo_vi2_vi2(vint2 x, vint2 y) { return vcgtq_s32(x, y); }\nstatic INLINE vint2 veq_vi2_vi2_vi2(vint2 x, vint2 y) { return (vint2)vceqq_s32(x, y); }\nstatic INLINE vint2 vgt_vi2_vi2_vi2(vint2 x, vint2 y) { return (vint2)vcgtq_s32(x, y); }\n\nstatic INLINE vint2 vsel_vi2_vo_vi2_vi2(vopmask m, vint2 x, vint2 y) { return (vint2)vbslq_u32(m, (vmask)x, (vmask)y); }\n\nstatic INLINE vfloat vsel_vf_vo_vf_vf(vopmask mask, vfloat x, vfloat y) {\n  return (vfloat)vbslq_u32(mask, (vmask)x, (vmask)y);\n}\n\nstatic INLINE CONST vfloat vsel_vf_vo_f_f(vopmask o, float v1, float v0) {\n  return vsel_vf_vo_vf_vf(o, vcast_vf_f(v1), vcast_vf_f(v0));\n}\n\nstatic INLINE vfloat vsel_vf_vo_vo_f_f_f(vopmask o0, vopmask o1, float d0, float d1, float d2) {\n  return vsel_vf_vo_vf_vf(o0, vcast_vf_f(d0), vsel_vf_vo_f_f(o1, d1, d2));\n}\n\nstatic INLINE vfloat vsel_vf_vo_vo_vo_f_f_f_f(vopmask o0, vopmask o1, vopmask o2, float d0, float d1, float d2, float d3) {\n  return vsel_vf_vo_vf_vf(o0, vcast_vf_f(d0), vsel_vf_vo_vf_vf(o1, vcast_vf_f(d1), vsel_vf_vo_f_f(o2, d2, d3)));\n}\n\nstatic INLINE vopmask visinf_vo_vf(vfloat d) { return veq_vo_vf_vf(vabs_vf_vf(d), vcast_vf_f(SLEEF_INFINITYf)); }\nstatic INLINE vopmask vispinf_vo_vf(vfloat d) { return veq_vo_vf_vf(d, vcast_vf_f(SLEEF_INFINITYf)); }\nstatic INLINE vopmask visminf_vo_vf(vfloat d) { return veq_vo_vf_vf(d, vcast_vf_f(-SLEEF_INFINITYf)); }\nstatic INLINE vopmask visnan_vo_vf(vfloat d) { return vneq_vo_vf_vf(d, d); }\n\n// This function is needed when debugging on MSVC.\nstatic INLINE float vcast_f_vf(vfloat v) {\n  float p[4];\n  vst1q_f32 (p, v);\n  return p[0];\n}\n\nstatic INLINE int vavailability_i(int name) {\n  if (name != 2) return 0;\n  return vcast_f_vf(vadd_vf_vf_vf(vcast_vf_f(name), vcast_vf_f(name))) != 0.0;\n}\n\n\nstatic INLINE vfloat vload_vf_p(const float *ptr) { return vld1q_f32(__builtin_assume_aligned(ptr, 16)); }\nstatic INLINE 
vfloat vloadu_vf_p(const float *ptr) { return vld1q_f32(ptr); }\n\nstatic INLINE void vstore_v_p_vf(float *ptr, vfloat v) { vst1q_f32(__builtin_assume_aligned(ptr, 16), v); }\nstatic INLINE void vstoreu_v_p_vf(float *ptr, vfloat v) { vst1q_f32(ptr, v); }\n\nstatic INLINE vfloat vgather_vf_p_vi2(const float *ptr, vint2 vi2) {\n  return ((vfloat) {\n      ptr[vgetq_lane_s32(vi2, 0)],\n      ptr[vgetq_lane_s32(vi2, 1)],\n      ptr[vgetq_lane_s32(vi2, 2)],\n      ptr[vgetq_lane_s32(vi2, 3)]\n    });\n}\n\n#define PNMASKf ((vfloat) { +0.0f, -0.0f, +0.0f, -0.0f })\n#define NPMASKf ((vfloat) { -0.0f, +0.0f, -0.0f, +0.0f })\n\nstatic INLINE vfloat vposneg_vf_vf(vfloat d) { return (vfloat)vxor_vm_vm_vm((vmask)d, (vmask)PNMASKf); }\nstatic INLINE vfloat vnegpos_vf_vf(vfloat d) { return (vfloat)vxor_vm_vm_vm((vmask)d, (vmask)NPMASKf); }\n\nstatic INLINE vfloat vsubadd_vf_vf_vf(vfloat d0, vfloat d1) { return vadd_vf_vf_vf(d0, vnegpos_vf_vf(d1)); }\nstatic INLINE vfloat vmlsubadd_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return vsubadd_vf_vf_vf(vmul_vf_vf_vf(x, y), z); }\n\nstatic INLINE vfloat vrev21_vf_vf(vfloat d0) { return vrev64q_f32(d0); }\nstatic INLINE vfloat vreva2_vf_vf(vfloat d0) { return vcombine_f32(vget_high_f32(d0), vget_low_f32(d0)); }\nstatic INLINE vint2 vrev21_vi2_vi2(vint2 i) { return vreinterpret_vi2_vf(vrev21_vf_vf(vreinterpret_vf_vi2(i))); }\n\nstatic INLINE void vstream_v_p_vf(float *ptr, vfloat v) { vstore_v_p_vf(ptr, v); }\n\nstatic INLINE void vscatter2_v_p_i_i_vf(float *ptr, int offset, int step, vfloat v) {\n  vst1_f32((float *)(ptr+(offset + step * 0)*2), vget_low_f32(v));\n  vst1_f32((float *)(ptr+(offset + step * 1)*2), vget_high_f32(v));\n}\n\nstatic INLINE void vsscatter2_v_p_i_i_vf(float *ptr, int offset, int step, vfloat v) {\n  vst1_f32((float *)(ptr+(offset + step * 0)*2), vget_low_f32(v));\n  vst1_f32((float *)(ptr+(offset + step * 1)*2), vget_high_f32(v));\n}\n"
  },
  {
    "path": "src/helperpower_128.h",
    "content": "//   Copyright Naoki Shibata and contributors 2010 - 2020.\n// Distributed under the Boost Software License, Version 1.0.\n//    (See accompanying file LICENSE.txt or copy at\n//          http://www.boost.org/LICENSE_1_0.txt)\n\n#if CONFIG == 1 || CONFIG == 2\n\n#ifndef __VSX__\n#error Please specify -mcpu=power8 or -mcpu=power9\n#endif\n\n#else\n#error CONFIG macro invalid or not defined\n#endif\n\n#define ENABLE_DP\n//@#define ENABLE_DP\n#define LOG2VECTLENDP 1\n//@#define LOG2VECTLENDP 1\n#define VECTLENDP (1 << LOG2VECTLENDP)\n//@#define VECTLENDP (1 << LOG2VECTLENDP)\n\n#define ENABLE_SP\n//@#define ENABLE_SP\n#define LOG2VECTLENSP (LOG2VECTLENDP+1)\n//@#define LOG2VECTLENSP (LOG2VECTLENDP+1)\n#define VECTLENSP (1 << LOG2VECTLENSP)\n//@#define VECTLENSP (1 << LOG2VECTLENSP)\n\n#if CONFIG == 1\n#define ENABLE_FMA_DP\n//@#define ENABLE_FMA_DP\n#define ENABLE_FMA_SP\n//@#define ENABLE_FMA_SP\n#endif\n\n#define ACCURATE_SQRT\n//@#define ACCURATE_SQRT\n#define FULL_FP_ROUNDING\n//@#define FULL_FP_ROUNDING\n\n#if !defined(SLEEF_GENHEADER)\n#include <altivec.h>\n// undef altivec types since CPP and C99 use them as compiler tokens\n// use __vector and __bool instead\n#undef vector\n#undef bool\n\n#include <stdint.h>\n#include \"misc.h\"\n#endif // #if !defined(SLEEF_GENHEADER)\n\n#define ISANAME \"VSX\"\n#define DFTPRIORITY 25\n\nstatic INLINE int vavailability_i(int name) { return 3; }\nstatic INLINE void vprefetch_v_p(const void *ptr) { }\n\n/**********************************************\n ** Types\n***********************************************/\ntypedef __vector unsigned int vmask;\n// using __bool with typedef may cause ambiguous errors\n#define vopmask __vector __bool int\n//@#define vopmask __vector __bool int\ntypedef __vector signed int vint;\ntypedef __vector signed int vint2;\ntypedef __vector float  vfloat;\ntypedef __vector double vdouble;\n\n// internal use types\ntypedef __vector unsigned int v__u32;\ntypedef __vector unsigned char 
v__u8;\ntypedef __vector signed long long  v__i64;\ntypedef __vector unsigned long long  v__u64;\n#define v__b64 __vector __bool long long\n\n/**********************************************\n ** Utilities\n***********************************************/\n#define vset__vi(v0, v1) ((vint) {v0, v1, v0, v1})\n#define vset__vi2(...) ((vint2) {__VA_ARGS__})\n#define vset__vm(...) ((vmask) {__VA_ARGS__})\n#define vset__vo(...) ((vopmask) {__VA_ARGS__})\n#define vset__vf(...) ((vfloat) {__VA_ARGS__})\n#define vset__vd(...) ((vdouble) {__VA_ARGS__})\n#define vset__u8(...) ((v__u8) {__VA_ARGS__})\n#define vset__u32(...) ((v__u32) {__VA_ARGS__})\n#define vset__s64(...) ((v__i64) {__VA_ARGS__})\n#define vset__u64(...) ((v__u64) {__VA_ARGS__})\n\n#define vsetall__vi(v)  vset__vi(v, v)\n#define vsetall__vi2(v) vset__vi2(v, v, v, v)\n#define vsetall__vm(v)  vset__vm(v, v, v, v)\n#define vsetall__vo(v)  vset__vo(v, v, v, v)\n#define vsetall__vf(v)  vset__vf(v, v, v, v)\n#define vsetall__vd(v)  vset__vd(v, v)\n#define vsetall__u8(v)  vset__u8(v, v, v, v, v, v, v, v, v, v, v, v, v, v, v, v)\n#define vsetall__u32(v) vset__u32(v, v, v, v)\n#define vsetall__s64(v) vset__s64(v, v)\n#define vsetall__u64(v) vset__u64(v, v)\n\n#define vzero__vi()  vsetall__vi(0)\n#define vzero__vi2() vsetall__vi2(0)\n#define vzero__vm()  vsetall__vm(0)\n#define vzero__vo()  vsetall__vo(0)\n#define vzero__vf()  vsetall__vf(0)\n#define vzero__vd()  vsetall__vd(0)\n#define vzero__u8()  vsetall__u8(0)\n#define vzero__u32() vsetall__u32(0)\n#define vzero__s64() vsetall__s64(0)\n#define vzero__u64() vsetall__u64(0)\n\n//// Swap doubleword elements\n#ifdef __clang__\n  static INLINE v__u64 v__swapd_u64(v__u64 v)\n  { return vec_xxpermdi(v, v, 2); }\n#else\n  static INLINE v__u64 v__swapd_u64(v__u64 v)\n  {\n    __asm__ __volatile__(\"xxswapd %x0,%x1\" : \"=wa\" (v) : \"wa\" (v));\n    return v;\n  }\n#endif\n\n/**********************************************\n ** 
Memory\n***********************************************/\n\n////////////// Unaligned memory access //////////////\n/**\n * It's not safe to use vector assignment via (cast & dereference) for unaligned memory access\n * with almost all clang versions and gcc8 when VSX3 isn't enabled,\n * these compilers tends to generate instructions 'lvx/stvx' instead of 'lxvd2x/lxvw4x/stxvd2x/stxvw4x'\n * for more information check https://github.com/seiko2plus/vsx_mem_test\n *\n * TODO: check GCC(9, 10)\n*/\n//// load\n#if defined(__POWER9_VECTOR__) || (!defined(__clang__) && defined(__GNUC__) && __GNUC__ < 8)\nstatic vint vloadu_vi_p(const int32_t *ptr)\n{ return *((vint*)ptr); }\nstatic INLINE vint2 vloadu_vi2_p(const int32_t *ptr)\n{ return *((vint2*)ptr); }\nstatic INLINE vfloat vloadu_vf_p(const float *ptr)\n{ return *((vfloat*)ptr); }\nstatic INLINE vdouble vloadu_vd_p(const double *ptr)\n{ return *((vdouble*)ptr); }\n#else\nstatic vint vloadu_vi_p(const int32_t *ptr)\n{ return vec_vsx_ld(0, ptr); }\nstatic INLINE vint2 vloadu_vi2_p(const int32_t *ptr)\n{ return vec_vsx_ld(0, ptr); }\nstatic INLINE vfloat vloadu_vf_p(const float *ptr)\n{ return vec_vsx_ld(0, ptr); }\nstatic INLINE vdouble vloadu_vd_p(const double *ptr)\n{ return vec_vsx_ld(0, ptr); }\n#endif\n\n//// store\n#if defined(__POWER9_VECTOR__) || (!defined(__clang__) && defined(__GNUC__) && __GNUC__ < 8)\nstatic void vstoreu_v_p_vi(int32_t *ptr, vint v)\n{ *((vint*)ptr) = v; }\nstatic void vstoreu_v_p_vi2(int32_t *ptr, vint2 v)\n{ *((vint2*)ptr) = v; }\nstatic INLINE void vstoreu_v_p_vf(float *ptr, vfloat v)\n{ *((vfloat*)ptr) = v; }\nstatic INLINE void vstoreu_v_p_vd(double *ptr, vdouble v)\n{ *((vdouble*)ptr) = v; }\n#else\nstatic void vstoreu_v_p_vi(int32_t *ptr, vint v)\n{ vec_vsx_st(v, 0, ptr); }\nstatic void vstoreu_v_p_vi2(int32_t *ptr, vint2 v)\n{ vec_vsx_st(v, 0, ptr); }\nstatic INLINE void vstoreu_v_p_vf(float *ptr, vfloat v)\n{ vec_vsx_st(v, 0, ptr); }\nstatic INLINE void vstoreu_v_p_vd(double *ptr, 
vdouble v)\n{ vec_vsx_st(v, 0, ptr); }\n#endif\n\n////////////// aligned memory access //////////////\n//// load\nstatic INLINE vfloat vload_vf_p(const float *ptr)\n{ return vec_ld(0, ptr); }\nstatic INLINE vdouble vload_vd_p(const double *ptr)\n{ return *((vdouble*)ptr); }\n\n//// store\nstatic INLINE void vstore_v_p_vf(float *ptr, vfloat v)\n{ vec_st(v, 0, ptr); }\nstatic INLINE void vstore_v_p_vd(double *ptr, vdouble v)\n{ *((vdouble*)ptr) = v; }\n\n////////////// non-temporal memory access //////////////\n//// store\nstatic INLINE void vstream_v_p_vf(float *ptr, vfloat v)\n{ vstore_v_p_vf(ptr, v); }\nstatic INLINE void vstream_v_p_vd(double *ptr, vdouble v)\n{ vstore_v_p_vd(ptr, v); }\n\n////////////// LUT //////////////\n//// load\nstatic INLINE vdouble vgather_vd_p_vi(const double *ptr, vint vi)\n{ return vset__vd(ptr[vec_extract(vi, 0)], ptr[vec_extract(vi, 1)]); }\n\nstatic INLINE vfloat vgather_vf_p_vi2(const float *ptr, vint2 vi2)\n{\n  return vset__vf(\n    ptr[vec_extract(vi2, 0)], ptr[vec_extract(vi2, 1)],\n    ptr[vec_extract(vi2, 2)], ptr[vec_extract(vi2, 3)]\n  );\n}\n\n//// store\nstatic INLINE void vscatter2_v_p_i_i_vf(float *ptr, int offset, int step, vfloat v)\n{\n  const v__u64 vll = (v__u64)v;\n  float *ptr_low = ptr + offset*2;\n  float *ptr_high = ptr + (offset + step)*2;\n  *((uint64_t*)ptr_low) = vec_extract(vll, 0);\n  *((uint64_t*)ptr_high) = vec_extract(vll, 1);\n}\n\nstatic INLINE void vsscatter2_v_p_i_i_vf(float *ptr, int offset, int step, vfloat v)\n{ vscatter2_v_p_i_i_vf(ptr, offset, step, v); }\n\nstatic INLINE void vscatter2_v_p_i_i_vd(double *ptr, int offset, int step, vdouble v)\n{ vstore_v_p_vd((double *)(&ptr[2*offset]), v); }\n\nstatic INLINE void vsscatter2_v_p_i_i_vd(double *ptr, int offset, int step, vdouble v)\n{ vscatter2_v_p_i_i_vd(ptr, offset, step, v); }\n\n/**********************************************\n ** Misc\n **********************************************/\n\n// vector with a specific value set to all lanes 
(Vector Splat)\nstatic INLINE vint vcast_vi_i(int i)\n{ return vsetall__vi(i); }\nstatic INLINE vint2 vcast_vi2_i(int i)\n{ return vsetall__vi2(i); }\nstatic INLINE vfloat vcast_vf_f(float f)\n{ return vsetall__vf(f); }\nstatic INLINE vdouble vcast_vd_d(double d)\n{ return vsetall__vd(d); }\n// cast\nstatic INLINE vint2 vcast_vi2_vm(vmask vm)\n{ return (vint2)vm; }\nstatic INLINE vmask vcast_vm_vi2(vint2 vi)\n{ return (vmask)vi; }\n// get the first element\nstatic INLINE float vcast_f_vf(vfloat v)\n{ return vec_extract(v, 0); }\nstatic INLINE double vcast_d_vd(vdouble v)\n{ return vec_extract(v, 0); }\n\nstatic INLINE vmask vreinterpret_vm_vd(vdouble vd)\n{ return (vmask)vd; }\nstatic INLINE vdouble vreinterpret_vd_vm(vmask vm)\n{ return (vdouble)vm; }\nstatic INLINE vint2 vreinterpret_vi2_vd(vdouble vd)\n{ return (vint2)vd; }\nstatic INLINE vdouble vreinterpret_vd_vi2(vint2 vi)\n{ return (vdouble)vi; }\n\nstatic INLINE vmask vreinterpret_vm_vf(vfloat vf)\n{ return (vmask)vf; }\nstatic INLINE vfloat vreinterpret_vf_vm(vmask vm)\n{ return (vfloat)vm; }\nstatic INLINE vfloat vreinterpret_vf_vi2(vint2 vi)\n{ return (vfloat)vi; }\nstatic INLINE vint2 vreinterpret_vi2_vf(vfloat vf)\n{ return (vint2)vf; }\n\n// per element select via mask (blend)\nstatic INLINE vdouble vsel_vd_vo_vd_vd(vopmask o, vdouble x, vdouble y)\n{ return vec_sel(y, x, (v__b64)o); }\nstatic INLINE vfloat vsel_vf_vo_vf_vf(vopmask o, vfloat x, vfloat y)\n{ return vec_sel(y, x, o); }\n\nstatic INLINE vint vsel_vi_vo_vi_vi(vopmask o, vint x, vint y)\n{ return vec_sel(y, x, o); }\n\nstatic INLINE vint2 vsel_vi2_vo_vi2_vi2(vopmask o, vint2 x, vint2 y)\n{ return vec_sel(y, x, o); }\n\nstatic INLINE vfloat vsel_vf_vo_f_f(vopmask o, float v1, float v0)\n{\n  return vsel_vf_vo_vf_vf(o, vsetall__vf(v1), vsetall__vf(v0));\n}\nstatic INLINE vfloat vsel_vf_vo_vo_f_f_f(vopmask o0, vopmask o1, float d0, float d1, float d2)\n{\n  return vsel_vf_vo_vf_vf(o0, vsetall__vf(d0), vsel_vf_vo_f_f(o1, d1, d2));\n}\nstatic 
INLINE vfloat vsel_vf_vo_vo_vo_f_f_f_f(vopmask o0, vopmask o1, vopmask o2, float d0, float d1, float d2, float d3)\n{\n  return vsel_vf_vo_vf_vf(o0, vsetall__vf(d0), vsel_vf_vo_vf_vf(o1, vsetall__vf(d1), vsel_vf_vo_f_f(o2, d2, d3)));\n}\n\nstatic INLINE vdouble vsel_vd_vo_d_d(vopmask o, double v1, double v0)\n{\n  return vsel_vd_vo_vd_vd(o, vsetall__vd(v1), vsetall__vd(v0));\n}\nstatic INLINE vdouble vsel_vd_vo_vo_d_d_d(vopmask o0, vopmask o1, double d0, double d1, double d2)\n{\n  return vsel_vd_vo_vd_vd(o0, vsetall__vd(d0), vsel_vd_vo_d_d(o1, d1, d2));\n}\nstatic INLINE vdouble vsel_vd_vo_vo_vo_d_d_d_d(vopmask o0, vopmask o1, vopmask o2, double d0, double d1, double d2, double d3)\n{\n  return vsel_vd_vo_vd_vd(o0, vsetall__vd(d0), vsel_vd_vo_vd_vd(o1, vsetall__vd(d1), vsel_vd_vo_d_d(o2, d2, d3)));\n}\n\nstatic INLINE int vtestallones_i_vo32(vopmask g)\n{ return vec_all_ne((vint2)g, vzero__vi2()); }\nstatic INLINE int vtestallones_i_vo64(vopmask g)\n{ return vec_all_ne((v__i64)g, vzero__s64()); }\n\n/**********************************************\n ** Conversions\n **********************************************/\n\n////////////// Numeric //////////////\n// pack 64-bit mask to 32-bit\nstatic INLINE vopmask vcast_vo32_vo64(vopmask m)\n{ return (vopmask)vec_pack((v__u64)m, (v__u64)m); }\n// clip 64-bit lanes to lower 32-bit\nstatic INLINE vint vcastu_vi_vi2(vint2 vi2)\n{ return vec_mergeo(vi2, vec_splat(vi2, 3)); }\n\n// expand lower 32-bit mask\nstatic INLINE vopmask vcast_vo64_vo32(vopmask m)\n{ return vec_mergeh(m, m); }\n// unsigned expand lower 32-bit integer\nstatic INLINE vint2 vcastu_vi2_vi(vint vi)\n{ return vec_mergeh(vzero__vi(), vi); }\n\n// signed int to single-precision\nstatic INLINE vfloat vcast_vf_vi2(vint2 vi)\n{\n  vfloat ret;\n#ifdef __clang__\n  ret = __builtin_convertvector(vi, vfloat);\n#else\n  __asm__ __volatile__(\"xvcvsxwsp %x0,%x1\" : \"=wa\" (ret) : \"wa\" (vi));\n#endif\n  return ret;\n}\n\n// lower signed int to double-precision\nstatic 
INLINE vdouble vcast_vd_vi(vint vi)\n{\n  vdouble ret;\n  vint swap = vec_mergeh(vi, vi);\n#ifdef __clang__\n  ret = __builtin_vsx_xvcvsxwdp(swap);\n#else\n  __asm__ __volatile__(\"xvcvsxwdp %x0,%x1\" : \"=wa\" (ret) : \"wa\" (swap));\n#endif\n  return ret;\n}\n\n// zip two scalars\nstatic INLINE vmask vcast_vm_i_i(int l, int h)\n{ return (vmask)vec_mergeh(vsetall__vi2(h), vsetall__vi2(l)); }\n\n////////////// Truncation //////////////\n\nstatic INLINE vint2 vtruncate_vi2_vf(vfloat vf)\n{\n  vint2 ret;\n#ifdef __clang__\n  ret = __builtin_convertvector(vf, vint2);\n#else\n  __asm__ __volatile__(\"xvcvspsxws %x0,%x1\" : \"=wa\" (ret) : \"wa\" (vf));\n#endif\n  return ret;\n}\n\nstatic INLINE vint vtruncate_vi_vd(vdouble vd)\n{\n  vint ret;\n#ifdef __clang__\n  ret = __builtin_vsx_xvcvdpsxws(vd);\n#else\n  __asm__ __volatile__(\"xvcvdpsxws %x0,%x1\" : \"=wa\" (ret) : \"wa\" (vd));\n#endif\n  return vec_mergeo(ret, vec_splat(ret, 3));\n}\n\nstatic INLINE vdouble vtruncate_vd_vd(vdouble vd)\n{ return vec_trunc(vd); }\nstatic INLINE vfloat vtruncate_vf_vf(vfloat vf)\n{ return vec_trunc(vf); }\n\n////////////// Rounding //////////////\n\n// towards the nearest even\nstatic INLINE vint vrint_vi_vd(vdouble vd)\n{ return vtruncate_vi_vd(vec_rint(vd)); }\nstatic INLINE vint2 vrint_vi2_vf(vfloat vf)\n{ return vtruncate_vi2_vf(vec_rint(vf)); }\nstatic INLINE vdouble vrint_vd_vd(vdouble vd)\n{ return vec_rint(vd); }\nstatic INLINE vfloat vrint_vf_vf(vfloat vf)\n{ return vec_rint(vf); }\n\n/**********************************************\n ** Logical\n **********************************************/\n\n////////////// And //////////////\nstatic INLINE vint vand_vi_vi_vi(vint x, vint y)\n{ return vec_and(x, y); }\nstatic INLINE vint vand_vi_vo_vi(vopmask x, vint y)\n{ return vec_and((vint)x, y); }\nstatic INLINE vint2 vand_vi2_vi2_vi2(vint2 x, vint2 y)\n{ return vec_and(x, y); }\nstatic INLINE vint2 vand_vi2_vo_vi2(vopmask x, vint2 y)\n{ return (vint2)vec_and((vint2)x, y); 
}\n\nstatic INLINE vmask vand_vm_vm_vm(vmask x, vmask y)\n{ return vec_and(x, y); }\nstatic INLINE vmask vand_vm_vo32_vm(vopmask x, vmask y)\n{ return vec_and((vmask)x, y); }\nstatic INLINE vmask vand_vm_vo64_vm(vopmask x, vmask y)\n{ return vec_and((vmask)x, y); }\nstatic INLINE vopmask vand_vo_vo_vo(vopmask x, vopmask y)\n{ return vec_and(x, y); }\n\n////////////// Or //////////////\nstatic INLINE vint vor_vi_vi_vi(vint x, vint y)\n{ return vec_or(x, y); }\nstatic INLINE vint2 vor_vi2_vi2_vi2(vint2 x, vint2 y)\n{ return vec_or(x, y); }\n\nstatic INLINE vmask vor_vm_vm_vm(vmask x, vmask y)\n{ return vec_or(x, y); }\nstatic INLINE vmask vor_vm_vo32_vm(vopmask x, vmask y)\n{ return vec_or((vmask)x, y); }\nstatic INLINE vmask vor_vm_vo64_vm(vopmask x, vmask y)\n{ return vec_or((vmask)x, y); }\nstatic INLINE vopmask vor_vo_vo_vo(vopmask x, vopmask y)\n{ return vec_or(x, y); }\n\n////////////// Xor //////////////\nstatic INLINE vint vxor_vi_vi_vi(vint x, vint y)\n{ return vec_xor(x, y); }\nstatic INLINE vint2 vxor_vi2_vi2_vi2(vint2 x, vint2 y)\n{ return vec_xor(x, y); }\n\nstatic INLINE vmask vxor_vm_vm_vm(vmask x, vmask y)\n{ return vec_xor(x, y); }\nstatic INLINE vmask vxor_vm_vo32_vm(vopmask x, vmask y)\n{ return vec_xor((vmask)x, y); }\nstatic INLINE vmask vxor_vm_vo64_vm(vopmask x, vmask y)\n{ return vec_xor((vmask)x, y); }\nstatic INLINE vopmask vxor_vo_vo_vo(vopmask x, vopmask y)\n{ return vec_xor(x, y); }\n\n////////////// Not //////////////\nstatic INLINE vopmask vnot_vo_vo(vopmask o)\n{ return vec_nor(o, o); }\n\n////////////// And Not ((~x) & y) //////////////\nstatic INLINE vint vandnot_vi_vi_vi(vint x, vint y)\n{ return vec_andc(y, x); }\nstatic INLINE vint vandnot_vi_vo_vi(vopmask x, vint y)\n{ return vec_andc(y, (vint)x); }\nstatic INLINE vint2 vandnot_vi2_vi2_vi2(vint2 x, vint2 y)\n{ return vec_andc(y, x); }\nstatic INLINE vmask vandnot_vm_vm_vm(vmask x, vmask y)\n{ return vec_andc(y, x); }\nstatic INLINE vmask vandnot_vm_vo64_vm(vopmask x, vmask y)\n{ 
return vec_andc(y, x); }\nstatic INLINE vmask vandnot_vm_vo32_vm(vopmask x, vmask y)\n{ return vec_andc(y, x); }\nstatic INLINE vopmask vandnot_vo_vo_vo(vopmask x, vopmask y)\n{ return vec_andc(y, x); }\nstatic INLINE vint2 vandnot_vi2_vo_vi2(vopmask x, vint2 y)\n{ return vec_andc(y, (vint2)x); }\n\n/**********************************************\n ** Comparison\n **********************************************/\n\n////////////// Equal //////////////\nstatic INLINE vint veq_vi_vi_vi(vint x, vint y)\n{ return (vint)vec_cmpeq(x, y); }\nstatic INLINE vopmask veq_vo_vi_vi(vint x, vint y)\n{ return vec_cmpeq(x, y); }\n\nstatic INLINE vopmask veq_vo_vi2_vi2(vint2 x, vint2 y)\n{ return vec_cmpeq(x, y); }\nstatic INLINE vint2 veq_vi2_vi2_vi2(vint2 x, vint2 y)\n{ return (vint2)vec_cmpeq(x, y); }\n\nstatic INLINE vopmask veq64_vo_vm_vm(vmask x, vmask y)\n{ return (vopmask)vec_cmpeq((v__u64)x, (v__u64)y); }\n\nstatic INLINE vopmask veq_vo_vf_vf(vfloat x, vfloat y)\n{ return vec_cmpeq(x, y); }\nstatic INLINE vopmask veq_vo_vd_vd(vdouble x, vdouble y)\n{ return (vopmask)vec_cmpeq(x, y); }\n\n////////////// Not Equal //////////////\nstatic INLINE vopmask vneq_vo_vf_vf(vfloat x, vfloat y)\n{ return vnot_vo_vo(vec_cmpeq(x, y)); }\nstatic INLINE vopmask vneq_vo_vd_vd(vdouble x, vdouble y)\n{ return vnot_vo_vo((vopmask)vec_cmpeq(x, y)); }\n\n////////////// Less Than //////////////\nstatic INLINE vopmask vlt_vo_vf_vf(vfloat x, vfloat y)\n{ return vec_cmplt(x, y); }\nstatic INLINE vopmask vlt_vo_vd_vd(vdouble x, vdouble y)\n{ return (vopmask)vec_cmplt(x, y); }\n\n////////////// Greater Than //////////////\nstatic INLINE vint vgt_vi_vi_vi(vint x, vint y)\n{ return (vint)vec_cmpgt(x, y); }\nstatic INLINE vopmask vgt_vo_vi_vi(vint x, vint y)\n{ return vec_cmpgt(x, y);}\n\nstatic INLINE vint2 vgt_vi2_vi2_vi2(vint2 x, vint2 y)\n{ return (vint2)vec_cmpgt(x, y); }\nstatic INLINE vopmask vgt_vo_vi2_vi2(vint2 x, vint2 y)\n{ return vec_cmpgt(x, y); }\n\nstatic INLINE vopmask vgt_vo_vf_vf(vfloat 
x, vfloat y)\n{ return vec_cmpgt(x, y); }\nstatic INLINE vopmask vgt_vo_vd_vd(vdouble x, vdouble y)\n{ return (vopmask)vec_cmpgt(x, y); }\n\n////////////// Less Than Or Equal //////////////\nstatic INLINE vopmask vle_vo_vf_vf(vfloat x, vfloat y)\n{ return vec_cmple(x, y); }\nstatic INLINE vopmask vle_vo_vd_vd(vdouble x, vdouble y)\n{ return (vopmask)vec_cmple(x, y); }\n\n////////////// Greater Than Or Equal //////////////\nstatic INLINE vopmask vge_vo_vf_vf(vfloat x, vfloat y)\n{ return vec_cmpge(x, y); }\nstatic INLINE vopmask vge_vo_vd_vd(vdouble x, vdouble y)\n{ return (vopmask)vec_cmpge(x, y); }\n\n////////////// Special Cases //////////////\nstatic INLINE vopmask visinf_vo_vf(vfloat d)\n{ return vec_cmpeq(vec_abs(d), vsetall__vf(SLEEF_INFINITYf)); }\nstatic INLINE vopmask visinf_vo_vd(vdouble d)\n{ return (vopmask)vec_cmpeq(vec_abs(d), vsetall__vd(SLEEF_INFINITY)); }\n\nstatic INLINE vopmask vispinf_vo_vf(vfloat d)\n{ return vec_cmpeq(d, vsetall__vf(SLEEF_INFINITYf)); }\nstatic INLINE vopmask vispinf_vo_vd(vdouble d)\n{ return (vopmask)vec_cmpeq(d, vsetall__vd(SLEEF_INFINITY)); }\n\nstatic INLINE vopmask visminf_vo_vf(vfloat d)\n{ return vec_cmpeq(d, vsetall__vf(-SLEEF_INFINITYf)); }\nstatic INLINE vopmask visminf_vo_vd(vdouble d)\n{ return (vopmask)vec_cmpeq(d, vsetall__vd(-SLEEF_INFINITY)); }\n\nstatic INLINE vopmask visnan_vo_vf(vfloat d)\n{ return vnot_vo_vo(vec_cmpeq(d, d)); }\nstatic INLINE vopmask visnan_vo_vd(vdouble d)\n{ return vnot_vo_vo((vopmask)vec_cmpeq(d, d)); }\n\n/**********************************************\n ** Shift\n **********************************************/\n////////////// Left //////////////\nstatic INLINE vint vsll_vi_vi_i(vint x, int c)\n{ return vec_sl (x, vsetall__u32(c)); }\nstatic INLINE vint2 vsll_vi2_vi2_i(vint2 x, int c)\n{ return vec_sl(x, vsetall__u32(c)); }\n\n////////////// Right //////////////\nstatic INLINE vint vsrl_vi_vi_i(vint x, int c)\n{ return vec_sr(x, vsetall__u32(c)); }\nstatic INLINE vint2 
vsrl_vi2_vi2_i(vint2 x, int c)\n{ return vec_sr(x, vsetall__u32(c)); }\n\n////////////// Algebraic Right //////////////\nstatic INLINE vint vsra_vi_vi_i(vint x, int c)\n{ return vec_sra(x, vsetall__u32(c)); }\nstatic INLINE vint2 vsra_vi2_vi2_i(vint2 x, int c)\n{ return vec_sra(x, vsetall__u32(c)); }\n\n/**********************************************\n ** Reorder\n **********************************************/\n\n////////////// Reverse //////////////\n// Reverse elements order inside the lower and higher parts\nstatic INLINE vint2 vrev21_vi2_vi2(vint2 vi)\n{ return vec_mergee(vec_mergeo(vi, vi), vi); }\nstatic INLINE vfloat vrev21_vf_vf(vfloat vf)\n{ return (vfloat)vrev21_vi2_vi2((vint2)vf); }\n\n// Swap the lower and higher parts\nstatic INLINE vfloat vreva2_vf_vf(vfloat vf)\n{ return (vfloat)v__swapd_u64((v__u64)vf); }\nstatic INLINE vdouble vrev21_vd_vd(vdouble vd)\n{ return (vdouble)v__swapd_u64((v__u64)vd); }\nstatic INLINE vdouble vreva2_vd_vd(vdouble vd)\n{ return vd; }\n\n/**********************************************\n ** Arithmetic\n **********************************************/\n\n////////////// Negation //////////////\nstatic INLINE vint vneg_vi_vi(vint e) {\n#ifdef __clang__\n  return vec_neg(e);\n#else\n  return vec_sub(vzero__vi(), e);\n#endif\n}\nstatic INLINE vint2 vneg_vi2_vi2(vint2 e)\n{ return vneg_vi_vi(e); }\n\nstatic INLINE vfloat vneg_vf_vf(vfloat d)\n{\n  vfloat ret;\n#ifdef __clang__\n  ret = vec_neg(d);\n#else\n  __asm__ __volatile__(\"xvnegsp %x0,%x1\" : \"=wa\" (ret) : \"wa\" (d));\n#endif\n  return ret;\n}\n\nstatic INLINE vdouble vneg_vd_vd(vdouble d)\n{\n  vdouble ret;\n#ifdef __clang__\n  ret = vec_neg(d);\n#else\n  __asm__ __volatile__(\"xvnegdp %x0,%x1\" : \"=wa\" (ret) : \"wa\" (d));\n#endif\n  return ret;\n}\n\nstatic INLINE vfloat vposneg_vf_vf(vfloat d)\n{ return vec_xor(d, vset__vf(+0.0f, -0.0f, +0.0f, -0.0f)); }\nstatic INLINE vdouble vposneg_vd_vd(vdouble d)\n{ return vec_xor(d, vset__vd(+0.0, -0.0)); }\n\nstatic 
INLINE vfloat vnegpos_vf_vf(vfloat d)\n{ return vec_xor(d, vset__vf(-0.0f, +0.0f, -0.0f, +0.0f)); }\nstatic INLINE vdouble vnegpos_vd_vd(vdouble d)\n{ return vec_xor(d, vset__vd(-0.0, +0.0)); }\n\n////////////// Addition //////////////\nstatic INLINE vint vadd_vi_vi_vi(vint x, vint y)\n{ return vec_add(x, y); }\nstatic INLINE vint2 vadd_vi2_vi2_vi2(vint2 x, vint2 y)\n{ return vec_add(x, y); }\n\nstatic INLINE vfloat vadd_vf_vf_vf(vfloat x, vfloat y)\n{ return vec_add(x, y); }\nstatic INLINE vdouble vadd_vd_vd_vd(vdouble x, vdouble y)\n{ return vec_add(x, y); }\n\nstatic INLINE vmask vadd64_vm_vm_vm(vmask x, vmask y)\n{ return (vmask)vec_add((v__i64)x, (v__i64)y); }\n\n////////////// Subtraction //////////////\nstatic INLINE vint vsub_vi_vi_vi(vint x, vint y)\n{ return vec_sub(x, y); }\nstatic INLINE vint2 vsub_vi2_vi2_vi2(vint2 x, vint2 y)\n{ return vec_sub(x, y); }\n\nstatic INLINE vfloat vsub_vf_vf_vf(vfloat x, vfloat y)\n{ return vec_sub(x, y); }\nstatic INLINE vdouble vsub_vd_vd_vd(vdouble x, vdouble y)\n{ return vec_sub(x, y); }\n\nstatic INLINE vdouble vsubadd_vd_vd_vd(vdouble x, vdouble y)\n{ return vec_add(x, vnegpos_vd_vd(y)); }\nstatic INLINE vfloat vsubadd_vf_vf_vf(vfloat x, vfloat y)\n{ return vec_add(x, vnegpos_vf_vf(y)); }\n\n////////////// Multiplication //////////////\nstatic INLINE vfloat vmul_vf_vf_vf(vfloat x, vfloat y)\n{ return vec_mul(x, y); }\nstatic INLINE vdouble vmul_vd_vd_vd(vdouble x, vdouble y)\n{ return vec_mul(x, y); }\n\nstatic INLINE vfloat vdiv_vf_vf_vf(vfloat x, vfloat y)\n{ return vec_div(x, y); }\nstatic INLINE vdouble vdiv_vd_vd_vd(vdouble x, vdouble y)\n{ return vec_div(x, y); }\n\nstatic INLINE vfloat vrec_vf_vf(vfloat x)\n{ return vec_div(vsetall__vf(1.0f), x); }\nstatic INLINE vdouble vrec_vd_vd(vdouble x)\n{ return vec_div(vsetall__vd(1.0), x); }\n\n/**********************************************\n ** Math\n **********************************************/\n\nstatic INLINE vfloat vmax_vf_vf_vf(vfloat x, vfloat y)\n{ return 
vec_max(x, y); }\nstatic INLINE vdouble vmax_vd_vd_vd(vdouble x, vdouble y)\n{ return vec_max(x, y); }\n\nstatic INLINE vfloat vmin_vf_vf_vf(vfloat x, vfloat y)\n{ return vec_min(x, y); }\nstatic INLINE vdouble vmin_vd_vd_vd(vdouble x, vdouble y)\n{ return vec_min(x, y); }\n\nstatic INLINE vfloat vabs_vf_vf(vfloat f)\n{ return vec_abs(f); }\nstatic INLINE vdouble vabs_vd_vd(vdouble d)\n{ return vec_abs(d); }\n\nstatic INLINE vfloat vsqrt_vf_vf(vfloat f)\n{ return vec_sqrt(f); }\nstatic INLINE vdouble vsqrt_vd_vd(vdouble d)\n{ return vec_sqrt(d); }\n\n\n/**********************************************\n ** FMA3\n **********************************************/\n#if CONFIG == 1\n\nstatic INLINE vfloat vmla_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z)\n{ return vec_madd(x, y, z); }\nstatic INLINE vdouble vmla_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z)\n{ return vec_madd(x, y, z); }\n\nstatic INLINE vfloat vmlapn_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z)\n{ return vec_msub(x, y, z); }\nstatic INLINE vdouble vmlapn_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z)\n{ return vec_msub(x, y, z); }\n\nstatic INLINE vfloat vmlanp_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z)\n{ return vec_nmsub(x, y, z); }\nstatic INLINE vdouble vmlanp_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z)\n{ return vec_nmsub(x, y, z); }\n\n#else\n\nstatic INLINE vfloat vmla_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z)\n{ return vec_add(vec_mul(x, y), z); }\nstatic INLINE vdouble vmla_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z)\n{ return vec_add(vec_mul(x, y), z); }\n\nstatic INLINE vfloat vmlapn_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z)\n{ return vec_sub(vec_mul(x, y), z); }\nstatic INLINE vdouble vmlapn_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z)\n{ return vec_sub(vec_mul(x, y), z); }\n\nstatic INLINE vfloat vmlanp_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z)\n{ return vec_sub(z, vec_mul(x, y)); }\nstatic INLINE vdouble vmlanp_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z)\n{ return vec_sub(z, vec_mul(x, y)); 
}\n\n#endif\n\nstatic INLINE vfloat vfma_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z)\n{ return vec_madd(x, y, z); }\nstatic INLINE vdouble vfma_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z)\n{ return vec_madd(x, y, z); }\nstatic INLINE vfloat vfmapp_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z)\n{ return vec_madd(x, y, z); }\nstatic INLINE vdouble vfmapp_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z)\n{ return vec_madd(x, y, z); }\n\nstatic INLINE vfloat vfmapn_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z)\n{ return vec_msub(x, y, z); }\nstatic INLINE vdouble vfmapn_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z)\n{ return vec_msub(x, y, z); }\n\nstatic INLINE vfloat vfmanp_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z)\n{ return vec_nmsub(x, y, z); }\nstatic INLINE vdouble vfmanp_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z)\n{ return vec_nmsub(x, y, z); }\n\nstatic INLINE vfloat vfmann_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z)\n{ return vec_nmadd(x, y, z); }\nstatic INLINE vdouble vfmann_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z)\n{ return vec_nmadd(x, y, z); }\n\nstatic INLINE vfloat vmlsubadd_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z)\n{ return vmla_vf_vf_vf_vf(x, y, vnegpos_vf_vf(z)); }\nstatic INLINE vdouble vmlsubadd_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z)\n{ return vmla_vd_vd_vd_vd(x, y, vnegpos_vd_vd(z)); }\n"
  },
  {
    "path": "src/helpersse2.h",
    "content": "//   Copyright Naoki Shibata and contributors 2010 - 2020.\n// Distributed under the Boost Software License, Version 1.0.\n//    (See accompanying file LICENSE.txt or copy at\n//          http://www.boost.org/LICENSE_1_0.txt)\n\n#if CONFIG == 2\n\n#if !defined(__SSE2__) && !defined(SLEEF_GENHEADER)\n#error Please specify -msse2.\n#endif\n\n#elif CONFIG == 3\n\n#if (!defined(__SSE2__) || !defined(__SSE3__)) && !defined(SLEEF_GENHEADER)\n#error Please specify -msse2 and -msse3\n#endif\n\n#elif CONFIG == 4\n\n#if (!defined(__SSE2__) || !defined(__SSE3__) || !defined(__SSE4_1__)) && !defined(SLEEF_GENHEADER)\n#error Please specify -msse2, -msse3 and -msse4.1\n#endif\n\n#else\n#error CONFIG macro invalid or not defined\n#endif\n\n#define ENABLE_DP\n//@#define ENABLE_DP\n#define LOG2VECTLENDP 1\n//@#define LOG2VECTLENDP 1\n#define VECTLENDP (1 << LOG2VECTLENDP)\n//@#define VECTLENDP (1 << LOG2VECTLENDP)\n\n#define ENABLE_SP\n//@#define ENABLE_SP\n#define LOG2VECTLENSP (LOG2VECTLENDP+1)\n//@#define LOG2VECTLENSP (LOG2VECTLENDP+1)\n#define VECTLENSP (1 << LOG2VECTLENSP)\n//@#define VECTLENSP (1 << LOG2VECTLENSP)\n\n#define ACCURATE_SQRT\n//@#define ACCURATE_SQRT\n\n#if !defined(SLEEF_GENHEADER)\n#if defined(_MSC_VER)\n#include <intrin.h>\n#else\n#include <x86intrin.h>\n#endif\n\n#include <stdint.h>\n#include \"misc.h\"\n#endif // #if !defined(SLEEF_GENHEADER)\n\ntypedef __m128i vmask;\ntypedef __m128i vopmask;\n\ntypedef __m128d vdouble;\ntypedef __m128i vint;\n\ntypedef __m128  vfloat;\ntypedef __m128i vint2;\n\ntypedef struct {\n  vmask x, y;\n} vmask2;\n\n//\n\n#if !defined(SLEEF_GENHEADER)\n\n#ifndef __SLEEF_H__\nstatic inline\n                       void Sleef_x86CpuID(int32_t out[4], uint32_t eax,\n                                           uint32_t ecx) {\n                         /* We don't care for cpuid detection */\n                         out[0] = 0xFFFFFFFF;\n                         out[1] = 0xFFFFFFFF;\n                         out[2] = 
0xFFFFFFFF;\n                         out[3] = 0xFFFFFFFF;\n                       }\n                       #endif\n\nstatic INLINE int cpuSupportsSSE2() {\n    int32_t reg[4];\n    Sleef_x86CpuID(reg, 1, 0);\n    return (reg[3] & (1 << 26)) != 0;\n}\n\nstatic INLINE int cpuSupportsSSE3() {\n    int32_t reg[4];\n    Sleef_x86CpuID(reg, 1, 0);\n    return (reg[2] & (1 << 0)) != 0;\n}\n\nstatic INLINE int cpuSupportsSSE4_1() {\n    int32_t reg[4];\n    Sleef_x86CpuID(reg, 1, 0);\n    return (reg[2] & (1 << 19)) != 0;\n}\n\n#if defined(__SSE2__) && defined(__SSE3__) && defined(__SSE4_1__)\nstatic INLINE int vavailability_i(int name) {\n  //int d = __builtin_cpu_supports(\"sse2\") && __builtin_cpu_supports(\"sse3\") && __builtin_cpu_supports(\"sse4.1\");\n  int d = cpuSupportsSSE2() && cpuSupportsSSE3() && cpuSupportsSSE4_1();\n  return d ? 3 : 0;\n}\n#define ISANAME \"SSE4.1\"\n#define DFTPRIORITY 12\n#elif defined(__SSE2__) && defined(__SSE3__)\nstatic INLINE int vavailability_i(int name) {\n  //int d = __builtin_cpu_supports(\"sse2\") && __builtin_cpu_supports(\"sse3\");\n  int d = cpuSupportsSSE2() && cpuSupportsSSE3();\n  return d ? 3 : 0;\n}\n#define ISANAME \"SSE3\"\n#define DFTPRIORITY 11\n#else\nstatic INLINE int vavailability_i(int name) {\n  int d = cpuSupportsSSE2();\n  return d ? 
3 : 0;\n}\n#define ISANAME \"SSE2\"\n#define DFTPRIORITY 10\n#endif\n\n#endif // #if !defined(SLEEF_GENHEADER)\n\nstatic INLINE void vprefetch_v_p(const void *ptr) { _mm_prefetch(ptr, _MM_HINT_T0); }\n\nstatic INLINE int vtestallones_i_vo32(vopmask g) { return _mm_movemask_epi8(g) == 0xFFFF; }\nstatic INLINE int vtestallones_i_vo64(vopmask g) { return _mm_movemask_epi8(g) == 0xFFFF; }\n\n//\n\nstatic vint2 vloadu_vi2_p(int32_t *p) { return _mm_loadu_si128((__m128i *)p); }\nstatic void vstoreu_v_p_vi2(int32_t *p, vint2 v) { _mm_storeu_si128((__m128i *)p, v); }\n\nstatic vint vloadu_vi_p(int32_t *p) { return _mm_loadu_si128((__m128i *)p); }\nstatic void vstoreu_v_p_vi(int32_t *p, vint v) { _mm_storeu_si128((__m128i *)p, v); }\n\n//\n\nstatic INLINE vmask vand_vm_vm_vm(vmask x, vmask y) { return _mm_and_si128(x, y); }\nstatic INLINE vmask vandnot_vm_vm_vm(vmask x, vmask y) { return _mm_andnot_si128(x, y); }\nstatic INLINE vmask vor_vm_vm_vm(vmask x, vmask y) { return _mm_or_si128(x, y); }\nstatic INLINE vmask vxor_vm_vm_vm(vmask x, vmask y) { return _mm_xor_si128(x, y); }\n\nstatic INLINE vopmask vand_vo_vo_vo(vopmask x, vopmask y) { return _mm_and_si128(x, y); }\nstatic INLINE vopmask vandnot_vo_vo_vo(vopmask x, vopmask y) { return _mm_andnot_si128(x, y); }\nstatic INLINE vopmask vor_vo_vo_vo(vopmask x, vopmask y) { return _mm_or_si128(x, y); }\nstatic INLINE vopmask vxor_vo_vo_vo(vopmask x, vopmask y) { return _mm_xor_si128(x, y); }\n\nstatic INLINE vmask vand_vm_vo64_vm(vopmask x, vmask y) { return _mm_and_si128(x, y); }\nstatic INLINE vmask vor_vm_vo64_vm(vopmask x, vmask y) { return _mm_or_si128(x, y); }\nstatic INLINE vmask vandnot_vm_vo64_vm(vmask x, vmask y) { return _mm_andnot_si128(x, y); }\nstatic INLINE vmask vxor_vm_vo64_vm(vmask x, vmask y) { return _mm_xor_si128(x, y); }\n\nstatic INLINE vmask vand_vm_vo32_vm(vopmask x, vmask y) { return _mm_and_si128(x, y); }\nstatic INLINE vmask vor_vm_vo32_vm(vopmask x, vmask y) { return _mm_or_si128(x, y); }\nstatic 
INLINE vmask vandnot_vm_vo32_vm(vmask x, vmask y) { return _mm_andnot_si128(x, y); }\nstatic INLINE vmask vxor_vm_vo32_vm(vmask x, vmask y) { return _mm_xor_si128(x, y); }\n\nstatic INLINE vopmask vcast_vo32_vo64(vopmask m) { return _mm_shuffle_epi32(m, 0x08); }\nstatic INLINE vopmask vcast_vo64_vo32(vopmask m) { return _mm_shuffle_epi32(m, 0x50); }\n\n//\n\nstatic INLINE vint vrint_vi_vd(vdouble vd) { return _mm_cvtpd_epi32(vd); }\nstatic INLINE vint vtruncate_vi_vd(vdouble vd) { return _mm_cvttpd_epi32(vd); }\nstatic INLINE vdouble vcast_vd_vi(vint vi) { return _mm_cvtepi32_pd(vi); }\nstatic INLINE vint vcast_vi_i(int i) { return _mm_set_epi32(0, 0, i, i); }\nstatic INLINE vint2 vcastu_vi2_vi(vint vi) { return _mm_and_si128(_mm_shuffle_epi32(vi, 0x73), _mm_set_epi32(-1, 0, -1, 0)); }\nstatic INLINE vint vcastu_vi_vi2(vint2 vi) { return _mm_shuffle_epi32(vi, 0x0d); }\n\n#if CONFIG == 4\nstatic INLINE vdouble vtruncate_vd_vd(vdouble vd) { return _mm_round_pd(vd, _MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC); }\nstatic INLINE vdouble vrint_vd_vd(vdouble vd) { return _mm_round_pd(vd, _MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC); }\nstatic INLINE vfloat vtruncate_vf_vf(vfloat vf) { return _mm_round_ps(vf, _MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC); }\nstatic INLINE vfloat vrint_vf_vf(vfloat vd) { return _mm_round_ps(vd, _MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC); }\nstatic INLINE vopmask veq64_vo_vm_vm(vmask x, vmask y) { return _mm_cmpeq_epi64(x, y); }\n#define FULL_FP_ROUNDING\n//@#define FULL_FP_ROUNDING\n#else\nstatic INLINE vdouble vtruncate_vd_vd(vdouble vd) { return vcast_vd_vi(vtruncate_vi_vd(vd)); }\nstatic INLINE vdouble vrint_vd_vd(vdouble vd) { return vcast_vd_vi(vrint_vi_vd(vd)); }\nstatic INLINE vopmask veq64_vo_vm_vm(vmask x, vmask y) {\n  vmask t = _mm_cmpeq_epi32(x, y);\n  return vand_vm_vm_vm(t, _mm_shuffle_epi32(t, 0xb1));\n}\n#endif\n\nstatic INLINE vmask vadd64_vm_vm_vm(vmask x, vmask y) { return _mm_add_epi64(x, y); }\n\nstatic INLINE vmask 
vcast_vm_i_i(int i0, int i1) { return _mm_set_epi32(i0, i1, i0, i1); }\n\n//\n\nstatic INLINE vdouble vcast_vd_d(double d) { return _mm_set1_pd(d); }\nstatic INLINE vmask vreinterpret_vm_vd(vdouble vd) { return _mm_castpd_si128(vd); }\nstatic INLINE vint2 vreinterpret_vi2_vd(vdouble vd) { return _mm_castpd_si128(vd); }\nstatic INLINE vdouble vreinterpret_vd_vi2(vint2 vi) { return _mm_castsi128_pd(vi); }\nstatic INLINE vdouble vreinterpret_vd_vm(vmask vm) { return _mm_castsi128_pd(vm); }\n\nstatic INLINE vdouble vadd_vd_vd_vd(vdouble x, vdouble y) { return _mm_add_pd(x, y); }\nstatic INLINE vdouble vsub_vd_vd_vd(vdouble x, vdouble y) { return _mm_sub_pd(x, y); }\nstatic INLINE vdouble vmul_vd_vd_vd(vdouble x, vdouble y) { return _mm_mul_pd(x, y); }\nstatic INLINE vdouble vdiv_vd_vd_vd(vdouble x, vdouble y) { return _mm_div_pd(x, y); }\nstatic INLINE vdouble vrec_vd_vd(vdouble x) { return _mm_div_pd(_mm_set1_pd(1), x); }\nstatic INLINE vdouble vsqrt_vd_vd(vdouble x) { return _mm_sqrt_pd(x); }\nstatic INLINE vdouble vabs_vd_vd(vdouble d) { return _mm_andnot_pd(_mm_set1_pd(-0.0), d); }\nstatic INLINE vdouble vneg_vd_vd(vdouble d) { return _mm_xor_pd(_mm_set1_pd(-0.0), d); }\nstatic INLINE vdouble vmla_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return vadd_vd_vd_vd(vmul_vd_vd_vd(x, y), z); }\nstatic INLINE vdouble vmlapn_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return vsub_vd_vd_vd(vmul_vd_vd_vd(x, y), z); }\nstatic INLINE vdouble vmlanp_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return vsub_vd_vd_vd(z, vmul_vd_vd_vd(x, y)); }\nstatic INLINE vdouble vmax_vd_vd_vd(vdouble x, vdouble y) { return _mm_max_pd(x, y); }\nstatic INLINE vdouble vmin_vd_vd_vd(vdouble x, vdouble y) { return _mm_min_pd(x, y); }\n\nstatic INLINE vopmask veq_vo_vd_vd(vdouble x, vdouble y) { return _mm_castpd_si128(_mm_cmpeq_pd(x, y)); }\nstatic INLINE vopmask vneq_vo_vd_vd(vdouble x, vdouble y) { return _mm_castpd_si128(_mm_cmpneq_pd(x, y)); }\nstatic INLINE vopmask vlt_vo_vd_vd(vdouble 
x, vdouble y) { return _mm_castpd_si128(_mm_cmplt_pd(x, y)); }\nstatic INLINE vopmask vle_vo_vd_vd(vdouble x, vdouble y) { return _mm_castpd_si128(_mm_cmple_pd(x, y)); }\nstatic INLINE vopmask vgt_vo_vd_vd(vdouble x, vdouble y) { return _mm_castpd_si128(_mm_cmpgt_pd(x, y)); }\nstatic INLINE vopmask vge_vo_vd_vd(vdouble x, vdouble y) { return _mm_castpd_si128(_mm_cmpge_pd(x, y)); }\n\nstatic INLINE vint vadd_vi_vi_vi(vint x, vint y) { return _mm_add_epi32(x, y); }\nstatic INLINE vint vsub_vi_vi_vi(vint x, vint y) { return _mm_sub_epi32(x, y); }\nstatic INLINE vint vneg_vi_vi(vint e) { return vsub_vi_vi_vi(vcast_vi_i(0), e); }\n\nstatic INLINE vint vand_vi_vi_vi(vint x, vint y) { return _mm_and_si128(x, y); }\nstatic INLINE vint vandnot_vi_vi_vi(vint x, vint y) { return _mm_andnot_si128(x, y); }\nstatic INLINE vint vor_vi_vi_vi(vint x, vint y) { return _mm_or_si128(x, y); }\nstatic INLINE vint vxor_vi_vi_vi(vint x, vint y) { return _mm_xor_si128(x, y); }\n\nstatic INLINE vint vand_vi_vo_vi(vopmask x, vint y) { return _mm_and_si128(x, y); }\nstatic INLINE vint vandnot_vi_vo_vi(vopmask x, vint y) { return _mm_andnot_si128(x, y); }\n\nstatic INLINE vint vsll_vi_vi_i(vint x, int c) { return _mm_slli_epi32(x, c); }\nstatic INLINE vint vsrl_vi_vi_i(vint x, int c) { return _mm_srli_epi32(x, c); }\nstatic INLINE vint vsra_vi_vi_i(vint x, int c) { return _mm_srai_epi32(x, c); }\n\nstatic INLINE vint veq_vi_vi_vi(vint x, vint y) { return _mm_cmpeq_epi32(x, y); }\nstatic INLINE vint vgt_vi_vi_vi(vint x, vint y) { return _mm_cmpgt_epi32(x, y); }\n\nstatic INLINE vopmask veq_vo_vi_vi(vint x, vint y) { return _mm_cmpeq_epi32(x, y); }\nstatic INLINE vopmask vgt_vo_vi_vi(vint x, vint y) { return _mm_cmpgt_epi32(x, y); }\n\n#if CONFIG == 4\nstatic INLINE vint vsel_vi_vo_vi_vi(vopmask m, vint x, vint y) { return _mm_blendv_epi8(y, x, m); }\n\nstatic INLINE vdouble vsel_vd_vo_vd_vd(vopmask m, vdouble x, vdouble y) { return _mm_blendv_pd(y, x, _mm_castsi128_pd(m)); }\n#else\nstatic 
INLINE vint vsel_vi_vo_vi_vi(vopmask m, vint x, vint y) { return vor_vm_vm_vm(vand_vm_vm_vm(m, x), vandnot_vm_vm_vm(m, y)); }\n\nstatic INLINE vdouble vsel_vd_vo_vd_vd(vopmask opmask, vdouble x, vdouble y) {\n  return _mm_or_pd(_mm_and_pd(_mm_castsi128_pd(opmask), x), _mm_andnot_pd(_mm_castsi128_pd(opmask), y));\n}\n#endif\n\nstatic INLINE CONST vdouble vsel_vd_vo_d_d(vopmask o, double v1, double v0) {\n  return vsel_vd_vo_vd_vd(o, vcast_vd_d(v1), vcast_vd_d(v0));\n}\n\nstatic INLINE vdouble vsel_vd_vo_vo_d_d_d(vopmask o0, vopmask o1, double d0, double d1, double d2) {\n  return vsel_vd_vo_vd_vd(o0, vcast_vd_d(d0), vsel_vd_vo_d_d(o1, d1, d2));\n}\n\nstatic INLINE vdouble vsel_vd_vo_vo_vo_d_d_d_d(vopmask o0, vopmask o1, vopmask o2, double d0, double d1, double d2, double d3) {\n  return vsel_vd_vo_vd_vd(o0, vcast_vd_d(d0), vsel_vd_vo_vd_vd(o1, vcast_vd_d(d1), vsel_vd_vo_d_d(o2, d2, d3)));\n}\n\nstatic INLINE vopmask visinf_vo_vd(vdouble d) {\n  return vreinterpret_vm_vd(_mm_cmpeq_pd(vabs_vd_vd(d), _mm_set1_pd(SLEEF_INFINITY)));\n}\n\nstatic INLINE vopmask vispinf_vo_vd(vdouble d) {\n  return vreinterpret_vm_vd(_mm_cmpeq_pd(d, _mm_set1_pd(SLEEF_INFINITY)));\n}\n\nstatic INLINE vopmask visminf_vo_vd(vdouble d) {\n  return vreinterpret_vm_vd(_mm_cmpeq_pd(d, _mm_set1_pd(-SLEEF_INFINITY)));\n}\n\nstatic INLINE vopmask visnan_vo_vd(vdouble d) {\n  return vreinterpret_vm_vd(_mm_cmpneq_pd(d, d));\n}\n\n//\n\nstatic INLINE vdouble vload_vd_p(const double *ptr) { return _mm_load_pd(ptr); }\nstatic INLINE vdouble vloadu_vd_p(const double *ptr) { return _mm_loadu_pd(ptr); }\n\nstatic INLINE void vstore_v_p_vd(double *ptr, vdouble v) { _mm_store_pd(ptr, v); }\nstatic INLINE void vstoreu_v_p_vd(double *ptr, vdouble v) { _mm_storeu_pd(ptr, v); }\n\nstatic INLINE vdouble vgather_vd_p_vi(const double *ptr, vint vi) {\n  int a[sizeof(vint)/sizeof(int)];\n  vstoreu_v_p_vi(a, vi);\n  return _mm_set_pd(ptr[a[1]], ptr[a[0]]);\n}\n\n// This function is for debugging\nstatic INLINE double 
vcast_d_vd(vdouble v) {\n  double a[VECTLENDP];\n  vstoreu_v_p_vd(a, v);\n  return a[0];\n}\n\n//\n\nstatic INLINE vint2 vcast_vi2_vm(vmask vm) { return vm; }\nstatic INLINE vmask vcast_vm_vi2(vint2 vi) { return vi; }\nstatic INLINE vint2 vrint_vi2_vf(vfloat vf) { return _mm_cvtps_epi32(vf); }\nstatic INLINE vint2 vtruncate_vi2_vf(vfloat vf) { return _mm_cvttps_epi32(vf); }\nstatic INLINE vfloat vcast_vf_vi2(vint2 vi) { return _mm_cvtepi32_ps(vcast_vm_vi2(vi)); }\nstatic INLINE vfloat vcast_vf_f(float f) { return _mm_set1_ps(f); }\nstatic INLINE vint2 vcast_vi2_i(int i) { return _mm_set1_epi32(i); }\nstatic INLINE vmask vreinterpret_vm_vf(vfloat vf) { return _mm_castps_si128(vf); }\nstatic INLINE vfloat vreinterpret_vf_vm(vmask vm) { return _mm_castsi128_ps(vm); }\nstatic INLINE vfloat vreinterpret_vf_vi2(vint2 vm) { return _mm_castsi128_ps(vm); }\nstatic INLINE vint2 vreinterpret_vi2_vf(vfloat vf) { return _mm_castps_si128(vf); }\n\n#if CONFIG != 4\nstatic INLINE vfloat vtruncate_vf_vf(vfloat vd) { return vcast_vf_vi2(vtruncate_vi2_vf(vd)); }\nstatic INLINE vfloat vrint_vf_vf(vfloat vf) { return vcast_vf_vi2(vrint_vi2_vf(vf)); }\n#endif\n\nstatic INLINE vfloat vadd_vf_vf_vf(vfloat x, vfloat y) { return _mm_add_ps(x, y); }\nstatic INLINE vfloat vsub_vf_vf_vf(vfloat x, vfloat y) { return _mm_sub_ps(x, y); }\nstatic INLINE vfloat vmul_vf_vf_vf(vfloat x, vfloat y) { return _mm_mul_ps(x, y); }\nstatic INLINE vfloat vdiv_vf_vf_vf(vfloat x, vfloat y) { return _mm_div_ps(x, y); }\nstatic INLINE vfloat vrec_vf_vf(vfloat x) { return vdiv_vf_vf_vf(vcast_vf_f(1.0f), x); }\nstatic INLINE vfloat vsqrt_vf_vf(vfloat x) { return _mm_sqrt_ps(x); }\nstatic INLINE vfloat vabs_vf_vf(vfloat f) { return vreinterpret_vf_vm(vandnot_vm_vm_vm(vreinterpret_vm_vf(vcast_vf_f(-0.0f)), vreinterpret_vm_vf(f))); }\nstatic INLINE vfloat vneg_vf_vf(vfloat d) { return vreinterpret_vf_vm(vxor_vm_vm_vm(vreinterpret_vm_vf(vcast_vf_f(-0.0f)), vreinterpret_vm_vf(d))); }\nstatic INLINE vfloat 
vmla_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return vadd_vf_vf_vf(vmul_vf_vf_vf(x, y), z); }\nstatic INLINE vfloat vmlapn_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return vsub_vf_vf_vf(vmul_vf_vf_vf(x, y), z); }\nstatic INLINE vfloat vmlanp_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return vsub_vf_vf_vf(z, vmul_vf_vf_vf(x, y)); }\nstatic INLINE vfloat vmax_vf_vf_vf(vfloat x, vfloat y) { return _mm_max_ps(x, y); }\nstatic INLINE vfloat vmin_vf_vf_vf(vfloat x, vfloat y) { return _mm_min_ps(x, y); }\n\nstatic INLINE vopmask veq_vo_vf_vf(vfloat x, vfloat y) { return vreinterpret_vm_vf(_mm_cmpeq_ps(x, y)); }\nstatic INLINE vopmask vneq_vo_vf_vf(vfloat x, vfloat y) { return vreinterpret_vm_vf(_mm_cmpneq_ps(x, y)); }\nstatic INLINE vopmask vlt_vo_vf_vf(vfloat x, vfloat y) { return vreinterpret_vm_vf(_mm_cmplt_ps(x, y)); }\nstatic INLINE vopmask vle_vo_vf_vf(vfloat x, vfloat y) { return vreinterpret_vm_vf(_mm_cmple_ps(x, y)); }\nstatic INLINE vopmask vgt_vo_vf_vf(vfloat x, vfloat y) { return vreinterpret_vm_vf(_mm_cmpgt_ps(x, y)); }\nstatic INLINE vopmask vge_vo_vf_vf(vfloat x, vfloat y) { return vreinterpret_vm_vf(_mm_cmpge_ps(x, y)); }\n\nstatic INLINE vint2 vadd_vi2_vi2_vi2(vint2 x, vint2 y) { return vadd_vi_vi_vi(x, y); }\nstatic INLINE vint2 vsub_vi2_vi2_vi2(vint2 x, vint2 y) { return vsub_vi_vi_vi(x, y); }\nstatic INLINE vint2 vneg_vi2_vi2(vint2 e) { return vsub_vi2_vi2_vi2(vcast_vi2_i(0), e); }\n\nstatic INLINE vint2 vand_vi2_vi2_vi2(vint2 x, vint2 y) { return vand_vi_vi_vi(x, y); }\nstatic INLINE vint2 vandnot_vi2_vi2_vi2(vint2 x, vint2 y) { return vandnot_vi_vi_vi(x, y); }\nstatic INLINE vint2 vor_vi2_vi2_vi2(vint2 x, vint2 y) { return vor_vi_vi_vi(x, y); }\nstatic INLINE vint2 vxor_vi2_vi2_vi2(vint2 x, vint2 y) { return vxor_vi_vi_vi(x, y); }\n\nstatic INLINE vint2 vand_vi2_vo_vi2(vopmask x, vint2 y) { return vand_vi_vo_vi(x, y); }\nstatic INLINE vint2 vandnot_vi2_vo_vi2(vopmask x, vint2 y) { return vandnot_vi_vo_vi(x, y); }\n\nstatic INLINE vint2 
vsll_vi2_vi2_i(vint2 x, int c) { return vsll_vi_vi_i(x, c); }\nstatic INLINE vint2 vsrl_vi2_vi2_i(vint2 x, int c) { return vsrl_vi_vi_i(x, c); }\nstatic INLINE vint2 vsra_vi2_vi2_i(vint2 x, int c) { return vsra_vi_vi_i(x, c); }\n\nstatic INLINE vopmask veq_vo_vi2_vi2(vint2 x, vint2 y) { return _mm_cmpeq_epi32(x, y); }\nstatic INLINE vopmask vgt_vo_vi2_vi2(vint2 x, vint2 y) { return _mm_cmpgt_epi32(x, y); }\nstatic INLINE vint2 veq_vi2_vi2_vi2(vint2 x, vint2 y) { return _mm_cmpeq_epi32(x, y); }\nstatic INLINE vint2 vgt_vi2_vi2_vi2(vint2 x, vint2 y) { return _mm_cmpgt_epi32(x, y); }\n\n#if CONFIG == 4\nstatic INLINE vint2 vsel_vi2_vo_vi2_vi2(vopmask m, vint2 x, vint2 y) { return _mm_blendv_epi8(y, x, m); }\n\nstatic INLINE vfloat vsel_vf_vo_vf_vf(vopmask m, vfloat x, vfloat y) { return _mm_blendv_ps(y, x, _mm_castsi128_ps(m)); }\n#else\nstatic INLINE vint2 vsel_vi2_vo_vi2_vi2(vopmask m, vint2 x, vint2 y) {\n  return vor_vi2_vi2_vi2(vand_vi2_vi2_vi2(m, x), vandnot_vi2_vi2_vi2(m, y));\n}\n\nstatic INLINE vfloat vsel_vf_vo_vf_vf(vopmask opmask, vfloat x, vfloat y) {\n  return _mm_or_ps(_mm_and_ps(_mm_castsi128_ps(opmask), x), _mm_andnot_ps(_mm_castsi128_ps(opmask), y));\n}\n#endif\n\nstatic INLINE CONST vfloat vsel_vf_vo_f_f(vopmask o, float v1, float v0) {\n  return vsel_vf_vo_vf_vf(o, vcast_vf_f(v1), vcast_vf_f(v0));\n}\n\nstatic INLINE vfloat vsel_vf_vo_vo_f_f_f(vopmask o0, vopmask o1, float d0, float d1, float d2) {\n  return vsel_vf_vo_vf_vf(o0, vcast_vf_f(d0), vsel_vf_vo_f_f(o1, d1, d2));\n}\n\nstatic INLINE vfloat vsel_vf_vo_vo_vo_f_f_f_f(vopmask o0, vopmask o1, vopmask o2, float d0, float d1, float d2, float d3) {\n  return vsel_vf_vo_vf_vf(o0, vcast_vf_f(d0), vsel_vf_vo_vf_vf(o1, vcast_vf_f(d1), vsel_vf_vo_f_f(o2, d2, d3)));\n}\n\nstatic INLINE vopmask visinf_vo_vf(vfloat d) { return veq_vo_vf_vf(vabs_vf_vf(d), vcast_vf_f(SLEEF_INFINITYf)); }\nstatic INLINE vopmask vispinf_vo_vf(vfloat d) { return veq_vo_vf_vf(d, vcast_vf_f(SLEEF_INFINITYf)); }\nstatic INLINE 
vopmask visminf_vo_vf(vfloat d) { return veq_vo_vf_vf(d, vcast_vf_f(-SLEEF_INFINITYf)); }\nstatic INLINE vopmask visnan_vo_vf(vfloat d) { return vneq_vo_vf_vf(d, d); }\n\nstatic INLINE vfloat vload_vf_p(const float *ptr) { return _mm_load_ps(ptr); }\nstatic INLINE vfloat vloadu_vf_p(const float *ptr) { return _mm_loadu_ps(ptr); }\n\nstatic INLINE void vstore_v_p_vf(float *ptr, vfloat v) { _mm_store_ps(ptr, v); }\nstatic INLINE void vstoreu_v_p_vf(float *ptr, vfloat v) { _mm_storeu_ps(ptr, v); }\n\nstatic INLINE vfloat vgather_vf_p_vi2(const float *ptr, vint2 vi) {\n  int a[VECTLENSP];\n  vstoreu_v_p_vi2(a, vi);\n  return _mm_set_ps(ptr[a[3]], ptr[a[2]], ptr[a[1]], ptr[a[0]]);\n}\n\n// This function is for debugging\nstatic INLINE float vcast_f_vf(vfloat v) {\n  float a[VECTLENSP];\n  vstoreu_v_p_vf(a, v);\n  return a[0];\n}\n\n//\n\n#define PNMASK ((vdouble) { +0.0, -0.0 })\n#define NPMASK ((vdouble) { -0.0, +0.0 })\n#define PNMASKf ((vfloat) { +0.0f, -0.0f, +0.0f, -0.0f })\n#define NPMASKf ((vfloat) { -0.0f, +0.0f, -0.0f, +0.0f })\n\nstatic INLINE vdouble vposneg_vd_vd(vdouble d) { return vreinterpret_vd_vm(vxor_vm_vm_vm(vreinterpret_vm_vd(d), vreinterpret_vm_vd(PNMASK))); }\nstatic INLINE vdouble vnegpos_vd_vd(vdouble d) { return vreinterpret_vd_vm(vxor_vm_vm_vm(vreinterpret_vm_vd(d), vreinterpret_vm_vd(NPMASK))); }\nstatic INLINE vfloat vposneg_vf_vf(vfloat d) { return vreinterpret_vf_vm(vxor_vm_vm_vm(vreinterpret_vm_vf(d), vreinterpret_vm_vf(PNMASKf))); }\nstatic INLINE vfloat vnegpos_vf_vf(vfloat d) { return vreinterpret_vf_vm(vxor_vm_vm_vm(vreinterpret_vm_vf(d), vreinterpret_vm_vf(NPMASKf))); }\n\n#if CONFIG >= 3\nstatic INLINE vdouble vsubadd_vd_vd_vd(vdouble x, vdouble y) { return _mm_addsub_pd(x, y); }\nstatic INLINE vfloat vsubadd_vf_vf_vf(vfloat x, vfloat y) { return _mm_addsub_ps(x, y); }\n#else\nstatic INLINE vdouble vsubadd_vd_vd_vd(vdouble x, vdouble y) { return vadd_vd_vd_vd(x, vnegpos_vd_vd(y)); }\nstatic INLINE vfloat vsubadd_vf_vf_vf(vfloat x, 
vfloat y) { return vadd_vf_vf_vf(x, vnegpos_vf_vf(y)); }\n#endif\nstatic INLINE vdouble vmlsubadd_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return vsubadd_vd_vd_vd(vmul_vd_vd_vd(x, y), z); }\nstatic INLINE vfloat vmlsubadd_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return vsubadd_vf_vf_vf(vmul_vf_vf_vf(x, y), z); }\n\nstatic INLINE vdouble vrev21_vd_vd(vdouble d0) { return _mm_shuffle_pd(d0, d0, 1); }\nstatic INLINE vdouble vreva2_vd_vd(vdouble vd) { return vd; }\n\nstatic INLINE void vstream_v_p_vd(double *ptr, vdouble v) { _mm_stream_pd(ptr, v); }\nstatic INLINE void vscatter2_v_p_i_i_vd(double *ptr, int offset, int step, vdouble v) { vstore_v_p_vd((double *)(&ptr[2*offset]), v); }\nstatic INLINE void vsscatter2_v_p_i_i_vd(double *ptr, int offset, int step, vdouble v) { _mm_stream_pd((double *)(&ptr[2*offset]), v); }\n\n//\n\nstatic INLINE vfloat vrev21_vf_vf(vfloat d0) { return _mm_shuffle_ps(d0, d0, (2 << 6) | (3 << 4) | (0 << 2) | (1 << 0)); }\nstatic INLINE vfloat vreva2_vf_vf(vfloat d0) { return _mm_shuffle_ps(d0, d0, (1 << 6) | (0 << 4) | (3 << 2) | (2 << 0)); }\nstatic INLINE vint2 vrev21_vi2_vi2(vint2 i) { return vreinterpret_vi2_vf(vrev21_vf_vf(vreinterpret_vf_vi2(i))); }\n\nstatic INLINE void vstream_v_p_vf(float *ptr, vfloat v) { _mm_stream_ps(ptr, v); }\n\nstatic INLINE void vscatter2_v_p_i_i_vf(float *ptr, int offset, int step, vfloat v) {\n  _mm_storel_pd((double *)(ptr+(offset + step * 0)*2), vreinterpret_vd_vm(vreinterpret_vm_vf(v)));\n  _mm_storeh_pd((double *)(ptr+(offset + step * 1)*2), vreinterpret_vd_vm(vreinterpret_vm_vf(v)));\n}\n\nstatic INLINE void vsscatter2_v_p_i_i_vf(float *ptr, int offset, int step, vfloat v) {\n  _mm_storel_pd((double *)(ptr+(offset + step * 0)*2), vreinterpret_vd_vm(vreinterpret_vm_vf(v)));\n  _mm_storeh_pd((double *)(ptr+(offset + step * 1)*2), vreinterpret_vd_vm(vreinterpret_vm_vf(v)));\n}\n\n//\n\nstatic INLINE vmask2 vinterleave_vm2_vm2(vmask2 v) {\n  return (vmask2) { _mm_unpacklo_epi64(v.x, v.y), 
_mm_unpackhi_epi64(v.x, v.y) };\n}\n\nstatic INLINE vmask2 vuninterleave_vm2_vm2(vmask2 v) {\n  return (vmask2) { _mm_unpacklo_epi64(v.x, v.y), _mm_unpackhi_epi64(v.x, v.y) };\n}\n\nstatic INLINE vint vuninterleave_vi_vi(vint v) { return v; }\nstatic INLINE vdouble vinterleave_vd_vd(vdouble vd) { return vd; }\nstatic INLINE vdouble vuninterleave_vd_vd(vdouble vd) { return vd; }\nstatic INLINE vmask vinterleave_vm_vm(vmask vm) { return vm; }\nstatic INLINE vmask vuninterleave_vm_vm(vmask vm) { return vm; }\n\nstatic vmask2 vloadu_vm2_p(void *p) {\n  vmask2 vm2;\n  memcpy(&vm2, p, VECTLENDP * 16);\n  return vm2;\n}\n\n#if !defined(SLEEF_GENHEADER)\ntypedef Sleef_quad2 vargquad;\n\nstatic INLINE vmask2 vcast_vm2_aq(vargquad aq) {\n  return vinterleave_vm2_vm2(vloadu_vm2_p(&aq));\n}\n\nstatic INLINE vargquad vcast_aq_vm2(vmask2 vm2) {\n  vm2 = vuninterleave_vm2_vm2(vm2);\n  vargquad aq;\n  memcpy(&aq, &vm2, VECTLENDP * 16);\n  return aq;\n}\n#endif // #if !defined(SLEEF_GENHEADER)\n\nstatic INLINE int vtestallzeros_i_vo64(vopmask g) { return _mm_movemask_epi8(g) == 0; }\n\nstatic INLINE vmask vsel_vm_vo64_vm_vm(vopmask o, vmask x, vmask y) {\n  return vor_vm_vm_vm(vand_vm_vm_vm(o, x), vandnot_vm_vm_vm(o, y));\n}\n\nstatic INLINE vmask vsub64_vm_vm_vm(vmask x, vmask y) { return _mm_sub_epi64(x, y); }\nstatic INLINE vmask vneg64_vm_vm(vmask x) { return _mm_sub_epi64(vcast_vm_i_i(0, 0), x); }\n\n#define vsll64_vm_vm_i(x, c) _mm_slli_epi64(x, c)\n#define vsrl64_vm_vm_i(x, c) _mm_srli_epi64(x, c)\n//@#define vsll64_vm_vm_i(x, c) _mm_slli_epi64(x, c)\n//@#define vsrl64_vm_vm_i(x, c) _mm_srli_epi64(x, c)\n\nstatic INLINE vopmask vgt64_vo_vm_vm(vmask x, vmask y) {\n  int64_t ax[2], ay[2];\n  _mm_storeu_si128((__m128i *)ax, x);\n  _mm_storeu_si128((__m128i *)ay, y);\n  return _mm_set_epi64x(ax[1] > ay[1] ? -1 : 0, ax[0] > ay[0] ? 
-1 : 0);\n}\n\nstatic INLINE vmask vcast_vm_vi(vint vi) {\n  vmask m = _mm_and_si128(_mm_shuffle_epi32(vi, (0 << 6) | (1 << 4) | (0 << 2) | (0 << 0)), _mm_set_epi32(0, -1, 0, -1));\n  return vor_vm_vm_vm(vcastu_vi2_vi(vgt_vo_vi_vi(vcast_vi_i(0), vi)), m);\n}\nstatic INLINE vint vcast_vi_vm(vmask vm) { return _mm_shuffle_epi32(vm, 0x08); }\n"
  },
  {
    "path": "src/helpersve.h",
    "content": "/*********************************************************************/\n/*          Copyright ARM Ltd. 2010 - 2019.                          */\n/* Distributed under the Boost Software License, Version 1.0.        */\n/*    (See accompanying file LICENSE.txt or copy at                  */\n/*          http://www.boost.org/LICENSE_1_0.txt)                    */\n/*********************************************************************/\n\n#if !defined(__ARM_FEATURE_SVE) && !defined(SLEEF_GENHEADER)\n#error Please specify SVE flags.\n#endif\n\n#if !defined(SLEEF_GENHEADER)\n#include <arm_sve.h>\n#include <stdint.h>\n\n#include \"misc.h\"\n#endif // #if !defined(SLEEF_GENHEADER)\n\n#if defined(VECTLENDP) || defined(VECTLENSP)\n#error VECTLENDP or VECTLENSP already defined\n#endif\n\n#if CONFIG == 1 || CONFIG == 2\n// Vector length agnostic\n#define VECTLENSP (svcntw())\n//@#define VECTLENSP (svcntw())\n#define VECTLENDP (svcntd())\n//@#define VECTLENDP (svcntd())\n#define ISANAME \"AArch64 SVE\"\n#define ptrue svptrue_b8()\n//@#define ptrue svptrue_b8()\n#elif CONFIG == 8\n// 256-bit vector length\n#define ISANAME \"AArch64 SVE 256-bit\"\n#define LOG2VECTLENDP 2\n#define ptrue svptrue_pat_b8(SV_VL32)\n#define DFTPRIORITY 20\n#elif CONFIG == 9\n// 512-bit vector length\n#define ISANAME \"AArch64 SVE 512-bit\"\n#define LOG2VECTLENDP 3\n#define ptrue svptrue_pat_b8(SV_VL64)\n#define DFTPRIORITY 21\n#elif CONFIG == 10\n// 1024-bit vector length\n#define ISANAME \"AArch64 SVE 1024-bit\"\n#define LOG2VECTLENDP 4\n#define ptrue svptrue_pat_b8(SV_VL128)\n#define DFTPRIORITY 22\n#elif CONFIG == 11\n// 2048-bit vector length\n#define ISANAME \"AArch64 SVE 2048-bit\"\n#define LOG2VECTLENDP 5\n#define ptrue svptrue_pat_b8(SV_VL256)\n#define DFTPRIORITY 23\n#else\n#error CONFIG macro invalid or not defined\n#endif\n\n#ifdef LOG2VECTLENDP\n// For DFT, VECTLENDP and VECTLENSP are not the size of the available\n// vector length, but the size of the partial vectors 
utilized in the\n// computation. The appropriate VECTLENDP and VECTLENSP are chosen by\n// the dispatcher according to the value of svcntd().\n\n#define LOG2VECTLENSP (LOG2VECTLENDP+1)\n#define VECTLENDP (1 << LOG2VECTLENDP)\n#define VECTLENSP (1 << LOG2VECTLENSP)\nstatic INLINE int vavailability_i(int name) { return svcntd() >= VECTLENDP ? 3 : 0; }\n#else\nstatic INLINE int vavailability_i(int name) { return 3; }\n#endif\n\n#define ENABLE_SP\n//@#define ENABLE_SP\n#define ENABLE_DP\n//@#define ENABLE_DP\n\n#if CONFIG != 2\n#define ENABLE_FMA_SP\n//@#define ENABLE_FMA_SP\n#define ENABLE_FMA_DP\n//@#define ENABLE_FMA_DP\n//#define SPLIT_KERNEL // Benchmark comparison is needed to determine whether this option should be enabled.\n#endif\n\n#define FULL_FP_ROUNDING\n//@#define FULL_FP_ROUNDING\n#define ACCURATE_SQRT\n//@#define ACCURATE_SQRT\n\n// Type definitions\n\n// Mask definition\ntypedef svint32_t vmask;\ntypedef svbool_t vopmask;\n\n// Single precision definitions\ntypedef svfloat32_t vfloat;\ntypedef svint32_t vint2;\n\n// Double precision definitions\ntypedef svfloat64_t vdouble;\ntypedef svint32_t vint;\n\n// Double-double data type with setter/getter functions\ntypedef svfloat64x2_t vdouble2;\nstatic INLINE vdouble  vd2getx_vd_vd2(vdouble2 v) { return svget2_f64(v, 0); }\nstatic INLINE vdouble  vd2gety_vd_vd2(vdouble2 v) { return svget2_f64(v, 1); }\nstatic INLINE vdouble2 vd2setxy_vd2_vd_vd(vdouble x, vdouble y)  { return svcreate2_f64(x, y); }\nstatic INLINE vdouble2 vd2setx_vd2_vd2_vd(vdouble2 v, vdouble d) { return svset2_f64(v, 0, d); }\nstatic INLINE vdouble2 vd2sety_vd2_vd2_vd(vdouble2 v, vdouble d) { return svset2_f64(v, 1, d); }\n\n// Double-float data type with setter/getter functions\ntypedef svfloat32x2_t vfloat2;\nstatic INLINE vfloat  vf2getx_vf_vf2(vfloat2 v) { return svget2_f32(v, 0); }\nstatic INLINE vfloat  vf2gety_vf_vf2(vfloat2 v) { return svget2_f32(v, 1); }\nstatic INLINE vfloat2 vf2setxy_vf2_vf_vf(vfloat x, vfloat y)  { return 
svcreate2_f32(x, y); }\nstatic INLINE vfloat2 vf2setx_vf2_vf2_vf(vfloat2 v, vfloat d) { return svset2_f32(v, 0, d); }\nstatic INLINE vfloat2 vf2sety_vf2_vf2_vf(vfloat2 v, vfloat d) { return svset2_f32(v, 1, d); }\n\n// vmask2 is mainly used for quad-precision functions\ntypedef svint32x2_t vmask2;\nstatic INLINE vmask vm2getx_vm_vm2(vmask2 v) { return svget2_s32(v, 0); }\nstatic INLINE vmask vm2gety_vm_vm2(vmask2 v) { return svget2_s32(v, 1); }\nstatic INLINE vmask2 vm2setxy_vm2_vm_vm(vmask x, vmask y) { return svcreate2_s32(x, y); }\nstatic INLINE vmask2 vm2setx_vm2_vm2_vm(vmask2 v, vmask x) { return svset2_s32(v, 0, x); }\nstatic INLINE vmask2 vm2sety_vm2_vm2_vm(vmask2 v, vmask y) { return svset2_s32(v, 1, y); }\n\n// Auxiliary data types\n\ntypedef svfloat64x2_t di_t;\n\nstatic INLINE vdouble digetd_vd_di(di_t d) { return svget2_f64(d, 0); }\nstatic INLINE vint digeti_vi_di(di_t d) { return svreinterpret_s32_f64(svget2_f64(d, 1)); }\nstatic INLINE di_t disetdi_di_vd_vi(vdouble d, vint i) {\n  return svcreate2_f64(d, svreinterpret_f64_s32(i));\n}\n\n//\n\ntypedef svfloat32x2_t fi_t;\n\nstatic INLINE vfloat figetd_vf_di(fi_t d) { return svget2_f32(d, 0); }\nstatic INLINE vint2 figeti_vi2_di(fi_t d) { return svreinterpret_s32_f32(svget2_f32(d, 1)); }\nstatic INLINE fi_t fisetdi_fi_vf_vi2(vfloat d, vint2 i) {\n  return svcreate2_f32(d, svreinterpret_f32_s32(i));\n}\n\n//\n\ntypedef svfloat64x3_t ddi_t;\n\nstatic INLINE vdouble2 ddigetdd_vd2_ddi(ddi_t d) {\n  return svcreate2_f64(svget3_f64(d, 0), svget3_f64(d, 1));\n}\nstatic INLINE vint ddigeti_vi_ddi(ddi_t d) { return svreinterpret_s32_f64(svget3_f64(d, 2)); }\nstatic INLINE ddi_t ddisetddi_ddi_vd2_vi(vdouble2 v, vint i) {\n  return svcreate3_f64(svget2_f64(v, 0), svget2_f64(v, 1),\n\t\t       svreinterpret_f64_s32(i));\n}\nstatic INLINE ddi_t ddisetdd_ddi_ddi_vd2(ddi_t ddi, vdouble2 v) {\n  return svcreate3_f64(svget2_f64(v, 0), svget2_f64(v, 1), svget3_f64(ddi, 2));\n}\n\n//\n\ntypedef svfloat32x3_t 
dfi_t;\n\nstatic INLINE vfloat2 dfigetdf_vf2_dfi(dfi_t d) {\n  return svcreate2_f32(svget3_f32(d, 0), svget3_f32(d, 1));\n}\nstatic INLINE vint2 dfigeti_vi2_dfi(dfi_t d) { return svreinterpret_s32_f32(svget3_f32(d, 2)); }\nstatic INLINE dfi_t dfisetdfi_dfi_vf2_vi2(vfloat2 v, vint2 i) {\n  return svcreate3_f32(svget2_f32(v, 0), svget2_f32(v, 1),\n\t\t       svreinterpret_f32_s32(i));\n}\nstatic INLINE dfi_t dfisetdf_dfi_dfi_vf2(dfi_t dfi, vfloat2 v) {\n  return svcreate3_f32(svget2_f32(v, 0), svget2_f32(v, 1), svget3_f32(dfi, 2));\n}\n\n//\n\ntypedef svfloat64x4_t dd2;\n\nstatic INLINE dd2 dd2setab_dd2_vd2_vd2(vdouble2 a, vdouble2 b) {\n  return svcreate4_f64(svget2_f64(a, 0), svget2_f64(a, 1),\n\t\t       svget2_f64(b, 0), svget2_f64(b, 1));\n}\nstatic INLINE vdouble2 dd2geta_vd2_dd2(dd2 d) {\n  return svcreate2_f64(svget4_f64(d, 0), svget4_f64(d, 1));\n}\nstatic INLINE vdouble2 dd2getb_vd2_dd2(dd2 d) {\n  return svcreate2_f64(svget4_f64(d, 2), svget4_f64(d, 3));\n}\n\n//\n\ntypedef svfloat32x4_t df2;\n\nstatic INLINE df2 df2setab_df2_vf2_vf2(vfloat2 a, vfloat2 b) {\n  return svcreate4_f32(svget2_f32(a, 0), svget2_f32(a, 1),\n\t\t       svget2_f32(b, 0), svget2_f32(b, 1));\n}\nstatic INLINE vfloat2 df2geta_vf2_df2(df2 d) {\n  return svcreate2_f32(svget4_f32(d, 0), svget4_f32(d, 1));\n}\nstatic INLINE vfloat2 df2getb_vf2_df2(df2 d) {\n  return svcreate2_f32(svget4_f32(d, 2), svget4_f32(d, 3));\n}\n\n//\n\ntypedef svfloat64x3_t vdouble3;\n\nstatic INLINE vdouble  vd3getx_vd_vd3(vdouble3 v) { return svget3_f64(v, 0); }\nstatic INLINE vdouble  vd3gety_vd_vd3(vdouble3 v) { return svget3_f64(v, 1); }\nstatic INLINE vdouble  vd3getz_vd_vd3(vdouble3 v) { return svget3_f64(v, 2); }\nstatic INLINE vdouble3 vd3setxyz_vd3_vd_vd_vd(vdouble x, vdouble y, vdouble z)  { return svcreate3_f64(x, y, z); }\nstatic INLINE vdouble3 vd3setx_vd3_vd3_vd(vdouble3 v, vdouble d) { return svset3_f64(v, 0, d); }\nstatic INLINE vdouble3 vd3sety_vd3_vd3_vd(vdouble3 v, vdouble d) { return 
svset3_f64(v, 1, d); }\nstatic INLINE vdouble3 vd3setz_vd3_vd3_vd(vdouble3 v, vdouble d) { return svset3_f64(v, 2, d); }\n\n//\n\ntypedef svfloat64x4_t tdx;\n\nstatic INLINE vmask tdxgete_vm_tdx(tdx t) {\n  return svreinterpret_s32_f64(svget4_f64(t, 0));\n}\nstatic INLINE vdouble3 tdxgetd3_vd3_tdx(tdx t) {\n  return svcreate3_f64(svget4_f64(t, 1), svget4_f64(t, 2), svget4_f64(t, 3));\n}\nstatic INLINE vdouble tdxgetd3x_vd_tdx(tdx t) { return svget4_f64(t, 1); }\nstatic INLINE vdouble tdxgetd3y_vd_tdx(tdx t) { return svget4_f64(t, 2); }\nstatic INLINE vdouble tdxgetd3z_vd_tdx(tdx t) { return svget4_f64(t, 3); }\nstatic INLINE tdx tdxsete_tdx_tdx_vm(tdx t, vmask e) {\n  return svset4_f64(t, 0, svreinterpret_f64_s32(e));\n}\nstatic INLINE tdx tdxsetd3_tdx_tdx_vd3(tdx t, vdouble3 d3) {\n  return svcreate4_f64(svget4_f64(t, 0), svget3_f64(d3, 0), svget3_f64(d3, 1), svget3_f64(d3, 2));\n}\nstatic INLINE tdx tdxsetx_tdx_tdx_vd(tdx t, vdouble x) { return svset4_f64(t, 1, x); }\nstatic INLINE tdx tdxsety_tdx_tdx_vd(tdx t, vdouble y) { return svset4_f64(t, 2, y); }\nstatic INLINE tdx tdxsetz_tdx_tdx_vd(tdx t, vdouble z) { return svset4_f64(t, 3, z); }\nstatic INLINE tdx tdxsetxyz_tdx_tdx_vd_vd_vd(tdx t, vdouble x, vdouble y, vdouble z) {\n  return svcreate4_f64(svget4_f64(t, 0), x, y, z);\n}\n\nstatic INLINE tdx tdxseted3_tdx_vm_vd3(vmask e, vdouble3 d3) {\n  return svcreate4_f64(svreinterpret_f64_s32(e), svget3_f64(d3, 0), svget3_f64(d3, 1), svget3_f64(d3, 2));\n}\nstatic INLINE tdx tdxsetexyz_tdx_vm_vd_vd_vd(vmask e, vdouble x, vdouble y, vdouble z) {\n  return svcreate4_f64(svreinterpret_f64_s32(e), x, y, z);\n}\n\n//\n\ntypedef svfloat64x4_t tdi_t;\n\nstatic INLINE vdouble3 tdigettd_vd3_tdi(tdi_t d) {\n  return svcreate3_f64(svget4_f64(d, 0), svget4_f64(d, 1), svget4_f64(d, 2));\n}\nstatic INLINE vdouble tdigetx_vd_tdi(tdi_t d) { return svget4_f64(d, 0); }\nstatic INLINE vint tdigeti_vi_tdi(tdi_t d) { return svreinterpret_s32_f64(svget4_f64(d, 3)); }\nstatic INLINE tdi_t 
tdisettdi_tdi_vd3_vi(vdouble3 v, vint i) {\n  return svcreate4_f64(svget3_f64(v, 0), svget3_f64(v, 1), svget3_f64(v, 2),\n\t\t       svreinterpret_f64_s32(i));\n}\nstatic INLINE tdi_t tdisettd_tdi_tdi_vd3(tdi_t tdi, vdouble3 v) {\n  return svcreate4_f64(svget3_f64(v, 0), svget3_f64(v, 1), svget3_f64(v, 2), svget4_f64(tdi, 3));\n}\n\n//\n\n// masking predicates\n#define ALL_TRUE_MASK svdup_n_s32(0xffffffff)\n#define ALL_FALSE_MASK svdup_n_s32(0x0)\n//@#define ALL_TRUE_MASK svdup_n_s32(0xffffffff)\n//@#define ALL_FALSE_MASK svdup_n_s32(0x0)\n\nstatic INLINE void vprefetch_v_p(const void *ptr) {}\n\n//\n//\n//\n// Test if all lanes are active\n//\n//\n//\nstatic INLINE int vtestallones_i_vo32(vopmask g) {\n  svbool_t pg = svptrue_b32();\n  return (svcntp_b32(pg, g) == svcntw());\n}\n\nstatic INLINE int vtestallones_i_vo64(vopmask g) {\n  svbool_t pg = svptrue_b64();\n  return (svcntp_b64(pg, g) == svcntd());\n}\n//\n//\n//\n//\n//\n//\n\n// Vector load / store\nstatic INLINE void vstoreu_v_p_vi2(int32_t *p, vint2 v) { svst1_s32(ptrue, p, v); }\n\nstatic INLINE vfloat vload_vf_p(const float *ptr) {\n  return svld1_f32(ptrue, ptr);\n}\nstatic INLINE vfloat vloadu_vf_p(const float *ptr) {\n  return svld1_f32(ptrue, ptr);\n}\nstatic INLINE void vstoreu_v_p_vf(float *ptr, vfloat v) {\n  svst1_f32(ptrue, ptr, v);\n}\n\n// Basic logical operations for mask\nstatic INLINE vmask vand_vm_vm_vm(vmask x, vmask y) {\n  return svand_s32_x(ptrue, x, y);\n}\nstatic INLINE vmask vandnot_vm_vm_vm(vmask x, vmask y) {\n  return svbic_s32_x(ptrue, y, x);\n}\nstatic INLINE vmask vor_vm_vm_vm(vmask x, vmask y) {\n  return svorr_s32_x(ptrue, x, y);\n}\nstatic INLINE vmask vxor_vm_vm_vm(vmask x, vmask y) {\n  return sveor_s32_x(ptrue, x, y);\n}\n\nstatic INLINE vmask vadd64_vm_vm_vm(vmask x, vmask y) {\n  return svreinterpret_s32_s64(\n           svadd_s64_x(ptrue, svreinterpret_s64_s32(x),\n                              svreinterpret_s64_s32(y)));\n}\n\n// Mask <--> single precision 
reinterpret\nstatic INLINE vmask vreinterpret_vm_vf(vfloat vf) {\n  return svreinterpret_s32_f32(vf);\n}\nstatic INLINE vfloat vreinterpret_vf_vm(vmask vm) {\n  return svreinterpret_f32_s32(vm);\n}\nstatic INLINE vfloat vreinterpret_vf_vi2(vint2 vm) {\n  return svreinterpret_f32_s32(vm);\n}\nstatic INLINE vint2 vreinterpret_vi2_vf(vfloat vf) {\n  return svreinterpret_s32_f32(vf);\n}\nstatic INLINE vint2 vcast_vi2_vm(vmask vm) { return vm; }\nstatic INLINE vmask vcast_vm_vi2(vint2 vi) { return vi; }\n\n// Conditional select\nstatic INLINE vint2 vsel_vi2_vm_vi2_vi2(vmask m, vint2 x, vint2 y) {\n  return svsel_s32(svcmpeq_s32(ptrue, m, ALL_TRUE_MASK), x, y);\n}\n\n/****************************************/\n/* Single precision FP operations */\n/****************************************/\n// Broadcast\nstatic INLINE vfloat vcast_vf_f(float f) { return svdup_n_f32(f); }\n\n// Add, Sub, Mul\nstatic INLINE vfloat vadd_vf_vf_vf(vfloat x, vfloat y) {\n  return svadd_f32_x(ptrue, x, y);\n}\nstatic INLINE vfloat vsub_vf_vf_vf(vfloat x, vfloat y) {\n  return svsub_f32_x(ptrue, x, y);\n}\nstatic INLINE vfloat vmul_vf_vf_vf(vfloat x, vfloat y) {\n  return svmul_f32_x(ptrue, x, y);\n}\n\n// |x|, -x\nstatic INLINE vfloat vabs_vf_vf(vfloat f) { return svabs_f32_x(ptrue, f); }\nstatic INLINE vfloat vneg_vf_vf(vfloat f) { return svneg_f32_x(ptrue, f); }\n\n// max, min\nstatic INLINE vfloat vmax_vf_vf_vf(vfloat x, vfloat y) {\n  return svmax_f32_x(ptrue, x, y);\n}\nstatic INLINE vfloat vmin_vf_vf_vf(vfloat x, vfloat y) {\n  return svmin_f32_x(ptrue, x, y);\n}\n\n// int <--> float conversions\nstatic INLINE vint2 vtruncate_vi2_vf(vfloat vf) {\n  return svcvt_s32_f32_x(ptrue, vf);\n}\nstatic INLINE vfloat vcast_vf_vi2(vint2 vi) {\n  return svcvt_f32_s32_x(ptrue, vi);\n}\nstatic INLINE vint2 vcast_vi2_i(int i) { return svdup_n_s32(i); }\nstatic INLINE vint2 vrint_vi2_vf(vfloat d) {\n  return svcvt_s32_f32_x(ptrue, svrintn_f32_x(ptrue, d));\n}\n\n#if CONFIG == 1\n// Multiply accumulate: z 
= z + x * y\nstatic INLINE vfloat vmla_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) {\n  return svmad_f32_x(ptrue, x, y, z);\n}\n// Multiply subtract: z = z - x * y\nstatic INLINE vfloat vmlanp_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) {\n  return svmsb_f32_x(ptrue, x, y, z);\n}\nstatic INLINE vfloat vmlapn_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) {\n  return svnmsb_f32_x(ptrue, x, y, z);\n}\n#else\nstatic INLINE vfloat vmla_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return vadd_vf_vf_vf(vmul_vf_vf_vf(x, y), z); }\nstatic INLINE vfloat vmlanp_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return vsub_vf_vf_vf(z, vmul_vf_vf_vf(x, y)); }\nstatic INLINE vfloat vmlapn_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return vsub_vf_vf_vf(vmul_vf_vf_vf(x, y), z); }\n#endif\n\n// fused multiply add / sub\nstatic INLINE vfloat vfma_vf_vf_vf_vf(vfloat x, vfloat y,\n                                      vfloat z) { // z + x * y\n  return svmad_f32_x(ptrue, x, y, z);\n}\nstatic INLINE vfloat vfmanp_vf_vf_vf_vf(vfloat x, vfloat y,\n                                        vfloat z) { // z - x * y\n  return svmsb_f32_x(ptrue, x, y, z);\n}\nstatic INLINE vfloat vfmapn_vf_vf_vf_vf(vfloat x, vfloat y,\n                                        vfloat z) { // x * y - z\n  return svnmsb_f32_x(ptrue, x, y, z);\n}\n\n// conditional select\nstatic INLINE vfloat vsel_vf_vo_vf_vf(vopmask mask, vfloat x, vfloat y) {\n  return svsel_f32(mask, x, y);\n}\n\n// Reciprocal 1/x, Division, Square root\nstatic INLINE vfloat vdiv_vf_vf_vf(vfloat n, vfloat d) {\n#ifndef ENABLE_ALTDIV\n  return svdiv_f32_x(ptrue, n, d);\n#else\n  // Finite numbers (including denormal) only, gives mostly correctly rounded result\n  vfloat t, u, x, y;\n  svuint32_t i0, i1;\n  i0 = svand_u32_x(ptrue, svreinterpret_u32_f32(n), svdup_n_u32(0x7c000000));\n  i1 = svand_u32_x(ptrue, svreinterpret_u32_f32(d), svdup_n_u32(0x7c000000));\n  i0 = svsub_u32_x(ptrue, svdup_n_u32(0x7d000000), svlsr_n_u32_x(ptrue, svadd_u32_x(ptrue, i0, i1), 
1));\n  t = svreinterpret_f32_u32(i0);\n  y = svmul_f32_x(ptrue, d, t);\n  x = svmul_f32_x(ptrue, n, t);\n  t = svrecpe_f32(y);\n  t = svmul_f32_x(ptrue, t, svrecps_f32(y, t));\n  t = svmul_f32_x(ptrue, t, svrecps_f32(y, t));\n  u = svmul_f32_x(ptrue, x, t);\n  u = svmad_f32_x(ptrue, svmsb_f32_x(ptrue, y, u, x), t, u);\n  return u;\n#endif\n}\nstatic INLINE vfloat vrec_vf_vf(vfloat d) {\n#ifndef ENABLE_ALTDIV\n  return svdivr_n_f32_x(ptrue, d, 1.0f);\n#else\n  return vsel_vf_vo_vf_vf(svcmpeq_f32(ptrue, vabs_vf_vf(d), vcast_vf_f(SLEEF_INFINITYf)),\n\t\t\t  vcast_vf_f(0), vdiv_vf_vf_vf(vcast_vf_f(1.0f), d));\n#endif\n}\nstatic INLINE vfloat vsqrt_vf_vf(vfloat d) {\n#ifndef ENABLE_ALTSQRT\n  return svsqrt_f32_x(ptrue, d);\n#else\n  // Gives correctly rounded result for all input range\n  vfloat w, x, y, z;\n\n  y = svrsqrte_f32(d);\n  x = vmul_vf_vf_vf(d, y);         w = vmul_vf_vf_vf(vcast_vf_f(0.5), y);\n  y = vfmanp_vf_vf_vf_vf(x, w, vcast_vf_f(0.5));\n  x = vfma_vf_vf_vf_vf(x, y, x);   w = vfma_vf_vf_vf_vf(w, y, w);\n\n  y = vfmanp_vf_vf_vf_vf(x, w, vcast_vf_f(1.5));  w = vadd_vf_vf_vf(w, w);\n  w = vmul_vf_vf_vf(w, y);\n  x = vmul_vf_vf_vf(w, d);\n  y = vfmapn_vf_vf_vf_vf(w, d, x); z = vfmanp_vf_vf_vf_vf(w, x, vcast_vf_f(1));\n  z = vfmanp_vf_vf_vf_vf(w, y, z); w = vmul_vf_vf_vf(vcast_vf_f(0.5), x);\n  w = vfma_vf_vf_vf_vf(w, z, y);\n  w = vadd_vf_vf_vf(w, x);\n\n  return svsel_f32(svorr_b_z(ptrue, svcmpeq_f32(ptrue, d, vcast_vf_f(0)),\n\t\t\t     svcmpeq_f32(ptrue, d, vcast_vf_f(SLEEF_INFINITYf))), d, w);\n#endif\n}\n//\n//\n//\n//\n//\n//\nstatic INLINE CONST vfloat vsel_vf_vo_f_f(vopmask o, float v1, float v0) {\n  return vsel_vf_vo_vf_vf(o, vcast_vf_f(v1), vcast_vf_f(v0));\n}\n\nstatic INLINE vfloat vsel_vf_vo_vo_f_f_f(vopmask o0, vopmask o1, float d0, float d1, float d2) {\n  return vsel_vf_vo_vf_vf(o0, vcast_vf_f(d0), vsel_vf_vo_f_f(o1, d1, d2));\n}\n\nstatic INLINE vfloat vsel_vf_vo_vo_vo_f_f_f_f(vopmask o0, vopmask o1, vopmask o2, float d0, float d1, 
float d2, float d3) {\n  return vsel_vf_vo_vf_vf(o0, vcast_vf_f(d0), vsel_vf_vo_vf_vf(o1, vcast_vf_f(d1), vsel_vf_vo_f_f(o2, d2, d3)));\n}\n//\n//\n//\n//\n//\n//\n\n// truncate\nstatic INLINE vfloat vtruncate_vf_vf(vfloat vd) {\n  return svrintz_f32_x(ptrue, vd);\n}\n\n//\n//\n//\n// Round float\n//\n//\n//\nstatic INLINE vfloat vrint_vf_vf(vfloat vf) {\n  return svrintn_f32_x(svptrue_b32(), vf);\n}\n//\n//\n//\n//\n//\n//\n\n/***************************************/\n/* Single precision integer operations */\n/***************************************/\n\n// Add, Sub, Neg (-x)\nstatic INLINE vint2 vadd_vi2_vi2_vi2(vint2 x, vint2 y) {\n  return svadd_s32_x(ptrue, x, y);\n}\nstatic INLINE vint2 vsub_vi2_vi2_vi2(vint2 x, vint2 y) {\n  return svsub_s32_x(ptrue, x, y);\n}\nstatic INLINE vint2 vneg_vi2_vi2(vint2 e) { return svneg_s32_x(ptrue, e); }\n\n// Logical operations\nstatic INLINE vint2 vand_vi2_vi2_vi2(vint2 x, vint2 y) {\n  return svand_s32_x(ptrue, x, y);\n}\nstatic INLINE vint2 vandnot_vi2_vi2_vi2(vint2 x, vint2 y) {\n  return svbic_s32_x(ptrue, y, x);\n}\nstatic INLINE vint2 vor_vi2_vi2_vi2(vint2 x, vint2 y) {\n  return svorr_s32_x(ptrue, x, y);\n}\nstatic INLINE vint2 vxor_vi2_vi2_vi2(vint2 x, vint2 y) {\n  return sveor_s32_x(ptrue, x, y);\n}\n\n// Shifts\n#define vsll_vi2_vi2_i(x, c) svlsl_n_s32_x(ptrue, x, c)\n//@#define vsll_vi2_vi2_i(x, c) svlsl_n_s32_x(ptrue, x, c)\n#define vsrl_vi2_vi2_i(x, c)                                                   \\\n  svreinterpret_s32_u32(svlsr_n_u32_x(ptrue, svreinterpret_u32_s32(x), c))\n//@#define vsrl_vi2_vi2_i(x, c) svreinterpret_s32_u32(svlsr_n_u32_x(ptrue, svreinterpret_u32_s32(x), c))\n#define vsra_vi2_vi2_i(x, c) svasr_n_s32_x(ptrue, x, c)\n//@#define vsra_vi2_vi2_i(x, c) svasr_n_s32_x(ptrue, x, c)\n\n// Comparison returning integers\nstatic INLINE vint2 vgt_vi2_vi2_vi2(vint2 x, vint2 y) {\n  return svsel_s32(svcmpgt_s32(ptrue, x, y), ALL_TRUE_MASK, ALL_FALSE_MASK);\n}\n\n// conditional select\nstatic INLINE 
vint2 vsel_vi2_vo_vi2_vi2(vopmask m, vint2 x, vint2 y) {\n  return svsel_s32(m, x, y);\n}\n\n/****************************************/\n/* opmask operations                    */\n/****************************************/\n// single precision FP\nstatic INLINE vopmask veq_vo_vf_vf(vfloat x, vfloat y) {\n  return svcmpeq_f32(ptrue, x, y);\n}\nstatic INLINE vopmask vneq_vo_vf_vf(vfloat x, vfloat y) {\n  return svcmpne_f32(ptrue, x, y);\n}\nstatic INLINE vopmask vlt_vo_vf_vf(vfloat x, vfloat y) {\n  return svcmplt_f32(ptrue, x, y);\n}\nstatic INLINE vopmask vle_vo_vf_vf(vfloat x, vfloat y) {\n  return svcmple_f32(ptrue, x, y);\n}\nstatic INLINE vopmask vgt_vo_vf_vf(vfloat x, vfloat y) {\n  return svcmpgt_f32(ptrue, x, y);\n}\nstatic INLINE vopmask vge_vo_vf_vf(vfloat x, vfloat y) {\n  return svcmpge_f32(ptrue, x, y);\n}\nstatic INLINE vopmask visinf_vo_vf(vfloat d) {\n  return svcmpeq_n_f32(ptrue, vabs_vf_vf(d), SLEEF_INFINITYf);\n}\nstatic INLINE vopmask vispinf_vo_vf(vfloat d) {\n  return svcmpeq_n_f32(ptrue, d, SLEEF_INFINITYf);\n}\nstatic INLINE vopmask visminf_vo_vf(vfloat d) {\n  return svcmpeq_n_f32(ptrue, d, -SLEEF_INFINITYf);\n}\nstatic INLINE vopmask visnan_vo_vf(vfloat d) { return vneq_vo_vf_vf(d, d); }\n\n// integers\nstatic INLINE vopmask veq_vo_vi2_vi2(vint2 x, vint2 y) {\n  return svcmpeq_s32(ptrue, x, y);\n}\nstatic INLINE vopmask vgt_vo_vi2_vi2(vint2 x, vint2 y) {\n  return svcmpgt_s32(ptrue, x, y);\n}\n\n// logical opmask\nstatic INLINE vopmask vand_vo_vo_vo(vopmask x, vopmask y) {\n  return svand_b_z(ptrue, x, y);\n}\nstatic INLINE vopmask vandnot_vo_vo_vo(vopmask x, vopmask y) {\n  return svbic_b_z(ptrue, y, x);\n}\nstatic INLINE vopmask vor_vo_vo_vo(vopmask x, vopmask y) {\n  return svorr_b_z(ptrue, x, y);\n}\nstatic INLINE vopmask vxor_vo_vo_vo(vopmask x, vopmask y) {\n  return sveor_b_z(ptrue, x, y);\n}\n\nstatic INLINE vint2 vand_vi2_vo_vi2(vopmask x, vint2 y) {\n  // This needs to be zeroing to prevent asinf and atanf denormal test\n  // 
failing.\n  return svand_s32_z(x, y, y);\n}\n\n// bitmask logical operations\nstatic INLINE vmask vand_vm_vo32_vm(vopmask x, vmask y) {\n  return svsel_s32(x, y, ALL_FALSE_MASK);\n}\nstatic INLINE vmask vandnot_vm_vo32_vm(vopmask x, vmask y) {\n  return svsel_s32(x, ALL_FALSE_MASK, y);\n}\nstatic INLINE vmask vor_vm_vo32_vm(vopmask x, vmask y) {\n  return svsel_s32(x, ALL_TRUE_MASK, y);\n}\n\n// broadcast bitmask\nstatic INLINE vmask vcast_vm_i_i(int i0, int i1) {\n  return svreinterpret_s32_u64(\n      svdup_n_u64((0xffffffff & (uint64_t)i1) | (((uint64_t)i0) << 32)));\n}\n\n/*********************************/\n/* SVE for double precision math */\n/*********************************/\n\n// Vector load/store\nstatic INLINE vdouble vload_vd_p(const double *ptr) {\n  return svld1_f64(ptrue, ptr);\n}\nstatic INLINE vdouble vloadu_vd_p(const double *ptr) {\n  return svld1_f64(ptrue, ptr);\n}\nstatic INLINE void vstoreu_v_p_vd(double *ptr, vdouble v) {\n  svst1_f64(ptrue, ptr, v);\n}\n\nstatic INLINE void vstoreu_v_p_vi(int *ptr, vint v) {\n  svst1w_s64(ptrue, ptr, svreinterpret_s64_s32(v));\n}\nstatic vint vloadu_vi_p(int32_t *p) {\n  return svreinterpret_s32_s64(svld1uw_s64(ptrue, (uint32_t *)p));\n}\n\n// Reinterpret\nstatic INLINE vdouble vreinterpret_vd_vm(vmask vm) {\n  return svreinterpret_f64_s32(vm);\n}\nstatic INLINE vmask vreinterpret_vm_vd(vdouble vd) {\n  return svreinterpret_s32_f64(vd);\n}\nstatic INLINE vdouble vreinterpret_vd_vi2(vint2 x) {\n  return svreinterpret_f64_s32(x);\n}\nstatic INLINE vint2 vreinterpret_vi2_vd(vdouble x) {\n  return svreinterpret_s32_f64(x);\n}\nstatic INLINE vint2 vcastu_vi2_vi(vint x) {\n  return svreinterpret_s32_s64(\n      svlsl_n_s64_x(ptrue, svreinterpret_s64_s32(x), 32));\n}\nstatic INLINE vint vcastu_vi_vi2(vint2 x) {\n  return svreinterpret_s32_u64(\n      svlsr_n_u64_x(ptrue, svreinterpret_u64_s32(x), 32));\n}\nstatic INLINE vdouble vcast_vd_vi(vint vi) {\n  return svcvt_f64_s32_x(ptrue, vi);\n}\n\n// Splat\nstatic 
INLINE vdouble vcast_vd_d(double d) { return svdup_n_f64(d); }\n\n// Conditional select\nstatic INLINE vdouble vsel_vd_vo_vd_vd(vopmask o, vdouble x, vdouble y) {\n  return svsel_f64(o, x, y);\n}\n\nstatic INLINE CONST vdouble vsel_vd_vo_d_d(vopmask o, double v1, double v0) {\n  return vsel_vd_vo_vd_vd(o, vcast_vd_d(v1), vcast_vd_d(v0));\n}\n\nstatic INLINE vdouble vsel_vd_vo_vo_d_d_d(vopmask o0, vopmask o1, double d0, double d1, double d2) {\n  return vsel_vd_vo_vd_vd(o0, vcast_vd_d(d0), vsel_vd_vo_d_d(o1, d1, d2));\n}\n\nstatic INLINE vdouble vsel_vd_vo_vo_vo_d_d_d_d(vopmask o0, vopmask o1, vopmask o2, double d0, double d1, double d2, double d3) {\n  return vsel_vd_vo_vd_vd(o0, vcast_vd_d(d0), vsel_vd_vo_vd_vd(o1, vcast_vd_d(d1), vsel_vd_vo_d_d(o2, d2, d3)));\n}\n\nstatic INLINE vint vsel_vi_vo_vi_vi(vopmask o, vint x, vint y) {\n  return svsel_s32(o, x, y);\n}\n// truncate\nstatic INLINE vdouble vtruncate_vd_vd(vdouble vd) {\n  return svrintz_f64_x(ptrue, vd);\n}\nstatic INLINE vint vtruncate_vi_vd(vdouble vd) {\n  return svcvt_s32_f64_x(ptrue, vd);\n}\nstatic INLINE vint vrint_vi_vd(vdouble vd) {\n  return svcvt_s32_f64_x(ptrue, svrintn_f64_x(ptrue, vd));\n}\nstatic INLINE vdouble vrint_vd_vd(vdouble vd) {\n  return svrintn_f64_x(ptrue, vd);\n}\n\n// FP math operations\nstatic INLINE vdouble vadd_vd_vd_vd(vdouble x, vdouble y) {\n  return svadd_f64_x(ptrue, x, y);\n}\nstatic INLINE vdouble vsub_vd_vd_vd(vdouble x, vdouble y) {\n  return svsub_f64_x(ptrue, x, y);\n}\nstatic INLINE vdouble vneg_vd_vd(vdouble x) { return svneg_f64_x(ptrue, x); }\nstatic INLINE vdouble vmul_vd_vd_vd(vdouble x, vdouble y) {\n  return svmul_f64_x(ptrue, x, y);\n}\nstatic INLINE vdouble vabs_vd_vd(vdouble x) { return svabs_f64_x(ptrue, x); }\nstatic INLINE vdouble vmax_vd_vd_vd(vdouble x, vdouble y) {\n  return svmax_f64_x(ptrue, x, y);\n}\nstatic INLINE vdouble vmin_vd_vd_vd(vdouble x, vdouble y) {\n  return svmin_f64_x(ptrue, x, y);\n}\n\n#if CONFIG == 1\n// Multiply accumulate / 
subtract\nstatic INLINE vdouble vmla_vd_vd_vd_vd(vdouble x, vdouble y,\n                                       vdouble z) { // z = x*y + z\n  return svmad_f64_x(ptrue, x, y, z);\n}\nstatic INLINE vdouble vmlapn_vd_vd_vd_vd(vdouble x, vdouble y,\n                                         vdouble z) { // z = x * y - z\n  return svnmsb_f64_x(ptrue, x, y, z);\n}\nstatic INLINE vdouble vmlanp_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) {\n  return svmsb_f64_x(ptrue, x, y, z);\n}\n#else\nstatic INLINE vdouble vmla_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return vadd_vd_vd_vd(vmul_vd_vd_vd(x, y), z); }\nstatic INLINE vdouble vmlapn_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return vsub_vd_vd_vd(vmul_vd_vd_vd(x, y), z); }\n#endif\n\nstatic INLINE vdouble vfma_vd_vd_vd_vd(vdouble x, vdouble y,\n                                       vdouble z) { // z + x * y\n  return svmad_f64_x(ptrue, x, y, z);\n}\nstatic INLINE vdouble vfmanp_vd_vd_vd_vd(vdouble x, vdouble y,\n                                         vdouble z) { // z - x * y\n  return svmsb_f64_x(ptrue, x, y, z);\n}\nstatic INLINE vdouble vfmapn_vd_vd_vd_vd(vdouble x, vdouble y,\n                                         vdouble z) { // x * y - z\n  return svnmsb_f64_x(ptrue, x, y, z);\n}\n\n// Reciprocal 1/x, Division, Square root\nstatic INLINE vdouble vdiv_vd_vd_vd(vdouble n, vdouble d) {\n#ifndef ENABLE_ALTDIV\n  return svdiv_f64_x(ptrue, n, d);\n#else\n  // Finite numbers (including denormal) only, gives mostly correctly rounded result\n  vdouble t, u, x, y;\n  svuint64_t i0, i1;\n  i0 = svand_u64_x(ptrue, svreinterpret_u64_f64(n), svdup_n_u64(0x7fc0000000000000L));\n  i1 = svand_u64_x(ptrue, svreinterpret_u64_f64(d), svdup_n_u64(0x7fc0000000000000L));\n  i0 = svsub_u64_x(ptrue, svdup_n_u64(0x7fd0000000000000L), svlsr_n_u64_x(ptrue, svadd_u64_x(ptrue, i0, i1), 1));\n  t = svreinterpret_f64_u64(i0);\n  y = svmul_f64_x(ptrue, d, t);\n  x = svmul_f64_x(ptrue, n, t);\n  t = svrecpe_f64(y);\n  t = 
svmul_f64_x(ptrue, t, svrecps_f64(y, t));\n  t = svmul_f64_x(ptrue, t, svrecps_f64(y, t));\n  t = svmul_f64_x(ptrue, t, svrecps_f64(y, t));\n  u = svmul_f64_x(ptrue, x, t);\n  u = svmad_f64_x(ptrue, svmsb_f64_x(ptrue, y, u, x), t, u);\n  return u;\n#endif\n}\nstatic INLINE vdouble vrec_vd_vd(vdouble d) {\n#ifndef ENABLE_ALTDIV\n  return svdivr_n_f64_x(ptrue, d, 1.0);\n#else\n  return vsel_vd_vo_vd_vd(svcmpeq_f64(ptrue, vabs_vd_vd(d), vcast_vd_d(SLEEF_INFINITY)),\n\t\t\t  vcast_vd_d(0), vdiv_vd_vd_vd(vcast_vd_d(1.0f), d));\n#endif\n}\nstatic INLINE vdouble vsqrt_vd_vd(vdouble d) {\n#ifndef ENABLE_ALTSQRT\n  return svsqrt_f64_x(ptrue, d);\n#else\n  // Gives correctly rounded result for all input range\n  vdouble w, x, y, z;\n\n  y = svrsqrte_f64(d);\n  x = vmul_vd_vd_vd(d, y);         w = vmul_vd_vd_vd(vcast_vd_d(0.5), y);\n  y = vfmanp_vd_vd_vd_vd(x, w, vcast_vd_d(0.5));\n  x = vfma_vd_vd_vd_vd(x, y, x);   w = vfma_vd_vd_vd_vd(w, y, w);\n  y = vfmanp_vd_vd_vd_vd(x, w, vcast_vd_d(0.5));\n  x = vfma_vd_vd_vd_vd(x, y, x);   w = vfma_vd_vd_vd_vd(w, y, w);\n\n  y = vfmanp_vd_vd_vd_vd(x, w, vcast_vd_d(1.5));  w = vadd_vd_vd_vd(w, w);\n  w = vmul_vd_vd_vd(w, y);\n  x = vmul_vd_vd_vd(w, d);\n  y = vfmapn_vd_vd_vd_vd(w, d, x); z = vfmanp_vd_vd_vd_vd(w, x, vcast_vd_d(1));\n  z = vfmanp_vd_vd_vd_vd(w, y, z); w = vmul_vd_vd_vd(vcast_vd_d(0.5), x);\n  w = vfma_vd_vd_vd_vd(w, z, y);\n  w = vadd_vd_vd_vd(w, x);\n\n  return svsel_f64(svorr_b_z(ptrue, svcmpeq_f64(ptrue, d, vcast_vd_d(0)),\n\t\t\t     svcmpeq_f64(ptrue, d, vcast_vd_d(SLEEF_INFINITY))), d, w);\n#endif\n}\n\n// Float comparison\nstatic INLINE vopmask vlt_vo_vd_vd(vdouble x, vdouble y) {\n  return svcmplt_f64(ptrue, x, y);\n}\nstatic INLINE vopmask veq_vo_vd_vd(vdouble x, vdouble y) {\n  return svcmpeq_f64(ptrue, x, y);\n}\nstatic INLINE vopmask vgt_vo_vd_vd(vdouble x, vdouble y) {\n  return svcmpgt_f64(ptrue, x, y);\n}\nstatic INLINE vopmask vge_vo_vd_vd(vdouble x, vdouble y) {\n  return svcmpge_f64(ptrue, x, 
y);\n}\nstatic INLINE vopmask vneq_vo_vd_vd(vdouble x, vdouble y) {\n  return svcmpne_f64(ptrue, x, y);\n}\nstatic INLINE vopmask vle_vo_vd_vd(vdouble x, vdouble y) {\n  return svcmple_f64(ptrue, x, y);\n}\n\n// predicates\nstatic INLINE vopmask visnan_vo_vd(vdouble vd) {\n  return svcmpne_f64(ptrue, vd, vd);\n}\nstatic INLINE vopmask visinf_vo_vd(vdouble vd) {\n  return svcmpeq_n_f64(ptrue, svabs_f64_x(ptrue, vd), SLEEF_INFINITY);\n}\nstatic INLINE vopmask vispinf_vo_vd(vdouble vd) {\n  return svcmpeq_n_f64(ptrue, vd, SLEEF_INFINITY);\n}\nstatic INLINE vopmask visminf_vo_vd(vdouble vd) {\n  return svcmpeq_n_f64(ptrue, vd, -SLEEF_INFINITY);\n}\n\n// Comparing bit masks\nstatic INLINE vopmask veq64_vo_vm_vm(vmask x, vmask y) {\n  return svcmpeq_s64(ptrue, svreinterpret_s64_s32(x), svreinterpret_s64_s32(y));\n}\n\n// pure predicate operations\nstatic INLINE vopmask vcast_vo32_vo64(vopmask o) { return o; }\nstatic INLINE vopmask vcast_vo64_vo32(vopmask o) { return o; }\n\n// logical integer operations\nstatic INLINE vint vand_vi_vo_vi(vopmask x, vint y) {\n  // This needs to be a zeroing instruction because we need to make\n  // sure that the inactive elements for the unpacked integers vector\n  // are zero.\n  return svand_s32_z(x, y, y);\n}\n\nstatic INLINE vint vandnot_vi_vo_vi(vopmask x, vint y) {\n  return svsel_s32(x, ALL_FALSE_MASK, y);\n}\n#define vsra_vi_vi_i(x, c) svasr_n_s32_x(ptrue, x, c)\n//@#define vsra_vi_vi_i(x, c) svasr_n_s32_x(ptrue, x, c)\n#define vsll_vi_vi_i(x, c) svlsl_n_s32_x(ptrue, x, c)\n//@#define vsll_vi_vi_i(x, c) svlsl_n_s32_x(ptrue, x, c)\n\nstatic INLINE vint vsrl_vi_vi_i(vint x, int c) {\n  return svreinterpret_s32_u32(svlsr_n_u32_x(ptrue, svreinterpret_u32_s32(x), c));\n}\n\nstatic INLINE vint vand_vi_vi_vi(vint x, vint y) {\n  return svand_s32_x(ptrue, x, y);\n}\nstatic INLINE vint vandnot_vi_vi_vi(vint x, vint y) {\n  return svbic_s32_x(ptrue, y, x);\n}\nstatic INLINE vint vxor_vi_vi_vi(vint x, vint y) {\n  return sveor_s32_x(ptrue, 
x, y);\n}\n\n// integer math\nstatic INLINE vint vadd_vi_vi_vi(vint x, vint y) {\n  return svadd_s32_x(ptrue, x, y);\n}\nstatic INLINE vint vsub_vi_vi_vi(vint x, vint y) {\n  return svsub_s32_x(ptrue, x, y);\n}\nstatic INLINE vint vneg_vi_vi(vint x) { return svneg_s32_x(ptrue, x); }\n\n// integer comparison\nstatic INLINE vopmask vgt_vo_vi_vi(vint x, vint y) {\n  return svcmpgt_s32(ptrue, x, y);\n}\nstatic INLINE vopmask veq_vo_vi_vi(vint x, vint y) {\n  return svcmpeq_s32(ptrue, x, y);\n}\n\n// Splat\nstatic INLINE vint vcast_vi_i(int i) { return svdup_n_s32(i); }\n\n// bitmask logical operations\nstatic INLINE vmask vand_vm_vo64_vm(vopmask x, vmask y) {\n  // This needs to be a zeroing instruction because we need to make\n  // sure that the inactive elements for the unpacked integers vector\n  // are zero.\n  return svreinterpret_s32_s64(\n      svand_s64_z(x, svreinterpret_s64_s32(y), svreinterpret_s64_s32(y)));\n}\nstatic INLINE vmask vandnot_vm_vo64_vm(vopmask x, vmask y) {\n  return svreinterpret_s32_s64(svsel_s64(\n      x, svreinterpret_s64_s32(ALL_FALSE_MASK), svreinterpret_s64_s32(y)));\n}\nstatic INLINE vmask vor_vm_vo64_vm(vopmask x, vmask y) {\n  return svreinterpret_s32_s64(svsel_s64(\n      x, svreinterpret_s64_s32(ALL_TRUE_MASK), svreinterpret_s64_s32(y)));\n}\n\nstatic INLINE vfloat vrev21_vf_vf(vfloat vf) {\n  return svreinterpret_f32_u64(svrevw_u64_x(ptrue, svreinterpret_u64_f32(vf)));\n}\n\nstatic INLINE vint2 vrev21_vi2_vi2(vint2 i) { return vreinterpret_vi2_vf(vrev21_vf_vf(vreinterpret_vf_vi2(i))); }\n\n// Comparison returning integer\nstatic INLINE vint2 veq_vi2_vi2_vi2(vint2 x, vint2 y) {\n  return svsel_s32(svcmpeq_s32(ptrue, x, y), ALL_TRUE_MASK, ALL_FALSE_MASK);\n}\n\n// Gather\n\nstatic INLINE vdouble vgather_vd_p_vi(const double *ptr, vint vi) {\n  return svld1_gather_s64index_f64(ptrue, ptr, svreinterpret_s64_s32(vi));\n}\n\nstatic INLINE vfloat vgather_vf_p_vi2(const float *ptr, vint2 vi2) {\n  return svld1_gather_s32index_f32(ptrue, 
ptr, vi2);\n}\n\n// Operations for DFT\n\nstatic INLINE vdouble vposneg_vd_vd(vdouble d) {\n  return svneg_f64_m(d, svdupq_n_b64(0, 1), d);\n}\n\nstatic INLINE vdouble vnegpos_vd_vd(vdouble d) {\n  return svneg_f64_m(d, svdupq_n_b64(1, 0), d);\n}\n\nstatic INLINE vfloat vposneg_vf_vf(vfloat d) {\n  return svneg_f32_m(d, svdupq_n_b32(0, 1, 0, 1), d);\n}\n\nstatic INLINE vfloat vnegpos_vf_vf(vfloat d) {\n  return svneg_f32_m(d, svdupq_n_b32(1, 0, 1, 0), d);\n}\n\nstatic INLINE vdouble vsubadd_vd_vd_vd(vdouble x, vdouble y) { return vadd_vd_vd_vd(x, vnegpos_vd_vd(y)); }\nstatic INLINE vfloat vsubadd_vf_vf_vf(vfloat d0, vfloat d1) { return vadd_vf_vf_vf(d0, vnegpos_vf_vf(d1)); }\nstatic INLINE vdouble vmlsubadd_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return vfma_vd_vd_vd_vd(x, y, vnegpos_vd_vd(z)); }\nstatic INLINE vfloat vmlsubadd_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return vfma_vf_vf_vf_vf(x, y, vnegpos_vf_vf(z)); }\n\n//\n\nstatic INLINE vdouble vrev21_vd_vd(vdouble x) { return svzip1_f64(svuzp2_f64(x, x), svuzp1_f64(x, x)); }\n\nstatic INLINE vdouble vreva2_vd_vd(vdouble vd) {\n  svint64_t x = svindex_s64((VECTLENDP-1), -1);\n  x = svzip1_s64(svuzp2_s64(x, x), svuzp1_s64(x, x));\n  return svtbl_f64(vd, svreinterpret_u64_s64(x));\n}\n\nstatic INLINE vfloat vreva2_vf_vf(vfloat vf) {\n  svint32_t x = svindex_s32((VECTLENSP-1), -1);\n  x = svzip1_s32(svuzp2_s32(x, x), svuzp1_s32(x, x));\n  return svtbl_f32(vf, svreinterpret_u32_s32(x));\n}\n\n//\n\nstatic INLINE void vscatter2_v_p_i_i_vd(double *ptr, int offset, int step, vdouble v) {\n  svst1_scatter_u64index_f64(ptrue, ptr + offset*2, svzip1_u64(svindex_u64(0, step*2), svindex_u64(1, step*2)), v);\n}\n\nstatic INLINE void vscatter2_v_p_i_i_vf(float *ptr, int offset, int step, vfloat v) {\n  svst1_scatter_u32index_f32(ptrue, ptr + offset*2, svzip1_u32(svindex_u32(0, step*2), svindex_u32(1, step*2)), v);\n}\n\nstatic INLINE void vstore_v_p_vd(double *ptr, vdouble v) { vstoreu_v_p_vd(ptr, v); }\nstatic 
INLINE void vstream_v_p_vd(double *ptr, vdouble v) { vstore_v_p_vd(ptr, v); }\nstatic INLINE void vstore_v_p_vf(float *ptr, vfloat v) { vstoreu_v_p_vf(ptr, v); }\nstatic INLINE void vstream_v_p_vf(float *ptr, vfloat v) { vstore_v_p_vf(ptr, v); }\nstatic INLINE void vsscatter2_v_p_i_i_vd(double *ptr, int offset, int step, vdouble v) { vscatter2_v_p_i_i_vd(ptr, offset, step, v); }\nstatic INLINE void vsscatter2_v_p_i_i_vf(float *ptr, int offset, int step, vfloat v) { vscatter2_v_p_i_i_vf(ptr, offset, step, v); }\n\n// These functions are for debugging\nstatic double vcast_d_vd(vdouble v) {\n  double a[svcntd()];\n  vstoreu_v_p_vd(a, v);\n  return a[0];\n}\n\nstatic float vcast_f_vf(vfloat v) {\n  float a[svcntw()];\n  vstoreu_v_p_vf(a, v);\n  return a[0];\n}\n\nstatic int vcast_i_vi(vint v) {\n  int a[svcntw()];\n  vstoreu_v_p_vi(a, v);\n  return a[0];\n}\n\nstatic int vcast_i_vi2(vint2 v) {\n  int a[svcntw()];\n  vstoreu_v_p_vi2(a, v);\n  return a[0];\n}\n\n//\n\nstatic INLINE vmask2 vinterleave_vm2_vm2(vmask2 v) {\n  return vm2setxy_vm2_vm_vm(svreinterpret_s32_u64(svtrn1_u64(svreinterpret_u64_s32(vm2getx_vm_vm2(v)), svreinterpret_u64_s32(vm2gety_vm_vm2(v)))),\n\t\t\t    svreinterpret_s32_u64(svtrn2_u64(svreinterpret_u64_s32(vm2getx_vm_vm2(v)), svreinterpret_u64_s32(vm2gety_vm_vm2(v)))));\n}\n\nstatic INLINE vmask2 vuninterleave_vm2_vm2(vmask2 v) {\n  return vm2setxy_vm2_vm_vm(svreinterpret_s32_u64(svtrn1_u64(svreinterpret_u64_s32(vm2getx_vm_vm2(v)), svreinterpret_u64_s32(vm2gety_vm_vm2(v)))),\n\t\t\t    svreinterpret_s32_u64(svtrn2_u64(svreinterpret_u64_s32(vm2getx_vm_vm2(v)), svreinterpret_u64_s32(vm2gety_vm_vm2(v)))));\n}\n\nstatic INLINE vint vuninterleave_vi_vi(vint v) {\n  return svreinterpret_s32_u64(svuzp1_u64(svtrn1_u64(svreinterpret_u64_s32(v), svreinterpret_u64_s32(v)),\n\t\t\t\t\t  svtrn2_u64(svreinterpret_u64_s32(v), svreinterpret_u64_s32(v))));\n}\n\nstatic INLINE vdouble vinterleave_vd_vd(vdouble vd) {\n  return svtrn1_f64(svzip1_f64(vd, vd), 
svzip2_f64(vd, vd));\n}\n\nstatic INLINE vdouble vuninterleave_vd_vd(vdouble vd) {\n  return svuzp1_f64(svtrn1_f64(vd, vd), svtrn2_f64(vd, vd));\n}\n\nstatic INLINE vmask vinterleave_vm_vm(vmask vm) {\n  return svreinterpret_s32_u64(svtrn1_u64(svzip1_u64(svreinterpret_u64_s32(vm), svreinterpret_u64_s32(vm)),\n\t\t\t\t\t  svzip2_u64(svreinterpret_u64_s32(vm), svreinterpret_u64_s32(vm))));\n}\nstatic INLINE vmask vuninterleave_vm_vm(vmask vm) {\n  return svreinterpret_s32_u64(svuzp1_u64(svtrn1_u64(svreinterpret_u64_s32(vm), svreinterpret_u64_s32(vm)),\n\t\t\t\t\t  svtrn2_u64(svreinterpret_u64_s32(vm), svreinterpret_u64_s32(vm))));\n}\n\nstatic vmask2 vloadu_vm2_p(void *p) {\n  vmask2 vm2;\n  memcpy(&vm2, p, VECTLENDP * 16);\n  return vm2;\n}\n\n#if !defined(SLEEF_GENHEADER)\ntypedef Sleef_quadx vargquad;\n\nstatic INLINE vmask2 vcast_vm2_aq(vargquad aq) {\n  return vinterleave_vm2_vm2(vloadu_vm2_p(&aq));\n}\n\nstatic INLINE vargquad vcast_aq_vm2(vmask2 vm2) {\n  vm2 = vuninterleave_vm2_vm2(vm2);\n  vargquad aq;\n  memcpy(&aq, &vm2, VECTLENDP * 16);\n  return aq;\n}\n#endif // #if !defined(SLEEF_GENHEADER)\n\nstatic INLINE int vtestallzeros_i_vo64(vopmask g) {\n  return svcntp_b64(svptrue_b64(), g) == 0;\n}\n\nstatic INLINE vmask vsel_vm_vo64_vm_vm(vopmask o, vmask x, vmask y) {\n  return svreinterpret_s32_s64(svsel_s64(o, svreinterpret_s64_s32(x), svreinterpret_s64_s32(y)));\n}\n\nstatic INLINE vmask vsub64_vm_vm_vm(vmask x, vmask y) {\n  return svreinterpret_s32_s64(\n           svsub_s64_x(ptrue, svreinterpret_s64_s32(x),\n                              svreinterpret_s64_s32(y)));\n}\n\nstatic INLINE vmask vneg64_vm_vm(vmask x) {\n  return svreinterpret_s32_s64(svneg_s64_x(ptrue, svreinterpret_s64_s32(x)));\n}\n\nstatic INLINE vopmask vgt64_vo_vm_vm(vmask x, vmask y) {\n  return svcmpgt_s64(ptrue, svreinterpret_s64_s32(x), svreinterpret_s64_s32(y));\n}\n\n#define vsll64_vm_vm_i(x, c) svreinterpret_s32_u64(svlsl_n_u64_x(ptrue, svreinterpret_u64_s32(x), 
c))\n//@#define vsll64_vm_vm_i(x, c) svreinterpret_s32_u64(svlsl_n_u64_x(ptrue, svreinterpret_u64_s32(x), c))\n#define vsrl64_vm_vm_i(x, c) svreinterpret_s32_u64(svlsr_n_u64_x(ptrue, svreinterpret_u64_s32(x), c))\n//@#define vsrl64_vm_vm_i(x, c) svreinterpret_s32_u64(svlsr_n_u64_x(ptrue, svreinterpret_u64_s32(x), c))\n\nstatic INLINE vmask vcast_vm_vi(vint vi) { return svreinterpret_s32_s64(svextw_s64_z(ptrue, svreinterpret_s64_s32(vi))); }\nstatic INLINE vint vcast_vi_vm(vmask vm) { return vand_vm_vm_vm(vm, vcast_vm_i_i(0, 0xffffffff)); }\n"
  },
  {
    "path": "src/memory.cpp",
    "content": "/*\n\nCopyright (c) 2019 Agenium Scale\n\nPermission is hereby granted, free of charge, to any person obtaining a copy\nof this software and associated documentation files (the \"Software\"), to deal\nin the Software without restriction, including without limitation the rights\nto use, copy, modify, merge, publish, distribute, sublicense, and/or sell\ncopies of the Software, and to permit persons to whom the Software is\nfurnished to do so, subject to the following conditions:\n\nThe above copyright notice and this permission notice shall be included in all\ncopies or substantial portions of the Software.\n\nTHE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\nIMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\nFITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\nAUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\nLIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\nOUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\nSOFTWARE.\n\n*/\n\n#ifdef NSIMD_IS_MSVC\n  #include <malloc.h>\n#else\n  #ifndef _POSIX_C_SOURCE\n    #define _POSIX_C_SOURCE 200112L\n  #endif\n  #include <stdlib.h>\n#endif\n\n// ----------------------------------------------------------------------------\n\n#define NSIMD_INSIDE\n#include <nsimd/nsimd.h>\n\n// ----------------------------------------------------------------------------\n\nextern \"C\" {\n\nNSIMD_DLLEXPORT void *nsimd_aligned_alloc(nsimd_nat n) {\n#ifdef NSIMD_IS_MSVC\n  return _aligned_malloc(n, NSIMD_MAX_ALIGNMENT);\n#else\n  void *ptr;\n  if (posix_memalign(&ptr, NSIMD_MAX_ALIGNMENT, (size_t)n)) {\n    return NULL;\n  } else {\n    return ptr;\n  }\n#endif\n}\n\n// ----------------------------------------------------------------------------\n\nNSIMD_DLLEXPORT void nsimd_aligned_free(void *ptr) {\n#ifdef NSIMD_IS_MSVC\n  _aligned_free(ptr);\n#else\n  
free(ptr);\n#endif\n}\n\n} // extern \"C\"\n\n"
  },
  {
    "path": "src/misc.h",
    "content": "//   Copyright Naoki Shibata and contributors 2010 - 2020.\n// Distributed under the Boost Software License, Version 1.0.\n//    (See accompanying file LICENSE.txt or copy at\n//          http://www.boost.org/LICENSE_1_0.txt)\n\n//\n\n#ifndef __MISC_H__\n#define __MISC_H__\n\n#if !defined(SLEEF_GENHEADER)\n#include <stdint.h>\n#include <string.h>\n#endif\n\n#ifndef M_PI\n#define M_PI 3.141592653589793238462643383279502884\n#endif\n\n#ifndef M_PIl\n#define M_PIl 3.141592653589793238462643383279502884L\n#endif\n\n#ifndef M_1_PI\n#define M_1_PI 0.318309886183790671537767526745028724\n#endif\n\n#ifndef M_1_PIl\n#define M_1_PIl 0.318309886183790671537767526745028724L\n#endif\n\n#ifndef M_2_PI\n#define M_2_PI 0.636619772367581343075535053490057448\n#endif\n\n#ifndef M_2_PIl\n#define M_2_PIl 0.636619772367581343075535053490057448L\n#endif\n\n#ifndef SLEEF_FP_ILOGB0\n#define SLEEF_FP_ILOGB0 ((int)-2147483648)\n#endif\n\n#ifndef SLEEF_FP_ILOGBNAN\n#define SLEEF_FP_ILOGBNAN ((int)2147483647)\n#endif\n\n#define SLEEF_SNAN (((union { long long int i; double d; }) { .i = INT64_C(0x7ff0000000000001) }).d)\n#define SLEEF_SNANf (((union { long int i; float f; }) { .i = 0xff800001 }).f)\n\n\n//\n\n/*\n  PI_A to PI_D are constants that satisfy the following two conditions.\n\n  * For PI_A, PI_B and PI_C, the last 28 bits are zero.\n  * PI_A + PI_B + PI_C + PI_D is close to PI as much as possible.\n\n  The argument of a trig function is multiplied by 1/PI, and the\n  integral part is divided into two parts, each has at most 28\n  bits. So, the maximum argument that could be correctly reduced\n  should be 2^(28*2-1) PI = 1.1e+17. 
However, due to internal\n  double precision calculation, the actual maximum argument that can\n  be correctly reduced is around 2^47.\n */\n\n#define PI_A 3.1415926218032836914\n#define PI_B 3.1786509424591713469e-08\n#define PI_C 1.2246467864107188502e-16\n#define PI_D 1.2736634327021899816e-24\n#define TRIGRANGEMAX 1e+14\n\n/*\n  PI_A2 and PI_B2 are constants that satisfy the following two conditions.\n\n  * The last 3 bits of PI_A2 are zero.\n  * PI_A2 + PI_B2 is close to PI as much as possible.\n\n  The argument of a trig function is multiplied by 1/PI, and the\n  integral part is multiplied by PI_A2. So, the maximum argument that\n  could be correctly reduced should be 2^(3-1) PI = 12.6. By testing,\n  we confirmed that it correctly reduces the argument up to around 15.\n */\n\n#define PI_A2 3.141592653589793116\n#define PI_B2 1.2246467991473532072e-16\n#define TRIGRANGEMAX2 15\n\n#define M_2_PI_H 0.63661977236758138243\n#define M_2_PI_L -3.9357353350364971764e-17\n\n#define SQRT_DBL_MAX 1.3407807929942596355e+154\n\n#define TRIGRANGEMAX3 1e+9\n\n#define M_4_PI 1.273239544735162542821171882678754627704620361328125\n\n#define L2U .69314718055966295651160180568695068359375\n#define L2L .28235290563031577122588448175013436025525412068e-12\n#define R_LN2 1.442695040888963407359924681001892137426645954152985934135449406931\n\n#define L10U 0.30102999566383914498 // log 2 / log 10\n#define L10L 1.4205023227266099418e-13\n#define LOG10_2 3.3219280948873623478703194294893901758648313930\n\n#define L10Uf 0.3010253906f\n#define L10Lf 4.605038981e-06f\n\n//\n\n#define PI_Af 3.140625f\n#define PI_Bf 0.0009670257568359375f\n#define PI_Cf 6.2771141529083251953e-07f\n#define PI_Df 1.2154201256553420762e-10f\n#define TRIGRANGEMAXf 39000\n\n#define PI_A2f 3.1414794921875f\n#define PI_B2f 0.00011315941810607910156f\n#define PI_C2f 1.9841872589410058936e-09f\n#define TRIGRANGEMAX2f 125.0f\n\n#define TRIGRANGEMAX4f 8e+6f\n\n#define SQRT_FLT_MAX 18446743523953729536.0\n\n#define 
L2Uf 0.693145751953125f\n#define L2Lf 1.428606765330187045e-06f\n\n#define R_LN2f 1.442695040888963407359924681001892137426645954152985934135449406931f\n#define M_PIf ((float)M_PI)\n\n//\n\n#ifndef MIN\n#define MIN(x, y) ((x) < (y) ? (x) : (y))\n#endif\n\n#ifndef MAX\n#define MAX(x, y) ((x) > (y) ? (x) : (y))\n#endif\n\n#ifndef ABS\n#define ABS(x) ((x) < 0 ? -(x) : (x))\n#endif\n\n#define stringify(s) stringify_(s)\n#define stringify_(s) #s\n\n#if !defined(SLEEF_GENHEADER)\ntypedef long double longdouble;\n#endif\n\n#if !defined(Sleef_double2_DEFINED) && !defined(SLEEF_GENHEADER)\n#define Sleef_double2_DEFINED\ntypedef struct {\n  double x, y;\n} Sleef_double2;\n#endif\n\n#if !defined(Sleef_float2_DEFINED) && !defined(SLEEF_GENHEADER)\n#define Sleef_float2_DEFINED\ntypedef struct {\n  float x, y;\n} Sleef_float2;\n#endif\n\n#if !defined(Sleef_longdouble2_DEFINED) && !defined(SLEEF_GENHEADER)\n#define Sleef_longdouble2_DEFINED\ntypedef struct {\n  long double x, y;\n} Sleef_longdouble2;\n#endif\n\n#if !defined(Sleef_quad_DEFINED) && !defined(SLEEF_GENHEADER)\n#define Sleef_quad_DEFINED\n#if defined(ENABLEFLOAT128)\ntypedef __float128 Sleef_quad;\n#else\ntypedef struct { double x, y; } Sleef_quad;\n#endif\n#endif\n\n#if !defined(Sleef_quad1_DEFINED) && !defined(SLEEF_GENHEADER)\n#define Sleef_quad1_DEFINED\ntypedef union {\n  struct {\n    Sleef_quad x;\n  };\n  Sleef_quad s[1];\n} Sleef_quad1;\n#endif\n\n#if !defined(Sleef_quad2_DEFINED) && !defined(SLEEF_GENHEADER)\n#define Sleef_quad2_DEFINED\ntypedef union {\n  struct {\n    Sleef_quad x, y;\n  };\n  Sleef_quad s[2];\n} Sleef_quad2;\n#endif\n\n#if !defined(Sleef_quad4_DEFINED) && !defined(SLEEF_GENHEADER)\n#define Sleef_quad4_DEFINED\ntypedef union {\n  struct {\n    Sleef_quad x, y, z, w;\n  };\n  Sleef_quad s[4];\n} Sleef_quad4;\n#endif\n\n#if !defined(Sleef_quad8_DEFINED) && !defined(SLEEF_GENHEADER)\n#define Sleef_quad8_DEFINED\ntypedef union {\n  Sleef_quad s[8];\n} Sleef_quad8;\n#endif\n\n#if 
defined(__ARM_FEATURE_SVE) && !defined(Sleef_quadx_DEFINED) && !defined(SLEEF_GENHEADER)\n#define Sleef_quadx_DEFINED\ntypedef union {\n  Sleef_quad s[32];\n} Sleef_quadx;\n#endif\n\n//\n\n#if (defined (__GNUC__) || defined (__clang__) || defined(__INTEL_COMPILER)) && !defined(_MSC_VER)\n\n#define LIKELY(condition) __builtin_expect(!!(condition), 1)\n#define UNLIKELY(condition) __builtin_expect(!!(condition), 0)\n#define RESTRICT __restrict__\n\n#ifndef __arm__\n#define ALIGNED(x) __attribute__((aligned(x)))\n#else\n#define ALIGNED(x)\n#endif\n\n#if defined(SLEEF_GENHEADER)\n\n#define INLINE SLEEF_ALWAYS_INLINE\n#define EXPORT SLEEF_INLINE\n#define CONST SLEEF_CONST\n#define NOEXPORT\n\n#else // #if defined(SLEEF_GENHEADER)\n\n#ifndef __INTEL_COMPILER\n#define CONST const\n#else\n#define CONST\n#endif\n#define INLINE __attribute__((always_inline))\n\n#if defined(__MINGW32__) || defined(__MINGW64__) || defined(__CYGWIN__)\n#ifndef SLEEF_STATIC_LIBS\n#define EXPORT __stdcall __declspec(dllexport)\n#define NOEXPORT\n#else // #ifndef SLEEF_STATIC_LIBS\n#define EXPORT\n#define NOEXPORT\n#endif // #ifndef SLEEF_STATIC_LIBS\n#else // #if defined(__MINGW32__) || defined(__MINGW64__) || defined(__CYGWIN__)\n#define EXPORT __attribute__((visibility(\"default\")))\n#define NOEXPORT __attribute__ ((visibility (\"hidden\")))\n#endif // #if defined(__MINGW32__) || defined(__MINGW64__) || defined(__CYGWIN__)\n\n#endif // #if defined(SLEEF_GENHEADER)\n\n#define SLEEF_NAN __builtin_nan(\"\")\n#define SLEEF_NANf __builtin_nanf(\"\")\n#define SLEEF_NANl __builtin_nanl(\"\")\n#define SLEEF_INFINITY __builtin_inf()\n#define SLEEF_INFINITYf __builtin_inff()\n#define SLEEF_INFINITYl __builtin_infl()\n\n#if defined(__INTEL_COMPILER) || defined (__clang__)\n#define SLEEF_INFINITYq __builtin_inf()\n#define SLEEF_NANq __builtin_nan(\"\")\n#else\n#define SLEEF_INFINITYq __builtin_infq()\n#define SLEEF_NANq (SLEEF_INFINITYq - SLEEF_INFINITYq)\n#endif\n\n#elif defined(_MSC_VER) // #if (defined 
(__GNUC__) || defined (__clang__) || defined(__INTEL_COMPILER)) && !defined(_MSC_VER)\n\n#define INLINE __forceinline\n#define CONST\n#define RESTRICT\n#define ALIGNED(x)\n#define LIKELY(condition) (condition)\n#define UNLIKELY(condition) (condition)\n\n#ifndef SLEEF_STATIC_LIBS\n#define EXPORT __declspec(dllexport)\n#define NOEXPORT\n#else\n#define EXPORT\n#define NOEXPORT\n#endif\n\n#if (defined(__GNUC__) || defined(__CLANG__)) && (defined(__i386__) || defined(__x86_64__)) && !defined(SLEEF_GENHEADER)\n#include <x86intrin.h>\n#endif\n\n#define SLEEF_INFINITY (1e+300 * 1e+300)\n#define SLEEF_NAN (SLEEF_INFINITY - SLEEF_INFINITY)\n#define SLEEF_INFINITYf ((float)SLEEF_INFINITY)\n#define SLEEF_NANf ((float)SLEEF_NAN)\n#define SLEEF_INFINITYl ((long double)SLEEF_INFINITY)\n#define SLEEF_NANl ((long double)SLEEF_NAN)\n\n#if (defined(_M_AMD64) || defined(_M_X64))\n#ifndef __SSE2__\n#define __SSE2__\n#define __SSE3__\n#define __SSE4_1__\n#endif\n#elif _M_IX86_FP == 2\n#ifndef __SSE2__\n#define __SSE2__\n#define __SSE3__\n#define __SSE4_1__\n#endif\n#elif _M_IX86_FP == 1\n#ifndef __SSE__\n#define __SSE__\n#endif\n#endif\n\n#endif // #elif defined(_MSC_VER) // #if (defined (__GNUC__) || defined (__clang__) || defined(__INTEL_COMPILER)) && !defined(_MSC_VER)\n\n#if !defined(__linux__)\n#define isinff(x) ((x) == SLEEF_INFINITYf || (x) == -SLEEF_INFINITYf)\n#define isinfl(x) ((x) == SLEEF_INFINITYl || (x) == -SLEEF_INFINITYl)\n#define isnanf(x) ((x) != (x))\n#define isnanl(x) ((x) != (x))\n#endif\n\n#endif // #ifndef __MISC_H__\n\n#ifdef ENABLE_AAVPCS\n#define VECTOR_CC __attribute__((aarch64_vector_pcs))\n#else\n#define VECTOR_CC\n#endif\n\n\n        /* NSIMD specific */\n        #ifndef NSIMD_SLEEF_MISC_H\n        #define NSIMD_SLEEF_MISC_H\n\n        #ifdef INLINE\n        #undef INLINE\n        #endif\n        #define INLINE inline\n\n        #define Sleef_rempitabdp nsimd_sleef_rempitab_f64\n        #define Sleef_rempitabsp nsimd_sleef_rempitab_f32\n\n        #endif\n\n 
       "
  },
  {
    "path": "src/rempitab.c",
    "content": "//   Copyright Naoki Shibata and contributors 2010 - 2020.\n// Distributed under the Boost Software License, Version 1.0.\n//    (See accompanying file LICENSE.txt or copy at\n//          http://www.boost.org/LICENSE_1_0.txt)\n\n#include \"misc.h\"\n\n#if !defined(SLEEF_GENHEADER)\n#define FUNCATR NOEXPORT ALIGNED(64)\n#else\n#define FUNCATR EXPORT ALIGNED(64)\n#endif\n\nFUNCATR const double Sleef_rempitabdp[] = {\n  0.15915494309189531785, 1.7916237278037667488e-17, 2.5454160968749269937e-33, 2.1132476107887107169e-49,\n  0.03415494309189533173, 4.0384494702232122736e-18, 1.0046721413651383112e-33, 2.1132476107887107169e-49,\n  0.03415494309189533173, 4.0384494702232122736e-18, 1.0046721413651383112e-33, 2.1132476107887107169e-49,\n  0.0029049430918953351999, 5.6900251826959904774e-19, 4.1707169171520598517e-35, -2.496415728504571394e-51,\n  0.0029049430918953351999, 5.6900251826959904774e-19, 4.1707169171520598517e-35, -2.496415728504571394e-51,\n  0.0029049430918953351999, 5.6900251826959904774e-19, 4.1707169171520598517e-35, -2.496415728504571394e-51,\n  0.0029049430918953351999, 5.6900251826959904774e-19, 4.1707169171520598517e-35, -2.496415728504571394e-51,\n  0.00095181809189533563356, 1.3532164927539732229e-19, -6.4410794381603004826e-36, 1.7634898158762436344e-52,\n  0.00095181809189533563356, 1.3532164927539732229e-19, -6.4410794381603004826e-36, 1.7634898158762436344e-52,\n  0.00046353684189533574198, 2.6901432026846872871e-20, -4.2254836195018827479e-37, 9.301187206862134399e-54,\n  0.00021939621689533574198, 2.6901432026846872871e-20, -4.2254836195018827479e-37, 9.301187206862134399e-54,\n  9.7325904395335769087e-05, -2.0362228529073840241e-22, 6.2960434583523738135e-40, 2.6283399642369025999e-57,\n  3.6290748145335769087e-05, -2.0362228529073840241e-22, 6.2960434583523738135e-40, 2.6283399642369025999e-57,\n  5.7731700203357690874e-06, -2.0362228529073840241e-22, 6.2960434583523738135e-40, 2.6283399642369025999e-57,\n  
5.7731700203357690874e-06, -2.0362228529073840241e-22, 6.2960434583523738135e-40, 2.6283399642369025999e-57,\n  5.7731700203357690874e-06, -2.0362228529073840241e-22, 6.2960434583523738135e-40, 2.6283399642369025999e-57,\n  1.9584727547107690874e-06, -2.0362228529073840241e-22, 6.2960434583523738135e-40, 2.6283399642369025999e-57,\n  5.1124121898268875627e-08, 8.135951522836682362e-24, 6.2960434583523738135e-40, 2.6283399642369025999e-57,\n  5.1124121898268875627e-08, 8.135951522836682362e-24, 6.2960434583523738135e-40, 2.6283399642369025999e-57,\n  5.1124121898268875627e-08, 8.135951522836682362e-24, 6.2960434583523738135e-40, 2.6283399642369025999e-57,\n  5.1124121898268875627e-08, 8.135951522836682362e-24, 6.2960434583523738135e-40, 2.6283399642369025999e-57,\n  5.1124121898268875627e-08, 8.135951522836682362e-24, 6.2960434583523738135e-40, 2.6283399642369025999e-57,\n  5.1124121898268875627e-08, 8.135951522836682362e-24, 6.2960434583523738135e-40, 2.6283399642369025999e-57,\n  2.1321799510573569745e-08, 1.5185066224124613304e-24, 2.6226236120327253511e-40, 2.6283399642369025999e-57,\n  6.4206383167259151492e-09, -1.3585460269359374382e-25, -1.3244127270701094468e-41, -2.4695541513869446866e-57,\n  6.4206383167259151492e-09, -1.3585460269359374382e-25, -1.3244127270701094468e-41, -2.4695541513869446866e-57,\n  2.6953480182640010867e-09, -1.3585460269359374382e-25, -1.3244127270701094468e-41, -2.4695541513869446866e-57,\n  8.3270286903304384868e-10, 7.0940550444663151936e-26, 9.7147467687967058732e-42, 7.9392906424978921242e-59,\n  8.3270286903304384868e-10, 7.0940550444663151936e-26, 9.7147467687967058732e-42, 7.9392906424978921242e-59,\n  3.6704158172530459087e-10, 7.0940550444663151936e-26, 9.7147467687967058732e-42, 7.9392906424978921242e-59,\n  1.3421093807143501366e-10, 1.9241762160098927996e-26, 3.9750282589222551507e-42, 7.9392906424978921242e-59,\n  1.7795616244500218596e-11, -1.452834466126541428e-28, -1.5869767474823787636e-44, 
-2.6168913164368963837e-61,\n  1.7795616244500218596e-11, -1.452834466126541428e-28, -1.5869767474823787636e-44, -2.6168913164368963837e-61,\n  1.7795616244500218596e-11, -1.452834466126541428e-28, -1.5869767474823787636e-44, -2.6168913164368963837e-61,\n  3.2437010161333667893e-12, -1.452834466126541428e-28, -1.5869767474823787636e-44, -2.6168913164368963837e-61,\n  3.2437010161333667893e-12, -1.452834466126541428e-28, -1.5869767474823787636e-44, -2.6168913164368963837e-61,\n  3.2437010161333667893e-12, -1.452834466126541428e-28, -1.5869767474823787636e-44, -2.6168913164368963837e-61,\n  1.4247116125875099096e-12, 2.5861333686050385673e-28, 2.8971783383570358633e-44, -2.6168913164368963837e-61,\n  5.1521691081458187359e-13, 5.6664945123924856962e-29, 6.5510079543732854985e-45, -2.6168913164368963837e-61,\n  6.0469559928117805118e-14, 6.1778471897801070206e-30, 9.4581409707401690366e-46, 4.9461632249367446986e-62,\n  6.0469559928117805118e-14, 6.1778471897801070206e-30, 9.4581409707401690366e-46, 4.9461632249367446986e-62,\n  6.0469559928117805118e-14, 6.1778471897801070206e-30, 9.4581409707401690366e-46, 4.9461632249367446986e-62,\n  3.6261410673097965595e-15, -1.3304005198798645927e-31, -1.7578597149294783985e-47, 8.4432539107728104262e-64,\n  3.6261410673097965595e-15, -1.3304005198798645927e-31, -1.7578597149294783985e-47, 8.4432539107728104262e-64,\n  3.6261410673097965595e-15, -1.3304005198798645927e-31, -1.7578597149294783985e-47, 8.4432539107728104262e-64,\n  3.6261410673097965595e-15, -1.3304005198798645927e-31, -1.7578597149294783985e-47, 8.4432539107728104262e-64,\n  7.3427388509295482183e-17, 1.4871367740953237822e-32, -1.1571307704883330232e-48, -6.7249112515659578102e-65,\n  7.3427388509295482183e-17, 1.4871367740953237822e-32, -1.1571307704883330232e-48, -6.7249112515659578102e-65,\n  7.3427388509295482183e-17, 1.4871367740953237822e-32, -1.1571307704883330232e-48, -6.7249112515659578102e-65,\n  7.3427388509295482183e-17, 1.4871367740953237822e-32, 
-1.1571307704883330232e-48, -6.7249112515659578102e-65,\n  7.3427388509295482183e-17, 1.4871367740953237822e-32, -1.1571307704883330232e-48, -6.7249112515659578102e-65,\n  7.3427388509295482183e-17, 1.4871367740953237822e-32, -1.1571307704883330232e-48, -6.7249112515659578102e-65,\n  1.7916237278037667488e-17, 2.5454160968749269937e-33, 2.1132476107887107169e-49, 8.7154294504188129325e-66,\n  1.7916237278037667488e-17, 2.5454160968749269937e-33, 2.1132476107887107169e-49, 8.7154294504188129325e-66,\n  4.0384494702232122736e-18, 1.0046721413651383112e-33, 2.1132476107887107169e-49, 8.7154294504188129325e-66,\n  4.0384494702232122736e-18, 1.0046721413651383112e-33, 2.1132476107887107169e-49, 8.7154294504188129325e-66,\n  5.6900251826959904774e-19, 4.1707169171520598517e-35, -2.4964157285045710972e-51, -1.866653112309982615e-67,\n  5.6900251826959904774e-19, 4.1707169171520598517e-35, -2.4964157285045710972e-51, -1.866653112309982615e-67,\n  5.6900251826959904774e-19, 4.1707169171520598517e-35, -2.4964157285045710972e-51, -1.866653112309982615e-67,\n  1.3532164927539732229e-19, -6.4410794381603004826e-36, 1.7634898158762432635e-52, 3.5887057810247033998e-68,\n  1.3532164927539732229e-19, -6.4410794381603004826e-36, 1.7634898158762432635e-52, 3.5887057810247033998e-68,\n  2.6901432026846872871e-20, -4.2254836195018827479e-37, 9.3011872068621332399e-54, 1.113250147552460308e-69,\n  2.6901432026846872871e-20, -4.2254836195018827479e-37, 9.3011872068621332399e-54, 1.113250147552460308e-69,\n  2.6901432026846872871e-20, -4.2254836195018827479e-37, 9.3011872068621332399e-54, 1.113250147552460308e-69,\n  1.3348904870778067446e-20, -4.2254836195018827479e-37, 9.3011872068621332399e-54, 1.113250147552460308e-69,\n  6.5726412927436632287e-21, 1.0820844071023395684e-36, 1.7634898158762432635e-52, 3.5887057810247033998e-68,\n  3.1845095037264626247e-21, 3.2976802257607573031e-37, 9.3011872068621332399e-54, 1.113250147552460308e-69,\n  1.4904436092178623228e-21, 
-4.6390169687056261795e-38, -1.1392999419355048437e-54, -4.587677453735884283e-71,\n  6.4341066196356198368e-22, -4.6390169687056261795e-38, -1.1392999419355048437e-54, -4.587677453735884283e-71,\n  2.1989418833641172011e-22, 4.7649378378726728402e-38, 9.3011872068621332399e-54, 1.113250147552460308e-69,\n  8.135951522836682362e-24, 6.2960434583523738135e-40, 2.6283399642369020339e-57, 5.3358074162805516304e-73,\n  8.135951522836682362e-24, 6.2960434583523738135e-40, 2.6283399642369020339e-57, 5.3358074162805516304e-73,\n  8.135951522836682362e-24, 6.2960434583523738135e-40, 2.6283399642369020339e-57, 5.3358074162805516304e-73,\n  8.135951522836682362e-24, 6.2960434583523738135e-40, 2.6283399642369020339e-57, 5.3358074162805516304e-73,\n  8.135951522836682362e-24, 6.2960434583523738135e-40, 2.6283399642369020339e-57, 5.3358074162805516304e-73,\n  1.5185066224124613304e-24, 2.6226236120327253511e-40, 2.6283399642369020339e-57, 5.3358074162805516304e-73,\n  1.5185066224124613304e-24, 2.6226236120327253511e-40, 2.6283399642369020339e-57, 5.3358074162805516304e-73,\n  1.5185066224124613304e-24, 2.6226236120327253511e-40, 2.6283399642369020339e-57, 5.3358074162805516304e-73,\n  6.9132600985943383921e-25, 7.8591368887290111994e-41, 2.6283399642369020339e-57, 5.3358074162805516304e-73,\n  2.7773570358292009361e-25, -1.3244127270701094468e-41, -2.4695541513869446866e-57, -3.2399200798614356002e-74,\n  7.0940550444663151936e-26, 9.7147467687967058732e-42, 7.9392906424978921242e-59, 2.9745456030524896742e-75,\n  7.0940550444663151936e-26, 9.7147467687967058732e-42, 7.9392906424978921242e-59, 2.9745456030524896742e-75,\n  1.9241762160098927996e-26, 3.9750282589222551507e-42, 7.9392906424978921242e-59, 2.9745456030524896742e-75,\n  1.9241762160098927996e-26, 3.9750282589222551507e-42, 7.9392906424978921242e-59, 2.9745456030524896742e-75,\n  6.317065088957874881e-27, -3.2976062348358281152e-43, -2.6168913164368963837e-61, 3.7036201000008290615e-78,\n  6.317065088957874881e-27, 
-3.2976062348358281152e-43, -2.6168913164368963837e-61, 3.7036201000008290615e-78,\n  3.0858908211726098086e-27, 3.8770419025072344914e-43, 7.9392906424978921242e-59, 2.9745456030524896742e-75,\n  1.4703036872799779898e-27, 2.8971783383570358633e-44, -2.6168913164368963837e-61, 3.7036201000008290615e-78,\n  6.625101203336619011e-28, 2.8971783383570358633e-44, -2.6168913164368963837e-61, 3.7036201000008290615e-78,\n  2.5861333686050385673e-28, 2.8971783383570358633e-44, -2.6168913164368963837e-61, 3.7036201000008290615e-78,\n  5.6664945123924856962e-29, 6.5510079543732854985e-45, -2.6168913164368963837e-61, 3.7036201000008290615e-78,\n  5.6664945123924856962e-29, 6.5510079543732854985e-45, -2.6168913164368963837e-61, 3.7036201000008290615e-78,\n  6.1778471897801070206e-30, 9.4581409707401690366e-46, 4.9461632249367446986e-62, 3.7036201000008290615e-78,\n  6.1778471897801070206e-30, 9.4581409707401690366e-46, 4.9461632249367446986e-62, 3.7036201000008290615e-78,\n  6.1778471897801070206e-30, 9.4581409707401690366e-46, 4.9461632249367446986e-62, 3.7036201000008290615e-78,\n  6.1778471897801070206e-30, 9.4581409707401690366e-46, 4.9461632249367446986e-62, 3.7036201000008290615e-78,\n  3.0224035688960604996e-30, 2.451648649116083682e-46, 4.9461632249367446986e-62, 3.7036201000008290615e-78,\n  1.4446817584540368888e-30, 2.451648649116083682e-46, 4.9461632249367446986e-62, 3.7036201000008290615e-78,\n  6.5582085323302525856e-31, 7.0002556871006273225e-47, 1.0567786762735315635e-62, -6.1446417754639313137e-79,\n  2.6139040062251944343e-31, -1.7578597149294783985e-47, 8.4432539107728090768e-64, 1.9517662449371102229e-79,\n  6.4175174317266470186e-32, 4.3166913557804827486e-48, 8.4432539107728090768e-64, 1.9517662449371102229e-79,\n  6.4175174317266470186e-32, 4.3166913557804827486e-48, 8.4432539107728090768e-64, 1.9517662449371102229e-79,\n  1.4871367740953237822e-32, -1.1571307704883330232e-48, -6.7249112515659569668e-65, -7.2335760163150273591e-81,\n  
1.4871367740953237822e-32, -1.1571307704883330232e-48, -6.7249112515659569668e-65, -7.2335760163150273591e-81,\n  2.5454160968749269937e-33, 2.1132476107887107169e-49, 8.7154294504188118783e-66, 1.2001823382693912203e-81,\n  2.5454160968749269937e-33, 2.1132476107887107169e-49, 8.7154294504188118783e-66, 1.2001823382693912203e-81,\n  2.5454160968749269937e-33, 2.1132476107887107169e-49, 8.7154294504188118783e-66, 1.2001823382693912203e-81,\n  1.0046721413651383112e-33, 2.1132476107887107169e-49, 8.7154294504188118783e-66, 1.2001823382693912203e-81,\n  2.3430016361024414106e-34, 4.0267819632970559834e-50, -7.8013829534098555144e-67, -1.1759240463442418271e-82,\n  2.3430016361024414106e-34, 4.0267819632970559834e-50, -7.8013829534098555144e-67, -1.1759240463442418271e-82,\n  4.1707169171520598517e-35, -2.4964157285045710972e-51, -1.866653112309982615e-67, 1.4185069655957361252e-83,\n  4.1707169171520598517e-35, -2.4964157285045710972e-51, -1.866653112309982615e-67, 1.4185069655957361252e-83,\n  4.1707169171520598517e-35, -2.4964157285045710972e-51, -1.866653112309982615e-67, 1.4185069655957361252e-83,\n  1.7633044866680145008e-35, 2.8491136916798196016e-51, 4.0680767287898916022e-67, 1.4185069655957361252e-83,\n  5.595982714259923599e-36, 1.7634898158762432635e-52, 3.588705781024702988e-68, 5.9489775128085140685e-84,\n  5.595982714259923599e-36, 1.7634898158762432635e-52, 3.588705781024702988e-68, 5.9489775128085140685e-84,\n  2.5867171761548675786e-36, 1.7634898158762432635e-52, 3.588705781024702988e-68, 5.9489775128085140685e-84,\n  1.0820844071023395684e-36, 1.7634898158762432635e-52, 3.588705781024702988e-68, 5.9489775128085140685e-84,\n  3.2976802257607573031e-37, 9.3011872068621332399e-54, 1.113250147552460308e-69, 2.9286284920280944778e-86,\n  3.2976802257607573031e-37, 9.3011872068621332399e-54, 1.113250147552460308e-69, 2.9286284920280944778e-86,\n  1.4168892644450972904e-37, 9.3011872068621332399e-54, 1.113250147552460308e-69, 2.9286284920280944778e-86,\n  
4.7649378378726728402e-38, 9.3011872068621332399e-54, 1.113250147552460308e-69, 2.9286284920280944778e-86,\n  6.2960434583523738135e-40, 2.6283399642369020339e-57, 5.3358074162805516304e-73, 4.524218473063975309e-90,\n  6.2960434583523738135e-40, 2.6283399642369020339e-57, 5.3358074162805516304e-73, 4.524218473063975309e-90,\n  6.2960434583523738135e-40, 2.6283399642369020339e-57, 5.3358074162805516304e-73, 4.524218473063975309e-90,\n  6.2960434583523738135e-40, 2.6283399642369020339e-57, 5.3358074162805516304e-73, 4.524218473063975309e-90,\n  6.2960434583523738135e-40, 2.6283399642369020339e-57, 5.3358074162805516304e-73, 4.524218473063975309e-90,\n  6.2960434583523738135e-40, 2.6283399642369020339e-57, 5.3358074162805516304e-73, 4.524218473063975309e-90,\n  6.2960434583523738135e-40, 2.6283399642369020339e-57, 5.3358074162805516304e-73, 4.524218473063975309e-90,\n  2.6226236120327253511e-40, 2.6283399642369020339e-57, 5.3358074162805516304e-73, 4.524218473063975309e-90,\n  7.8591368887290111994e-41, 2.6283399642369020339e-57, 5.3358074162805516304e-73, 4.524218473063975309e-90,\n  7.8591368887290111994e-41, 2.6283399642369020339e-57, 5.3358074162805516304e-73, 4.524218473063975309e-90,\n  3.2673620808294506214e-41, 2.6283399642369020339e-57, 5.3358074162805516304e-73, 4.524218473063975309e-90,\n  9.7147467687967058732e-42, 7.9392906424978921242e-59, 2.9745456030524891833e-75, 5.969437008257943935e-91,\n  9.7147467687967058732e-42, 7.9392906424978921242e-59, 2.9745456030524891833e-75, 5.969437008257943935e-91,\n  3.9750282589222551507e-42, 7.9392906424978921242e-59, 2.9745456030524891833e-75, 5.969437008257943935e-91,\n  1.1051690039850297894e-42, 7.9392906424978921242e-59, 2.9745456030524891833e-75, 5.969437008257943935e-91,\n  1.1051690039850297894e-42, 7.9392906424978921242e-59, 2.9745456030524891833e-75, 5.969437008257943935e-91,\n  3.8770419025072344914e-43, 7.9392906424978921242e-59, 2.9745456030524891833e-75, 5.969437008257943935e-91,\n  
2.8971783383570358633e-44, -2.6168913164368963837e-61, 3.7036201000008285821e-78, 5.6554937751584084315e-94,\n  2.8971783383570358633e-44, -2.6168913164368963837e-61, 3.7036201000008285821e-78, 5.6554937751584084315e-94,\n  2.8971783383570358633e-44, -2.6168913164368963837e-61, 3.7036201000008285821e-78, 5.6554937751584084315e-94,\n  2.8971783383570358633e-44, -2.6168913164368963837e-61, 3.7036201000008285821e-78, 5.6554937751584084315e-94,\n  6.5510079543732854985e-45, -2.6168913164368963837e-61, 3.7036201000008285821e-78, 5.6554937751584084315e-94,\n  6.5510079543732854985e-45, -2.6168913164368963837e-61, 3.7036201000008285821e-78, 5.6554937751584084315e-94,\n  9.4581409707401690366e-46, 4.9461632249367446986e-62, 3.7036201000008285821e-78, 5.6554937751584084315e-94,\n  9.4581409707401690366e-46, 4.9461632249367446986e-62, 3.7036201000008285821e-78, 5.6554937751584084315e-94,\n  9.4581409707401690366e-46, 4.9461632249367446986e-62, 3.7036201000008285821e-78, 5.6554937751584084315e-94,\n  2.451648649116083682e-46, 4.9461632249367446986e-62, 3.7036201000008285821e-78, 5.6554937751584084315e-94,\n  2.451648649116083682e-46, 4.9461632249367446986e-62, 3.7036201000008285821e-78, 5.6554937751584084315e-94,\n  7.0002556871006273225e-47, 1.0567786762735315635e-62, -6.1446417754639301152e-79, -1.5355611056488084652e-94,\n  7.0002556871006273225e-47, 1.0567786762735315635e-62, -6.1446417754639301152e-79, -1.5355611056488084652e-94,\n  2.6211979860855749482e-47, 8.4432539107728090768e-64, 1.9517662449371099233e-79, 2.62202614552995759e-95,\n  4.3166913557804827486e-48, 8.4432539107728090768e-64, 1.9517662449371099233e-79, 2.62202614552995759e-95,\n  4.3166913557804827486e-48, 8.4432539107728090768e-64, 1.9517662449371099233e-79, 2.62202614552995759e-95,\n  4.3166913557804827486e-48, 8.4432539107728090768e-64, 1.9517662449371099233e-79, 2.62202614552995759e-95,\n  1.5797802926460750146e-48, 2.3660905534865399025e-64, -7.2335760163150273591e-81, 2.8738690232659205689e-99,\n  
2.1132476107887107169e-49, 8.7154294504188118783e-66, 1.2001823382693912203e-81, 2.8738690232659205689e-99,\n  2.1132476107887107169e-49, 8.7154294504188118783e-66, 1.2001823382693912203e-81, 2.8738690232659205689e-99,\n  2.1132476107887107169e-49, 8.7154294504188118783e-66, 1.2001823382693912203e-81, 2.8738690232659205689e-99,\n  4.0267819632970559834e-50, -7.8013829534098555144e-67, -1.1759240463442418271e-82, 2.8738690232659205689e-99,\n  4.0267819632970559834e-50, -7.8013829534098555144e-67, -1.1759240463442418271e-82, 2.8738690232659205689e-99,\n  4.0267819632970559834e-50, -7.8013829534098555144e-67, -1.1759240463442418271e-82, 2.8738690232659205689e-99,\n  1.8885701952232994665e-50, -7.8013829534098555144e-67, -1.1759240463442418271e-82, 2.8738690232659205689e-99,\n  8.1946431118642097069e-51, 1.5937536410989638719e-66, 1.459625439463388979e-82, 2.8738690232659205689e-99,\n  2.8491136916798196016e-51, 4.0680767287898916022e-67, 1.4185069655957361252e-83, -7.8369062883735917115e-100,\n  1.7634898158762432635e-52, 3.588705781024702988e-68, 5.9489775128085131541e-84, 1.0450891972142808004e-99,\n  1.7634898158762432635e-52, 3.588705781024702988e-68, 5.9489775128085131541e-84, 1.0450891972142808004e-99,\n  1.7634898158762432635e-52, 3.588705781024702988e-68, 5.9489775128085131541e-84, 1.0450891972142808004e-99,\n  1.7634898158762432635e-52, 3.588705781024702988e-68, 5.9489775128085131541e-84, 1.0450891972142808004e-99,\n  9.3011872068621332399e-54, 1.113250147552460308e-69, 2.9286284920280941206e-86, 2.1132026692048600853e-102,\n  9.3011872068621332399e-54, 1.113250147552460308e-69, 2.9286284920280941206e-86, 2.1132026692048600853e-102,\n  9.3011872068621332399e-54, 1.113250147552460308e-69, 2.9286284920280941206e-86, 2.1132026692048600853e-102,\n  9.3011872068621332399e-54, 1.113250147552460308e-69, 2.9286284920280941206e-86, 2.1132026692048600853e-102,\n  9.3011872068621332399e-54, 1.113250147552460308e-69, 2.9286284920280941206e-86, 
2.1132026692048600853e-102,\n  4.0809436324633147776e-54, -4.587677453735884283e-71, -2.8859500138942368532e-87, -5.6567402911297190423e-103,\n  1.470821845263904967e-54, -4.587677453735884283e-71, -2.8859500138942368532e-87, -5.6567402911297190423e-103,\n  1.6576095166419998917e-55, 2.6568658093254848067e-71, 5.1571087196495574384e-87, 3.2728487032630537605e-103,\n  1.6576095166419998917e-55, 2.6568658093254848067e-71, 5.1571087196495574384e-87, 3.2728487032630537605e-103,\n  1.6576095166419998917e-55, 2.6568658093254848067e-71, 5.1571087196495574384e-87, 3.2728487032630537605e-103,\n  2.6283399642369020339e-57, 5.3358074162805516304e-73, 4.5242184730639744369e-90, 1.145584788913072936e-105,\n  2.6283399642369020339e-57, 5.3358074162805516304e-73, 4.5242184730639744369e-90, 1.145584788913072936e-105,\n  2.6283399642369020339e-57, 5.3358074162805516304e-73, 4.5242184730639744369e-90, 1.145584788913072936e-105,\n  2.6283399642369020339e-57, 5.3358074162805516304e-73, 4.5242184730639744369e-90, 1.145584788913072936e-105,\n  2.6283399642369020339e-57, 5.3358074162805516304e-73, 4.5242184730639744369e-90, 1.145584788913072936e-105,\n  2.6283399642369020339e-57, 5.3358074162805516304e-73, 4.5242184730639744369e-90, 1.145584788913072936e-105,\n  7.9392906424978921242e-59, 2.9745456030524891833e-75, 5.969437008257942845e-91, 5.554706987098633963e-107,\n  7.9392906424978921242e-59, 2.9745456030524891833e-75, 5.969437008257942845e-91, 5.554706987098633963e-107,\n  7.9392906424978921242e-59, 2.9745456030524891833e-75, 5.969437008257942845e-91, 5.554706987098633963e-107,\n  7.9392906424978921242e-59, 2.9745456030524891833e-75, 5.969437008257942845e-91, 5.554706987098633963e-107,\n  7.9392906424978921242e-59, 2.9745456030524891833e-75, 5.969437008257942845e-91, 5.554706987098633963e-107,\n  7.9392906424978921242e-59, 2.9745456030524891833e-75, 5.969437008257942845e-91, 5.554706987098633963e-107,\n  3.9565608646667614317e-59, 2.9745456030524891833e-75, 5.969437008257942845e-91, 
5.554706987098633963e-107,\n  1.9651959757511960854e-59, 2.9745456030524891833e-75, 5.969437008257942845e-91, 5.554706987098633963e-107,\n  9.6951353129341363331e-60, 7.6368645294831185015e-76, 1.0603435429602168369e-91, 1.0451839188820145747e-108,\n  4.7167230906452229674e-60, 7.6368645294831185015e-76, 1.0603435429602168369e-91, 1.0451839188820145747e-108,\n  2.2275169795007668372e-60, 2.1097166542226745549e-76, 4.4670685979800101779e-92, 1.0451839188820145747e-108,\n  9.8291392392853877215e-61, -6.5385728340754726503e-77, -1.3520652573660833788e-93, -2.3220403312043059402e-109,\n  3.6061239614242446325e-61, 7.2792968540756372162e-77, 1.3988851821689310822e-92, 1.0451839188820145747e-108,\n  4.9461632249367446986e-62, 3.7036201000008285821e-78, 5.6554937751584084315e-94, -1.9306041120023063932e-110,\n  4.9461632249367446986e-62, 3.7036201000008285821e-78, 5.6554937751584084315e-94, -1.9306041120023063932e-110,\n  4.9461632249367446986e-62, 3.7036201000008285821e-78, 5.6554937751584084315e-94, -1.9306041120023063932e-110,\n  1.0567786762735315635e-62, -6.1446417754639301152e-79, -1.535561105648808199e-94, -1.9306041120023063932e-110,\n  1.0567786762735315635e-62, -6.1446417754639301152e-79, -1.535561105648808199e-94, -1.9306041120023063932e-110,\n  8.4432539107728090768e-64, 1.9517662449371099233e-79, 2.62202614552995759e-95, 6.5314563001514358328e-112,\n  8.4432539107728090768e-64, 1.9517662449371099233e-79, 2.62202614552995759e-95, 6.5314563001514358328e-112,\n  8.4432539107728090768e-64, 1.9517662449371099233e-79, 2.62202614552995759e-95, 6.5314563001514358328e-112,\n  8.4432539107728090768e-64, 1.9517662449371099233e-79, 2.62202614552995759e-95, 6.5314563001514358328e-112,\n  2.3660905534865399025e-64, -7.2335760163150273591e-81, 2.8738690232659205689e-99, 1.8395411057335783574e-115,\n  2.3660905534865399025e-64, -7.2335760163150273591e-81, 2.8738690232659205689e-99, 1.8395411057335783574e-115,\n  8.4679971416497210292e-65, -7.2335760163150273591e-81, 
2.8738690232659205689e-99, 1.8395411057335783574e-115,\n  8.7154294504188118783e-66, 1.2001823382693912203e-81, 2.8738690232659205689e-99, 1.8395411057335783574e-115,\n  8.7154294504188118783e-66, 1.2001823382693912203e-81, 2.8738690232659205689e-99, 1.8395411057335783574e-115,\n  8.7154294504188118783e-66, 1.2001823382693912203e-81, 2.8738690232659205689e-99, 1.8395411057335783574e-115,\n  8.7154294504188118783e-66, 1.2001823382693912203e-81, 2.8738690232659205689e-99, 1.8395411057335783574e-115,\n  3.9676455775389135587e-66, 1.459625439463388979e-82, 2.8738690232659205689e-99, 1.8395411057335783574e-115,\n  1.5937536410989638719e-66, 1.459625439463388979e-82, 2.8738690232659205689e-99, 1.8395411057335783574e-115,\n  4.0680767287898916022e-67, 1.4185069655957361252e-83, -7.8369062883735917115e-100, -1.9081236411894110579e-116,\n  4.0680767287898916022e-67, 1.4185069655957361252e-83, -7.8369062883735917115e-100, -1.9081236411894110579e-116,\n  1.1007118082399544936e-67, 1.4185069655957361252e-83, -7.8369062883735917115e-100, -1.9081236411894110579e-116,\n  1.1007118082399544936e-67, 1.4185069655957361252e-83, -7.8369062883735917115e-100, -1.9081236411894110579e-116,\n  3.588705781024702988e-68, 5.9489775128085131541e-84, 1.0450891972142805974e-99, 1.8395411057335783574e-115,\n  3.588705781024702988e-68, 5.9489775128085131541e-84, 1.0450891972142805974e-99, 1.8395411057335783574e-115,\n  1.7341027056809927069e-68, 1.830931441234090934e-84, 1.3069928418846076386e-100, 3.1677600334418876704e-116,\n  8.0680116800913756637e-69, -2.2809159455312046184e-85, -4.0748824503880445403e-101, -6.3915272253158644628e-117,\n  3.4315039917320989315e-69, -2.2809159455312046184e-85, -4.0748824503880445403e-101, -6.3915272253158644628e-117,\n  1.113250147552460308e-69, 2.9286284920280941206e-86, 2.1132026692048600853e-102, -4.6672632026740766185e-119,\n  1.113250147552460308e-69, 2.9286284920280941206e-86, 2.1132026692048600853e-102, -4.6672632026740766185e-119,\n  
5.3368668650755071652e-70, 2.9286284920280941206e-86, 2.1132026692048600853e-102, -4.6672632026740766185e-119,\n  2.4390495598509592076e-70, 2.9286284920280941206e-86, 2.1132026692048600853e-102, -4.6672632026740766185e-119,\n  9.901409072386855505e-71, -2.8859500138942368532e-87, -5.6567402911297190423e-103, -4.6672632026740766185e-119,\n  2.6568658093254848067e-71, 5.1571087196495574384e-87, 3.2728487032630532648e-103, 5.2465720993401781599e-119,\n  2.6568658093254848067e-71, 5.1571087196495574384e-87, 3.2728487032630532648e-103, 5.2465720993401781599e-119,\n  8.4572999356014273536e-72, 1.1355793528776598461e-87, 3.2728487032630532648e-103, 5.2465720993401781599e-119,\n  8.4572999356014273536e-72, 1.1355793528776598461e-87, 3.2728487032630532648e-103, 5.2465720993401781599e-119,\n  3.9294603961880721752e-72, 1.3019701118468578292e-88, -7.5747169634236195447e-105, -2.0152904854894729832e-121,\n  1.6655406264813940833e-72, 1.3019701118468578292e-88, -7.5747169634236195447e-105, -2.0152904854894729832e-121,\n  5.3358074162805516304e-73, 4.5242184730639744369e-90, 1.1455847889130727424e-105, 1.8573014293598455046e-121,\n  5.3358074162805516304e-73, 4.5242184730639744369e-90, 1.1455847889130727424e-105, 1.8573014293598455046e-121,\n  2.5059077041472040156e-73, 4.5242184730639744369e-90, 1.1455847889130727424e-105, 1.8573014293598455046e-121,\n  1.0909578480805302081e-73, 4.5242184730639744369e-90, 1.1455847889130727424e-105, 1.8573014293598455046e-121,\n  3.8348292004719330442e-74, 4.5242184730639744369e-90, 1.1455847889130727424e-105, 1.8573014293598455046e-121,\n  2.9745456030524891833e-75, 5.969437008257942845e-91, 5.5547069870986327528e-107, 1.6304246661326865276e-122,\n  2.9745456030524891833e-75, 5.969437008257942845e-91, 5.5547069870986327528e-107, 1.6304246661326865276e-122,\n  2.9745456030524891833e-75, 5.969437008257942845e-91, 5.5547069870986327528e-107, 1.6304246661326865276e-122,\n  2.9745456030524891833e-75, 5.969437008257942845e-91, 
5.5547069870986327528e-107, 1.6304246661326865276e-122,\n  7.6368645294831185015e-76, 1.0603435429602168369e-91, 1.0451839188820145747e-108, 4.2386081393205242443e-125,\n  7.6368645294831185015e-76, 1.0603435429602168369e-91, 1.0451839188820145747e-108, 4.2386081393205242443e-125,\n  2.1097166542226745549e-76, 4.4670685979800101779e-92, 1.0451839188820145747e-108, 4.2386081393205242443e-125,\n  2.1097166542226745549e-76, 4.4670685979800101779e-92, 1.0451839188820145747e-108, 4.2386081393205242443e-125,\n  7.2792968540756372162e-77, 1.3988851821689310822e-92, 1.0451839188820145747e-108, 4.2386081393205242443e-125,\n  3.7036201000008285821e-78, 5.6554937751584084315e-94, -1.9306041120023063932e-110, 1.0223371855251472933e-126,\n  3.7036201000008285821e-78, 5.6554937751584084315e-94, -1.9306041120023063932e-110, 1.0223371855251472933e-126,\n  3.7036201000008285821e-78, 5.6554937751584084315e-94, -1.9306041120023063932e-110, 1.0223371855251472933e-126,\n  3.7036201000008285821e-78, 5.6554937751584084315e-94, -1.9306041120023063932e-110, 1.0223371855251472933e-126,\n  3.7036201000008285821e-78, 5.6554937751584084315e-94, -1.9306041120023063932e-110, 1.0223371855251472933e-126,\n  1.5445779612272179051e-78, 8.6145718795359707834e-95, 7.3062078800278780675e-111, 1.0223371855251472933e-126,\n  4.6505689184041232695e-79, 8.6145718795359707834e-95, 7.3062078800278780675e-111, 1.0223371855251472933e-126,\n  4.6505689184041232695e-79, 8.6145718795359707834e-95, 7.3062078800278780675e-111, 1.0223371855251472933e-126,\n  1.9517662449371099233e-79, 2.62202614552995759e-95, 6.5314563001514349095e-112, 9.9039323746573674262e-128,\n  6.0236490820360325022e-80, -3.7424672147304925625e-96, -1.784871512364483542e-112, 6.7095375687163151728e-129,\n  6.0236490820360325022e-80, -3.7424672147304925625e-96, -1.784871512364483542e-112, 6.7095375687163151728e-129,\n  2.6501457402022643213e-80, 3.7482149527770239293e-96, 6.5314563001514349095e-112, 9.9039323746573674262e-128,\n  
9.6339406928538097998e-81, 2.8738690232659205689e-99, 1.8395411057335783574e-115, -7.8150389500644475446e-132,\n  1.2001823382693912203e-81, 2.8738690232659205689e-99, 1.8395411057335783574e-115, -7.8150389500644475446e-132,\n  1.2001823382693912203e-81, 2.8738690232659205689e-99, 1.8395411057335783574e-115, -7.8150389500644475446e-132,\n  1.2001823382693912203e-81, 2.8738690232659205689e-99, 1.8395411057335783574e-115, -7.8150389500644475446e-132,\n  1.459625439463388979e-82, 2.8738690232659205689e-99, 1.8395411057335783574e-115, -7.8150389500644475446e-132,\n  1.459625439463388979e-82, 2.8738690232659205689e-99, 1.8395411057335783574e-115, -7.8150389500644475446e-132,\n  1.459625439463388979e-82, 2.8738690232659205689e-99, 1.8395411057335783574e-115, -7.8150389500644475446e-132,\n  1.4185069655957361252e-83, -7.8369062883735917115e-100, -1.9081236411894107761e-116, -2.1796760241698337334e-132,\n  1.4185069655957361252e-83, -7.8369062883735917115e-100, -1.9081236411894107761e-116, -2.1796760241698337334e-132,\n  1.4185069655957361252e-83, -7.8369062883735917115e-100, -1.9081236411894107761e-116, -2.1796760241698337334e-132,\n  1.4185069655957361252e-83, -7.8369062883735917115e-100, -1.9081236411894107761e-116, -2.1796760241698337334e-132,\n  5.9489775128085131541e-84, 1.0450891972142805974e-99, 1.8395411057335783574e-115, -7.8150389500644475446e-132,\n  1.830931441234090934e-84, 1.3069928418846076386e-100, 3.1677600334418871069e-116, 3.4556869017247800778e-132,\n  1.830931441234090934e-84, 1.3069928418846076386e-100, 3.1677600334418871069e-116, 3.4556869017247800778e-132,\n  8.0141992334048515034e-85, 1.3069928418846076386e-100, 3.1677600334418871069e-116, 3.4556869017247800778e-132,\n  2.8666416439368237283e-85, 1.6400545060233297363e-101, -4.6672632026740766185e-119, -3.755176715260116501e-136,\n  2.9286284920280941206e-86, 2.1132026692048600853e-102, -4.6672632026740766185e-119, -3.755176715260116501e-136,\n  2.9286284920280941206e-86, 
2.1132026692048600853e-102, -4.6672632026740766185e-119, -3.755176715260116501e-136,\n  2.9286284920280941206e-86, 2.1132026692048600853e-102, -4.6672632026740766185e-119, -3.755176715260116501e-136,\n  2.9286284920280941206e-86, 2.1132026692048600853e-102, -4.6672632026740766185e-119, -3.755176715260116501e-136,\n  1.3200167453193350837e-86, 2.1132026692048600853e-102, -4.6672632026740766185e-119, -3.755176715260116501e-136,\n  5.1571087196495574384e-87, 3.2728487032630532648e-103, 5.2465720993401781599e-119, -3.755176715260116501e-136,\n  1.1355793528776598461e-87, 3.2728487032630532648e-103, 5.2465720993401781599e-119, -3.755176715260116501e-136,\n  1.1355793528776598461e-87, 3.2728487032630532648e-103, 5.2465720993401781599e-119, -3.755176715260116501e-136,\n  1.3019701118468578292e-88, -7.5747169634236195447e-105, -2.0152904854894725532e-121, -3.1562414818576682143e-137,\n  1.3019701118468578292e-88, -7.5747169634236195447e-105, -2.0152904854894725532e-121, -3.1562414818576682143e-137,\n  1.3019701118468578292e-88, -7.5747169634236195447e-105, -2.0152904854894725532e-121, -3.1562414818576682143e-137,\n  4.5242184730639744369e-90, 1.1455847889130727424e-105, 1.8573014293598452896e-121, 1.1431992269852683481e-137,\n  4.5242184730639744369e-90, 1.1455847889130727424e-105, 1.8573014293598452896e-121, 1.1431992269852683481e-137,\n  4.5242184730639744369e-90, 1.1455847889130727424e-105, 1.8573014293598452896e-121, 1.1431992269852683481e-137,\n  4.5242184730639744369e-90, 1.1455847889130727424e-105, 1.8573014293598452896e-121, 1.1431992269852683481e-137,\n  4.5242184730639744369e-90, 1.1455847889130727424e-105, 1.8573014293598452896e-121, 1.1431992269852683481e-137,\n  5.969437008257942845e-91, 5.5547069870986327528e-107, 1.6304246661326865276e-122, 6.8339049774534162772e-139,\n  5.969437008257942845e-91, 5.5547069870986327528e-107, 1.6304246661326865276e-122, 6.8339049774534162772e-139,\n  5.969437008257942845e-91, 5.5547069870986327528e-107, 
1.6304246661326865276e-122, 6.8339049774534162772e-139,\n  1.0603435429602168369e-91, 1.0451839188820145747e-108, 4.2386081393205242443e-125, 1.1062055705591188256e-141,\n  1.0603435429602168369e-91, 1.0451839188820145747e-108, 4.2386081393205242443e-125, 1.1062055705591188256e-141,\n  1.0603435429602168369e-91, 1.0451839188820145747e-108, 4.2386081393205242443e-125, 1.1062055705591188256e-141,\n  4.4670685979800101779e-92, 1.0451839188820145747e-108, 4.2386081393205242443e-125, 1.1062055705591188256e-141,\n  1.3988851821689310822e-92, 1.0451839188820145747e-108, 4.2386081393205242443e-125, 1.1062055705591188256e-141,\n  1.3988851821689310822e-92, 1.0451839188820145747e-108, 4.2386081393205242443e-125, 1.1062055705591188256e-141,\n  6.3183932821616130831e-93, 1.0451839188820145747e-108, 4.2386081393205242443e-125, 1.1062055705591188256e-141,\n  2.4831640123977650651e-93, 1.9359195088038447797e-109, -4.8867691298577234423e-126, -2.0587960670007823264e-142,\n  5.6554937751584084315e-94, -1.9306041120023063932e-110, 1.0223371855251471293e-126, 1.2214168761472102282e-142,\n  5.6554937751584084315e-94, -1.9306041120023063932e-110, 1.0223371855251471293e-126, 1.2214168761472102282e-142,\n  8.6145718795359707834e-95, 7.3062078800278780675e-111, 1.0223371855251471293e-126, 1.2214168761472102282e-142,\n  8.6145718795359707834e-95, 7.3062078800278780675e-111, 1.0223371855251471293e-126, 1.2214168761472102282e-142,\n  8.6145718795359707834e-95, 7.3062078800278780675e-111, 1.0223371855251471293e-126, 1.2214168761472102282e-142,\n  2.62202614552995759e-95, 6.5314563001514349095e-112, 9.9039323746573674262e-128, -8.6629775332868987041e-145,\n  2.62202614552995759e-95, 6.5314563001514349095e-112, 9.9039323746573674262e-128, -8.6629775332868987041e-145,\n  1.1238897120284541253e-95, 6.5314563001514349095e-112, 9.9039323746573674262e-128, -8.6629775332868987041e-145,\n  3.7482149527770239293e-96, 6.5314563001514349095e-112, 9.9039323746573674262e-128, -8.6629775332868987041e-145,\n 
 2.8738690232659205689e-99, 1.8395411057335783574e-115, -7.8150389500644475446e-132, -3.9681466199873824165e-148,\n  2.8738690232659205689e-99, 1.8395411057335783574e-115, -7.8150389500644475446e-132, -3.9681466199873824165e-148,\n  2.8738690232659205689e-99, 1.8395411057335783574e-115, -7.8150389500644475446e-132, -3.9681466199873824165e-148,\n  2.8738690232659205689e-99, 1.8395411057335783574e-115, -7.8150389500644475446e-132, -3.9681466199873824165e-148,\n  2.8738690232659205689e-99, 1.8395411057335783574e-115, -7.8150389500644475446e-132, -3.9681466199873824165e-148,\n  2.8738690232659205689e-99, 1.8395411057335783574e-115, -7.8150389500644475446e-132, -3.9681466199873824165e-148,\n  2.8738690232659205689e-99, 1.8395411057335783574e-115, -7.8150389500644475446e-132, -3.9681466199873824165e-148,\n  2.8738690232659205689e-99, 1.8395411057335783574e-115, -7.8150389500644475446e-132, -3.9681466199873824165e-148,\n  2.8738690232659205689e-99, 1.8395411057335783574e-115, -7.8150389500644475446e-132, -3.9681466199873824165e-148,\n  2.8738690232659205689e-99, 1.8395411057335783574e-115, -7.8150389500644475446e-132, -3.9681466199873824165e-148,\n  2.8738690232659205689e-99, 1.8395411057335783574e-115, -7.8150389500644475446e-132, -3.9681466199873824165e-148,\n  1.0450891972142805974e-99, 1.8395411057335783574e-115, -7.8150389500644475446e-132, -3.9681466199873824165e-148,\n  1.3069928418846076386e-100, 3.1677600334418871069e-116, 3.4556869017247794521e-132, 8.5448727249069983612e-148,\n  1.3069928418846076386e-100, 3.1677600334418871069e-116, 3.4556869017247794521e-132, 8.5448727249069983612e-148,\n  1.3069928418846076386e-100, 3.1677600334418871069e-116, 3.4556869017247794521e-132, 8.5448727249069983612e-148,\n  1.6400545060233297363e-101, -4.6672632026740766185e-119, -3.755176715260116501e-136, 2.1571619860435652883e-152,\n  1.6400545060233297363e-101, -4.6672632026740766185e-119, -3.755176715260116501e-136, 2.1571619860435652883e-152,\n  1.6400545060233297363e-101, 
-4.6672632026740766185e-119, -3.755176715260116501e-136, 2.1571619860435652883e-152,\n  2.1132026692048600853e-102, -4.6672632026740766185e-119, -3.755176715260116501e-136, 2.1571619860435652883e-152,\n  2.1132026692048600853e-102, -4.6672632026740766185e-119, -3.755176715260116501e-136, 2.1571619860435652883e-152,\n  2.1132026692048600853e-102, -4.6672632026740766185e-119, -3.755176715260116501e-136, 2.1571619860435652883e-152,\n  3.2728487032630532648e-103, 5.2465720993401781599e-119, -3.755176715260116501e-136, 2.1571619860435652883e-152,\n  3.2728487032630532648e-103, 5.2465720993401781599e-119, -3.755176715260116501e-136, 2.1571619860435652883e-152,\n  3.2728487032630532648e-103, 5.2465720993401781599e-119, -3.755176715260116501e-136, 2.1571619860435652883e-152,\n  1.0404514546648604359e-103, 2.896544483330507019e-120, 3.1239284188885823808e-136, 2.1571619860435652883e-152,\n  1.0404514546648604359e-103, 2.896544483330507019e-120, 3.1239284188885823808e-136, 2.1571619860435652883e-152,\n  4.8235214251531210473e-104, 2.896544483330507019e-120, 3.1239284188885823808e-136, 2.1571619860435652883e-152,\n  2.0330248644053793915e-104, 2.896544483330507019e-120, 3.1239284188885823808e-136, 2.1571619860435652883e-152,\n  6.3777658403150887343e-105, -2.0152904854894725532e-121, -3.156241481857667737e-137, -7.0684085473731388916e-153,\n  6.3777658403150887343e-105, -2.0152904854894725532e-121, -3.156241481857667737e-137, -7.0684085473731388916e-153,\n  2.88964513938041089e-105, 5.7298933442091639924e-121, -3.156241481857667737e-137, -7.0684085473731388916e-153,\n  1.1455847889130727424e-105, 1.8573014293598452896e-121, 1.1431992269852681095e-137, 2.4782675885631257398e-153,\n  2.7355461367940366859e-106, -7.8994528064813712419e-123, -2.0037599452814940222e-138, 9.1598554579059548847e-155,\n  2.7355461367940366859e-106, -7.8994528064813712419e-123, -2.0037599452814940222e-138, 9.1598554579059548847e-155,\n  5.5547069870986327528e-107, 1.6304246661326865276e-122, 
6.8339049774534147855e-139, 9.1598554579059548847e-155,\n  5.5547069870986327528e-107, 1.6304246661326865276e-122, 6.8339049774534147855e-139, 9.1598554579059548847e-155,\n  1.0451839188820145747e-108, 4.2386081393205242443e-125, 1.1062055705591186799e-141, 1.1734404793201255869e-157,\n  1.0451839188820145747e-108, 4.2386081393205242443e-125, 1.1062055705591186799e-141, 1.1734404793201255869e-157,\n  1.0451839188820145747e-108, 4.2386081393205242443e-125, 1.1062055705591186799e-141, 1.1734404793201255869e-157,\n  1.0451839188820145747e-108, 4.2386081393205242443e-125, 1.1062055705591186799e-141, 1.1734404793201255869e-157,\n  1.0451839188820145747e-108, 4.2386081393205242443e-125, 1.1062055705591186799e-141, 1.1734404793201255869e-157,\n  1.0451839188820145747e-108, 4.2386081393205242443e-125, 1.1062055705591186799e-141, 1.1734404793201255869e-157,\n  1.9359195088038447797e-109, -4.8867691298577234423e-126, -2.0587960670007819622e-142, -2.8326669474241479263e-158,\n  1.9359195088038447797e-109, -4.8867691298577234423e-126, -2.0587960670007819622e-142, -2.8326669474241479263e-158,\n  1.9359195088038447797e-109, -4.8867691298577234423e-126, -2.0587960670007819622e-142, -2.8326669474241479263e-158,\n  8.7142954880180709975e-110, -4.8867691298577234423e-126, -2.0587960670007819622e-142, -2.8326669474241479263e-158,\n  3.3918456880078814158e-110, 6.931443500908017045e-126, 1.1062055705591186799e-141, 1.1734404793201255869e-157,\n  7.3062078800278780675e-111, 1.0223371855251471293e-126, 1.2214168761472102282e-142, 8.0910098773220312367e-159,\n  7.3062078800278780675e-111, 1.0223371855251471293e-126, 1.2214168761472102282e-142, 8.0910098773220312367e-159,\n  6.5314563001514349095e-112, 9.9039323746573674262e-128, -8.6629775332868972816e-145, -1.5987060076657616072e-160,\n  6.5314563001514349095e-112, 9.9039323746573674262e-128, -8.6629775332868972816e-145, -1.5987060076657616072e-160,\n  6.5314563001514349095e-112, 9.9039323746573674262e-128, -8.6629775332868972816e-145, 
-1.5987060076657616072e-160,\n  6.5314563001514349095e-112, 9.9039323746573674262e-128, -8.6629775332868972816e-145, -1.5987060076657616072e-160,\n  2.3732923938934761454e-112, 6.7095375687163138915e-129, 1.6963686085056791706e-144, 1.2464251916751375716e-160,\n  2.9421044076449630171e-113, 6.7095375687163138915e-129, 1.6963686085056791706e-144, 1.2464251916751375716e-160,\n  2.9421044076449630171e-113, 6.7095375687163138915e-129, 1.6963686085056791706e-144, 1.2464251916751375716e-160,\n  2.9421044076449630171e-113, 6.7095375687163138915e-129, 1.6963686085056791706e-144, 1.2464251916751375716e-160,\n  3.4325196623373878948e-114, 9.3892593260023063019e-130, 9.4702132359198537748e-146, 1.7950099192230045857e-161,\n  3.4325196623373878948e-114, 9.3892593260023063019e-130, 9.4702132359198537748e-146, 1.7950099192230045857e-161,\n  3.4325196623373878948e-114, 9.3892593260023063019e-130, 9.4702132359198537748e-146, 1.7950099192230045857e-161,\n  1.8395411057335783574e-115, -7.8150389500644475446e-132, -3.9681466199873824165e-148, 2.9106774506606945839e-164,\n  1.8395411057335783574e-115, -7.8150389500644475446e-132, -3.9681466199873824165e-148, 2.9106774506606945839e-164,\n  1.8395411057335783574e-115, -7.8150389500644475446e-132, -3.9681466199873824165e-148, 2.9106774506606945839e-164,\n  1.8395411057335783574e-115, -7.8150389500644475446e-132, -3.9681466199873824165e-148, 2.9106774506606945839e-164,\n  1.8395411057335783574e-115, -7.8150389500644475446e-132, -3.9681466199873824165e-148, 2.9106774506606945839e-164,\n  8.2436437080731844263e-116, 1.4726412753514008951e-131, -3.9681466199873824165e-148, 2.9106774506606945839e-164,\n  3.1677600334418871069e-116, 3.4556869017247794521e-132, 8.544872724906996972e-148, 1.6802919634942429241e-163,\n  6.2981819612623816536e-117, 6.3800543877747317218e-133, 7.2423563434801054878e-149, 1.1741471776254779927e-164,\n  6.2981819612623816536e-117, 6.3800543877747317218e-133, 7.2423563434801054878e-149, 1.1741471776254779927e-164,\n  
6.2981819612623816536e-117, 6.3800543877747317218e-133, 7.2423563434801054878e-149, 1.1741471776254779927e-164,\n  3.1257546646178208289e-117, -6.6414926959353515111e-134, -5.7828074707888119584e-150, -1.2825052715093464343e-165,\n  1.5395410162955400644e-117, -6.6414926959353515111e-134, -5.7828074707888119584e-150, -1.2825052715093464343e-165,\n  7.4643419213439950602e-118, 1.0969016447485317626e-133, -5.7828074707888119584e-150, -1.2825052715093464343e-165,\n  3.4988078005382940294e-118, 2.1637618757749825688e-134, -8.9490928918944555247e-151, -1.9717385086233606481e-166,\n  1.5160407401354430737e-118, 2.1637618757749825688e-134, -8.9490928918944555247e-151, -1.9717385086233606481e-166,\n  5.2465720993401781599e-119, -3.755176715260116501e-136, 2.1571619860435648643e-152, 6.3257905089784152346e-168,\n  2.896544483330507019e-120, 3.1239284188885823808e-136, 2.1571619860435648643e-152, 6.3257905089784152346e-168,\n  2.896544483330507019e-120, 3.1239284188885823808e-136, 2.1571619860435648643e-152, 6.3257905089784152346e-168,\n  2.896544483330507019e-120, 3.1239284188885823808e-136, 2.1571619860435648643e-152, 6.3257905089784152346e-168,\n  2.896544483330507019e-120, 3.1239284188885823808e-136, 2.1571619860435648643e-152, 6.3257905089784152346e-168,\n  2.896544483330507019e-120, 3.1239284188885823808e-136, 2.1571619860435648643e-152, 6.3257905089784152346e-168,\n  1.3475077173907800538e-120, -3.156241481857667737e-137, -7.0684085473731388916e-153, -3.3573283875161501977e-170,\n  5.7298933442091639924e-121, -3.156241481857667737e-137, -7.0684085473731388916e-153, -3.3573283875161501977e-170,\n  1.8573014293598452896e-121, 1.1431992269852681095e-137, 2.4782675885631257398e-153, -3.3573283875161501977e-170,\n  1.8573014293598452896e-121, 1.1431992269852681095e-137, 2.4782675885631257398e-153, -3.3573283875161501977e-170,\n  8.8915345064751572143e-122, 1.1431992269852681095e-137, 2.4782675885631257398e-153, -3.3573283875161501977e-170,\n  4.0507946129135104481e-122, 
6.8339049774534147855e-139, 9.1598554579059548847e-155, -4.5159745404911825673e-172,\n  1.6304246661326865276e-122, 6.8339049774534147855e-139, 9.1598554579059548847e-155, -4.5159745404911825673e-172,\n  4.2023969274227456735e-123, 6.8339049774534147855e-139, 9.1598554579059548847e-155, -4.5159745404911825673e-172,\n  4.2023969274227456735e-123, 6.8339049774534147855e-139, 9.1598554579059548847e-155, -4.5159745404911825673e-172,\n  1.1769344939467164447e-123, 1.1602886988632691941e-140, 3.0307583960570927356e-156, 5.8345524661064369683e-172,\n  1.1769344939467164447e-123, 1.1602886988632691941e-140, 3.0307583960570927356e-156, 5.8345524661064369683e-172,\n  4.2056888557770896953e-124, 1.1602886988632691941e-140, 3.0307583960570927356e-156, 5.8345524661064369683e-172,\n  4.2386081393205242443e-125, 1.1062055705591186799e-141, 1.1734404793201255869e-157, 1.2381024895275844856e-174,\n  4.2386081393205242443e-125, 1.1062055705591186799e-141, 1.1734404793201255869e-157, 1.2381024895275844856e-174,\n  4.2386081393205242443e-125, 1.1062055705591186799e-141, 1.1734404793201255869e-157, 1.2381024895275844856e-174,\n  4.2386081393205242443e-125, 1.1062055705591186799e-141, 1.1734404793201255869e-157, 1.2381024895275844856e-174,\n  1.8749656131673758844e-125, 1.1062055705591186799e-141, 1.1734404793201255869e-157, 1.2381024895275844856e-174,\n  6.931443500908017045e-126, 1.1062055705591186799e-141, 1.1734404793201255869e-157, 1.2381024895275844856e-174,\n  1.0223371855251471293e-126, 1.2214168761472102282e-142, 8.0910098773220302259e-159, 1.2381024895275844856e-174,\n  1.0223371855251471293e-126, 1.2214168761472102282e-142, 8.0910098773220302259e-159, 1.2381024895275844856e-174,\n  1.0223371855251471293e-126, 1.2214168761472102282e-142, 8.0910098773220302259e-159, 1.2381024895275844856e-174,\n  2.8369889610228834887e-127, 4.0136364036021218058e-143, -1.0134099605688458828e-159, -2.5389576707476506925e-176,\n  2.8369889610228834887e-127, 4.0136364036021218058e-143, 
-1.0134099605688458828e-159, -2.5389576707476506925e-176,\n  9.9039323746573674262e-128, -8.6629775332868972816e-145, -1.5987060076657612913e-160, -2.5389576707476506925e-176,\n  6.7095375687163138915e-129, 1.6963686085056791706e-144, 1.2464251916751375716e-160, 6.197724948400014906e-177,\n  6.7095375687163138915e-129, 1.6963686085056791706e-144, 1.2464251916751375716e-160, 6.197724948400014906e-177,\n  6.7095375687163138915e-129, 1.6963686085056791706e-144, 1.2464251916751375716e-160, 6.197724948400014906e-177,\n  6.7095375687163138915e-129, 1.6963686085056791706e-144, 1.2464251916751375716e-160, 6.197724948400014906e-177,\n  9.3892593260023063019e-130, 9.4702132359198537748e-146, 1.7950099192230045857e-161, -1.6991004655691155518e-177,\n  9.3892593260023063019e-130, 9.4702132359198537748e-146, 1.7950099192230045857e-161, -1.6991004655691155518e-177,\n  9.3892593260023063019e-130, 9.4702132359198537748e-146, 1.7950099192230045857e-161, -1.6991004655691155518e-177,\n  2.175994780857201024e-130, 1.4618808551874518553e-146, 1.6802919634942426156e-163, 2.8330093736631818036e-179,\n  2.175994780857201024e-130, 1.4618808551874518553e-146, 1.6802919634942426156e-163, 2.8330093736631818036e-179,\n  3.7267864457092460442e-131, 4.6083930759590139305e-147, 1.6802919634942426156e-163, 2.8330093736631818036e-179,\n  3.7267864457092460442e-131, 4.6083930759590139305e-147, 1.6802919634942426156e-163, 2.8330093736631818036e-179,\n  3.7267864457092460442e-131, 4.6083930759590139305e-147, 1.6802919634942426156e-163, 2.8330093736631818036e-179,\n  1.4726412753514008951e-131, -3.9681466199873824165e-148, 2.9106774506606941983e-164, 5.1948630316441296498e-180,\n  3.4556869017247794521e-132, 8.544872724906996972e-148, 1.6802919634942426156e-163, 2.8330093736631818036e-179,\n  3.4556869017247794521e-132, 8.544872724906996972e-148, 1.6802919634942426156e-163, 2.8330093736631818036e-179,\n  6.3800543877747317218e-133, 7.2423563434801054878e-149, 1.1741471776254777999e-164, 
1.3389912474795152755e-180,\n  6.3800543877747317218e-133, 7.2423563434801054878e-149, 1.1741471776254777999e-164, 1.3389912474795152755e-180,\n  6.3800543877747317218e-133, 7.2423563434801054878e-149, 1.1741471776254777999e-164, 1.3389912474795152755e-180,\n  2.8579525590905986764e-133, -5.7828074707888119584e-150, -1.2825052715093464343e-165, -1.0696067158221530218e-181,\n  1.0969016447485317626e-133, -5.7828074707888119584e-150, -1.2825052715093464343e-165, -1.0696067158221530218e-181,\n  2.1637618757749825688e-134, -8.9490928918944555247e-151, -1.9717385086233606481e-166, 1.3535321672928907047e-182,\n  2.1637618757749825688e-134, -8.9490928918944555247e-151, -1.9717385086233606481e-166, 1.3535321672928907047e-182,\n  2.1637618757749825688e-134, -8.9490928918944555247e-151, -1.9717385086233606481e-166, 1.3535321672928907047e-182,\n  1.0631050543111905033e-134, 1.5490398016102376505e-150, 3.4549185946116918017e-166, 1.3535321672928907047e-182,\n  5.1277664357929471499e-135, 3.2706525621039604902e-151, 7.4159004299416557678e-167, 1.3535321672928907047e-182,\n  2.3761243821334675971e-135, 3.2706525621039604902e-151, 7.4159004299416557678e-167, 1.3535321672928907047e-182,\n  1.0003033553037281263e-135, 2.1571619860435648643e-152, 6.3257905089784152346e-168, 3.5607241064750984115e-184,\n  3.1239284188885823808e-136, 2.1571619860435648643e-152, 6.3257905089784152346e-168, 3.5607241064750984115e-184,\n  3.1239284188885823808e-136, 2.1571619860435648643e-152, 6.3257905089784152346e-168, 3.5607241064750984115e-184,\n  1.4041521353514076604e-136, 2.1571619860435648643e-152, 6.3257905089784152346e-168, 3.5607241064750984115e-184,\n  5.4426399358282049106e-137, 2.4782675885631257398e-153, -3.3573283875161501977e-170, 3.0568054078295488291e-186,\n  1.1431992269852681095e-137, 2.4782675885631257398e-153, -3.3573283875161501977e-170, 3.0568054078295488291e-186,\n  1.1431992269852681095e-137, 2.4782675885631257398e-153, -3.3573283875161501977e-170, 3.0568054078295488291e-186,\n 
 6.8339049774534147855e-139, 9.1598554579059548847e-155, -4.5159745404911819927e-172, -4.5870810097328578981e-188,\n  6.8339049774534147855e-139, 9.1598554579059548847e-155, -4.5159745404911819927e-172, -4.5870810097328578981e-188,\n  6.8339049774534147855e-139, 9.1598554579059548847e-155, -4.5159745404911819927e-172, -4.5870810097328578981e-188,\n  6.8339049774534147855e-139, 9.1598554579059548847e-155, -4.5159745404911819927e-172, -4.5870810097328578981e-188,\n  1.1602886988632691941e-140, 3.0307583960570927356e-156, 5.8345524661064358191e-172, 6.9043123899963188689e-188,\n  1.1602886988632691941e-140, 3.0307583960570927356e-156, 5.8345524661064358191e-172, 6.9043123899963188689e-188,\n  1.1602886988632691941e-140, 3.0307583960570927356e-156, 5.8345524661064358191e-172, 6.9043123899963188689e-188,\n  1.1602886988632691941e-140, 3.0307583960570927356e-156, 5.8345524661064358191e-172, 6.9043123899963188689e-188,\n  1.1602886988632691941e-140, 3.0307583960570927356e-156, 5.8345524661064358191e-172, 6.9043123899963188689e-188,\n  1.1602886988632691941e-140, 3.0307583960570927356e-156, 5.8345524661064358191e-172, 6.9043123899963188689e-188,\n  1.1062055705591186799e-141, 1.1734404793201255869e-157, 1.2381024895275844856e-174, -8.4789520282639751913e-191,\n  1.1062055705591186799e-141, 1.1734404793201255869e-157, 1.2381024895275844856e-174, -8.4789520282639751913e-191,\n  1.1062055705591186799e-141, 1.1734404793201255869e-157, 1.2381024895275844856e-174, -8.4789520282639751913e-191,\n  1.1062055705591186799e-141, 1.1734404793201255869e-157, 1.2381024895275844856e-174, -8.4789520282639751913e-191,\n  4.5016298192952031469e-142, -2.8326669474241479263e-158, 1.2381024895275844856e-174, -8.4789520282639751913e-191,\n  1.2214168761472102282e-142, 8.0910098773220302259e-159, 1.2381024895275844856e-174, -8.4789520282639751913e-191,\n  1.2214168761472102282e-142, 8.0910098773220302259e-159, 1.2381024895275844856e-174, -8.4789520282639751913e-191,\n  4.0136364036021218058e-143, 
-1.0134099605688458828e-159, -2.5389576707476506925e-176, -6.2404128071707654958e-193,\n  4.0136364036021218058e-143, -1.0134099605688458828e-159, -2.5389576707476506925e-176, -6.2404128071707654958e-193,\n  1.9635033141346264592e-143, -1.0134099605688458828e-159, -2.5389576707476506925e-176, -6.2404128071707654958e-193,\n  9.3843676940087855824e-144, 1.2626949989038732076e-159, 2.2730883653953564668e-175, 2.7431118386590483722e-191,\n  4.2590349703400483539e-144, 1.2464251916751375716e-160, 6.1977249484000140293e-177, 1.1294061984896458822e-192,\n  1.6963686085056791706e-144, 1.2464251916751375716e-160, 6.1977249484000140293e-177, 1.1294061984896458822e-192,\n  4.1503542758849472122e-145, -1.7614040799531193879e-161, -1.6991004655691153326e-177, -1.856794109153959173e-193,\n  4.1503542758849472122e-145, -1.7614040799531193879e-161, -1.6991004655691153326e-177, -1.856794109153959173e-193,\n  9.4702132359198537748e-146, 1.7950099192230045857e-161, -1.6991004655691153326e-177, -1.856794109153959173e-193,\n  9.4702132359198537748e-146, 1.7950099192230045857e-161, -1.6991004655691153326e-177, -1.856794109153959173e-193,\n  1.4618808551874518553e-146, 1.6802919634942426156e-163, 2.8330093736631818036e-179, -7.4549709281190454638e-196,\n  1.4618808551874518553e-146, 1.6802919634942426156e-163, 2.8330093736631818036e-179, -7.4549709281190454638e-196,\n  1.4618808551874518553e-146, 1.6802919634942426156e-163, 2.8330093736631818036e-179, -7.4549709281190454638e-196,\n  4.6083930759590139305e-147, 1.6802919634942426156e-163, 2.8330093736631818036e-179, -7.4549709281190454638e-196,\n  4.6083930759590139305e-147, 1.6802919634942426156e-163, 2.8330093736631818036e-179, -7.4549709281190454638e-196,\n  2.105789206980137775e-147, 1.6802919634942426156e-163, 2.8330093736631818036e-179, -7.4549709281190454638e-196,\n  8.544872724906996972e-148, 1.6802919634942426156e-163, 2.8330093736631818036e-179, -7.4549709281190454638e-196,\n  2.2883630524598079723e-148, 
2.9106774506606941983e-164, 5.1948630316441287936e-180, 9.6685396110091032843e-196,\n  2.2883630524598079723e-148, 2.9106774506606941983e-164, 5.1948630316441287936e-180, 9.6685396110091032843e-196,\n  7.2423563434801054878e-149, 1.1741471776254777999e-164, 1.3389912474795150614e-180, 1.1067843414450286726e-196,\n  7.2423563434801054878e-149, 1.1741471776254777999e-164, 1.3389912474795150614e-180, 1.1067843414450286726e-196,\n  3.3320377982006123631e-149, 3.0588204110786950436e-165, 3.7502330143836152136e-181, 3.6564932749519464998e-198,\n  1.3768785255608653665e-149, 3.0588204110786950436e-165, 3.7502330143836152136e-181, 3.6564932749519464998e-198,\n  3.9929888924099219388e-150, -1.9717385086233606481e-166, 1.3535321672928907047e-182, 3.1205762277848031878e-199,\n  3.9929888924099219388e-150, -1.9717385086233606481e-166, 1.3535321672928907047e-182, 3.1205762277848031878e-199,\n  1.5490398016102376505e-150, 3.4549185946116918017e-166, 1.3535321672928907047e-182, 3.1205762277848031878e-199,\n  3.2706525621039604902e-151, 7.4159004299416557678e-167, 1.3535321672928907047e-182, 3.1205762277848031878e-199,\n  3.2706525621039604902e-151, 7.4159004299416557678e-167, 1.3535321672928907047e-182, 3.1205762277848031878e-199,\n  2.1571619860435648643e-152, 6.3257905089784152346e-168, 3.5607241064750984115e-184, -1.4832196127821708615e-201,\n  2.1571619860435648643e-152, 6.3257905089784152346e-168, 3.5607241064750984115e-184, -1.4832196127821708615e-201,\n  2.1571619860435648643e-152, 6.3257905089784152346e-168, 3.5607241064750984115e-184, -1.4832196127821708615e-201,\n  2.1571619860435648643e-152, 6.3257905089784152346e-168, 3.5607241064750984115e-184, -1.4832196127821708615e-201,\n  2.4782675885631257398e-153, -3.3573283875161501977e-170, 3.0568054078295488291e-186, 1.4980560800565462618e-202,\n  2.4782675885631257398e-153, -3.3573283875161501977e-170, 3.0568054078295488291e-186, 1.4980560800565462618e-202,\n  2.4782675885631257398e-153, -3.3573283875161501977e-170, 
3.0568054078295488291e-186, 1.4980560800565462618e-202,\n  9.1598554579059548847e-155, -4.5159745404911819927e-172, -4.5870810097328572602e-188, -3.2905064432040069127e-204,\n  9.1598554579059548847e-155, -4.5159745404911819927e-172, -4.5870810097328572602e-188, -3.2905064432040069127e-204,\n  9.1598554579059548847e-155, -4.5159745404911819927e-172, -4.5870810097328572602e-188, -3.2905064432040069127e-204,\n  9.1598554579059548847e-155, -4.5159745404911819927e-172, -4.5870810097328572602e-188, -3.2905064432040069127e-204,\n  9.1598554579059548847e-155, -4.5159745404911819927e-172, -4.5870810097328572602e-188, -3.2905064432040069127e-204,\n  1.7015147267057481414e-155, -4.5159745404911819927e-172, -4.5870810097328572602e-188, -3.2905064432040069127e-204,\n  1.7015147267057481414e-155, -4.5159745404911819927e-172, -4.5870810097328572602e-188, -3.2905064432040069127e-204,\n  1.7015147267057481414e-155, -4.5159745404911819927e-172, -4.5870810097328572602e-188, -3.2905064432040069127e-204,\n  7.6922213530572229852e-156, -4.5159745404911819927e-172, -4.5870810097328572602e-188, -3.2905064432040069127e-204,\n  3.0307583960570927356e-156, 5.8345524661064358191e-172, 6.9043123899963188689e-188, -3.2905064432040069127e-204,\n  7.0002691755702864582e-157, 6.5928896280762691321e-173, 1.1586156901317304854e-188, -1.0100405885278530137e-205,\n  7.0002691755702864582e-157, 6.5928896280762691321e-173, 1.1586156901317304854e-188, -1.0100405885278530137e-205,\n  1.1734404793201255869e-157, 1.2381024895275844856e-174, -8.4789520282639751913e-191, -1.3321093418096261919e-207,\n  1.1734404793201255869e-157, 1.2381024895275844856e-174, -8.4789520282639751913e-191, -1.3321093418096261919e-207,\n  1.1734404793201255869e-157, 1.2381024895275844856e-174, -8.4789520282639751913e-191, -1.3321093418096261919e-207,\n  4.4508689228885539715e-158, 1.2381024895275844856e-174, -8.4789520282639751913e-191, -1.3321093418096261919e-207,\n  8.0910098773220302259e-159, 1.2381024895275844856e-174, 
-8.4789520282639751913e-191, -1.3321093418096261919e-207,\n  8.0910098773220302259e-159, 1.2381024895275844856e-174, -8.4789520282639751913e-191, -1.3321093418096261919e-207,\n  8.0910098773220302259e-159, 1.2381024895275844856e-174, -8.4789520282639751913e-191, -1.3321093418096261919e-207,\n  3.5387999583765925506e-159, 2.2730883653953564668e-175, 2.7431118386590483722e-191, -1.3321093418096261919e-207,\n  1.2626949989038732076e-159, 2.2730883653953564668e-175, 2.7431118386590483722e-191, -1.3321093418096261919e-207,\n  1.2464251916751375716e-160, 6.1977249484000140293e-177, 1.1294061984896456875e-192, 2.2526486929936882202e-208,\n  1.2464251916751375716e-160, 6.1977249484000140293e-177, 1.1294061984896456875e-192, 2.2526486929936882202e-208,\n  1.2464251916751375716e-160, 6.1977249484000140293e-177, 1.1294061984896456875e-192, 2.2526486929936882202e-208,\n  1.2464251916751375716e-160, 6.1977249484000140293e-177, 1.1294061984896456875e-192, 2.2526486929936882202e-208,\n  5.3514239183991277695e-161, 6.1977249484000140293e-177, 1.1294061984896456875e-192, 2.2526486929936882202e-208,\n  1.7950099192230045857e-161, -1.6991004655691153326e-177, -1.8567941091539589297e-193, -1.8074851186411640793e-209,\n  1.6802919634942426156e-163, 2.8330093736631818036e-179, -7.4549709281190454638e-196, -1.4481306607622412036e-212,\n  1.6802919634942426156e-163, 2.8330093736631818036e-179, -7.4549709281190454638e-196, -1.4481306607622412036e-212,\n  1.6802919634942426156e-163, 2.8330093736631818036e-179, -7.4549709281190454638e-196, -1.4481306607622412036e-212,\n  1.6802919634942426156e-163, 2.8330093736631818036e-179, -7.4549709281190454638e-196, -1.4481306607622412036e-212,\n  1.6802919634942426156e-163, 2.8330093736631818036e-179, -7.4549709281190454638e-196, -1.4481306607622412036e-212,\n  1.6802919634942426156e-163, 2.8330093736631818036e-179, -7.4549709281190454638e-196, -1.4481306607622412036e-212,\n  1.6802919634942426156e-163, 2.8330093736631818036e-179, 
-7.4549709281190454638e-196, -1.4481306607622412036e-212,\n  2.9106774506606941983e-164, 5.1948630316441287936e-180, 9.6685396110091013832e-196, 1.7562785002189357559e-211,\n  2.9106774506606941983e-164, 5.1948630316441287936e-180, 9.6685396110091013832e-196, 1.7562785002189357559e-211,\n  2.9106774506606941983e-164, 5.1948630316441287936e-180, 9.6685396110091013832e-196, 1.7562785002189357559e-211,\n  1.1741471776254777999e-164, 1.3389912474795150614e-180, 1.106784341445028435e-196, 3.3045982549756583552e-212,\n  3.0588204110786950436e-165, 3.7502330143836152136e-181, 3.6564932749519464998e-198, 3.7097125405852507464e-214,\n  3.0588204110786950436e-165, 3.7502330143836152136e-181, 3.6564932749519464998e-198, 3.7097125405852507464e-214,\n  8.8815756978467430465e-166, 1.3403131492807310959e-181, 3.6564932749519464998e-198, 3.7097125405852507464e-214,\n  8.8815756978467430465e-166, 1.3403131492807310959e-181, 3.6564932749519464998e-198, 3.7097125405852507464e-214,\n  3.4549185946116918017e-166, 1.3535321672928907047e-182, 3.1205762277848031878e-199, -3.3569248349832580936e-217,\n  7.4159004299416557678e-167, 1.3535321672928907047e-182, 3.1205762277848031878e-199, -3.3569248349832580936e-217,\n  7.4159004299416557678e-167, 1.3535321672928907047e-182, 3.1205762277848031878e-199, -3.3569248349832580936e-217,\n  6.3257905089784152346e-168, 3.5607241064750984115e-184, -1.4832196127821708615e-201, 2.6911956484118910092e-218,\n  6.3257905089784152346e-168, 3.5607241064750984115e-184, -1.4832196127821708615e-201, 2.6911956484118910092e-218,\n  6.3257905089784152346e-168, 3.5607241064750984115e-184, -1.4832196127821708615e-201, 2.6911956484118910092e-218,\n  6.3257905089784152346e-168, 3.5607241064750984115e-184, -1.4832196127821708615e-201, 2.6911956484118910092e-218,\n  2.0862146470760309789e-168, -1.146150630053972131e-184, -1.4832196127821708615e-201, 2.6911956484118910092e-218,\n  2.0862146470760309789e-168, -1.146150630053972131e-184, -1.4832196127821708615e-201, 
2.6911956484118910092e-218,\n  1.026320681600434562e-168, 1.2072867382105631402e-184, -1.4832196127821708615e-201, 2.6911956484118910092e-218,\n  4.9637369886263658882e-169, 3.0568054078295488291e-186, 1.4980560800565460352e-202, 2.6911956484118910092e-218,\n  2.3140020749373754342e-169, 3.0568054078295488291e-186, 1.4980560800565460352e-202, 2.6911956484118910092e-218,\n  9.8913461809288020723e-170, 3.0568054078295488291e-186, 1.4980560800565460352e-202, 2.6911956484118910092e-218,\n  3.2670088967063259373e-170, 3.0568054078295488291e-186, 1.4980560800565460352e-202, 2.6911956484118910092e-218,\n  3.2670088967063259373e-170, 3.0568054078295488291e-186, 1.4980560800565460352e-202, 2.6911956484118910092e-218,\n  1.6109245756507072713e-170, -6.2044048008378732802e-187, -5.4322544592823556944e-203, 4.2491789852161138683e-219,\n  7.8288241512289757055e-171, 1.2181824638728806485e-186, 1.4980560800565460352e-202, 2.6911956484118910092e-218,\n  3.6886133485899290404e-171, 2.9887099189454666024e-187, 4.774153170641553462e-203, 4.2491789852161138683e-219,\n  1.6185079472704052482e-171, 2.9887099189454666024e-187, 4.774153170641553462e-203, 4.2491789852161138683e-219,\n  5.8345524661064358191e-172, 6.9043123899963188689e-188, -3.2905064432040069127e-204, -9.1795828160190082842e-224,\n  6.5928896280762691321e-173, 1.1586156901317304854e-188, -1.0100405885278530137e-205, -9.1795828160190082842e-224,\n  6.5928896280762691321e-173, 1.1586156901317304854e-188, -1.0100405885278530137e-205, -9.1795828160190082842e-224,\n  6.5928896280762691321e-173, 1.1586156901317304854e-188, -1.0100405885278530137e-205, -9.1795828160190082842e-224,\n  1.2381024895275844856e-174, -8.4789520282639751913e-191, -1.332109341809626019e-207, -9.1795828160190082842e-224,\n  1.2381024895275844856e-174, -8.4789520282639751913e-191, -1.332109341809626019e-207, -9.1795828160190082842e-224,\n  1.2381024895275844856e-174, -8.4789520282639751913e-191, -1.332109341809626019e-207, -9.1795828160190082842e-224,\n  
1.2381024895275844856e-174, -8.4789520282639751913e-191, -1.332109341809626019e-207, -9.1795828160190082842e-224,\n  1.2381024895275844856e-174, -8.4789520282639751913e-191, -1.332109341809626019e-207, -9.1795828160190082842e-224,\n  1.2381024895275844856e-174, -8.4789520282639751913e-191, -1.332109341809626019e-207, -9.1795828160190082842e-224,\n  2.2730883653953564668e-175, 2.7431118386590483722e-191, -1.332109341809626019e-207, -9.1795828160190082842e-224,\n  2.2730883653953564668e-175, 2.7431118386590483722e-191, -1.332109341809626019e-207, -9.1795828160190082842e-224,\n  2.2730883653953564668e-175, 2.7431118386590483722e-191, -1.332109341809626019e-207, -9.1795828160190082842e-224,\n  1.0095962991602958391e-175, -6.2404128071707654958e-193, 3.0593092910744445285e-209, 5.4622616159087170031e-225,\n  3.7785026604276538491e-176, -6.2404128071707654958e-193, 3.0593092910744445285e-209, 5.4622616159087170031e-225,\n  6.1977249484000140293e-177, 1.1294061984896456875e-192, 2.2526486929936882202e-208, -5.3441928036578162463e-225,\n  6.1977249484000140293e-177, 1.1294061984896456875e-192, 2.2526486929936882202e-208, -5.3441928036578162463e-225,\n  6.1977249484000140293e-177, 1.1294061984896456875e-192, 2.2526486929936882202e-208, -5.3441928036578162463e-225,\n  2.2493122414154495675e-177, 2.5268245888628466632e-193, 3.0593092910744445285e-209, 5.4622616159087170031e-225,\n  2.7510588792316711745e-178, 3.3501523985444386676e-194, 6.2591208621664049475e-210, 5.9034406125450500218e-227,\n  2.7510588792316711745e-178, 3.3501523985444386676e-194, 6.2591208621664049475e-210, 5.9034406125450500218e-227,\n  2.7510588792316711745e-178, 3.3501523985444386676e-194, 6.2591208621664049475e-210, 5.9034406125450500218e-227,\n  2.8330093736631818036e-179, -7.4549709281190454638e-196, -1.4481306607622412036e-212, 9.9192633285681635836e-229,\n  2.8330093736631818036e-179, -7.4549709281190454638e-196, -1.4481306607622412036e-212, 9.9192633285681635836e-229,\n  
2.8330093736631818036e-179, -7.4549709281190454638e-196, -1.4481306607622412036e-212, 9.9192633285681635836e-229,\n  2.8330093736631818036e-179, -7.4549709281190454638e-196, -1.4481306607622412036e-212, 9.9192633285681635836e-229,\n  1.2906606599973359683e-179, -7.4549709281190454638e-196, -1.4481306607622412036e-212, 9.9192633285681635836e-229,\n  5.1948630316441287936e-180, 9.6685396110091013832e-196, 1.7562785002189355449e-211, 1.6821693549018732055e-227,\n  1.3389912474795150614e-180, 1.106784341445028435e-196, 3.3045982549756578275e-212, 6.2685154049107876715e-228,\n  1.3389912474795150614e-180, 1.106784341445028435e-196, 3.3045982549756578275e-212, 6.2685154049107876715e-228,\n  3.7502330143836152136e-181, 3.6564932749519464998e-198, 3.7097125405852507464e-214, 2.5658818466966882188e-231,\n  3.7502330143836152136e-181, 3.6564932749519464998e-198, 3.7097125405852507464e-214, 2.5658818466966882188e-231,\n  1.3403131492807310959e-181, 3.6564932749519464998e-198, 3.7097125405852507464e-214, 2.5658818466966882188e-231,\n  1.3535321672928907047e-182, 3.1205762277848031878e-199, -3.3569248349832580936e-217, -1.0577661142165146927e-233,\n  1.3535321672928907047e-182, 3.1205762277848031878e-199, -3.3569248349832580936e-217, -1.0577661142165146927e-233,\n  1.3535321672928907047e-182, 3.1205762277848031878e-199, -3.3569248349832580936e-217, -1.0577661142165146927e-233,\n  1.3535321672928907047e-182, 3.1205762277848031878e-199, -3.3569248349832580936e-217, -1.0577661142165146927e-233,\n  6.0043220944823941786e-183, 3.1205762277848031878e-199, -3.3569248349832580936e-217, -1.0577661142165146927e-233,\n  2.2388223052591377446e-183, 3.1205762277848031878e-199, -3.3569248349832580936e-217, -1.0577661142165146927e-233,\n  3.5607241064750984115e-184, -1.4832196127821708615e-201, 2.6911956484118910092e-218, -5.1336618966962585332e-235,\n  3.5607241064750984115e-184, -1.4832196127821708615e-201, 2.6911956484118910092e-218, -5.1336618966962585332e-235,\n  
3.5607241064750984115e-184, -1.4832196127821708615e-201, 2.6911956484118910092e-218, -5.1336618966962585332e-235,\n  1.2072867382105631402e-184, -1.4832196127821708615e-201, 2.6911956484118910092e-218, -5.1336618966962585332e-235,\n  3.0568054078295488291e-186, 1.4980560800565460352e-202, 2.6911956484118910092e-218, -5.1336618966962585332e-235,\n  3.0568054078295488291e-186, 1.4980560800565460352e-202, 2.6911956484118910092e-218, -5.1336618966962585332e-235,\n  3.0568054078295488291e-186, 1.4980560800565460352e-202, 2.6911956484118910092e-218, -5.1336618966962585332e-235,\n  3.0568054078295488291e-186, 1.4980560800565460352e-202, 2.6911956484118910092e-218, -5.1336618966962585332e-235,\n  3.0568054078295488291e-186, 1.4980560800565460352e-202, 2.6911956484118910092e-218, -5.1336618966962585332e-235,\n  3.0568054078295488291e-186, 1.4980560800565460352e-202, 2.6911956484118910092e-218, -5.1336618966962585332e-235,\n  1.2181824638728806485e-186, 1.4980560800565460352e-202, 2.6911956484118910092e-218, -5.1336618966962585332e-235,\n  2.9887099189454666024e-187, 4.774153170641553462e-203, 4.2491789852161132393e-219, 7.4467067939231424594e-235,\n  2.9887099189454666024e-187, 4.774153170641553462e-203, 4.2491789852161132393e-219, 7.4467067939231424594e-235,\n  6.9043123899963188689e-188, -3.2905064432040069127e-204, -9.1795828160190063645e-224, -2.3569545504732004486e-239,\n  6.9043123899963188689e-188, -3.2905064432040069127e-204, -9.1795828160190063645e-224, -2.3569545504732004486e-239,\n  1.1586156901317304854e-188, -1.0100405885278530137e-205, -9.1795828160190063645e-224, -2.3569545504732004486e-239,\n  1.1586156901317304854e-188, -1.0100405885278530137e-205, -9.1795828160190063645e-224, -2.3569545504732004486e-239,\n  1.1586156901317304854e-188, -1.0100405885278530137e-205, -9.1795828160190063645e-224, -2.3569545504732004486e-239,\n  4.4040360264865697732e-189, -1.0100405885278530137e-205, -9.1795828160190063645e-224, -2.3569545504732004486e-239,\n  
8.129755890712020335e-190, 9.8339840169166049336e-206, -9.1795828160190063645e-224, -2.3569545504732004486e-239,\n  8.129755890712020335e-190, 9.8339840169166049336e-206, -9.1795828160190063645e-224, -2.3569545504732004486e-239,\n  8.129755890712020335e-190, 9.8339840169166049336e-206, -9.1795828160190063645e-224, -2.3569545504732004486e-239,\n  3.6409303439428119063e-190, -1.332109341809626019e-207, -9.1795828160190063645e-224, -2.3569545504732004486e-239,\n  1.3965175705582071936e-190, -1.332109341809626019e-207, -9.1795828160190063645e-224, -2.3569545504732004486e-239,\n  2.7431118386590483722e-191, -1.332109341809626019e-207, -9.1795828160190063645e-224, -2.3569545504732004486e-239,\n  2.7431118386590483722e-191, -1.332109341809626019e-207, -9.1795828160190063645e-224, -2.3569545504732004486e-239,\n  2.7431118386590483722e-191, -1.332109341809626019e-207, -9.1795828160190063645e-224, -2.3569545504732004486e-239,\n  1.3403538552936701153e-191, 1.7826390804083638359e-207, -9.1795828160190063645e-224, -2.3569545504732004486e-239,\n  6.389748636109812983e-192, 2.2526486929936882202e-208, -5.3441928036578156465e-225, -7.741539335184153052e-241,\n  2.8828536776963681193e-192, 2.2526486929936882202e-208, -5.3441928036578156465e-225, -7.741539335184153052e-241,\n  1.1294061984896456875e-192, 2.2526486929936882202e-208, -5.3441928036578156465e-225, -7.741539335184153052e-241,\n  2.5268245888628466632e-193, 3.0593092910744445285e-209, 5.4622616159087170031e-225, 4.2560351759808952526e-241,\n  2.5268245888628466632e-193, 3.0593092910744445285e-209, 5.4622616159087170031e-225, 4.2560351759808952526e-241,\n  3.3501523985444386676e-194, 6.2591208621664049475e-210, 5.9034406125450490845e-227, 1.3186893776791012681e-242,\n  3.3501523985444386676e-194, 6.2591208621664049475e-210, 5.9034406125450490845e-227, 1.3186893776791012681e-242,\n  3.3501523985444386676e-194, 6.2591208621664049475e-210, 5.9034406125450490845e-227, 1.3186893776791012681e-242,\n  6.1039071228393547627e-195, 
1.7562785002189355449e-211, 1.6821693549018732055e-227, -8.7276385348052817035e-244,\n  6.1039071228393547627e-195, 1.7562785002189355449e-211, 1.6821693549018732055e-227, -8.7276385348052817035e-244,\n  6.1039071228393547627e-195, 1.7562785002189355449e-211, 1.6821693549018732055e-227, -8.7276385348052817035e-244,\n  2.6792050150137250131e-195, 1.7562785002189355449e-211, 1.6821693549018732055e-227, -8.7276385348052817035e-244,\n  9.6685396110091013832e-196, 1.7562785002189355449e-211, 1.6821693549018732055e-227, -8.7276385348052817035e-244,\n  2.0416567491425607157e-177, 6.0959078275963141821e-193, 1.156336993964950812e-208, 2.7126166236326293347e-224,\n  2.0416567491425607157e-177, 6.0959078275963141821e-193, 1.156336993964950812e-208, 2.7126166236326293347e-224,\n  2.0416567491425607157e-177, 6.0959078275963141821e-193, 1.156336993964950812e-208, 2.7126166236326293347e-224,\n  6.7450395650278649168e-179, 6.8432117823206978686e-195, 4.7332165749391048364e-212, 4.4984059688774601837e-228,\n  6.7450395650278649168e-179, 6.8432117823206978686e-195, 4.7332165749391048364e-212, 4.4984059688774601837e-228,\n  6.7450395650278649168e-179, 6.8432117823206978686e-195, 4.7332165749391048364e-212, 4.4984059688774601837e-228,\n  6.7450395650278649168e-179, 6.8432117823206978686e-195, 4.7332165749391048364e-212, 4.4984059688774601837e-228,\n  6.7450395650278649168e-179, 6.8432117823206978686e-195, 4.7332165749391048364e-212, 4.4984059688774601837e-228,\n  5.756447103644822603e-180, -6.1924333305615830735e-198, -1.9512340798794268979e-214, -3.6162764918921697356e-230,\n  5.756447103644822603e-180, -6.1924333305615830735e-198, -1.9512340798794268979e-214, -3.6162764918921697356e-230,\n  5.756447103644822603e-180, -6.1924333305615830735e-198, -1.9512340798794268979e-214, -3.6162764918921697356e-230,\n  5.756447103644822603e-180, -6.1924333305615830735e-198, -1.9512340798794268979e-214, -3.6162764918921697356e-230,\n  1.9005753194802080146e-180, -6.1924333305615830735e-198, 
-1.9512340798794268979e-214, -3.6162764918921697356e-230,\n  1.9005753194802080146e-180, -6.1924333305615830735e-198, -1.9512340798794268979e-214, -3.6162764918921697356e-230,\n  9.3660737343905436753e-181, -6.1924333305615830735e-198, -1.9512340798794268979e-214, -3.6162764918921697356e-230,\n  4.5462340041847754398e-181, -6.1924333305615830735e-198, -1.9512340798794268979e-214, -3.6162764918921697356e-230,\n  2.1363141390818913221e-181, -6.1924333305615830735e-198, -1.9512340798794268979e-214, -3.6162764918921697356e-230,\n  9.3135420653044926323e-182, -6.1924333305615830735e-198, -1.9512340798794268979e-214, -3.6162764918921697356e-230,\n  3.2887424025472810002e-182, 7.185309278132283136e-198, -1.9512340798794268979e-214, -3.6162764918921697356e-230,\n  2.7634257116867652192e-183, 4.9643797378534984559e-199, -9.4699347169310243473e-216, -9.2331809177749095611e-233,\n  2.7634257116867652192e-183, 4.9643797378534984559e-199, -9.4699347169310243473e-216, -9.2331809177749095611e-233,\n  2.7634257116867652192e-183, 4.9643797378534984559e-199, -9.4699347169310243473e-216, -9.2331809177749095611e-233,\n  2.7634257116867652192e-183, 4.9643797378534984559e-199, -9.4699347169310243473e-216, -9.2331809177749095611e-233,\n  8.806758170751374203e-184, 7.8383517263666503337e-200, 1.3736749441945438342e-215, -9.2331809177749095611e-233,\n  8.806758170751374203e-184, 7.8383517263666503337e-200, 1.3736749441945438342e-215, -9.2331809177749095611e-233,\n  4.0998834342223036605e-184, 7.8383517263666503337e-200, 1.3736749441945438342e-215, -9.2331809177749095611e-233,\n  1.7464460659577689118e-184, 2.612671019845610006e-200, 2.1334073625072069974e-216, -9.2331809177749095611e-233,\n  5.697273818255015375e-185, -1.6933341491052464293e-204, -4.3478137385944270631e-220, -2.3353910329236990725e-236,\n  5.697273818255015375e-185, -1.6933341491052464293e-204, -4.3478137385944270631e-220, -2.3353910329236990725e-236,\n  2.755477107924346286e-185, -1.6933341491052464293e-204, 
-4.3478137385944270631e-220, -2.3353910329236990725e-236,\n  1.2845787527590117414e-185, -1.6933341491052464293e-204, -4.3478137385944270631e-220, -2.3353910329236990725e-236,\n  5.4912957517634446918e-186, -1.6933341491052464293e-204, -4.3478137385944270631e-220, -2.3353910329236990725e-236,\n  1.8140498638501083305e-186, -1.6933341491052464293e-204, -4.3478137385944270631e-220, -2.3353910329236990725e-236,\n  1.8140498638501083305e-186, -1.6933341491052464293e-204, -4.3478137385944270631e-220, -2.3353910329236990725e-236,\n  8.9473839187177424013e-187, -1.6933341491052464293e-204, -4.3478137385944270631e-220, -2.3353910329236990725e-236,\n  4.3508265588260719497e-187, -1.6933341491052464293e-204, -4.3478137385944270631e-220, -2.3353910329236990725e-236,\n  2.0525478788802367239e-187, -1.6933341491052464293e-204, -4.3478137385944270631e-220, -2.3353910329236990725e-236,\n  9.0340853890731911095e-188, -1.6933341491052464293e-204, -4.3478137385944270631e-220, -2.3353910329236990725e-236,\n  3.288388689208603045e-188, -1.6933341491052464293e-204, -4.3478137385944270631e-220, -2.3353910329236990725e-236,\n  4.1554033927630885323e-189, -9.8582956929636044137e-206, -1.4280619485269765742e-221, 1.2171222696290252021e-237,\n  4.1554033927630885323e-189, -9.8582956929636044137e-206, -1.4280619485269765742e-221, 1.2171222696290252021e-237,\n  4.1554033927630885323e-189, -9.8582956929636044137e-206, -1.4280619485269765742e-221, 1.2171222696290252021e-237,\n  5.643429553477207926e-190, 1.0076094209231528444e-205, 7.8509991660024955813e-222, 1.2171222696290252021e-237,\n  5.643429553477207926e-190, 1.0076094209231528444e-205, 7.8509991660024955813e-222, 1.2171222696290252021e-237,\n  5.643429553477207926e-190, 1.0076094209231528444e-205, 7.8509991660024955813e-222, 1.2171222696290252021e-237,\n  1.1546040067079994973e-190, 1.0889925813396166947e-207, 2.4325525462765697993e-223, -1.1429360314275701698e-239,\n  1.1546040067079994973e-190, 1.0889925813396166947e-207, 
2.4325525462765697993e-223, -1.1429360314275701698e-239,\n  3.2397620015697148712e-192, 3.1030547578511949035e-208, -1.609965144193984205e-224, -1.8313007053436627876e-240,\n  3.2397620015697148712e-192, 3.1030547578511949035e-208, -1.609965144193984205e-224, -1.8313007053436627876e-240,\n  3.2397620015697148712e-192, 3.1030547578511949035e-208, -1.609965144193984205e-224, -1.8313007053436627876e-240,\n  3.2397620015697148712e-192, 3.1030547578511949035e-208, -1.609965144193984205e-224, -1.8313007053436627876e-240,\n  3.2397620015697148712e-192, 3.1030547578511949035e-208, -1.609965144193984205e-224, -1.8313007053436627876e-240,\n  3.2397620015697148712e-192, 3.1030547578511949035e-208, -1.609965144193984205e-224, -1.8313007053436627876e-240,\n  1.4863145223629928288e-192, -7.9038076992129241506e-209, -1.609965144193984205e-224, -1.8313007053436627876e-240,\n  6.0959078275963141821e-193, 1.156336993964950812e-208, 2.7126166236326293347e-224, -1.8313007053436627876e-240,\n  1.712289129579509076e-193, 1.8297811202182925249e-209, 1.1003018740995688645e-226, 5.827891678485165325e-243,\n  1.712289129579509076e-193, 1.8297811202182925249e-209, 1.1003018740995688645e-226, 5.827891678485165325e-243,\n  6.1638445507530779946e-194, -6.0361608463951204924e-210, 1.1003018740995688645e-226, 5.827891678485165325e-243,\n  6.8432117823206978686e-195, 4.7332165749391048364e-212, 4.4984059688774601837e-228, -3.029900079464340522e-245,\n  6.8432117823206978686e-195, 4.7332165749391048364e-212, 4.4984059688774601837e-228, -3.029900079464340522e-245,\n  6.8432117823206978686e-195, 4.7332165749391048364e-212, 4.4984059688774601837e-228, -3.029900079464340522e-245,\n  6.8432117823206978686e-195, 4.7332165749391048364e-212, 4.4984059688774601837e-228, -3.029900079464340522e-245,\n  3.418509674495068119e-195, 4.7332165749391048364e-212, 4.4984059688774601837e-228, -3.029900079464340522e-245,\n  1.7061586205822532442e-195, 4.7332165749391048364e-212, 4.4984059688774601837e-228, 
-3.029900079464340522e-245,\n  8.499830936258458068e-196, 4.7332165749391048364e-212, 4.4984059688774601837e-228, -3.029900079464340522e-245,\n  4.218953301476420881e-196, 4.7332165749391048364e-212, 4.4984059688774601837e-228, -3.029900079464340522e-245,\n  2.0785144840854027628e-196, -1.9512340798794268979e-214, -3.6162764918921692779e-230, -2.8387319855193022476e-246,\n  1.008295075389893466e-196, -1.9512340798794268979e-214, -3.6162764918921692779e-230, -2.8387319855193022476e-246,\n  4.7318537104213881764e-197, -1.9512340798794268979e-214, -3.6162764918921692779e-230, -2.8387319855193022476e-246,\n  2.0563051886826149345e-197, -1.9512340798794268979e-214, -3.6162764918921692779e-230, -2.8387319855193022476e-246,\n  7.185309278132283136e-198, -1.9512340798794268979e-214, -3.6162764918921692779e-230, -2.8387319855193022476e-246,\n  4.9643797378534984559e-199, -9.4699347169310243473e-216, -9.2331809177749077733e-233, -1.4042876247421728101e-248,\n  4.9643797378534984559e-199, -9.4699347169310243473e-216, -9.2331809177749077733e-233, -1.4042876247421728101e-248,\n  4.9643797378534984559e-199, -9.4699347169310243473e-216, -9.2331809177749077733e-233, -1.4042876247421728101e-248,\n  4.9643797378534984559e-199, -9.4699347169310243473e-216, -9.2331809177749077733e-233, -1.4042876247421728101e-248,\n  7.8383517263666503337e-200, 1.3736749441945438342e-215, -9.2331809177749077733e-233, -1.4042876247421728101e-248,\n  7.8383517263666503337e-200, 1.3736749441945438342e-215, -9.2331809177749077733e-233, -1.4042876247421728101e-248,\n  7.8383517263666503337e-200, 1.3736749441945438342e-215, -9.2331809177749077733e-233, -1.4042876247421728101e-248,\n  2.612671019845610006e-200, 2.1334073625072069974e-216, -9.2331809177749077733e-233, -1.4042876247421728101e-248,\n  2.612671019845610006e-200, 2.1334073625072069974e-216, -9.2331809177749077733e-233, -1.4042876247421728101e-248,\n  1.306250843215349634e-200, 2.1334073625072069974e-216, -9.2331809177749077733e-233, 
-1.4042876247421728101e-248,\n  6.5304075490021959302e-201, 6.8298960257742791824e-217, 6.8696910062179237095e-233, 3.8349029251851101018e-249,\n  3.2643571074265457254e-201, -4.2219277387461470355e-218, -1.753154605289404553e-234, -7.5861268822635538093e-251,\n  1.6313318866387202604e-201, -4.2219277387461470355e-218, -1.753154605289404553e-234, -7.5861268822635538093e-251,\n  8.1481927624480752786e-202, -4.2219277387461470355e-218, -1.753154605289404553e-234, -7.5861268822635538093e-251,\n  4.0656297104785107096e-202, 4.8431832608149701961e-218, 8.3111403472061145651e-234, 1.6001805286092554504e-249,\n  2.0243481844937293316e-202, 3.1062776103441183191e-219, 7.6291913283447536617e-235, 2.0347903074934629333e-250,\n  1.0037074215013384159e-202, 3.1062776103441183191e-219, 7.6291913283447536617e-235, 2.0347903074934629333e-250,\n  4.9338704000514295811e-203, 3.1062776103441183191e-219, 7.6291913283447536617e-235, 2.0347903074934629333e-250,\n  2.3822684925704522921e-203, 3.1062776103441183191e-219, 7.6291913283447536617e-235, 2.0347903074934629333e-250,\n  1.1064675388299639308e-203, 2.7343042298126957741e-220, 5.5273393987134252385e-236, 1.1432574793608782288e-251,\n  4.6856706195971960852e-204, 2.7343042298126957741e-220, 5.5273393987134252385e-236, 1.1432574793608782288e-251,\n  1.4961682352459748279e-204, -8.0675475439086544798e-221, -3.6970842501441777651e-237, -5.7032870362481275794e-253,\n  1.4961682352459748279e-204, -8.0675475439086544798e-221, -3.6970842501441777651e-237, -5.7032870362481275794e-253,\n  6.9879263915816924805e-205, 9.6377473771091526132e-221, 1.5959741828948633012e-236, 2.7031904319843495713e-252,\n  3.0010484111426663515e-205, 7.8509991660024955813e-222, 1.2171222696290252021e-237, -2.4742181023285720738e-254,\n  1.0076094209231528444e-205, 7.8509991660024955813e-222, 1.2171222696290252021e-237, -2.4742181023285720738e-254,\n  1.0889925813396166947e-207, 2.4325525462765697993e-223, -1.1429360314275701698e-239, 
8.3218722366085688343e-256,\n  1.0889925813396166947e-207, 2.4325525462765697993e-223, -1.1429360314275701698e-239, 8.3218722366085688343e-256,\n  1.0889925813396166947e-207, 2.4325525462765697993e-223, -1.1429360314275701698e-239, 8.3218722366085688343e-256,\n  1.0889925813396166947e-207, 2.4325525462765697993e-223, -1.1429360314275701698e-239, 8.3218722366085688343e-256,\n  1.0889925813396166947e-207, 2.4325525462765697993e-223, -1.1429360314275701698e-239, 8.3218722366085688343e-256,\n  1.0889925813396166947e-207, 2.4325525462765697993e-223, -1.1429360314275701698e-239, 8.3218722366085688343e-256,\n  1.0889925813396166947e-207, 2.4325525462765697993e-223, -1.1429360314275701698e-239, 8.3218722366085688343e-256,\n  3.1030547578511949035e-208, -1.609965144193984205e-224, -1.8313007053436625212e-240, -2.3341145329525059632e-256,\n  3.1030547578511949035e-208, -1.609965144193984205e-224, -1.8313007053436625212e-240, -2.3341145329525059632e-256,\n  1.156336993964950812e-208, 2.7126166236326293347e-224, -1.8313007053436625212e-240, -2.3341145329525059632e-256,\n  1.8297811202182925249e-209, 1.1003018740995688645e-226, 5.827891678485165325e-243, -3.1174271110208206547e-259,\n  1.8297811202182925249e-209, 1.1003018740995688645e-226, 5.827891678485165325e-243, -3.1174271110208206547e-259,\n  1.8297811202182925249e-209, 1.1003018740995688645e-226, 5.827891678485165325e-243, -3.1174271110208206547e-259,\n  6.1308251778939023781e-210, 1.1003018740995688645e-226, 5.827891678485165325e-243, -3.1174271110208206547e-259,\n  4.7332165749391048364e-212, 4.4984059688774601837e-228, -3.0299000794643401155e-245, -2.8075477999879273582e-261,\n  4.7332165749391048364e-212, 4.4984059688774601837e-228, -3.0299000794643401155e-245, -2.8075477999879273582e-261,\n  4.7332165749391048364e-212, 4.4984059688774601837e-228, -3.0299000794643401155e-245, -2.8075477999879273582e-261,\n  4.7332165749391048364e-212, 4.4984059688774601837e-228, -3.0299000794643401155e-245, 
-2.8075477999879273582e-261,\n  4.7332165749391048364e-212, 4.4984059688774601837e-228, -3.0299000794643401155e-245, -2.8075477999879273582e-261,\n  4.7332165749391048364e-212, 4.4984059688774601837e-228, -3.0299000794643401155e-245, -2.8075477999879273582e-261,\n  4.7332165749391048364e-212, 4.4984059688774601837e-228, -3.0299000794643401155e-245, -2.8075477999879273582e-261,\n  4.7332165749391048364e-212, 4.4984059688774601837e-228, -3.0299000794643401155e-245, -2.8075477999879273582e-261,\n  2.3568521170701555846e-212, -7.7818310317651142243e-229, -3.0299000794643401155e-245, -2.8075477999879273582e-261,\n  1.1686698881356804311e-212, 1.8601114328504743806e-228, -3.0299000794643401155e-245, -2.8075477999879273582e-261,\n  5.7457877366844311816e-213, 5.409641648369814791e-229, -3.0299000794643401155e-245, -2.8075477999879273582e-261,\n  2.7753321643482446169e-213, -1.1860946916976500828e-229, 6.3146909508553973881e-246, 1.2573885592501532045e-261,\n  1.290104378180150675e-213, 2.1117734783360818049e-229, 4.2928382696354204061e-245, -2.8075477999879273582e-261,\n  5.4749048509610403382e-214, 4.6283939331921604413e-230, 6.3146909508553973881e-246, 1.2573885592501532045e-261,\n  1.7618353855408067201e-214, 5.060587206499956961e-231, 5.9380161562121075096e-247, -1.2904053011746964278e-263,\n  1.7618353855408067201e-214, 5.060587206499956961e-231, 5.9380161562121075096e-247, -1.2904053011746964278e-263,\n  8.3356801918574821257e-215, 5.060587206499956961e-231, 5.9380161562121075096e-247, -1.2904053011746964278e-263,\n  3.6943433600821895879e-215, 5.060587206499956961e-231, 5.9380161562121075096e-247, -1.2904053011746964278e-263,\n  1.3736749441945438342e-215, -9.2331809177749077733e-233, -1.4042876247421726117e-248, -9.9505977179164858712e-265,\n  2.1334073625072069974e-216, -9.2331809177749077733e-233, -1.4042876247421726117e-248, -9.9505977179164858712e-265,\n  2.1334073625072069974e-216, -9.2331809177749077733e-233, -1.4042876247421726117e-248, 
-9.9505977179164858712e-265,\n  2.1334073625072069974e-216, -9.2331809177749077733e-233, -1.4042876247421726117e-248, -9.9505977179164858712e-265,\n  6.8298960257742791824e-217, 6.8696910062179237095e-233, 3.8349029251851101018e-249, -2.6436684620390282645e-267,\n  6.8298960257742791824e-217, 6.8696910062179237095e-233, 3.8349029251851101018e-249, -2.6436684620390282645e-267,\n  3.2038516259498326923e-217, -1.1817449557784924788e-233, -6.3454186796659920093e-250, -2.6436684620390282645e-267,\n  1.3908294260376086421e-217, 2.8439730252197153919e-233, 3.8349029251851101018e-249, -2.6436684620390282645e-267,\n  4.8431832608149701961e-218, 8.3111403472061145651e-234, 1.6001805286092554504e-249, -2.6436684620390282645e-267,\n  3.1062776103441183191e-219, 7.6291913283447536617e-235, 2.0347903074934629333e-250, -2.6436684620390282645e-267,\n  3.1062776103441183191e-219, 7.6291913283447536617e-235, 2.0347903074934629333e-250, -2.6436684620390282645e-267,\n  3.1062776103441183191e-219, 7.6291913283447536617e-235, 2.0347903074934629333e-250, -2.6436684620390282645e-267,\n  3.1062776103441183191e-219, 7.6291913283447536617e-235, 2.0347903074934629333e-250, -2.6436684620390282645e-267,\n  2.7343042298126957741e-220, 5.5273393987134252385e-236, 1.1432574793608780349e-251, 1.2329569415922591084e-267,\n  2.7343042298126957741e-220, 5.5273393987134252385e-236, 1.1432574793608780349e-251, 1.2329569415922591084e-267,\n  2.7343042298126957741e-220, 5.5273393987134252385e-236, 1.1432574793608780349e-251, 1.2329569415922591084e-267,\n  2.7343042298126957741e-220, 5.5273393987134252385e-236, 1.1432574793608780349e-251, 1.2329569415922591084e-267,\n  9.6377473771091526132e-221, 1.5959741828948633012e-236, 2.7031904319843490867e-252, 2.638005906844372114e-268,\n  7.8509991660024955813e-222, 1.2171222696290252021e-237, -2.4742181023285720738e-254, -1.2030990169203137715e-270,\n  7.8509991660024955813e-222, 1.2171222696290252021e-237, -2.4742181023285720738e-254, 
-1.2030990169203137715e-270,\n  7.8509991660024955813e-222, 1.2171222696290252021e-237, -2.4742181023285720738e-254, -1.2030990169203137715e-270,\n  7.8509991660024955813e-222, 1.2171222696290252021e-237, -2.4742181023285720738e-254, -1.2030990169203137715e-270,\n  2.318094503184431479e-222, -1.1429360314275701698e-239, 8.3218722366085688343e-256, -2.0046830753539155726e-272,\n  2.318094503184431479e-222, -1.1429360314275701698e-239, 8.3218722366085688343e-256, -2.0046830753539155726e-272,\n  9.3486833747991514629e-223, -1.1429360314275701698e-239, 8.3218722366085688343e-256, -2.0046830753539155726e-272,\n  2.4325525462765697993e-223, -1.1429360314275701698e-239, 8.3218722366085688343e-256, -2.0046830753539155726e-272,\n  2.4325525462765697993e-223, -1.1429360314275701698e-239, 8.3218722366085688343e-256, -2.0046830753539155726e-272,\n  7.0351983914592419146e-224, 7.766758903588374524e-240, 8.3218722366085688343e-256, -2.0046830753539155726e-272,\n  7.0351983914592419146e-224, 7.766758903588374524e-240, 8.3218722366085688343e-256, -2.0046830753539155726e-272,\n  2.7126166236326293347e-224, -1.8313007053436625212e-240, -2.3341145329525056675e-256, -2.0046830753539155726e-272,\n  5.5132573971932232487e-225, 5.6821419688934674008e-241, 3.2988215943776273615e-257, 2.1353977370878701046e-273,\n  5.5132573971932232487e-225, 5.6821419688934674008e-241, 3.2988215943776273615e-257, 2.1353977370878701046e-273,\n  1.1003018740995688645e-226, 5.827891678485165325e-243, -3.117427111020820077e-259, -5.9718623963762788119e-275,\n  1.1003018740995688645e-226, 5.827891678485165325e-243, -3.117427111020820077e-259, -5.9718623963762788119e-275,\n  1.1003018740995688645e-226, 5.827891678485165325e-243, -3.117427111020820077e-259, -5.9718623963762788119e-275,\n  1.1003018740995688645e-226, 5.827891678485165325e-243, -3.117427111020820077e-259, -5.9718623963762788119e-275,\n  1.1003018740995688645e-226, 5.827891678485165325e-243, -3.117427111020820077e-259, 
-5.9718623963762788119e-275,\n  1.1003018740995688645e-226, 5.827891678485165325e-243, -3.117427111020820077e-259, -5.9718623963762788119e-275,\n  2.560476225709334075e-227, 5.827891678485165325e-243, -3.117427111020820077e-259, -5.9718623963762788119e-275,\n  2.560476225709334075e-227, 5.827891678485165325e-243, -3.117427111020820077e-259, -5.9718623963762788119e-275,\n  4.4984059688774601837e-228, -3.0299000794643401155e-245, -2.8075477999879273582e-261, -1.472095602234059958e-277,\n  4.4984059688774601837e-228, -3.0299000794643401155e-245, -2.8075477999879273582e-261, -1.472095602234059958e-277,\n  4.4984059688774601837e-228, -3.0299000794643401155e-245, -2.8075477999879273582e-261, -1.472095602234059958e-277,\n  1.8601114328504743806e-228, -3.0299000794643401155e-245, -2.8075477999879273582e-261, -1.472095602234059958e-277,\n  5.409641648369814791e-229, -3.0299000794643401155e-245, -2.8075477999879273582e-261, -1.472095602234059958e-277,\n  5.409641648369814791e-229, -3.0299000794643401155e-245, -2.8075477999879273582e-261, -1.472095602234059958e-277,\n  2.1117734783360818049e-229, 4.2928382696354204061e-245, -2.8075477999879273582e-261, -1.472095602234059958e-277,\n  4.6283939331921604413e-230, 6.3146909508553973881e-246, 1.2573885592501529789e-261, 3.0408903374280139822e-277,\n  4.6283939331921604413e-230, 6.3146909508553973881e-246, 1.2573885592501529789e-261, 3.0408903374280139822e-277,\n  5.060587206499956961e-231, 5.9380161562121075096e-247, -1.2904053011746964278e-263, 8.7279092175580820317e-280,\n  5.060587206499956961e-231, 5.9380161562121075096e-247, -1.2904053011746964278e-263, 8.7279092175580820317e-280,\n  5.060587206499956961e-231, 5.9380161562121075096e-247, -1.2904053011746964278e-263, 8.7279092175580820317e-280,\n  5.060587206499956961e-231, 5.9380161562121075096e-247, -1.2904053011746964278e-263, 8.7279092175580820317e-280,\n  2.4841276986611042098e-231, 2.1712682097791944335e-248, 2.9746046415267896827e-264, -8.6516445844406224413e-282,\n  
1.1958979447416775482e-231, 2.1712682097791944335e-248, 2.9746046415267896827e-264, -8.6516445844406224413e-282,\n  5.5178306778196421733e-232, 2.1712682097791944335e-248, 2.9746046415267896827e-264, -8.6516445844406224413e-282,\n  2.2972562930210755192e-232, 2.1712682097791944335e-248, 2.9746046415267896827e-264, -8.6516445844406224413e-282,\n  6.8696910062179237095e-233, 3.8349029251851101018e-249, -2.6436684620390282645e-267, -4.3807022524130141006e-284,\n  6.8696910062179237095e-233, 3.8349029251851101018e-249, -2.6436684620390282645e-267, -4.3807022524130141006e-284,\n  2.8439730252197153919e-233, 3.8349029251851101018e-249, -2.6436684620390282645e-267, -4.3807022524130141006e-284,\n  8.3111403472061145651e-234, 1.6001805286092554504e-249, -2.6436684620390282645e-267, -4.3807022524130141006e-284,\n  8.3111403472061145651e-234, 1.6001805286092554504e-249, -2.6436684620390282645e-267, -4.3807022524130141006e-284,\n  3.2789928709583552854e-234, 4.8281933032132812475e-250, -2.6436684620390282645e-267, -4.3807022524130141006e-284,\n  7.6291913283447536617e-235, 2.0347903074934629333e-250, -2.6436684620390282645e-267, -4.3807022524130141006e-284,\n  7.6291913283447536617e-235, 2.0347903074934629333e-250, -2.6436684620390282645e-267, -4.3807022524130141006e-284,\n  1.3390069830350552605e-235, -6.026193929640082176e-252, -7.0535576022338457803e-268, -4.3807022524130141006e-284,\n  1.3390069830350552605e-235, -6.026193929640082176e-252, -7.0535576022338457803e-268, -4.3807022524130141006e-284,\n  1.3390069830350552605e-235, -6.026193929640082176e-252, -7.0535576022338457803e-268, -4.3807022524130141006e-284,\n  5.5273393987134252385e-236, 1.1432574793608780349e-251, 1.2329569415922591084e-267, -4.3807022524130141006e-284,\n  1.5959741828948633012e-236, 2.7031904319843490867e-252, 2.638005906844371576e-268, 6.3790946999826013345e-284,\n  1.5959741828948633012e-236, 2.7031904319843490867e-252, 2.638005906844371576e-268, 6.3790946999826013345e-284,\n  
6.1313287894022281692e-237, 5.2084434157824127104e-253, 2.1511502957481757317e-269, 3.2670891426006739096e-285,\n  1.2171222696290252021e-237, -2.4742181023285720738e-254, -1.2030990169203137715e-270, -9.5347405022956042207e-287,\n  1.2171222696290252021e-237, -2.4742181023285720738e-254, -1.2030990169203137715e-270, -9.5347405022956042207e-287,\n  1.2171222696290252021e-237, -2.4742181023285720738e-254, -1.2030990169203137715e-270, -9.5347405022956042207e-287,\n  6.0284645465737476297e-238, -2.4742181023285720738e-254, -1.2030990169203137715e-270, -9.5347405022956042207e-287,\n  2.9570854717154947523e-238, 4.3456134301905148502e-254, 6.3684349745470443788e-270, -9.5347405022956042207e-287,\n  1.4213959342863689955e-238, 9.3569766393097138822e-255, 2.5826679788133653036e-270, -9.5347405022956042207e-287,\n  6.5355116557180594664e-239, 9.3569766393097138822e-255, 2.5826679788133653036e-270, -9.5347405022956042207e-287,\n  2.6962878121452450746e-239, 8.3218722366085688343e-256, -2.0046830753539152442e-272, -3.4057806738724185961e-288,\n  7.766758903588374524e-240, 8.3218722366085688343e-256, -2.0046830753539152442e-272, -3.4057806738724185961e-288,\n  7.766758903588374524e-240, 8.3218722366085688343e-256, -2.0046830753539152442e-272, -3.4057806738724185961e-288,\n  2.9677290991223565342e-240, -2.3341145329525056675e-256, -2.0046830753539152442e-272, -3.4057806738724185961e-288,\n  5.6821419688934674008e-241, 3.2988215943776273615e-257, 2.1353977370878701046e-273, -1.2215123283371736879e-289,\n  5.6821419688934674008e-241, 3.2988215943776273615e-257, 2.1353977370878701046e-273, -1.2215123283371736879e-289,\n  5.6821419688934674008e-241, 3.2988215943776273615e-257, 2.1353977370878701046e-273, -1.2215123283371736879e-289,\n  2.6827483411022054912e-241, 3.2988215943776273615e-257, 2.1353977370878701046e-273, -1.2215123283371736879e-289,\n  1.1830515272065748694e-241, -3.117427111020820077e-259, -5.9718623963762788119e-275, 6.1155422068568954053e-291,\n  
4.3320312025875939195e-242, -3.117427111020820077e-259, -5.9718623963762788119e-275, 6.1155422068568954053e-291,\n  5.827891678485165325e-243, -3.117427111020820077e-259, -5.9718623963762788119e-275, 6.1155422068568954053e-291,\n  5.827891678485165325e-243, -3.117427111020820077e-259, -5.9718623963762788119e-275, 6.1155422068568954053e-291,\n  5.827891678485165325e-243, -3.117427111020820077e-259, -5.9718623963762788119e-275, 6.1155422068568954053e-291,\n  1.1413391350613183311e-243, -5.1586784110844895013e-260, -1.9524039360882352712e-276, -2.9779654517181717279e-292,\n  1.1413391350613183311e-243, -5.1586784110844895013e-260, -1.9524039360882352712e-276, -2.9779654517181717279e-292,\n  1.1413391350613183311e-243, -5.1586784110844895013e-260, -1.9524039360882352712e-276, -2.9779654517181717279e-292,\n  5.5552006713333735927e-244, 7.8491179384773690214e-260, -1.9524039360882352712e-276, -2.9779654517181717279e-292,\n  2.6261053316934700345e-244, 1.345219763696439399e-260, 1.6579848156414234801e-276, 1.0303712682997740506e-292,\n  1.1615576618735179302e-244, 1.345219763696439399e-260, 1.6579848156414234801e-276, 1.0303712682997740506e-292,\n  4.2928382696354204061e-245, -2.8075477999879273582e-261, -1.472095602234059958e-277, 2.8287088295287585094e-294,\n  6.3146909508553973881e-246, 1.2573885592501529789e-261, 3.0408903374280139822e-277, 2.8287088295287585094e-294,\n  6.3146909508553973881e-246, 1.2573885592501529789e-261, 3.0408903374280139822e-277, 2.8287088295287585094e-294,\n  6.3146909508553973881e-246, 1.2573885592501529789e-261, 3.0408903374280139822e-277, 2.8287088295287585094e-294,\n  1.7379794826680480784e-246, 2.4115446944063306384e-262, 2.202741251392177696e-278, 2.8287088295287585094e-294,\n  1.7379794826680480784e-246, 2.4115446944063306384e-262, 2.202741251392177696e-278, 2.8287088295287585094e-294,\n  5.9380161562121075096e-247, -1.2904053011746964278e-263, 8.7279092175580810531e-280, 8.8634899828990930877e-296,\n  2.1712682097791944335e-248, 
2.9746046415267896827e-264, -8.6516445844406224413e-282, -5.0528699238150276549e-299,\n  2.1712682097791944335e-248, 2.9746046415267896827e-264, -8.6516445844406224413e-282, -5.0528699238150276549e-299,\n  2.1712682097791944335e-248, 2.9746046415267896827e-264, -8.6516445844406224413e-282, -5.0528699238150276549e-299,\n  2.1712682097791944335e-248, 2.9746046415267896827e-264, -8.6516445844406224413e-282, -5.0528699238150276549e-299,\n  2.1712682097791944335e-248, 2.9746046415267896827e-264, -8.6516445844406224413e-282, -5.0528699238150276549e-299,\n  3.8349029251851101018e-249, -2.6436684620390282645e-267, -4.3807022524130141006e-284, -2.7456019707854725967e-300,\n  3.8349029251851101018e-249, -2.6436684620390282645e-267, -4.3807022524130141006e-284, -2.7456019707854725967e-300,\n  3.8349029251851101018e-249, -2.6436684620390282645e-267, -4.3807022524130141006e-284, -2.7456019707854725967e-300,\n  1.6001805286092554504e-249, -2.6436684620390282645e-267, -4.3807022524130141006e-284, -2.7456019707854725967e-300,\n  4.8281933032132812475e-250, -2.6436684620390282645e-267, -4.3807022524130141006e-284, -2.7456019707854725967e-300,\n  4.8281933032132812475e-250, -2.6436684620390282645e-267, -4.3807022524130141006e-284, -2.7456019707854725967e-300,\n  2.0347903074934629333e-250, -2.6436684620390282645e-267, -4.3807022524130141006e-284, -2.7456019707854725967e-300,\n  6.3808880963355377617e-251, -2.6436684620390282645e-267, -4.3807022524130141006e-284, -2.7456019707854725967e-300,\n  6.3808880963355377617e-251, -2.6436684620390282645e-267, -4.3807022524130141006e-284, -2.7456019707854725967e-300,\n  2.8891343516857640937e-251, 5.1095823452235464813e-267, -4.3807022524130141006e-284, -2.7456019707854725967e-300,\n  1.1432574793608780349e-251, 1.2329569415922591084e-267, -4.3807022524130141006e-284, -2.7456019707854725967e-300,\n  2.7031904319843490867e-252, 2.638005906844371576e-268, 6.3790946999826013345e-284, -2.7456019707854725967e-300,\n  2.7031904319843490867e-252, 
2.638005906844371576e-268, 6.3790946999826013345e-284, -2.7456019707854725967e-300,\n  5.2084434157824127104e-253, 2.1511502957481757317e-269, 3.2670891426006735363e-285, 2.4084160842482777461e-301,\n  5.2084434157824127104e-253, 2.1511502957481757317e-269, 3.2670891426006735363e-285, 2.4084160842482777461e-301,\n  5.2084434157824127104e-253, 2.1511502957481757317e-269, 3.2670891426006735363e-285, 2.4084160842482777461e-301,\n  2.4805108027747776379e-253, 2.1511502957481757317e-269, 3.2670891426006735363e-285, 2.4084160842482777461e-301,\n  1.1165444962709601017e-253, 2.1511502957481757317e-269, 3.2670891426006735363e-285, 2.4084160842482777461e-301,\n  4.3456134301905148502e-254, 6.3684349745470443788e-270, -9.5347405022956030541e-287, -1.5805886663557401565e-302,\n  9.3569766393097138822e-255, 2.5826679788133653036e-270, -9.5347405022956030541e-287, -1.5805886663557401565e-302,\n  9.3569766393097138822e-255, 2.5826679788133653036e-270, -9.5347405022956030541e-287, -1.5805886663557401565e-302,\n  8.3218722366085688343e-256, -2.0046830753539152442e-272, -3.4057806738724185961e-288, 2.3458177946667328156e-304,\n  8.3218722366085688343e-256, -2.0046830753539152442e-272, -3.4057806738724185961e-288, 2.3458177946667328156e-304,\n  8.3218722366085688343e-256, -2.0046830753539152442e-272, -3.4057806738724185961e-288, 2.3458177946667328156e-304,\n  8.3218722366085688343e-256, -2.0046830753539152442e-272, -3.4057806738724185961e-288, 2.3458177946667328156e-304,\n  2.9938788518280315834e-256, -2.0046830753539152442e-272, -3.4057806738724185961e-288, 2.3458177946667328156e-304,\n  3.2988215943776273615e-257, 2.1353977370878701046e-273, -1.2215123283371736879e-289, 6.7342163555358599277e-306,\n  3.2988215943776273615e-257, 2.1353977370878701046e-273, -1.2215123283371736879e-289, 6.7342163555358599277e-306,\n  3.2988215943776273615e-257, 2.1353977370878701046e-273, -1.2215123283371736879e-289, 6.7342163555358599277e-306,\n  3.2988215943776273615e-257, 
2.1353977370878701046e-273, -1.2215123283371736879e-289, 6.7342163555358599277e-306,\n  1.6338236616337094706e-257, 2.1353977370878701046e-273, -1.2215123283371736879e-289, 6.7342163555358599277e-306,\n  8.0132469526175071002e-258, 2.8687869620228451614e-274, -1.9537812801257956865e-290, 1.0380272777574237546e-306,\n  3.850752120757712373e-258, 2.8687869620228451614e-274, -1.9537812801257956865e-290, 1.0380272777574237546e-306,\n  1.7695047048278150093e-258, 2.8687869620228451614e-274, -1.9537812801257956865e-290, 1.0380272777574237546e-306,\n  7.2888099686286655858e-259, 5.581381609158630475e-275, 6.1155422068568946933e-291, 1.0380272777574237546e-306,\n  2.0856914288039227544e-259, -1.9524039360882352712e-276, -2.9779654517181712829e-292, -3.000817432603284506e-308,\n  2.0856914288039227544e-259, -1.9524039360882352712e-276, -2.9779654517181712829e-292, -3.000817432603284506e-308,\n  7.8491179384773690214e-260, -1.9524039360882352712e-276, -2.9779654517181712829e-292, -3.000817432603284506e-308,\n  1.345219763696439399e-260, 1.6579848156414234801e-276, 1.0303712682997738281e-292, 1.4493302844111182601e-308,\n  1.345219763696439399e-260, 1.6579848156414234801e-276, 1.0303712682997738281e-292, 1.4493302844111182601e-308,\n  1.345219763696439399e-260, 1.6579848156414234801e-276, 1.0303712682997738281e-292, 1.4493302844111182601e-308,\n  5.3223249184882342185e-261, -1.472095602234059958e-277, 2.8287088295287585094e-294, -1.0874435234232647519e-310,\n  1.2573885592501529789e-261, 3.0408903374280139822e-277, 2.8287088295287585094e-294, -1.0874435234232647519e-310,\n  1.2573885592501529789e-261, 3.0408903374280139822e-277, 2.8287088295287585094e-294, -1.0874435234232647519e-310,\n  2.4115446944063306384e-262, 2.202741251392177696e-278, 2.8287088295287585094e-294, -1.0874435234232647519e-310,\n  2.4115446944063306384e-262, 2.202741251392177696e-278, 2.8287088295287585094e-294, -1.0874435234232647519e-310,\n  2.4115446944063306384e-262, 2.202741251392177696e-278, 
2.8287088295287585094e-294, -1.0874435234232647519e-310,\n  1.1412520821444306741e-262, -6.1787496089661820348e-279, -3.028042329852615431e-295, -2.182740474438892116e-311,\n  5.0610577601348040988e-263, 7.9243314524777990283e-279, -3.028042329852615431e-295, -2.182740474438892116e-311,\n  1.8853262294800541881e-263, 8.7279092175580810531e-280, 8.8634899828990930877e-296, -9.8167844904532653004e-314,\n  2.9746046415267896827e-264, -8.6516445844406224413e-282, -5.0528699238150265939e-299, -1.3288013265921760399e-314,\n  2.9746046415267896827e-264, -8.6516445844406224413e-282, -5.0528699238150265939e-299, -1.3288013265921760399e-314,\n  2.9746046415267896827e-264, -8.6516445844406224413e-282, -5.0528699238150265939e-299, -1.3288013265921760399e-314,\n  9.8977243486757054781e-265, -8.6516445844406224413e-282, -5.0528699238150265939e-299, -1.3288013265921760399e-314,\n  9.8977243486757054781e-265, -8.6516445844406224413e-282, -5.0528699238150265939e-299, -1.3288013265921760399e-314,\n  4.9356438320276576408e-265, -8.6516445844406224413e-282, -5.0528699238150265939e-299, -1.3288013265921760399e-314,\n  2.4546035737036337221e-265, -8.6516445844406224413e-282, -5.0528699238150265939e-299, -1.3288013265921760399e-314,\n  1.2140834445416214873e-265, 1.8893435613692150014e-281, 3.0075895258731974416e-297, -9.8167844904532653004e-314,\n  5.9382337996061564537e-266, 5.1208955146257653156e-282, -5.0528699238150265939e-299, -1.3288013265921760399e-314,\n  2.8369334767011265554e-266, 5.1208955146257653156e-282, -5.0528699238150265939e-299, -1.3288013265921760399e-314,\n  1.2862833152486119506e-266, 1.6777604898591683764e-282, -5.0528699238150265939e-299, -1.3288013265921760399e-314,\n  5.1095823452235464813e-267, -4.3807022524130141006e-284, -2.7456019707854725967e-300, -2.5539572388808429997e-317,\n  1.2329569415922591084e-267, -4.3807022524130141006e-284, -2.7456019707854725967e-300, -2.5539572388808429997e-317,\n  1.2329569415922591084e-267, -4.3807022524130141006e-284, 
-2.7456019707854725967e-300, -2.5539572388808429997e-317,\n  2.638005906844371576e-268, 6.3790946999826013345e-284, -2.7456019707854725967e-300, -2.5539572388808429997e-317,\n  2.638005906844371576e-268, 6.3790946999826013345e-284, -2.7456019707854725967e-300, -2.5539572388808429997e-317,\n  2.1511502957481757317e-269, 3.2670891426006735363e-285, 2.4084160842482773317e-301, 5.7350888195772519812e-317,\n  2.1511502957481757317e-269, 3.2670891426006735363e-285, 2.4084160842482773317e-301, 5.7350888195772519812e-317,\n  2.1511502957481757317e-269, 3.2670891426006735363e-285, 2.4084160842482773317e-301, 5.7350888195772519812e-317,\n  2.1511502957481757317e-269, 3.2670891426006735363e-285, 2.4084160842482773317e-301, 5.7350888195772519812e-317,\n  6.3684349745470443788e-270, -9.5347405022956030541e-287, -1.5805886663557401565e-302, 3.6369654387311681856e-319,\n  6.3684349745470443788e-270, -9.5347405022956030541e-287, -1.5805886663557401565e-302, 3.6369654387311681856e-319,\n  2.5826679788133653036e-270, -9.5347405022956030541e-287, -1.5805886663557401565e-302, 3.6369654387311681856e-319,\n  6.8978448094652555593e-271, 1.1480487920352081009e-286, 7.5257037990230704094e-303, 3.6369654387311681856e-319,\n  6.8978448094652555593e-271, 1.1480487920352081009e-286, 7.5257037990230704094e-303, 3.6369654387311681856e-319,\n  2.1656360647981577662e-271, 9.7287370902823839435e-288, 1.6928061833779524157e-303, 3.6369654387311681856e-319,\n  2.1656360647981577662e-271, 9.7287370902823839435e-288, 1.6928061833779524157e-303, 3.6369654387311681856e-319,\n  9.825838786313830552e-272, 9.7287370902823839435e-288, 1.6928061833779524157e-303, 3.6369654387311681856e-319,\n  3.9105778554799569972e-272, 9.7287370902823839435e-288, 1.6928061833779524157e-303, 3.6369654387311681856e-319,\n  9.5294739006302120482e-273, -1.2215123283371736879e-289, 6.7342163555358599277e-306, -5.681754927174335258e-322,\n  9.5294739006302120482e-273, -1.2215123283371736879e-289, 6.7342163555358599277e-306, 
-5.681754927174335258e-322,\n  2.1353977370878701046e-273, -1.2215123283371736879e-289, 6.7342163555358599277e-306, -5.681754927174335258e-322,\n  2.1353977370878701046e-273, -1.2215123283371736879e-289, 6.7342163555358599277e-306, -5.681754927174335258e-322,\n  2.8687869620228451614e-274, -1.9537812801257956865e-290, 1.0380272777574237546e-306, 6.4228533959362050743e-323,\n};\n\nNOEXPORT ALIGNED(64) const float Sleef_rempitabsp[] = {\n  0.159154892, 5.112411827e-08, 3.626141271e-15, -2.036222915e-22,\n  0.03415493667, 6.420638243e-09, 7.342738037e-17, 8.135951656e-24,\n  0.03415493667, 6.420638243e-09, 7.342738037e-17, 8.135951656e-24,\n  0.002904943191, -9.861969574e-11, -9.839336547e-18, -1.790215892e-24,\n  0.002904943191, -9.861969574e-11, -9.839336547e-18, -1.790215892e-24,\n  0.002904943191, -9.861969574e-11, -9.839336547e-18, -1.790215892e-24,\n  0.002904943191, -9.861969574e-11, -9.839336547e-18, -1.790215892e-24,\n  0.0009518179577, 1.342109202e-10, 1.791623576e-17, 1.518506657e-24,\n  0.0009518179577, 1.342109202e-10, 1.791623576e-17, 1.518506657e-24,\n  0.0004635368241, 1.779561221e-11, 4.038449606e-18, -1.358546052e-25,\n  0.0002193961991, 1.779561221e-11, 4.038449606e-18, -1.358546052e-25,\n  9.73258866e-05, 1.779561221e-11, 4.038449606e-18, -1.358546052e-25,\n  3.62907449e-05, 3.243700447e-12, 5.690024473e-19, 7.09405479e-26,\n  5.773168596e-06, 1.424711477e-12, 1.3532163e-19, 1.92417627e-26,\n  5.773168596e-06, 1.424711477e-12, 1.3532163e-19, 1.92417627e-26,\n  5.773168596e-06, 1.424711477e-12, 1.3532163e-19, 1.92417627e-26,\n  1.958472239e-06, 5.152167755e-13, 1.3532163e-19, 1.92417627e-26,\n  5.112411827e-08, 3.626141271e-15, -2.036222915e-22, 6.177847236e-30,\n  5.112411827e-08, 3.626141271e-15, -2.036222915e-22, 6.177847236e-30,\n  5.112411827e-08, 3.626141271e-15, -2.036222915e-22, 6.177847236e-30,\n  5.112411827e-08, 3.626141271e-15, -2.036222915e-22, 6.177847236e-30,\n  5.112411827e-08, 3.626141271e-15, -2.036222915e-22, 6.177847236e-30,\n  
5.112411827e-08, 3.626141271e-15, -2.036222915e-22, 6.177847236e-30,\n  2.132179588e-08, 3.626141271e-15, -2.036222915e-22, 6.177847236e-30,\n  6.420638243e-09, 7.342738037e-17, 8.135951656e-24, -1.330400526e-31,\n  6.420638243e-09, 7.342738037e-17, 8.135951656e-24, -1.330400526e-31,\n  2.695347945e-09, 7.342738037e-17, 8.135951656e-24, -1.330400526e-31,\n  8.327027956e-10, 7.342738037e-17, 8.135951656e-24, -1.330400526e-31,\n  8.327027956e-10, 7.342738037e-17, 8.135951656e-24, -1.330400526e-31,\n  3.670415083e-10, 7.342738037e-17, 8.135951656e-24, -1.330400526e-31,\n  1.342109202e-10, 1.791623576e-17, 1.518506361e-24, 2.613904e-31,\n  1.779561221e-11, 4.038449606e-18, -1.358545683e-25, -3.443243946e-32,\n  1.779561221e-11, 4.038449606e-18, -1.358545683e-25, -3.443243946e-32,\n  1.779561221e-11, 4.038449606e-18, -1.358545683e-25, -3.443243946e-32,\n  3.243700447e-12, 5.690024473e-19, 7.094053557e-26, 1.487136711e-32,\n  3.243700447e-12, 5.690024473e-19, 7.094053557e-26, 1.487136711e-32,\n  3.243700447e-12, 5.690024473e-19, 7.094053557e-26, 1.487136711e-32,\n  1.424711477e-12, 1.3532163e-19, 1.924175961e-26, 2.545416018e-33,\n  5.152167755e-13, 1.3532163e-19, 1.924175961e-26, 2.545416018e-33,\n  6.046956013e-14, -2.036222915e-22, 6.177846108e-30, 1.082084378e-36,\n  6.046956013e-14, -2.036222915e-22, 6.177846108e-30, 1.082084378e-36,\n  6.046956013e-14, -2.036222915e-22, 6.177846108e-30, 1.082084378e-36,\n  3.626141271e-15, -2.036222915e-22, 6.177846108e-30, 1.082084378e-36,\n  3.626141271e-15, -2.036222915e-22, 6.177846108e-30, 1.082084378e-36,\n  3.626141271e-15, -2.036222915e-22, 6.177846108e-30, 1.082084378e-36,\n  3.626141271e-15, -2.036222915e-22, 6.177846108e-30, 1.082084378e-36,\n  7.342738037e-17, 8.135951656e-24, -1.330400526e-31, 6.296048013e-40,\n  7.342738037e-17, 8.135951656e-24, -1.330400526e-31, 6.296048013e-40,\n  7.342738037e-17, 8.135951656e-24, -1.330400526e-31, 6.296048013e-40,\n  7.342738037e-17, 8.135951656e-24, -1.330400526e-31, 
6.296048013e-40,\n  7.342738037e-17, 8.135951656e-24, -1.330400526e-31, 6.296048013e-40,\n  7.342738037e-17, 8.135951656e-24, -1.330400526e-31, 6.296048013e-40,\n  1.791623576e-17, 1.518506361e-24, 2.61390353e-31, 4.764937743e-38,\n  1.791623576e-17, 1.518506361e-24, 2.61390353e-31, 4.764937743e-38,\n  4.038449606e-18, -1.358545683e-25, -3.443243946e-32, 6.296048013e-40,\n  4.038449606e-18, -1.358545683e-25, -3.443243946e-32, 6.296048013e-40,\n  5.690024473e-19, 7.094053557e-26, 1.487136711e-32, 6.296048013e-40,\n  5.690024473e-19, 7.094053557e-26, 1.487136711e-32, 6.296048013e-40,\n  5.690024473e-19, 7.094053557e-26, 1.487136711e-32, 6.296048013e-40,\n  1.3532163e-19, 1.924175961e-26, 2.545415467e-33, 6.296048013e-40,\n  1.3532163e-19, 1.924175961e-26, 2.545415467e-33, 6.296048013e-40,\n  2.690143217e-20, -1.452834402e-28, -6.441077673e-36, -1.764234767e-42,\n  2.690143217e-20, -1.452834402e-28, -6.441077673e-36, -1.764234767e-42,\n  2.690143217e-20, -1.452834402e-28, -6.441077673e-36, -1.764234767e-42,\n  1.334890502e-20, -1.452834402e-28, -6.441077673e-36, -1.764234767e-42,\n  6.572641438e-21, -1.452834402e-28, -6.441077673e-36, -1.764234767e-42,\n  0.05874381959, 1.222115387e-08, 7.693612965e-16, 1.792054435e-22,\n  0.02749382704, 4.77057327e-09, 7.693612965e-16, 1.792054435e-22,\n  0.01186883077, 1.045283415e-09, 3.252721926e-16, 7.332633139e-23,\n  0.00405633077, 1.045283415e-09, 3.252721926e-16, 7.332633139e-23,\n  0.000150081818, -2.454155802e-12, 1.161414894e-20, 1.291319272e-27,\n  0.000150081818, -2.454155802e-12, 1.161414894e-20, 1.291319272e-27,\n  0.000150081818, -2.454155802e-12, 1.161414894e-20, 1.291319272e-27,\n  0.000150081818, -2.454155802e-12, 1.161414894e-20, 1.291319272e-27,\n  0.000150081818, -2.454155802e-12, 1.161414894e-20, 1.291319272e-27,\n  2.801149822e-05, 4.821800945e-12, 8.789757674e-19, 1.208447639e-25,\n  2.801149822e-05, 4.821800945e-12, 8.789757674e-19, 1.208447639e-25,\n  2.801149822e-05, 4.821800945e-12, 8.789757674e-19, 
1.208447639e-25,\n  1.275271279e-05, 1.183823005e-12, 1.161414894e-20, 1.291319272e-27,\n  5.12331826e-06, 1.183823005e-12, 1.161414894e-20, 1.291319272e-27,\n  1.308621904e-06, 2.743283031e-13, 1.161414894e-20, 1.291319272e-27,\n  1.308621904e-06, 2.743283031e-13, 1.161414894e-20, 1.291319272e-27,\n  3.549478151e-07, 4.695462769e-14, 1.161414894e-20, 1.291319272e-27,\n  3.549478151e-07, 4.695462769e-14, 1.161414894e-20, 1.291319272e-27,\n  1.165292645e-07, 1.853292503e-14, 4.837885366e-21, 1.291319272e-27,\n  1.165292645e-07, 1.853292503e-14, 4.837885366e-21, 1.291319272e-27,\n  5.69246339e-08, 4.322073705e-15, 1.449754789e-21, 7.962890365e-29,\n  2.712231151e-08, 4.322073705e-15, 1.449754789e-21, 7.962890365e-29,\n  1.222115387e-08, 7.693612965e-16, 1.792054182e-22, 2.91418027e-29,\n  4.77057327e-09, 7.693612965e-16, 1.792054182e-22, 2.91418027e-29,\n  1.045283415e-09, 3.252721926e-16, 7.332632508e-23, 3.898253736e-30,\n  1.045283415e-09, 3.252721926e-16, 7.332632508e-23, 3.898253736e-30,\n  1.139611461e-10, 1.996093359e-17, 5.344349223e-25, 1.511644828e-31,\n  1.139611461e-10, 1.996093359e-17, 5.344349223e-25, 1.511644828e-31,\n  1.139611461e-10, 1.996093359e-17, 5.344349223e-25, 1.511644828e-31,\n  1.139611461e-10, 1.996093359e-17, 5.344349223e-25, 1.511644828e-31,\n  5.575349904e-11, 6.083145782e-18, 5.344349223e-25, 1.511644828e-31,\n  2.664967552e-11, -8.557475018e-19, -8.595036458e-26, -2.139883875e-32,\n  1.209775682e-11, 2.61369883e-18, 5.344349223e-25, 1.511644828e-31,\n  4.821800945e-12, 8.789757674e-19, 1.208447639e-25, 3.253064536e-33,\n  1.183823005e-12, 1.161414894e-20, 1.29131908e-27, 1.715766248e-34,\n  1.183823005e-12, 1.161414894e-20, 1.29131908e-27, 1.715766248e-34,\n  2.743283031e-13, 1.161414894e-20, 1.29131908e-27, 1.715766248e-34,\n};\n"
  },
  {
    "path": "src/rename.h",
    "content": "#ifndef RENAMESCALAR_H\n               #define RENAMESCALAR_H\n\n               /* ------------------------------------------------------------------------- */\n                   /* Naming of functions scalar */\n\n                   \n\n                   #ifdef DETERMINISTIC\n\n                   #define xsin nsimd_sleef_sin_u35d_scalar_f64\n#define xsinf nsimd_sleef_sin_u35d_scalar_f32\n#define xcos nsimd_sleef_cos_u35d_scalar_f64\n#define xcosf nsimd_sleef_cos_u35d_scalar_f32\n#define xsincos nsimd_sleef_sincos_u35d_scalar_f64\n#define xsincosf nsimd_sleef_sincos_u35d_scalar_f32\n#define xtan nsimd_sleef_tan_u35d_scalar_f64\n#define xtanf nsimd_sleef_tan_u35d_scalar_f32\n#define xasin nsimd_sleef_asin_u35d_scalar_f64\n#define xasinf nsimd_sleef_asin_u35d_scalar_f32\n#define xacos nsimd_sleef_acos_u35d_scalar_f64\n#define xacosf nsimd_sleef_acos_u35d_scalar_f32\n#define xatan nsimd_sleef_atan_u35d_scalar_f64\n#define xatanf nsimd_sleef_atan_u35d_scalar_f32\n#define xatan2 nsimd_sleef_atan2_u35d_scalar_f64\n#define xatan2f nsimd_sleef_atan2_u35d_scalar_f32\n#define xlog nsimd_sleef_log_u35d_scalar_f64\n#define xlogf nsimd_sleef_log_u35d_scalar_f32\n#define xcbrt nsimd_sleef_cbrt_u35d_scalar_f64\n#define xcbrtf nsimd_sleef_cbrt_u35d_scalar_f32\n#define xsin_u1 nsimd_sleef_sin_u10d_scalar_f64\n#define xsinf_u1 nsimd_sleef_sin_u10d_scalar_f32\n#define xcos_u1 nsimd_sleef_cos_u10d_scalar_f64\n#define xcosf_u1 nsimd_sleef_cos_u10d_scalar_f32\n#define xsincos_u1 nsimd_sleef_sincos_u10d_scalar_f64\n#define xsincosf_u1 nsimd_sleef_sincos_u10d_scalar_f32\n#define xtan_u1 nsimd_sleef_tan_u10d_scalar_f64\n#define xtanf_u1 nsimd_sleef_tan_u10d_scalar_f32\n#define xasin_u1 nsimd_sleef_asin_u10d_scalar_f64\n#define xasinf_u1 nsimd_sleef_asin_u10d_scalar_f32\n#define xacos_u1 nsimd_sleef_acos_u10d_scalar_f64\n#define xacosf_u1 nsimd_sleef_acos_u10d_scalar_f32\n#define xatan_u1 nsimd_sleef_atan_u10d_scalar_f64\n#define xatanf_u1 
nsimd_sleef_atan_u10d_scalar_f32\n#define xatan2_u1 nsimd_sleef_atan2_u10d_scalar_f64\n#define xatan2f_u1 nsimd_sleef_atan2_u10d_scalar_f32\n#define xlog_u1 nsimd_sleef_log_u10d_scalar_f64\n#define xlogf_u1 nsimd_sleef_log_u10d_scalar_f32\n#define xcbrt_u1 nsimd_sleef_cbrt_u10d_scalar_f64\n#define xcbrtf_u1 nsimd_sleef_cbrt_u10d_scalar_f32\n#define xexp nsimd_sleef_exp_u10d_scalar_f64\n#define xexpf nsimd_sleef_exp_u10d_scalar_f32\n#define xpow nsimd_sleef_pow_u10d_scalar_f64\n#define xpowf nsimd_sleef_pow_u10d_scalar_f32\n#define xsinh nsimd_sleef_sinh_u10d_scalar_f64\n#define xsinhf nsimd_sleef_sinh_u10d_scalar_f32\n#define xcosh nsimd_sleef_cosh_u10d_scalar_f64\n#define xcoshf nsimd_sleef_cosh_u10d_scalar_f32\n#define xtanh nsimd_sleef_tanh_u10d_scalar_f64\n#define xtanhf nsimd_sleef_tanh_u10d_scalar_f32\n#define xsinh_u35 nsimd_sleef_sinh_u35d_scalar_f64\n#define xsinhf_u35 nsimd_sleef_sinh_u35d_scalar_f32\n#define xcosh_u35 nsimd_sleef_cosh_u35d_scalar_f64\n#define xcoshf_u35 nsimd_sleef_cosh_u35d_scalar_f32\n#define xtanh_u35 nsimd_sleef_tanh_u35d_scalar_f64\n#define xtanhf_u35 nsimd_sleef_tanh_u35d_scalar_f32\n#define xfastsin_u3500 nsimd_sleef_fastsin_u3500d_scalar_f64\n#define xfastsinf_u3500 nsimd_sleef_fastsin_u3500d_scalar_f32\n#define xfastcos_u3500 nsimd_sleef_fastcos_u3500d_scalar_f64\n#define xfastcosf_u3500 nsimd_sleef_fastcos_u3500d_scalar_f32\n#define xfastpow_u3500 nsimd_sleef_fastpow_u3500d_scalar_f64\n#define xfastpowf_u3500 nsimd_sleef_fastpow_u3500d_scalar_f32\n#define xasinh nsimd_sleef_asinh_u10d_scalar_f64\n#define xasinhf nsimd_sleef_asinh_u10d_scalar_f32\n#define xacosh nsimd_sleef_acosh_u10d_scalar_f64\n#define xacoshf nsimd_sleef_acosh_u10d_scalar_f32\n#define xatanh nsimd_sleef_atanh_u10d_scalar_f64\n#define xatanhf nsimd_sleef_atanh_u10d_scalar_f32\n#define xexp2 nsimd_sleef_exp2_u10d_scalar_f64\n#define xexp2f nsimd_sleef_exp2_u10d_scalar_f32\n#define xexp2_u35 nsimd_sleef_exp2_u35d_scalar_f64\n#define xexp2f_u35 
nsimd_sleef_exp2_u35d_scalar_f32\n#define xexp10 nsimd_sleef_exp10_u10d_scalar_f64\n#define xexp10f nsimd_sleef_exp10_u10d_scalar_f32\n#define xexp10_u35 nsimd_sleef_exp10_u35d_scalar_f64\n#define xexp10f_u35 nsimd_sleef_exp10_u35d_scalar_f32\n#define xexpm1 nsimd_sleef_expm1_u10d_scalar_f64\n#define xexpm1f nsimd_sleef_expm1_u10d_scalar_f32\n#define xlog10 nsimd_sleef_log10_u10d_scalar_f64\n#define xlog10f nsimd_sleef_log10_u10d_scalar_f32\n#define xlog2 nsimd_sleef_log2_u10d_scalar_f64\n#define xlog2f nsimd_sleef_log2_u10d_scalar_f32\n#define xlog2_u35 nsimd_sleef_log2_u35d_scalar_f64\n#define xlog2f_u35 nsimd_sleef_log2_u35d_scalar_f32\n#define xlog1p nsimd_sleef_log1p_u10d_scalar_f64\n#define xlog1pf nsimd_sleef_log1p_u10d_scalar_f32\n#define xsincospi_u05 nsimd_sleef_sincospi_u05d_scalar_f64\n#define xsincospif_u05 nsimd_sleef_sincospi_u05d_scalar_f32\n#define xsincospi_u35 nsimd_sleef_sincospi_u35d_scalar_f64\n#define xsincospif_u35 nsimd_sleef_sincospi_u35d_scalar_f32\n#define xsinpi_u05 nsimd_sleef_sinpi_u05d_scalar_f64\n#define xsinpif_u05 nsimd_sleef_sinpi_u05d_scalar_f32\n#define xcospi_u05 nsimd_sleef_cospi_u05d_scalar_f64\n#define xcospif_u05 nsimd_sleef_cospi_u05d_scalar_f32\n#define xldexp nsimd_sleef_ldexp_scalar_f64\n#define xldexpf nsimd_sleef_ldexp_scalar_f32\n#define xilogb nsimd_sleef_ilogb_scalar_f64\n#define xilogbf nsimd_sleef_ilogb_scalar_f32\n#define xfma nsimd_sleef_fma_scalar_f64\n#define xfmaf nsimd_sleef_fma_scalar_f32\n#define xsqrt nsimd_sleef_sqrt_scalar_f64\n#define xsqrtf nsimd_sleef_sqrt_scalar_f32\n#define xsqrt_u05 nsimd_sleef_sqrt_u05d_scalar_f64\n#define xsqrtf_u05 nsimd_sleef_sqrt_u05d_scalar_f32\n#define xsqrt_u35 nsimd_sleef_sqrt_u35d_scalar_f64\n#define xsqrtf_u35 nsimd_sleef_sqrt_u35d_scalar_f32\n#define xhypot_u05 nsimd_sleef_hypot_u05d_scalar_f64\n#define xhypotf_u05 nsimd_sleef_hypot_u05d_scalar_f32\n#define xhypot_u35 nsimd_sleef_hypot_u35d_scalar_f64\n#define xhypotf_u35 nsimd_sleef_hypot_u35d_scalar_f32\n#define 
xfabs nsimd_sleef_fabs_scalar_f64\n#define xfabsf nsimd_sleef_fabs_scalar_f32\n#define xcopysign nsimd_sleef_copysign_scalar_f64\n#define xcopysignf nsimd_sleef_copysign_scalar_f32\n#define xfmax nsimd_sleef_fmax_scalar_f64\n#define xfmaxf nsimd_sleef_fmax_scalar_f32\n#define xfmin nsimd_sleef_fmin_scalar_f64\n#define xfminf nsimd_sleef_fmin_scalar_f32\n#define xfdim nsimd_sleef_fdim_scalar_f64\n#define xfdimf nsimd_sleef_fdim_scalar_f32\n#define xtrunc nsimd_sleef_trunc_scalar_f64\n#define xtruncf nsimd_sleef_trunc_scalar_f32\n#define xfloor nsimd_sleef_floor_scalar_f64\n#define xfloorf nsimd_sleef_floor_scalar_f32\n#define xceil nsimd_sleef_ceil_scalar_f64\n#define xceilf nsimd_sleef_ceil_scalar_f32\n#define xround nsimd_sleef_round_scalar_f64\n#define xroundf nsimd_sleef_round_scalar_f32\n#define xrint nsimd_sleef_rint_scalar_f64\n#define xrintf nsimd_sleef_rint_scalar_f32\n#define xnextafter nsimd_sleef_nextafter_scalar_f64\n#define xnextafterf nsimd_sleef_nextafter_scalar_f32\n#define xfrfrexp nsimd_sleef_frfrexp_scalar_f64\n#define xfrfrexpf nsimd_sleef_frfrexp_scalar_f32\n#define xexpfrexp nsimd_sleef_expfrexp_scalar_f64\n#define xexpfrexpf nsimd_sleef_expfrexp_scalar_f32\n#define xfmod nsimd_sleef_fmod_scalar_f64\n#define xfmodf nsimd_sleef_fmod_scalar_f32\n#define xremainder nsimd_sleef_remainder_scalar_f64\n#define xremainderf nsimd_sleef_remainder_scalar_f32\n#define xmodf nsimd_sleef_modf_scalar_f64\n#define xmodff nsimd_sleef_modf_scalar_f32\n#define xlgamma_u1 nsimd_sleef_lgamma_u10d_scalar_f64\n#define xlgammaf_u1 nsimd_sleef_lgamma_u10d_scalar_f32\n#define xtgamma_u1 nsimd_sleef_tgamma_u10d_scalar_f64\n#define xtgammaf_u1 nsimd_sleef_tgamma_u10d_scalar_f32\n#define xerf_u1 nsimd_sleef_erf_u10d_scalar_f64\n#define xerff_u1 nsimd_sleef_erf_u10d_scalar_f32\n#define xerfc_u15 nsimd_sleef_erfc_u15d_scalar_f64\n#define xerfcf_u15 nsimd_sleef_erfc_u15d_scalar_f32\n#define xgetInt nsimd_sleef_getInt_scalar_f64\n#define xgetIntf 
nsimd_sleef_getInt_scalar_f32\n#define xgetPtr nsimd_sleef_getPtr_scalar_f64\n#define xgetPtrf nsimd_sleef_getPtr_scalar_f32\n\n                   #else\n\n                   #define xsin nsimd_sleef_sin_u35_scalar_f64\n#define xsinf nsimd_sleef_sin_u35_scalar_f32\n#define xcos nsimd_sleef_cos_u35_scalar_f64\n#define xcosf nsimd_sleef_cos_u35_scalar_f32\n#define xsincos nsimd_sleef_sincos_u35_scalar_f64\n#define xsincosf nsimd_sleef_sincos_u35_scalar_f32\n#define xtan nsimd_sleef_tan_u35_scalar_f64\n#define xtanf nsimd_sleef_tan_u35_scalar_f32\n#define xasin nsimd_sleef_asin_u35_scalar_f64\n#define xasinf nsimd_sleef_asin_u35_scalar_f32\n#define xacos nsimd_sleef_acos_u35_scalar_f64\n#define xacosf nsimd_sleef_acos_u35_scalar_f32\n#define xatan nsimd_sleef_atan_u35_scalar_f64\n#define xatanf nsimd_sleef_atan_u35_scalar_f32\n#define xatan2 nsimd_sleef_atan2_u35_scalar_f64\n#define xatan2f nsimd_sleef_atan2_u35_scalar_f32\n#define xlog nsimd_sleef_log_u35_scalar_f64\n#define xlogf nsimd_sleef_log_u35_scalar_f32\n#define xcbrt nsimd_sleef_cbrt_u35_scalar_f64\n#define xcbrtf nsimd_sleef_cbrt_u35_scalar_f32\n#define xsin_u1 nsimd_sleef_sin_u10_scalar_f64\n#define xsinf_u1 nsimd_sleef_sin_u10_scalar_f32\n#define xcos_u1 nsimd_sleef_cos_u10_scalar_f64\n#define xcosf_u1 nsimd_sleef_cos_u10_scalar_f32\n#define xsincos_u1 nsimd_sleef_sincos_u10_scalar_f64\n#define xsincosf_u1 nsimd_sleef_sincos_u10_scalar_f32\n#define xtan_u1 nsimd_sleef_tan_u10_scalar_f64\n#define xtanf_u1 nsimd_sleef_tan_u10_scalar_f32\n#define xasin_u1 nsimd_sleef_asin_u10_scalar_f64\n#define xasinf_u1 nsimd_sleef_asin_u10_scalar_f32\n#define xacos_u1 nsimd_sleef_acos_u10_scalar_f64\n#define xacosf_u1 nsimd_sleef_acos_u10_scalar_f32\n#define xatan_u1 nsimd_sleef_atan_u10_scalar_f64\n#define xatanf_u1 nsimd_sleef_atan_u10_scalar_f32\n#define xatan2_u1 nsimd_sleef_atan2_u10_scalar_f64\n#define xatan2f_u1 nsimd_sleef_atan2_u10_scalar_f32\n#define xlog_u1 nsimd_sleef_log_u10_scalar_f64\n#define xlogf_u1 
nsimd_sleef_log_u10_scalar_f32\n#define xcbrt_u1 nsimd_sleef_cbrt_u10_scalar_f64\n#define xcbrtf_u1 nsimd_sleef_cbrt_u10_scalar_f32\n#define xexp nsimd_sleef_exp_u10_scalar_f64\n#define xexpf nsimd_sleef_exp_u10_scalar_f32\n#define xpow nsimd_sleef_pow_u10_scalar_f64\n#define xpowf nsimd_sleef_pow_u10_scalar_f32\n#define xsinh nsimd_sleef_sinh_u10_scalar_f64\n#define xsinhf nsimd_sleef_sinh_u10_scalar_f32\n#define xcosh nsimd_sleef_cosh_u10_scalar_f64\n#define xcoshf nsimd_sleef_cosh_u10_scalar_f32\n#define xtanh nsimd_sleef_tanh_u10_scalar_f64\n#define xtanhf nsimd_sleef_tanh_u10_scalar_f32\n#define xsinh_u35 nsimd_sleef_sinh_u35_scalar_f64\n#define xsinhf_u35 nsimd_sleef_sinh_u35_scalar_f32\n#define xcosh_u35 nsimd_sleef_cosh_u35_scalar_f64\n#define xcoshf_u35 nsimd_sleef_cosh_u35_scalar_f32\n#define xtanh_u35 nsimd_sleef_tanh_u35_scalar_f64\n#define xtanhf_u35 nsimd_sleef_tanh_u35_scalar_f32\n#define xfastsin_u3500 nsimd_sleef_fastsin_u3500_scalar_f64\n#define xfastsinf_u3500 nsimd_sleef_fastsin_u3500_scalar_f32\n#define xfastcos_u3500 nsimd_sleef_fastcos_u3500_scalar_f64\n#define xfastcosf_u3500 nsimd_sleef_fastcos_u3500_scalar_f32\n#define xfastpow_u3500 nsimd_sleef_fastpow_u3500_scalar_f64\n#define xfastpowf_u3500 nsimd_sleef_fastpow_u3500_scalar_f32\n#define xasinh nsimd_sleef_asinh_u10_scalar_f64\n#define xasinhf nsimd_sleef_asinh_u10_scalar_f32\n#define xacosh nsimd_sleef_acosh_u10_scalar_f64\n#define xacoshf nsimd_sleef_acosh_u10_scalar_f32\n#define xatanh nsimd_sleef_atanh_u10_scalar_f64\n#define xatanhf nsimd_sleef_atanh_u10_scalar_f32\n#define xexp2 nsimd_sleef_exp2_u10_scalar_f64\n#define xexp2f nsimd_sleef_exp2_u10_scalar_f32\n#define xexp2_u35 nsimd_sleef_exp2_u35_scalar_f64\n#define xexp2f_u35 nsimd_sleef_exp2_u35_scalar_f32\n#define xexp10 nsimd_sleef_exp10_u10_scalar_f64\n#define xexp10f nsimd_sleef_exp10_u10_scalar_f32\n#define xexp10_u35 nsimd_sleef_exp10_u35_scalar_f64\n#define xexp10f_u35 nsimd_sleef_exp10_u35_scalar_f32\n#define xexpm1 
nsimd_sleef_expm1_u10_scalar_f64\n#define xexpm1f nsimd_sleef_expm1_u10_scalar_f32\n#define xlog10 nsimd_sleef_log10_u10_scalar_f64\n#define xlog10f nsimd_sleef_log10_u10_scalar_f32\n#define xlog2 nsimd_sleef_log2_u10_scalar_f64\n#define xlog2f nsimd_sleef_log2_u10_scalar_f32\n#define xlog2_u35 nsimd_sleef_log2_u35_scalar_f64\n#define xlog2f_u35 nsimd_sleef_log2_u35_scalar_f32\n#define xlog1p nsimd_sleef_log1p_u10_scalar_f64\n#define xlog1pf nsimd_sleef_log1p_u10_scalar_f32\n#define xsincospi_u05 nsimd_sleef_sincospi_u05_scalar_f64\n#define xsincospif_u05 nsimd_sleef_sincospi_u05_scalar_f32\n#define xsincospi_u35 nsimd_sleef_sincospi_u35_scalar_f64\n#define xsincospif_u35 nsimd_sleef_sincospi_u35_scalar_f32\n#define xsinpi_u05 nsimd_sleef_sinpi_u05_scalar_f64\n#define xsinpif_u05 nsimd_sleef_sinpi_u05_scalar_f32\n#define xcospi_u05 nsimd_sleef_cospi_u05_scalar_f64\n#define xcospif_u05 nsimd_sleef_cospi_u05_scalar_f32\n#define xldexp nsimd_sleef_ldexp_scalar_f64\n#define xldexpf nsimd_sleef_ldexp_scalar_f32\n#define xilogb nsimd_sleef_ilogb_scalar_f64\n#define xilogbf nsimd_sleef_ilogb_scalar_f32\n#define xfma nsimd_sleef_fma_scalar_f64\n#define xfmaf nsimd_sleef_fma_scalar_f32\n#define xsqrt nsimd_sleef_sqrt_scalar_f64\n#define xsqrtf nsimd_sleef_sqrt_scalar_f32\n#define xsqrt_u05 nsimd_sleef_sqrt_u05_scalar_f64\n#define xsqrtf_u05 nsimd_sleef_sqrt_u05_scalar_f32\n#define xsqrt_u35 nsimd_sleef_sqrt_u35_scalar_f64\n#define xsqrtf_u35 nsimd_sleef_sqrt_u35_scalar_f32\n#define xhypot_u05 nsimd_sleef_hypot_u05_scalar_f64\n#define xhypotf_u05 nsimd_sleef_hypot_u05_scalar_f32\n#define xhypot_u35 nsimd_sleef_hypot_u35_scalar_f64\n#define xhypotf_u35 nsimd_sleef_hypot_u35_scalar_f32\n#define xfabs nsimd_sleef_fabs_scalar_f64\n#define xfabsf nsimd_sleef_fabs_scalar_f32\n#define xcopysign nsimd_sleef_copysign_scalar_f64\n#define xcopysignf nsimd_sleef_copysign_scalar_f32\n#define xfmax nsimd_sleef_fmax_scalar_f64\n#define xfmaxf nsimd_sleef_fmax_scalar_f32\n#define xfmin 
nsimd_sleef_fmin_scalar_f64\n#define xfminf nsimd_sleef_fmin_scalar_f32\n#define xfdim nsimd_sleef_fdim_scalar_f64\n#define xfdimf nsimd_sleef_fdim_scalar_f32\n#define xtrunc nsimd_sleef_trunc_scalar_f64\n#define xtruncf nsimd_sleef_trunc_scalar_f32\n#define xfloor nsimd_sleef_floor_scalar_f64\n#define xfloorf nsimd_sleef_floor_scalar_f32\n#define xceil nsimd_sleef_ceil_scalar_f64\n#define xceilf nsimd_sleef_ceil_scalar_f32\n#define xround nsimd_sleef_round_scalar_f64\n#define xroundf nsimd_sleef_round_scalar_f32\n#define xrint nsimd_sleef_rint_scalar_f64\n#define xrintf nsimd_sleef_rint_scalar_f32\n#define xnextafter nsimd_sleef_nextafter_scalar_f64\n#define xnextafterf nsimd_sleef_nextafter_scalar_f32\n#define xfrfrexp nsimd_sleef_frfrexp_scalar_f64\n#define xfrfrexpf nsimd_sleef_frfrexp_scalar_f32\n#define xexpfrexp nsimd_sleef_expfrexp_scalar_f64\n#define xexpfrexpf nsimd_sleef_expfrexp_scalar_f32\n#define xfmod nsimd_sleef_fmod_scalar_f64\n#define xfmodf nsimd_sleef_fmod_scalar_f32\n#define xremainder nsimd_sleef_remainder_scalar_f64\n#define xremainderf nsimd_sleef_remainder_scalar_f32\n#define xmodf nsimd_sleef_modf_scalar_f64\n#define xmodff nsimd_sleef_modf_scalar_f32\n#define xlgamma_u1 nsimd_sleef_lgamma_u10_scalar_f64\n#define xlgammaf_u1 nsimd_sleef_lgamma_u10_scalar_f32\n#define xtgamma_u1 nsimd_sleef_tgamma_u10_scalar_f64\n#define xtgammaf_u1 nsimd_sleef_tgamma_u10_scalar_f32\n#define xerf_u1 nsimd_sleef_erf_u10_scalar_f64\n#define xerff_u1 nsimd_sleef_erf_u10_scalar_f32\n#define xerfc_u15 nsimd_sleef_erfc_u15_scalar_f64\n#define xerfcf_u15 nsimd_sleef_erfc_u15_scalar_f32\n#define xgetInt nsimd_sleef_getInt_scalar_f64\n#define xgetIntf nsimd_sleef_getInt_scalar_f32\n#define xgetPtr nsimd_sleef_getPtr_scalar_f64\n#define xgetPtrf nsimd_sleef_getPtr_scalar_f32\n\n                   #endif\n\n                   #define rempi nsimd_sleef_rempi_scalar\n                   #define rempif nsimd_sleef_rempif_scalar\n                   #define rempisub 
nsimd_sleef_rempisub_scalar\n                   #define rempisubf nsimd_sleef_rempisubf_scalar\n                   #define gammak nsimd_gammak_scalar\n                   #define gammafk nsimd_gammafk_scalar\n\n                   \n\n                   \n\n#endif\n\n"
  },
  {
    "path": "src/renameadvsimd.h",
    "content": "#ifndef RENAMEADVSIMD_H\n               #define RENAMEADVSIMD_H\n\n               /* ------------------------------------------------------------------------- */\n                   /* Naming of functions aarch64 */\n\n                   #ifdef NSIMD_AARCH64\n\n                   #ifdef DETERMINISTIC\n\n                   #define xsin nsimd_sleef_sin_u35d_aarch64_f64\n#define xsinf nsimd_sleef_sin_u35d_aarch64_f32\n#define xcos nsimd_sleef_cos_u35d_aarch64_f64\n#define xcosf nsimd_sleef_cos_u35d_aarch64_f32\n#define xsincos nsimd_sleef_sincos_u35d_aarch64_f64\n#define xsincosf nsimd_sleef_sincos_u35d_aarch64_f32\n#define xtan nsimd_sleef_tan_u35d_aarch64_f64\n#define xtanf nsimd_sleef_tan_u35d_aarch64_f32\n#define xasin nsimd_sleef_asin_u35d_aarch64_f64\n#define xasinf nsimd_sleef_asin_u35d_aarch64_f32\n#define xacos nsimd_sleef_acos_u35d_aarch64_f64\n#define xacosf nsimd_sleef_acos_u35d_aarch64_f32\n#define xatan nsimd_sleef_atan_u35d_aarch64_f64\n#define xatanf nsimd_sleef_atan_u35d_aarch64_f32\n#define xatan2 nsimd_sleef_atan2_u35d_aarch64_f64\n#define xatan2f nsimd_sleef_atan2_u35d_aarch64_f32\n#define xlog nsimd_sleef_log_u35d_aarch64_f64\n#define xlogf nsimd_sleef_log_u35d_aarch64_f32\n#define xcbrt nsimd_sleef_cbrt_u35d_aarch64_f64\n#define xcbrtf nsimd_sleef_cbrt_u35d_aarch64_f32\n#define xsin_u1 nsimd_sleef_sin_u10d_aarch64_f64\n#define xsinf_u1 nsimd_sleef_sin_u10d_aarch64_f32\n#define xcos_u1 nsimd_sleef_cos_u10d_aarch64_f64\n#define xcosf_u1 nsimd_sleef_cos_u10d_aarch64_f32\n#define xsincos_u1 nsimd_sleef_sincos_u10d_aarch64_f64\n#define xsincosf_u1 nsimd_sleef_sincos_u10d_aarch64_f32\n#define xtan_u1 nsimd_sleef_tan_u10d_aarch64_f64\n#define xtanf_u1 nsimd_sleef_tan_u10d_aarch64_f32\n#define xasin_u1 nsimd_sleef_asin_u10d_aarch64_f64\n#define xasinf_u1 nsimd_sleef_asin_u10d_aarch64_f32\n#define xacos_u1 nsimd_sleef_acos_u10d_aarch64_f64\n#define xacosf_u1 nsimd_sleef_acos_u10d_aarch64_f32\n#define xatan_u1 
nsimd_sleef_atan_u10d_aarch64_f64\n#define xatanf_u1 nsimd_sleef_atan_u10d_aarch64_f32\n#define xatan2_u1 nsimd_sleef_atan2_u10d_aarch64_f64\n#define xatan2f_u1 nsimd_sleef_atan2_u10d_aarch64_f32\n#define xlog_u1 nsimd_sleef_log_u10d_aarch64_f64\n#define xlogf_u1 nsimd_sleef_log_u10d_aarch64_f32\n#define xcbrt_u1 nsimd_sleef_cbrt_u10d_aarch64_f64\n#define xcbrtf_u1 nsimd_sleef_cbrt_u10d_aarch64_f32\n#define xexp nsimd_sleef_exp_u10d_aarch64_f64\n#define xexpf nsimd_sleef_exp_u10d_aarch64_f32\n#define xpow nsimd_sleef_pow_u10d_aarch64_f64\n#define xpowf nsimd_sleef_pow_u10d_aarch64_f32\n#define xsinh nsimd_sleef_sinh_u10d_aarch64_f64\n#define xsinhf nsimd_sleef_sinh_u10d_aarch64_f32\n#define xcosh nsimd_sleef_cosh_u10d_aarch64_f64\n#define xcoshf nsimd_sleef_cosh_u10d_aarch64_f32\n#define xtanh nsimd_sleef_tanh_u10d_aarch64_f64\n#define xtanhf nsimd_sleef_tanh_u10d_aarch64_f32\n#define xsinh_u35 nsimd_sleef_sinh_u35d_aarch64_f64\n#define xsinhf_u35 nsimd_sleef_sinh_u35d_aarch64_f32\n#define xcosh_u35 nsimd_sleef_cosh_u35d_aarch64_f64\n#define xcoshf_u35 nsimd_sleef_cosh_u35d_aarch64_f32\n#define xtanh_u35 nsimd_sleef_tanh_u35d_aarch64_f64\n#define xtanhf_u35 nsimd_sleef_tanh_u35d_aarch64_f32\n#define xfastsin_u3500 nsimd_sleef_fastsin_u3500d_aarch64_f64\n#define xfastsinf_u3500 nsimd_sleef_fastsin_u3500d_aarch64_f32\n#define xfastcos_u3500 nsimd_sleef_fastcos_u3500d_aarch64_f64\n#define xfastcosf_u3500 nsimd_sleef_fastcos_u3500d_aarch64_f32\n#define xfastpow_u3500 nsimd_sleef_fastpow_u3500d_aarch64_f64\n#define xfastpowf_u3500 nsimd_sleef_fastpow_u3500d_aarch64_f32\n#define xasinh nsimd_sleef_asinh_u10d_aarch64_f64\n#define xasinhf nsimd_sleef_asinh_u10d_aarch64_f32\n#define xacosh nsimd_sleef_acosh_u10d_aarch64_f64\n#define xacoshf nsimd_sleef_acosh_u10d_aarch64_f32\n#define xatanh nsimd_sleef_atanh_u10d_aarch64_f64\n#define xatanhf nsimd_sleef_atanh_u10d_aarch64_f32\n#define xexp2 nsimd_sleef_exp2_u10d_aarch64_f64\n#define xexp2f 
nsimd_sleef_exp2_u10d_aarch64_f32\n#define xexp2_u35 nsimd_sleef_exp2_u35d_aarch64_f64\n#define xexp2f_u35 nsimd_sleef_exp2_u35d_aarch64_f32\n#define xexp10 nsimd_sleef_exp10_u10d_aarch64_f64\n#define xexp10f nsimd_sleef_exp10_u10d_aarch64_f32\n#define xexp10_u35 nsimd_sleef_exp10_u35d_aarch64_f64\n#define xexp10f_u35 nsimd_sleef_exp10_u35d_aarch64_f32\n#define xexpm1 nsimd_sleef_expm1_u10d_aarch64_f64\n#define xexpm1f nsimd_sleef_expm1_u10d_aarch64_f32\n#define xlog10 nsimd_sleef_log10_u10d_aarch64_f64\n#define xlog10f nsimd_sleef_log10_u10d_aarch64_f32\n#define xlog2 nsimd_sleef_log2_u10d_aarch64_f64\n#define xlog2f nsimd_sleef_log2_u10d_aarch64_f32\n#define xlog2_u35 nsimd_sleef_log2_u35d_aarch64_f64\n#define xlog2f_u35 nsimd_sleef_log2_u35d_aarch64_f32\n#define xlog1p nsimd_sleef_log1p_u10d_aarch64_f64\n#define xlog1pf nsimd_sleef_log1p_u10d_aarch64_f32\n#define xsincospi_u05 nsimd_sleef_sincospi_u05d_aarch64_f64\n#define xsincospif_u05 nsimd_sleef_sincospi_u05d_aarch64_f32\n#define xsincospi_u35 nsimd_sleef_sincospi_u35d_aarch64_f64\n#define xsincospif_u35 nsimd_sleef_sincospi_u35d_aarch64_f32\n#define xsinpi_u05 nsimd_sleef_sinpi_u05d_aarch64_f64\n#define xsinpif_u05 nsimd_sleef_sinpi_u05d_aarch64_f32\n#define xcospi_u05 nsimd_sleef_cospi_u05d_aarch64_f64\n#define xcospif_u05 nsimd_sleef_cospi_u05d_aarch64_f32\n#define xldexp nsimd_sleef_ldexp_aarch64_f64\n#define xldexpf nsimd_sleef_ldexp_aarch64_f32\n#define xilogb nsimd_sleef_ilogb_aarch64_f64\n#define xilogbf nsimd_sleef_ilogb_aarch64_f32\n#define xfma nsimd_sleef_fma_aarch64_f64\n#define xfmaf nsimd_sleef_fma_aarch64_f32\n#define xsqrt nsimd_sleef_sqrt_aarch64_f64\n#define xsqrtf nsimd_sleef_sqrt_aarch64_f32\n#define xsqrt_u05 nsimd_sleef_sqrt_u05d_aarch64_f64\n#define xsqrtf_u05 nsimd_sleef_sqrt_u05d_aarch64_f32\n#define xsqrt_u35 nsimd_sleef_sqrt_u35d_aarch64_f64\n#define xsqrtf_u35 nsimd_sleef_sqrt_u35d_aarch64_f32\n#define xhypot_u05 nsimd_sleef_hypot_u05d_aarch64_f64\n#define xhypotf_u05 
nsimd_sleef_hypot_u05d_aarch64_f32\n#define xhypot_u35 nsimd_sleef_hypot_u35d_aarch64_f64\n#define xhypotf_u35 nsimd_sleef_hypot_u35d_aarch64_f32\n#define xfabs nsimd_sleef_fabs_aarch64_f64\n#define xfabsf nsimd_sleef_fabs_aarch64_f32\n#define xcopysign nsimd_sleef_copysign_aarch64_f64\n#define xcopysignf nsimd_sleef_copysign_aarch64_f32\n#define xfmax nsimd_sleef_fmax_aarch64_f64\n#define xfmaxf nsimd_sleef_fmax_aarch64_f32\n#define xfmin nsimd_sleef_fmin_aarch64_f64\n#define xfminf nsimd_sleef_fmin_aarch64_f32\n#define xfdim nsimd_sleef_fdim_aarch64_f64\n#define xfdimf nsimd_sleef_fdim_aarch64_f32\n#define xtrunc nsimd_sleef_trunc_aarch64_f64\n#define xtruncf nsimd_sleef_trunc_aarch64_f32\n#define xfloor nsimd_sleef_floor_aarch64_f64\n#define xfloorf nsimd_sleef_floor_aarch64_f32\n#define xceil nsimd_sleef_ceil_aarch64_f64\n#define xceilf nsimd_sleef_ceil_aarch64_f32\n#define xround nsimd_sleef_round_aarch64_f64\n#define xroundf nsimd_sleef_round_aarch64_f32\n#define xrint nsimd_sleef_rint_aarch64_f64\n#define xrintf nsimd_sleef_rint_aarch64_f32\n#define xnextafter nsimd_sleef_nextafter_aarch64_f64\n#define xnextafterf nsimd_sleef_nextafter_aarch64_f32\n#define xfrfrexp nsimd_sleef_frfrexp_aarch64_f64\n#define xfrfrexpf nsimd_sleef_frfrexp_aarch64_f32\n#define xexpfrexp nsimd_sleef_expfrexp_aarch64_f64\n#define xexpfrexpf nsimd_sleef_expfrexp_aarch64_f32\n#define xfmod nsimd_sleef_fmod_aarch64_f64\n#define xfmodf nsimd_sleef_fmod_aarch64_f32\n#define xremainder nsimd_sleef_remainder_aarch64_f64\n#define xremainderf nsimd_sleef_remainder_aarch64_f32\n#define xmodf nsimd_sleef_modf_aarch64_f64\n#define xmodff nsimd_sleef_modf_aarch64_f32\n#define xlgamma_u1 nsimd_sleef_lgamma_u10d_aarch64_f64\n#define xlgammaf_u1 nsimd_sleef_lgamma_u10d_aarch64_f32\n#define xtgamma_u1 nsimd_sleef_tgamma_u10d_aarch64_f64\n#define xtgammaf_u1 nsimd_sleef_tgamma_u10d_aarch64_f32\n#define xerf_u1 nsimd_sleef_erf_u10d_aarch64_f64\n#define xerff_u1 
nsimd_sleef_erf_u10d_aarch64_f32\n#define xerfc_u15 nsimd_sleef_erfc_u15d_aarch64_f64\n#define xerfcf_u15 nsimd_sleef_erfc_u15d_aarch64_f32\n#define xgetInt nsimd_sleef_getInt_aarch64_f64\n#define xgetIntf nsimd_sleef_getInt_aarch64_f32\n#define xgetPtr nsimd_sleef_getPtr_aarch64_f64\n#define xgetPtrf nsimd_sleef_getPtr_aarch64_f32\n\n                   #else\n\n                   #define xsin nsimd_sleef_sin_u35_aarch64_f64\n#define xsinf nsimd_sleef_sin_u35_aarch64_f32\n#define xcos nsimd_sleef_cos_u35_aarch64_f64\n#define xcosf nsimd_sleef_cos_u35_aarch64_f32\n#define xsincos nsimd_sleef_sincos_u35_aarch64_f64\n#define xsincosf nsimd_sleef_sincos_u35_aarch64_f32\n#define xtan nsimd_sleef_tan_u35_aarch64_f64\n#define xtanf nsimd_sleef_tan_u35_aarch64_f32\n#define xasin nsimd_sleef_asin_u35_aarch64_f64\n#define xasinf nsimd_sleef_asin_u35_aarch64_f32\n#define xacos nsimd_sleef_acos_u35_aarch64_f64\n#define xacosf nsimd_sleef_acos_u35_aarch64_f32\n#define xatan nsimd_sleef_atan_u35_aarch64_f64\n#define xatanf nsimd_sleef_atan_u35_aarch64_f32\n#define xatan2 nsimd_sleef_atan2_u35_aarch64_f64\n#define xatan2f nsimd_sleef_atan2_u35_aarch64_f32\n#define xlog nsimd_sleef_log_u35_aarch64_f64\n#define xlogf nsimd_sleef_log_u35_aarch64_f32\n#define xcbrt nsimd_sleef_cbrt_u35_aarch64_f64\n#define xcbrtf nsimd_sleef_cbrt_u35_aarch64_f32\n#define xsin_u1 nsimd_sleef_sin_u10_aarch64_f64\n#define xsinf_u1 nsimd_sleef_sin_u10_aarch64_f32\n#define xcos_u1 nsimd_sleef_cos_u10_aarch64_f64\n#define xcosf_u1 nsimd_sleef_cos_u10_aarch64_f32\n#define xsincos_u1 nsimd_sleef_sincos_u10_aarch64_f64\n#define xsincosf_u1 nsimd_sleef_sincos_u10_aarch64_f32\n#define xtan_u1 nsimd_sleef_tan_u10_aarch64_f64\n#define xtanf_u1 nsimd_sleef_tan_u10_aarch64_f32\n#define xasin_u1 nsimd_sleef_asin_u10_aarch64_f64\n#define xasinf_u1 nsimd_sleef_asin_u10_aarch64_f32\n#define xacos_u1 nsimd_sleef_acos_u10_aarch64_f64\n#define xacosf_u1 nsimd_sleef_acos_u10_aarch64_f32\n#define xatan_u1 
nsimd_sleef_atan_u10_aarch64_f64\n#define xatanf_u1 nsimd_sleef_atan_u10_aarch64_f32\n#define xatan2_u1 nsimd_sleef_atan2_u10_aarch64_f64\n#define xatan2f_u1 nsimd_sleef_atan2_u10_aarch64_f32\n#define xlog_u1 nsimd_sleef_log_u10_aarch64_f64\n#define xlogf_u1 nsimd_sleef_log_u10_aarch64_f32\n#define xcbrt_u1 nsimd_sleef_cbrt_u10_aarch64_f64\n#define xcbrtf_u1 nsimd_sleef_cbrt_u10_aarch64_f32\n#define xexp nsimd_sleef_exp_u10_aarch64_f64\n#define xexpf nsimd_sleef_exp_u10_aarch64_f32\n#define xpow nsimd_sleef_pow_u10_aarch64_f64\n#define xpowf nsimd_sleef_pow_u10_aarch64_f32\n#define xsinh nsimd_sleef_sinh_u10_aarch64_f64\n#define xsinhf nsimd_sleef_sinh_u10_aarch64_f32\n#define xcosh nsimd_sleef_cosh_u10_aarch64_f64\n#define xcoshf nsimd_sleef_cosh_u10_aarch64_f32\n#define xtanh nsimd_sleef_tanh_u10_aarch64_f64\n#define xtanhf nsimd_sleef_tanh_u10_aarch64_f32\n#define xsinh_u35 nsimd_sleef_sinh_u35_aarch64_f64\n#define xsinhf_u35 nsimd_sleef_sinh_u35_aarch64_f32\n#define xcosh_u35 nsimd_sleef_cosh_u35_aarch64_f64\n#define xcoshf_u35 nsimd_sleef_cosh_u35_aarch64_f32\n#define xtanh_u35 nsimd_sleef_tanh_u35_aarch64_f64\n#define xtanhf_u35 nsimd_sleef_tanh_u35_aarch64_f32\n#define xfastsin_u3500 nsimd_sleef_fastsin_u3500_aarch64_f64\n#define xfastsinf_u3500 nsimd_sleef_fastsin_u3500_aarch64_f32\n#define xfastcos_u3500 nsimd_sleef_fastcos_u3500_aarch64_f64\n#define xfastcosf_u3500 nsimd_sleef_fastcos_u3500_aarch64_f32\n#define xfastpow_u3500 nsimd_sleef_fastpow_u3500_aarch64_f64\n#define xfastpowf_u3500 nsimd_sleef_fastpow_u3500_aarch64_f32\n#define xasinh nsimd_sleef_asinh_u10_aarch64_f64\n#define xasinhf nsimd_sleef_asinh_u10_aarch64_f32\n#define xacosh nsimd_sleef_acosh_u10_aarch64_f64\n#define xacoshf nsimd_sleef_acosh_u10_aarch64_f32\n#define xatanh nsimd_sleef_atanh_u10_aarch64_f64\n#define xatanhf nsimd_sleef_atanh_u10_aarch64_f32\n#define xexp2 nsimd_sleef_exp2_u10_aarch64_f64\n#define xexp2f nsimd_sleef_exp2_u10_aarch64_f32\n#define xexp2_u35 
nsimd_sleef_exp2_u35_aarch64_f64\n#define xexp2f_u35 nsimd_sleef_exp2_u35_aarch64_f32\n#define xexp10 nsimd_sleef_exp10_u10_aarch64_f64\n#define xexp10f nsimd_sleef_exp10_u10_aarch64_f32\n#define xexp10_u35 nsimd_sleef_exp10_u35_aarch64_f64\n#define xexp10f_u35 nsimd_sleef_exp10_u35_aarch64_f32\n#define xexpm1 nsimd_sleef_expm1_u10_aarch64_f64\n#define xexpm1f nsimd_sleef_expm1_u10_aarch64_f32\n#define xlog10 nsimd_sleef_log10_u10_aarch64_f64\n#define xlog10f nsimd_sleef_log10_u10_aarch64_f32\n#define xlog2 nsimd_sleef_log2_u10_aarch64_f64\n#define xlog2f nsimd_sleef_log2_u10_aarch64_f32\n#define xlog2_u35 nsimd_sleef_log2_u35_aarch64_f64\n#define xlog2f_u35 nsimd_sleef_log2_u35_aarch64_f32\n#define xlog1p nsimd_sleef_log1p_u10_aarch64_f64\n#define xlog1pf nsimd_sleef_log1p_u10_aarch64_f32\n#define xsincospi_u05 nsimd_sleef_sincospi_u05_aarch64_f64\n#define xsincospif_u05 nsimd_sleef_sincospi_u05_aarch64_f32\n#define xsincospi_u35 nsimd_sleef_sincospi_u35_aarch64_f64\n#define xsincospif_u35 nsimd_sleef_sincospi_u35_aarch64_f32\n#define xsinpi_u05 nsimd_sleef_sinpi_u05_aarch64_f64\n#define xsinpif_u05 nsimd_sleef_sinpi_u05_aarch64_f32\n#define xcospi_u05 nsimd_sleef_cospi_u05_aarch64_f64\n#define xcospif_u05 nsimd_sleef_cospi_u05_aarch64_f32\n#define xldexp nsimd_sleef_ldexp_aarch64_f64\n#define xldexpf nsimd_sleef_ldexp_aarch64_f32\n#define xilogb nsimd_sleef_ilogb_aarch64_f64\n#define xilogbf nsimd_sleef_ilogb_aarch64_f32\n#define xfma nsimd_sleef_fma_aarch64_f64\n#define xfmaf nsimd_sleef_fma_aarch64_f32\n#define xsqrt nsimd_sleef_sqrt_aarch64_f64\n#define xsqrtf nsimd_sleef_sqrt_aarch64_f32\n#define xsqrt_u05 nsimd_sleef_sqrt_u05_aarch64_f64\n#define xsqrtf_u05 nsimd_sleef_sqrt_u05_aarch64_f32\n#define xsqrt_u35 nsimd_sleef_sqrt_u35_aarch64_f64\n#define xsqrtf_u35 nsimd_sleef_sqrt_u35_aarch64_f32\n#define xhypot_u05 nsimd_sleef_hypot_u05_aarch64_f64\n#define xhypotf_u05 nsimd_sleef_hypot_u05_aarch64_f32\n#define xhypot_u35 
nsimd_sleef_hypot_u35_aarch64_f64\n#define xhypotf_u35 nsimd_sleef_hypot_u35_aarch64_f32\n#define xfabs nsimd_sleef_fabs_aarch64_f64\n#define xfabsf nsimd_sleef_fabs_aarch64_f32\n#define xcopysign nsimd_sleef_copysign_aarch64_f64\n#define xcopysignf nsimd_sleef_copysign_aarch64_f32\n#define xfmax nsimd_sleef_fmax_aarch64_f64\n#define xfmaxf nsimd_sleef_fmax_aarch64_f32\n#define xfmin nsimd_sleef_fmin_aarch64_f64\n#define xfminf nsimd_sleef_fmin_aarch64_f32\n#define xfdim nsimd_sleef_fdim_aarch64_f64\n#define xfdimf nsimd_sleef_fdim_aarch64_f32\n#define xtrunc nsimd_sleef_trunc_aarch64_f64\n#define xtruncf nsimd_sleef_trunc_aarch64_f32\n#define xfloor nsimd_sleef_floor_aarch64_f64\n#define xfloorf nsimd_sleef_floor_aarch64_f32\n#define xceil nsimd_sleef_ceil_aarch64_f64\n#define xceilf nsimd_sleef_ceil_aarch64_f32\n#define xround nsimd_sleef_round_aarch64_f64\n#define xroundf nsimd_sleef_round_aarch64_f32\n#define xrint nsimd_sleef_rint_aarch64_f64\n#define xrintf nsimd_sleef_rint_aarch64_f32\n#define xnextafter nsimd_sleef_nextafter_aarch64_f64\n#define xnextafterf nsimd_sleef_nextafter_aarch64_f32\n#define xfrfrexp nsimd_sleef_frfrexp_aarch64_f64\n#define xfrfrexpf nsimd_sleef_frfrexp_aarch64_f32\n#define xexpfrexp nsimd_sleef_expfrexp_aarch64_f64\n#define xexpfrexpf nsimd_sleef_expfrexp_aarch64_f32\n#define xfmod nsimd_sleef_fmod_aarch64_f64\n#define xfmodf nsimd_sleef_fmod_aarch64_f32\n#define xremainder nsimd_sleef_remainder_aarch64_f64\n#define xremainderf nsimd_sleef_remainder_aarch64_f32\n#define xmodf nsimd_sleef_modf_aarch64_f64\n#define xmodff nsimd_sleef_modf_aarch64_f32\n#define xlgamma_u1 nsimd_sleef_lgamma_u10_aarch64_f64\n#define xlgammaf_u1 nsimd_sleef_lgamma_u10_aarch64_f32\n#define xtgamma_u1 nsimd_sleef_tgamma_u10_aarch64_f64\n#define xtgammaf_u1 nsimd_sleef_tgamma_u10_aarch64_f32\n#define xerf_u1 nsimd_sleef_erf_u10_aarch64_f64\n#define xerff_u1 nsimd_sleef_erf_u10_aarch64_f32\n#define xerfc_u15 nsimd_sleef_erfc_u15_aarch64_f64\n#define 
xerfcf_u15 nsimd_sleef_erfc_u15_aarch64_f32\n#define xgetInt nsimd_sleef_getInt_aarch64_f64\n#define xgetIntf nsimd_sleef_getInt_aarch64_f32\n#define xgetPtr nsimd_sleef_getPtr_aarch64_f64\n#define xgetPtrf nsimd_sleef_getPtr_aarch64_f32\n\n                   #endif\n\n                   #define rempi nsimd_sleef_rempi_aarch64\n                   #define rempif nsimd_sleef_rempif_aarch64\n                   #define rempisub nsimd_sleef_rempisub_aarch64\n                   #define rempisubf nsimd_sleef_rempisubf_aarch64\n                   #define gammak nsimd_gammak_aarch64\n                   #define gammafk nsimd_gammafk_aarch64\n\n                   #endif\n\n                   \n\n#endif\n\n"
  },
  {
    "path": "src/renameavx.h",
    "content": "#ifndef RENAMEAVX_H\n               #define RENAMEAVX_H\n\n               /* ------------------------------------------------------------------------- */\n                   /* Naming of functions avx */\n\n                   #ifdef NSIMD_AVX\n\n                   #ifdef DETERMINISTIC\n\n                   #define xsin nsimd_sleef_sin_u35d_avx_f64\n#define xsinf nsimd_sleef_sin_u35d_avx_f32\n#define xcos nsimd_sleef_cos_u35d_avx_f64\n#define xcosf nsimd_sleef_cos_u35d_avx_f32\n#define xsincos nsimd_sleef_sincos_u35d_avx_f64\n#define xsincosf nsimd_sleef_sincos_u35d_avx_f32\n#define xtan nsimd_sleef_tan_u35d_avx_f64\n#define xtanf nsimd_sleef_tan_u35d_avx_f32\n#define xasin nsimd_sleef_asin_u35d_avx_f64\n#define xasinf nsimd_sleef_asin_u35d_avx_f32\n#define xacos nsimd_sleef_acos_u35d_avx_f64\n#define xacosf nsimd_sleef_acos_u35d_avx_f32\n#define xatan nsimd_sleef_atan_u35d_avx_f64\n#define xatanf nsimd_sleef_atan_u35d_avx_f32\n#define xatan2 nsimd_sleef_atan2_u35d_avx_f64\n#define xatan2f nsimd_sleef_atan2_u35d_avx_f32\n#define xlog nsimd_sleef_log_u35d_avx_f64\n#define xlogf nsimd_sleef_log_u35d_avx_f32\n#define xcbrt nsimd_sleef_cbrt_u35d_avx_f64\n#define xcbrtf nsimd_sleef_cbrt_u35d_avx_f32\n#define xsin_u1 nsimd_sleef_sin_u10d_avx_f64\n#define xsinf_u1 nsimd_sleef_sin_u10d_avx_f32\n#define xcos_u1 nsimd_sleef_cos_u10d_avx_f64\n#define xcosf_u1 nsimd_sleef_cos_u10d_avx_f32\n#define xsincos_u1 nsimd_sleef_sincos_u10d_avx_f64\n#define xsincosf_u1 nsimd_sleef_sincos_u10d_avx_f32\n#define xtan_u1 nsimd_sleef_tan_u10d_avx_f64\n#define xtanf_u1 nsimd_sleef_tan_u10d_avx_f32\n#define xasin_u1 nsimd_sleef_asin_u10d_avx_f64\n#define xasinf_u1 nsimd_sleef_asin_u10d_avx_f32\n#define xacos_u1 nsimd_sleef_acos_u10d_avx_f64\n#define xacosf_u1 nsimd_sleef_acos_u10d_avx_f32\n#define xatan_u1 nsimd_sleef_atan_u10d_avx_f64\n#define xatanf_u1 nsimd_sleef_atan_u10d_avx_f32\n#define xatan2_u1 nsimd_sleef_atan2_u10d_avx_f64\n#define xatan2f_u1 
nsimd_sleef_atan2_u10d_avx_f32\n#define xlog_u1 nsimd_sleef_log_u10d_avx_f64\n#define xlogf_u1 nsimd_sleef_log_u10d_avx_f32\n#define xcbrt_u1 nsimd_sleef_cbrt_u10d_avx_f64\n#define xcbrtf_u1 nsimd_sleef_cbrt_u10d_avx_f32\n#define xexp nsimd_sleef_exp_u10d_avx_f64\n#define xexpf nsimd_sleef_exp_u10d_avx_f32\n#define xpow nsimd_sleef_pow_u10d_avx_f64\n#define xpowf nsimd_sleef_pow_u10d_avx_f32\n#define xsinh nsimd_sleef_sinh_u10d_avx_f64\n#define xsinhf nsimd_sleef_sinh_u10d_avx_f32\n#define xcosh nsimd_sleef_cosh_u10d_avx_f64\n#define xcoshf nsimd_sleef_cosh_u10d_avx_f32\n#define xtanh nsimd_sleef_tanh_u10d_avx_f64\n#define xtanhf nsimd_sleef_tanh_u10d_avx_f32\n#define xsinh_u35 nsimd_sleef_sinh_u35d_avx_f64\n#define xsinhf_u35 nsimd_sleef_sinh_u35d_avx_f32\n#define xcosh_u35 nsimd_sleef_cosh_u35d_avx_f64\n#define xcoshf_u35 nsimd_sleef_cosh_u35d_avx_f32\n#define xtanh_u35 nsimd_sleef_tanh_u35d_avx_f64\n#define xtanhf_u35 nsimd_sleef_tanh_u35d_avx_f32\n#define xfastsin_u3500 nsimd_sleef_fastsin_u3500d_avx_f64\n#define xfastsinf_u3500 nsimd_sleef_fastsin_u3500d_avx_f32\n#define xfastcos_u3500 nsimd_sleef_fastcos_u3500d_avx_f64\n#define xfastcosf_u3500 nsimd_sleef_fastcos_u3500d_avx_f32\n#define xfastpow_u3500 nsimd_sleef_fastpow_u3500d_avx_f64\n#define xfastpowf_u3500 nsimd_sleef_fastpow_u3500d_avx_f32\n#define xasinh nsimd_sleef_asinh_u10d_avx_f64\n#define xasinhf nsimd_sleef_asinh_u10d_avx_f32\n#define xacosh nsimd_sleef_acosh_u10d_avx_f64\n#define xacoshf nsimd_sleef_acosh_u10d_avx_f32\n#define xatanh nsimd_sleef_atanh_u10d_avx_f64\n#define xatanhf nsimd_sleef_atanh_u10d_avx_f32\n#define xexp2 nsimd_sleef_exp2_u10d_avx_f64\n#define xexp2f nsimd_sleef_exp2_u10d_avx_f32\n#define xexp2_u35 nsimd_sleef_exp2_u35d_avx_f64\n#define xexp2f_u35 nsimd_sleef_exp2_u35d_avx_f32\n#define xexp10 nsimd_sleef_exp10_u10d_avx_f64\n#define xexp10f nsimd_sleef_exp10_u10d_avx_f32\n#define xexp10_u35 nsimd_sleef_exp10_u35d_avx_f64\n#define xexp10f_u35 
nsimd_sleef_exp10_u35d_avx_f32\n#define xexpm1 nsimd_sleef_expm1_u10d_avx_f64\n#define xexpm1f nsimd_sleef_expm1_u10d_avx_f32\n#define xlog10 nsimd_sleef_log10_u10d_avx_f64\n#define xlog10f nsimd_sleef_log10_u10d_avx_f32\n#define xlog2 nsimd_sleef_log2_u10d_avx_f64\n#define xlog2f nsimd_sleef_log2_u10d_avx_f32\n#define xlog2_u35 nsimd_sleef_log2_u35d_avx_f64\n#define xlog2f_u35 nsimd_sleef_log2_u35d_avx_f32\n#define xlog1p nsimd_sleef_log1p_u10d_avx_f64\n#define xlog1pf nsimd_sleef_log1p_u10d_avx_f32\n#define xsincospi_u05 nsimd_sleef_sincospi_u05d_avx_f64\n#define xsincospif_u05 nsimd_sleef_sincospi_u05d_avx_f32\n#define xsincospi_u35 nsimd_sleef_sincospi_u35d_avx_f64\n#define xsincospif_u35 nsimd_sleef_sincospi_u35d_avx_f32\n#define xsinpi_u05 nsimd_sleef_sinpi_u05d_avx_f64\n#define xsinpif_u05 nsimd_sleef_sinpi_u05d_avx_f32\n#define xcospi_u05 nsimd_sleef_cospi_u05d_avx_f64\n#define xcospif_u05 nsimd_sleef_cospi_u05d_avx_f32\n#define xldexp nsimd_sleef_ldexp_avx_f64\n#define xldexpf nsimd_sleef_ldexp_avx_f32\n#define xilogb nsimd_sleef_ilogb_avx_f64\n#define xilogbf nsimd_sleef_ilogb_avx_f32\n#define xfma nsimd_sleef_fma_avx_f64\n#define xfmaf nsimd_sleef_fma_avx_f32\n#define xsqrt nsimd_sleef_sqrt_avx_f64\n#define xsqrtf nsimd_sleef_sqrt_avx_f32\n#define xsqrt_u05 nsimd_sleef_sqrt_u05d_avx_f64\n#define xsqrtf_u05 nsimd_sleef_sqrt_u05d_avx_f32\n#define xsqrt_u35 nsimd_sleef_sqrt_u35d_avx_f64\n#define xsqrtf_u35 nsimd_sleef_sqrt_u35d_avx_f32\n#define xhypot_u05 nsimd_sleef_hypot_u05d_avx_f64\n#define xhypotf_u05 nsimd_sleef_hypot_u05d_avx_f32\n#define xhypot_u35 nsimd_sleef_hypot_u35d_avx_f64\n#define xhypotf_u35 nsimd_sleef_hypot_u35d_avx_f32\n#define xfabs nsimd_sleef_fabs_avx_f64\n#define xfabsf nsimd_sleef_fabs_avx_f32\n#define xcopysign nsimd_sleef_copysign_avx_f64\n#define xcopysignf nsimd_sleef_copysign_avx_f32\n#define xfmax nsimd_sleef_fmax_avx_f64\n#define xfmaxf nsimd_sleef_fmax_avx_f32\n#define xfmin nsimd_sleef_fmin_avx_f64\n#define xfminf 
nsimd_sleef_fmin_avx_f32\n#define xfdim nsimd_sleef_fdim_avx_f64\n#define xfdimf nsimd_sleef_fdim_avx_f32\n#define xtrunc nsimd_sleef_trunc_avx_f64\n#define xtruncf nsimd_sleef_trunc_avx_f32\n#define xfloor nsimd_sleef_floor_avx_f64\n#define xfloorf nsimd_sleef_floor_avx_f32\n#define xceil nsimd_sleef_ceil_avx_f64\n#define xceilf nsimd_sleef_ceil_avx_f32\n#define xround nsimd_sleef_round_avx_f64\n#define xroundf nsimd_sleef_round_avx_f32\n#define xrint nsimd_sleef_rint_avx_f64\n#define xrintf nsimd_sleef_rint_avx_f32\n#define xnextafter nsimd_sleef_nextafter_avx_f64\n#define xnextafterf nsimd_sleef_nextafter_avx_f32\n#define xfrfrexp nsimd_sleef_frfrexp_avx_f64\n#define xfrfrexpf nsimd_sleef_frfrexp_avx_f32\n#define xexpfrexp nsimd_sleef_expfrexp_avx_f64\n#define xexpfrexpf nsimd_sleef_expfrexp_avx_f32\n#define xfmod nsimd_sleef_fmod_avx_f64\n#define xfmodf nsimd_sleef_fmod_avx_f32\n#define xremainder nsimd_sleef_remainder_avx_f64\n#define xremainderf nsimd_sleef_remainder_avx_f32\n#define xmodf nsimd_sleef_modf_avx_f64\n#define xmodff nsimd_sleef_modf_avx_f32\n#define xlgamma_u1 nsimd_sleef_lgamma_u10d_avx_f64\n#define xlgammaf_u1 nsimd_sleef_lgamma_u10d_avx_f32\n#define xtgamma_u1 nsimd_sleef_tgamma_u10d_avx_f64\n#define xtgammaf_u1 nsimd_sleef_tgamma_u10d_avx_f32\n#define xerf_u1 nsimd_sleef_erf_u10d_avx_f64\n#define xerff_u1 nsimd_sleef_erf_u10d_avx_f32\n#define xerfc_u15 nsimd_sleef_erfc_u15d_avx_f64\n#define xerfcf_u15 nsimd_sleef_erfc_u15d_avx_f32\n#define xgetInt nsimd_sleef_getInt_avx_f64\n#define xgetIntf nsimd_sleef_getInt_avx_f32\n#define xgetPtr nsimd_sleef_getPtr_avx_f64\n#define xgetPtrf nsimd_sleef_getPtr_avx_f32\n\n                   #else\n\n                   #define xsin nsimd_sleef_sin_u35_avx_f64\n#define xsinf nsimd_sleef_sin_u35_avx_f32\n#define xcos nsimd_sleef_cos_u35_avx_f64\n#define xcosf nsimd_sleef_cos_u35_avx_f32\n#define xsincos nsimd_sleef_sincos_u35_avx_f64\n#define xsincosf nsimd_sleef_sincos_u35_avx_f32\n#define xtan 
nsimd_sleef_tan_u35_avx_f64\n#define xtanf nsimd_sleef_tan_u35_avx_f32\n#define xasin nsimd_sleef_asin_u35_avx_f64\n#define xasinf nsimd_sleef_asin_u35_avx_f32\n#define xacos nsimd_sleef_acos_u35_avx_f64\n#define xacosf nsimd_sleef_acos_u35_avx_f32\n#define xatan nsimd_sleef_atan_u35_avx_f64\n#define xatanf nsimd_sleef_atan_u35_avx_f32\n#define xatan2 nsimd_sleef_atan2_u35_avx_f64\n#define xatan2f nsimd_sleef_atan2_u35_avx_f32\n#define xlog nsimd_sleef_log_u35_avx_f64\n#define xlogf nsimd_sleef_log_u35_avx_f32\n#define xcbrt nsimd_sleef_cbrt_u35_avx_f64\n#define xcbrtf nsimd_sleef_cbrt_u35_avx_f32\n#define xsin_u1 nsimd_sleef_sin_u10_avx_f64\n#define xsinf_u1 nsimd_sleef_sin_u10_avx_f32\n#define xcos_u1 nsimd_sleef_cos_u10_avx_f64\n#define xcosf_u1 nsimd_sleef_cos_u10_avx_f32\n#define xsincos_u1 nsimd_sleef_sincos_u10_avx_f64\n#define xsincosf_u1 nsimd_sleef_sincos_u10_avx_f32\n#define xtan_u1 nsimd_sleef_tan_u10_avx_f64\n#define xtanf_u1 nsimd_sleef_tan_u10_avx_f32\n#define xasin_u1 nsimd_sleef_asin_u10_avx_f64\n#define xasinf_u1 nsimd_sleef_asin_u10_avx_f32\n#define xacos_u1 nsimd_sleef_acos_u10_avx_f64\n#define xacosf_u1 nsimd_sleef_acos_u10_avx_f32\n#define xatan_u1 nsimd_sleef_atan_u10_avx_f64\n#define xatanf_u1 nsimd_sleef_atan_u10_avx_f32\n#define xatan2_u1 nsimd_sleef_atan2_u10_avx_f64\n#define xatan2f_u1 nsimd_sleef_atan2_u10_avx_f32\n#define xlog_u1 nsimd_sleef_log_u10_avx_f64\n#define xlogf_u1 nsimd_sleef_log_u10_avx_f32\n#define xcbrt_u1 nsimd_sleef_cbrt_u10_avx_f64\n#define xcbrtf_u1 nsimd_sleef_cbrt_u10_avx_f32\n#define xexp nsimd_sleef_exp_u10_avx_f64\n#define xexpf nsimd_sleef_exp_u10_avx_f32\n#define xpow nsimd_sleef_pow_u10_avx_f64\n#define xpowf nsimd_sleef_pow_u10_avx_f32\n#define xsinh nsimd_sleef_sinh_u10_avx_f64\n#define xsinhf nsimd_sleef_sinh_u10_avx_f32\n#define xcosh nsimd_sleef_cosh_u10_avx_f64\n#define xcoshf nsimd_sleef_cosh_u10_avx_f32\n#define xtanh nsimd_sleef_tanh_u10_avx_f64\n#define xtanhf nsimd_sleef_tanh_u10_avx_f32\n#define 
xsinh_u35 nsimd_sleef_sinh_u35_avx_f64\n#define xsinhf_u35 nsimd_sleef_sinh_u35_avx_f32\n#define xcosh_u35 nsimd_sleef_cosh_u35_avx_f64\n#define xcoshf_u35 nsimd_sleef_cosh_u35_avx_f32\n#define xtanh_u35 nsimd_sleef_tanh_u35_avx_f64\n#define xtanhf_u35 nsimd_sleef_tanh_u35_avx_f32\n#define xfastsin_u3500 nsimd_sleef_fastsin_u3500_avx_f64\n#define xfastsinf_u3500 nsimd_sleef_fastsin_u3500_avx_f32\n#define xfastcos_u3500 nsimd_sleef_fastcos_u3500_avx_f64\n#define xfastcosf_u3500 nsimd_sleef_fastcos_u3500_avx_f32\n#define xfastpow_u3500 nsimd_sleef_fastpow_u3500_avx_f64\n#define xfastpowf_u3500 nsimd_sleef_fastpow_u3500_avx_f32\n#define xasinh nsimd_sleef_asinh_u10_avx_f64\n#define xasinhf nsimd_sleef_asinh_u10_avx_f32\n#define xacosh nsimd_sleef_acosh_u10_avx_f64\n#define xacoshf nsimd_sleef_acosh_u10_avx_f32\n#define xatanh nsimd_sleef_atanh_u10_avx_f64\n#define xatanhf nsimd_sleef_atanh_u10_avx_f32\n#define xexp2 nsimd_sleef_exp2_u10_avx_f64\n#define xexp2f nsimd_sleef_exp2_u10_avx_f32\n#define xexp2_u35 nsimd_sleef_exp2_u35_avx_f64\n#define xexp2f_u35 nsimd_sleef_exp2_u35_avx_f32\n#define xexp10 nsimd_sleef_exp10_u10_avx_f64\n#define xexp10f nsimd_sleef_exp10_u10_avx_f32\n#define xexp10_u35 nsimd_sleef_exp10_u35_avx_f64\n#define xexp10f_u35 nsimd_sleef_exp10_u35_avx_f32\n#define xexpm1 nsimd_sleef_expm1_u10_avx_f64\n#define xexpm1f nsimd_sleef_expm1_u10_avx_f32\n#define xlog10 nsimd_sleef_log10_u10_avx_f64\n#define xlog10f nsimd_sleef_log10_u10_avx_f32\n#define xlog2 nsimd_sleef_log2_u10_avx_f64\n#define xlog2f nsimd_sleef_log2_u10_avx_f32\n#define xlog2_u35 nsimd_sleef_log2_u35_avx_f64\n#define xlog2f_u35 nsimd_sleef_log2_u35_avx_f32\n#define xlog1p nsimd_sleef_log1p_u10_avx_f64\n#define xlog1pf nsimd_sleef_log1p_u10_avx_f32\n#define xsincospi_u05 nsimd_sleef_sincospi_u05_avx_f64\n#define xsincospif_u05 nsimd_sleef_sincospi_u05_avx_f32\n#define xsincospi_u35 nsimd_sleef_sincospi_u35_avx_f64\n#define xsincospif_u35 nsimd_sleef_sincospi_u35_avx_f32\n#define 
xsinpi_u05 nsimd_sleef_sinpi_u05_avx_f64\n#define xsinpif_u05 nsimd_sleef_sinpi_u05_avx_f32\n#define xcospi_u05 nsimd_sleef_cospi_u05_avx_f64\n#define xcospif_u05 nsimd_sleef_cospi_u05_avx_f32\n#define xldexp nsimd_sleef_ldexp_avx_f64\n#define xldexpf nsimd_sleef_ldexp_avx_f32\n#define xilogb nsimd_sleef_ilogb_avx_f64\n#define xilogbf nsimd_sleef_ilogb_avx_f32\n#define xfma nsimd_sleef_fma_avx_f64\n#define xfmaf nsimd_sleef_fma_avx_f32\n#define xsqrt nsimd_sleef_sqrt_avx_f64\n#define xsqrtf nsimd_sleef_sqrt_avx_f32\n#define xsqrt_u05 nsimd_sleef_sqrt_u05_avx_f64\n#define xsqrtf_u05 nsimd_sleef_sqrt_u05_avx_f32\n#define xsqrt_u35 nsimd_sleef_sqrt_u35_avx_f64\n#define xsqrtf_u35 nsimd_sleef_sqrt_u35_avx_f32\n#define xhypot_u05 nsimd_sleef_hypot_u05_avx_f64\n#define xhypotf_u05 nsimd_sleef_hypot_u05_avx_f32\n#define xhypot_u35 nsimd_sleef_hypot_u35_avx_f64\n#define xhypotf_u35 nsimd_sleef_hypot_u35_avx_f32\n#define xfabs nsimd_sleef_fabs_avx_f64\n#define xfabsf nsimd_sleef_fabs_avx_f32\n#define xcopysign nsimd_sleef_copysign_avx_f64\n#define xcopysignf nsimd_sleef_copysign_avx_f32\n#define xfmax nsimd_sleef_fmax_avx_f64\n#define xfmaxf nsimd_sleef_fmax_avx_f32\n#define xfmin nsimd_sleef_fmin_avx_f64\n#define xfminf nsimd_sleef_fmin_avx_f32\n#define xfdim nsimd_sleef_fdim_avx_f64\n#define xfdimf nsimd_sleef_fdim_avx_f32\n#define xtrunc nsimd_sleef_trunc_avx_f64\n#define xtruncf nsimd_sleef_trunc_avx_f32\n#define xfloor nsimd_sleef_floor_avx_f64\n#define xfloorf nsimd_sleef_floor_avx_f32\n#define xceil nsimd_sleef_ceil_avx_f64\n#define xceilf nsimd_sleef_ceil_avx_f32\n#define xround nsimd_sleef_round_avx_f64\n#define xroundf nsimd_sleef_round_avx_f32\n#define xrint nsimd_sleef_rint_avx_f64\n#define xrintf nsimd_sleef_rint_avx_f32\n#define xnextafter nsimd_sleef_nextafter_avx_f64\n#define xnextafterf nsimd_sleef_nextafter_avx_f32\n#define xfrfrexp nsimd_sleef_frfrexp_avx_f64\n#define xfrfrexpf nsimd_sleef_frfrexp_avx_f32\n#define xexpfrexp 
nsimd_sleef_expfrexp_avx_f64\n#define xexpfrexpf nsimd_sleef_expfrexp_avx_f32\n#define xfmod nsimd_sleef_fmod_avx_f64\n#define xfmodf nsimd_sleef_fmod_avx_f32\n#define xremainder nsimd_sleef_remainder_avx_f64\n#define xremainderf nsimd_sleef_remainder_avx_f32\n#define xmodf nsimd_sleef_modf_avx_f64\n#define xmodff nsimd_sleef_modf_avx_f32\n#define xlgamma_u1 nsimd_sleef_lgamma_u10_avx_f64\n#define xlgammaf_u1 nsimd_sleef_lgamma_u10_avx_f32\n#define xtgamma_u1 nsimd_sleef_tgamma_u10_avx_f64\n#define xtgammaf_u1 nsimd_sleef_tgamma_u10_avx_f32\n#define xerf_u1 nsimd_sleef_erf_u10_avx_f64\n#define xerff_u1 nsimd_sleef_erf_u10_avx_f32\n#define xerfc_u15 nsimd_sleef_erfc_u15_avx_f64\n#define xerfcf_u15 nsimd_sleef_erfc_u15_avx_f32\n#define xgetInt nsimd_sleef_getInt_avx_f64\n#define xgetIntf nsimd_sleef_getInt_avx_f32\n#define xgetPtr nsimd_sleef_getPtr_avx_f64\n#define xgetPtrf nsimd_sleef_getPtr_avx_f32\n\n                   #endif\n\n                   #define rempi nsimd_sleef_rempi_avx\n                   #define rempif nsimd_sleef_rempif_avx\n                   #define rempisub nsimd_sleef_rempisub_avx\n                   #define rempisubf nsimd_sleef_rempisubf_avx\n                   #define gammak nsimd_gammak_avx\n                   #define gammafk nsimd_gammafk_avx\n\n                   #endif\n\n                   \n\n#endif\n\n"
  },
  {
    "path": "src/renameavx2.h",
    "content": "#ifndef RENAMEAVX2_H\n               #define RENAMEAVX2_H\n\n               /* ------------------------------------------------------------------------- */\n                   /* Naming of functions avx2 */\n\n                   #ifdef NSIMD_AVX2\n\n                   #ifdef DETERMINISTIC\n\n                   #define xsin nsimd_sleef_sin_u35d_avx2_f64\n#define xsinf nsimd_sleef_sin_u35d_avx2_f32\n#define xcos nsimd_sleef_cos_u35d_avx2_f64\n#define xcosf nsimd_sleef_cos_u35d_avx2_f32\n#define xsincos nsimd_sleef_sincos_u35d_avx2_f64\n#define xsincosf nsimd_sleef_sincos_u35d_avx2_f32\n#define xtan nsimd_sleef_tan_u35d_avx2_f64\n#define xtanf nsimd_sleef_tan_u35d_avx2_f32\n#define xasin nsimd_sleef_asin_u35d_avx2_f64\n#define xasinf nsimd_sleef_asin_u35d_avx2_f32\n#define xacos nsimd_sleef_acos_u35d_avx2_f64\n#define xacosf nsimd_sleef_acos_u35d_avx2_f32\n#define xatan nsimd_sleef_atan_u35d_avx2_f64\n#define xatanf nsimd_sleef_atan_u35d_avx2_f32\n#define xatan2 nsimd_sleef_atan2_u35d_avx2_f64\n#define xatan2f nsimd_sleef_atan2_u35d_avx2_f32\n#define xlog nsimd_sleef_log_u35d_avx2_f64\n#define xlogf nsimd_sleef_log_u35d_avx2_f32\n#define xcbrt nsimd_sleef_cbrt_u35d_avx2_f64\n#define xcbrtf nsimd_sleef_cbrt_u35d_avx2_f32\n#define xsin_u1 nsimd_sleef_sin_u10d_avx2_f64\n#define xsinf_u1 nsimd_sleef_sin_u10d_avx2_f32\n#define xcos_u1 nsimd_sleef_cos_u10d_avx2_f64\n#define xcosf_u1 nsimd_sleef_cos_u10d_avx2_f32\n#define xsincos_u1 nsimd_sleef_sincos_u10d_avx2_f64\n#define xsincosf_u1 nsimd_sleef_sincos_u10d_avx2_f32\n#define xtan_u1 nsimd_sleef_tan_u10d_avx2_f64\n#define xtanf_u1 nsimd_sleef_tan_u10d_avx2_f32\n#define xasin_u1 nsimd_sleef_asin_u10d_avx2_f64\n#define xasinf_u1 nsimd_sleef_asin_u10d_avx2_f32\n#define xacos_u1 nsimd_sleef_acos_u10d_avx2_f64\n#define xacosf_u1 nsimd_sleef_acos_u10d_avx2_f32\n#define xatan_u1 nsimd_sleef_atan_u10d_avx2_f64\n#define xatanf_u1 nsimd_sleef_atan_u10d_avx2_f32\n#define xatan2_u1 
nsimd_sleef_atan2_u10d_avx2_f64\n#define xatan2f_u1 nsimd_sleef_atan2_u10d_avx2_f32\n#define xlog_u1 nsimd_sleef_log_u10d_avx2_f64\n#define xlogf_u1 nsimd_sleef_log_u10d_avx2_f32\n#define xcbrt_u1 nsimd_sleef_cbrt_u10d_avx2_f64\n#define xcbrtf_u1 nsimd_sleef_cbrt_u10d_avx2_f32\n#define xexp nsimd_sleef_exp_u10d_avx2_f64\n#define xexpf nsimd_sleef_exp_u10d_avx2_f32\n#define xpow nsimd_sleef_pow_u10d_avx2_f64\n#define xpowf nsimd_sleef_pow_u10d_avx2_f32\n#define xsinh nsimd_sleef_sinh_u10d_avx2_f64\n#define xsinhf nsimd_sleef_sinh_u10d_avx2_f32\n#define xcosh nsimd_sleef_cosh_u10d_avx2_f64\n#define xcoshf nsimd_sleef_cosh_u10d_avx2_f32\n#define xtanh nsimd_sleef_tanh_u10d_avx2_f64\n#define xtanhf nsimd_sleef_tanh_u10d_avx2_f32\n#define xsinh_u35 nsimd_sleef_sinh_u35d_avx2_f64\n#define xsinhf_u35 nsimd_sleef_sinh_u35d_avx2_f32\n#define xcosh_u35 nsimd_sleef_cosh_u35d_avx2_f64\n#define xcoshf_u35 nsimd_sleef_cosh_u35d_avx2_f32\n#define xtanh_u35 nsimd_sleef_tanh_u35d_avx2_f64\n#define xtanhf_u35 nsimd_sleef_tanh_u35d_avx2_f32\n#define xfastsin_u3500 nsimd_sleef_fastsin_u3500d_avx2_f64\n#define xfastsinf_u3500 nsimd_sleef_fastsin_u3500d_avx2_f32\n#define xfastcos_u3500 nsimd_sleef_fastcos_u3500d_avx2_f64\n#define xfastcosf_u3500 nsimd_sleef_fastcos_u3500d_avx2_f32\n#define xfastpow_u3500 nsimd_sleef_fastpow_u3500d_avx2_f64\n#define xfastpowf_u3500 nsimd_sleef_fastpow_u3500d_avx2_f32\n#define xasinh nsimd_sleef_asinh_u10d_avx2_f64\n#define xasinhf nsimd_sleef_asinh_u10d_avx2_f32\n#define xacosh nsimd_sleef_acosh_u10d_avx2_f64\n#define xacoshf nsimd_sleef_acosh_u10d_avx2_f32\n#define xatanh nsimd_sleef_atanh_u10d_avx2_f64\n#define xatanhf nsimd_sleef_atanh_u10d_avx2_f32\n#define xexp2 nsimd_sleef_exp2_u10d_avx2_f64\n#define xexp2f nsimd_sleef_exp2_u10d_avx2_f32\n#define xexp2_u35 nsimd_sleef_exp2_u35d_avx2_f64\n#define xexp2f_u35 nsimd_sleef_exp2_u35d_avx2_f32\n#define xexp10 nsimd_sleef_exp10_u10d_avx2_f64\n#define xexp10f nsimd_sleef_exp10_u10d_avx2_f32\n#define 
xexp10_u35 nsimd_sleef_exp10_u35d_avx2_f64\n#define xexp10f_u35 nsimd_sleef_exp10_u35d_avx2_f32\n#define xexpm1 nsimd_sleef_expm1_u10d_avx2_f64\n#define xexpm1f nsimd_sleef_expm1_u10d_avx2_f32\n#define xlog10 nsimd_sleef_log10_u10d_avx2_f64\n#define xlog10f nsimd_sleef_log10_u10d_avx2_f32\n#define xlog2 nsimd_sleef_log2_u10d_avx2_f64\n#define xlog2f nsimd_sleef_log2_u10d_avx2_f32\n#define xlog2_u35 nsimd_sleef_log2_u35d_avx2_f64\n#define xlog2f_u35 nsimd_sleef_log2_u35d_avx2_f32\n#define xlog1p nsimd_sleef_log1p_u10d_avx2_f64\n#define xlog1pf nsimd_sleef_log1p_u10d_avx2_f32\n#define xsincospi_u05 nsimd_sleef_sincospi_u05d_avx2_f64\n#define xsincospif_u05 nsimd_sleef_sincospi_u05d_avx2_f32\n#define xsincospi_u35 nsimd_sleef_sincospi_u35d_avx2_f64\n#define xsincospif_u35 nsimd_sleef_sincospi_u35d_avx2_f32\n#define xsinpi_u05 nsimd_sleef_sinpi_u05d_avx2_f64\n#define xsinpif_u05 nsimd_sleef_sinpi_u05d_avx2_f32\n#define xcospi_u05 nsimd_sleef_cospi_u05d_avx2_f64\n#define xcospif_u05 nsimd_sleef_cospi_u05d_avx2_f32\n#define xldexp nsimd_sleef_ldexp_avx2_f64\n#define xldexpf nsimd_sleef_ldexp_avx2_f32\n#define xilogb nsimd_sleef_ilogb_avx2_f64\n#define xilogbf nsimd_sleef_ilogb_avx2_f32\n#define xfma nsimd_sleef_fma_avx2_f64\n#define xfmaf nsimd_sleef_fma_avx2_f32\n#define xsqrt nsimd_sleef_sqrt_avx2_f64\n#define xsqrtf nsimd_sleef_sqrt_avx2_f32\n#define xsqrt_u05 nsimd_sleef_sqrt_u05d_avx2_f64\n#define xsqrtf_u05 nsimd_sleef_sqrt_u05d_avx2_f32\n#define xsqrt_u35 nsimd_sleef_sqrt_u35d_avx2_f64\n#define xsqrtf_u35 nsimd_sleef_sqrt_u35d_avx2_f32\n#define xhypot_u05 nsimd_sleef_hypot_u05d_avx2_f64\n#define xhypotf_u05 nsimd_sleef_hypot_u05d_avx2_f32\n#define xhypot_u35 nsimd_sleef_hypot_u35d_avx2_f64\n#define xhypotf_u35 nsimd_sleef_hypot_u35d_avx2_f32\n#define xfabs nsimd_sleef_fabs_avx2_f64\n#define xfabsf nsimd_sleef_fabs_avx2_f32\n#define xcopysign nsimd_sleef_copysign_avx2_f64\n#define xcopysignf nsimd_sleef_copysign_avx2_f32\n#define xfmax 
nsimd_sleef_fmax_avx2_f64\n#define xfmaxf nsimd_sleef_fmax_avx2_f32\n#define xfmin nsimd_sleef_fmin_avx2_f64\n#define xfminf nsimd_sleef_fmin_avx2_f32\n#define xfdim nsimd_sleef_fdim_avx2_f64\n#define xfdimf nsimd_sleef_fdim_avx2_f32\n#define xtrunc nsimd_sleef_trunc_avx2_f64\n#define xtruncf nsimd_sleef_trunc_avx2_f32\n#define xfloor nsimd_sleef_floor_avx2_f64\n#define xfloorf nsimd_sleef_floor_avx2_f32\n#define xceil nsimd_sleef_ceil_avx2_f64\n#define xceilf nsimd_sleef_ceil_avx2_f32\n#define xround nsimd_sleef_round_avx2_f64\n#define xroundf nsimd_sleef_round_avx2_f32\n#define xrint nsimd_sleef_rint_avx2_f64\n#define xrintf nsimd_sleef_rint_avx2_f32\n#define xnextafter nsimd_sleef_nextafter_avx2_f64\n#define xnextafterf nsimd_sleef_nextafter_avx2_f32\n#define xfrfrexp nsimd_sleef_frfrexp_avx2_f64\n#define xfrfrexpf nsimd_sleef_frfrexp_avx2_f32\n#define xexpfrexp nsimd_sleef_expfrexp_avx2_f64\n#define xexpfrexpf nsimd_sleef_expfrexp_avx2_f32\n#define xfmod nsimd_sleef_fmod_avx2_f64\n#define xfmodf nsimd_sleef_fmod_avx2_f32\n#define xremainder nsimd_sleef_remainder_avx2_f64\n#define xremainderf nsimd_sleef_remainder_avx2_f32\n#define xmodf nsimd_sleef_modf_avx2_f64\n#define xmodff nsimd_sleef_modf_avx2_f32\n#define xlgamma_u1 nsimd_sleef_lgamma_u10d_avx2_f64\n#define xlgammaf_u1 nsimd_sleef_lgamma_u10d_avx2_f32\n#define xtgamma_u1 nsimd_sleef_tgamma_u10d_avx2_f64\n#define xtgammaf_u1 nsimd_sleef_tgamma_u10d_avx2_f32\n#define xerf_u1 nsimd_sleef_erf_u10d_avx2_f64\n#define xerff_u1 nsimd_sleef_erf_u10d_avx2_f32\n#define xerfc_u15 nsimd_sleef_erfc_u15d_avx2_f64\n#define xerfcf_u15 nsimd_sleef_erfc_u15d_avx2_f32\n#define xgetInt nsimd_sleef_getInt_avx2_f64\n#define xgetIntf nsimd_sleef_getInt_avx2_f32\n#define xgetPtr nsimd_sleef_getPtr_avx2_f64\n#define xgetPtrf nsimd_sleef_getPtr_avx2_f32\n\n                   #else\n\n                   #define xsin nsimd_sleef_sin_u35_avx2_f64\n#define xsinf nsimd_sleef_sin_u35_avx2_f32\n#define xcos 
nsimd_sleef_cos_u35_avx2_f64\n#define xcosf nsimd_sleef_cos_u35_avx2_f32\n#define xsincos nsimd_sleef_sincos_u35_avx2_f64\n#define xsincosf nsimd_sleef_sincos_u35_avx2_f32\n#define xtan nsimd_sleef_tan_u35_avx2_f64\n#define xtanf nsimd_sleef_tan_u35_avx2_f32\n#define xasin nsimd_sleef_asin_u35_avx2_f64\n#define xasinf nsimd_sleef_asin_u35_avx2_f32\n#define xacos nsimd_sleef_acos_u35_avx2_f64\n#define xacosf nsimd_sleef_acos_u35_avx2_f32\n#define xatan nsimd_sleef_atan_u35_avx2_f64\n#define xatanf nsimd_sleef_atan_u35_avx2_f32\n#define xatan2 nsimd_sleef_atan2_u35_avx2_f64\n#define xatan2f nsimd_sleef_atan2_u35_avx2_f32\n#define xlog nsimd_sleef_log_u35_avx2_f64\n#define xlogf nsimd_sleef_log_u35_avx2_f32\n#define xcbrt nsimd_sleef_cbrt_u35_avx2_f64\n#define xcbrtf nsimd_sleef_cbrt_u35_avx2_f32\n#define xsin_u1 nsimd_sleef_sin_u10_avx2_f64\n#define xsinf_u1 nsimd_sleef_sin_u10_avx2_f32\n#define xcos_u1 nsimd_sleef_cos_u10_avx2_f64\n#define xcosf_u1 nsimd_sleef_cos_u10_avx2_f32\n#define xsincos_u1 nsimd_sleef_sincos_u10_avx2_f64\n#define xsincosf_u1 nsimd_sleef_sincos_u10_avx2_f32\n#define xtan_u1 nsimd_sleef_tan_u10_avx2_f64\n#define xtanf_u1 nsimd_sleef_tan_u10_avx2_f32\n#define xasin_u1 nsimd_sleef_asin_u10_avx2_f64\n#define xasinf_u1 nsimd_sleef_asin_u10_avx2_f32\n#define xacos_u1 nsimd_sleef_acos_u10_avx2_f64\n#define xacosf_u1 nsimd_sleef_acos_u10_avx2_f32\n#define xatan_u1 nsimd_sleef_atan_u10_avx2_f64\n#define xatanf_u1 nsimd_sleef_atan_u10_avx2_f32\n#define xatan2_u1 nsimd_sleef_atan2_u10_avx2_f64\n#define xatan2f_u1 nsimd_sleef_atan2_u10_avx2_f32\n#define xlog_u1 nsimd_sleef_log_u10_avx2_f64\n#define xlogf_u1 nsimd_sleef_log_u10_avx2_f32\n#define xcbrt_u1 nsimd_sleef_cbrt_u10_avx2_f64\n#define xcbrtf_u1 nsimd_sleef_cbrt_u10_avx2_f32\n#define xexp nsimd_sleef_exp_u10_avx2_f64\n#define xexpf nsimd_sleef_exp_u10_avx2_f32\n#define xpow nsimd_sleef_pow_u10_avx2_f64\n#define xpowf nsimd_sleef_pow_u10_avx2_f32\n#define xsinh nsimd_sleef_sinh_u10_avx2_f64\n#define 
xsinhf nsimd_sleef_sinh_u10_avx2_f32\n#define xcosh nsimd_sleef_cosh_u10_avx2_f64\n#define xcoshf nsimd_sleef_cosh_u10_avx2_f32\n#define xtanh nsimd_sleef_tanh_u10_avx2_f64\n#define xtanhf nsimd_sleef_tanh_u10_avx2_f32\n#define xsinh_u35 nsimd_sleef_sinh_u35_avx2_f64\n#define xsinhf_u35 nsimd_sleef_sinh_u35_avx2_f32\n#define xcosh_u35 nsimd_sleef_cosh_u35_avx2_f64\n#define xcoshf_u35 nsimd_sleef_cosh_u35_avx2_f32\n#define xtanh_u35 nsimd_sleef_tanh_u35_avx2_f64\n#define xtanhf_u35 nsimd_sleef_tanh_u35_avx2_f32\n#define xfastsin_u3500 nsimd_sleef_fastsin_u3500_avx2_f64\n#define xfastsinf_u3500 nsimd_sleef_fastsin_u3500_avx2_f32\n#define xfastcos_u3500 nsimd_sleef_fastcos_u3500_avx2_f64\n#define xfastcosf_u3500 nsimd_sleef_fastcos_u3500_avx2_f32\n#define xfastpow_u3500 nsimd_sleef_fastpow_u3500_avx2_f64\n#define xfastpowf_u3500 nsimd_sleef_fastpow_u3500_avx2_f32\n#define xasinh nsimd_sleef_asinh_u10_avx2_f64\n#define xasinhf nsimd_sleef_asinh_u10_avx2_f32\n#define xacosh nsimd_sleef_acosh_u10_avx2_f64\n#define xacoshf nsimd_sleef_acosh_u10_avx2_f32\n#define xatanh nsimd_sleef_atanh_u10_avx2_f64\n#define xatanhf nsimd_sleef_atanh_u10_avx2_f32\n#define xexp2 nsimd_sleef_exp2_u10_avx2_f64\n#define xexp2f nsimd_sleef_exp2_u10_avx2_f32\n#define xexp2_u35 nsimd_sleef_exp2_u35_avx2_f64\n#define xexp2f_u35 nsimd_sleef_exp2_u35_avx2_f32\n#define xexp10 nsimd_sleef_exp10_u10_avx2_f64\n#define xexp10f nsimd_sleef_exp10_u10_avx2_f32\n#define xexp10_u35 nsimd_sleef_exp10_u35_avx2_f64\n#define xexp10f_u35 nsimd_sleef_exp10_u35_avx2_f32\n#define xexpm1 nsimd_sleef_expm1_u10_avx2_f64\n#define xexpm1f nsimd_sleef_expm1_u10_avx2_f32\n#define xlog10 nsimd_sleef_log10_u10_avx2_f64\n#define xlog10f nsimd_sleef_log10_u10_avx2_f32\n#define xlog2 nsimd_sleef_log2_u10_avx2_f64\n#define xlog2f nsimd_sleef_log2_u10_avx2_f32\n#define xlog2_u35 nsimd_sleef_log2_u35_avx2_f64\n#define xlog2f_u35 nsimd_sleef_log2_u35_avx2_f32\n#define xlog1p nsimd_sleef_log1p_u10_avx2_f64\n#define xlog1pf 
nsimd_sleef_log1p_u10_avx2_f32\n#define xsincospi_u05 nsimd_sleef_sincospi_u05_avx2_f64\n#define xsincospif_u05 nsimd_sleef_sincospi_u05_avx2_f32\n#define xsincospi_u35 nsimd_sleef_sincospi_u35_avx2_f64\n#define xsincospif_u35 nsimd_sleef_sincospi_u35_avx2_f32\n#define xsinpi_u05 nsimd_sleef_sinpi_u05_avx2_f64\n#define xsinpif_u05 nsimd_sleef_sinpi_u05_avx2_f32\n#define xcospi_u05 nsimd_sleef_cospi_u05_avx2_f64\n#define xcospif_u05 nsimd_sleef_cospi_u05_avx2_f32\n#define xldexp nsimd_sleef_ldexp_avx2_f64\n#define xldexpf nsimd_sleef_ldexp_avx2_f32\n#define xilogb nsimd_sleef_ilogb_avx2_f64\n#define xilogbf nsimd_sleef_ilogb_avx2_f32\n#define xfma nsimd_sleef_fma_avx2_f64\n#define xfmaf nsimd_sleef_fma_avx2_f32\n#define xsqrt nsimd_sleef_sqrt_avx2_f64\n#define xsqrtf nsimd_sleef_sqrt_avx2_f32\n#define xsqrt_u05 nsimd_sleef_sqrt_u05_avx2_f64\n#define xsqrtf_u05 nsimd_sleef_sqrt_u05_avx2_f32\n#define xsqrt_u35 nsimd_sleef_sqrt_u35_avx2_f64\n#define xsqrtf_u35 nsimd_sleef_sqrt_u35_avx2_f32\n#define xhypot_u05 nsimd_sleef_hypot_u05_avx2_f64\n#define xhypotf_u05 nsimd_sleef_hypot_u05_avx2_f32\n#define xhypot_u35 nsimd_sleef_hypot_u35_avx2_f64\n#define xhypotf_u35 nsimd_sleef_hypot_u35_avx2_f32\n#define xfabs nsimd_sleef_fabs_avx2_f64\n#define xfabsf nsimd_sleef_fabs_avx2_f32\n#define xcopysign nsimd_sleef_copysign_avx2_f64\n#define xcopysignf nsimd_sleef_copysign_avx2_f32\n#define xfmax nsimd_sleef_fmax_avx2_f64\n#define xfmaxf nsimd_sleef_fmax_avx2_f32\n#define xfmin nsimd_sleef_fmin_avx2_f64\n#define xfminf nsimd_sleef_fmin_avx2_f32\n#define xfdim nsimd_sleef_fdim_avx2_f64\n#define xfdimf nsimd_sleef_fdim_avx2_f32\n#define xtrunc nsimd_sleef_trunc_avx2_f64\n#define xtruncf nsimd_sleef_trunc_avx2_f32\n#define xfloor nsimd_sleef_floor_avx2_f64\n#define xfloorf nsimd_sleef_floor_avx2_f32\n#define xceil nsimd_sleef_ceil_avx2_f64\n#define xceilf nsimd_sleef_ceil_avx2_f32\n#define xround nsimd_sleef_round_avx2_f64\n#define xroundf nsimd_sleef_round_avx2_f32\n#define xrint 
nsimd_sleef_rint_avx2_f64\n#define xrintf nsimd_sleef_rint_avx2_f32\n#define xnextafter nsimd_sleef_nextafter_avx2_f64\n#define xnextafterf nsimd_sleef_nextafter_avx2_f32\n#define xfrfrexp nsimd_sleef_frfrexp_avx2_f64\n#define xfrfrexpf nsimd_sleef_frfrexp_avx2_f32\n#define xexpfrexp nsimd_sleef_expfrexp_avx2_f64\n#define xexpfrexpf nsimd_sleef_expfrexp_avx2_f32\n#define xfmod nsimd_sleef_fmod_avx2_f64\n#define xfmodf nsimd_sleef_fmod_avx2_f32\n#define xremainder nsimd_sleef_remainder_avx2_f64\n#define xremainderf nsimd_sleef_remainder_avx2_f32\n#define xmodf nsimd_sleef_modf_avx2_f64\n#define xmodff nsimd_sleef_modf_avx2_f32\n#define xlgamma_u1 nsimd_sleef_lgamma_u10_avx2_f64\n#define xlgammaf_u1 nsimd_sleef_lgamma_u10_avx2_f32\n#define xtgamma_u1 nsimd_sleef_tgamma_u10_avx2_f64\n#define xtgammaf_u1 nsimd_sleef_tgamma_u10_avx2_f32\n#define xerf_u1 nsimd_sleef_erf_u10_avx2_f64\n#define xerff_u1 nsimd_sleef_erf_u10_avx2_f32\n#define xerfc_u15 nsimd_sleef_erfc_u15_avx2_f64\n#define xerfcf_u15 nsimd_sleef_erfc_u15_avx2_f32\n#define xgetInt nsimd_sleef_getInt_avx2_f64\n#define xgetIntf nsimd_sleef_getInt_avx2_f32\n#define xgetPtr nsimd_sleef_getPtr_avx2_f64\n#define xgetPtrf nsimd_sleef_getPtr_avx2_f32\n\n                   #endif\n\n                   #define rempi nsimd_sleef_rempi_avx2\n                   #define rempif nsimd_sleef_rempif_avx2\n                   #define rempisub nsimd_sleef_rempisub_avx2\n                   #define rempisubf nsimd_sleef_rempisubf_avx2\n                   #define gammak nsimd_gammak_avx2\n                   #define gammafk nsimd_gammafk_avx2\n\n                   #endif\n\n                   \n\n#endif\n\n"
  },
  {
    "path": "src/renameavx512f.h",
    "content": "#ifndef RENAMEAVX512F_H\n               #define RENAMEAVX512F_H\n\n               /* ------------------------------------------------------------------------- */\n                   /* Naming of functions avx512_knl */\n\n                   #ifdef NSIMD_AVX512_KNL\n\n                   #ifdef DETERMINISTIC\n\n                   #define xsin nsimd_sleef_sin_u35d_avx512_knl_f64\n#define xsinf nsimd_sleef_sin_u35d_avx512_knl_f32\n#define xcos nsimd_sleef_cos_u35d_avx512_knl_f64\n#define xcosf nsimd_sleef_cos_u35d_avx512_knl_f32\n#define xsincos nsimd_sleef_sincos_u35d_avx512_knl_f64\n#define xsincosf nsimd_sleef_sincos_u35d_avx512_knl_f32\n#define xtan nsimd_sleef_tan_u35d_avx512_knl_f64\n#define xtanf nsimd_sleef_tan_u35d_avx512_knl_f32\n#define xasin nsimd_sleef_asin_u35d_avx512_knl_f64\n#define xasinf nsimd_sleef_asin_u35d_avx512_knl_f32\n#define xacos nsimd_sleef_acos_u35d_avx512_knl_f64\n#define xacosf nsimd_sleef_acos_u35d_avx512_knl_f32\n#define xatan nsimd_sleef_atan_u35d_avx512_knl_f64\n#define xatanf nsimd_sleef_atan_u35d_avx512_knl_f32\n#define xatan2 nsimd_sleef_atan2_u35d_avx512_knl_f64\n#define xatan2f nsimd_sleef_atan2_u35d_avx512_knl_f32\n#define xlog nsimd_sleef_log_u35d_avx512_knl_f64\n#define xlogf nsimd_sleef_log_u35d_avx512_knl_f32\n#define xcbrt nsimd_sleef_cbrt_u35d_avx512_knl_f64\n#define xcbrtf nsimd_sleef_cbrt_u35d_avx512_knl_f32\n#define xsin_u1 nsimd_sleef_sin_u10d_avx512_knl_f64\n#define xsinf_u1 nsimd_sleef_sin_u10d_avx512_knl_f32\n#define xcos_u1 nsimd_sleef_cos_u10d_avx512_knl_f64\n#define xcosf_u1 nsimd_sleef_cos_u10d_avx512_knl_f32\n#define xsincos_u1 nsimd_sleef_sincos_u10d_avx512_knl_f64\n#define xsincosf_u1 nsimd_sleef_sincos_u10d_avx512_knl_f32\n#define xtan_u1 nsimd_sleef_tan_u10d_avx512_knl_f64\n#define xtanf_u1 nsimd_sleef_tan_u10d_avx512_knl_f32\n#define xasin_u1 nsimd_sleef_asin_u10d_avx512_knl_f64\n#define xasinf_u1 nsimd_sleef_asin_u10d_avx512_knl_f32\n#define xacos_u1 
nsimd_sleef_acos_u10d_avx512_knl_f64\n#define xacosf_u1 nsimd_sleef_acos_u10d_avx512_knl_f32\n#define xatan_u1 nsimd_sleef_atan_u10d_avx512_knl_f64\n#define xatanf_u1 nsimd_sleef_atan_u10d_avx512_knl_f32\n#define xatan2_u1 nsimd_sleef_atan2_u10d_avx512_knl_f64\n#define xatan2f_u1 nsimd_sleef_atan2_u10d_avx512_knl_f32\n#define xlog_u1 nsimd_sleef_log_u10d_avx512_knl_f64\n#define xlogf_u1 nsimd_sleef_log_u10d_avx512_knl_f32\n#define xcbrt_u1 nsimd_sleef_cbrt_u10d_avx512_knl_f64\n#define xcbrtf_u1 nsimd_sleef_cbrt_u10d_avx512_knl_f32\n#define xexp nsimd_sleef_exp_u10d_avx512_knl_f64\n#define xexpf nsimd_sleef_exp_u10d_avx512_knl_f32\n#define xpow nsimd_sleef_pow_u10d_avx512_knl_f64\n#define xpowf nsimd_sleef_pow_u10d_avx512_knl_f32\n#define xsinh nsimd_sleef_sinh_u10d_avx512_knl_f64\n#define xsinhf nsimd_sleef_sinh_u10d_avx512_knl_f32\n#define xcosh nsimd_sleef_cosh_u10d_avx512_knl_f64\n#define xcoshf nsimd_sleef_cosh_u10d_avx512_knl_f32\n#define xtanh nsimd_sleef_tanh_u10d_avx512_knl_f64\n#define xtanhf nsimd_sleef_tanh_u10d_avx512_knl_f32\n#define xsinh_u35 nsimd_sleef_sinh_u35d_avx512_knl_f64\n#define xsinhf_u35 nsimd_sleef_sinh_u35d_avx512_knl_f32\n#define xcosh_u35 nsimd_sleef_cosh_u35d_avx512_knl_f64\n#define xcoshf_u35 nsimd_sleef_cosh_u35d_avx512_knl_f32\n#define xtanh_u35 nsimd_sleef_tanh_u35d_avx512_knl_f64\n#define xtanhf_u35 nsimd_sleef_tanh_u35d_avx512_knl_f32\n#define xfastsin_u3500 nsimd_sleef_fastsin_u3500d_avx512_knl_f64\n#define xfastsinf_u3500 nsimd_sleef_fastsin_u3500d_avx512_knl_f32\n#define xfastcos_u3500 nsimd_sleef_fastcos_u3500d_avx512_knl_f64\n#define xfastcosf_u3500 nsimd_sleef_fastcos_u3500d_avx512_knl_f32\n#define xfastpow_u3500 nsimd_sleef_fastpow_u3500d_avx512_knl_f64\n#define xfastpowf_u3500 nsimd_sleef_fastpow_u3500d_avx512_knl_f32\n#define xasinh nsimd_sleef_asinh_u10d_avx512_knl_f64\n#define xasinhf nsimd_sleef_asinh_u10d_avx512_knl_f32\n#define xacosh nsimd_sleef_acosh_u10d_avx512_knl_f64\n#define xacoshf 
nsimd_sleef_acosh_u10d_avx512_knl_f32\n#define xatanh nsimd_sleef_atanh_u10d_avx512_knl_f64\n#define xatanhf nsimd_sleef_atanh_u10d_avx512_knl_f32\n#define xexp2 nsimd_sleef_exp2_u10d_avx512_knl_f64\n#define xexp2f nsimd_sleef_exp2_u10d_avx512_knl_f32\n#define xexp2_u35 nsimd_sleef_exp2_u35d_avx512_knl_f64\n#define xexp2f_u35 nsimd_sleef_exp2_u35d_avx512_knl_f32\n#define xexp10 nsimd_sleef_exp10_u10d_avx512_knl_f64\n#define xexp10f nsimd_sleef_exp10_u10d_avx512_knl_f32\n#define xexp10_u35 nsimd_sleef_exp10_u35d_avx512_knl_f64\n#define xexp10f_u35 nsimd_sleef_exp10_u35d_avx512_knl_f32\n#define xexpm1 nsimd_sleef_expm1_u10d_avx512_knl_f64\n#define xexpm1f nsimd_sleef_expm1_u10d_avx512_knl_f32\n#define xlog10 nsimd_sleef_log10_u10d_avx512_knl_f64\n#define xlog10f nsimd_sleef_log10_u10d_avx512_knl_f32\n#define xlog2 nsimd_sleef_log2_u10d_avx512_knl_f64\n#define xlog2f nsimd_sleef_log2_u10d_avx512_knl_f32\n#define xlog2_u35 nsimd_sleef_log2_u35d_avx512_knl_f64\n#define xlog2f_u35 nsimd_sleef_log2_u35d_avx512_knl_f32\n#define xlog1p nsimd_sleef_log1p_u10d_avx512_knl_f64\n#define xlog1pf nsimd_sleef_log1p_u10d_avx512_knl_f32\n#define xsincospi_u05 nsimd_sleef_sincospi_u05d_avx512_knl_f64\n#define xsincospif_u05 nsimd_sleef_sincospi_u05d_avx512_knl_f32\n#define xsincospi_u35 nsimd_sleef_sincospi_u35d_avx512_knl_f64\n#define xsincospif_u35 nsimd_sleef_sincospi_u35d_avx512_knl_f32\n#define xsinpi_u05 nsimd_sleef_sinpi_u05d_avx512_knl_f64\n#define xsinpif_u05 nsimd_sleef_sinpi_u05d_avx512_knl_f32\n#define xcospi_u05 nsimd_sleef_cospi_u05d_avx512_knl_f64\n#define xcospif_u05 nsimd_sleef_cospi_u05d_avx512_knl_f32\n#define xldexp nsimd_sleef_ldexp_avx512_knl_f64\n#define xldexpf nsimd_sleef_ldexp_avx512_knl_f32\n#define xilogb nsimd_sleef_ilogb_avx512_knl_f64\n#define xilogbf nsimd_sleef_ilogb_avx512_knl_f32\n#define xfma nsimd_sleef_fma_avx512_knl_f64\n#define xfmaf nsimd_sleef_fma_avx512_knl_f32\n#define xsqrt nsimd_sleef_sqrt_avx512_knl_f64\n#define xsqrtf 
nsimd_sleef_sqrt_avx512_knl_f32\n#define xsqrt_u05 nsimd_sleef_sqrt_u05d_avx512_knl_f64\n#define xsqrtf_u05 nsimd_sleef_sqrt_u05d_avx512_knl_f32\n#define xsqrt_u35 nsimd_sleef_sqrt_u35d_avx512_knl_f64\n#define xsqrtf_u35 nsimd_sleef_sqrt_u35d_avx512_knl_f32\n#define xhypot_u05 nsimd_sleef_hypot_u05d_avx512_knl_f64\n#define xhypotf_u05 nsimd_sleef_hypot_u05d_avx512_knl_f32\n#define xhypot_u35 nsimd_sleef_hypot_u35d_avx512_knl_f64\n#define xhypotf_u35 nsimd_sleef_hypot_u35d_avx512_knl_f32\n#define xfabs nsimd_sleef_fabs_avx512_knl_f64\n#define xfabsf nsimd_sleef_fabs_avx512_knl_f32\n#define xcopysign nsimd_sleef_copysign_avx512_knl_f64\n#define xcopysignf nsimd_sleef_copysign_avx512_knl_f32\n#define xfmax nsimd_sleef_fmax_avx512_knl_f64\n#define xfmaxf nsimd_sleef_fmax_avx512_knl_f32\n#define xfmin nsimd_sleef_fmin_avx512_knl_f64\n#define xfminf nsimd_sleef_fmin_avx512_knl_f32\n#define xfdim nsimd_sleef_fdim_avx512_knl_f64\n#define xfdimf nsimd_sleef_fdim_avx512_knl_f32\n#define xtrunc nsimd_sleef_trunc_avx512_knl_f64\n#define xtruncf nsimd_sleef_trunc_avx512_knl_f32\n#define xfloor nsimd_sleef_floor_avx512_knl_f64\n#define xfloorf nsimd_sleef_floor_avx512_knl_f32\n#define xceil nsimd_sleef_ceil_avx512_knl_f64\n#define xceilf nsimd_sleef_ceil_avx512_knl_f32\n#define xround nsimd_sleef_round_avx512_knl_f64\n#define xroundf nsimd_sleef_round_avx512_knl_f32\n#define xrint nsimd_sleef_rint_avx512_knl_f64\n#define xrintf nsimd_sleef_rint_avx512_knl_f32\n#define xnextafter nsimd_sleef_nextafter_avx512_knl_f64\n#define xnextafterf nsimd_sleef_nextafter_avx512_knl_f32\n#define xfrfrexp nsimd_sleef_frfrexp_avx512_knl_f64\n#define xfrfrexpf nsimd_sleef_frfrexp_avx512_knl_f32\n#define xexpfrexp nsimd_sleef_expfrexp_avx512_knl_f64\n#define xexpfrexpf nsimd_sleef_expfrexp_avx512_knl_f32\n#define xfmod nsimd_sleef_fmod_avx512_knl_f64\n#define xfmodf nsimd_sleef_fmod_avx512_knl_f32\n#define xremainder nsimd_sleef_remainder_avx512_knl_f64\n#define xremainderf 
nsimd_sleef_remainder_avx512_knl_f32\n#define xmodf nsimd_sleef_modf_avx512_knl_f64\n#define xmodff nsimd_sleef_modf_avx512_knl_f32\n#define xlgamma_u1 nsimd_sleef_lgamma_u10d_avx512_knl_f64\n#define xlgammaf_u1 nsimd_sleef_lgamma_u10d_avx512_knl_f32\n#define xtgamma_u1 nsimd_sleef_tgamma_u10d_avx512_knl_f64\n#define xtgammaf_u1 nsimd_sleef_tgamma_u10d_avx512_knl_f32\n#define xerf_u1 nsimd_sleef_erf_u10d_avx512_knl_f64\n#define xerff_u1 nsimd_sleef_erf_u10d_avx512_knl_f32\n#define xerfc_u15 nsimd_sleef_erfc_u15d_avx512_knl_f64\n#define xerfcf_u15 nsimd_sleef_erfc_u15d_avx512_knl_f32\n#define xgetInt nsimd_sleef_getInt_avx512_knl_f64\n#define xgetIntf nsimd_sleef_getInt_avx512_knl_f32\n#define xgetPtr nsimd_sleef_getPtr_avx512_knl_f64\n#define xgetPtrf nsimd_sleef_getPtr_avx512_knl_f32\n\n                   #else\n\n                   #define xsin nsimd_sleef_sin_u35_avx512_knl_f64\n#define xsinf nsimd_sleef_sin_u35_avx512_knl_f32\n#define xcos nsimd_sleef_cos_u35_avx512_knl_f64\n#define xcosf nsimd_sleef_cos_u35_avx512_knl_f32\n#define xsincos nsimd_sleef_sincos_u35_avx512_knl_f64\n#define xsincosf nsimd_sleef_sincos_u35_avx512_knl_f32\n#define xtan nsimd_sleef_tan_u35_avx512_knl_f64\n#define xtanf nsimd_sleef_tan_u35_avx512_knl_f32\n#define xasin nsimd_sleef_asin_u35_avx512_knl_f64\n#define xasinf nsimd_sleef_asin_u35_avx512_knl_f32\n#define xacos nsimd_sleef_acos_u35_avx512_knl_f64\n#define xacosf nsimd_sleef_acos_u35_avx512_knl_f32\n#define xatan nsimd_sleef_atan_u35_avx512_knl_f64\n#define xatanf nsimd_sleef_atan_u35_avx512_knl_f32\n#define xatan2 nsimd_sleef_atan2_u35_avx512_knl_f64\n#define xatan2f nsimd_sleef_atan2_u35_avx512_knl_f32\n#define xlog nsimd_sleef_log_u35_avx512_knl_f64\n#define xlogf nsimd_sleef_log_u35_avx512_knl_f32\n#define xcbrt nsimd_sleef_cbrt_u35_avx512_knl_f64\n#define xcbrtf nsimd_sleef_cbrt_u35_avx512_knl_f32\n#define xsin_u1 nsimd_sleef_sin_u10_avx512_knl_f64\n#define xsinf_u1 nsimd_sleef_sin_u10_avx512_knl_f32\n#define xcos_u1 
nsimd_sleef_cos_u10_avx512_knl_f64\n#define xcosf_u1 nsimd_sleef_cos_u10_avx512_knl_f32\n#define xsincos_u1 nsimd_sleef_sincos_u10_avx512_knl_f64\n#define xsincosf_u1 nsimd_sleef_sincos_u10_avx512_knl_f32\n#define xtan_u1 nsimd_sleef_tan_u10_avx512_knl_f64\n#define xtanf_u1 nsimd_sleef_tan_u10_avx512_knl_f32\n#define xasin_u1 nsimd_sleef_asin_u10_avx512_knl_f64\n#define xasinf_u1 nsimd_sleef_asin_u10_avx512_knl_f32\n#define xacos_u1 nsimd_sleef_acos_u10_avx512_knl_f64\n#define xacosf_u1 nsimd_sleef_acos_u10_avx512_knl_f32\n#define xatan_u1 nsimd_sleef_atan_u10_avx512_knl_f64\n#define xatanf_u1 nsimd_sleef_atan_u10_avx512_knl_f32\n#define xatan2_u1 nsimd_sleef_atan2_u10_avx512_knl_f64\n#define xatan2f_u1 nsimd_sleef_atan2_u10_avx512_knl_f32\n#define xlog_u1 nsimd_sleef_log_u10_avx512_knl_f64\n#define xlogf_u1 nsimd_sleef_log_u10_avx512_knl_f32\n#define xcbrt_u1 nsimd_sleef_cbrt_u10_avx512_knl_f64\n#define xcbrtf_u1 nsimd_sleef_cbrt_u10_avx512_knl_f32\n#define xexp nsimd_sleef_exp_u10_avx512_knl_f64\n#define xexpf nsimd_sleef_exp_u10_avx512_knl_f32\n#define xpow nsimd_sleef_pow_u10_avx512_knl_f64\n#define xpowf nsimd_sleef_pow_u10_avx512_knl_f32\n#define xsinh nsimd_sleef_sinh_u10_avx512_knl_f64\n#define xsinhf nsimd_sleef_sinh_u10_avx512_knl_f32\n#define xcosh nsimd_sleef_cosh_u10_avx512_knl_f64\n#define xcoshf nsimd_sleef_cosh_u10_avx512_knl_f32\n#define xtanh nsimd_sleef_tanh_u10_avx512_knl_f64\n#define xtanhf nsimd_sleef_tanh_u10_avx512_knl_f32\n#define xsinh_u35 nsimd_sleef_sinh_u35_avx512_knl_f64\n#define xsinhf_u35 nsimd_sleef_sinh_u35_avx512_knl_f32\n#define xcosh_u35 nsimd_sleef_cosh_u35_avx512_knl_f64\n#define xcoshf_u35 nsimd_sleef_cosh_u35_avx512_knl_f32\n#define xtanh_u35 nsimd_sleef_tanh_u35_avx512_knl_f64\n#define xtanhf_u35 nsimd_sleef_tanh_u35_avx512_knl_f32\n#define xfastsin_u3500 nsimd_sleef_fastsin_u3500_avx512_knl_f64\n#define xfastsinf_u3500 nsimd_sleef_fastsin_u3500_avx512_knl_f32\n#define xfastcos_u3500 
nsimd_sleef_fastcos_u3500_avx512_knl_f64\n#define xfastcosf_u3500 nsimd_sleef_fastcos_u3500_avx512_knl_f32\n#define xfastpow_u3500 nsimd_sleef_fastpow_u3500_avx512_knl_f64\n#define xfastpowf_u3500 nsimd_sleef_fastpow_u3500_avx512_knl_f32\n#define xasinh nsimd_sleef_asinh_u10_avx512_knl_f64\n#define xasinhf nsimd_sleef_asinh_u10_avx512_knl_f32\n#define xacosh nsimd_sleef_acosh_u10_avx512_knl_f64\n#define xacoshf nsimd_sleef_acosh_u10_avx512_knl_f32\n#define xatanh nsimd_sleef_atanh_u10_avx512_knl_f64\n#define xatanhf nsimd_sleef_atanh_u10_avx512_knl_f32\n#define xexp2 nsimd_sleef_exp2_u10_avx512_knl_f64\n#define xexp2f nsimd_sleef_exp2_u10_avx512_knl_f32\n#define xexp2_u35 nsimd_sleef_exp2_u35_avx512_knl_f64\n#define xexp2f_u35 nsimd_sleef_exp2_u35_avx512_knl_f32\n#define xexp10 nsimd_sleef_exp10_u10_avx512_knl_f64\n#define xexp10f nsimd_sleef_exp10_u10_avx512_knl_f32\n#define xexp10_u35 nsimd_sleef_exp10_u35_avx512_knl_f64\n#define xexp10f_u35 nsimd_sleef_exp10_u35_avx512_knl_f32\n#define xexpm1 nsimd_sleef_expm1_u10_avx512_knl_f64\n#define xexpm1f nsimd_sleef_expm1_u10_avx512_knl_f32\n#define xlog10 nsimd_sleef_log10_u10_avx512_knl_f64\n#define xlog10f nsimd_sleef_log10_u10_avx512_knl_f32\n#define xlog2 nsimd_sleef_log2_u10_avx512_knl_f64\n#define xlog2f nsimd_sleef_log2_u10_avx512_knl_f32\n#define xlog2_u35 nsimd_sleef_log2_u35_avx512_knl_f64\n#define xlog2f_u35 nsimd_sleef_log2_u35_avx512_knl_f32\n#define xlog1p nsimd_sleef_log1p_u10_avx512_knl_f64\n#define xlog1pf nsimd_sleef_log1p_u10_avx512_knl_f32\n#define xsincospi_u05 nsimd_sleef_sincospi_u05_avx512_knl_f64\n#define xsincospif_u05 nsimd_sleef_sincospi_u05_avx512_knl_f32\n#define xsincospi_u35 nsimd_sleef_sincospi_u35_avx512_knl_f64\n#define xsincospif_u35 nsimd_sleef_sincospi_u35_avx512_knl_f32\n#define xsinpi_u05 nsimd_sleef_sinpi_u05_avx512_knl_f64\n#define xsinpif_u05 nsimd_sleef_sinpi_u05_avx512_knl_f32\n#define xcospi_u05 nsimd_sleef_cospi_u05_avx512_knl_f64\n#define xcospif_u05 
nsimd_sleef_cospi_u05_avx512_knl_f32\n#define xldexp nsimd_sleef_ldexp_avx512_knl_f64\n#define xldexpf nsimd_sleef_ldexp_avx512_knl_f32\n#define xilogb nsimd_sleef_ilogb_avx512_knl_f64\n#define xilogbf nsimd_sleef_ilogb_avx512_knl_f32\n#define xfma nsimd_sleef_fma_avx512_knl_f64\n#define xfmaf nsimd_sleef_fma_avx512_knl_f32\n#define xsqrt nsimd_sleef_sqrt_avx512_knl_f64\n#define xsqrtf nsimd_sleef_sqrt_avx512_knl_f32\n#define xsqrt_u05 nsimd_sleef_sqrt_u05_avx512_knl_f64\n#define xsqrtf_u05 nsimd_sleef_sqrt_u05_avx512_knl_f32\n#define xsqrt_u35 nsimd_sleef_sqrt_u35_avx512_knl_f64\n#define xsqrtf_u35 nsimd_sleef_sqrt_u35_avx512_knl_f32\n#define xhypot_u05 nsimd_sleef_hypot_u05_avx512_knl_f64\n#define xhypotf_u05 nsimd_sleef_hypot_u05_avx512_knl_f32\n#define xhypot_u35 nsimd_sleef_hypot_u35_avx512_knl_f64\n#define xhypotf_u35 nsimd_sleef_hypot_u35_avx512_knl_f32\n#define xfabs nsimd_sleef_fabs_avx512_knl_f64\n#define xfabsf nsimd_sleef_fabs_avx512_knl_f32\n#define xcopysign nsimd_sleef_copysign_avx512_knl_f64\n#define xcopysignf nsimd_sleef_copysign_avx512_knl_f32\n#define xfmax nsimd_sleef_fmax_avx512_knl_f64\n#define xfmaxf nsimd_sleef_fmax_avx512_knl_f32\n#define xfmin nsimd_sleef_fmin_avx512_knl_f64\n#define xfminf nsimd_sleef_fmin_avx512_knl_f32\n#define xfdim nsimd_sleef_fdim_avx512_knl_f64\n#define xfdimf nsimd_sleef_fdim_avx512_knl_f32\n#define xtrunc nsimd_sleef_trunc_avx512_knl_f64\n#define xtruncf nsimd_sleef_trunc_avx512_knl_f32\n#define xfloor nsimd_sleef_floor_avx512_knl_f64\n#define xfloorf nsimd_sleef_floor_avx512_knl_f32\n#define xceil nsimd_sleef_ceil_avx512_knl_f64\n#define xceilf nsimd_sleef_ceil_avx512_knl_f32\n#define xround nsimd_sleef_round_avx512_knl_f64\n#define xroundf nsimd_sleef_round_avx512_knl_f32\n#define xrint nsimd_sleef_rint_avx512_knl_f64\n#define xrintf nsimd_sleef_rint_avx512_knl_f32\n#define xnextafter nsimd_sleef_nextafter_avx512_knl_f64\n#define xnextafterf nsimd_sleef_nextafter_avx512_knl_f32\n#define xfrfrexp 
nsimd_sleef_frfrexp_avx512_knl_f64\n#define xfrfrexpf nsimd_sleef_frfrexp_avx512_knl_f32\n#define xexpfrexp nsimd_sleef_expfrexp_avx512_knl_f64\n#define xexpfrexpf nsimd_sleef_expfrexp_avx512_knl_f32\n#define xfmod nsimd_sleef_fmod_avx512_knl_f64\n#define xfmodf nsimd_sleef_fmod_avx512_knl_f32\n#define xremainder nsimd_sleef_remainder_avx512_knl_f64\n#define xremainderf nsimd_sleef_remainder_avx512_knl_f32\n#define xmodf nsimd_sleef_modf_avx512_knl_f64\n#define xmodff nsimd_sleef_modf_avx512_knl_f32\n#define xlgamma_u1 nsimd_sleef_lgamma_u10_avx512_knl_f64\n#define xlgammaf_u1 nsimd_sleef_lgamma_u10_avx512_knl_f32\n#define xtgamma_u1 nsimd_sleef_tgamma_u10_avx512_knl_f64\n#define xtgammaf_u1 nsimd_sleef_tgamma_u10_avx512_knl_f32\n#define xerf_u1 nsimd_sleef_erf_u10_avx512_knl_f64\n#define xerff_u1 nsimd_sleef_erf_u10_avx512_knl_f32\n#define xerfc_u15 nsimd_sleef_erfc_u15_avx512_knl_f64\n#define xerfcf_u15 nsimd_sleef_erfc_u15_avx512_knl_f32\n#define xgetInt nsimd_sleef_getInt_avx512_knl_f64\n#define xgetIntf nsimd_sleef_getInt_avx512_knl_f32\n#define xgetPtr nsimd_sleef_getPtr_avx512_knl_f64\n#define xgetPtrf nsimd_sleef_getPtr_avx512_knl_f32\n\n                   #endif\n\n                   #define rempi nsimd_sleef_rempi_avx512_knl\n                   #define rempif nsimd_sleef_rempif_avx512_knl\n                   #define rempisub nsimd_sleef_rempisub_avx512_knl\n                   #define rempisubf nsimd_sleef_rempisubf_avx512_knl\n                   #define gammak nsimd_gammak_avx512_knl\n                   #define gammafk nsimd_gammafk_avx512_knl\n\n                   #endif\n\n                   /* ------------------------------------------------------------------------- */\n                   /* Naming of functions avx512_skylake */\n\n                   #ifdef NSIMD_AVX512_SKYLAKE\n\n                   #ifdef DETERMINISTIC\n\n                   #define xsin nsimd_sleef_sin_u35d_avx512_skylake_f64\n#define xsinf 
nsimd_sleef_sin_u35d_avx512_skylake_f32\n#define xcos nsimd_sleef_cos_u35d_avx512_skylake_f64\n#define xcosf nsimd_sleef_cos_u35d_avx512_skylake_f32\n#define xsincos nsimd_sleef_sincos_u35d_avx512_skylake_f64\n#define xsincosf nsimd_sleef_sincos_u35d_avx512_skylake_f32\n#define xtan nsimd_sleef_tan_u35d_avx512_skylake_f64\n#define xtanf nsimd_sleef_tan_u35d_avx512_skylake_f32\n#define xasin nsimd_sleef_asin_u35d_avx512_skylake_f64\n#define xasinf nsimd_sleef_asin_u35d_avx512_skylake_f32\n#define xacos nsimd_sleef_acos_u35d_avx512_skylake_f64\n#define xacosf nsimd_sleef_acos_u35d_avx512_skylake_f32\n#define xatan nsimd_sleef_atan_u35d_avx512_skylake_f64\n#define xatanf nsimd_sleef_atan_u35d_avx512_skylake_f32\n#define xatan2 nsimd_sleef_atan2_u35d_avx512_skylake_f64\n#define xatan2f nsimd_sleef_atan2_u35d_avx512_skylake_f32\n#define xlog nsimd_sleef_log_u35d_avx512_skylake_f64\n#define xlogf nsimd_sleef_log_u35d_avx512_skylake_f32\n#define xcbrt nsimd_sleef_cbrt_u35d_avx512_skylake_f64\n#define xcbrtf nsimd_sleef_cbrt_u35d_avx512_skylake_f32\n#define xsin_u1 nsimd_sleef_sin_u10d_avx512_skylake_f64\n#define xsinf_u1 nsimd_sleef_sin_u10d_avx512_skylake_f32\n#define xcos_u1 nsimd_sleef_cos_u10d_avx512_skylake_f64\n#define xcosf_u1 nsimd_sleef_cos_u10d_avx512_skylake_f32\n#define xsincos_u1 nsimd_sleef_sincos_u10d_avx512_skylake_f64\n#define xsincosf_u1 nsimd_sleef_sincos_u10d_avx512_skylake_f32\n#define xtan_u1 nsimd_sleef_tan_u10d_avx512_skylake_f64\n#define xtanf_u1 nsimd_sleef_tan_u10d_avx512_skylake_f32\n#define xasin_u1 nsimd_sleef_asin_u10d_avx512_skylake_f64\n#define xasinf_u1 nsimd_sleef_asin_u10d_avx512_skylake_f32\n#define xacos_u1 nsimd_sleef_acos_u10d_avx512_skylake_f64\n#define xacosf_u1 nsimd_sleef_acos_u10d_avx512_skylake_f32\n#define xatan_u1 nsimd_sleef_atan_u10d_avx512_skylake_f64\n#define xatanf_u1 nsimd_sleef_atan_u10d_avx512_skylake_f32\n#define xatan2_u1 nsimd_sleef_atan2_u10d_avx512_skylake_f64\n#define xatan2f_u1 
nsimd_sleef_atan2_u10d_avx512_skylake_f32\n#define xlog_u1 nsimd_sleef_log_u10d_avx512_skylake_f64\n#define xlogf_u1 nsimd_sleef_log_u10d_avx512_skylake_f32\n#define xcbrt_u1 nsimd_sleef_cbrt_u10d_avx512_skylake_f64\n#define xcbrtf_u1 nsimd_sleef_cbrt_u10d_avx512_skylake_f32\n#define xexp nsimd_sleef_exp_u10d_avx512_skylake_f64\n#define xexpf nsimd_sleef_exp_u10d_avx512_skylake_f32\n#define xpow nsimd_sleef_pow_u10d_avx512_skylake_f64\n#define xpowf nsimd_sleef_pow_u10d_avx512_skylake_f32\n#define xsinh nsimd_sleef_sinh_u10d_avx512_skylake_f64\n#define xsinhf nsimd_sleef_sinh_u10d_avx512_skylake_f32\n#define xcosh nsimd_sleef_cosh_u10d_avx512_skylake_f64\n#define xcoshf nsimd_sleef_cosh_u10d_avx512_skylake_f32\n#define xtanh nsimd_sleef_tanh_u10d_avx512_skylake_f64\n#define xtanhf nsimd_sleef_tanh_u10d_avx512_skylake_f32\n#define xsinh_u35 nsimd_sleef_sinh_u35d_avx512_skylake_f64\n#define xsinhf_u35 nsimd_sleef_sinh_u35d_avx512_skylake_f32\n#define xcosh_u35 nsimd_sleef_cosh_u35d_avx512_skylake_f64\n#define xcoshf_u35 nsimd_sleef_cosh_u35d_avx512_skylake_f32\n#define xtanh_u35 nsimd_sleef_tanh_u35d_avx512_skylake_f64\n#define xtanhf_u35 nsimd_sleef_tanh_u35d_avx512_skylake_f32\n#define xfastsin_u3500 nsimd_sleef_fastsin_u3500d_avx512_skylake_f64\n#define xfastsinf_u3500 nsimd_sleef_fastsin_u3500d_avx512_skylake_f32\n#define xfastcos_u3500 nsimd_sleef_fastcos_u3500d_avx512_skylake_f64\n#define xfastcosf_u3500 nsimd_sleef_fastcos_u3500d_avx512_skylake_f32\n#define xfastpow_u3500 nsimd_sleef_fastpow_u3500d_avx512_skylake_f64\n#define xfastpowf_u3500 nsimd_sleef_fastpow_u3500d_avx512_skylake_f32\n#define xasinh nsimd_sleef_asinh_u10d_avx512_skylake_f64\n#define xasinhf nsimd_sleef_asinh_u10d_avx512_skylake_f32\n#define xacosh nsimd_sleef_acosh_u10d_avx512_skylake_f64\n#define xacoshf nsimd_sleef_acosh_u10d_avx512_skylake_f32\n#define xatanh nsimd_sleef_atanh_u10d_avx512_skylake_f64\n#define xatanhf nsimd_sleef_atanh_u10d_avx512_skylake_f32\n#define xexp2 
nsimd_sleef_exp2_u10d_avx512_skylake_f64\n#define xexp2f nsimd_sleef_exp2_u10d_avx512_skylake_f32\n#define xexp2_u35 nsimd_sleef_exp2_u35d_avx512_skylake_f64\n#define xexp2f_u35 nsimd_sleef_exp2_u35d_avx512_skylake_f32\n#define xexp10 nsimd_sleef_exp10_u10d_avx512_skylake_f64\n#define xexp10f nsimd_sleef_exp10_u10d_avx512_skylake_f32\n#define xexp10_u35 nsimd_sleef_exp10_u35d_avx512_skylake_f64\n#define xexp10f_u35 nsimd_sleef_exp10_u35d_avx512_skylake_f32\n#define xexpm1 nsimd_sleef_expm1_u10d_avx512_skylake_f64\n#define xexpm1f nsimd_sleef_expm1_u10d_avx512_skylake_f32\n#define xlog10 nsimd_sleef_log10_u10d_avx512_skylake_f64\n#define xlog10f nsimd_sleef_log10_u10d_avx512_skylake_f32\n#define xlog2 nsimd_sleef_log2_u10d_avx512_skylake_f64\n#define xlog2f nsimd_sleef_log2_u10d_avx512_skylake_f32\n#define xlog2_u35 nsimd_sleef_log2_u35d_avx512_skylake_f64\n#define xlog2f_u35 nsimd_sleef_log2_u35d_avx512_skylake_f32\n#define xlog1p nsimd_sleef_log1p_u10d_avx512_skylake_f64\n#define xlog1pf nsimd_sleef_log1p_u10d_avx512_skylake_f32\n#define xsincospi_u05 nsimd_sleef_sincospi_u05d_avx512_skylake_f64\n#define xsincospif_u05 nsimd_sleef_sincospi_u05d_avx512_skylake_f32\n#define xsincospi_u35 nsimd_sleef_sincospi_u35d_avx512_skylake_f64\n#define xsincospif_u35 nsimd_sleef_sincospi_u35d_avx512_skylake_f32\n#define xsinpi_u05 nsimd_sleef_sinpi_u05d_avx512_skylake_f64\n#define xsinpif_u05 nsimd_sleef_sinpi_u05d_avx512_skylake_f32\n#define xcospi_u05 nsimd_sleef_cospi_u05d_avx512_skylake_f64\n#define xcospif_u05 nsimd_sleef_cospi_u05d_avx512_skylake_f32\n#define xldexp nsimd_sleef_ldexp_avx512_skylake_f64\n#define xldexpf nsimd_sleef_ldexp_avx512_skylake_f32\n#define xilogb nsimd_sleef_ilogb_avx512_skylake_f64\n#define xilogbf nsimd_sleef_ilogb_avx512_skylake_f32\n#define xfma nsimd_sleef_fma_avx512_skylake_f64\n#define xfmaf nsimd_sleef_fma_avx512_skylake_f32\n#define xsqrt nsimd_sleef_sqrt_avx512_skylake_f64\n#define xsqrtf nsimd_sleef_sqrt_avx512_skylake_f32\n#define 
xsqrt_u05 nsimd_sleef_sqrt_u05d_avx512_skylake_f64\n#define xsqrtf_u05 nsimd_sleef_sqrt_u05d_avx512_skylake_f32\n#define xsqrt_u35 nsimd_sleef_sqrt_u35d_avx512_skylake_f64\n#define xsqrtf_u35 nsimd_sleef_sqrt_u35d_avx512_skylake_f32\n#define xhypot_u05 nsimd_sleef_hypot_u05d_avx512_skylake_f64\n#define xhypotf_u05 nsimd_sleef_hypot_u05d_avx512_skylake_f32\n#define xhypot_u35 nsimd_sleef_hypot_u35d_avx512_skylake_f64\n#define xhypotf_u35 nsimd_sleef_hypot_u35d_avx512_skylake_f32\n#define xfabs nsimd_sleef_fabs_avx512_skylake_f64\n#define xfabsf nsimd_sleef_fabs_avx512_skylake_f32\n#define xcopysign nsimd_sleef_copysign_avx512_skylake_f64\n#define xcopysignf nsimd_sleef_copysign_avx512_skylake_f32\n#define xfmax nsimd_sleef_fmax_avx512_skylake_f64\n#define xfmaxf nsimd_sleef_fmax_avx512_skylake_f32\n#define xfmin nsimd_sleef_fmin_avx512_skylake_f64\n#define xfminf nsimd_sleef_fmin_avx512_skylake_f32\n#define xfdim nsimd_sleef_fdim_avx512_skylake_f64\n#define xfdimf nsimd_sleef_fdim_avx512_skylake_f32\n#define xtrunc nsimd_sleef_trunc_avx512_skylake_f64\n#define xtruncf nsimd_sleef_trunc_avx512_skylake_f32\n#define xfloor nsimd_sleef_floor_avx512_skylake_f64\n#define xfloorf nsimd_sleef_floor_avx512_skylake_f32\n#define xceil nsimd_sleef_ceil_avx512_skylake_f64\n#define xceilf nsimd_sleef_ceil_avx512_skylake_f32\n#define xround nsimd_sleef_round_avx512_skylake_f64\n#define xroundf nsimd_sleef_round_avx512_skylake_f32\n#define xrint nsimd_sleef_rint_avx512_skylake_f64\n#define xrintf nsimd_sleef_rint_avx512_skylake_f32\n#define xnextafter nsimd_sleef_nextafter_avx512_skylake_f64\n#define xnextafterf nsimd_sleef_nextafter_avx512_skylake_f32\n#define xfrfrexp nsimd_sleef_frfrexp_avx512_skylake_f64\n#define xfrfrexpf nsimd_sleef_frfrexp_avx512_skylake_f32\n#define xexpfrexp nsimd_sleef_expfrexp_avx512_skylake_f64\n#define xexpfrexpf nsimd_sleef_expfrexp_avx512_skylake_f32\n#define xfmod nsimd_sleef_fmod_avx512_skylake_f64\n#define xfmodf 
nsimd_sleef_fmod_avx512_skylake_f32\n#define xremainder nsimd_sleef_remainder_avx512_skylake_f64\n#define xremainderf nsimd_sleef_remainder_avx512_skylake_f32\n#define xmodf nsimd_sleef_modf_avx512_skylake_f64\n#define xmodff nsimd_sleef_modf_avx512_skylake_f32\n#define xlgamma_u1 nsimd_sleef_lgamma_u10d_avx512_skylake_f64\n#define xlgammaf_u1 nsimd_sleef_lgamma_u10d_avx512_skylake_f32\n#define xtgamma_u1 nsimd_sleef_tgamma_u10d_avx512_skylake_f64\n#define xtgammaf_u1 nsimd_sleef_tgamma_u10d_avx512_skylake_f32\n#define xerf_u1 nsimd_sleef_erf_u10d_avx512_skylake_f64\n#define xerff_u1 nsimd_sleef_erf_u10d_avx512_skylake_f32\n#define xerfc_u15 nsimd_sleef_erfc_u15d_avx512_skylake_f64\n#define xerfcf_u15 nsimd_sleef_erfc_u15d_avx512_skylake_f32\n#define xgetInt nsimd_sleef_getInt_avx512_skylake_f64\n#define xgetIntf nsimd_sleef_getInt_avx512_skylake_f32\n#define xgetPtr nsimd_sleef_getPtr_avx512_skylake_f64\n#define xgetPtrf nsimd_sleef_getPtr_avx512_skylake_f32\n\n                   #else\n\n                   #define xsin nsimd_sleef_sin_u35_avx512_skylake_f64\n#define xsinf nsimd_sleef_sin_u35_avx512_skylake_f32\n#define xcos nsimd_sleef_cos_u35_avx512_skylake_f64\n#define xcosf nsimd_sleef_cos_u35_avx512_skylake_f32\n#define xsincos nsimd_sleef_sincos_u35_avx512_skylake_f64\n#define xsincosf nsimd_sleef_sincos_u35_avx512_skylake_f32\n#define xtan nsimd_sleef_tan_u35_avx512_skylake_f64\n#define xtanf nsimd_sleef_tan_u35_avx512_skylake_f32\n#define xasin nsimd_sleef_asin_u35_avx512_skylake_f64\n#define xasinf nsimd_sleef_asin_u35_avx512_skylake_f32\n#define xacos nsimd_sleef_acos_u35_avx512_skylake_f64\n#define xacosf nsimd_sleef_acos_u35_avx512_skylake_f32\n#define xatan nsimd_sleef_atan_u35_avx512_skylake_f64\n#define xatanf nsimd_sleef_atan_u35_avx512_skylake_f32\n#define xatan2 nsimd_sleef_atan2_u35_avx512_skylake_f64\n#define xatan2f nsimd_sleef_atan2_u35_avx512_skylake_f32\n#define xlog nsimd_sleef_log_u35_avx512_skylake_f64\n#define xlogf 
nsimd_sleef_log_u35_avx512_skylake_f32\n#define xcbrt nsimd_sleef_cbrt_u35_avx512_skylake_f64\n#define xcbrtf nsimd_sleef_cbrt_u35_avx512_skylake_f32\n#define xsin_u1 nsimd_sleef_sin_u10_avx512_skylake_f64\n#define xsinf_u1 nsimd_sleef_sin_u10_avx512_skylake_f32\n#define xcos_u1 nsimd_sleef_cos_u10_avx512_skylake_f64\n#define xcosf_u1 nsimd_sleef_cos_u10_avx512_skylake_f32\n#define xsincos_u1 nsimd_sleef_sincos_u10_avx512_skylake_f64\n#define xsincosf_u1 nsimd_sleef_sincos_u10_avx512_skylake_f32\n#define xtan_u1 nsimd_sleef_tan_u10_avx512_skylake_f64\n#define xtanf_u1 nsimd_sleef_tan_u10_avx512_skylake_f32\n#define xasin_u1 nsimd_sleef_asin_u10_avx512_skylake_f64\n#define xasinf_u1 nsimd_sleef_asin_u10_avx512_skylake_f32\n#define xacos_u1 nsimd_sleef_acos_u10_avx512_skylake_f64\n#define xacosf_u1 nsimd_sleef_acos_u10_avx512_skylake_f32\n#define xatan_u1 nsimd_sleef_atan_u10_avx512_skylake_f64\n#define xatanf_u1 nsimd_sleef_atan_u10_avx512_skylake_f32\n#define xatan2_u1 nsimd_sleef_atan2_u10_avx512_skylake_f64\n#define xatan2f_u1 nsimd_sleef_atan2_u10_avx512_skylake_f32\n#define xlog_u1 nsimd_sleef_log_u10_avx512_skylake_f64\n#define xlogf_u1 nsimd_sleef_log_u10_avx512_skylake_f32\n#define xcbrt_u1 nsimd_sleef_cbrt_u10_avx512_skylake_f64\n#define xcbrtf_u1 nsimd_sleef_cbrt_u10_avx512_skylake_f32\n#define xexp nsimd_sleef_exp_u10_avx512_skylake_f64\n#define xexpf nsimd_sleef_exp_u10_avx512_skylake_f32\n#define xpow nsimd_sleef_pow_u10_avx512_skylake_f64\n#define xpowf nsimd_sleef_pow_u10_avx512_skylake_f32\n#define xsinh nsimd_sleef_sinh_u10_avx512_skylake_f64\n#define xsinhf nsimd_sleef_sinh_u10_avx512_skylake_f32\n#define xcosh nsimd_sleef_cosh_u10_avx512_skylake_f64\n#define xcoshf nsimd_sleef_cosh_u10_avx512_skylake_f32\n#define xtanh nsimd_sleef_tanh_u10_avx512_skylake_f64\n#define xtanhf nsimd_sleef_tanh_u10_avx512_skylake_f32\n#define xsinh_u35 nsimd_sleef_sinh_u35_avx512_skylake_f64\n#define xsinhf_u35 nsimd_sleef_sinh_u35_avx512_skylake_f32\n#define 
xcosh_u35 nsimd_sleef_cosh_u35_avx512_skylake_f64\n#define xcoshf_u35 nsimd_sleef_cosh_u35_avx512_skylake_f32\n#define xtanh_u35 nsimd_sleef_tanh_u35_avx512_skylake_f64\n#define xtanhf_u35 nsimd_sleef_tanh_u35_avx512_skylake_f32\n#define xfastsin_u3500 nsimd_sleef_fastsin_u3500_avx512_skylake_f64\n#define xfastsinf_u3500 nsimd_sleef_fastsin_u3500_avx512_skylake_f32\n#define xfastcos_u3500 nsimd_sleef_fastcos_u3500_avx512_skylake_f64\n#define xfastcosf_u3500 nsimd_sleef_fastcos_u3500_avx512_skylake_f32\n#define xfastpow_u3500 nsimd_sleef_fastpow_u3500_avx512_skylake_f64\n#define xfastpowf_u3500 nsimd_sleef_fastpow_u3500_avx512_skylake_f32\n#define xasinh nsimd_sleef_asinh_u10_avx512_skylake_f64\n#define xasinhf nsimd_sleef_asinh_u10_avx512_skylake_f32\n#define xacosh nsimd_sleef_acosh_u10_avx512_skylake_f64\n#define xacoshf nsimd_sleef_acosh_u10_avx512_skylake_f32\n#define xatanh nsimd_sleef_atanh_u10_avx512_skylake_f64\n#define xatanhf nsimd_sleef_atanh_u10_avx512_skylake_f32\n#define xexp2 nsimd_sleef_exp2_u10_avx512_skylake_f64\n#define xexp2f nsimd_sleef_exp2_u10_avx512_skylake_f32\n#define xexp2_u35 nsimd_sleef_exp2_u35_avx512_skylake_f64\n#define xexp2f_u35 nsimd_sleef_exp2_u35_avx512_skylake_f32\n#define xexp10 nsimd_sleef_exp10_u10_avx512_skylake_f64\n#define xexp10f nsimd_sleef_exp10_u10_avx512_skylake_f32\n#define xexp10_u35 nsimd_sleef_exp10_u35_avx512_skylake_f64\n#define xexp10f_u35 nsimd_sleef_exp10_u35_avx512_skylake_f32\n#define xexpm1 nsimd_sleef_expm1_u10_avx512_skylake_f64\n#define xexpm1f nsimd_sleef_expm1_u10_avx512_skylake_f32\n#define xlog10 nsimd_sleef_log10_u10_avx512_skylake_f64\n#define xlog10f nsimd_sleef_log10_u10_avx512_skylake_f32\n#define xlog2 nsimd_sleef_log2_u10_avx512_skylake_f64\n#define xlog2f nsimd_sleef_log2_u10_avx512_skylake_f32\n#define xlog2_u35 nsimd_sleef_log2_u35_avx512_skylake_f64\n#define xlog2f_u35 nsimd_sleef_log2_u35_avx512_skylake_f32\n#define xlog1p nsimd_sleef_log1p_u10_avx512_skylake_f64\n#define xlog1pf 
nsimd_sleef_log1p_u10_avx512_skylake_f32\n#define xsincospi_u05 nsimd_sleef_sincospi_u05_avx512_skylake_f64\n#define xsincospif_u05 nsimd_sleef_sincospi_u05_avx512_skylake_f32\n#define xsincospi_u35 nsimd_sleef_sincospi_u35_avx512_skylake_f64\n#define xsincospif_u35 nsimd_sleef_sincospi_u35_avx512_skylake_f32\n#define xsinpi_u05 nsimd_sleef_sinpi_u05_avx512_skylake_f64\n#define xsinpif_u05 nsimd_sleef_sinpi_u05_avx512_skylake_f32\n#define xcospi_u05 nsimd_sleef_cospi_u05_avx512_skylake_f64\n#define xcospif_u05 nsimd_sleef_cospi_u05_avx512_skylake_f32\n#define xldexp nsimd_sleef_ldexp_avx512_skylake_f64\n#define xldexpf nsimd_sleef_ldexp_avx512_skylake_f32\n#define xilogb nsimd_sleef_ilogb_avx512_skylake_f64\n#define xilogbf nsimd_sleef_ilogb_avx512_skylake_f32\n#define xfma nsimd_sleef_fma_avx512_skylake_f64\n#define xfmaf nsimd_sleef_fma_avx512_skylake_f32\n#define xsqrt nsimd_sleef_sqrt_avx512_skylake_f64\n#define xsqrtf nsimd_sleef_sqrt_avx512_skylake_f32\n#define xsqrt_u05 nsimd_sleef_sqrt_u05_avx512_skylake_f64\n#define xsqrtf_u05 nsimd_sleef_sqrt_u05_avx512_skylake_f32\n#define xsqrt_u35 nsimd_sleef_sqrt_u35_avx512_skylake_f64\n#define xsqrtf_u35 nsimd_sleef_sqrt_u35_avx512_skylake_f32\n#define xhypot_u05 nsimd_sleef_hypot_u05_avx512_skylake_f64\n#define xhypotf_u05 nsimd_sleef_hypot_u05_avx512_skylake_f32\n#define xhypot_u35 nsimd_sleef_hypot_u35_avx512_skylake_f64\n#define xhypotf_u35 nsimd_sleef_hypot_u35_avx512_skylake_f32\n#define xfabs nsimd_sleef_fabs_avx512_skylake_f64\n#define xfabsf nsimd_sleef_fabs_avx512_skylake_f32\n#define xcopysign nsimd_sleef_copysign_avx512_skylake_f64\n#define xcopysignf nsimd_sleef_copysign_avx512_skylake_f32\n#define xfmax nsimd_sleef_fmax_avx512_skylake_f64\n#define xfmaxf nsimd_sleef_fmax_avx512_skylake_f32\n#define xfmin nsimd_sleef_fmin_avx512_skylake_f64\n#define xfminf nsimd_sleef_fmin_avx512_skylake_f32\n#define xfdim nsimd_sleef_fdim_avx512_skylake_f64\n#define xfdimf nsimd_sleef_fdim_avx512_skylake_f32\n#define 
xtrunc nsimd_sleef_trunc_avx512_skylake_f64\n#define xtruncf nsimd_sleef_trunc_avx512_skylake_f32\n#define xfloor nsimd_sleef_floor_avx512_skylake_f64\n#define xfloorf nsimd_sleef_floor_avx512_skylake_f32\n#define xceil nsimd_sleef_ceil_avx512_skylake_f64\n#define xceilf nsimd_sleef_ceil_avx512_skylake_f32\n#define xround nsimd_sleef_round_avx512_skylake_f64\n#define xroundf nsimd_sleef_round_avx512_skylake_f32\n#define xrint nsimd_sleef_rint_avx512_skylake_f64\n#define xrintf nsimd_sleef_rint_avx512_skylake_f32\n#define xnextafter nsimd_sleef_nextafter_avx512_skylake_f64\n#define xnextafterf nsimd_sleef_nextafter_avx512_skylake_f32\n#define xfrfrexp nsimd_sleef_frfrexp_avx512_skylake_f64\n#define xfrfrexpf nsimd_sleef_frfrexp_avx512_skylake_f32\n#define xexpfrexp nsimd_sleef_expfrexp_avx512_skylake_f64\n#define xexpfrexpf nsimd_sleef_expfrexp_avx512_skylake_f32\n#define xfmod nsimd_sleef_fmod_avx512_skylake_f64\n#define xfmodf nsimd_sleef_fmod_avx512_skylake_f32\n#define xremainder nsimd_sleef_remainder_avx512_skylake_f64\n#define xremainderf nsimd_sleef_remainder_avx512_skylake_f32\n#define xmodf nsimd_sleef_modf_avx512_skylake_f64\n#define xmodff nsimd_sleef_modf_avx512_skylake_f32\n#define xlgamma_u1 nsimd_sleef_lgamma_u10_avx512_skylake_f64\n#define xlgammaf_u1 nsimd_sleef_lgamma_u10_avx512_skylake_f32\n#define xtgamma_u1 nsimd_sleef_tgamma_u10_avx512_skylake_f64\n#define xtgammaf_u1 nsimd_sleef_tgamma_u10_avx512_skylake_f32\n#define xerf_u1 nsimd_sleef_erf_u10_avx512_skylake_f64\n#define xerff_u1 nsimd_sleef_erf_u10_avx512_skylake_f32\n#define xerfc_u15 nsimd_sleef_erfc_u15_avx512_skylake_f64\n#define xerfcf_u15 nsimd_sleef_erfc_u15_avx512_skylake_f32\n#define xgetInt nsimd_sleef_getInt_avx512_skylake_f64\n#define xgetIntf nsimd_sleef_getInt_avx512_skylake_f32\n#define xgetPtr nsimd_sleef_getPtr_avx512_skylake_f64\n#define xgetPtrf nsimd_sleef_getPtr_avx512_skylake_f32\n\n                   #endif\n\n                   #define rempi 
nsimd_sleef_rempi_avx512_skylake\n                   #define rempif nsimd_sleef_rempif_avx512_skylake\n                   #define rempisub nsimd_sleef_rempisub_avx512_skylake\n                   #define rempisubf nsimd_sleef_rempisubf_avx512_skylake\n                   #define gammak nsimd_gammak_avx512_skylake\n                   #define gammafk nsimd_gammafk_avx512_skylake\n\n                   #endif\n\n                   \n\n#endif\n\n"
  },
  {
    "path": "src/renameneon32.h",
    "content": "#ifndef RENAMENEON32_H\n               #define RENAMENEON32_H\n\n               /* ------------------------------------------------------------------------- */\n                   /* Naming of functions neon128 */\n\n                   #ifdef NSIMD_NEON128\n\n                   #ifdef DETERMINISTIC\n\n                   #define xsin nsimd_sleef_sin_u35d_neon128_f64\n#define xsinf nsimd_sleef_sin_u35d_neon128_f32\n#define xcos nsimd_sleef_cos_u35d_neon128_f64\n#define xcosf nsimd_sleef_cos_u35d_neon128_f32\n#define xsincos nsimd_sleef_sincos_u35d_neon128_f64\n#define xsincosf nsimd_sleef_sincos_u35d_neon128_f32\n#define xtan nsimd_sleef_tan_u35d_neon128_f64\n#define xtanf nsimd_sleef_tan_u35d_neon128_f32\n#define xasin nsimd_sleef_asin_u35d_neon128_f64\n#define xasinf nsimd_sleef_asin_u35d_neon128_f32\n#define xacos nsimd_sleef_acos_u35d_neon128_f64\n#define xacosf nsimd_sleef_acos_u35d_neon128_f32\n#define xatan nsimd_sleef_atan_u35d_neon128_f64\n#define xatanf nsimd_sleef_atan_u35d_neon128_f32\n#define xatan2 nsimd_sleef_atan2_u35d_neon128_f64\n#define xatan2f nsimd_sleef_atan2_u35d_neon128_f32\n#define xlog nsimd_sleef_log_u35d_neon128_f64\n#define xlogf nsimd_sleef_log_u35d_neon128_f32\n#define xcbrt nsimd_sleef_cbrt_u35d_neon128_f64\n#define xcbrtf nsimd_sleef_cbrt_u35d_neon128_f32\n#define xsin_u1 nsimd_sleef_sin_u10d_neon128_f64\n#define xsinf_u1 nsimd_sleef_sin_u10d_neon128_f32\n#define xcos_u1 nsimd_sleef_cos_u10d_neon128_f64\n#define xcosf_u1 nsimd_sleef_cos_u10d_neon128_f32\n#define xsincos_u1 nsimd_sleef_sincos_u10d_neon128_f64\n#define xsincosf_u1 nsimd_sleef_sincos_u10d_neon128_f32\n#define xtan_u1 nsimd_sleef_tan_u10d_neon128_f64\n#define xtanf_u1 nsimd_sleef_tan_u10d_neon128_f32\n#define xasin_u1 nsimd_sleef_asin_u10d_neon128_f64\n#define xasinf_u1 nsimd_sleef_asin_u10d_neon128_f32\n#define xacos_u1 nsimd_sleef_acos_u10d_neon128_f64\n#define xacosf_u1 nsimd_sleef_acos_u10d_neon128_f32\n#define xatan_u1 
nsimd_sleef_atan_u10d_neon128_f64\n#define xatanf_u1 nsimd_sleef_atan_u10d_neon128_f32\n#define xatan2_u1 nsimd_sleef_atan2_u10d_neon128_f64\n#define xatan2f_u1 nsimd_sleef_atan2_u10d_neon128_f32\n#define xlog_u1 nsimd_sleef_log_u10d_neon128_f64\n#define xlogf_u1 nsimd_sleef_log_u10d_neon128_f32\n#define xcbrt_u1 nsimd_sleef_cbrt_u10d_neon128_f64\n#define xcbrtf_u1 nsimd_sleef_cbrt_u10d_neon128_f32\n#define xexp nsimd_sleef_exp_u10d_neon128_f64\n#define xexpf nsimd_sleef_exp_u10d_neon128_f32\n#define xpow nsimd_sleef_pow_u10d_neon128_f64\n#define xpowf nsimd_sleef_pow_u10d_neon128_f32\n#define xsinh nsimd_sleef_sinh_u10d_neon128_f64\n#define xsinhf nsimd_sleef_sinh_u10d_neon128_f32\n#define xcosh nsimd_sleef_cosh_u10d_neon128_f64\n#define xcoshf nsimd_sleef_cosh_u10d_neon128_f32\n#define xtanh nsimd_sleef_tanh_u10d_neon128_f64\n#define xtanhf nsimd_sleef_tanh_u10d_neon128_f32\n#define xsinh_u35 nsimd_sleef_sinh_u35d_neon128_f64\n#define xsinhf_u35 nsimd_sleef_sinh_u35d_neon128_f32\n#define xcosh_u35 nsimd_sleef_cosh_u35d_neon128_f64\n#define xcoshf_u35 nsimd_sleef_cosh_u35d_neon128_f32\n#define xtanh_u35 nsimd_sleef_tanh_u35d_neon128_f64\n#define xtanhf_u35 nsimd_sleef_tanh_u35d_neon128_f32\n#define xfastsin_u3500 nsimd_sleef_fastsin_u3500d_neon128_f64\n#define xfastsinf_u3500 nsimd_sleef_fastsin_u3500d_neon128_f32\n#define xfastcos_u3500 nsimd_sleef_fastcos_u3500d_neon128_f64\n#define xfastcosf_u3500 nsimd_sleef_fastcos_u3500d_neon128_f32\n#define xfastpow_u3500 nsimd_sleef_fastpow_u3500d_neon128_f64\n#define xfastpowf_u3500 nsimd_sleef_fastpow_u3500d_neon128_f32\n#define xasinh nsimd_sleef_asinh_u10d_neon128_f64\n#define xasinhf nsimd_sleef_asinh_u10d_neon128_f32\n#define xacosh nsimd_sleef_acosh_u10d_neon128_f64\n#define xacoshf nsimd_sleef_acosh_u10d_neon128_f32\n#define xatanh nsimd_sleef_atanh_u10d_neon128_f64\n#define xatanhf nsimd_sleef_atanh_u10d_neon128_f32\n#define xexp2 nsimd_sleef_exp2_u10d_neon128_f64\n#define xexp2f 
nsimd_sleef_exp2_u10d_neon128_f32\n#define xexp2_u35 nsimd_sleef_exp2_u35d_neon128_f64\n#define xexp2f_u35 nsimd_sleef_exp2_u35d_neon128_f32\n#define xexp10 nsimd_sleef_exp10_u10d_neon128_f64\n#define xexp10f nsimd_sleef_exp10_u10d_neon128_f32\n#define xexp10_u35 nsimd_sleef_exp10_u35d_neon128_f64\n#define xexp10f_u35 nsimd_sleef_exp10_u35d_neon128_f32\n#define xexpm1 nsimd_sleef_expm1_u10d_neon128_f64\n#define xexpm1f nsimd_sleef_expm1_u10d_neon128_f32\n#define xlog10 nsimd_sleef_log10_u10d_neon128_f64\n#define xlog10f nsimd_sleef_log10_u10d_neon128_f32\n#define xlog2 nsimd_sleef_log2_u10d_neon128_f64\n#define xlog2f nsimd_sleef_log2_u10d_neon128_f32\n#define xlog2_u35 nsimd_sleef_log2_u35d_neon128_f64\n#define xlog2f_u35 nsimd_sleef_log2_u35d_neon128_f32\n#define xlog1p nsimd_sleef_log1p_u10d_neon128_f64\n#define xlog1pf nsimd_sleef_log1p_u10d_neon128_f32\n#define xsincospi_u05 nsimd_sleef_sincospi_u05d_neon128_f64\n#define xsincospif_u05 nsimd_sleef_sincospi_u05d_neon128_f32\n#define xsincospi_u35 nsimd_sleef_sincospi_u35d_neon128_f64\n#define xsincospif_u35 nsimd_sleef_sincospi_u35d_neon128_f32\n#define xsinpi_u05 nsimd_sleef_sinpi_u05d_neon128_f64\n#define xsinpif_u05 nsimd_sleef_sinpi_u05d_neon128_f32\n#define xcospi_u05 nsimd_sleef_cospi_u05d_neon128_f64\n#define xcospif_u05 nsimd_sleef_cospi_u05d_neon128_f32\n#define xldexp nsimd_sleef_ldexp_neon128_f64\n#define xldexpf nsimd_sleef_ldexp_neon128_f32\n#define xilogb nsimd_sleef_ilogb_neon128_f64\n#define xilogbf nsimd_sleef_ilogb_neon128_f32\n#define xfma nsimd_sleef_fma_neon128_f64\n#define xfmaf nsimd_sleef_fma_neon128_f32\n#define xsqrt nsimd_sleef_sqrt_neon128_f64\n#define xsqrtf nsimd_sleef_sqrt_neon128_f32\n#define xsqrt_u05 nsimd_sleef_sqrt_u05d_neon128_f64\n#define xsqrtf_u05 nsimd_sleef_sqrt_u05d_neon128_f32\n#define xsqrt_u35 nsimd_sleef_sqrt_u35d_neon128_f64\n#define xsqrtf_u35 nsimd_sleef_sqrt_u35d_neon128_f32\n#define xhypot_u05 nsimd_sleef_hypot_u05d_neon128_f64\n#define xhypotf_u05 
nsimd_sleef_hypot_u05d_neon128_f32\n#define xhypot_u35 nsimd_sleef_hypot_u35d_neon128_f64\n#define xhypotf_u35 nsimd_sleef_hypot_u35d_neon128_f32\n#define xfabs nsimd_sleef_fabs_neon128_f64\n#define xfabsf nsimd_sleef_fabs_neon128_f32\n#define xcopysign nsimd_sleef_copysign_neon128_f64\n#define xcopysignf nsimd_sleef_copysign_neon128_f32\n#define xfmax nsimd_sleef_fmax_neon128_f64\n#define xfmaxf nsimd_sleef_fmax_neon128_f32\n#define xfmin nsimd_sleef_fmin_neon128_f64\n#define xfminf nsimd_sleef_fmin_neon128_f32\n#define xfdim nsimd_sleef_fdim_neon128_f64\n#define xfdimf nsimd_sleef_fdim_neon128_f32\n#define xtrunc nsimd_sleef_trunc_neon128_f64\n#define xtruncf nsimd_sleef_trunc_neon128_f32\n#define xfloor nsimd_sleef_floor_neon128_f64\n#define xfloorf nsimd_sleef_floor_neon128_f32\n#define xceil nsimd_sleef_ceil_neon128_f64\n#define xceilf nsimd_sleef_ceil_neon128_f32\n#define xround nsimd_sleef_round_neon128_f64\n#define xroundf nsimd_sleef_round_neon128_f32\n#define xrint nsimd_sleef_rint_neon128_f64\n#define xrintf nsimd_sleef_rint_neon128_f32\n#define xnextafter nsimd_sleef_nextafter_neon128_f64\n#define xnextafterf nsimd_sleef_nextafter_neon128_f32\n#define xfrfrexp nsimd_sleef_frfrexp_neon128_f64\n#define xfrfrexpf nsimd_sleef_frfrexp_neon128_f32\n#define xexpfrexp nsimd_sleef_expfrexp_neon128_f64\n#define xexpfrexpf nsimd_sleef_expfrexp_neon128_f32\n#define xfmod nsimd_sleef_fmod_neon128_f64\n#define xfmodf nsimd_sleef_fmod_neon128_f32\n#define xremainder nsimd_sleef_remainder_neon128_f64\n#define xremainderf nsimd_sleef_remainder_neon128_f32\n#define xmodf nsimd_sleef_modf_neon128_f64\n#define xmodff nsimd_sleef_modf_neon128_f32\n#define xlgamma_u1 nsimd_sleef_lgamma_u10d_neon128_f64\n#define xlgammaf_u1 nsimd_sleef_lgamma_u10d_neon128_f32\n#define xtgamma_u1 nsimd_sleef_tgamma_u10d_neon128_f64\n#define xtgammaf_u1 nsimd_sleef_tgamma_u10d_neon128_f32\n#define xerf_u1 nsimd_sleef_erf_u10d_neon128_f64\n#define xerff_u1 
nsimd_sleef_erf_u10d_neon128_f32\n#define xerfc_u15 nsimd_sleef_erfc_u15d_neon128_f64\n#define xerfcf_u15 nsimd_sleef_erfc_u15d_neon128_f32\n#define xgetInt nsimd_sleef_getInt_neon128_f64\n#define xgetIntf nsimd_sleef_getInt_neon128_f32\n#define xgetPtr nsimd_sleef_getPtr_neon128_f64\n#define xgetPtrf nsimd_sleef_getPtr_neon128_f32\n\n                   #else\n\n                   #define xsin nsimd_sleef_sin_u35_neon128_f64\n#define xsinf nsimd_sleef_sin_u35_neon128_f32\n#define xcos nsimd_sleef_cos_u35_neon128_f64\n#define xcosf nsimd_sleef_cos_u35_neon128_f32\n#define xsincos nsimd_sleef_sincos_u35_neon128_f64\n#define xsincosf nsimd_sleef_sincos_u35_neon128_f32\n#define xtan nsimd_sleef_tan_u35_neon128_f64\n#define xtanf nsimd_sleef_tan_u35_neon128_f32\n#define xasin nsimd_sleef_asin_u35_neon128_f64\n#define xasinf nsimd_sleef_asin_u35_neon128_f32\n#define xacos nsimd_sleef_acos_u35_neon128_f64\n#define xacosf nsimd_sleef_acos_u35_neon128_f32\n#define xatan nsimd_sleef_atan_u35_neon128_f64\n#define xatanf nsimd_sleef_atan_u35_neon128_f32\n#define xatan2 nsimd_sleef_atan2_u35_neon128_f64\n#define xatan2f nsimd_sleef_atan2_u35_neon128_f32\n#define xlog nsimd_sleef_log_u35_neon128_f64\n#define xlogf nsimd_sleef_log_u35_neon128_f32\n#define xcbrt nsimd_sleef_cbrt_u35_neon128_f64\n#define xcbrtf nsimd_sleef_cbrt_u35_neon128_f32\n#define xsin_u1 nsimd_sleef_sin_u10_neon128_f64\n#define xsinf_u1 nsimd_sleef_sin_u10_neon128_f32\n#define xcos_u1 nsimd_sleef_cos_u10_neon128_f64\n#define xcosf_u1 nsimd_sleef_cos_u10_neon128_f32\n#define xsincos_u1 nsimd_sleef_sincos_u10_neon128_f64\n#define xsincosf_u1 nsimd_sleef_sincos_u10_neon128_f32\n#define xtan_u1 nsimd_sleef_tan_u10_neon128_f64\n#define xtanf_u1 nsimd_sleef_tan_u10_neon128_f32\n#define xasin_u1 nsimd_sleef_asin_u10_neon128_f64\n#define xasinf_u1 nsimd_sleef_asin_u10_neon128_f32\n#define xacos_u1 nsimd_sleef_acos_u10_neon128_f64\n#define xacosf_u1 nsimd_sleef_acos_u10_neon128_f32\n#define xatan_u1 
nsimd_sleef_atan_u10_neon128_f64\n#define xatanf_u1 nsimd_sleef_atan_u10_neon128_f32\n#define xatan2_u1 nsimd_sleef_atan2_u10_neon128_f64\n#define xatan2f_u1 nsimd_sleef_atan2_u10_neon128_f32\n#define xlog_u1 nsimd_sleef_log_u10_neon128_f64\n#define xlogf_u1 nsimd_sleef_log_u10_neon128_f32\n#define xcbrt_u1 nsimd_sleef_cbrt_u10_neon128_f64\n#define xcbrtf_u1 nsimd_sleef_cbrt_u10_neon128_f32\n#define xexp nsimd_sleef_exp_u10_neon128_f64\n#define xexpf nsimd_sleef_exp_u10_neon128_f32\n#define xpow nsimd_sleef_pow_u10_neon128_f64\n#define xpowf nsimd_sleef_pow_u10_neon128_f32\n#define xsinh nsimd_sleef_sinh_u10_neon128_f64\n#define xsinhf nsimd_sleef_sinh_u10_neon128_f32\n#define xcosh nsimd_sleef_cosh_u10_neon128_f64\n#define xcoshf nsimd_sleef_cosh_u10_neon128_f32\n#define xtanh nsimd_sleef_tanh_u10_neon128_f64\n#define xtanhf nsimd_sleef_tanh_u10_neon128_f32\n#define xsinh_u35 nsimd_sleef_sinh_u35_neon128_f64\n#define xsinhf_u35 nsimd_sleef_sinh_u35_neon128_f32\n#define xcosh_u35 nsimd_sleef_cosh_u35_neon128_f64\n#define xcoshf_u35 nsimd_sleef_cosh_u35_neon128_f32\n#define xtanh_u35 nsimd_sleef_tanh_u35_neon128_f64\n#define xtanhf_u35 nsimd_sleef_tanh_u35_neon128_f32\n#define xfastsin_u3500 nsimd_sleef_fastsin_u3500_neon128_f64\n#define xfastsinf_u3500 nsimd_sleef_fastsin_u3500_neon128_f32\n#define xfastcos_u3500 nsimd_sleef_fastcos_u3500_neon128_f64\n#define xfastcosf_u3500 nsimd_sleef_fastcos_u3500_neon128_f32\n#define xfastpow_u3500 nsimd_sleef_fastpow_u3500_neon128_f64\n#define xfastpowf_u3500 nsimd_sleef_fastpow_u3500_neon128_f32\n#define xasinh nsimd_sleef_asinh_u10_neon128_f64\n#define xasinhf nsimd_sleef_asinh_u10_neon128_f32\n#define xacosh nsimd_sleef_acosh_u10_neon128_f64\n#define xacoshf nsimd_sleef_acosh_u10_neon128_f32\n#define xatanh nsimd_sleef_atanh_u10_neon128_f64\n#define xatanhf nsimd_sleef_atanh_u10_neon128_f32\n#define xexp2 nsimd_sleef_exp2_u10_neon128_f64\n#define xexp2f nsimd_sleef_exp2_u10_neon128_f32\n#define xexp2_u35 
nsimd_sleef_exp2_u35_neon128_f64\n#define xexp2f_u35 nsimd_sleef_exp2_u35_neon128_f32\n#define xexp10 nsimd_sleef_exp10_u10_neon128_f64\n#define xexp10f nsimd_sleef_exp10_u10_neon128_f32\n#define xexp10_u35 nsimd_sleef_exp10_u35_neon128_f64\n#define xexp10f_u35 nsimd_sleef_exp10_u35_neon128_f32\n#define xexpm1 nsimd_sleef_expm1_u10_neon128_f64\n#define xexpm1f nsimd_sleef_expm1_u10_neon128_f32\n#define xlog10 nsimd_sleef_log10_u10_neon128_f64\n#define xlog10f nsimd_sleef_log10_u10_neon128_f32\n#define xlog2 nsimd_sleef_log2_u10_neon128_f64\n#define xlog2f nsimd_sleef_log2_u10_neon128_f32\n#define xlog2_u35 nsimd_sleef_log2_u35_neon128_f64\n#define xlog2f_u35 nsimd_sleef_log2_u35_neon128_f32\n#define xlog1p nsimd_sleef_log1p_u10_neon128_f64\n#define xlog1pf nsimd_sleef_log1p_u10_neon128_f32\n#define xsincospi_u05 nsimd_sleef_sincospi_u05_neon128_f64\n#define xsincospif_u05 nsimd_sleef_sincospi_u05_neon128_f32\n#define xsincospi_u35 nsimd_sleef_sincospi_u35_neon128_f64\n#define xsincospif_u35 nsimd_sleef_sincospi_u35_neon128_f32\n#define xsinpi_u05 nsimd_sleef_sinpi_u05_neon128_f64\n#define xsinpif_u05 nsimd_sleef_sinpi_u05_neon128_f32\n#define xcospi_u05 nsimd_sleef_cospi_u05_neon128_f64\n#define xcospif_u05 nsimd_sleef_cospi_u05_neon128_f32\n#define xldexp nsimd_sleef_ldexp_neon128_f64\n#define xldexpf nsimd_sleef_ldexp_neon128_f32\n#define xilogb nsimd_sleef_ilogb_neon128_f64\n#define xilogbf nsimd_sleef_ilogb_neon128_f32\n#define xfma nsimd_sleef_fma_neon128_f64\n#define xfmaf nsimd_sleef_fma_neon128_f32\n#define xsqrt nsimd_sleef_sqrt_neon128_f64\n#define xsqrtf nsimd_sleef_sqrt_neon128_f32\n#define xsqrt_u05 nsimd_sleef_sqrt_u05_neon128_f64\n#define xsqrtf_u05 nsimd_sleef_sqrt_u05_neon128_f32\n#define xsqrt_u35 nsimd_sleef_sqrt_u35_neon128_f64\n#define xsqrtf_u35 nsimd_sleef_sqrt_u35_neon128_f32\n#define xhypot_u05 nsimd_sleef_hypot_u05_neon128_f64\n#define xhypotf_u05 nsimd_sleef_hypot_u05_neon128_f32\n#define xhypot_u35 
nsimd_sleef_hypot_u35_neon128_f64\n#define xhypotf_u35 nsimd_sleef_hypot_u35_neon128_f32\n#define xfabs nsimd_sleef_fabs_neon128_f64\n#define xfabsf nsimd_sleef_fabs_neon128_f32\n#define xcopysign nsimd_sleef_copysign_neon128_f64\n#define xcopysignf nsimd_sleef_copysign_neon128_f32\n#define xfmax nsimd_sleef_fmax_neon128_f64\n#define xfmaxf nsimd_sleef_fmax_neon128_f32\n#define xfmin nsimd_sleef_fmin_neon128_f64\n#define xfminf nsimd_sleef_fmin_neon128_f32\n#define xfdim nsimd_sleef_fdim_neon128_f64\n#define xfdimf nsimd_sleef_fdim_neon128_f32\n#define xtrunc nsimd_sleef_trunc_neon128_f64\n#define xtruncf nsimd_sleef_trunc_neon128_f32\n#define xfloor nsimd_sleef_floor_neon128_f64\n#define xfloorf nsimd_sleef_floor_neon128_f32\n#define xceil nsimd_sleef_ceil_neon128_f64\n#define xceilf nsimd_sleef_ceil_neon128_f32\n#define xround nsimd_sleef_round_neon128_f64\n#define xroundf nsimd_sleef_round_neon128_f32\n#define xrint nsimd_sleef_rint_neon128_f64\n#define xrintf nsimd_sleef_rint_neon128_f32\n#define xnextafter nsimd_sleef_nextafter_neon128_f64\n#define xnextafterf nsimd_sleef_nextafter_neon128_f32\n#define xfrfrexp nsimd_sleef_frfrexp_neon128_f64\n#define xfrfrexpf nsimd_sleef_frfrexp_neon128_f32\n#define xexpfrexp nsimd_sleef_expfrexp_neon128_f64\n#define xexpfrexpf nsimd_sleef_expfrexp_neon128_f32\n#define xfmod nsimd_sleef_fmod_neon128_f64\n#define xfmodf nsimd_sleef_fmod_neon128_f32\n#define xremainder nsimd_sleef_remainder_neon128_f64\n#define xremainderf nsimd_sleef_remainder_neon128_f32\n#define xmodf nsimd_sleef_modf_neon128_f64\n#define xmodff nsimd_sleef_modf_neon128_f32\n#define xlgamma_u1 nsimd_sleef_lgamma_u10_neon128_f64\n#define xlgammaf_u1 nsimd_sleef_lgamma_u10_neon128_f32\n#define xtgamma_u1 nsimd_sleef_tgamma_u10_neon128_f64\n#define xtgammaf_u1 nsimd_sleef_tgamma_u10_neon128_f32\n#define xerf_u1 nsimd_sleef_erf_u10_neon128_f64\n#define xerff_u1 nsimd_sleef_erf_u10_neon128_f32\n#define xerfc_u15 nsimd_sleef_erfc_u15_neon128_f64\n#define 
xerfcf_u15 nsimd_sleef_erfc_u15_neon128_f32\n#define xgetInt nsimd_sleef_getInt_neon128_f64\n#define xgetIntf nsimd_sleef_getInt_neon128_f32\n#define xgetPtr nsimd_sleef_getPtr_neon128_f64\n#define xgetPtrf nsimd_sleef_getPtr_neon128_f32\n\n                   #endif\n\n                   #define rempi nsimd_sleef_rempi_neon128\n                   #define rempif nsimd_sleef_rempif_neon128\n                   #define rempisub nsimd_sleef_rempisub_neon128\n                   #define rempisubf nsimd_sleef_rempisubf_neon128\n                   #define gammak nsimd_gammak_neon128\n                   #define gammafk nsimd_gammafk_neon128\n\n                   #endif\n\n                   \n\n#endif\n\n"
  },
  {
    "path": "src/renamesse2.h",
    "content": "#ifndef RENAMESSE2_H\n               #define RENAMESSE2_H\n\n               /* ------------------------------------------------------------------------- */\n                   /* Naming of functions sse2 */\n\n                   #ifdef NSIMD_SSE2\n\n                   #ifdef DETERMINISTIC\n\n                   #define xsin nsimd_sleef_sin_u35d_sse2_f64\n#define xsinf nsimd_sleef_sin_u35d_sse2_f32\n#define xcos nsimd_sleef_cos_u35d_sse2_f64\n#define xcosf nsimd_sleef_cos_u35d_sse2_f32\n#define xsincos nsimd_sleef_sincos_u35d_sse2_f64\n#define xsincosf nsimd_sleef_sincos_u35d_sse2_f32\n#define xtan nsimd_sleef_tan_u35d_sse2_f64\n#define xtanf nsimd_sleef_tan_u35d_sse2_f32\n#define xasin nsimd_sleef_asin_u35d_sse2_f64\n#define xasinf nsimd_sleef_asin_u35d_sse2_f32\n#define xacos nsimd_sleef_acos_u35d_sse2_f64\n#define xacosf nsimd_sleef_acos_u35d_sse2_f32\n#define xatan nsimd_sleef_atan_u35d_sse2_f64\n#define xatanf nsimd_sleef_atan_u35d_sse2_f32\n#define xatan2 nsimd_sleef_atan2_u35d_sse2_f64\n#define xatan2f nsimd_sleef_atan2_u35d_sse2_f32\n#define xlog nsimd_sleef_log_u35d_sse2_f64\n#define xlogf nsimd_sleef_log_u35d_sse2_f32\n#define xcbrt nsimd_sleef_cbrt_u35d_sse2_f64\n#define xcbrtf nsimd_sleef_cbrt_u35d_sse2_f32\n#define xsin_u1 nsimd_sleef_sin_u10d_sse2_f64\n#define xsinf_u1 nsimd_sleef_sin_u10d_sse2_f32\n#define xcos_u1 nsimd_sleef_cos_u10d_sse2_f64\n#define xcosf_u1 nsimd_sleef_cos_u10d_sse2_f32\n#define xsincos_u1 nsimd_sleef_sincos_u10d_sse2_f64\n#define xsincosf_u1 nsimd_sleef_sincos_u10d_sse2_f32\n#define xtan_u1 nsimd_sleef_tan_u10d_sse2_f64\n#define xtanf_u1 nsimd_sleef_tan_u10d_sse2_f32\n#define xasin_u1 nsimd_sleef_asin_u10d_sse2_f64\n#define xasinf_u1 nsimd_sleef_asin_u10d_sse2_f32\n#define xacos_u1 nsimd_sleef_acos_u10d_sse2_f64\n#define xacosf_u1 nsimd_sleef_acos_u10d_sse2_f32\n#define xatan_u1 nsimd_sleef_atan_u10d_sse2_f64\n#define xatanf_u1 nsimd_sleef_atan_u10d_sse2_f32\n#define xatan2_u1 
nsimd_sleef_atan2_u10d_sse2_f64\n#define xatan2f_u1 nsimd_sleef_atan2_u10d_sse2_f32\n#define xlog_u1 nsimd_sleef_log_u10d_sse2_f64\n#define xlogf_u1 nsimd_sleef_log_u10d_sse2_f32\n#define xcbrt_u1 nsimd_sleef_cbrt_u10d_sse2_f64\n#define xcbrtf_u1 nsimd_sleef_cbrt_u10d_sse2_f32\n#define xexp nsimd_sleef_exp_u10d_sse2_f64\n#define xexpf nsimd_sleef_exp_u10d_sse2_f32\n#define xpow nsimd_sleef_pow_u10d_sse2_f64\n#define xpowf nsimd_sleef_pow_u10d_sse2_f32\n#define xsinh nsimd_sleef_sinh_u10d_sse2_f64\n#define xsinhf nsimd_sleef_sinh_u10d_sse2_f32\n#define xcosh nsimd_sleef_cosh_u10d_sse2_f64\n#define xcoshf nsimd_sleef_cosh_u10d_sse2_f32\n#define xtanh nsimd_sleef_tanh_u10d_sse2_f64\n#define xtanhf nsimd_sleef_tanh_u10d_sse2_f32\n#define xsinh_u35 nsimd_sleef_sinh_u35d_sse2_f64\n#define xsinhf_u35 nsimd_sleef_sinh_u35d_sse2_f32\n#define xcosh_u35 nsimd_sleef_cosh_u35d_sse2_f64\n#define xcoshf_u35 nsimd_sleef_cosh_u35d_sse2_f32\n#define xtanh_u35 nsimd_sleef_tanh_u35d_sse2_f64\n#define xtanhf_u35 nsimd_sleef_tanh_u35d_sse2_f32\n#define xfastsin_u3500 nsimd_sleef_fastsin_u3500d_sse2_f64\n#define xfastsinf_u3500 nsimd_sleef_fastsin_u3500d_sse2_f32\n#define xfastcos_u3500 nsimd_sleef_fastcos_u3500d_sse2_f64\n#define xfastcosf_u3500 nsimd_sleef_fastcos_u3500d_sse2_f32\n#define xfastpow_u3500 nsimd_sleef_fastpow_u3500d_sse2_f64\n#define xfastpowf_u3500 nsimd_sleef_fastpow_u3500d_sse2_f32\n#define xasinh nsimd_sleef_asinh_u10d_sse2_f64\n#define xasinhf nsimd_sleef_asinh_u10d_sse2_f32\n#define xacosh nsimd_sleef_acosh_u10d_sse2_f64\n#define xacoshf nsimd_sleef_acosh_u10d_sse2_f32\n#define xatanh nsimd_sleef_atanh_u10d_sse2_f64\n#define xatanhf nsimd_sleef_atanh_u10d_sse2_f32\n#define xexp2 nsimd_sleef_exp2_u10d_sse2_f64\n#define xexp2f nsimd_sleef_exp2_u10d_sse2_f32\n#define xexp2_u35 nsimd_sleef_exp2_u35d_sse2_f64\n#define xexp2f_u35 nsimd_sleef_exp2_u35d_sse2_f32\n#define xexp10 nsimd_sleef_exp10_u10d_sse2_f64\n#define xexp10f nsimd_sleef_exp10_u10d_sse2_f32\n#define 
xexp10_u35 nsimd_sleef_exp10_u35d_sse2_f64\n#define xexp10f_u35 nsimd_sleef_exp10_u35d_sse2_f32\n#define xexpm1 nsimd_sleef_expm1_u10d_sse2_f64\n#define xexpm1f nsimd_sleef_expm1_u10d_sse2_f32\n#define xlog10 nsimd_sleef_log10_u10d_sse2_f64\n#define xlog10f nsimd_sleef_log10_u10d_sse2_f32\n#define xlog2 nsimd_sleef_log2_u10d_sse2_f64\n#define xlog2f nsimd_sleef_log2_u10d_sse2_f32\n#define xlog2_u35 nsimd_sleef_log2_u35d_sse2_f64\n#define xlog2f_u35 nsimd_sleef_log2_u35d_sse2_f32\n#define xlog1p nsimd_sleef_log1p_u10d_sse2_f64\n#define xlog1pf nsimd_sleef_log1p_u10d_sse2_f32\n#define xsincospi_u05 nsimd_sleef_sincospi_u05d_sse2_f64\n#define xsincospif_u05 nsimd_sleef_sincospi_u05d_sse2_f32\n#define xsincospi_u35 nsimd_sleef_sincospi_u35d_sse2_f64\n#define xsincospif_u35 nsimd_sleef_sincospi_u35d_sse2_f32\n#define xsinpi_u05 nsimd_sleef_sinpi_u05d_sse2_f64\n#define xsinpif_u05 nsimd_sleef_sinpi_u05d_sse2_f32\n#define xcospi_u05 nsimd_sleef_cospi_u05d_sse2_f64\n#define xcospif_u05 nsimd_sleef_cospi_u05d_sse2_f32\n#define xldexp nsimd_sleef_ldexp_sse2_f64\n#define xldexpf nsimd_sleef_ldexp_sse2_f32\n#define xilogb nsimd_sleef_ilogb_sse2_f64\n#define xilogbf nsimd_sleef_ilogb_sse2_f32\n#define xfma nsimd_sleef_fma_sse2_f64\n#define xfmaf nsimd_sleef_fma_sse2_f32\n#define xsqrt nsimd_sleef_sqrt_sse2_f64\n#define xsqrtf nsimd_sleef_sqrt_sse2_f32\n#define xsqrt_u05 nsimd_sleef_sqrt_u05d_sse2_f64\n#define xsqrtf_u05 nsimd_sleef_sqrt_u05d_sse2_f32\n#define xsqrt_u35 nsimd_sleef_sqrt_u35d_sse2_f64\n#define xsqrtf_u35 nsimd_sleef_sqrt_u35d_sse2_f32\n#define xhypot_u05 nsimd_sleef_hypot_u05d_sse2_f64\n#define xhypotf_u05 nsimd_sleef_hypot_u05d_sse2_f32\n#define xhypot_u35 nsimd_sleef_hypot_u35d_sse2_f64\n#define xhypotf_u35 nsimd_sleef_hypot_u35d_sse2_f32\n#define xfabs nsimd_sleef_fabs_sse2_f64\n#define xfabsf nsimd_sleef_fabs_sse2_f32\n#define xcopysign nsimd_sleef_copysign_sse2_f64\n#define xcopysignf nsimd_sleef_copysign_sse2_f32\n#define xfmax 
nsimd_sleef_fmax_sse2_f64\n#define xfmaxf nsimd_sleef_fmax_sse2_f32\n#define xfmin nsimd_sleef_fmin_sse2_f64\n#define xfminf nsimd_sleef_fmin_sse2_f32\n#define xfdim nsimd_sleef_fdim_sse2_f64\n#define xfdimf nsimd_sleef_fdim_sse2_f32\n#define xtrunc nsimd_sleef_trunc_sse2_f64\n#define xtruncf nsimd_sleef_trunc_sse2_f32\n#define xfloor nsimd_sleef_floor_sse2_f64\n#define xfloorf nsimd_sleef_floor_sse2_f32\n#define xceil nsimd_sleef_ceil_sse2_f64\n#define xceilf nsimd_sleef_ceil_sse2_f32\n#define xround nsimd_sleef_round_sse2_f64\n#define xroundf nsimd_sleef_round_sse2_f32\n#define xrint nsimd_sleef_rint_sse2_f64\n#define xrintf nsimd_sleef_rint_sse2_f32\n#define xnextafter nsimd_sleef_nextafter_sse2_f64\n#define xnextafterf nsimd_sleef_nextafter_sse2_f32\n#define xfrfrexp nsimd_sleef_frfrexp_sse2_f64\n#define xfrfrexpf nsimd_sleef_frfrexp_sse2_f32\n#define xexpfrexp nsimd_sleef_expfrexp_sse2_f64\n#define xexpfrexpf nsimd_sleef_expfrexp_sse2_f32\n#define xfmod nsimd_sleef_fmod_sse2_f64\n#define xfmodf nsimd_sleef_fmod_sse2_f32\n#define xremainder nsimd_sleef_remainder_sse2_f64\n#define xremainderf nsimd_sleef_remainder_sse2_f32\n#define xmodf nsimd_sleef_modf_sse2_f64\n#define xmodff nsimd_sleef_modf_sse2_f32\n#define xlgamma_u1 nsimd_sleef_lgamma_u10d_sse2_f64\n#define xlgammaf_u1 nsimd_sleef_lgamma_u10d_sse2_f32\n#define xtgamma_u1 nsimd_sleef_tgamma_u10d_sse2_f64\n#define xtgammaf_u1 nsimd_sleef_tgamma_u10d_sse2_f32\n#define xerf_u1 nsimd_sleef_erf_u10d_sse2_f64\n#define xerff_u1 nsimd_sleef_erf_u10d_sse2_f32\n#define xerfc_u15 nsimd_sleef_erfc_u15d_sse2_f64\n#define xerfcf_u15 nsimd_sleef_erfc_u15d_sse2_f32\n#define xgetInt nsimd_sleef_getInt_sse2_f64\n#define xgetIntf nsimd_sleef_getInt_sse2_f32\n#define xgetPtr nsimd_sleef_getPtr_sse2_f64\n#define xgetPtrf nsimd_sleef_getPtr_sse2_f32\n\n                   #else\n\n                   #define xsin nsimd_sleef_sin_u35_sse2_f64\n#define xsinf nsimd_sleef_sin_u35_sse2_f32\n#define xcos 
nsimd_sleef_cos_u35_sse2_f64\n#define xcosf nsimd_sleef_cos_u35_sse2_f32\n#define xsincos nsimd_sleef_sincos_u35_sse2_f64\n#define xsincosf nsimd_sleef_sincos_u35_sse2_f32\n#define xtan nsimd_sleef_tan_u35_sse2_f64\n#define xtanf nsimd_sleef_tan_u35_sse2_f32\n#define xasin nsimd_sleef_asin_u35_sse2_f64\n#define xasinf nsimd_sleef_asin_u35_sse2_f32\n#define xacos nsimd_sleef_acos_u35_sse2_f64\n#define xacosf nsimd_sleef_acos_u35_sse2_f32\n#define xatan nsimd_sleef_atan_u35_sse2_f64\n#define xatanf nsimd_sleef_atan_u35_sse2_f32\n#define xatan2 nsimd_sleef_atan2_u35_sse2_f64\n#define xatan2f nsimd_sleef_atan2_u35_sse2_f32\n#define xlog nsimd_sleef_log_u35_sse2_f64\n#define xlogf nsimd_sleef_log_u35_sse2_f32\n#define xcbrt nsimd_sleef_cbrt_u35_sse2_f64\n#define xcbrtf nsimd_sleef_cbrt_u35_sse2_f32\n#define xsin_u1 nsimd_sleef_sin_u10_sse2_f64\n#define xsinf_u1 nsimd_sleef_sin_u10_sse2_f32\n#define xcos_u1 nsimd_sleef_cos_u10_sse2_f64\n#define xcosf_u1 nsimd_sleef_cos_u10_sse2_f32\n#define xsincos_u1 nsimd_sleef_sincos_u10_sse2_f64\n#define xsincosf_u1 nsimd_sleef_sincos_u10_sse2_f32\n#define xtan_u1 nsimd_sleef_tan_u10_sse2_f64\n#define xtanf_u1 nsimd_sleef_tan_u10_sse2_f32\n#define xasin_u1 nsimd_sleef_asin_u10_sse2_f64\n#define xasinf_u1 nsimd_sleef_asin_u10_sse2_f32\n#define xacos_u1 nsimd_sleef_acos_u10_sse2_f64\n#define xacosf_u1 nsimd_sleef_acos_u10_sse2_f32\n#define xatan_u1 nsimd_sleef_atan_u10_sse2_f64\n#define xatanf_u1 nsimd_sleef_atan_u10_sse2_f32\n#define xatan2_u1 nsimd_sleef_atan2_u10_sse2_f64\n#define xatan2f_u1 nsimd_sleef_atan2_u10_sse2_f32\n#define xlog_u1 nsimd_sleef_log_u10_sse2_f64\n#define xlogf_u1 nsimd_sleef_log_u10_sse2_f32\n#define xcbrt_u1 nsimd_sleef_cbrt_u10_sse2_f64\n#define xcbrtf_u1 nsimd_sleef_cbrt_u10_sse2_f32\n#define xexp nsimd_sleef_exp_u10_sse2_f64\n#define xexpf nsimd_sleef_exp_u10_sse2_f32\n#define xpow nsimd_sleef_pow_u10_sse2_f64\n#define xpowf nsimd_sleef_pow_u10_sse2_f32\n#define xsinh nsimd_sleef_sinh_u10_sse2_f64\n#define 
xsinhf nsimd_sleef_sinh_u10_sse2_f32\n#define xcosh nsimd_sleef_cosh_u10_sse2_f64\n#define xcoshf nsimd_sleef_cosh_u10_sse2_f32\n#define xtanh nsimd_sleef_tanh_u10_sse2_f64\n#define xtanhf nsimd_sleef_tanh_u10_sse2_f32\n#define xsinh_u35 nsimd_sleef_sinh_u35_sse2_f64\n#define xsinhf_u35 nsimd_sleef_sinh_u35_sse2_f32\n#define xcosh_u35 nsimd_sleef_cosh_u35_sse2_f64\n#define xcoshf_u35 nsimd_sleef_cosh_u35_sse2_f32\n#define xtanh_u35 nsimd_sleef_tanh_u35_sse2_f64\n#define xtanhf_u35 nsimd_sleef_tanh_u35_sse2_f32\n#define xfastsin_u3500 nsimd_sleef_fastsin_u3500_sse2_f64\n#define xfastsinf_u3500 nsimd_sleef_fastsin_u3500_sse2_f32\n#define xfastcos_u3500 nsimd_sleef_fastcos_u3500_sse2_f64\n#define xfastcosf_u3500 nsimd_sleef_fastcos_u3500_sse2_f32\n#define xfastpow_u3500 nsimd_sleef_fastpow_u3500_sse2_f64\n#define xfastpowf_u3500 nsimd_sleef_fastpow_u3500_sse2_f32\n#define xasinh nsimd_sleef_asinh_u10_sse2_f64\n#define xasinhf nsimd_sleef_asinh_u10_sse2_f32\n#define xacosh nsimd_sleef_acosh_u10_sse2_f64\n#define xacoshf nsimd_sleef_acosh_u10_sse2_f32\n#define xatanh nsimd_sleef_atanh_u10_sse2_f64\n#define xatanhf nsimd_sleef_atanh_u10_sse2_f32\n#define xexp2 nsimd_sleef_exp2_u10_sse2_f64\n#define xexp2f nsimd_sleef_exp2_u10_sse2_f32\n#define xexp2_u35 nsimd_sleef_exp2_u35_sse2_f64\n#define xexp2f_u35 nsimd_sleef_exp2_u35_sse2_f32\n#define xexp10 nsimd_sleef_exp10_u10_sse2_f64\n#define xexp10f nsimd_sleef_exp10_u10_sse2_f32\n#define xexp10_u35 nsimd_sleef_exp10_u35_sse2_f64\n#define xexp10f_u35 nsimd_sleef_exp10_u35_sse2_f32\n#define xexpm1 nsimd_sleef_expm1_u10_sse2_f64\n#define xexpm1f nsimd_sleef_expm1_u10_sse2_f32\n#define xlog10 nsimd_sleef_log10_u10_sse2_f64\n#define xlog10f nsimd_sleef_log10_u10_sse2_f32\n#define xlog2 nsimd_sleef_log2_u10_sse2_f64\n#define xlog2f nsimd_sleef_log2_u10_sse2_f32\n#define xlog2_u35 nsimd_sleef_log2_u35_sse2_f64\n#define xlog2f_u35 nsimd_sleef_log2_u35_sse2_f32\n#define xlog1p nsimd_sleef_log1p_u10_sse2_f64\n#define xlog1pf 
nsimd_sleef_log1p_u10_sse2_f32\n#define xsincospi_u05 nsimd_sleef_sincospi_u05_sse2_f64\n#define xsincospif_u05 nsimd_sleef_sincospi_u05_sse2_f32\n#define xsincospi_u35 nsimd_sleef_sincospi_u35_sse2_f64\n#define xsincospif_u35 nsimd_sleef_sincospi_u35_sse2_f32\n#define xsinpi_u05 nsimd_sleef_sinpi_u05_sse2_f64\n#define xsinpif_u05 nsimd_sleef_sinpi_u05_sse2_f32\n#define xcospi_u05 nsimd_sleef_cospi_u05_sse2_f64\n#define xcospif_u05 nsimd_sleef_cospi_u05_sse2_f32\n#define xldexp nsimd_sleef_ldexp_sse2_f64\n#define xldexpf nsimd_sleef_ldexp_sse2_f32\n#define xilogb nsimd_sleef_ilogb_sse2_f64\n#define xilogbf nsimd_sleef_ilogb_sse2_f32\n#define xfma nsimd_sleef_fma_sse2_f64\n#define xfmaf nsimd_sleef_fma_sse2_f32\n#define xsqrt nsimd_sleef_sqrt_sse2_f64\n#define xsqrtf nsimd_sleef_sqrt_sse2_f32\n#define xsqrt_u05 nsimd_sleef_sqrt_u05_sse2_f64\n#define xsqrtf_u05 nsimd_sleef_sqrt_u05_sse2_f32\n#define xsqrt_u35 nsimd_sleef_sqrt_u35_sse2_f64\n#define xsqrtf_u35 nsimd_sleef_sqrt_u35_sse2_f32\n#define xhypot_u05 nsimd_sleef_hypot_u05_sse2_f64\n#define xhypotf_u05 nsimd_sleef_hypot_u05_sse2_f32\n#define xhypot_u35 nsimd_sleef_hypot_u35_sse2_f64\n#define xhypotf_u35 nsimd_sleef_hypot_u35_sse2_f32\n#define xfabs nsimd_sleef_fabs_sse2_f64\n#define xfabsf nsimd_sleef_fabs_sse2_f32\n#define xcopysign nsimd_sleef_copysign_sse2_f64\n#define xcopysignf nsimd_sleef_copysign_sse2_f32\n#define xfmax nsimd_sleef_fmax_sse2_f64\n#define xfmaxf nsimd_sleef_fmax_sse2_f32\n#define xfmin nsimd_sleef_fmin_sse2_f64\n#define xfminf nsimd_sleef_fmin_sse2_f32\n#define xfdim nsimd_sleef_fdim_sse2_f64\n#define xfdimf nsimd_sleef_fdim_sse2_f32\n#define xtrunc nsimd_sleef_trunc_sse2_f64\n#define xtruncf nsimd_sleef_trunc_sse2_f32\n#define xfloor nsimd_sleef_floor_sse2_f64\n#define xfloorf nsimd_sleef_floor_sse2_f32\n#define xceil nsimd_sleef_ceil_sse2_f64\n#define xceilf nsimd_sleef_ceil_sse2_f32\n#define xround nsimd_sleef_round_sse2_f64\n#define xroundf nsimd_sleef_round_sse2_f32\n#define xrint 
nsimd_sleef_rint_sse2_f64\n#define xrintf nsimd_sleef_rint_sse2_f32\n#define xnextafter nsimd_sleef_nextafter_sse2_f64\n#define xnextafterf nsimd_sleef_nextafter_sse2_f32\n#define xfrfrexp nsimd_sleef_frfrexp_sse2_f64\n#define xfrfrexpf nsimd_sleef_frfrexp_sse2_f32\n#define xexpfrexp nsimd_sleef_expfrexp_sse2_f64\n#define xexpfrexpf nsimd_sleef_expfrexp_sse2_f32\n#define xfmod nsimd_sleef_fmod_sse2_f64\n#define xfmodf nsimd_sleef_fmod_sse2_f32\n#define xremainder nsimd_sleef_remainder_sse2_f64\n#define xremainderf nsimd_sleef_remainder_sse2_f32\n#define xmodf nsimd_sleef_modf_sse2_f64\n#define xmodff nsimd_sleef_modf_sse2_f32\n#define xlgamma_u1 nsimd_sleef_lgamma_u10_sse2_f64\n#define xlgammaf_u1 nsimd_sleef_lgamma_u10_sse2_f32\n#define xtgamma_u1 nsimd_sleef_tgamma_u10_sse2_f64\n#define xtgammaf_u1 nsimd_sleef_tgamma_u10_sse2_f32\n#define xerf_u1 nsimd_sleef_erf_u10_sse2_f64\n#define xerff_u1 nsimd_sleef_erf_u10_sse2_f32\n#define xerfc_u15 nsimd_sleef_erfc_u15_sse2_f64\n#define xerfcf_u15 nsimd_sleef_erfc_u15_sse2_f32\n#define xgetInt nsimd_sleef_getInt_sse2_f64\n#define xgetIntf nsimd_sleef_getInt_sse2_f32\n#define xgetPtr nsimd_sleef_getPtr_sse2_f64\n#define xgetPtrf nsimd_sleef_getPtr_sse2_f32\n\n                   #endif\n\n                   #define rempi nsimd_sleef_rempi_sse2\n                   #define rempif nsimd_sleef_rempif_sse2\n                   #define rempisub nsimd_sleef_rempisub_sse2\n                   #define rempisubf nsimd_sleef_rempisubf_sse2\n                   #define gammak nsimd_gammak_sse2\n                   #define gammafk nsimd_gammafk_sse2\n\n                   #endif\n\n                   \n\n#endif\n\n"
  },
  {
    "path": "src/renamesse4.h",
    "content": "#ifndef RENAMESSE4_H\n               #define RENAMESSE4_H\n\n               /* ------------------------------------------------------------------------- */\n                   /* Naming of functions sse42 */\n\n                   #ifdef NSIMD_SSE42\n\n                   #ifdef DETERMINISTIC\n\n                   #define xsin nsimd_sleef_sin_u35d_sse42_f64\n#define xsinf nsimd_sleef_sin_u35d_sse42_f32\n#define xcos nsimd_sleef_cos_u35d_sse42_f64\n#define xcosf nsimd_sleef_cos_u35d_sse42_f32\n#define xsincos nsimd_sleef_sincos_u35d_sse42_f64\n#define xsincosf nsimd_sleef_sincos_u35d_sse42_f32\n#define xtan nsimd_sleef_tan_u35d_sse42_f64\n#define xtanf nsimd_sleef_tan_u35d_sse42_f32\n#define xasin nsimd_sleef_asin_u35d_sse42_f64\n#define xasinf nsimd_sleef_asin_u35d_sse42_f32\n#define xacos nsimd_sleef_acos_u35d_sse42_f64\n#define xacosf nsimd_sleef_acos_u35d_sse42_f32\n#define xatan nsimd_sleef_atan_u35d_sse42_f64\n#define xatanf nsimd_sleef_atan_u35d_sse42_f32\n#define xatan2 nsimd_sleef_atan2_u35d_sse42_f64\n#define xatan2f nsimd_sleef_atan2_u35d_sse42_f32\n#define xlog nsimd_sleef_log_u35d_sse42_f64\n#define xlogf nsimd_sleef_log_u35d_sse42_f32\n#define xcbrt nsimd_sleef_cbrt_u35d_sse42_f64\n#define xcbrtf nsimd_sleef_cbrt_u35d_sse42_f32\n#define xsin_u1 nsimd_sleef_sin_u10d_sse42_f64\n#define xsinf_u1 nsimd_sleef_sin_u10d_sse42_f32\n#define xcos_u1 nsimd_sleef_cos_u10d_sse42_f64\n#define xcosf_u1 nsimd_sleef_cos_u10d_sse42_f32\n#define xsincos_u1 nsimd_sleef_sincos_u10d_sse42_f64\n#define xsincosf_u1 nsimd_sleef_sincos_u10d_sse42_f32\n#define xtan_u1 nsimd_sleef_tan_u10d_sse42_f64\n#define xtanf_u1 nsimd_sleef_tan_u10d_sse42_f32\n#define xasin_u1 nsimd_sleef_asin_u10d_sse42_f64\n#define xasinf_u1 nsimd_sleef_asin_u10d_sse42_f32\n#define xacos_u1 nsimd_sleef_acos_u10d_sse42_f64\n#define xacosf_u1 nsimd_sleef_acos_u10d_sse42_f32\n#define xatan_u1 nsimd_sleef_atan_u10d_sse42_f64\n#define xatanf_u1 nsimd_sleef_atan_u10d_sse42_f32\n#define xatan2_u1 
nsimd_sleef_atan2_u10d_sse42_f64\n#define xatan2f_u1 nsimd_sleef_atan2_u10d_sse42_f32\n#define xlog_u1 nsimd_sleef_log_u10d_sse42_f64\n#define xlogf_u1 nsimd_sleef_log_u10d_sse42_f32\n#define xcbrt_u1 nsimd_sleef_cbrt_u10d_sse42_f64\n#define xcbrtf_u1 nsimd_sleef_cbrt_u10d_sse42_f32\n#define xexp nsimd_sleef_exp_u10d_sse42_f64\n#define xexpf nsimd_sleef_exp_u10d_sse42_f32\n#define xpow nsimd_sleef_pow_u10d_sse42_f64\n#define xpowf nsimd_sleef_pow_u10d_sse42_f32\n#define xsinh nsimd_sleef_sinh_u10d_sse42_f64\n#define xsinhf nsimd_sleef_sinh_u10d_sse42_f32\n#define xcosh nsimd_sleef_cosh_u10d_sse42_f64\n#define xcoshf nsimd_sleef_cosh_u10d_sse42_f32\n#define xtanh nsimd_sleef_tanh_u10d_sse42_f64\n#define xtanhf nsimd_sleef_tanh_u10d_sse42_f32\n#define xsinh_u35 nsimd_sleef_sinh_u35d_sse42_f64\n#define xsinhf_u35 nsimd_sleef_sinh_u35d_sse42_f32\n#define xcosh_u35 nsimd_sleef_cosh_u35d_sse42_f64\n#define xcoshf_u35 nsimd_sleef_cosh_u35d_sse42_f32\n#define xtanh_u35 nsimd_sleef_tanh_u35d_sse42_f64\n#define xtanhf_u35 nsimd_sleef_tanh_u35d_sse42_f32\n#define xfastsin_u3500 nsimd_sleef_fastsin_u3500d_sse42_f64\n#define xfastsinf_u3500 nsimd_sleef_fastsin_u3500d_sse42_f32\n#define xfastcos_u3500 nsimd_sleef_fastcos_u3500d_sse42_f64\n#define xfastcosf_u3500 nsimd_sleef_fastcos_u3500d_sse42_f32\n#define xfastpow_u3500 nsimd_sleef_fastpow_u3500d_sse42_f64\n#define xfastpowf_u3500 nsimd_sleef_fastpow_u3500d_sse42_f32\n#define xasinh nsimd_sleef_asinh_u10d_sse42_f64\n#define xasinhf nsimd_sleef_asinh_u10d_sse42_f32\n#define xacosh nsimd_sleef_acosh_u10d_sse42_f64\n#define xacoshf nsimd_sleef_acosh_u10d_sse42_f32\n#define xatanh nsimd_sleef_atanh_u10d_sse42_f64\n#define xatanhf nsimd_sleef_atanh_u10d_sse42_f32\n#define xexp2 nsimd_sleef_exp2_u10d_sse42_f64\n#define xexp2f nsimd_sleef_exp2_u10d_sse42_f32\n#define xexp2_u35 nsimd_sleef_exp2_u35d_sse42_f64\n#define xexp2f_u35 nsimd_sleef_exp2_u35d_sse42_f32\n#define xexp10 nsimd_sleef_exp10_u10d_sse42_f64\n#define xexp10f 
nsimd_sleef_exp10_u10d_sse42_f32\n#define xexp10_u35 nsimd_sleef_exp10_u35d_sse42_f64\n#define xexp10f_u35 nsimd_sleef_exp10_u35d_sse42_f32\n#define xexpm1 nsimd_sleef_expm1_u10d_sse42_f64\n#define xexpm1f nsimd_sleef_expm1_u10d_sse42_f32\n#define xlog10 nsimd_sleef_log10_u10d_sse42_f64\n#define xlog10f nsimd_sleef_log10_u10d_sse42_f32\n#define xlog2 nsimd_sleef_log2_u10d_sse42_f64\n#define xlog2f nsimd_sleef_log2_u10d_sse42_f32\n#define xlog2_u35 nsimd_sleef_log2_u35d_sse42_f64\n#define xlog2f_u35 nsimd_sleef_log2_u35d_sse42_f32\n#define xlog1p nsimd_sleef_log1p_u10d_sse42_f64\n#define xlog1pf nsimd_sleef_log1p_u10d_sse42_f32\n#define xsincospi_u05 nsimd_sleef_sincospi_u05d_sse42_f64\n#define xsincospif_u05 nsimd_sleef_sincospi_u05d_sse42_f32\n#define xsincospi_u35 nsimd_sleef_sincospi_u35d_sse42_f64\n#define xsincospif_u35 nsimd_sleef_sincospi_u35d_sse42_f32\n#define xsinpi_u05 nsimd_sleef_sinpi_u05d_sse42_f64\n#define xsinpif_u05 nsimd_sleef_sinpi_u05d_sse42_f32\n#define xcospi_u05 nsimd_sleef_cospi_u05d_sse42_f64\n#define xcospif_u05 nsimd_sleef_cospi_u05d_sse42_f32\n#define xldexp nsimd_sleef_ldexp_sse42_f64\n#define xldexpf nsimd_sleef_ldexp_sse42_f32\n#define xilogb nsimd_sleef_ilogb_sse42_f64\n#define xilogbf nsimd_sleef_ilogb_sse42_f32\n#define xfma nsimd_sleef_fma_sse42_f64\n#define xfmaf nsimd_sleef_fma_sse42_f32\n#define xsqrt nsimd_sleef_sqrt_sse42_f64\n#define xsqrtf nsimd_sleef_sqrt_sse42_f32\n#define xsqrt_u05 nsimd_sleef_sqrt_u05d_sse42_f64\n#define xsqrtf_u05 nsimd_sleef_sqrt_u05d_sse42_f32\n#define xsqrt_u35 nsimd_sleef_sqrt_u35d_sse42_f64\n#define xsqrtf_u35 nsimd_sleef_sqrt_u35d_sse42_f32\n#define xhypot_u05 nsimd_sleef_hypot_u05d_sse42_f64\n#define xhypotf_u05 nsimd_sleef_hypot_u05d_sse42_f32\n#define xhypot_u35 nsimd_sleef_hypot_u35d_sse42_f64\n#define xhypotf_u35 nsimd_sleef_hypot_u35d_sse42_f32\n#define xfabs nsimd_sleef_fabs_sse42_f64\n#define xfabsf nsimd_sleef_fabs_sse42_f32\n#define xcopysign nsimd_sleef_copysign_sse42_f64\n#define 
xcopysignf nsimd_sleef_copysign_sse42_f32\n#define xfmax nsimd_sleef_fmax_sse42_f64\n#define xfmaxf nsimd_sleef_fmax_sse42_f32\n#define xfmin nsimd_sleef_fmin_sse42_f64\n#define xfminf nsimd_sleef_fmin_sse42_f32\n#define xfdim nsimd_sleef_fdim_sse42_f64\n#define xfdimf nsimd_sleef_fdim_sse42_f32\n#define xtrunc nsimd_sleef_trunc_sse42_f64\n#define xtruncf nsimd_sleef_trunc_sse42_f32\n#define xfloor nsimd_sleef_floor_sse42_f64\n#define xfloorf nsimd_sleef_floor_sse42_f32\n#define xceil nsimd_sleef_ceil_sse42_f64\n#define xceilf nsimd_sleef_ceil_sse42_f32\n#define xround nsimd_sleef_round_sse42_f64\n#define xroundf nsimd_sleef_round_sse42_f32\n#define xrint nsimd_sleef_rint_sse42_f64\n#define xrintf nsimd_sleef_rint_sse42_f32\n#define xnextafter nsimd_sleef_nextafter_sse42_f64\n#define xnextafterf nsimd_sleef_nextafter_sse42_f32\n#define xfrfrexp nsimd_sleef_frfrexp_sse42_f64\n#define xfrfrexpf nsimd_sleef_frfrexp_sse42_f32\n#define xexpfrexp nsimd_sleef_expfrexp_sse42_f64\n#define xexpfrexpf nsimd_sleef_expfrexp_sse42_f32\n#define xfmod nsimd_sleef_fmod_sse42_f64\n#define xfmodf nsimd_sleef_fmod_sse42_f32\n#define xremainder nsimd_sleef_remainder_sse42_f64\n#define xremainderf nsimd_sleef_remainder_sse42_f32\n#define xmodf nsimd_sleef_modf_sse42_f64\n#define xmodff nsimd_sleef_modf_sse42_f32\n#define xlgamma_u1 nsimd_sleef_lgamma_u10d_sse42_f64\n#define xlgammaf_u1 nsimd_sleef_lgamma_u10d_sse42_f32\n#define xtgamma_u1 nsimd_sleef_tgamma_u10d_sse42_f64\n#define xtgammaf_u1 nsimd_sleef_tgamma_u10d_sse42_f32\n#define xerf_u1 nsimd_sleef_erf_u10d_sse42_f64\n#define xerff_u1 nsimd_sleef_erf_u10d_sse42_f32\n#define xerfc_u15 nsimd_sleef_erfc_u15d_sse42_f64\n#define xerfcf_u15 nsimd_sleef_erfc_u15d_sse42_f32\n#define xgetInt nsimd_sleef_getInt_sse42_f64\n#define xgetIntf nsimd_sleef_getInt_sse42_f32\n#define xgetPtr nsimd_sleef_getPtr_sse42_f64\n#define xgetPtrf nsimd_sleef_getPtr_sse42_f32\n\n                   #else\n\n                   #define xsin 
nsimd_sleef_sin_u35_sse42_f64\n#define xsinf nsimd_sleef_sin_u35_sse42_f32\n#define xcos nsimd_sleef_cos_u35_sse42_f64\n#define xcosf nsimd_sleef_cos_u35_sse42_f32\n#define xsincos nsimd_sleef_sincos_u35_sse42_f64\n#define xsincosf nsimd_sleef_sincos_u35_sse42_f32\n#define xtan nsimd_sleef_tan_u35_sse42_f64\n#define xtanf nsimd_sleef_tan_u35_sse42_f32\n#define xasin nsimd_sleef_asin_u35_sse42_f64\n#define xasinf nsimd_sleef_asin_u35_sse42_f32\n#define xacos nsimd_sleef_acos_u35_sse42_f64\n#define xacosf nsimd_sleef_acos_u35_sse42_f32\n#define xatan nsimd_sleef_atan_u35_sse42_f64\n#define xatanf nsimd_sleef_atan_u35_sse42_f32\n#define xatan2 nsimd_sleef_atan2_u35_sse42_f64\n#define xatan2f nsimd_sleef_atan2_u35_sse42_f32\n#define xlog nsimd_sleef_log_u35_sse42_f64\n#define xlogf nsimd_sleef_log_u35_sse42_f32\n#define xcbrt nsimd_sleef_cbrt_u35_sse42_f64\n#define xcbrtf nsimd_sleef_cbrt_u35_sse42_f32\n#define xsin_u1 nsimd_sleef_sin_u10_sse42_f64\n#define xsinf_u1 nsimd_sleef_sin_u10_sse42_f32\n#define xcos_u1 nsimd_sleef_cos_u10_sse42_f64\n#define xcosf_u1 nsimd_sleef_cos_u10_sse42_f32\n#define xsincos_u1 nsimd_sleef_sincos_u10_sse42_f64\n#define xsincosf_u1 nsimd_sleef_sincos_u10_sse42_f32\n#define xtan_u1 nsimd_sleef_tan_u10_sse42_f64\n#define xtanf_u1 nsimd_sleef_tan_u10_sse42_f32\n#define xasin_u1 nsimd_sleef_asin_u10_sse42_f64\n#define xasinf_u1 nsimd_sleef_asin_u10_sse42_f32\n#define xacos_u1 nsimd_sleef_acos_u10_sse42_f64\n#define xacosf_u1 nsimd_sleef_acos_u10_sse42_f32\n#define xatan_u1 nsimd_sleef_atan_u10_sse42_f64\n#define xatanf_u1 nsimd_sleef_atan_u10_sse42_f32\n#define xatan2_u1 nsimd_sleef_atan2_u10_sse42_f64\n#define xatan2f_u1 nsimd_sleef_atan2_u10_sse42_f32\n#define xlog_u1 nsimd_sleef_log_u10_sse42_f64\n#define xlogf_u1 nsimd_sleef_log_u10_sse42_f32\n#define xcbrt_u1 nsimd_sleef_cbrt_u10_sse42_f64\n#define xcbrtf_u1 nsimd_sleef_cbrt_u10_sse42_f32\n#define xexp nsimd_sleef_exp_u10_sse42_f64\n#define xexpf nsimd_sleef_exp_u10_sse42_f32\n#define 
xpow nsimd_sleef_pow_u10_sse42_f64\n#define xpowf nsimd_sleef_pow_u10_sse42_f32\n#define xsinh nsimd_sleef_sinh_u10_sse42_f64\n#define xsinhf nsimd_sleef_sinh_u10_sse42_f32\n#define xcosh nsimd_sleef_cosh_u10_sse42_f64\n#define xcoshf nsimd_sleef_cosh_u10_sse42_f32\n#define xtanh nsimd_sleef_tanh_u10_sse42_f64\n#define xtanhf nsimd_sleef_tanh_u10_sse42_f32\n#define xsinh_u35 nsimd_sleef_sinh_u35_sse42_f64\n#define xsinhf_u35 nsimd_sleef_sinh_u35_sse42_f32\n#define xcosh_u35 nsimd_sleef_cosh_u35_sse42_f64\n#define xcoshf_u35 nsimd_sleef_cosh_u35_sse42_f32\n#define xtanh_u35 nsimd_sleef_tanh_u35_sse42_f64\n#define xtanhf_u35 nsimd_sleef_tanh_u35_sse42_f32\n#define xfastsin_u3500 nsimd_sleef_fastsin_u3500_sse42_f64\n#define xfastsinf_u3500 nsimd_sleef_fastsin_u3500_sse42_f32\n#define xfastcos_u3500 nsimd_sleef_fastcos_u3500_sse42_f64\n#define xfastcosf_u3500 nsimd_sleef_fastcos_u3500_sse42_f32\n#define xfastpow_u3500 nsimd_sleef_fastpow_u3500_sse42_f64\n#define xfastpowf_u3500 nsimd_sleef_fastpow_u3500_sse42_f32\n#define xasinh nsimd_sleef_asinh_u10_sse42_f64\n#define xasinhf nsimd_sleef_asinh_u10_sse42_f32\n#define xacosh nsimd_sleef_acosh_u10_sse42_f64\n#define xacoshf nsimd_sleef_acosh_u10_sse42_f32\n#define xatanh nsimd_sleef_atanh_u10_sse42_f64\n#define xatanhf nsimd_sleef_atanh_u10_sse42_f32\n#define xexp2 nsimd_sleef_exp2_u10_sse42_f64\n#define xexp2f nsimd_sleef_exp2_u10_sse42_f32\n#define xexp2_u35 nsimd_sleef_exp2_u35_sse42_f64\n#define xexp2f_u35 nsimd_sleef_exp2_u35_sse42_f32\n#define xexp10 nsimd_sleef_exp10_u10_sse42_f64\n#define xexp10f nsimd_sleef_exp10_u10_sse42_f32\n#define xexp10_u35 nsimd_sleef_exp10_u35_sse42_f64\n#define xexp10f_u35 nsimd_sleef_exp10_u35_sse42_f32\n#define xexpm1 nsimd_sleef_expm1_u10_sse42_f64\n#define xexpm1f nsimd_sleef_expm1_u10_sse42_f32\n#define xlog10 nsimd_sleef_log10_u10_sse42_f64\n#define xlog10f nsimd_sleef_log10_u10_sse42_f32\n#define xlog2 nsimd_sleef_log2_u10_sse42_f64\n#define xlog2f 
nsimd_sleef_log2_u10_sse42_f32\n#define xlog2_u35 nsimd_sleef_log2_u35_sse42_f64\n#define xlog2f_u35 nsimd_sleef_log2_u35_sse42_f32\n#define xlog1p nsimd_sleef_log1p_u10_sse42_f64\n#define xlog1pf nsimd_sleef_log1p_u10_sse42_f32\n#define xsincospi_u05 nsimd_sleef_sincospi_u05_sse42_f64\n#define xsincospif_u05 nsimd_sleef_sincospi_u05_sse42_f32\n#define xsincospi_u35 nsimd_sleef_sincospi_u35_sse42_f64\n#define xsincospif_u35 nsimd_sleef_sincospi_u35_sse42_f32\n#define xsinpi_u05 nsimd_sleef_sinpi_u05_sse42_f64\n#define xsinpif_u05 nsimd_sleef_sinpi_u05_sse42_f32\n#define xcospi_u05 nsimd_sleef_cospi_u05_sse42_f64\n#define xcospif_u05 nsimd_sleef_cospi_u05_sse42_f32\n#define xldexp nsimd_sleef_ldexp_sse42_f64\n#define xldexpf nsimd_sleef_ldexp_sse42_f32\n#define xilogb nsimd_sleef_ilogb_sse42_f64\n#define xilogbf nsimd_sleef_ilogb_sse42_f32\n#define xfma nsimd_sleef_fma_sse42_f64\n#define xfmaf nsimd_sleef_fma_sse42_f32\n#define xsqrt nsimd_sleef_sqrt_sse42_f64\n#define xsqrtf nsimd_sleef_sqrt_sse42_f32\n#define xsqrt_u05 nsimd_sleef_sqrt_u05_sse42_f64\n#define xsqrtf_u05 nsimd_sleef_sqrt_u05_sse42_f32\n#define xsqrt_u35 nsimd_sleef_sqrt_u35_sse42_f64\n#define xsqrtf_u35 nsimd_sleef_sqrt_u35_sse42_f32\n#define xhypot_u05 nsimd_sleef_hypot_u05_sse42_f64\n#define xhypotf_u05 nsimd_sleef_hypot_u05_sse42_f32\n#define xhypot_u35 nsimd_sleef_hypot_u35_sse42_f64\n#define xhypotf_u35 nsimd_sleef_hypot_u35_sse42_f32\n#define xfabs nsimd_sleef_fabs_sse42_f64\n#define xfabsf nsimd_sleef_fabs_sse42_f32\n#define xcopysign nsimd_sleef_copysign_sse42_f64\n#define xcopysignf nsimd_sleef_copysign_sse42_f32\n#define xfmax nsimd_sleef_fmax_sse42_f64\n#define xfmaxf nsimd_sleef_fmax_sse42_f32\n#define xfmin nsimd_sleef_fmin_sse42_f64\n#define xfminf nsimd_sleef_fmin_sse42_f32\n#define xfdim nsimd_sleef_fdim_sse42_f64\n#define xfdimf nsimd_sleef_fdim_sse42_f32\n#define xtrunc nsimd_sleef_trunc_sse42_f64\n#define xtruncf nsimd_sleef_trunc_sse42_f32\n#define xfloor 
nsimd_sleef_floor_sse42_f64\n#define xfloorf nsimd_sleef_floor_sse42_f32\n#define xceil nsimd_sleef_ceil_sse42_f64\n#define xceilf nsimd_sleef_ceil_sse42_f32\n#define xround nsimd_sleef_round_sse42_f64\n#define xroundf nsimd_sleef_round_sse42_f32\n#define xrint nsimd_sleef_rint_sse42_f64\n#define xrintf nsimd_sleef_rint_sse42_f32\n#define xnextafter nsimd_sleef_nextafter_sse42_f64\n#define xnextafterf nsimd_sleef_nextafter_sse42_f32\n#define xfrfrexp nsimd_sleef_frfrexp_sse42_f64\n#define xfrfrexpf nsimd_sleef_frfrexp_sse42_f32\n#define xexpfrexp nsimd_sleef_expfrexp_sse42_f64\n#define xexpfrexpf nsimd_sleef_expfrexp_sse42_f32\n#define xfmod nsimd_sleef_fmod_sse42_f64\n#define xfmodf nsimd_sleef_fmod_sse42_f32\n#define xremainder nsimd_sleef_remainder_sse42_f64\n#define xremainderf nsimd_sleef_remainder_sse42_f32\n#define xmodf nsimd_sleef_modf_sse42_f64\n#define xmodff nsimd_sleef_modf_sse42_f32\n#define xlgamma_u1 nsimd_sleef_lgamma_u10_sse42_f64\n#define xlgammaf_u1 nsimd_sleef_lgamma_u10_sse42_f32\n#define xtgamma_u1 nsimd_sleef_tgamma_u10_sse42_f64\n#define xtgammaf_u1 nsimd_sleef_tgamma_u10_sse42_f32\n#define xerf_u1 nsimd_sleef_erf_u10_sse42_f64\n#define xerff_u1 nsimd_sleef_erf_u10_sse42_f32\n#define xerfc_u15 nsimd_sleef_erfc_u15_sse42_f64\n#define xerfcf_u15 nsimd_sleef_erfc_u15_sse42_f32\n#define xgetInt nsimd_sleef_getInt_sse42_f64\n#define xgetIntf nsimd_sleef_getInt_sse42_f32\n#define xgetPtr nsimd_sleef_getPtr_sse42_f64\n#define xgetPtrf nsimd_sleef_getPtr_sse42_f32\n\n                   #endif\n\n                   #define rempi nsimd_sleef_rempi_sse42\n                   #define rempif nsimd_sleef_rempif_sse42\n                   #define rempisub nsimd_sleef_rempisub_sse42\n                   #define rempisubf nsimd_sleef_rempisubf_sse42\n                   #define gammak nsimd_gammak_sse42\n                   #define gammafk nsimd_gammafk_sse42\n\n                   #endif\n\n                   \n\n#endif\n\n"
  },
  {
    "path": "src/renamesve.h",
    "content": "#ifndef RENAMESVE_H\n               #define RENAMESVE_H\n\n               /* ------------------------------------------------------------------------- */\n                   /* Naming of functions sve128 */\n\n                   #ifdef NSIMD_SVE128\n\n                   #ifdef DETERMINISTIC\n\n                   #define xsin nsimd_sleef_sin_u35d_sve128_f64\n#define xsinf nsimd_sleef_sin_u35d_sve128_f32\n#define xcos nsimd_sleef_cos_u35d_sve128_f64\n#define xcosf nsimd_sleef_cos_u35d_sve128_f32\n#define xsincos nsimd_sleef_sincos_u35d_sve128_f64\n#define xsincosf nsimd_sleef_sincos_u35d_sve128_f32\n#define xtan nsimd_sleef_tan_u35d_sve128_f64\n#define xtanf nsimd_sleef_tan_u35d_sve128_f32\n#define xasin nsimd_sleef_asin_u35d_sve128_f64\n#define xasinf nsimd_sleef_asin_u35d_sve128_f32\n#define xacos nsimd_sleef_acos_u35d_sve128_f64\n#define xacosf nsimd_sleef_acos_u35d_sve128_f32\n#define xatan nsimd_sleef_atan_u35d_sve128_f64\n#define xatanf nsimd_sleef_atan_u35d_sve128_f32\n#define xatan2 nsimd_sleef_atan2_u35d_sve128_f64\n#define xatan2f nsimd_sleef_atan2_u35d_sve128_f32\n#define xlog nsimd_sleef_log_u35d_sve128_f64\n#define xlogf nsimd_sleef_log_u35d_sve128_f32\n#define xcbrt nsimd_sleef_cbrt_u35d_sve128_f64\n#define xcbrtf nsimd_sleef_cbrt_u35d_sve128_f32\n#define xsin_u1 nsimd_sleef_sin_u10d_sve128_f64\n#define xsinf_u1 nsimd_sleef_sin_u10d_sve128_f32\n#define xcos_u1 nsimd_sleef_cos_u10d_sve128_f64\n#define xcosf_u1 nsimd_sleef_cos_u10d_sve128_f32\n#define xsincos_u1 nsimd_sleef_sincos_u10d_sve128_f64\n#define xsincosf_u1 nsimd_sleef_sincos_u10d_sve128_f32\n#define xtan_u1 nsimd_sleef_tan_u10d_sve128_f64\n#define xtanf_u1 nsimd_sleef_tan_u10d_sve128_f32\n#define xasin_u1 nsimd_sleef_asin_u10d_sve128_f64\n#define xasinf_u1 nsimd_sleef_asin_u10d_sve128_f32\n#define xacos_u1 nsimd_sleef_acos_u10d_sve128_f64\n#define xacosf_u1 nsimd_sleef_acos_u10d_sve128_f32\n#define xatan_u1 nsimd_sleef_atan_u10d_sve128_f64\n#define xatanf_u1 
nsimd_sleef_atan_u10d_sve128_f32\n#define xatan2_u1 nsimd_sleef_atan2_u10d_sve128_f64\n#define xatan2f_u1 nsimd_sleef_atan2_u10d_sve128_f32\n#define xlog_u1 nsimd_sleef_log_u10d_sve128_f64\n#define xlogf_u1 nsimd_sleef_log_u10d_sve128_f32\n#define xcbrt_u1 nsimd_sleef_cbrt_u10d_sve128_f64\n#define xcbrtf_u1 nsimd_sleef_cbrt_u10d_sve128_f32\n#define xexp nsimd_sleef_exp_u10d_sve128_f64\n#define xexpf nsimd_sleef_exp_u10d_sve128_f32\n#define xpow nsimd_sleef_pow_u10d_sve128_f64\n#define xpowf nsimd_sleef_pow_u10d_sve128_f32\n#define xsinh nsimd_sleef_sinh_u10d_sve128_f64\n#define xsinhf nsimd_sleef_sinh_u10d_sve128_f32\n#define xcosh nsimd_sleef_cosh_u10d_sve128_f64\n#define xcoshf nsimd_sleef_cosh_u10d_sve128_f32\n#define xtanh nsimd_sleef_tanh_u10d_sve128_f64\n#define xtanhf nsimd_sleef_tanh_u10d_sve128_f32\n#define xsinh_u35 nsimd_sleef_sinh_u35d_sve128_f64\n#define xsinhf_u35 nsimd_sleef_sinh_u35d_sve128_f32\n#define xcosh_u35 nsimd_sleef_cosh_u35d_sve128_f64\n#define xcoshf_u35 nsimd_sleef_cosh_u35d_sve128_f32\n#define xtanh_u35 nsimd_sleef_tanh_u35d_sve128_f64\n#define xtanhf_u35 nsimd_sleef_tanh_u35d_sve128_f32\n#define xfastsin_u3500 nsimd_sleef_fastsin_u3500d_sve128_f64\n#define xfastsinf_u3500 nsimd_sleef_fastsin_u3500d_sve128_f32\n#define xfastcos_u3500 nsimd_sleef_fastcos_u3500d_sve128_f64\n#define xfastcosf_u3500 nsimd_sleef_fastcos_u3500d_sve128_f32\n#define xfastpow_u3500 nsimd_sleef_fastpow_u3500d_sve128_f64\n#define xfastpowf_u3500 nsimd_sleef_fastpow_u3500d_sve128_f32\n#define xasinh nsimd_sleef_asinh_u10d_sve128_f64\n#define xasinhf nsimd_sleef_asinh_u10d_sve128_f32\n#define xacosh nsimd_sleef_acosh_u10d_sve128_f64\n#define xacoshf nsimd_sleef_acosh_u10d_sve128_f32\n#define xatanh nsimd_sleef_atanh_u10d_sve128_f64\n#define xatanhf nsimd_sleef_atanh_u10d_sve128_f32\n#define xexp2 nsimd_sleef_exp2_u10d_sve128_f64\n#define xexp2f nsimd_sleef_exp2_u10d_sve128_f32\n#define xexp2_u35 nsimd_sleef_exp2_u35d_sve128_f64\n#define xexp2f_u35 
nsimd_sleef_exp2_u35d_sve128_f32\n#define xexp10 nsimd_sleef_exp10_u10d_sve128_f64\n#define xexp10f nsimd_sleef_exp10_u10d_sve128_f32\n#define xexp10_u35 nsimd_sleef_exp10_u35d_sve128_f64\n#define xexp10f_u35 nsimd_sleef_exp10_u35d_sve128_f32\n#define xexpm1 nsimd_sleef_expm1_u10d_sve128_f64\n#define xexpm1f nsimd_sleef_expm1_u10d_sve128_f32\n#define xlog10 nsimd_sleef_log10_u10d_sve128_f64\n#define xlog10f nsimd_sleef_log10_u10d_sve128_f32\n#define xlog2 nsimd_sleef_log2_u10d_sve128_f64\n#define xlog2f nsimd_sleef_log2_u10d_sve128_f32\n#define xlog2_u35 nsimd_sleef_log2_u35d_sve128_f64\n#define xlog2f_u35 nsimd_sleef_log2_u35d_sve128_f32\n#define xlog1p nsimd_sleef_log1p_u10d_sve128_f64\n#define xlog1pf nsimd_sleef_log1p_u10d_sve128_f32\n#define xsincospi_u05 nsimd_sleef_sincospi_u05d_sve128_f64\n#define xsincospif_u05 nsimd_sleef_sincospi_u05d_sve128_f32\n#define xsincospi_u35 nsimd_sleef_sincospi_u35d_sve128_f64\n#define xsincospif_u35 nsimd_sleef_sincospi_u35d_sve128_f32\n#define xsinpi_u05 nsimd_sleef_sinpi_u05d_sve128_f64\n#define xsinpif_u05 nsimd_sleef_sinpi_u05d_sve128_f32\n#define xcospi_u05 nsimd_sleef_cospi_u05d_sve128_f64\n#define xcospif_u05 nsimd_sleef_cospi_u05d_sve128_f32\n#define xldexp nsimd_sleef_ldexp_sve128_f64\n#define xldexpf nsimd_sleef_ldexp_sve128_f32\n#define xilogb nsimd_sleef_ilogb_sve128_f64\n#define xilogbf nsimd_sleef_ilogb_sve128_f32\n#define xfma nsimd_sleef_fma_sve128_f64\n#define xfmaf nsimd_sleef_fma_sve128_f32\n#define xsqrt nsimd_sleef_sqrt_sve128_f64\n#define xsqrtf nsimd_sleef_sqrt_sve128_f32\n#define xsqrt_u05 nsimd_sleef_sqrt_u05d_sve128_f64\n#define xsqrtf_u05 nsimd_sleef_sqrt_u05d_sve128_f32\n#define xsqrt_u35 nsimd_sleef_sqrt_u35d_sve128_f64\n#define xsqrtf_u35 nsimd_sleef_sqrt_u35d_sve128_f32\n#define xhypot_u05 nsimd_sleef_hypot_u05d_sve128_f64\n#define xhypotf_u05 nsimd_sleef_hypot_u05d_sve128_f32\n#define xhypot_u35 nsimd_sleef_hypot_u35d_sve128_f64\n#define xhypotf_u35 nsimd_sleef_hypot_u35d_sve128_f32\n#define 
xfabs nsimd_sleef_fabs_sve128_f64\n#define xfabsf nsimd_sleef_fabs_sve128_f32\n#define xcopysign nsimd_sleef_copysign_sve128_f64\n#define xcopysignf nsimd_sleef_copysign_sve128_f32\n#define xfmax nsimd_sleef_fmax_sve128_f64\n#define xfmaxf nsimd_sleef_fmax_sve128_f32\n#define xfmin nsimd_sleef_fmin_sve128_f64\n#define xfminf nsimd_sleef_fmin_sve128_f32\n#define xfdim nsimd_sleef_fdim_sve128_f64\n#define xfdimf nsimd_sleef_fdim_sve128_f32\n#define xtrunc nsimd_sleef_trunc_sve128_f64\n#define xtruncf nsimd_sleef_trunc_sve128_f32\n#define xfloor nsimd_sleef_floor_sve128_f64\n#define xfloorf nsimd_sleef_floor_sve128_f32\n#define xceil nsimd_sleef_ceil_sve128_f64\n#define xceilf nsimd_sleef_ceil_sve128_f32\n#define xround nsimd_sleef_round_sve128_f64\n#define xroundf nsimd_sleef_round_sve128_f32\n#define xrint nsimd_sleef_rint_sve128_f64\n#define xrintf nsimd_sleef_rint_sve128_f32\n#define xnextafter nsimd_sleef_nextafter_sve128_f64\n#define xnextafterf nsimd_sleef_nextafter_sve128_f32\n#define xfrfrexp nsimd_sleef_frfrexp_sve128_f64\n#define xfrfrexpf nsimd_sleef_frfrexp_sve128_f32\n#define xexpfrexp nsimd_sleef_expfrexp_sve128_f64\n#define xexpfrexpf nsimd_sleef_expfrexp_sve128_f32\n#define xfmod nsimd_sleef_fmod_sve128_f64\n#define xfmodf nsimd_sleef_fmod_sve128_f32\n#define xremainder nsimd_sleef_remainder_sve128_f64\n#define xremainderf nsimd_sleef_remainder_sve128_f32\n#define xmodf nsimd_sleef_modf_sve128_f64\n#define xmodff nsimd_sleef_modf_sve128_f32\n#define xlgamma_u1 nsimd_sleef_lgamma_u10d_sve128_f64\n#define xlgammaf_u1 nsimd_sleef_lgamma_u10d_sve128_f32\n#define xtgamma_u1 nsimd_sleef_tgamma_u10d_sve128_f64\n#define xtgammaf_u1 nsimd_sleef_tgamma_u10d_sve128_f32\n#define xerf_u1 nsimd_sleef_erf_u10d_sve128_f64\n#define xerff_u1 nsimd_sleef_erf_u10d_sve128_f32\n#define xerfc_u15 nsimd_sleef_erfc_u15d_sve128_f64\n#define xerfcf_u15 nsimd_sleef_erfc_u15d_sve128_f32\n#define xgetInt nsimd_sleef_getInt_sve128_f64\n#define xgetIntf 
nsimd_sleef_getInt_sve128_f32\n#define xgetPtr nsimd_sleef_getPtr_sve128_f64\n#define xgetPtrf nsimd_sleef_getPtr_sve128_f32\n\n                   #else\n\n                   #define xsin nsimd_sleef_sin_u35_sve128_f64\n#define xsinf nsimd_sleef_sin_u35_sve128_f32\n#define xcos nsimd_sleef_cos_u35_sve128_f64\n#define xcosf nsimd_sleef_cos_u35_sve128_f32\n#define xsincos nsimd_sleef_sincos_u35_sve128_f64\n#define xsincosf nsimd_sleef_sincos_u35_sve128_f32\n#define xtan nsimd_sleef_tan_u35_sve128_f64\n#define xtanf nsimd_sleef_tan_u35_sve128_f32\n#define xasin nsimd_sleef_asin_u35_sve128_f64\n#define xasinf nsimd_sleef_asin_u35_sve128_f32\n#define xacos nsimd_sleef_acos_u35_sve128_f64\n#define xacosf nsimd_sleef_acos_u35_sve128_f32\n#define xatan nsimd_sleef_atan_u35_sve128_f64\n#define xatanf nsimd_sleef_atan_u35_sve128_f32\n#define xatan2 nsimd_sleef_atan2_u35_sve128_f64\n#define xatan2f nsimd_sleef_atan2_u35_sve128_f32\n#define xlog nsimd_sleef_log_u35_sve128_f64\n#define xlogf nsimd_sleef_log_u35_sve128_f32\n#define xcbrt nsimd_sleef_cbrt_u35_sve128_f64\n#define xcbrtf nsimd_sleef_cbrt_u35_sve128_f32\n#define xsin_u1 nsimd_sleef_sin_u10_sve128_f64\n#define xsinf_u1 nsimd_sleef_sin_u10_sve128_f32\n#define xcos_u1 nsimd_sleef_cos_u10_sve128_f64\n#define xcosf_u1 nsimd_sleef_cos_u10_sve128_f32\n#define xsincos_u1 nsimd_sleef_sincos_u10_sve128_f64\n#define xsincosf_u1 nsimd_sleef_sincos_u10_sve128_f32\n#define xtan_u1 nsimd_sleef_tan_u10_sve128_f64\n#define xtanf_u1 nsimd_sleef_tan_u10_sve128_f32\n#define xasin_u1 nsimd_sleef_asin_u10_sve128_f64\n#define xasinf_u1 nsimd_sleef_asin_u10_sve128_f32\n#define xacos_u1 nsimd_sleef_acos_u10_sve128_f64\n#define xacosf_u1 nsimd_sleef_acos_u10_sve128_f32\n#define xatan_u1 nsimd_sleef_atan_u10_sve128_f64\n#define xatanf_u1 nsimd_sleef_atan_u10_sve128_f32\n#define xatan2_u1 nsimd_sleef_atan2_u10_sve128_f64\n#define xatan2f_u1 nsimd_sleef_atan2_u10_sve128_f32\n#define xlog_u1 nsimd_sleef_log_u10_sve128_f64\n#define xlogf_u1 
nsimd_sleef_log_u10_sve128_f32\n#define xcbrt_u1 nsimd_sleef_cbrt_u10_sve128_f64\n#define xcbrtf_u1 nsimd_sleef_cbrt_u10_sve128_f32\n#define xexp nsimd_sleef_exp_u10_sve128_f64\n#define xexpf nsimd_sleef_exp_u10_sve128_f32\n#define xpow nsimd_sleef_pow_u10_sve128_f64\n#define xpowf nsimd_sleef_pow_u10_sve128_f32\n#define xsinh nsimd_sleef_sinh_u10_sve128_f64\n#define xsinhf nsimd_sleef_sinh_u10_sve128_f32\n#define xcosh nsimd_sleef_cosh_u10_sve128_f64\n#define xcoshf nsimd_sleef_cosh_u10_sve128_f32\n#define xtanh nsimd_sleef_tanh_u10_sve128_f64\n#define xtanhf nsimd_sleef_tanh_u10_sve128_f32\n#define xsinh_u35 nsimd_sleef_sinh_u35_sve128_f64\n#define xsinhf_u35 nsimd_sleef_sinh_u35_sve128_f32\n#define xcosh_u35 nsimd_sleef_cosh_u35_sve128_f64\n#define xcoshf_u35 nsimd_sleef_cosh_u35_sve128_f32\n#define xtanh_u35 nsimd_sleef_tanh_u35_sve128_f64\n#define xtanhf_u35 nsimd_sleef_tanh_u35_sve128_f32\n#define xfastsin_u3500 nsimd_sleef_fastsin_u3500_sve128_f64\n#define xfastsinf_u3500 nsimd_sleef_fastsin_u3500_sve128_f32\n#define xfastcos_u3500 nsimd_sleef_fastcos_u3500_sve128_f64\n#define xfastcosf_u3500 nsimd_sleef_fastcos_u3500_sve128_f32\n#define xfastpow_u3500 nsimd_sleef_fastpow_u3500_sve128_f64\n#define xfastpowf_u3500 nsimd_sleef_fastpow_u3500_sve128_f32\n#define xasinh nsimd_sleef_asinh_u10_sve128_f64\n#define xasinhf nsimd_sleef_asinh_u10_sve128_f32\n#define xacosh nsimd_sleef_acosh_u10_sve128_f64\n#define xacoshf nsimd_sleef_acosh_u10_sve128_f32\n#define xatanh nsimd_sleef_atanh_u10_sve128_f64\n#define xatanhf nsimd_sleef_atanh_u10_sve128_f32\n#define xexp2 nsimd_sleef_exp2_u10_sve128_f64\n#define xexp2f nsimd_sleef_exp2_u10_sve128_f32\n#define xexp2_u35 nsimd_sleef_exp2_u35_sve128_f64\n#define xexp2f_u35 nsimd_sleef_exp2_u35_sve128_f32\n#define xexp10 nsimd_sleef_exp10_u10_sve128_f64\n#define xexp10f nsimd_sleef_exp10_u10_sve128_f32\n#define xexp10_u35 nsimd_sleef_exp10_u35_sve128_f64\n#define xexp10f_u35 nsimd_sleef_exp10_u35_sve128_f32\n#define xexpm1 
nsimd_sleef_expm1_u10_sve128_f64\n#define xexpm1f nsimd_sleef_expm1_u10_sve128_f32\n#define xlog10 nsimd_sleef_log10_u10_sve128_f64\n#define xlog10f nsimd_sleef_log10_u10_sve128_f32\n#define xlog2 nsimd_sleef_log2_u10_sve128_f64\n#define xlog2f nsimd_sleef_log2_u10_sve128_f32\n#define xlog2_u35 nsimd_sleef_log2_u35_sve128_f64\n#define xlog2f_u35 nsimd_sleef_log2_u35_sve128_f32\n#define xlog1p nsimd_sleef_log1p_u10_sve128_f64\n#define xlog1pf nsimd_sleef_log1p_u10_sve128_f32\n#define xsincospi_u05 nsimd_sleef_sincospi_u05_sve128_f64\n#define xsincospif_u05 nsimd_sleef_sincospi_u05_sve128_f32\n#define xsincospi_u35 nsimd_sleef_sincospi_u35_sve128_f64\n#define xsincospif_u35 nsimd_sleef_sincospi_u35_sve128_f32\n#define xsinpi_u05 nsimd_sleef_sinpi_u05_sve128_f64\n#define xsinpif_u05 nsimd_sleef_sinpi_u05_sve128_f32\n#define xcospi_u05 nsimd_sleef_cospi_u05_sve128_f64\n#define xcospif_u05 nsimd_sleef_cospi_u05_sve128_f32\n#define xldexp nsimd_sleef_ldexp_sve128_f64\n#define xldexpf nsimd_sleef_ldexp_sve128_f32\n#define xilogb nsimd_sleef_ilogb_sve128_f64\n#define xilogbf nsimd_sleef_ilogb_sve128_f32\n#define xfma nsimd_sleef_fma_sve128_f64\n#define xfmaf nsimd_sleef_fma_sve128_f32\n#define xsqrt nsimd_sleef_sqrt_sve128_f64\n#define xsqrtf nsimd_sleef_sqrt_sve128_f32\n#define xsqrt_u05 nsimd_sleef_sqrt_u05_sve128_f64\n#define xsqrtf_u05 nsimd_sleef_sqrt_u05_sve128_f32\n#define xsqrt_u35 nsimd_sleef_sqrt_u35_sve128_f64\n#define xsqrtf_u35 nsimd_sleef_sqrt_u35_sve128_f32\n#define xhypot_u05 nsimd_sleef_hypot_u05_sve128_f64\n#define xhypotf_u05 nsimd_sleef_hypot_u05_sve128_f32\n#define xhypot_u35 nsimd_sleef_hypot_u35_sve128_f64\n#define xhypotf_u35 nsimd_sleef_hypot_u35_sve128_f32\n#define xfabs nsimd_sleef_fabs_sve128_f64\n#define xfabsf nsimd_sleef_fabs_sve128_f32\n#define xcopysign nsimd_sleef_copysign_sve128_f64\n#define xcopysignf nsimd_sleef_copysign_sve128_f32\n#define xfmax nsimd_sleef_fmax_sve128_f64\n#define xfmaxf nsimd_sleef_fmax_sve128_f32\n#define xfmin 
nsimd_sleef_fmin_sve128_f64\n#define xfminf nsimd_sleef_fmin_sve128_f32\n#define xfdim nsimd_sleef_fdim_sve128_f64\n#define xfdimf nsimd_sleef_fdim_sve128_f32\n#define xtrunc nsimd_sleef_trunc_sve128_f64\n#define xtruncf nsimd_sleef_trunc_sve128_f32\n#define xfloor nsimd_sleef_floor_sve128_f64\n#define xfloorf nsimd_sleef_floor_sve128_f32\n#define xceil nsimd_sleef_ceil_sve128_f64\n#define xceilf nsimd_sleef_ceil_sve128_f32\n#define xround nsimd_sleef_round_sve128_f64\n#define xroundf nsimd_sleef_round_sve128_f32\n#define xrint nsimd_sleef_rint_sve128_f64\n#define xrintf nsimd_sleef_rint_sve128_f32\n#define xnextafter nsimd_sleef_nextafter_sve128_f64\n#define xnextafterf nsimd_sleef_nextafter_sve128_f32\n#define xfrfrexp nsimd_sleef_frfrexp_sve128_f64\n#define xfrfrexpf nsimd_sleef_frfrexp_sve128_f32\n#define xexpfrexp nsimd_sleef_expfrexp_sve128_f64\n#define xexpfrexpf nsimd_sleef_expfrexp_sve128_f32\n#define xfmod nsimd_sleef_fmod_sve128_f64\n#define xfmodf nsimd_sleef_fmod_sve128_f32\n#define xremainder nsimd_sleef_remainder_sve128_f64\n#define xremainderf nsimd_sleef_remainder_sve128_f32\n#define xmodf nsimd_sleef_modf_sve128_f64\n#define xmodff nsimd_sleef_modf_sve128_f32\n#define xlgamma_u1 nsimd_sleef_lgamma_u10_sve128_f64\n#define xlgammaf_u1 nsimd_sleef_lgamma_u10_sve128_f32\n#define xtgamma_u1 nsimd_sleef_tgamma_u10_sve128_f64\n#define xtgammaf_u1 nsimd_sleef_tgamma_u10_sve128_f32\n#define xerf_u1 nsimd_sleef_erf_u10_sve128_f64\n#define xerff_u1 nsimd_sleef_erf_u10_sve128_f32\n#define xerfc_u15 nsimd_sleef_erfc_u15_sve128_f64\n#define xerfcf_u15 nsimd_sleef_erfc_u15_sve128_f32\n#define xgetInt nsimd_sleef_getInt_sve128_f64\n#define xgetIntf nsimd_sleef_getInt_sve128_f32\n#define xgetPtr nsimd_sleef_getPtr_sve128_f64\n#define xgetPtrf nsimd_sleef_getPtr_sve128_f32\n\n                   #endif\n\n                   #define rempi nsimd_sleef_rempi_sve128\n                   #define rempif nsimd_sleef_rempif_sve128\n                   #define rempisub 
nsimd_sleef_rempisub_sve128\n                   #define rempisubf nsimd_sleef_rempisubf_sve128\n                   #define gammak nsimd_gammak_sve128\n                   #define gammafk nsimd_gammafk_sve128\n\n                   #endif\n\n                   /* ------------------------------------------------------------------------- */\n                   /* Naming of functions sve256 */\n\n                   #ifdef NSIMD_SVE256\n\n                   #ifdef DETERMINISTIC\n\n                   #define xsin nsimd_sleef_sin_u35d_sve256_f64\n#define xsinf nsimd_sleef_sin_u35d_sve256_f32\n#define xcos nsimd_sleef_cos_u35d_sve256_f64\n#define xcosf nsimd_sleef_cos_u35d_sve256_f32\n#define xsincos nsimd_sleef_sincos_u35d_sve256_f64\n#define xsincosf nsimd_sleef_sincos_u35d_sve256_f32\n#define xtan nsimd_sleef_tan_u35d_sve256_f64\n#define xtanf nsimd_sleef_tan_u35d_sve256_f32\n#define xasin nsimd_sleef_asin_u35d_sve256_f64\n#define xasinf nsimd_sleef_asin_u35d_sve256_f32\n#define xacos nsimd_sleef_acos_u35d_sve256_f64\n#define xacosf nsimd_sleef_acos_u35d_sve256_f32\n#define xatan nsimd_sleef_atan_u35d_sve256_f64\n#define xatanf nsimd_sleef_atan_u35d_sve256_f32\n#define xatan2 nsimd_sleef_atan2_u35d_sve256_f64\n#define xatan2f nsimd_sleef_atan2_u35d_sve256_f32\n#define xlog nsimd_sleef_log_u35d_sve256_f64\n#define xlogf nsimd_sleef_log_u35d_sve256_f32\n#define xcbrt nsimd_sleef_cbrt_u35d_sve256_f64\n#define xcbrtf nsimd_sleef_cbrt_u35d_sve256_f32\n#define xsin_u1 nsimd_sleef_sin_u10d_sve256_f64\n#define xsinf_u1 nsimd_sleef_sin_u10d_sve256_f32\n#define xcos_u1 nsimd_sleef_cos_u10d_sve256_f64\n#define xcosf_u1 nsimd_sleef_cos_u10d_sve256_f32\n#define xsincos_u1 nsimd_sleef_sincos_u10d_sve256_f64\n#define xsincosf_u1 nsimd_sleef_sincos_u10d_sve256_f32\n#define xtan_u1 nsimd_sleef_tan_u10d_sve256_f64\n#define xtanf_u1 nsimd_sleef_tan_u10d_sve256_f32\n#define xasin_u1 nsimd_sleef_asin_u10d_sve256_f64\n#define xasinf_u1 nsimd_sleef_asin_u10d_sve256_f32\n#define xacos_u1 
nsimd_sleef_acos_u10d_sve256_f64\n#define xacosf_u1 nsimd_sleef_acos_u10d_sve256_f32\n#define xatan_u1 nsimd_sleef_atan_u10d_sve256_f64\n#define xatanf_u1 nsimd_sleef_atan_u10d_sve256_f32\n#define xatan2_u1 nsimd_sleef_atan2_u10d_sve256_f64\n#define xatan2f_u1 nsimd_sleef_atan2_u10d_sve256_f32\n#define xlog_u1 nsimd_sleef_log_u10d_sve256_f64\n#define xlogf_u1 nsimd_sleef_log_u10d_sve256_f32\n#define xcbrt_u1 nsimd_sleef_cbrt_u10d_sve256_f64\n#define xcbrtf_u1 nsimd_sleef_cbrt_u10d_sve256_f32\n#define xexp nsimd_sleef_exp_u10d_sve256_f64\n#define xexpf nsimd_sleef_exp_u10d_sve256_f32\n#define xpow nsimd_sleef_pow_u10d_sve256_f64\n#define xpowf nsimd_sleef_pow_u10d_sve256_f32\n#define xsinh nsimd_sleef_sinh_u10d_sve256_f64\n#define xsinhf nsimd_sleef_sinh_u10d_sve256_f32\n#define xcosh nsimd_sleef_cosh_u10d_sve256_f64\n#define xcoshf nsimd_sleef_cosh_u10d_sve256_f32\n#define xtanh nsimd_sleef_tanh_u10d_sve256_f64\n#define xtanhf nsimd_sleef_tanh_u10d_sve256_f32\n#define xsinh_u35 nsimd_sleef_sinh_u35d_sve256_f64\n#define xsinhf_u35 nsimd_sleef_sinh_u35d_sve256_f32\n#define xcosh_u35 nsimd_sleef_cosh_u35d_sve256_f64\n#define xcoshf_u35 nsimd_sleef_cosh_u35d_sve256_f32\n#define xtanh_u35 nsimd_sleef_tanh_u35d_sve256_f64\n#define xtanhf_u35 nsimd_sleef_tanh_u35d_sve256_f32\n#define xfastsin_u3500 nsimd_sleef_fastsin_u3500d_sve256_f64\n#define xfastsinf_u3500 nsimd_sleef_fastsin_u3500d_sve256_f32\n#define xfastcos_u3500 nsimd_sleef_fastcos_u3500d_sve256_f64\n#define xfastcosf_u3500 nsimd_sleef_fastcos_u3500d_sve256_f32\n#define xfastpow_u3500 nsimd_sleef_fastpow_u3500d_sve256_f64\n#define xfastpowf_u3500 nsimd_sleef_fastpow_u3500d_sve256_f32\n#define xasinh nsimd_sleef_asinh_u10d_sve256_f64\n#define xasinhf nsimd_sleef_asinh_u10d_sve256_f32\n#define xacosh nsimd_sleef_acosh_u10d_sve256_f64\n#define xacoshf nsimd_sleef_acosh_u10d_sve256_f32\n#define xatanh nsimd_sleef_atanh_u10d_sve256_f64\n#define xatanhf nsimd_sleef_atanh_u10d_sve256_f32\n#define xexp2 
nsimd_sleef_exp2_u10d_sve256_f64\n#define xexp2f nsimd_sleef_exp2_u10d_sve256_f32\n#define xexp2_u35 nsimd_sleef_exp2_u35d_sve256_f64\n#define xexp2f_u35 nsimd_sleef_exp2_u35d_sve256_f32\n#define xexp10 nsimd_sleef_exp10_u10d_sve256_f64\n#define xexp10f nsimd_sleef_exp10_u10d_sve256_f32\n#define xexp10_u35 nsimd_sleef_exp10_u35d_sve256_f64\n#define xexp10f_u35 nsimd_sleef_exp10_u35d_sve256_f32\n#define xexpm1 nsimd_sleef_expm1_u10d_sve256_f64\n#define xexpm1f nsimd_sleef_expm1_u10d_sve256_f32\n#define xlog10 nsimd_sleef_log10_u10d_sve256_f64\n#define xlog10f nsimd_sleef_log10_u10d_sve256_f32\n#define xlog2 nsimd_sleef_log2_u10d_sve256_f64\n#define xlog2f nsimd_sleef_log2_u10d_sve256_f32\n#define xlog2_u35 nsimd_sleef_log2_u35d_sve256_f64\n#define xlog2f_u35 nsimd_sleef_log2_u35d_sve256_f32\n#define xlog1p nsimd_sleef_log1p_u10d_sve256_f64\n#define xlog1pf nsimd_sleef_log1p_u10d_sve256_f32\n#define xsincospi_u05 nsimd_sleef_sincospi_u05d_sve256_f64\n#define xsincospif_u05 nsimd_sleef_sincospi_u05d_sve256_f32\n#define xsincospi_u35 nsimd_sleef_sincospi_u35d_sve256_f64\n#define xsincospif_u35 nsimd_sleef_sincospi_u35d_sve256_f32\n#define xsinpi_u05 nsimd_sleef_sinpi_u05d_sve256_f64\n#define xsinpif_u05 nsimd_sleef_sinpi_u05d_sve256_f32\n#define xcospi_u05 nsimd_sleef_cospi_u05d_sve256_f64\n#define xcospif_u05 nsimd_sleef_cospi_u05d_sve256_f32\n#define xldexp nsimd_sleef_ldexp_sve256_f64\n#define xldexpf nsimd_sleef_ldexp_sve256_f32\n#define xilogb nsimd_sleef_ilogb_sve256_f64\n#define xilogbf nsimd_sleef_ilogb_sve256_f32\n#define xfma nsimd_sleef_fma_sve256_f64\n#define xfmaf nsimd_sleef_fma_sve256_f32\n#define xsqrt nsimd_sleef_sqrt_sve256_f64\n#define xsqrtf nsimd_sleef_sqrt_sve256_f32\n#define xsqrt_u05 nsimd_sleef_sqrt_u05d_sve256_f64\n#define xsqrtf_u05 nsimd_sleef_sqrt_u05d_sve256_f32\n#define xsqrt_u35 nsimd_sleef_sqrt_u35d_sve256_f64\n#define xsqrtf_u35 nsimd_sleef_sqrt_u35d_sve256_f32\n#define xhypot_u05 nsimd_sleef_hypot_u05d_sve256_f64\n#define xhypotf_u05 
nsimd_sleef_hypot_u05d_sve256_f32\n#define xhypot_u35 nsimd_sleef_hypot_u35d_sve256_f64\n#define xhypotf_u35 nsimd_sleef_hypot_u35d_sve256_f32\n#define xfabs nsimd_sleef_fabs_sve256_f64\n#define xfabsf nsimd_sleef_fabs_sve256_f32\n#define xcopysign nsimd_sleef_copysign_sve256_f64\n#define xcopysignf nsimd_sleef_copysign_sve256_f32\n#define xfmax nsimd_sleef_fmax_sve256_f64\n#define xfmaxf nsimd_sleef_fmax_sve256_f32\n#define xfmin nsimd_sleef_fmin_sve256_f64\n#define xfminf nsimd_sleef_fmin_sve256_f32\n#define xfdim nsimd_sleef_fdim_sve256_f64\n#define xfdimf nsimd_sleef_fdim_sve256_f32\n#define xtrunc nsimd_sleef_trunc_sve256_f64\n#define xtruncf nsimd_sleef_trunc_sve256_f32\n#define xfloor nsimd_sleef_floor_sve256_f64\n#define xfloorf nsimd_sleef_floor_sve256_f32\n#define xceil nsimd_sleef_ceil_sve256_f64\n#define xceilf nsimd_sleef_ceil_sve256_f32\n#define xround nsimd_sleef_round_sve256_f64\n#define xroundf nsimd_sleef_round_sve256_f32\n#define xrint nsimd_sleef_rint_sve256_f64\n#define xrintf nsimd_sleef_rint_sve256_f32\n#define xnextafter nsimd_sleef_nextafter_sve256_f64\n#define xnextafterf nsimd_sleef_nextafter_sve256_f32\n#define xfrfrexp nsimd_sleef_frfrexp_sve256_f64\n#define xfrfrexpf nsimd_sleef_frfrexp_sve256_f32\n#define xexpfrexp nsimd_sleef_expfrexp_sve256_f64\n#define xexpfrexpf nsimd_sleef_expfrexp_sve256_f32\n#define xfmod nsimd_sleef_fmod_sve256_f64\n#define xfmodf nsimd_sleef_fmod_sve256_f32\n#define xremainder nsimd_sleef_remainder_sve256_f64\n#define xremainderf nsimd_sleef_remainder_sve256_f32\n#define xmodf nsimd_sleef_modf_sve256_f64\n#define xmodff nsimd_sleef_modf_sve256_f32\n#define xlgamma_u1 nsimd_sleef_lgamma_u10d_sve256_f64\n#define xlgammaf_u1 nsimd_sleef_lgamma_u10d_sve256_f32\n#define xtgamma_u1 nsimd_sleef_tgamma_u10d_sve256_f64\n#define xtgammaf_u1 nsimd_sleef_tgamma_u10d_sve256_f32\n#define xerf_u1 nsimd_sleef_erf_u10d_sve256_f64\n#define xerff_u1 nsimd_sleef_erf_u10d_sve256_f32\n#define xerfc_u15 
nsimd_sleef_erfc_u15d_sve256_f64\n#define xerfcf_u15 nsimd_sleef_erfc_u15d_sve256_f32\n#define xgetInt nsimd_sleef_getInt_sve256_f64\n#define xgetIntf nsimd_sleef_getInt_sve256_f32\n#define xgetPtr nsimd_sleef_getPtr_sve256_f64\n#define xgetPtrf nsimd_sleef_getPtr_sve256_f32\n\n                   #else\n\n                   #define xsin nsimd_sleef_sin_u35_sve256_f64\n#define xsinf nsimd_sleef_sin_u35_sve256_f32\n#define xcos nsimd_sleef_cos_u35_sve256_f64\n#define xcosf nsimd_sleef_cos_u35_sve256_f32\n#define xsincos nsimd_sleef_sincos_u35_sve256_f64\n#define xsincosf nsimd_sleef_sincos_u35_sve256_f32\n#define xtan nsimd_sleef_tan_u35_sve256_f64\n#define xtanf nsimd_sleef_tan_u35_sve256_f32\n#define xasin nsimd_sleef_asin_u35_sve256_f64\n#define xasinf nsimd_sleef_asin_u35_sve256_f32\n#define xacos nsimd_sleef_acos_u35_sve256_f64\n#define xacosf nsimd_sleef_acos_u35_sve256_f32\n#define xatan nsimd_sleef_atan_u35_sve256_f64\n#define xatanf nsimd_sleef_atan_u35_sve256_f32\n#define xatan2 nsimd_sleef_atan2_u35_sve256_f64\n#define xatan2f nsimd_sleef_atan2_u35_sve256_f32\n#define xlog nsimd_sleef_log_u35_sve256_f64\n#define xlogf nsimd_sleef_log_u35_sve256_f32\n#define xcbrt nsimd_sleef_cbrt_u35_sve256_f64\n#define xcbrtf nsimd_sleef_cbrt_u35_sve256_f32\n#define xsin_u1 nsimd_sleef_sin_u10_sve256_f64\n#define xsinf_u1 nsimd_sleef_sin_u10_sve256_f32\n#define xcos_u1 nsimd_sleef_cos_u10_sve256_f64\n#define xcosf_u1 nsimd_sleef_cos_u10_sve256_f32\n#define xsincos_u1 nsimd_sleef_sincos_u10_sve256_f64\n#define xsincosf_u1 nsimd_sleef_sincos_u10_sve256_f32\n#define xtan_u1 nsimd_sleef_tan_u10_sve256_f64\n#define xtanf_u1 nsimd_sleef_tan_u10_sve256_f32\n#define xasin_u1 nsimd_sleef_asin_u10_sve256_f64\n#define xasinf_u1 nsimd_sleef_asin_u10_sve256_f32\n#define xacos_u1 nsimd_sleef_acos_u10_sve256_f64\n#define xacosf_u1 nsimd_sleef_acos_u10_sve256_f32\n#define xatan_u1 nsimd_sleef_atan_u10_sve256_f64\n#define xatanf_u1 nsimd_sleef_atan_u10_sve256_f32\n#define xatan2_u1 
nsimd_sleef_atan2_u10_sve256_f64\n#define xatan2f_u1 nsimd_sleef_atan2_u10_sve256_f32\n#define xlog_u1 nsimd_sleef_log_u10_sve256_f64\n#define xlogf_u1 nsimd_sleef_log_u10_sve256_f32\n#define xcbrt_u1 nsimd_sleef_cbrt_u10_sve256_f64\n#define xcbrtf_u1 nsimd_sleef_cbrt_u10_sve256_f32\n#define xexp nsimd_sleef_exp_u10_sve256_f64\n#define xexpf nsimd_sleef_exp_u10_sve256_f32\n#define xpow nsimd_sleef_pow_u10_sve256_f64\n#define xpowf nsimd_sleef_pow_u10_sve256_f32\n#define xsinh nsimd_sleef_sinh_u10_sve256_f64\n#define xsinhf nsimd_sleef_sinh_u10_sve256_f32\n#define xcosh nsimd_sleef_cosh_u10_sve256_f64\n#define xcoshf nsimd_sleef_cosh_u10_sve256_f32\n#define xtanh nsimd_sleef_tanh_u10_sve256_f64\n#define xtanhf nsimd_sleef_tanh_u10_sve256_f32\n#define xsinh_u35 nsimd_sleef_sinh_u35_sve256_f64\n#define xsinhf_u35 nsimd_sleef_sinh_u35_sve256_f32\n#define xcosh_u35 nsimd_sleef_cosh_u35_sve256_f64\n#define xcoshf_u35 nsimd_sleef_cosh_u35_sve256_f32\n#define xtanh_u35 nsimd_sleef_tanh_u35_sve256_f64\n#define xtanhf_u35 nsimd_sleef_tanh_u35_sve256_f32\n#define xfastsin_u3500 nsimd_sleef_fastsin_u3500_sve256_f64\n#define xfastsinf_u3500 nsimd_sleef_fastsin_u3500_sve256_f32\n#define xfastcos_u3500 nsimd_sleef_fastcos_u3500_sve256_f64\n#define xfastcosf_u3500 nsimd_sleef_fastcos_u3500_sve256_f32\n#define xfastpow_u3500 nsimd_sleef_fastpow_u3500_sve256_f64\n#define xfastpowf_u3500 nsimd_sleef_fastpow_u3500_sve256_f32\n#define xasinh nsimd_sleef_asinh_u10_sve256_f64\n#define xasinhf nsimd_sleef_asinh_u10_sve256_f32\n#define xacosh nsimd_sleef_acosh_u10_sve256_f64\n#define xacoshf nsimd_sleef_acosh_u10_sve256_f32\n#define xatanh nsimd_sleef_atanh_u10_sve256_f64\n#define xatanhf nsimd_sleef_atanh_u10_sve256_f32\n#define xexp2 nsimd_sleef_exp2_u10_sve256_f64\n#define xexp2f nsimd_sleef_exp2_u10_sve256_f32\n#define xexp2_u35 nsimd_sleef_exp2_u35_sve256_f64\n#define xexp2f_u35 nsimd_sleef_exp2_u35_sve256_f32\n#define xexp10 nsimd_sleef_exp10_u10_sve256_f64\n#define xexp10f 
nsimd_sleef_exp10_u10_sve256_f32\n#define xexp10_u35 nsimd_sleef_exp10_u35_sve256_f64\n#define xexp10f_u35 nsimd_sleef_exp10_u35_sve256_f32\n#define xexpm1 nsimd_sleef_expm1_u10_sve256_f64\n#define xexpm1f nsimd_sleef_expm1_u10_sve256_f32\n#define xlog10 nsimd_sleef_log10_u10_sve256_f64\n#define xlog10f nsimd_sleef_log10_u10_sve256_f32\n#define xlog2 nsimd_sleef_log2_u10_sve256_f64\n#define xlog2f nsimd_sleef_log2_u10_sve256_f32\n#define xlog2_u35 nsimd_sleef_log2_u35_sve256_f64\n#define xlog2f_u35 nsimd_sleef_log2_u35_sve256_f32\n#define xlog1p nsimd_sleef_log1p_u10_sve256_f64\n#define xlog1pf nsimd_sleef_log1p_u10_sve256_f32\n#define xsincospi_u05 nsimd_sleef_sincospi_u05_sve256_f64\n#define xsincospif_u05 nsimd_sleef_sincospi_u05_sve256_f32\n#define xsincospi_u35 nsimd_sleef_sincospi_u35_sve256_f64\n#define xsincospif_u35 nsimd_sleef_sincospi_u35_sve256_f32\n#define xsinpi_u05 nsimd_sleef_sinpi_u05_sve256_f64\n#define xsinpif_u05 nsimd_sleef_sinpi_u05_sve256_f32\n#define xcospi_u05 nsimd_sleef_cospi_u05_sve256_f64\n#define xcospif_u05 nsimd_sleef_cospi_u05_sve256_f32\n#define xldexp nsimd_sleef_ldexp_sve256_f64\n#define xldexpf nsimd_sleef_ldexp_sve256_f32\n#define xilogb nsimd_sleef_ilogb_sve256_f64\n#define xilogbf nsimd_sleef_ilogb_sve256_f32\n#define xfma nsimd_sleef_fma_sve256_f64\n#define xfmaf nsimd_sleef_fma_sve256_f32\n#define xsqrt nsimd_sleef_sqrt_sve256_f64\n#define xsqrtf nsimd_sleef_sqrt_sve256_f32\n#define xsqrt_u05 nsimd_sleef_sqrt_u05_sve256_f64\n#define xsqrtf_u05 nsimd_sleef_sqrt_u05_sve256_f32\n#define xsqrt_u35 nsimd_sleef_sqrt_u35_sve256_f64\n#define xsqrtf_u35 nsimd_sleef_sqrt_u35_sve256_f32\n#define xhypot_u05 nsimd_sleef_hypot_u05_sve256_f64\n#define xhypotf_u05 nsimd_sleef_hypot_u05_sve256_f32\n#define xhypot_u35 nsimd_sleef_hypot_u35_sve256_f64\n#define xhypotf_u35 nsimd_sleef_hypot_u35_sve256_f32\n#define xfabs nsimd_sleef_fabs_sve256_f64\n#define xfabsf nsimd_sleef_fabs_sve256_f32\n#define xcopysign 
nsimd_sleef_copysign_sve256_f64\n#define xcopysignf nsimd_sleef_copysign_sve256_f32\n#define xfmax nsimd_sleef_fmax_sve256_f64\n#define xfmaxf nsimd_sleef_fmax_sve256_f32\n#define xfmin nsimd_sleef_fmin_sve256_f64\n#define xfminf nsimd_sleef_fmin_sve256_f32\n#define xfdim nsimd_sleef_fdim_sve256_f64\n#define xfdimf nsimd_sleef_fdim_sve256_f32\n#define xtrunc nsimd_sleef_trunc_sve256_f64\n#define xtruncf nsimd_sleef_trunc_sve256_f32\n#define xfloor nsimd_sleef_floor_sve256_f64\n#define xfloorf nsimd_sleef_floor_sve256_f32\n#define xceil nsimd_sleef_ceil_sve256_f64\n#define xceilf nsimd_sleef_ceil_sve256_f32\n#define xround nsimd_sleef_round_sve256_f64\n#define xroundf nsimd_sleef_round_sve256_f32\n#define xrint nsimd_sleef_rint_sve256_f64\n#define xrintf nsimd_sleef_rint_sve256_f32\n#define xnextafter nsimd_sleef_nextafter_sve256_f64\n#define xnextafterf nsimd_sleef_nextafter_sve256_f32\n#define xfrfrexp nsimd_sleef_frfrexp_sve256_f64\n#define xfrfrexpf nsimd_sleef_frfrexp_sve256_f32\n#define xexpfrexp nsimd_sleef_expfrexp_sve256_f64\n#define xexpfrexpf nsimd_sleef_expfrexp_sve256_f32\n#define xfmod nsimd_sleef_fmod_sve256_f64\n#define xfmodf nsimd_sleef_fmod_sve256_f32\n#define xremainder nsimd_sleef_remainder_sve256_f64\n#define xremainderf nsimd_sleef_remainder_sve256_f32\n#define xmodf nsimd_sleef_modf_sve256_f64\n#define xmodff nsimd_sleef_modf_sve256_f32\n#define xlgamma_u1 nsimd_sleef_lgamma_u10_sve256_f64\n#define xlgammaf_u1 nsimd_sleef_lgamma_u10_sve256_f32\n#define xtgamma_u1 nsimd_sleef_tgamma_u10_sve256_f64\n#define xtgammaf_u1 nsimd_sleef_tgamma_u10_sve256_f32\n#define xerf_u1 nsimd_sleef_erf_u10_sve256_f64\n#define xerff_u1 nsimd_sleef_erf_u10_sve256_f32\n#define xerfc_u15 nsimd_sleef_erfc_u15_sve256_f64\n#define xerfcf_u15 nsimd_sleef_erfc_u15_sve256_f32\n#define xgetInt nsimd_sleef_getInt_sve256_f64\n#define xgetIntf nsimd_sleef_getInt_sve256_f32\n#define xgetPtr nsimd_sleef_getPtr_sve256_f64\n#define xgetPtrf nsimd_sleef_getPtr_sve256_f32\n\n       
            #endif\n\n                   #define rempi nsimd_sleef_rempi_sve256\n                   #define rempif nsimd_sleef_rempif_sve256\n                   #define rempisub nsimd_sleef_rempisub_sve256\n                   #define rempisubf nsimd_sleef_rempisubf_sve256\n                   #define gammak nsimd_gammak_sve256\n                   #define gammafk nsimd_gammafk_sve256\n\n                   #endif\n\n                   /* ------------------------------------------------------------------------- */\n                   /* Naming of functions sve512 */\n\n                   #ifdef NSIMD_SVE512\n\n                   #ifdef DETERMINISTIC\n\n                   #define xsin nsimd_sleef_sin_u35d_sve512_f64\n#define xsinf nsimd_sleef_sin_u35d_sve512_f32\n#define xcos nsimd_sleef_cos_u35d_sve512_f64\n#define xcosf nsimd_sleef_cos_u35d_sve512_f32\n#define xsincos nsimd_sleef_sincos_u35d_sve512_f64\n#define xsincosf nsimd_sleef_sincos_u35d_sve512_f32\n#define xtan nsimd_sleef_tan_u35d_sve512_f64\n#define xtanf nsimd_sleef_tan_u35d_sve512_f32\n#define xasin nsimd_sleef_asin_u35d_sve512_f64\n#define xasinf nsimd_sleef_asin_u35d_sve512_f32\n#define xacos nsimd_sleef_acos_u35d_sve512_f64\n#define xacosf nsimd_sleef_acos_u35d_sve512_f32\n#define xatan nsimd_sleef_atan_u35d_sve512_f64\n#define xatanf nsimd_sleef_atan_u35d_sve512_f32\n#define xatan2 nsimd_sleef_atan2_u35d_sve512_f64\n#define xatan2f nsimd_sleef_atan2_u35d_sve512_f32\n#define xlog nsimd_sleef_log_u35d_sve512_f64\n#define xlogf nsimd_sleef_log_u35d_sve512_f32\n#define xcbrt nsimd_sleef_cbrt_u35d_sve512_f64\n#define xcbrtf nsimd_sleef_cbrt_u35d_sve512_f32\n#define xsin_u1 nsimd_sleef_sin_u10d_sve512_f64\n#define xsinf_u1 nsimd_sleef_sin_u10d_sve512_f32\n#define xcos_u1 nsimd_sleef_cos_u10d_sve512_f64\n#define xcosf_u1 nsimd_sleef_cos_u10d_sve512_f32\n#define xsincos_u1 nsimd_sleef_sincos_u10d_sve512_f64\n#define xsincosf_u1 nsimd_sleef_sincos_u10d_sve512_f32\n#define xtan_u1 
nsimd_sleef_tan_u10d_sve512_f64\n#define xtanf_u1 nsimd_sleef_tan_u10d_sve512_f32\n#define xasin_u1 nsimd_sleef_asin_u10d_sve512_f64\n#define xasinf_u1 nsimd_sleef_asin_u10d_sve512_f32\n#define xacos_u1 nsimd_sleef_acos_u10d_sve512_f64\n#define xacosf_u1 nsimd_sleef_acos_u10d_sve512_f32\n#define xatan_u1 nsimd_sleef_atan_u10d_sve512_f64\n#define xatanf_u1 nsimd_sleef_atan_u10d_sve512_f32\n#define xatan2_u1 nsimd_sleef_atan2_u10d_sve512_f64\n#define xatan2f_u1 nsimd_sleef_atan2_u10d_sve512_f32\n#define xlog_u1 nsimd_sleef_log_u10d_sve512_f64\n#define xlogf_u1 nsimd_sleef_log_u10d_sve512_f32\n#define xcbrt_u1 nsimd_sleef_cbrt_u10d_sve512_f64\n#define xcbrtf_u1 nsimd_sleef_cbrt_u10d_sve512_f32\n#define xexp nsimd_sleef_exp_u10d_sve512_f64\n#define xexpf nsimd_sleef_exp_u10d_sve512_f32\n#define xpow nsimd_sleef_pow_u10d_sve512_f64\n#define xpowf nsimd_sleef_pow_u10d_sve512_f32\n#define xsinh nsimd_sleef_sinh_u10d_sve512_f64\n#define xsinhf nsimd_sleef_sinh_u10d_sve512_f32\n#define xcosh nsimd_sleef_cosh_u10d_sve512_f64\n#define xcoshf nsimd_sleef_cosh_u10d_sve512_f32\n#define xtanh nsimd_sleef_tanh_u10d_sve512_f64\n#define xtanhf nsimd_sleef_tanh_u10d_sve512_f32\n#define xsinh_u35 nsimd_sleef_sinh_u35d_sve512_f64\n#define xsinhf_u35 nsimd_sleef_sinh_u35d_sve512_f32\n#define xcosh_u35 nsimd_sleef_cosh_u35d_sve512_f64\n#define xcoshf_u35 nsimd_sleef_cosh_u35d_sve512_f32\n#define xtanh_u35 nsimd_sleef_tanh_u35d_sve512_f64\n#define xtanhf_u35 nsimd_sleef_tanh_u35d_sve512_f32\n#define xfastsin_u3500 nsimd_sleef_fastsin_u3500d_sve512_f64\n#define xfastsinf_u3500 nsimd_sleef_fastsin_u3500d_sve512_f32\n#define xfastcos_u3500 nsimd_sleef_fastcos_u3500d_sve512_f64\n#define xfastcosf_u3500 nsimd_sleef_fastcos_u3500d_sve512_f32\n#define xfastpow_u3500 nsimd_sleef_fastpow_u3500d_sve512_f64\n#define xfastpowf_u3500 nsimd_sleef_fastpow_u3500d_sve512_f32\n#define xasinh nsimd_sleef_asinh_u10d_sve512_f64\n#define xasinhf nsimd_sleef_asinh_u10d_sve512_f32\n#define xacosh 
nsimd_sleef_acosh_u10d_sve512_f64\n#define xacoshf nsimd_sleef_acosh_u10d_sve512_f32\n#define xatanh nsimd_sleef_atanh_u10d_sve512_f64\n#define xatanhf nsimd_sleef_atanh_u10d_sve512_f32\n#define xexp2 nsimd_sleef_exp2_u10d_sve512_f64\n#define xexp2f nsimd_sleef_exp2_u10d_sve512_f32\n#define xexp2_u35 nsimd_sleef_exp2_u35d_sve512_f64\n#define xexp2f_u35 nsimd_sleef_exp2_u35d_sve512_f32\n#define xexp10 nsimd_sleef_exp10_u10d_sve512_f64\n#define xexp10f nsimd_sleef_exp10_u10d_sve512_f32\n#define xexp10_u35 nsimd_sleef_exp10_u35d_sve512_f64\n#define xexp10f_u35 nsimd_sleef_exp10_u35d_sve512_f32\n#define xexpm1 nsimd_sleef_expm1_u10d_sve512_f64\n#define xexpm1f nsimd_sleef_expm1_u10d_sve512_f32\n#define xlog10 nsimd_sleef_log10_u10d_sve512_f64\n#define xlog10f nsimd_sleef_log10_u10d_sve512_f32\n#define xlog2 nsimd_sleef_log2_u10d_sve512_f64\n#define xlog2f nsimd_sleef_log2_u10d_sve512_f32\n#define xlog2_u35 nsimd_sleef_log2_u35d_sve512_f64\n#define xlog2f_u35 nsimd_sleef_log2_u35d_sve512_f32\n#define xlog1p nsimd_sleef_log1p_u10d_sve512_f64\n#define xlog1pf nsimd_sleef_log1p_u10d_sve512_f32\n#define xsincospi_u05 nsimd_sleef_sincospi_u05d_sve512_f64\n#define xsincospif_u05 nsimd_sleef_sincospi_u05d_sve512_f32\n#define xsincospi_u35 nsimd_sleef_sincospi_u35d_sve512_f64\n#define xsincospif_u35 nsimd_sleef_sincospi_u35d_sve512_f32\n#define xsinpi_u05 nsimd_sleef_sinpi_u05d_sve512_f64\n#define xsinpif_u05 nsimd_sleef_sinpi_u05d_sve512_f32\n#define xcospi_u05 nsimd_sleef_cospi_u05d_sve512_f64\n#define xcospif_u05 nsimd_sleef_cospi_u05d_sve512_f32\n#define xldexp nsimd_sleef_ldexp_sve512_f64\n#define xldexpf nsimd_sleef_ldexp_sve512_f32\n#define xilogb nsimd_sleef_ilogb_sve512_f64\n#define xilogbf nsimd_sleef_ilogb_sve512_f32\n#define xfma nsimd_sleef_fma_sve512_f64\n#define xfmaf nsimd_sleef_fma_sve512_f32\n#define xsqrt nsimd_sleef_sqrt_sve512_f64\n#define xsqrtf nsimd_sleef_sqrt_sve512_f32\n#define xsqrt_u05 nsimd_sleef_sqrt_u05d_sve512_f64\n#define xsqrtf_u05 
nsimd_sleef_sqrt_u05d_sve512_f32\n#define xsqrt_u35 nsimd_sleef_sqrt_u35d_sve512_f64\n#define xsqrtf_u35 nsimd_sleef_sqrt_u35d_sve512_f32\n#define xhypot_u05 nsimd_sleef_hypot_u05d_sve512_f64\n#define xhypotf_u05 nsimd_sleef_hypot_u05d_sve512_f32\n#define xhypot_u35 nsimd_sleef_hypot_u35d_sve512_f64\n#define xhypotf_u35 nsimd_sleef_hypot_u35d_sve512_f32\n#define xfabs nsimd_sleef_fabs_sve512_f64\n#define xfabsf nsimd_sleef_fabs_sve512_f32\n#define xcopysign nsimd_sleef_copysign_sve512_f64\n#define xcopysignf nsimd_sleef_copysign_sve512_f32\n#define xfmax nsimd_sleef_fmax_sve512_f64\n#define xfmaxf nsimd_sleef_fmax_sve512_f32\n#define xfmin nsimd_sleef_fmin_sve512_f64\n#define xfminf nsimd_sleef_fmin_sve512_f32\n#define xfdim nsimd_sleef_fdim_sve512_f64\n#define xfdimf nsimd_sleef_fdim_sve512_f32\n#define xtrunc nsimd_sleef_trunc_sve512_f64\n#define xtruncf nsimd_sleef_trunc_sve512_f32\n#define xfloor nsimd_sleef_floor_sve512_f64\n#define xfloorf nsimd_sleef_floor_sve512_f32\n#define xceil nsimd_sleef_ceil_sve512_f64\n#define xceilf nsimd_sleef_ceil_sve512_f32\n#define xround nsimd_sleef_round_sve512_f64\n#define xroundf nsimd_sleef_round_sve512_f32\n#define xrint nsimd_sleef_rint_sve512_f64\n#define xrintf nsimd_sleef_rint_sve512_f32\n#define xnextafter nsimd_sleef_nextafter_sve512_f64\n#define xnextafterf nsimd_sleef_nextafter_sve512_f32\n#define xfrfrexp nsimd_sleef_frfrexp_sve512_f64\n#define xfrfrexpf nsimd_sleef_frfrexp_sve512_f32\n#define xexpfrexp nsimd_sleef_expfrexp_sve512_f64\n#define xexpfrexpf nsimd_sleef_expfrexp_sve512_f32\n#define xfmod nsimd_sleef_fmod_sve512_f64\n#define xfmodf nsimd_sleef_fmod_sve512_f32\n#define xremainder nsimd_sleef_remainder_sve512_f64\n#define xremainderf nsimd_sleef_remainder_sve512_f32\n#define xmodf nsimd_sleef_modf_sve512_f64\n#define xmodff nsimd_sleef_modf_sve512_f32\n#define xlgamma_u1 nsimd_sleef_lgamma_u10d_sve512_f64\n#define xlgammaf_u1 nsimd_sleef_lgamma_u10d_sve512_f32\n#define xtgamma_u1 
nsimd_sleef_tgamma_u10d_sve512_f64\n#define xtgammaf_u1 nsimd_sleef_tgamma_u10d_sve512_f32\n#define xerf_u1 nsimd_sleef_erf_u10d_sve512_f64\n#define xerff_u1 nsimd_sleef_erf_u10d_sve512_f32\n#define xerfc_u15 nsimd_sleef_erfc_u15d_sve512_f64\n#define xerfcf_u15 nsimd_sleef_erfc_u15d_sve512_f32\n#define xgetInt nsimd_sleef_getInt_sve512_f64\n#define xgetIntf nsimd_sleef_getInt_sve512_f32\n#define xgetPtr nsimd_sleef_getPtr_sve512_f64\n#define xgetPtrf nsimd_sleef_getPtr_sve512_f32\n\n                   #else\n\n                   #define xsin nsimd_sleef_sin_u35_sve512_f64\n#define xsinf nsimd_sleef_sin_u35_sve512_f32\n#define xcos nsimd_sleef_cos_u35_sve512_f64\n#define xcosf nsimd_sleef_cos_u35_sve512_f32\n#define xsincos nsimd_sleef_sincos_u35_sve512_f64\n#define xsincosf nsimd_sleef_sincos_u35_sve512_f32\n#define xtan nsimd_sleef_tan_u35_sve512_f64\n#define xtanf nsimd_sleef_tan_u35_sve512_f32\n#define xasin nsimd_sleef_asin_u35_sve512_f64\n#define xasinf nsimd_sleef_asin_u35_sve512_f32\n#define xacos nsimd_sleef_acos_u35_sve512_f64\n#define xacosf nsimd_sleef_acos_u35_sve512_f32\n#define xatan nsimd_sleef_atan_u35_sve512_f64\n#define xatanf nsimd_sleef_atan_u35_sve512_f32\n#define xatan2 nsimd_sleef_atan2_u35_sve512_f64\n#define xatan2f nsimd_sleef_atan2_u35_sve512_f32\n#define xlog nsimd_sleef_log_u35_sve512_f64\n#define xlogf nsimd_sleef_log_u35_sve512_f32\n#define xcbrt nsimd_sleef_cbrt_u35_sve512_f64\n#define xcbrtf nsimd_sleef_cbrt_u35_sve512_f32\n#define xsin_u1 nsimd_sleef_sin_u10_sve512_f64\n#define xsinf_u1 nsimd_sleef_sin_u10_sve512_f32\n#define xcos_u1 nsimd_sleef_cos_u10_sve512_f64\n#define xcosf_u1 nsimd_sleef_cos_u10_sve512_f32\n#define xsincos_u1 nsimd_sleef_sincos_u10_sve512_f64\n#define xsincosf_u1 nsimd_sleef_sincos_u10_sve512_f32\n#define xtan_u1 nsimd_sleef_tan_u10_sve512_f64\n#define xtanf_u1 nsimd_sleef_tan_u10_sve512_f32\n#define xasin_u1 nsimd_sleef_asin_u10_sve512_f64\n#define xasinf_u1 nsimd_sleef_asin_u10_sve512_f32\n#define xacos_u1 
nsimd_sleef_acos_u10_sve512_f64\n#define xacosf_u1 nsimd_sleef_acos_u10_sve512_f32\n#define xatan_u1 nsimd_sleef_atan_u10_sve512_f64\n#define xatanf_u1 nsimd_sleef_atan_u10_sve512_f32\n#define xatan2_u1 nsimd_sleef_atan2_u10_sve512_f64\n#define xatan2f_u1 nsimd_sleef_atan2_u10_sve512_f32\n#define xlog_u1 nsimd_sleef_log_u10_sve512_f64\n#define xlogf_u1 nsimd_sleef_log_u10_sve512_f32\n#define xcbrt_u1 nsimd_sleef_cbrt_u10_sve512_f64\n#define xcbrtf_u1 nsimd_sleef_cbrt_u10_sve512_f32\n#define xexp nsimd_sleef_exp_u10_sve512_f64\n#define xexpf nsimd_sleef_exp_u10_sve512_f32\n#define xpow nsimd_sleef_pow_u10_sve512_f64\n#define xpowf nsimd_sleef_pow_u10_sve512_f32\n#define xsinh nsimd_sleef_sinh_u10_sve512_f64\n#define xsinhf nsimd_sleef_sinh_u10_sve512_f32\n#define xcosh nsimd_sleef_cosh_u10_sve512_f64\n#define xcoshf nsimd_sleef_cosh_u10_sve512_f32\n#define xtanh nsimd_sleef_tanh_u10_sve512_f64\n#define xtanhf nsimd_sleef_tanh_u10_sve512_f32\n#define xsinh_u35 nsimd_sleef_sinh_u35_sve512_f64\n#define xsinhf_u35 nsimd_sleef_sinh_u35_sve512_f32\n#define xcosh_u35 nsimd_sleef_cosh_u35_sve512_f64\n#define xcoshf_u35 nsimd_sleef_cosh_u35_sve512_f32\n#define xtanh_u35 nsimd_sleef_tanh_u35_sve512_f64\n#define xtanhf_u35 nsimd_sleef_tanh_u35_sve512_f32\n#define xfastsin_u3500 nsimd_sleef_fastsin_u3500_sve512_f64\n#define xfastsinf_u3500 nsimd_sleef_fastsin_u3500_sve512_f32\n#define xfastcos_u3500 nsimd_sleef_fastcos_u3500_sve512_f64\n#define xfastcosf_u3500 nsimd_sleef_fastcos_u3500_sve512_f32\n#define xfastpow_u3500 nsimd_sleef_fastpow_u3500_sve512_f64\n#define xfastpowf_u3500 nsimd_sleef_fastpow_u3500_sve512_f32\n#define xasinh nsimd_sleef_asinh_u10_sve512_f64\n#define xasinhf nsimd_sleef_asinh_u10_sve512_f32\n#define xacosh nsimd_sleef_acosh_u10_sve512_f64\n#define xacoshf nsimd_sleef_acosh_u10_sve512_f32\n#define xatanh nsimd_sleef_atanh_u10_sve512_f64\n#define xatanhf nsimd_sleef_atanh_u10_sve512_f32\n#define xexp2 nsimd_sleef_exp2_u10_sve512_f64\n#define xexp2f 
nsimd_sleef_exp2_u10_sve512_f32\n#define xexp2_u35 nsimd_sleef_exp2_u35_sve512_f64\n#define xexp2f_u35 nsimd_sleef_exp2_u35_sve512_f32\n#define xexp10 nsimd_sleef_exp10_u10_sve512_f64\n#define xexp10f nsimd_sleef_exp10_u10_sve512_f32\n#define xexp10_u35 nsimd_sleef_exp10_u35_sve512_f64\n#define xexp10f_u35 nsimd_sleef_exp10_u35_sve512_f32\n#define xexpm1 nsimd_sleef_expm1_u10_sve512_f64\n#define xexpm1f nsimd_sleef_expm1_u10_sve512_f32\n#define xlog10 nsimd_sleef_log10_u10_sve512_f64\n#define xlog10f nsimd_sleef_log10_u10_sve512_f32\n#define xlog2 nsimd_sleef_log2_u10_sve512_f64\n#define xlog2f nsimd_sleef_log2_u10_sve512_f32\n#define xlog2_u35 nsimd_sleef_log2_u35_sve512_f64\n#define xlog2f_u35 nsimd_sleef_log2_u35_sve512_f32\n#define xlog1p nsimd_sleef_log1p_u10_sve512_f64\n#define xlog1pf nsimd_sleef_log1p_u10_sve512_f32\n#define xsincospi_u05 nsimd_sleef_sincospi_u05_sve512_f64\n#define xsincospif_u05 nsimd_sleef_sincospi_u05_sve512_f32\n#define xsincospi_u35 nsimd_sleef_sincospi_u35_sve512_f64\n#define xsincospif_u35 nsimd_sleef_sincospi_u35_sve512_f32\n#define xsinpi_u05 nsimd_sleef_sinpi_u05_sve512_f64\n#define xsinpif_u05 nsimd_sleef_sinpi_u05_sve512_f32\n#define xcospi_u05 nsimd_sleef_cospi_u05_sve512_f64\n#define xcospif_u05 nsimd_sleef_cospi_u05_sve512_f32\n#define xldexp nsimd_sleef_ldexp_sve512_f64\n#define xldexpf nsimd_sleef_ldexp_sve512_f32\n#define xilogb nsimd_sleef_ilogb_sve512_f64\n#define xilogbf nsimd_sleef_ilogb_sve512_f32\n#define xfma nsimd_sleef_fma_sve512_f64\n#define xfmaf nsimd_sleef_fma_sve512_f32\n#define xsqrt nsimd_sleef_sqrt_sve512_f64\n#define xsqrtf nsimd_sleef_sqrt_sve512_f32\n#define xsqrt_u05 nsimd_sleef_sqrt_u05_sve512_f64\n#define xsqrtf_u05 nsimd_sleef_sqrt_u05_sve512_f32\n#define xsqrt_u35 nsimd_sleef_sqrt_u35_sve512_f64\n#define xsqrtf_u35 nsimd_sleef_sqrt_u35_sve512_f32\n#define xhypot_u05 nsimd_sleef_hypot_u05_sve512_f64\n#define xhypotf_u05 nsimd_sleef_hypot_u05_sve512_f32\n#define xhypot_u35 
nsimd_sleef_hypot_u35_sve512_f64\n#define xhypotf_u35 nsimd_sleef_hypot_u35_sve512_f32\n#define xfabs nsimd_sleef_fabs_sve512_f64\n#define xfabsf nsimd_sleef_fabs_sve512_f32\n#define xcopysign nsimd_sleef_copysign_sve512_f64\n#define xcopysignf nsimd_sleef_copysign_sve512_f32\n#define xfmax nsimd_sleef_fmax_sve512_f64\n#define xfmaxf nsimd_sleef_fmax_sve512_f32\n#define xfmin nsimd_sleef_fmin_sve512_f64\n#define xfminf nsimd_sleef_fmin_sve512_f32\n#define xfdim nsimd_sleef_fdim_sve512_f64\n#define xfdimf nsimd_sleef_fdim_sve512_f32\n#define xtrunc nsimd_sleef_trunc_sve512_f64\n#define xtruncf nsimd_sleef_trunc_sve512_f32\n#define xfloor nsimd_sleef_floor_sve512_f64\n#define xfloorf nsimd_sleef_floor_sve512_f32\n#define xceil nsimd_sleef_ceil_sve512_f64\n#define xceilf nsimd_sleef_ceil_sve512_f32\n#define xround nsimd_sleef_round_sve512_f64\n#define xroundf nsimd_sleef_round_sve512_f32\n#define xrint nsimd_sleef_rint_sve512_f64\n#define xrintf nsimd_sleef_rint_sve512_f32\n#define xnextafter nsimd_sleef_nextafter_sve512_f64\n#define xnextafterf nsimd_sleef_nextafter_sve512_f32\n#define xfrfrexp nsimd_sleef_frfrexp_sve512_f64\n#define xfrfrexpf nsimd_sleef_frfrexp_sve512_f32\n#define xexpfrexp nsimd_sleef_expfrexp_sve512_f64\n#define xexpfrexpf nsimd_sleef_expfrexp_sve512_f32\n#define xfmod nsimd_sleef_fmod_sve512_f64\n#define xfmodf nsimd_sleef_fmod_sve512_f32\n#define xremainder nsimd_sleef_remainder_sve512_f64\n#define xremainderf nsimd_sleef_remainder_sve512_f32\n#define xmodf nsimd_sleef_modf_sve512_f64\n#define xmodff nsimd_sleef_modf_sve512_f32\n#define xlgamma_u1 nsimd_sleef_lgamma_u10_sve512_f64\n#define xlgammaf_u1 nsimd_sleef_lgamma_u10_sve512_f32\n#define xtgamma_u1 nsimd_sleef_tgamma_u10_sve512_f64\n#define xtgammaf_u1 nsimd_sleef_tgamma_u10_sve512_f32\n#define xerf_u1 nsimd_sleef_erf_u10_sve512_f64\n#define xerff_u1 nsimd_sleef_erf_u10_sve512_f32\n#define xerfc_u15 nsimd_sleef_erfc_u15_sve512_f64\n#define xerfcf_u15 
nsimd_sleef_erfc_u15_sve512_f32\n#define xgetInt nsimd_sleef_getInt_sve512_f64\n#define xgetIntf nsimd_sleef_getInt_sve512_f32\n#define xgetPtr nsimd_sleef_getPtr_sve512_f64\n#define xgetPtrf nsimd_sleef_getPtr_sve512_f32\n\n                   #endif\n\n                   #define rempi nsimd_sleef_rempi_sve512\n                   #define rempif nsimd_sleef_rempif_sve512\n                   #define rempisub nsimd_sleef_rempisub_sve512\n                   #define rempisubf nsimd_sleef_rempisubf_sve512\n                   #define gammak nsimd_gammak_sve512\n                   #define gammafk nsimd_gammafk_sve512\n\n                   #endif\n\n                   /* ------------------------------------------------------------------------- */\n                   /* Naming of functions sve1024 */\n\n                   #ifdef NSIMD_SVE1024\n\n                   #ifdef DETERMINISTIC\n\n                   #define xsin nsimd_sleef_sin_u35d_sve1024_f64\n#define xsinf nsimd_sleef_sin_u35d_sve1024_f32\n#define xcos nsimd_sleef_cos_u35d_sve1024_f64\n#define xcosf nsimd_sleef_cos_u35d_sve1024_f32\n#define xsincos nsimd_sleef_sincos_u35d_sve1024_f64\n#define xsincosf nsimd_sleef_sincos_u35d_sve1024_f32\n#define xtan nsimd_sleef_tan_u35d_sve1024_f64\n#define xtanf nsimd_sleef_tan_u35d_sve1024_f32\n#define xasin nsimd_sleef_asin_u35d_sve1024_f64\n#define xasinf nsimd_sleef_asin_u35d_sve1024_f32\n#define xacos nsimd_sleef_acos_u35d_sve1024_f64\n#define xacosf nsimd_sleef_acos_u35d_sve1024_f32\n#define xatan nsimd_sleef_atan_u35d_sve1024_f64\n#define xatanf nsimd_sleef_atan_u35d_sve1024_f32\n#define xatan2 nsimd_sleef_atan2_u35d_sve1024_f64\n#define xatan2f nsimd_sleef_atan2_u35d_sve1024_f32\n#define xlog nsimd_sleef_log_u35d_sve1024_f64\n#define xlogf nsimd_sleef_log_u35d_sve1024_f32\n#define xcbrt nsimd_sleef_cbrt_u35d_sve1024_f64\n#define xcbrtf nsimd_sleef_cbrt_u35d_sve1024_f32\n#define xsin_u1 nsimd_sleef_sin_u10d_sve1024_f64\n#define xsinf_u1 
nsimd_sleef_sin_u10d_sve1024_f32\n#define xcos_u1 nsimd_sleef_cos_u10d_sve1024_f64\n#define xcosf_u1 nsimd_sleef_cos_u10d_sve1024_f32\n#define xsincos_u1 nsimd_sleef_sincos_u10d_sve1024_f64\n#define xsincosf_u1 nsimd_sleef_sincos_u10d_sve1024_f32\n#define xtan_u1 nsimd_sleef_tan_u10d_sve1024_f64\n#define xtanf_u1 nsimd_sleef_tan_u10d_sve1024_f32\n#define xasin_u1 nsimd_sleef_asin_u10d_sve1024_f64\n#define xasinf_u1 nsimd_sleef_asin_u10d_sve1024_f32\n#define xacos_u1 nsimd_sleef_acos_u10d_sve1024_f64\n#define xacosf_u1 nsimd_sleef_acos_u10d_sve1024_f32\n#define xatan_u1 nsimd_sleef_atan_u10d_sve1024_f64\n#define xatanf_u1 nsimd_sleef_atan_u10d_sve1024_f32\n#define xatan2_u1 nsimd_sleef_atan2_u10d_sve1024_f64\n#define xatan2f_u1 nsimd_sleef_atan2_u10d_sve1024_f32\n#define xlog_u1 nsimd_sleef_log_u10d_sve1024_f64\n#define xlogf_u1 nsimd_sleef_log_u10d_sve1024_f32\n#define xcbrt_u1 nsimd_sleef_cbrt_u10d_sve1024_f64\n#define xcbrtf_u1 nsimd_sleef_cbrt_u10d_sve1024_f32\n#define xexp nsimd_sleef_exp_u10d_sve1024_f64\n#define xexpf nsimd_sleef_exp_u10d_sve1024_f32\n#define xpow nsimd_sleef_pow_u10d_sve1024_f64\n#define xpowf nsimd_sleef_pow_u10d_sve1024_f32\n#define xsinh nsimd_sleef_sinh_u10d_sve1024_f64\n#define xsinhf nsimd_sleef_sinh_u10d_sve1024_f32\n#define xcosh nsimd_sleef_cosh_u10d_sve1024_f64\n#define xcoshf nsimd_sleef_cosh_u10d_sve1024_f32\n#define xtanh nsimd_sleef_tanh_u10d_sve1024_f64\n#define xtanhf nsimd_sleef_tanh_u10d_sve1024_f32\n#define xsinh_u35 nsimd_sleef_sinh_u35d_sve1024_f64\n#define xsinhf_u35 nsimd_sleef_sinh_u35d_sve1024_f32\n#define xcosh_u35 nsimd_sleef_cosh_u35d_sve1024_f64\n#define xcoshf_u35 nsimd_sleef_cosh_u35d_sve1024_f32\n#define xtanh_u35 nsimd_sleef_tanh_u35d_sve1024_f64\n#define xtanhf_u35 nsimd_sleef_tanh_u35d_sve1024_f32\n#define xfastsin_u3500 nsimd_sleef_fastsin_u3500d_sve1024_f64\n#define xfastsinf_u3500 nsimd_sleef_fastsin_u3500d_sve1024_f32\n#define xfastcos_u3500 nsimd_sleef_fastcos_u3500d_sve1024_f64\n#define 
xfastcosf_u3500 nsimd_sleef_fastcos_u3500d_sve1024_f32\n#define xfastpow_u3500 nsimd_sleef_fastpow_u3500d_sve1024_f64\n#define xfastpowf_u3500 nsimd_sleef_fastpow_u3500d_sve1024_f32\n#define xasinh nsimd_sleef_asinh_u10d_sve1024_f64\n#define xasinhf nsimd_sleef_asinh_u10d_sve1024_f32\n#define xacosh nsimd_sleef_acosh_u10d_sve1024_f64\n#define xacoshf nsimd_sleef_acosh_u10d_sve1024_f32\n#define xatanh nsimd_sleef_atanh_u10d_sve1024_f64\n#define xatanhf nsimd_sleef_atanh_u10d_sve1024_f32\n#define xexp2 nsimd_sleef_exp2_u10d_sve1024_f64\n#define xexp2f nsimd_sleef_exp2_u10d_sve1024_f32\n#define xexp2_u35 nsimd_sleef_exp2_u35d_sve1024_f64\n#define xexp2f_u35 nsimd_sleef_exp2_u35d_sve1024_f32\n#define xexp10 nsimd_sleef_exp10_u10d_sve1024_f64\n#define xexp10f nsimd_sleef_exp10_u10d_sve1024_f32\n#define xexp10_u35 nsimd_sleef_exp10_u35d_sve1024_f64\n#define xexp10f_u35 nsimd_sleef_exp10_u35d_sve1024_f32\n#define xexpm1 nsimd_sleef_expm1_u10d_sve1024_f64\n#define xexpm1f nsimd_sleef_expm1_u10d_sve1024_f32\n#define xlog10 nsimd_sleef_log10_u10d_sve1024_f64\n#define xlog10f nsimd_sleef_log10_u10d_sve1024_f32\n#define xlog2 nsimd_sleef_log2_u10d_sve1024_f64\n#define xlog2f nsimd_sleef_log2_u10d_sve1024_f32\n#define xlog2_u35 nsimd_sleef_log2_u35d_sve1024_f64\n#define xlog2f_u35 nsimd_sleef_log2_u35d_sve1024_f32\n#define xlog1p nsimd_sleef_log1p_u10d_sve1024_f64\n#define xlog1pf nsimd_sleef_log1p_u10d_sve1024_f32\n#define xsincospi_u05 nsimd_sleef_sincospi_u05d_sve1024_f64\n#define xsincospif_u05 nsimd_sleef_sincospi_u05d_sve1024_f32\n#define xsincospi_u35 nsimd_sleef_sincospi_u35d_sve1024_f64\n#define xsincospif_u35 nsimd_sleef_sincospi_u35d_sve1024_f32\n#define xsinpi_u05 nsimd_sleef_sinpi_u05d_sve1024_f64\n#define xsinpif_u05 nsimd_sleef_sinpi_u05d_sve1024_f32\n#define xcospi_u05 nsimd_sleef_cospi_u05d_sve1024_f64\n#define xcospif_u05 nsimd_sleef_cospi_u05d_sve1024_f32\n#define xldexp nsimd_sleef_ldexp_sve1024_f64\n#define xldexpf nsimd_sleef_ldexp_sve1024_f32\n#define 
xilogb nsimd_sleef_ilogb_sve1024_f64\n#define xilogbf nsimd_sleef_ilogb_sve1024_f32\n#define xfma nsimd_sleef_fma_sve1024_f64\n#define xfmaf nsimd_sleef_fma_sve1024_f32\n#define xsqrt nsimd_sleef_sqrt_sve1024_f64\n#define xsqrtf nsimd_sleef_sqrt_sve1024_f32\n#define xsqrt_u05 nsimd_sleef_sqrt_u05d_sve1024_f64\n#define xsqrtf_u05 nsimd_sleef_sqrt_u05d_sve1024_f32\n#define xsqrt_u35 nsimd_sleef_sqrt_u35d_sve1024_f64\n#define xsqrtf_u35 nsimd_sleef_sqrt_u35d_sve1024_f32\n#define xhypot_u05 nsimd_sleef_hypot_u05d_sve1024_f64\n#define xhypotf_u05 nsimd_sleef_hypot_u05d_sve1024_f32\n#define xhypot_u35 nsimd_sleef_hypot_u35d_sve1024_f64\n#define xhypotf_u35 nsimd_sleef_hypot_u35d_sve1024_f32\n#define xfabs nsimd_sleef_fabs_sve1024_f64\n#define xfabsf nsimd_sleef_fabs_sve1024_f32\n#define xcopysign nsimd_sleef_copysign_sve1024_f64\n#define xcopysignf nsimd_sleef_copysign_sve1024_f32\n#define xfmax nsimd_sleef_fmax_sve1024_f64\n#define xfmaxf nsimd_sleef_fmax_sve1024_f32\n#define xfmin nsimd_sleef_fmin_sve1024_f64\n#define xfminf nsimd_sleef_fmin_sve1024_f32\n#define xfdim nsimd_sleef_fdim_sve1024_f64\n#define xfdimf nsimd_sleef_fdim_sve1024_f32\n#define xtrunc nsimd_sleef_trunc_sve1024_f64\n#define xtruncf nsimd_sleef_trunc_sve1024_f32\n#define xfloor nsimd_sleef_floor_sve1024_f64\n#define xfloorf nsimd_sleef_floor_sve1024_f32\n#define xceil nsimd_sleef_ceil_sve1024_f64\n#define xceilf nsimd_sleef_ceil_sve1024_f32\n#define xround nsimd_sleef_round_sve1024_f64\n#define xroundf nsimd_sleef_round_sve1024_f32\n#define xrint nsimd_sleef_rint_sve1024_f64\n#define xrintf nsimd_sleef_rint_sve1024_f32\n#define xnextafter nsimd_sleef_nextafter_sve1024_f64\n#define xnextafterf nsimd_sleef_nextafter_sve1024_f32\n#define xfrfrexp nsimd_sleef_frfrexp_sve1024_f64\n#define xfrfrexpf nsimd_sleef_frfrexp_sve1024_f32\n#define xexpfrexp nsimd_sleef_expfrexp_sve1024_f64\n#define xexpfrexpf nsimd_sleef_expfrexp_sve1024_f32\n#define xfmod nsimd_sleef_fmod_sve1024_f64\n#define xfmodf 
nsimd_sleef_fmod_sve1024_f32\n#define xremainder nsimd_sleef_remainder_sve1024_f64\n#define xremainderf nsimd_sleef_remainder_sve1024_f32\n#define xmodf nsimd_sleef_modf_sve1024_f64\n#define xmodff nsimd_sleef_modf_sve1024_f32\n#define xlgamma_u1 nsimd_sleef_lgamma_u10d_sve1024_f64\n#define xlgammaf_u1 nsimd_sleef_lgamma_u10d_sve1024_f32\n#define xtgamma_u1 nsimd_sleef_tgamma_u10d_sve1024_f64\n#define xtgammaf_u1 nsimd_sleef_tgamma_u10d_sve1024_f32\n#define xerf_u1 nsimd_sleef_erf_u10d_sve1024_f64\n#define xerff_u1 nsimd_sleef_erf_u10d_sve1024_f32\n#define xerfc_u15 nsimd_sleef_erfc_u15d_sve1024_f64\n#define xerfcf_u15 nsimd_sleef_erfc_u15d_sve1024_f32\n#define xgetInt nsimd_sleef_getInt_sve1024_f64\n#define xgetIntf nsimd_sleef_getInt_sve1024_f32\n#define xgetPtr nsimd_sleef_getPtr_sve1024_f64\n#define xgetPtrf nsimd_sleef_getPtr_sve1024_f32\n\n                   #else\n\n                   #define xsin nsimd_sleef_sin_u35_sve1024_f64\n#define xsinf nsimd_sleef_sin_u35_sve1024_f32\n#define xcos nsimd_sleef_cos_u35_sve1024_f64\n#define xcosf nsimd_sleef_cos_u35_sve1024_f32\n#define xsincos nsimd_sleef_sincos_u35_sve1024_f64\n#define xsincosf nsimd_sleef_sincos_u35_sve1024_f32\n#define xtan nsimd_sleef_tan_u35_sve1024_f64\n#define xtanf nsimd_sleef_tan_u35_sve1024_f32\n#define xasin nsimd_sleef_asin_u35_sve1024_f64\n#define xasinf nsimd_sleef_asin_u35_sve1024_f32\n#define xacos nsimd_sleef_acos_u35_sve1024_f64\n#define xacosf nsimd_sleef_acos_u35_sve1024_f32\n#define xatan nsimd_sleef_atan_u35_sve1024_f64\n#define xatanf nsimd_sleef_atan_u35_sve1024_f32\n#define xatan2 nsimd_sleef_atan2_u35_sve1024_f64\n#define xatan2f nsimd_sleef_atan2_u35_sve1024_f32\n#define xlog nsimd_sleef_log_u35_sve1024_f64\n#define xlogf nsimd_sleef_log_u35_sve1024_f32\n#define xcbrt nsimd_sleef_cbrt_u35_sve1024_f64\n#define xcbrtf nsimd_sleef_cbrt_u35_sve1024_f32\n#define xsin_u1 nsimd_sleef_sin_u10_sve1024_f64\n#define xsinf_u1 nsimd_sleef_sin_u10_sve1024_f32\n#define xcos_u1 
nsimd_sleef_cos_u10_sve1024_f64\n#define xcosf_u1 nsimd_sleef_cos_u10_sve1024_f32\n#define xsincos_u1 nsimd_sleef_sincos_u10_sve1024_f64\n#define xsincosf_u1 nsimd_sleef_sincos_u10_sve1024_f32\n#define xtan_u1 nsimd_sleef_tan_u10_sve1024_f64\n#define xtanf_u1 nsimd_sleef_tan_u10_sve1024_f32\n#define xasin_u1 nsimd_sleef_asin_u10_sve1024_f64\n#define xasinf_u1 nsimd_sleef_asin_u10_sve1024_f32\n#define xacos_u1 nsimd_sleef_acos_u10_sve1024_f64\n#define xacosf_u1 nsimd_sleef_acos_u10_sve1024_f32\n#define xatan_u1 nsimd_sleef_atan_u10_sve1024_f64\n#define xatanf_u1 nsimd_sleef_atan_u10_sve1024_f32\n#define xatan2_u1 nsimd_sleef_atan2_u10_sve1024_f64\n#define xatan2f_u1 nsimd_sleef_atan2_u10_sve1024_f32\n#define xlog_u1 nsimd_sleef_log_u10_sve1024_f64\n#define xlogf_u1 nsimd_sleef_log_u10_sve1024_f32\n#define xcbrt_u1 nsimd_sleef_cbrt_u10_sve1024_f64\n#define xcbrtf_u1 nsimd_sleef_cbrt_u10_sve1024_f32\n#define xexp nsimd_sleef_exp_u10_sve1024_f64\n#define xexpf nsimd_sleef_exp_u10_sve1024_f32\n#define xpow nsimd_sleef_pow_u10_sve1024_f64\n#define xpowf nsimd_sleef_pow_u10_sve1024_f32\n#define xsinh nsimd_sleef_sinh_u10_sve1024_f64\n#define xsinhf nsimd_sleef_sinh_u10_sve1024_f32\n#define xcosh nsimd_sleef_cosh_u10_sve1024_f64\n#define xcoshf nsimd_sleef_cosh_u10_sve1024_f32\n#define xtanh nsimd_sleef_tanh_u10_sve1024_f64\n#define xtanhf nsimd_sleef_tanh_u10_sve1024_f32\n#define xsinh_u35 nsimd_sleef_sinh_u35_sve1024_f64\n#define xsinhf_u35 nsimd_sleef_sinh_u35_sve1024_f32\n#define xcosh_u35 nsimd_sleef_cosh_u35_sve1024_f64\n#define xcoshf_u35 nsimd_sleef_cosh_u35_sve1024_f32\n#define xtanh_u35 nsimd_sleef_tanh_u35_sve1024_f64\n#define xtanhf_u35 nsimd_sleef_tanh_u35_sve1024_f32\n#define xfastsin_u3500 nsimd_sleef_fastsin_u3500_sve1024_f64\n#define xfastsinf_u3500 nsimd_sleef_fastsin_u3500_sve1024_f32\n#define xfastcos_u3500 nsimd_sleef_fastcos_u3500_sve1024_f64\n#define xfastcosf_u3500 nsimd_sleef_fastcos_u3500_sve1024_f32\n#define xfastpow_u3500 
nsimd_sleef_fastpow_u3500_sve1024_f64\n#define xfastpowf_u3500 nsimd_sleef_fastpow_u3500_sve1024_f32\n#define xasinh nsimd_sleef_asinh_u10_sve1024_f64\n#define xasinhf nsimd_sleef_asinh_u10_sve1024_f32\n#define xacosh nsimd_sleef_acosh_u10_sve1024_f64\n#define xacoshf nsimd_sleef_acosh_u10_sve1024_f32\n#define xatanh nsimd_sleef_atanh_u10_sve1024_f64\n#define xatanhf nsimd_sleef_atanh_u10_sve1024_f32\n#define xexp2 nsimd_sleef_exp2_u10_sve1024_f64\n#define xexp2f nsimd_sleef_exp2_u10_sve1024_f32\n#define xexp2_u35 nsimd_sleef_exp2_u35_sve1024_f64\n#define xexp2f_u35 nsimd_sleef_exp2_u35_sve1024_f32\n#define xexp10 nsimd_sleef_exp10_u10_sve1024_f64\n#define xexp10f nsimd_sleef_exp10_u10_sve1024_f32\n#define xexp10_u35 nsimd_sleef_exp10_u35_sve1024_f64\n#define xexp10f_u35 nsimd_sleef_exp10_u35_sve1024_f32\n#define xexpm1 nsimd_sleef_expm1_u10_sve1024_f64\n#define xexpm1f nsimd_sleef_expm1_u10_sve1024_f32\n#define xlog10 nsimd_sleef_log10_u10_sve1024_f64\n#define xlog10f nsimd_sleef_log10_u10_sve1024_f32\n#define xlog2 nsimd_sleef_log2_u10_sve1024_f64\n#define xlog2f nsimd_sleef_log2_u10_sve1024_f32\n#define xlog2_u35 nsimd_sleef_log2_u35_sve1024_f64\n#define xlog2f_u35 nsimd_sleef_log2_u35_sve1024_f32\n#define xlog1p nsimd_sleef_log1p_u10_sve1024_f64\n#define xlog1pf nsimd_sleef_log1p_u10_sve1024_f32\n#define xsincospi_u05 nsimd_sleef_sincospi_u05_sve1024_f64\n#define xsincospif_u05 nsimd_sleef_sincospi_u05_sve1024_f32\n#define xsincospi_u35 nsimd_sleef_sincospi_u35_sve1024_f64\n#define xsincospif_u35 nsimd_sleef_sincospi_u35_sve1024_f32\n#define xsinpi_u05 nsimd_sleef_sinpi_u05_sve1024_f64\n#define xsinpif_u05 nsimd_sleef_sinpi_u05_sve1024_f32\n#define xcospi_u05 nsimd_sleef_cospi_u05_sve1024_f64\n#define xcospif_u05 nsimd_sleef_cospi_u05_sve1024_f32\n#define xldexp nsimd_sleef_ldexp_sve1024_f64\n#define xldexpf nsimd_sleef_ldexp_sve1024_f32\n#define xilogb nsimd_sleef_ilogb_sve1024_f64\n#define xilogbf nsimd_sleef_ilogb_sve1024_f32\n#define xfma 
nsimd_sleef_fma_sve1024_f64\n#define xfmaf nsimd_sleef_fma_sve1024_f32\n#define xsqrt nsimd_sleef_sqrt_sve1024_f64\n#define xsqrtf nsimd_sleef_sqrt_sve1024_f32\n#define xsqrt_u05 nsimd_sleef_sqrt_u05_sve1024_f64\n#define xsqrtf_u05 nsimd_sleef_sqrt_u05_sve1024_f32\n#define xsqrt_u35 nsimd_sleef_sqrt_u35_sve1024_f64\n#define xsqrtf_u35 nsimd_sleef_sqrt_u35_sve1024_f32\n#define xhypot_u05 nsimd_sleef_hypot_u05_sve1024_f64\n#define xhypotf_u05 nsimd_sleef_hypot_u05_sve1024_f32\n#define xhypot_u35 nsimd_sleef_hypot_u35_sve1024_f64\n#define xhypotf_u35 nsimd_sleef_hypot_u35_sve1024_f32\n#define xfabs nsimd_sleef_fabs_sve1024_f64\n#define xfabsf nsimd_sleef_fabs_sve1024_f32\n#define xcopysign nsimd_sleef_copysign_sve1024_f64\n#define xcopysignf nsimd_sleef_copysign_sve1024_f32\n#define xfmax nsimd_sleef_fmax_sve1024_f64\n#define xfmaxf nsimd_sleef_fmax_sve1024_f32\n#define xfmin nsimd_sleef_fmin_sve1024_f64\n#define xfminf nsimd_sleef_fmin_sve1024_f32\n#define xfdim nsimd_sleef_fdim_sve1024_f64\n#define xfdimf nsimd_sleef_fdim_sve1024_f32\n#define xtrunc nsimd_sleef_trunc_sve1024_f64\n#define xtruncf nsimd_sleef_trunc_sve1024_f32\n#define xfloor nsimd_sleef_floor_sve1024_f64\n#define xfloorf nsimd_sleef_floor_sve1024_f32\n#define xceil nsimd_sleef_ceil_sve1024_f64\n#define xceilf nsimd_sleef_ceil_sve1024_f32\n#define xround nsimd_sleef_round_sve1024_f64\n#define xroundf nsimd_sleef_round_sve1024_f32\n#define xrint nsimd_sleef_rint_sve1024_f64\n#define xrintf nsimd_sleef_rint_sve1024_f32\n#define xnextafter nsimd_sleef_nextafter_sve1024_f64\n#define xnextafterf nsimd_sleef_nextafter_sve1024_f32\n#define xfrfrexp nsimd_sleef_frfrexp_sve1024_f64\n#define xfrfrexpf nsimd_sleef_frfrexp_sve1024_f32\n#define xexpfrexp nsimd_sleef_expfrexp_sve1024_f64\n#define xexpfrexpf nsimd_sleef_expfrexp_sve1024_f32\n#define xfmod nsimd_sleef_fmod_sve1024_f64\n#define xfmodf nsimd_sleef_fmod_sve1024_f32\n#define xremainder nsimd_sleef_remainder_sve1024_f64\n#define xremainderf 
nsimd_sleef_remainder_sve1024_f32\n#define xmodf nsimd_sleef_modf_sve1024_f64\n#define xmodff nsimd_sleef_modf_sve1024_f32\n#define xlgamma_u1 nsimd_sleef_lgamma_u10_sve1024_f64\n#define xlgammaf_u1 nsimd_sleef_lgamma_u10_sve1024_f32\n#define xtgamma_u1 nsimd_sleef_tgamma_u10_sve1024_f64\n#define xtgammaf_u1 nsimd_sleef_tgamma_u10_sve1024_f32\n#define xerf_u1 nsimd_sleef_erf_u10_sve1024_f64\n#define xerff_u1 nsimd_sleef_erf_u10_sve1024_f32\n#define xerfc_u15 nsimd_sleef_erfc_u15_sve1024_f64\n#define xerfcf_u15 nsimd_sleef_erfc_u15_sve1024_f32\n#define xgetInt nsimd_sleef_getInt_sve1024_f64\n#define xgetIntf nsimd_sleef_getInt_sve1024_f32\n#define xgetPtr nsimd_sleef_getPtr_sve1024_f64\n#define xgetPtrf nsimd_sleef_getPtr_sve1024_f32\n\n                   #endif\n\n                   #define rempi nsimd_sleef_rempi_sve1024\n                   #define rempif nsimd_sleef_rempif_sve1024\n                   #define rempisub nsimd_sleef_rempisub_sve1024\n                   #define rempisubf nsimd_sleef_rempisubf_sve1024\n                   #define gammak nsimd_gammak_sve1024\n                   #define gammafk nsimd_gammafk_sve1024\n\n                   #endif\n\n                   /* ------------------------------------------------------------------------- */\n                   /* Naming of functions sve2048 */\n\n                   #ifdef NSIMD_SVE2048\n\n                   #ifdef DETERMINISTIC\n\n                   #define xsin nsimd_sleef_sin_u35d_sve2048_f64\n#define xsinf nsimd_sleef_sin_u35d_sve2048_f32\n#define xcos nsimd_sleef_cos_u35d_sve2048_f64\n#define xcosf nsimd_sleef_cos_u35d_sve2048_f32\n#define xsincos nsimd_sleef_sincos_u35d_sve2048_f64\n#define xsincosf nsimd_sleef_sincos_u35d_sve2048_f32\n#define xtan nsimd_sleef_tan_u35d_sve2048_f64\n#define xtanf nsimd_sleef_tan_u35d_sve2048_f32\n#define xasin nsimd_sleef_asin_u35d_sve2048_f64\n#define xasinf nsimd_sleef_asin_u35d_sve2048_f32\n#define xacos nsimd_sleef_acos_u35d_sve2048_f64\n#define xacosf 
nsimd_sleef_acos_u35d_sve2048_f32\n#define xatan nsimd_sleef_atan_u35d_sve2048_f64\n#define xatanf nsimd_sleef_atan_u35d_sve2048_f32\n#define xatan2 nsimd_sleef_atan2_u35d_sve2048_f64\n#define xatan2f nsimd_sleef_atan2_u35d_sve2048_f32\n#define xlog nsimd_sleef_log_u35d_sve2048_f64\n#define xlogf nsimd_sleef_log_u35d_sve2048_f32\n#define xcbrt nsimd_sleef_cbrt_u35d_sve2048_f64\n#define xcbrtf nsimd_sleef_cbrt_u35d_sve2048_f32\n#define xsin_u1 nsimd_sleef_sin_u10d_sve2048_f64\n#define xsinf_u1 nsimd_sleef_sin_u10d_sve2048_f32\n#define xcos_u1 nsimd_sleef_cos_u10d_sve2048_f64\n#define xcosf_u1 nsimd_sleef_cos_u10d_sve2048_f32\n#define xsincos_u1 nsimd_sleef_sincos_u10d_sve2048_f64\n#define xsincosf_u1 nsimd_sleef_sincos_u10d_sve2048_f32\n#define xtan_u1 nsimd_sleef_tan_u10d_sve2048_f64\n#define xtanf_u1 nsimd_sleef_tan_u10d_sve2048_f32\n#define xasin_u1 nsimd_sleef_asin_u10d_sve2048_f64\n#define xasinf_u1 nsimd_sleef_asin_u10d_sve2048_f32\n#define xacos_u1 nsimd_sleef_acos_u10d_sve2048_f64\n#define xacosf_u1 nsimd_sleef_acos_u10d_sve2048_f32\n#define xatan_u1 nsimd_sleef_atan_u10d_sve2048_f64\n#define xatanf_u1 nsimd_sleef_atan_u10d_sve2048_f32\n#define xatan2_u1 nsimd_sleef_atan2_u10d_sve2048_f64\n#define xatan2f_u1 nsimd_sleef_atan2_u10d_sve2048_f32\n#define xlog_u1 nsimd_sleef_log_u10d_sve2048_f64\n#define xlogf_u1 nsimd_sleef_log_u10d_sve2048_f32\n#define xcbrt_u1 nsimd_sleef_cbrt_u10d_sve2048_f64\n#define xcbrtf_u1 nsimd_sleef_cbrt_u10d_sve2048_f32\n#define xexp nsimd_sleef_exp_u10d_sve2048_f64\n#define xexpf nsimd_sleef_exp_u10d_sve2048_f32\n#define xpow nsimd_sleef_pow_u10d_sve2048_f64\n#define xpowf nsimd_sleef_pow_u10d_sve2048_f32\n#define xsinh nsimd_sleef_sinh_u10d_sve2048_f64\n#define xsinhf nsimd_sleef_sinh_u10d_sve2048_f32\n#define xcosh nsimd_sleef_cosh_u10d_sve2048_f64\n#define xcoshf nsimd_sleef_cosh_u10d_sve2048_f32\n#define xtanh nsimd_sleef_tanh_u10d_sve2048_f64\n#define xtanhf nsimd_sleef_tanh_u10d_sve2048_f32\n#define xsinh_u35 
nsimd_sleef_sinh_u35d_sve2048_f64\n#define xsinhf_u35 nsimd_sleef_sinh_u35d_sve2048_f32\n#define xcosh_u35 nsimd_sleef_cosh_u35d_sve2048_f64\n#define xcoshf_u35 nsimd_sleef_cosh_u35d_sve2048_f32\n#define xtanh_u35 nsimd_sleef_tanh_u35d_sve2048_f64\n#define xtanhf_u35 nsimd_sleef_tanh_u35d_sve2048_f32\n#define xfastsin_u3500 nsimd_sleef_fastsin_u3500d_sve2048_f64\n#define xfastsinf_u3500 nsimd_sleef_fastsin_u3500d_sve2048_f32\n#define xfastcos_u3500 nsimd_sleef_fastcos_u3500d_sve2048_f64\n#define xfastcosf_u3500 nsimd_sleef_fastcos_u3500d_sve2048_f32\n#define xfastpow_u3500 nsimd_sleef_fastpow_u3500d_sve2048_f64\n#define xfastpowf_u3500 nsimd_sleef_fastpow_u3500d_sve2048_f32\n#define xasinh nsimd_sleef_asinh_u10d_sve2048_f64\n#define xasinhf nsimd_sleef_asinh_u10d_sve2048_f32\n#define xacosh nsimd_sleef_acosh_u10d_sve2048_f64\n#define xacoshf nsimd_sleef_acosh_u10d_sve2048_f32\n#define xatanh nsimd_sleef_atanh_u10d_sve2048_f64\n#define xatanhf nsimd_sleef_atanh_u10d_sve2048_f32\n#define xexp2 nsimd_sleef_exp2_u10d_sve2048_f64\n#define xexp2f nsimd_sleef_exp2_u10d_sve2048_f32\n#define xexp2_u35 nsimd_sleef_exp2_u35d_sve2048_f64\n#define xexp2f_u35 nsimd_sleef_exp2_u35d_sve2048_f32\n#define xexp10 nsimd_sleef_exp10_u10d_sve2048_f64\n#define xexp10f nsimd_sleef_exp10_u10d_sve2048_f32\n#define xexp10_u35 nsimd_sleef_exp10_u35d_sve2048_f64\n#define xexp10f_u35 nsimd_sleef_exp10_u35d_sve2048_f32\n#define xexpm1 nsimd_sleef_expm1_u10d_sve2048_f64\n#define xexpm1f nsimd_sleef_expm1_u10d_sve2048_f32\n#define xlog10 nsimd_sleef_log10_u10d_sve2048_f64\n#define xlog10f nsimd_sleef_log10_u10d_sve2048_f32\n#define xlog2 nsimd_sleef_log2_u10d_sve2048_f64\n#define xlog2f nsimd_sleef_log2_u10d_sve2048_f32\n#define xlog2_u35 nsimd_sleef_log2_u35d_sve2048_f64\n#define xlog2f_u35 nsimd_sleef_log2_u35d_sve2048_f32\n#define xlog1p nsimd_sleef_log1p_u10d_sve2048_f64\n#define xlog1pf nsimd_sleef_log1p_u10d_sve2048_f32\n#define xsincospi_u05 nsimd_sleef_sincospi_u05d_sve2048_f64\n#define 
xsincospif_u05 nsimd_sleef_sincospi_u05d_sve2048_f32\n#define xsincospi_u35 nsimd_sleef_sincospi_u35d_sve2048_f64\n#define xsincospif_u35 nsimd_sleef_sincospi_u35d_sve2048_f32\n#define xsinpi_u05 nsimd_sleef_sinpi_u05d_sve2048_f64\n#define xsinpif_u05 nsimd_sleef_sinpi_u05d_sve2048_f32\n#define xcospi_u05 nsimd_sleef_cospi_u05d_sve2048_f64\n#define xcospif_u05 nsimd_sleef_cospi_u05d_sve2048_f32\n#define xldexp nsimd_sleef_ldexp_sve2048_f64\n#define xldexpf nsimd_sleef_ldexp_sve2048_f32\n#define xilogb nsimd_sleef_ilogb_sve2048_f64\n#define xilogbf nsimd_sleef_ilogb_sve2048_f32\n#define xfma nsimd_sleef_fma_sve2048_f64\n#define xfmaf nsimd_sleef_fma_sve2048_f32\n#define xsqrt nsimd_sleef_sqrt_sve2048_f64\n#define xsqrtf nsimd_sleef_sqrt_sve2048_f32\n#define xsqrt_u05 nsimd_sleef_sqrt_u05d_sve2048_f64\n#define xsqrtf_u05 nsimd_sleef_sqrt_u05d_sve2048_f32\n#define xsqrt_u35 nsimd_sleef_sqrt_u35d_sve2048_f64\n#define xsqrtf_u35 nsimd_sleef_sqrt_u35d_sve2048_f32\n#define xhypot_u05 nsimd_sleef_hypot_u05d_sve2048_f64\n#define xhypotf_u05 nsimd_sleef_hypot_u05d_sve2048_f32\n#define xhypot_u35 nsimd_sleef_hypot_u35d_sve2048_f64\n#define xhypotf_u35 nsimd_sleef_hypot_u35d_sve2048_f32\n#define xfabs nsimd_sleef_fabs_sve2048_f64\n#define xfabsf nsimd_sleef_fabs_sve2048_f32\n#define xcopysign nsimd_sleef_copysign_sve2048_f64\n#define xcopysignf nsimd_sleef_copysign_sve2048_f32\n#define xfmax nsimd_sleef_fmax_sve2048_f64\n#define xfmaxf nsimd_sleef_fmax_sve2048_f32\n#define xfmin nsimd_sleef_fmin_sve2048_f64\n#define xfminf nsimd_sleef_fmin_sve2048_f32\n#define xfdim nsimd_sleef_fdim_sve2048_f64\n#define xfdimf nsimd_sleef_fdim_sve2048_f32\n#define xtrunc nsimd_sleef_trunc_sve2048_f64\n#define xtruncf nsimd_sleef_trunc_sve2048_f32\n#define xfloor nsimd_sleef_floor_sve2048_f64\n#define xfloorf nsimd_sleef_floor_sve2048_f32\n#define xceil nsimd_sleef_ceil_sve2048_f64\n#define xceilf nsimd_sleef_ceil_sve2048_f32\n#define xround nsimd_sleef_round_sve2048_f64\n#define xroundf 
nsimd_sleef_round_sve2048_f32\n#define xrint nsimd_sleef_rint_sve2048_f64\n#define xrintf nsimd_sleef_rint_sve2048_f32\n#define xnextafter nsimd_sleef_nextafter_sve2048_f64\n#define xnextafterf nsimd_sleef_nextafter_sve2048_f32\n#define xfrfrexp nsimd_sleef_frfrexp_sve2048_f64\n#define xfrfrexpf nsimd_sleef_frfrexp_sve2048_f32\n#define xexpfrexp nsimd_sleef_expfrexp_sve2048_f64\n#define xexpfrexpf nsimd_sleef_expfrexp_sve2048_f32\n#define xfmod nsimd_sleef_fmod_sve2048_f64\n#define xfmodf nsimd_sleef_fmod_sve2048_f32\n#define xremainder nsimd_sleef_remainder_sve2048_f64\n#define xremainderf nsimd_sleef_remainder_sve2048_f32\n#define xmodf nsimd_sleef_modf_sve2048_f64\n#define xmodff nsimd_sleef_modf_sve2048_f32\n#define xlgamma_u1 nsimd_sleef_lgamma_u10d_sve2048_f64\n#define xlgammaf_u1 nsimd_sleef_lgamma_u10d_sve2048_f32\n#define xtgamma_u1 nsimd_sleef_tgamma_u10d_sve2048_f64\n#define xtgammaf_u1 nsimd_sleef_tgamma_u10d_sve2048_f32\n#define xerf_u1 nsimd_sleef_erf_u10d_sve2048_f64\n#define xerff_u1 nsimd_sleef_erf_u10d_sve2048_f32\n#define xerfc_u15 nsimd_sleef_erfc_u15d_sve2048_f64\n#define xerfcf_u15 nsimd_sleef_erfc_u15d_sve2048_f32\n#define xgetInt nsimd_sleef_getInt_sve2048_f64\n#define xgetIntf nsimd_sleef_getInt_sve2048_f32\n#define xgetPtr nsimd_sleef_getPtr_sve2048_f64\n#define xgetPtrf nsimd_sleef_getPtr_sve2048_f32\n\n                   #else\n\n                   #define xsin nsimd_sleef_sin_u35_sve2048_f64\n#define xsinf nsimd_sleef_sin_u35_sve2048_f32\n#define xcos nsimd_sleef_cos_u35_sve2048_f64\n#define xcosf nsimd_sleef_cos_u35_sve2048_f32\n#define xsincos nsimd_sleef_sincos_u35_sve2048_f64\n#define xsincosf nsimd_sleef_sincos_u35_sve2048_f32\n#define xtan nsimd_sleef_tan_u35_sve2048_f64\n#define xtanf nsimd_sleef_tan_u35_sve2048_f32\n#define xasin nsimd_sleef_asin_u35_sve2048_f64\n#define xasinf nsimd_sleef_asin_u35_sve2048_f32\n#define xacos nsimd_sleef_acos_u35_sve2048_f64\n#define xacosf nsimd_sleef_acos_u35_sve2048_f32\n#define xatan 
nsimd_sleef_atan_u35_sve2048_f64\n#define xatanf nsimd_sleef_atan_u35_sve2048_f32\n#define xatan2 nsimd_sleef_atan2_u35_sve2048_f64\n#define xatan2f nsimd_sleef_atan2_u35_sve2048_f32\n#define xlog nsimd_sleef_log_u35_sve2048_f64\n#define xlogf nsimd_sleef_log_u35_sve2048_f32\n#define xcbrt nsimd_sleef_cbrt_u35_sve2048_f64\n#define xcbrtf nsimd_sleef_cbrt_u35_sve2048_f32\n#define xsin_u1 nsimd_sleef_sin_u10_sve2048_f64\n#define xsinf_u1 nsimd_sleef_sin_u10_sve2048_f32\n#define xcos_u1 nsimd_sleef_cos_u10_sve2048_f64\n#define xcosf_u1 nsimd_sleef_cos_u10_sve2048_f32\n#define xsincos_u1 nsimd_sleef_sincos_u10_sve2048_f64\n#define xsincosf_u1 nsimd_sleef_sincos_u10_sve2048_f32\n#define xtan_u1 nsimd_sleef_tan_u10_sve2048_f64\n#define xtanf_u1 nsimd_sleef_tan_u10_sve2048_f32\n#define xasin_u1 nsimd_sleef_asin_u10_sve2048_f64\n#define xasinf_u1 nsimd_sleef_asin_u10_sve2048_f32\n#define xacos_u1 nsimd_sleef_acos_u10_sve2048_f64\n#define xacosf_u1 nsimd_sleef_acos_u10_sve2048_f32\n#define xatan_u1 nsimd_sleef_atan_u10_sve2048_f64\n#define xatanf_u1 nsimd_sleef_atan_u10_sve2048_f32\n#define xatan2_u1 nsimd_sleef_atan2_u10_sve2048_f64\n#define xatan2f_u1 nsimd_sleef_atan2_u10_sve2048_f32\n#define xlog_u1 nsimd_sleef_log_u10_sve2048_f64\n#define xlogf_u1 nsimd_sleef_log_u10_sve2048_f32\n#define xcbrt_u1 nsimd_sleef_cbrt_u10_sve2048_f64\n#define xcbrtf_u1 nsimd_sleef_cbrt_u10_sve2048_f32\n#define xexp nsimd_sleef_exp_u10_sve2048_f64\n#define xexpf nsimd_sleef_exp_u10_sve2048_f32\n#define xpow nsimd_sleef_pow_u10_sve2048_f64\n#define xpowf nsimd_sleef_pow_u10_sve2048_f32\n#define xsinh nsimd_sleef_sinh_u10_sve2048_f64\n#define xsinhf nsimd_sleef_sinh_u10_sve2048_f32\n#define xcosh nsimd_sleef_cosh_u10_sve2048_f64\n#define xcoshf nsimd_sleef_cosh_u10_sve2048_f32\n#define xtanh nsimd_sleef_tanh_u10_sve2048_f64\n#define xtanhf nsimd_sleef_tanh_u10_sve2048_f32\n#define xsinh_u35 nsimd_sleef_sinh_u35_sve2048_f64\n#define xsinhf_u35 nsimd_sleef_sinh_u35_sve2048_f32\n#define xcosh_u35 
nsimd_sleef_cosh_u35_sve2048_f64\n#define xcoshf_u35 nsimd_sleef_cosh_u35_sve2048_f32\n#define xtanh_u35 nsimd_sleef_tanh_u35_sve2048_f64\n#define xtanhf_u35 nsimd_sleef_tanh_u35_sve2048_f32\n#define xfastsin_u3500 nsimd_sleef_fastsin_u3500_sve2048_f64\n#define xfastsinf_u3500 nsimd_sleef_fastsin_u3500_sve2048_f32\n#define xfastcos_u3500 nsimd_sleef_fastcos_u3500_sve2048_f64\n#define xfastcosf_u3500 nsimd_sleef_fastcos_u3500_sve2048_f32\n#define xfastpow_u3500 nsimd_sleef_fastpow_u3500_sve2048_f64\n#define xfastpowf_u3500 nsimd_sleef_fastpow_u3500_sve2048_f32\n#define xasinh nsimd_sleef_asinh_u10_sve2048_f64\n#define xasinhf nsimd_sleef_asinh_u10_sve2048_f32\n#define xacosh nsimd_sleef_acosh_u10_sve2048_f64\n#define xacoshf nsimd_sleef_acosh_u10_sve2048_f32\n#define xatanh nsimd_sleef_atanh_u10_sve2048_f64\n#define xatanhf nsimd_sleef_atanh_u10_sve2048_f32\n#define xexp2 nsimd_sleef_exp2_u10_sve2048_f64\n#define xexp2f nsimd_sleef_exp2_u10_sve2048_f32\n#define xexp2_u35 nsimd_sleef_exp2_u35_sve2048_f64\n#define xexp2f_u35 nsimd_sleef_exp2_u35_sve2048_f32\n#define xexp10 nsimd_sleef_exp10_u10_sve2048_f64\n#define xexp10f nsimd_sleef_exp10_u10_sve2048_f32\n#define xexp10_u35 nsimd_sleef_exp10_u35_sve2048_f64\n#define xexp10f_u35 nsimd_sleef_exp10_u35_sve2048_f32\n#define xexpm1 nsimd_sleef_expm1_u10_sve2048_f64\n#define xexpm1f nsimd_sleef_expm1_u10_sve2048_f32\n#define xlog10 nsimd_sleef_log10_u10_sve2048_f64\n#define xlog10f nsimd_sleef_log10_u10_sve2048_f32\n#define xlog2 nsimd_sleef_log2_u10_sve2048_f64\n#define xlog2f nsimd_sleef_log2_u10_sve2048_f32\n#define xlog2_u35 nsimd_sleef_log2_u35_sve2048_f64\n#define xlog2f_u35 nsimd_sleef_log2_u35_sve2048_f32\n#define xlog1p nsimd_sleef_log1p_u10_sve2048_f64\n#define xlog1pf nsimd_sleef_log1p_u10_sve2048_f32\n#define xsincospi_u05 nsimd_sleef_sincospi_u05_sve2048_f64\n#define xsincospif_u05 nsimd_sleef_sincospi_u05_sve2048_f32\n#define xsincospi_u35 nsimd_sleef_sincospi_u35_sve2048_f64\n#define xsincospif_u35 
nsimd_sleef_sincospi_u35_sve2048_f32\n#define xsinpi_u05 nsimd_sleef_sinpi_u05_sve2048_f64\n#define xsinpif_u05 nsimd_sleef_sinpi_u05_sve2048_f32\n#define xcospi_u05 nsimd_sleef_cospi_u05_sve2048_f64\n#define xcospif_u05 nsimd_sleef_cospi_u05_sve2048_f32\n#define xldexp nsimd_sleef_ldexp_sve2048_f64\n#define xldexpf nsimd_sleef_ldexp_sve2048_f32\n#define xilogb nsimd_sleef_ilogb_sve2048_f64\n#define xilogbf nsimd_sleef_ilogb_sve2048_f32\n#define xfma nsimd_sleef_fma_sve2048_f64\n#define xfmaf nsimd_sleef_fma_sve2048_f32\n#define xsqrt nsimd_sleef_sqrt_sve2048_f64\n#define xsqrtf nsimd_sleef_sqrt_sve2048_f32\n#define xsqrt_u05 nsimd_sleef_sqrt_u05_sve2048_f64\n#define xsqrtf_u05 nsimd_sleef_sqrt_u05_sve2048_f32\n#define xsqrt_u35 nsimd_sleef_sqrt_u35_sve2048_f64\n#define xsqrtf_u35 nsimd_sleef_sqrt_u35_sve2048_f32\n#define xhypot_u05 nsimd_sleef_hypot_u05_sve2048_f64\n#define xhypotf_u05 nsimd_sleef_hypot_u05_sve2048_f32\n#define xhypot_u35 nsimd_sleef_hypot_u35_sve2048_f64\n#define xhypotf_u35 nsimd_sleef_hypot_u35_sve2048_f32\n#define xfabs nsimd_sleef_fabs_sve2048_f64\n#define xfabsf nsimd_sleef_fabs_sve2048_f32\n#define xcopysign nsimd_sleef_copysign_sve2048_f64\n#define xcopysignf nsimd_sleef_copysign_sve2048_f32\n#define xfmax nsimd_sleef_fmax_sve2048_f64\n#define xfmaxf nsimd_sleef_fmax_sve2048_f32\n#define xfmin nsimd_sleef_fmin_sve2048_f64\n#define xfminf nsimd_sleef_fmin_sve2048_f32\n#define xfdim nsimd_sleef_fdim_sve2048_f64\n#define xfdimf nsimd_sleef_fdim_sve2048_f32\n#define xtrunc nsimd_sleef_trunc_sve2048_f64\n#define xtruncf nsimd_sleef_trunc_sve2048_f32\n#define xfloor nsimd_sleef_floor_sve2048_f64\n#define xfloorf nsimd_sleef_floor_sve2048_f32\n#define xceil nsimd_sleef_ceil_sve2048_f64\n#define xceilf nsimd_sleef_ceil_sve2048_f32\n#define xround nsimd_sleef_round_sve2048_f64\n#define xroundf nsimd_sleef_round_sve2048_f32\n#define xrint nsimd_sleef_rint_sve2048_f64\n#define xrintf nsimd_sleef_rint_sve2048_f32\n#define xnextafter 
nsimd_sleef_nextafter_sve2048_f64\n#define xnextafterf nsimd_sleef_nextafter_sve2048_f32\n#define xfrfrexp nsimd_sleef_frfrexp_sve2048_f64\n#define xfrfrexpf nsimd_sleef_frfrexp_sve2048_f32\n#define xexpfrexp nsimd_sleef_expfrexp_sve2048_f64\n#define xexpfrexpf nsimd_sleef_expfrexp_sve2048_f32\n#define xfmod nsimd_sleef_fmod_sve2048_f64\n#define xfmodf nsimd_sleef_fmod_sve2048_f32\n#define xremainder nsimd_sleef_remainder_sve2048_f64\n#define xremainderf nsimd_sleef_remainder_sve2048_f32\n#define xmodf nsimd_sleef_modf_sve2048_f64\n#define xmodff nsimd_sleef_modf_sve2048_f32\n#define xlgamma_u1 nsimd_sleef_lgamma_u10_sve2048_f64\n#define xlgammaf_u1 nsimd_sleef_lgamma_u10_sve2048_f32\n#define xtgamma_u1 nsimd_sleef_tgamma_u10_sve2048_f64\n#define xtgammaf_u1 nsimd_sleef_tgamma_u10_sve2048_f32\n#define xerf_u1 nsimd_sleef_erf_u10_sve2048_f64\n#define xerff_u1 nsimd_sleef_erf_u10_sve2048_f32\n#define xerfc_u15 nsimd_sleef_erfc_u15_sve2048_f64\n#define xerfcf_u15 nsimd_sleef_erfc_u15_sve2048_f32\n#define xgetInt nsimd_sleef_getInt_sve2048_f64\n#define xgetIntf nsimd_sleef_getInt_sve2048_f32\n#define xgetPtr nsimd_sleef_getPtr_sve2048_f64\n#define xgetPtrf nsimd_sleef_getPtr_sve2048_f32\n\n                   #endif\n\n                   #define rempi nsimd_sleef_rempi_sve2048\n                   #define rempif nsimd_sleef_rempif_sve2048\n                   #define rempisub nsimd_sleef_rempisub_sve2048\n                   #define rempisubf nsimd_sleef_rempisubf_sve2048\n                   #define gammak nsimd_gammak_sve2048\n                   #define gammafk nsimd_gammafk_sve2048\n\n                   #endif\n\n                   \n\n#endif\n\n"
  },
  {
    "path": "src/renamevsx.h",
    "content": "#ifndef RENAMEVSX_H\n               #define RENAMEVSX_H\n\n               /* ------------------------------------------------------------------------- */\n                   /* Naming of functions vmx */\n\n                   #ifdef NSIMD_VMX\n\n                   #ifdef DETERMINISTIC\n\n                   #define xsin nsimd_sleef_sin_u35d_vmx_f64\n#define xsinf nsimd_sleef_sin_u35d_vmx_f32\n#define xcos nsimd_sleef_cos_u35d_vmx_f64\n#define xcosf nsimd_sleef_cos_u35d_vmx_f32\n#define xsincos nsimd_sleef_sincos_u35d_vmx_f64\n#define xsincosf nsimd_sleef_sincos_u35d_vmx_f32\n#define xtan nsimd_sleef_tan_u35d_vmx_f64\n#define xtanf nsimd_sleef_tan_u35d_vmx_f32\n#define xasin nsimd_sleef_asin_u35d_vmx_f64\n#define xasinf nsimd_sleef_asin_u35d_vmx_f32\n#define xacos nsimd_sleef_acos_u35d_vmx_f64\n#define xacosf nsimd_sleef_acos_u35d_vmx_f32\n#define xatan nsimd_sleef_atan_u35d_vmx_f64\n#define xatanf nsimd_sleef_atan_u35d_vmx_f32\n#define xatan2 nsimd_sleef_atan2_u35d_vmx_f64\n#define xatan2f nsimd_sleef_atan2_u35d_vmx_f32\n#define xlog nsimd_sleef_log_u35d_vmx_f64\n#define xlogf nsimd_sleef_log_u35d_vmx_f32\n#define xcbrt nsimd_sleef_cbrt_u35d_vmx_f64\n#define xcbrtf nsimd_sleef_cbrt_u35d_vmx_f32\n#define xsin_u1 nsimd_sleef_sin_u10d_vmx_f64\n#define xsinf_u1 nsimd_sleef_sin_u10d_vmx_f32\n#define xcos_u1 nsimd_sleef_cos_u10d_vmx_f64\n#define xcosf_u1 nsimd_sleef_cos_u10d_vmx_f32\n#define xsincos_u1 nsimd_sleef_sincos_u10d_vmx_f64\n#define xsincosf_u1 nsimd_sleef_sincos_u10d_vmx_f32\n#define xtan_u1 nsimd_sleef_tan_u10d_vmx_f64\n#define xtanf_u1 nsimd_sleef_tan_u10d_vmx_f32\n#define xasin_u1 nsimd_sleef_asin_u10d_vmx_f64\n#define xasinf_u1 nsimd_sleef_asin_u10d_vmx_f32\n#define xacos_u1 nsimd_sleef_acos_u10d_vmx_f64\n#define xacosf_u1 nsimd_sleef_acos_u10d_vmx_f32\n#define xatan_u1 nsimd_sleef_atan_u10d_vmx_f64\n#define xatanf_u1 nsimd_sleef_atan_u10d_vmx_f32\n#define xatan2_u1 nsimd_sleef_atan2_u10d_vmx_f64\n#define xatan2f_u1 
nsimd_sleef_atan2_u10d_vmx_f32\n#define xlog_u1 nsimd_sleef_log_u10d_vmx_f64\n#define xlogf_u1 nsimd_sleef_log_u10d_vmx_f32\n#define xcbrt_u1 nsimd_sleef_cbrt_u10d_vmx_f64\n#define xcbrtf_u1 nsimd_sleef_cbrt_u10d_vmx_f32\n#define xexp nsimd_sleef_exp_u10d_vmx_f64\n#define xexpf nsimd_sleef_exp_u10d_vmx_f32\n#define xpow nsimd_sleef_pow_u10d_vmx_f64\n#define xpowf nsimd_sleef_pow_u10d_vmx_f32\n#define xsinh nsimd_sleef_sinh_u10d_vmx_f64\n#define xsinhf nsimd_sleef_sinh_u10d_vmx_f32\n#define xcosh nsimd_sleef_cosh_u10d_vmx_f64\n#define xcoshf nsimd_sleef_cosh_u10d_vmx_f32\n#define xtanh nsimd_sleef_tanh_u10d_vmx_f64\n#define xtanhf nsimd_sleef_tanh_u10d_vmx_f32\n#define xsinh_u35 nsimd_sleef_sinh_u35d_vmx_f64\n#define xsinhf_u35 nsimd_sleef_sinh_u35d_vmx_f32\n#define xcosh_u35 nsimd_sleef_cosh_u35d_vmx_f64\n#define xcoshf_u35 nsimd_sleef_cosh_u35d_vmx_f32\n#define xtanh_u35 nsimd_sleef_tanh_u35d_vmx_f64\n#define xtanhf_u35 nsimd_sleef_tanh_u35d_vmx_f32\n#define xfastsin_u3500 nsimd_sleef_fastsin_u3500d_vmx_f64\n#define xfastsinf_u3500 nsimd_sleef_fastsin_u3500d_vmx_f32\n#define xfastcos_u3500 nsimd_sleef_fastcos_u3500d_vmx_f64\n#define xfastcosf_u3500 nsimd_sleef_fastcos_u3500d_vmx_f32\n#define xfastpow_u3500 nsimd_sleef_fastpow_u3500d_vmx_f64\n#define xfastpowf_u3500 nsimd_sleef_fastpow_u3500d_vmx_f32\n#define xasinh nsimd_sleef_asinh_u10d_vmx_f64\n#define xasinhf nsimd_sleef_asinh_u10d_vmx_f32\n#define xacosh nsimd_sleef_acosh_u10d_vmx_f64\n#define xacoshf nsimd_sleef_acosh_u10d_vmx_f32\n#define xatanh nsimd_sleef_atanh_u10d_vmx_f64\n#define xatanhf nsimd_sleef_atanh_u10d_vmx_f32\n#define xexp2 nsimd_sleef_exp2_u10d_vmx_f64\n#define xexp2f nsimd_sleef_exp2_u10d_vmx_f32\n#define xexp2_u35 nsimd_sleef_exp2_u35d_vmx_f64\n#define xexp2f_u35 nsimd_sleef_exp2_u35d_vmx_f32\n#define xexp10 nsimd_sleef_exp10_u10d_vmx_f64\n#define xexp10f nsimd_sleef_exp10_u10d_vmx_f32\n#define xexp10_u35 nsimd_sleef_exp10_u35d_vmx_f64\n#define xexp10f_u35 
nsimd_sleef_exp10_u35d_vmx_f32\n#define xexpm1 nsimd_sleef_expm1_u10d_vmx_f64\n#define xexpm1f nsimd_sleef_expm1_u10d_vmx_f32\n#define xlog10 nsimd_sleef_log10_u10d_vmx_f64\n#define xlog10f nsimd_sleef_log10_u10d_vmx_f32\n#define xlog2 nsimd_sleef_log2_u10d_vmx_f64\n#define xlog2f nsimd_sleef_log2_u10d_vmx_f32\n#define xlog2_u35 nsimd_sleef_log2_u35d_vmx_f64\n#define xlog2f_u35 nsimd_sleef_log2_u35d_vmx_f32\n#define xlog1p nsimd_sleef_log1p_u10d_vmx_f64\n#define xlog1pf nsimd_sleef_log1p_u10d_vmx_f32\n#define xsincospi_u05 nsimd_sleef_sincospi_u05d_vmx_f64\n#define xsincospif_u05 nsimd_sleef_sincospi_u05d_vmx_f32\n#define xsincospi_u35 nsimd_sleef_sincospi_u35d_vmx_f64\n#define xsincospif_u35 nsimd_sleef_sincospi_u35d_vmx_f32\n#define xsinpi_u05 nsimd_sleef_sinpi_u05d_vmx_f64\n#define xsinpif_u05 nsimd_sleef_sinpi_u05d_vmx_f32\n#define xcospi_u05 nsimd_sleef_cospi_u05d_vmx_f64\n#define xcospif_u05 nsimd_sleef_cospi_u05d_vmx_f32\n#define xldexp nsimd_sleef_ldexp_vmx_f64\n#define xldexpf nsimd_sleef_ldexp_vmx_f32\n#define xilogb nsimd_sleef_ilogb_vmx_f64\n#define xilogbf nsimd_sleef_ilogb_vmx_f32\n#define xfma nsimd_sleef_fma_vmx_f64\n#define xfmaf nsimd_sleef_fma_vmx_f32\n#define xsqrt nsimd_sleef_sqrt_vmx_f64\n#define xsqrtf nsimd_sleef_sqrt_vmx_f32\n#define xsqrt_u05 nsimd_sleef_sqrt_u05d_vmx_f64\n#define xsqrtf_u05 nsimd_sleef_sqrt_u05d_vmx_f32\n#define xsqrt_u35 nsimd_sleef_sqrt_u35d_vmx_f64\n#define xsqrtf_u35 nsimd_sleef_sqrt_u35d_vmx_f32\n#define xhypot_u05 nsimd_sleef_hypot_u05d_vmx_f64\n#define xhypotf_u05 nsimd_sleef_hypot_u05d_vmx_f32\n#define xhypot_u35 nsimd_sleef_hypot_u35d_vmx_f64\n#define xhypotf_u35 nsimd_sleef_hypot_u35d_vmx_f32\n#define xfabs nsimd_sleef_fabs_vmx_f64\n#define xfabsf nsimd_sleef_fabs_vmx_f32\n#define xcopysign nsimd_sleef_copysign_vmx_f64\n#define xcopysignf nsimd_sleef_copysign_vmx_f32\n#define xfmax nsimd_sleef_fmax_vmx_f64\n#define xfmaxf nsimd_sleef_fmax_vmx_f32\n#define xfmin nsimd_sleef_fmin_vmx_f64\n#define xfminf 
nsimd_sleef_fmin_vmx_f32\n#define xfdim nsimd_sleef_fdim_vmx_f64\n#define xfdimf nsimd_sleef_fdim_vmx_f32\n#define xtrunc nsimd_sleef_trunc_vmx_f64\n#define xtruncf nsimd_sleef_trunc_vmx_f32\n#define xfloor nsimd_sleef_floor_vmx_f64\n#define xfloorf nsimd_sleef_floor_vmx_f32\n#define xceil nsimd_sleef_ceil_vmx_f64\n#define xceilf nsimd_sleef_ceil_vmx_f32\n#define xround nsimd_sleef_round_vmx_f64\n#define xroundf nsimd_sleef_round_vmx_f32\n#define xrint nsimd_sleef_rint_vmx_f64\n#define xrintf nsimd_sleef_rint_vmx_f32\n#define xnextafter nsimd_sleef_nextafter_vmx_f64\n#define xnextafterf nsimd_sleef_nextafter_vmx_f32\n#define xfrfrexp nsimd_sleef_frfrexp_vmx_f64\n#define xfrfrexpf nsimd_sleef_frfrexp_vmx_f32\n#define xexpfrexp nsimd_sleef_expfrexp_vmx_f64\n#define xexpfrexpf nsimd_sleef_expfrexp_vmx_f32\n#define xfmod nsimd_sleef_fmod_vmx_f64\n#define xfmodf nsimd_sleef_fmod_vmx_f32\n#define xremainder nsimd_sleef_remainder_vmx_f64\n#define xremainderf nsimd_sleef_remainder_vmx_f32\n#define xmodf nsimd_sleef_modf_vmx_f64\n#define xmodff nsimd_sleef_modf_vmx_f32\n#define xlgamma_u1 nsimd_sleef_lgamma_u10d_vmx_f64\n#define xlgammaf_u1 nsimd_sleef_lgamma_u10d_vmx_f32\n#define xtgamma_u1 nsimd_sleef_tgamma_u10d_vmx_f64\n#define xtgammaf_u1 nsimd_sleef_tgamma_u10d_vmx_f32\n#define xerf_u1 nsimd_sleef_erf_u10d_vmx_f64\n#define xerff_u1 nsimd_sleef_erf_u10d_vmx_f32\n#define xerfc_u15 nsimd_sleef_erfc_u15d_vmx_f64\n#define xerfcf_u15 nsimd_sleef_erfc_u15d_vmx_f32\n#define xgetInt nsimd_sleef_getInt_vmx_f64\n#define xgetIntf nsimd_sleef_getInt_vmx_f32\n#define xgetPtr nsimd_sleef_getPtr_vmx_f64\n#define xgetPtrf nsimd_sleef_getPtr_vmx_f32\n\n                   #else\n\n                   #define xsin nsimd_sleef_sin_u35_vmx_f64\n#define xsinf nsimd_sleef_sin_u35_vmx_f32\n#define xcos nsimd_sleef_cos_u35_vmx_f64\n#define xcosf nsimd_sleef_cos_u35_vmx_f32\n#define xsincos nsimd_sleef_sincos_u35_vmx_f64\n#define xsincosf nsimd_sleef_sincos_u35_vmx_f32\n#define xtan 
nsimd_sleef_tan_u35_vmx_f64\n#define xtanf nsimd_sleef_tan_u35_vmx_f32\n#define xasin nsimd_sleef_asin_u35_vmx_f64\n#define xasinf nsimd_sleef_asin_u35_vmx_f32\n#define xacos nsimd_sleef_acos_u35_vmx_f64\n#define xacosf nsimd_sleef_acos_u35_vmx_f32\n#define xatan nsimd_sleef_atan_u35_vmx_f64\n#define xatanf nsimd_sleef_atan_u35_vmx_f32\n#define xatan2 nsimd_sleef_atan2_u35_vmx_f64\n#define xatan2f nsimd_sleef_atan2_u35_vmx_f32\n#define xlog nsimd_sleef_log_u35_vmx_f64\n#define xlogf nsimd_sleef_log_u35_vmx_f32\n#define xcbrt nsimd_sleef_cbrt_u35_vmx_f64\n#define xcbrtf nsimd_sleef_cbrt_u35_vmx_f32\n#define xsin_u1 nsimd_sleef_sin_u10_vmx_f64\n#define xsinf_u1 nsimd_sleef_sin_u10_vmx_f32\n#define xcos_u1 nsimd_sleef_cos_u10_vmx_f64\n#define xcosf_u1 nsimd_sleef_cos_u10_vmx_f32\n#define xsincos_u1 nsimd_sleef_sincos_u10_vmx_f64\n#define xsincosf_u1 nsimd_sleef_sincos_u10_vmx_f32\n#define xtan_u1 nsimd_sleef_tan_u10_vmx_f64\n#define xtanf_u1 nsimd_sleef_tan_u10_vmx_f32\n#define xasin_u1 nsimd_sleef_asin_u10_vmx_f64\n#define xasinf_u1 nsimd_sleef_asin_u10_vmx_f32\n#define xacos_u1 nsimd_sleef_acos_u10_vmx_f64\n#define xacosf_u1 nsimd_sleef_acos_u10_vmx_f32\n#define xatan_u1 nsimd_sleef_atan_u10_vmx_f64\n#define xatanf_u1 nsimd_sleef_atan_u10_vmx_f32\n#define xatan2_u1 nsimd_sleef_atan2_u10_vmx_f64\n#define xatan2f_u1 nsimd_sleef_atan2_u10_vmx_f32\n#define xlog_u1 nsimd_sleef_log_u10_vmx_f64\n#define xlogf_u1 nsimd_sleef_log_u10_vmx_f32\n#define xcbrt_u1 nsimd_sleef_cbrt_u10_vmx_f64\n#define xcbrtf_u1 nsimd_sleef_cbrt_u10_vmx_f32\n#define xexp nsimd_sleef_exp_u10_vmx_f64\n#define xexpf nsimd_sleef_exp_u10_vmx_f32\n#define xpow nsimd_sleef_pow_u10_vmx_f64\n#define xpowf nsimd_sleef_pow_u10_vmx_f32\n#define xsinh nsimd_sleef_sinh_u10_vmx_f64\n#define xsinhf nsimd_sleef_sinh_u10_vmx_f32\n#define xcosh nsimd_sleef_cosh_u10_vmx_f64\n#define xcoshf nsimd_sleef_cosh_u10_vmx_f32\n#define xtanh nsimd_sleef_tanh_u10_vmx_f64\n#define xtanhf nsimd_sleef_tanh_u10_vmx_f32\n#define 
xsinh_u35 nsimd_sleef_sinh_u35_vmx_f64\n#define xsinhf_u35 nsimd_sleef_sinh_u35_vmx_f32\n#define xcosh_u35 nsimd_sleef_cosh_u35_vmx_f64\n#define xcoshf_u35 nsimd_sleef_cosh_u35_vmx_f32\n#define xtanh_u35 nsimd_sleef_tanh_u35_vmx_f64\n#define xtanhf_u35 nsimd_sleef_tanh_u35_vmx_f32\n#define xfastsin_u3500 nsimd_sleef_fastsin_u3500_vmx_f64\n#define xfastsinf_u3500 nsimd_sleef_fastsin_u3500_vmx_f32\n#define xfastcos_u3500 nsimd_sleef_fastcos_u3500_vmx_f64\n#define xfastcosf_u3500 nsimd_sleef_fastcos_u3500_vmx_f32\n#define xfastpow_u3500 nsimd_sleef_fastpow_u3500_vmx_f64\n#define xfastpowf_u3500 nsimd_sleef_fastpow_u3500_vmx_f32\n#define xasinh nsimd_sleef_asinh_u10_vmx_f64\n#define xasinhf nsimd_sleef_asinh_u10_vmx_f32\n#define xacosh nsimd_sleef_acosh_u10_vmx_f64\n#define xacoshf nsimd_sleef_acosh_u10_vmx_f32\n#define xatanh nsimd_sleef_atanh_u10_vmx_f64\n#define xatanhf nsimd_sleef_atanh_u10_vmx_f32\n#define xexp2 nsimd_sleef_exp2_u10_vmx_f64\n#define xexp2f nsimd_sleef_exp2_u10_vmx_f32\n#define xexp2_u35 nsimd_sleef_exp2_u35_vmx_f64\n#define xexp2f_u35 nsimd_sleef_exp2_u35_vmx_f32\n#define xexp10 nsimd_sleef_exp10_u10_vmx_f64\n#define xexp10f nsimd_sleef_exp10_u10_vmx_f32\n#define xexp10_u35 nsimd_sleef_exp10_u35_vmx_f64\n#define xexp10f_u35 nsimd_sleef_exp10_u35_vmx_f32\n#define xexpm1 nsimd_sleef_expm1_u10_vmx_f64\n#define xexpm1f nsimd_sleef_expm1_u10_vmx_f32\n#define xlog10 nsimd_sleef_log10_u10_vmx_f64\n#define xlog10f nsimd_sleef_log10_u10_vmx_f32\n#define xlog2 nsimd_sleef_log2_u10_vmx_f64\n#define xlog2f nsimd_sleef_log2_u10_vmx_f32\n#define xlog2_u35 nsimd_sleef_log2_u35_vmx_f64\n#define xlog2f_u35 nsimd_sleef_log2_u35_vmx_f32\n#define xlog1p nsimd_sleef_log1p_u10_vmx_f64\n#define xlog1pf nsimd_sleef_log1p_u10_vmx_f32\n#define xsincospi_u05 nsimd_sleef_sincospi_u05_vmx_f64\n#define xsincospif_u05 nsimd_sleef_sincospi_u05_vmx_f32\n#define xsincospi_u35 nsimd_sleef_sincospi_u35_vmx_f64\n#define xsincospif_u35 nsimd_sleef_sincospi_u35_vmx_f32\n#define 
xsinpi_u05 nsimd_sleef_sinpi_u05_vmx_f64\n#define xsinpif_u05 nsimd_sleef_sinpi_u05_vmx_f32\n#define xcospi_u05 nsimd_sleef_cospi_u05_vmx_f64\n#define xcospif_u05 nsimd_sleef_cospi_u05_vmx_f32\n#define xldexp nsimd_sleef_ldexp_vmx_f64\n#define xldexpf nsimd_sleef_ldexp_vmx_f32\n#define xilogb nsimd_sleef_ilogb_vmx_f64\n#define xilogbf nsimd_sleef_ilogb_vmx_f32\n#define xfma nsimd_sleef_fma_vmx_f64\n#define xfmaf nsimd_sleef_fma_vmx_f32\n#define xsqrt nsimd_sleef_sqrt_vmx_f64\n#define xsqrtf nsimd_sleef_sqrt_vmx_f32\n#define xsqrt_u05 nsimd_sleef_sqrt_u05_vmx_f64\n#define xsqrtf_u05 nsimd_sleef_sqrt_u05_vmx_f32\n#define xsqrt_u35 nsimd_sleef_sqrt_u35_vmx_f64\n#define xsqrtf_u35 nsimd_sleef_sqrt_u35_vmx_f32\n#define xhypot_u05 nsimd_sleef_hypot_u05_vmx_f64\n#define xhypotf_u05 nsimd_sleef_hypot_u05_vmx_f32\n#define xhypot_u35 nsimd_sleef_hypot_u35_vmx_f64\n#define xhypotf_u35 nsimd_sleef_hypot_u35_vmx_f32\n#define xfabs nsimd_sleef_fabs_vmx_f64\n#define xfabsf nsimd_sleef_fabs_vmx_f32\n#define xcopysign nsimd_sleef_copysign_vmx_f64\n#define xcopysignf nsimd_sleef_copysign_vmx_f32\n#define xfmax nsimd_sleef_fmax_vmx_f64\n#define xfmaxf nsimd_sleef_fmax_vmx_f32\n#define xfmin nsimd_sleef_fmin_vmx_f64\n#define xfminf nsimd_sleef_fmin_vmx_f32\n#define xfdim nsimd_sleef_fdim_vmx_f64\n#define xfdimf nsimd_sleef_fdim_vmx_f32\n#define xtrunc nsimd_sleef_trunc_vmx_f64\n#define xtruncf nsimd_sleef_trunc_vmx_f32\n#define xfloor nsimd_sleef_floor_vmx_f64\n#define xfloorf nsimd_sleef_floor_vmx_f32\n#define xceil nsimd_sleef_ceil_vmx_f64\n#define xceilf nsimd_sleef_ceil_vmx_f32\n#define xround nsimd_sleef_round_vmx_f64\n#define xroundf nsimd_sleef_round_vmx_f32\n#define xrint nsimd_sleef_rint_vmx_f64\n#define xrintf nsimd_sleef_rint_vmx_f32\n#define xnextafter nsimd_sleef_nextafter_vmx_f64\n#define xnextafterf nsimd_sleef_nextafter_vmx_f32\n#define xfrfrexp nsimd_sleef_frfrexp_vmx_f64\n#define xfrfrexpf nsimd_sleef_frfrexp_vmx_f32\n#define xexpfrexp 
nsimd_sleef_expfrexp_vmx_f64\n#define xexpfrexpf nsimd_sleef_expfrexp_vmx_f32\n#define xfmod nsimd_sleef_fmod_vmx_f64\n#define xfmodf nsimd_sleef_fmod_vmx_f32\n#define xremainder nsimd_sleef_remainder_vmx_f64\n#define xremainderf nsimd_sleef_remainder_vmx_f32\n#define xmodf nsimd_sleef_modf_vmx_f64\n#define xmodff nsimd_sleef_modf_vmx_f32\n#define xlgamma_u1 nsimd_sleef_lgamma_u10_vmx_f64\n#define xlgammaf_u1 nsimd_sleef_lgamma_u10_vmx_f32\n#define xtgamma_u1 nsimd_sleef_tgamma_u10_vmx_f64\n#define xtgammaf_u1 nsimd_sleef_tgamma_u10_vmx_f32\n#define xerf_u1 nsimd_sleef_erf_u10_vmx_f64\n#define xerff_u1 nsimd_sleef_erf_u10_vmx_f32\n#define xerfc_u15 nsimd_sleef_erfc_u15_vmx_f64\n#define xerfcf_u15 nsimd_sleef_erfc_u15_vmx_f32\n#define xgetInt nsimd_sleef_getInt_vmx_f64\n#define xgetIntf nsimd_sleef_getInt_vmx_f32\n#define xgetPtr nsimd_sleef_getPtr_vmx_f64\n#define xgetPtrf nsimd_sleef_getPtr_vmx_f32\n\n                   #endif\n\n                   #define rempi nsimd_sleef_rempi_vmx\n                   #define rempif nsimd_sleef_rempif_vmx\n                   #define rempisub nsimd_sleef_rempisub_vmx\n                   #define rempisubf nsimd_sleef_rempisubf_vmx\n                   #define gammak nsimd_gammak_vmx\n                   #define gammafk nsimd_gammafk_vmx\n\n                   #endif\n\n                   /* ------------------------------------------------------------------------- */\n                   /* Naming of functions vsx */\n\n                   #ifdef NSIMD_VSX\n\n                   #ifdef DETERMINISTIC\n\n                   #define xsin nsimd_sleef_sin_u35d_vsx_f64\n#define xsinf nsimd_sleef_sin_u35d_vsx_f32\n#define xcos nsimd_sleef_cos_u35d_vsx_f64\n#define xcosf nsimd_sleef_cos_u35d_vsx_f32\n#define xsincos nsimd_sleef_sincos_u35d_vsx_f64\n#define xsincosf nsimd_sleef_sincos_u35d_vsx_f32\n#define xtan nsimd_sleef_tan_u35d_vsx_f64\n#define xtanf nsimd_sleef_tan_u35d_vsx_f32\n#define xasin nsimd_sleef_asin_u35d_vsx_f64\n#define xasinf 
nsimd_sleef_asin_u35d_vsx_f32\n#define xacos nsimd_sleef_acos_u35d_vsx_f64\n#define xacosf nsimd_sleef_acos_u35d_vsx_f32\n#define xatan nsimd_sleef_atan_u35d_vsx_f64\n#define xatanf nsimd_sleef_atan_u35d_vsx_f32\n#define xatan2 nsimd_sleef_atan2_u35d_vsx_f64\n#define xatan2f nsimd_sleef_atan2_u35d_vsx_f32\n#define xlog nsimd_sleef_log_u35d_vsx_f64\n#define xlogf nsimd_sleef_log_u35d_vsx_f32\n#define xcbrt nsimd_sleef_cbrt_u35d_vsx_f64\n#define xcbrtf nsimd_sleef_cbrt_u35d_vsx_f32\n#define xsin_u1 nsimd_sleef_sin_u10d_vsx_f64\n#define xsinf_u1 nsimd_sleef_sin_u10d_vsx_f32\n#define xcos_u1 nsimd_sleef_cos_u10d_vsx_f64\n#define xcosf_u1 nsimd_sleef_cos_u10d_vsx_f32\n#define xsincos_u1 nsimd_sleef_sincos_u10d_vsx_f64\n#define xsincosf_u1 nsimd_sleef_sincos_u10d_vsx_f32\n#define xtan_u1 nsimd_sleef_tan_u10d_vsx_f64\n#define xtanf_u1 nsimd_sleef_tan_u10d_vsx_f32\n#define xasin_u1 nsimd_sleef_asin_u10d_vsx_f64\n#define xasinf_u1 nsimd_sleef_asin_u10d_vsx_f32\n#define xacos_u1 nsimd_sleef_acos_u10d_vsx_f64\n#define xacosf_u1 nsimd_sleef_acos_u10d_vsx_f32\n#define xatan_u1 nsimd_sleef_atan_u10d_vsx_f64\n#define xatanf_u1 nsimd_sleef_atan_u10d_vsx_f32\n#define xatan2_u1 nsimd_sleef_atan2_u10d_vsx_f64\n#define xatan2f_u1 nsimd_sleef_atan2_u10d_vsx_f32\n#define xlog_u1 nsimd_sleef_log_u10d_vsx_f64\n#define xlogf_u1 nsimd_sleef_log_u10d_vsx_f32\n#define xcbrt_u1 nsimd_sleef_cbrt_u10d_vsx_f64\n#define xcbrtf_u1 nsimd_sleef_cbrt_u10d_vsx_f32\n#define xexp nsimd_sleef_exp_u10d_vsx_f64\n#define xexpf nsimd_sleef_exp_u10d_vsx_f32\n#define xpow nsimd_sleef_pow_u10d_vsx_f64\n#define xpowf nsimd_sleef_pow_u10d_vsx_f32\n#define xsinh nsimd_sleef_sinh_u10d_vsx_f64\n#define xsinhf nsimd_sleef_sinh_u10d_vsx_f32\n#define xcosh nsimd_sleef_cosh_u10d_vsx_f64\n#define xcoshf nsimd_sleef_cosh_u10d_vsx_f32\n#define xtanh nsimd_sleef_tanh_u10d_vsx_f64\n#define xtanhf nsimd_sleef_tanh_u10d_vsx_f32\n#define xsinh_u35 nsimd_sleef_sinh_u35d_vsx_f64\n#define xsinhf_u35 
nsimd_sleef_sinh_u35d_vsx_f32\n#define xcosh_u35 nsimd_sleef_cosh_u35d_vsx_f64\n#define xcoshf_u35 nsimd_sleef_cosh_u35d_vsx_f32\n#define xtanh_u35 nsimd_sleef_tanh_u35d_vsx_f64\n#define xtanhf_u35 nsimd_sleef_tanh_u35d_vsx_f32\n#define xfastsin_u3500 nsimd_sleef_fastsin_u3500d_vsx_f64\n#define xfastsinf_u3500 nsimd_sleef_fastsin_u3500d_vsx_f32\n#define xfastcos_u3500 nsimd_sleef_fastcos_u3500d_vsx_f64\n#define xfastcosf_u3500 nsimd_sleef_fastcos_u3500d_vsx_f32\n#define xfastpow_u3500 nsimd_sleef_fastpow_u3500d_vsx_f64\n#define xfastpowf_u3500 nsimd_sleef_fastpow_u3500d_vsx_f32\n#define xasinh nsimd_sleef_asinh_u10d_vsx_f64\n#define xasinhf nsimd_sleef_asinh_u10d_vsx_f32\n#define xacosh nsimd_sleef_acosh_u10d_vsx_f64\n#define xacoshf nsimd_sleef_acosh_u10d_vsx_f32\n#define xatanh nsimd_sleef_atanh_u10d_vsx_f64\n#define xatanhf nsimd_sleef_atanh_u10d_vsx_f32\n#define xexp2 nsimd_sleef_exp2_u10d_vsx_f64\n#define xexp2f nsimd_sleef_exp2_u10d_vsx_f32\n#define xexp2_u35 nsimd_sleef_exp2_u35d_vsx_f64\n#define xexp2f_u35 nsimd_sleef_exp2_u35d_vsx_f32\n#define xexp10 nsimd_sleef_exp10_u10d_vsx_f64\n#define xexp10f nsimd_sleef_exp10_u10d_vsx_f32\n#define xexp10_u35 nsimd_sleef_exp10_u35d_vsx_f64\n#define xexp10f_u35 nsimd_sleef_exp10_u35d_vsx_f32\n#define xexpm1 nsimd_sleef_expm1_u10d_vsx_f64\n#define xexpm1f nsimd_sleef_expm1_u10d_vsx_f32\n#define xlog10 nsimd_sleef_log10_u10d_vsx_f64\n#define xlog10f nsimd_sleef_log10_u10d_vsx_f32\n#define xlog2 nsimd_sleef_log2_u10d_vsx_f64\n#define xlog2f nsimd_sleef_log2_u10d_vsx_f32\n#define xlog2_u35 nsimd_sleef_log2_u35d_vsx_f64\n#define xlog2f_u35 nsimd_sleef_log2_u35d_vsx_f32\n#define xlog1p nsimd_sleef_log1p_u10d_vsx_f64\n#define xlog1pf nsimd_sleef_log1p_u10d_vsx_f32\n#define xsincospi_u05 nsimd_sleef_sincospi_u05d_vsx_f64\n#define xsincospif_u05 nsimd_sleef_sincospi_u05d_vsx_f32\n#define xsincospi_u35 nsimd_sleef_sincospi_u35d_vsx_f64\n#define xsincospif_u35 nsimd_sleef_sincospi_u35d_vsx_f32\n#define xsinpi_u05 
nsimd_sleef_sinpi_u05d_vsx_f64\n#define xsinpif_u05 nsimd_sleef_sinpi_u05d_vsx_f32\n#define xcospi_u05 nsimd_sleef_cospi_u05d_vsx_f64\n#define xcospif_u05 nsimd_sleef_cospi_u05d_vsx_f32\n#define xldexp nsimd_sleef_ldexp_vsx_f64\n#define xldexpf nsimd_sleef_ldexp_vsx_f32\n#define xilogb nsimd_sleef_ilogb_vsx_f64\n#define xilogbf nsimd_sleef_ilogb_vsx_f32\n#define xfma nsimd_sleef_fma_vsx_f64\n#define xfmaf nsimd_sleef_fma_vsx_f32\n#define xsqrt nsimd_sleef_sqrt_vsx_f64\n#define xsqrtf nsimd_sleef_sqrt_vsx_f32\n#define xsqrt_u05 nsimd_sleef_sqrt_u05d_vsx_f64\n#define xsqrtf_u05 nsimd_sleef_sqrt_u05d_vsx_f32\n#define xsqrt_u35 nsimd_sleef_sqrt_u35d_vsx_f64\n#define xsqrtf_u35 nsimd_sleef_sqrt_u35d_vsx_f32\n#define xhypot_u05 nsimd_sleef_hypot_u05d_vsx_f64\n#define xhypotf_u05 nsimd_sleef_hypot_u05d_vsx_f32\n#define xhypot_u35 nsimd_sleef_hypot_u35d_vsx_f64\n#define xhypotf_u35 nsimd_sleef_hypot_u35d_vsx_f32\n#define xfabs nsimd_sleef_fabs_vsx_f64\n#define xfabsf nsimd_sleef_fabs_vsx_f32\n#define xcopysign nsimd_sleef_copysign_vsx_f64\n#define xcopysignf nsimd_sleef_copysign_vsx_f32\n#define xfmax nsimd_sleef_fmax_vsx_f64\n#define xfmaxf nsimd_sleef_fmax_vsx_f32\n#define xfmin nsimd_sleef_fmin_vsx_f64\n#define xfminf nsimd_sleef_fmin_vsx_f32\n#define xfdim nsimd_sleef_fdim_vsx_f64\n#define xfdimf nsimd_sleef_fdim_vsx_f32\n#define xtrunc nsimd_sleef_trunc_vsx_f64\n#define xtruncf nsimd_sleef_trunc_vsx_f32\n#define xfloor nsimd_sleef_floor_vsx_f64\n#define xfloorf nsimd_sleef_floor_vsx_f32\n#define xceil nsimd_sleef_ceil_vsx_f64\n#define xceilf nsimd_sleef_ceil_vsx_f32\n#define xround nsimd_sleef_round_vsx_f64\n#define xroundf nsimd_sleef_round_vsx_f32\n#define xrint nsimd_sleef_rint_vsx_f64\n#define xrintf nsimd_sleef_rint_vsx_f32\n#define xnextafter nsimd_sleef_nextafter_vsx_f64\n#define xnextafterf nsimd_sleef_nextafter_vsx_f32\n#define xfrfrexp nsimd_sleef_frfrexp_vsx_f64\n#define xfrfrexpf nsimd_sleef_frfrexp_vsx_f32\n#define xexpfrexp 
nsimd_sleef_expfrexp_vsx_f64\n#define xexpfrexpf nsimd_sleef_expfrexp_vsx_f32\n#define xfmod nsimd_sleef_fmod_vsx_f64\n#define xfmodf nsimd_sleef_fmod_vsx_f32\n#define xremainder nsimd_sleef_remainder_vsx_f64\n#define xremainderf nsimd_sleef_remainder_vsx_f32\n#define xmodf nsimd_sleef_modf_vsx_f64\n#define xmodff nsimd_sleef_modf_vsx_f32\n#define xlgamma_u1 nsimd_sleef_lgamma_u10d_vsx_f64\n#define xlgammaf_u1 nsimd_sleef_lgamma_u10d_vsx_f32\n#define xtgamma_u1 nsimd_sleef_tgamma_u10d_vsx_f64\n#define xtgammaf_u1 nsimd_sleef_tgamma_u10d_vsx_f32\n#define xerf_u1 nsimd_sleef_erf_u10d_vsx_f64\n#define xerff_u1 nsimd_sleef_erf_u10d_vsx_f32\n#define xerfc_u15 nsimd_sleef_erfc_u15d_vsx_f64\n#define xerfcf_u15 nsimd_sleef_erfc_u15d_vsx_f32\n#define xgetInt nsimd_sleef_getInt_vsx_f64\n#define xgetIntf nsimd_sleef_getInt_vsx_f32\n#define xgetPtr nsimd_sleef_getPtr_vsx_f64\n#define xgetPtrf nsimd_sleef_getPtr_vsx_f32\n\n                   #else\n\n                   #define xsin nsimd_sleef_sin_u35_vsx_f64\n#define xsinf nsimd_sleef_sin_u35_vsx_f32\n#define xcos nsimd_sleef_cos_u35_vsx_f64\n#define xcosf nsimd_sleef_cos_u35_vsx_f32\n#define xsincos nsimd_sleef_sincos_u35_vsx_f64\n#define xsincosf nsimd_sleef_sincos_u35_vsx_f32\n#define xtan nsimd_sleef_tan_u35_vsx_f64\n#define xtanf nsimd_sleef_tan_u35_vsx_f32\n#define xasin nsimd_sleef_asin_u35_vsx_f64\n#define xasinf nsimd_sleef_asin_u35_vsx_f32\n#define xacos nsimd_sleef_acos_u35_vsx_f64\n#define xacosf nsimd_sleef_acos_u35_vsx_f32\n#define xatan nsimd_sleef_atan_u35_vsx_f64\n#define xatanf nsimd_sleef_atan_u35_vsx_f32\n#define xatan2 nsimd_sleef_atan2_u35_vsx_f64\n#define xatan2f nsimd_sleef_atan2_u35_vsx_f32\n#define xlog nsimd_sleef_log_u35_vsx_f64\n#define xlogf nsimd_sleef_log_u35_vsx_f32\n#define xcbrt nsimd_sleef_cbrt_u35_vsx_f64\n#define xcbrtf nsimd_sleef_cbrt_u35_vsx_f32\n#define xsin_u1 nsimd_sleef_sin_u10_vsx_f64\n#define xsinf_u1 nsimd_sleef_sin_u10_vsx_f32\n#define xcos_u1 
nsimd_sleef_cos_u10_vsx_f64\n#define xcosf_u1 nsimd_sleef_cos_u10_vsx_f32\n#define xsincos_u1 nsimd_sleef_sincos_u10_vsx_f64\n#define xsincosf_u1 nsimd_sleef_sincos_u10_vsx_f32\n#define xtan_u1 nsimd_sleef_tan_u10_vsx_f64\n#define xtanf_u1 nsimd_sleef_tan_u10_vsx_f32\n#define xasin_u1 nsimd_sleef_asin_u10_vsx_f64\n#define xasinf_u1 nsimd_sleef_asin_u10_vsx_f32\n#define xacos_u1 nsimd_sleef_acos_u10_vsx_f64\n#define xacosf_u1 nsimd_sleef_acos_u10_vsx_f32\n#define xatan_u1 nsimd_sleef_atan_u10_vsx_f64\n#define xatanf_u1 nsimd_sleef_atan_u10_vsx_f32\n#define xatan2_u1 nsimd_sleef_atan2_u10_vsx_f64\n#define xatan2f_u1 nsimd_sleef_atan2_u10_vsx_f32\n#define xlog_u1 nsimd_sleef_log_u10_vsx_f64\n#define xlogf_u1 nsimd_sleef_log_u10_vsx_f32\n#define xcbrt_u1 nsimd_sleef_cbrt_u10_vsx_f64\n#define xcbrtf_u1 nsimd_sleef_cbrt_u10_vsx_f32\n#define xexp nsimd_sleef_exp_u10_vsx_f64\n#define xexpf nsimd_sleef_exp_u10_vsx_f32\n#define xpow nsimd_sleef_pow_u10_vsx_f64\n#define xpowf nsimd_sleef_pow_u10_vsx_f32\n#define xsinh nsimd_sleef_sinh_u10_vsx_f64\n#define xsinhf nsimd_sleef_sinh_u10_vsx_f32\n#define xcosh nsimd_sleef_cosh_u10_vsx_f64\n#define xcoshf nsimd_sleef_cosh_u10_vsx_f32\n#define xtanh nsimd_sleef_tanh_u10_vsx_f64\n#define xtanhf nsimd_sleef_tanh_u10_vsx_f32\n#define xsinh_u35 nsimd_sleef_sinh_u35_vsx_f64\n#define xsinhf_u35 nsimd_sleef_sinh_u35_vsx_f32\n#define xcosh_u35 nsimd_sleef_cosh_u35_vsx_f64\n#define xcoshf_u35 nsimd_sleef_cosh_u35_vsx_f32\n#define xtanh_u35 nsimd_sleef_tanh_u35_vsx_f64\n#define xtanhf_u35 nsimd_sleef_tanh_u35_vsx_f32\n#define xfastsin_u3500 nsimd_sleef_fastsin_u3500_vsx_f64\n#define xfastsinf_u3500 nsimd_sleef_fastsin_u3500_vsx_f32\n#define xfastcos_u3500 nsimd_sleef_fastcos_u3500_vsx_f64\n#define xfastcosf_u3500 nsimd_sleef_fastcos_u3500_vsx_f32\n#define xfastpow_u3500 nsimd_sleef_fastpow_u3500_vsx_f64\n#define xfastpowf_u3500 nsimd_sleef_fastpow_u3500_vsx_f32\n#define xasinh nsimd_sleef_asinh_u10_vsx_f64\n#define xasinhf 
nsimd_sleef_asinh_u10_vsx_f32\n#define xacosh nsimd_sleef_acosh_u10_vsx_f64\n#define xacoshf nsimd_sleef_acosh_u10_vsx_f32\n#define xatanh nsimd_sleef_atanh_u10_vsx_f64\n#define xatanhf nsimd_sleef_atanh_u10_vsx_f32\n#define xexp2 nsimd_sleef_exp2_u10_vsx_f64\n#define xexp2f nsimd_sleef_exp2_u10_vsx_f32\n#define xexp2_u35 nsimd_sleef_exp2_u35_vsx_f64\n#define xexp2f_u35 nsimd_sleef_exp2_u35_vsx_f32\n#define xexp10 nsimd_sleef_exp10_u10_vsx_f64\n#define xexp10f nsimd_sleef_exp10_u10_vsx_f32\n#define xexp10_u35 nsimd_sleef_exp10_u35_vsx_f64\n#define xexp10f_u35 nsimd_sleef_exp10_u35_vsx_f32\n#define xexpm1 nsimd_sleef_expm1_u10_vsx_f64\n#define xexpm1f nsimd_sleef_expm1_u10_vsx_f32\n#define xlog10 nsimd_sleef_log10_u10_vsx_f64\n#define xlog10f nsimd_sleef_log10_u10_vsx_f32\n#define xlog2 nsimd_sleef_log2_u10_vsx_f64\n#define xlog2f nsimd_sleef_log2_u10_vsx_f32\n#define xlog2_u35 nsimd_sleef_log2_u35_vsx_f64\n#define xlog2f_u35 nsimd_sleef_log2_u35_vsx_f32\n#define xlog1p nsimd_sleef_log1p_u10_vsx_f64\n#define xlog1pf nsimd_sleef_log1p_u10_vsx_f32\n#define xsincospi_u05 nsimd_sleef_sincospi_u05_vsx_f64\n#define xsincospif_u05 nsimd_sleef_sincospi_u05_vsx_f32\n#define xsincospi_u35 nsimd_sleef_sincospi_u35_vsx_f64\n#define xsincospif_u35 nsimd_sleef_sincospi_u35_vsx_f32\n#define xsinpi_u05 nsimd_sleef_sinpi_u05_vsx_f64\n#define xsinpif_u05 nsimd_sleef_sinpi_u05_vsx_f32\n#define xcospi_u05 nsimd_sleef_cospi_u05_vsx_f64\n#define xcospif_u05 nsimd_sleef_cospi_u05_vsx_f32\n#define xldexp nsimd_sleef_ldexp_vsx_f64\n#define xldexpf nsimd_sleef_ldexp_vsx_f32\n#define xilogb nsimd_sleef_ilogb_vsx_f64\n#define xilogbf nsimd_sleef_ilogb_vsx_f32\n#define xfma nsimd_sleef_fma_vsx_f64\n#define xfmaf nsimd_sleef_fma_vsx_f32\n#define xsqrt nsimd_sleef_sqrt_vsx_f64\n#define xsqrtf nsimd_sleef_sqrt_vsx_f32\n#define xsqrt_u05 nsimd_sleef_sqrt_u05_vsx_f64\n#define xsqrtf_u05 nsimd_sleef_sqrt_u05_vsx_f32\n#define xsqrt_u35 nsimd_sleef_sqrt_u35_vsx_f64\n#define xsqrtf_u35 
nsimd_sleef_sqrt_u35_vsx_f32\n#define xhypot_u05 nsimd_sleef_hypot_u05_vsx_f64\n#define xhypotf_u05 nsimd_sleef_hypot_u05_vsx_f32\n#define xhypot_u35 nsimd_sleef_hypot_u35_vsx_f64\n#define xhypotf_u35 nsimd_sleef_hypot_u35_vsx_f32\n#define xfabs nsimd_sleef_fabs_vsx_f64\n#define xfabsf nsimd_sleef_fabs_vsx_f32\n#define xcopysign nsimd_sleef_copysign_vsx_f64\n#define xcopysignf nsimd_sleef_copysign_vsx_f32\n#define xfmax nsimd_sleef_fmax_vsx_f64\n#define xfmaxf nsimd_sleef_fmax_vsx_f32\n#define xfmin nsimd_sleef_fmin_vsx_f64\n#define xfminf nsimd_sleef_fmin_vsx_f32\n#define xfdim nsimd_sleef_fdim_vsx_f64\n#define xfdimf nsimd_sleef_fdim_vsx_f32\n#define xtrunc nsimd_sleef_trunc_vsx_f64\n#define xtruncf nsimd_sleef_trunc_vsx_f32\n#define xfloor nsimd_sleef_floor_vsx_f64\n#define xfloorf nsimd_sleef_floor_vsx_f32\n#define xceil nsimd_sleef_ceil_vsx_f64\n#define xceilf nsimd_sleef_ceil_vsx_f32\n#define xround nsimd_sleef_round_vsx_f64\n#define xroundf nsimd_sleef_round_vsx_f32\n#define xrint nsimd_sleef_rint_vsx_f64\n#define xrintf nsimd_sleef_rint_vsx_f32\n#define xnextafter nsimd_sleef_nextafter_vsx_f64\n#define xnextafterf nsimd_sleef_nextafter_vsx_f32\n#define xfrfrexp nsimd_sleef_frfrexp_vsx_f64\n#define xfrfrexpf nsimd_sleef_frfrexp_vsx_f32\n#define xexpfrexp nsimd_sleef_expfrexp_vsx_f64\n#define xexpfrexpf nsimd_sleef_expfrexp_vsx_f32\n#define xfmod nsimd_sleef_fmod_vsx_f64\n#define xfmodf nsimd_sleef_fmod_vsx_f32\n#define xremainder nsimd_sleef_remainder_vsx_f64\n#define xremainderf nsimd_sleef_remainder_vsx_f32\n#define xmodf nsimd_sleef_modf_vsx_f64\n#define xmodff nsimd_sleef_modf_vsx_f32\n#define xlgamma_u1 nsimd_sleef_lgamma_u10_vsx_f64\n#define xlgammaf_u1 nsimd_sleef_lgamma_u10_vsx_f32\n#define xtgamma_u1 nsimd_sleef_tgamma_u10_vsx_f64\n#define xtgammaf_u1 nsimd_sleef_tgamma_u10_vsx_f32\n#define xerf_u1 nsimd_sleef_erf_u10_vsx_f64\n#define xerff_u1 nsimd_sleef_erf_u10_vsx_f32\n#define xerfc_u15 nsimd_sleef_erfc_u15_vsx_f64\n#define xerfcf_u15 
nsimd_sleef_erfc_u15_vsx_f32\n#define xgetInt nsimd_sleef_getInt_vsx_f64\n#define xgetIntf nsimd_sleef_getInt_vsx_f32\n#define xgetPtr nsimd_sleef_getPtr_vsx_f64\n#define xgetPtrf nsimd_sleef_getPtr_vsx_f32\n\n                   #endif\n\n                   #define rempi nsimd_sleef_rempi_vsx\n                   #define rempif nsimd_sleef_rempif_vsx\n                   #define rempisub nsimd_sleef_rempisub_vsx\n                   #define rempisubf nsimd_sleef_rempisubf_vsx\n                   #define gammak nsimd_gammak_vsx\n                   #define gammafk nsimd_gammafk_vsx\n\n                   #endif\n\n                   \n\n#endif\n\n"
  },
  {
    "path": "src/sleefdp.c",
    "content": "//   Copyright Naoki Shibata and contributors 2010 - 2020.\n// Distributed under the Boost Software License, Version 1.0.\n//    (See accompanying file LICENSE.txt or copy at\n//          http://www.boost.org/LICENSE_1_0.txt)\n\n// Always use -ffp-contract=off option to compile SLEEF.\n\n#include <stdio.h>\n#include <assert.h>\n#include <stdint.h>\n#include <limits.h>\n#include <float.h>\n\n#ifndef ENABLE_BUILTIN_MATH\n#include <math.h>\n#define SQRT sqrt\n#else\n#define SQRT __builtin_sqrt\n#endif\n\n#include \"misc.h\"\n\nextern const double Sleef_rempitabdp[];\n\n#ifdef DORENAME\n#include \"rename.h\"\n#endif\n\n#if (defined(_MSC_VER))\n#pragma fp_contract (off)\n#endif\n\n#define MLA mla\n#define C2V(x) (x)\n#include \"estrin.h\"\n\nstatic INLINE CONST int64_t doubleToRawLongBits(double d) {\n  union {\n    double f;\n    int64_t i;\n  } tmp;\n  tmp.f = d;\n  return tmp.i;\n}\n\nstatic INLINE CONST double longBitsToDouble(int64_t i) {\n  union {\n    double f;\n    int64_t i;\n  } tmp;\n  tmp.i = i;\n  return tmp.f;\n}\n\nstatic INLINE CONST double fabsk(double x) {\n  return longBitsToDouble(INT64_C(0x7fffffffffffffff) & doubleToRawLongBits(x));\n}\n\nstatic INLINE CONST double mulsign(double x, double y) {\n  return longBitsToDouble(doubleToRawLongBits(x) ^ (doubleToRawLongBits(y) & (INT64_C(1) << 63)));\n}\n\nstatic INLINE CONST double copysignk(double x, double y) {\n  return longBitsToDouble((doubleToRawLongBits(x) & ~(INT64_C(1) << 63)) ^ (doubleToRawLongBits(y) & (INT64_C(1) << 63)));\n}\n\nstatic INLINE CONST double sign(double d) { return mulsign(1, d); }\nstatic INLINE CONST double mla(double x, double y, double z) { return x * y + z; }\nstatic INLINE CONST double rintk(double x) { return x < 0 ? (int)(x - 0.5) : (int)(x + 0.5); }\nstatic INLINE CONST int ceilk(double x) { return (int)x + (x < 0 ? 
0 : 1); }\nstatic INLINE CONST double trunck(double x) { return (double)(int)x; }\nstatic INLINE CONST double fmink(double x, double y) { return x < y ? x : y; }\nstatic INLINE CONST double fmaxk(double x, double y) { return x > y ? x : y; }\n\nstatic INLINE CONST int xisnan(double x) { return x != x; }\nstatic INLINE CONST int xisinf(double x) { return x == SLEEF_INFINITY || x == -SLEEF_INFINITY; }\nstatic INLINE CONST int xisminf(double x) { return x == -SLEEF_INFINITY; }\nstatic INLINE CONST int xispinf(double x) { return x == SLEEF_INFINITY; }\nstatic INLINE CONST int xisnegzero(double x) { return doubleToRawLongBits(x) == doubleToRawLongBits(-0.0); }\nstatic INLINE CONST int xisnumber(double x) { return !xisinf(x) && !xisnan(x); }\n\nstatic INLINE CONST int xisint(double d) {\n  double x = d - (double)(INT64_C(1) << 31) * (int)(d * (1.0 / (INT64_C(1) << 31)));\n  return (x == (int)x) || (fabsk(d) >= (double)(INT64_C(1) << 53));\n}\n\nstatic INLINE CONST int xisodd(double d) {\n  double x = d - (double)(INT64_C(1) << 31) * (int)(d * (1.0 / (INT64_C(1) << 31)));\n  return (1 & (int)x) != 0 && fabsk(d) < (double)(INT64_C(1) << 53);\n}\n\nstatic INLINE CONST double pow2i(int q) {\n  return longBitsToDouble(((int64_t)(q + 0x3ff)) << 52);\n}\n\nstatic INLINE CONST double ldexpk(double x, int q) {\n  double u;\n  int m;\n  m = q >> 31;\n  m = (((m + q) >> 9) - m) << 7;\n  q = q - (m << 2);\n  m += 0x3ff;\n  m = m < 0     ? 0     : m;\n  m = m > 0x7ff ? 
0x7ff : m;\n  u = longBitsToDouble(((int64_t)m) << 52);\n  x = x * u * u * u * u;\n  u = longBitsToDouble(((int64_t)(q + 0x3ff)) << 52);\n  return x * u;\n}\n\nstatic INLINE CONST double ldexp2k(double d, int e) { // faster than ldexpk, short reach\n  return d * pow2i(e >> 1) * pow2i(e - (e >> 1));\n}\n\nstatic INLINE CONST double ldexp3k(double d, int e) { // very fast, no denormal\n  return longBitsToDouble(doubleToRawLongBits(d) + (((int64_t)e) << 52));\n}\n\nEXPORT CONST double xldexp(double x, int exp) {\n  if (exp >  2100) exp =  2100;\n  if (exp < -2100) exp = -2100;\n  \n  int e0 = exp >> 2;\n  if (exp < 0) e0++;\n  if (-100 < exp && exp < 100) e0 = 0;\n  int e1 = exp - (e0 << 2);\n\n  double p = pow2i(e0);\n  double ret = x * pow2i(e1) * p * p * p * p;\n  \n  return ret;\n}\n\nstatic INLINE CONST int ilogbk(double d) {\n  int m = d < 4.9090934652977266E-91;\n  d = m ? 2.037035976334486E90 * d : d;\n  int q = (doubleToRawLongBits(d) >> 52) & 0x7ff;\n  q = m ? q - (300 + 0x03ff) : q - 0x03ff;\n  return q;\n}\n\n// ilogb2k is similar to ilogbk, but the argument has to be a\n// normalized FP value.\nstatic INLINE CONST int ilogb2k(double d) {\n  return ((doubleToRawLongBits(d) >> 52) & 0x7ff) - 0x3ff;\n}\n\nEXPORT CONST int xilogb(double d) {\n  int e = ilogbk(fabsk(d));\n  e = d == 0.0  ? SLEEF_FP_ILOGB0 : e;\n  e = xisnan(d) ? SLEEF_FP_ILOGBNAN : e;\n  e = xisinf(d) ? 
INT_MAX : e;\n  return e;\n}\n\n//\n\n#ifndef NDEBUG\nstatic int checkfp(double x) {\n  if (xisinf(x) || xisnan(x)) return 1;\n  return 0;\n}\n#endif\n\nstatic INLINE CONST double upper(double d) {\n  return longBitsToDouble(doubleToRawLongBits(d) & INT64_C(0xfffffffff8000000));\n}\n\nstatic INLINE CONST Sleef_double2 dd(double h, double l) {\n  Sleef_double2 ret;\n  ret.x = h; ret.y = l;\n  return ret;\n}\n\nstatic INLINE CONST Sleef_double2 ddnormalize_d2_d2(Sleef_double2 t) {\n  Sleef_double2 s;\n\n  s.x = t.x + t.y;\n  s.y = t.x - s.x + t.y;\n\n  return s;\n}\n\nstatic INLINE CONST Sleef_double2 ddscale_d2_d2_d(Sleef_double2 d, double s) {\n  Sleef_double2 r;\n\n  r.x = d.x * s;\n  r.y = d.y * s;\n\n  return r;\n}\n\nstatic INLINE CONST Sleef_double2 ddneg_d2_d2(Sleef_double2 d) {\n  Sleef_double2 r;\n\n  r.x = -d.x;\n  r.y = -d.y;\n\n  return r;\n}\n\nstatic INLINE CONST Sleef_double2 ddabs_d2_d2(Sleef_double2 x) {\n  return dd(x.x < 0 ? -x.x : x.x, x.x < 0 ? -x.y : x.y);\n}\n\n/*\n * ddadd and ddadd2 are functions for double-double addition.  ddadd\n * is simpler and faster than ddadd2, but it requires the absolute\n * value of first argument to be larger than the second argument. The\n * exact condition that should be met is checked if NDEBUG macro is\n * not defined.\n *\n * Please note that if the results won't be used, it is no problem to\n * feed arguments that do not meet this condition. 
You will see\n * warning messages if you turn off NDEBUG macro and run tester2, but\n * this is normal.\n * \n * Please see :\n * Jonathan Richard Shewchuk, Adaptive Precision Floating-Point\n * Arithmetic and Fast Robust Geometric Predicates, Discrete &\n * Computational Geometry 18:305-363, 1997.\n */\n\nstatic INLINE CONST Sleef_double2 ddadd_d2_d_d(double x, double y) {\n  // |x| >= |y|\n\n  Sleef_double2 r;\n\n#ifndef NDEBUG\n  if (!(checkfp(x) || checkfp(y) || fabsk(x) >= fabsk(y) || (fabsk(x+y) <= fabsk(x) && fabsk(x+y) <= fabsk(y)))) {\n    fprintf(stderr, \"[ddadd_d2_d_d : %g, %g]\\n\", x, y);\n    fflush(stderr);\n  }\n#endif\n\n  r.x = x + y;\n  r.y = x - r.x + y;\n\n  return r;\n}\n\nstatic INLINE CONST Sleef_double2 ddadd2_d2_d_d(double x, double y) {\n  Sleef_double2 r;\n\n  r.x = x + y;\n  double v = r.x - x;\n  r.y = (x - (r.x - v)) + (y - v);\n\n  return r;\n}\n\nstatic INLINE CONST Sleef_double2 ddadd_d2_d2_d(Sleef_double2 x, double y) {\n  // |x| >= |y|\n\n  Sleef_double2 r;\n\n#ifndef NDEBUG\n  if (!(checkfp(x.x) || checkfp(y) || fabsk(x.x) >= fabsk(y) || (fabsk(x.x+y) <= fabsk(x.x) && fabsk(x.x+y) <= fabsk(y)))) {\n    fprintf(stderr, \"[ddadd_d2_d2_d : %g %g]\\n\", x.x, y);\n    fflush(stderr);\n  }\n#endif\n\n  r.x = x.x + y;\n  r.y = x.x - r.x + y + x.y;\n\n  return r;\n}\n\nstatic INLINE CONST Sleef_double2 ddadd2_d2_d2_d(Sleef_double2 x, double y) {\n  Sleef_double2 r;\n\n  r.x  = x.x + y;\n  double v = r.x - x.x;\n  r.y = (x.x - (r.x - v)) + (y - v);\n  r.y += x.y;\n\n  return r;\n}\n\nstatic INLINE CONST Sleef_double2 ddadd_d2_d_d2(double x, Sleef_double2 y) {\n  // |x| >= |y|\n\n  Sleef_double2 r;\n\n#ifndef NDEBUG\n  if (!(checkfp(x) || checkfp(y.x) || fabsk(x) >= fabsk(y.x) || (fabsk(x+y.x) <= fabsk(x) && fabsk(x+y.x) <= fabsk(y.x)))) {\n    fprintf(stderr, \"[ddadd_d2_d_d2 : %g %g]\\n\", x, y.x);\n    fflush(stderr);\n  }\n#endif\n\n  r.x = x + y.x;\n  r.y = x - r.x + y.x + y.y;\n\n  return r;\n}\n\nstatic INLINE CONST Sleef_double2 
ddadd2_d2_d_d2(double x, Sleef_double2 y) {\n  Sleef_double2 r;\n\n  r.x  = x + y.x;\n  double v = r.x - x;\n  r.y = (x - (r.x - v)) + (y.x - v) + y.y;\n\n  return r;\n}\n\nstatic INLINE CONST double ddadd2_d_d_d2(double x, Sleef_double2 y) { return y.y + y.x + x; }\n\nstatic INLINE CONST Sleef_double2 ddadd_d2_d2_d2(Sleef_double2 x, Sleef_double2 y) {\n  // |x| >= |y|\n\n  Sleef_double2 r;\n\n#ifndef NDEBUG\n  if (!(x.x == 0 || checkfp(x.x) || checkfp(y.x) || fabsk(x.x) >= fabsk(y.x) || (fabsk(x.x+y.x) <= fabsk(x.x) && fabsk(x.x+y.x) <= fabsk(y.x)))) {\n    fprintf(stderr, \"[ddadd_d2_d2_d2 : %g %g]\\n\", x.x, y.x);\n    fflush(stderr);\n  }\n#endif\n\n  r.x = x.x + y.x;\n  r.y = x.x - r.x + y.x + x.y + y.y;\n\n  return r;\n}\n\nstatic INLINE CONST Sleef_double2 ddadd2_d2_d2_d2(Sleef_double2 x, Sleef_double2 y) {\n  Sleef_double2 r;\n\n  r.x  = x.x + y.x;\n  double v = r.x - x.x;\n  r.y = (x.x - (r.x - v)) + (y.x - v);\n  r.y += x.y + y.y;\n\n  return r;\n}\n\nstatic INLINE CONST Sleef_double2 ddsub_d2_d2_d2(Sleef_double2 x, Sleef_double2 y) {\n  // |x| >= |y|\n\n  Sleef_double2 r;\n\n#ifndef NDEBUG\n  if (!(checkfp(x.x) || checkfp(y.x) || fabsk(x.x) >= fabsk(y.x) || (fabsk(x.x-y.x) <= fabsk(x.x) && fabsk(x.x-y.x) <= fabsk(y.x)))) {\n    fprintf(stderr, \"[ddsub_d2_d2_d2 : %g %g]\\n\", x.x, y.x);\n    fflush(stderr);\n  }\n#endif\n\n  r.x = x.x - y.x;\n  r.y = x.x - r.x - y.x + x.y - y.y;\n\n  return r;\n}\n\nstatic INLINE CONST Sleef_double2 dddiv_d2_d2_d2(Sleef_double2 n, Sleef_double2 d) {\n  double t = 1.0 / d.x;\n  double dh  = upper(d.x), dl  = d.x - dh;\n  double th  = upper(t  ), tl  = t   - th;\n  double nhh = upper(n.x), nhl = n.x - nhh;\n\n  Sleef_double2 q;\n\n  q.x = n.x * t;\n\n  double u = -q.x + nhh * th + nhh * tl + nhl * th + nhl * tl +\n    q.x * (1 - dh * th - dh * tl - dl * th - dl * tl);\n\n  q.y = t * (n.y - q.x * d.y) + u;\n\n  return q;\n}\n\nstatic INLINE CONST Sleef_double2 ddmul_d2_d_d(double x, double y) {\n  double xh = upper(x), xl = 
x - xh;\n  double yh = upper(y), yl = y - yh;\n  Sleef_double2 r;\n\n  r.x = x * y;\n  r.y = xh * yh - r.x + xl * yh + xh * yl + xl * yl;\n\n  return r;\n}\n\nstatic INLINE CONST Sleef_double2 ddmul_d2_d2_d(Sleef_double2 x, double y) {\n  double xh = upper(x.x), xl = x.x - xh;\n  double yh = upper(y  ), yl = y   - yh;\n  Sleef_double2 r;\n\n  r.x = x.x * y;\n  r.y = xh * yh - r.x + xl * yh + xh * yl + xl * yl + x.y * y;\n\n  return r;\n}\n\nstatic INLINE CONST Sleef_double2 ddmul_d2_d2_d2(Sleef_double2 x, Sleef_double2 y) {\n  double xh = upper(x.x), xl = x.x - xh;\n  double yh = upper(y.x), yl = y.x - yh;\n  Sleef_double2 r;\n\n  r.x = x.x * y.x;\n  r.y = xh * yh - r.x + xl * yh + xh * yl + xl * yl + x.x * y.y + x.y * y.x;\n\n  return r;\n}\n\nstatic INLINE CONST double ddmul_d_d2_d2(Sleef_double2 x, Sleef_double2 y) {\n  double xh = upper(x.x), xl = x.x - xh;\n  double yh = upper(y.x), yl = y.x - yh;\n  \n  return x.y * yh + xh * y.y + xl * yl + xh * yl + xl * yh + xh * yh;\n}\n\nstatic INLINE CONST Sleef_double2 ddsqu_d2_d2(Sleef_double2 x) {\n  double xh = upper(x.x), xl = x.x - xh;\n  Sleef_double2 r;\n\n  r.x = x.x * x.x;\n  r.y = xh * xh - r.x + (xh + xh) * xl + xl * xl + x.x * (x.y + x.y);\n\n  return r;\n}\n\nstatic INLINE CONST double ddsqu_d_d2(Sleef_double2 x) {\n  double xh = upper(x.x), xl = x.x - xh;\n\n  return xh * x.y + xh * x.y + xl * xl + (xh * xl + xh * xl) + xh * xh;\n}\n\nstatic INLINE CONST Sleef_double2 ddrec_d2_d(double d) {\n  double t = 1.0 / d;\n  double dh = upper(d), dl = d - dh;\n  double th = upper(t), tl = t - th;\n  Sleef_double2 q;\n\n  q.x = t;\n  q.y = t * (1 - dh * th - dh * tl - dl * th - dl * tl);\n\n  return q;\n}\n\nstatic INLINE CONST Sleef_double2 ddrec_d2_d2(Sleef_double2 d) {\n  double t = 1.0 / d.x;\n  double dh = upper(d.x), dl = d.x - dh;\n  double th = upper(t  ), tl = t   - th;\n  Sleef_double2 q;\n\n  q.x = t;\n  q.y = t * (1 - dh * th - dh * tl - dl * th - dl * tl - d.y * t);\n\n  return q;\n}\n\nstatic INLINE 
CONST Sleef_double2 ddsqrt_d2_d2(Sleef_double2 d) {\n  double t = SQRT(d.x + d.y);\n  return ddscale_d2_d2_d(ddmul_d2_d2_d2(ddadd2_d2_d2_d2(d, ddmul_d2_d_d(t, t)), ddrec_d2_d(t)), 0.5);\n}\n\nstatic INLINE CONST Sleef_double2 ddsqrt_d2_d(double d) {\n  double t = SQRT(d);\n  return ddscale_d2_d2_d(ddmul_d2_d2_d2(ddadd2_d2_d_d2(d, ddmul_d2_d_d(t, t)), ddrec_d2_d(t)), 0.5);\n}\n\n//\n\nstatic INLINE CONST double atan2k(double y, double x) {\n  double s, t, u;\n  int q = 0;\n\n  if (x < 0) { x = -x; q = -2; }\n  if (y > x) { t = x; x = y; y = -t; q += 1; }\n\n  s = y / x;\n  t = s * s;\n\n  double t2 = t * t, t4 = t2 * t2, t8 = t4 * t4, t16 = t8 * t8;\n  u = POLY19(t, t2, t4, t8, t16,\n\t     -1.88796008463073496563746e-05,\n\t     0.000209850076645816976906797,\n\t     -0.00110611831486672482563471,\n\t     0.00370026744188713119232403,\n\t     -0.00889896195887655491740809,\n\t     0.016599329773529201970117,\n\t     -0.0254517624932312641616861,\n\t     0.0337852580001353069993897,\n\t     -0.0407629191276836500001934,\n\t     0.0466667150077840625632675,\n\t     -0.0523674852303482457616113,\n\t     0.0587666392926673580854313,\n\t     -0.0666573579361080525984562,\n\t     0.0769219538311769618355029,\n\t     -0.090908995008245008229153,\n\t     0.111111105648261418443745,\n\t     -0.14285714266771329383765,\n\t     0.199999999996591265594148,\n\t     -0.333333333333311110369124);\n\n  t = u * t * s + s;\n  t = q * (M_PI/2) + t;\n\n  return t;\n}\n\nEXPORT CONST double xatan2(double y, double x) {\n  double r = atan2k(fabsk(y), x);\n\n  r = mulsign(r, x);\n  if (xisinf(x) || x == 0) r = M_PI/2 - (xisinf(x) ? (sign(x) * (M_PI  /2)) : 0);\n  if (xisinf(y)          ) r = M_PI/2 - (xisinf(x) ? (sign(x) * (M_PI*1/4)) : 0);\n  if (             y == 0) r = (sign(x) == -1 ? M_PI : 0);\n\n  return xisnan(x) || xisnan(y) ? SLEEF_NAN : mulsign(r, y);\n}\n\nEXPORT CONST double xasin(double d) {\n  int o = fabsk(d) < 0.5;\n  double x2 = o ? (d*d) : ((1-fabsk(d))*0.5), x = o ? 
fabsk(d) : SQRT(x2), u;\n\n  double x4 = x2 * x2, x8 = x4 * x4, x16 = x8 * x8;\n  u = POLY12(x2, x4, x8, x16,\n\t     +0.3161587650653934628e-1,\n\t     -0.1581918243329996643e-1,\n\t     +0.1929045477267910674e-1,\n\t     +0.6606077476277170610e-2,\n\t     +0.1215360525577377331e-1,\n\t     +0.1388715184501609218e-1,\n\t     +0.1735956991223614604e-1,\n\t     +0.2237176181932048341e-1,\n\t     +0.3038195928038132237e-1,\n\t     +0.4464285681377102438e-1,\n\t     +0.7500000000378581611e-1,\n\t     +0.1666666666666497543e+0);\n\n  u = mla(u, x * x2, x);\n  \n  double r = o ? u : (M_PI/2 - 2*u);\n  r = mulsign(r, d);\n\n  return r;\n}\n\nEXPORT CONST double xacos(double d) {\n  int o = fabsk(d) < 0.5;\n  double x2 = o ? (d*d) : ((1-fabsk(d))*0.5), u;\n  double x = o ? fabsk(d) : SQRT(x2);\n  x = fabsk(d) == 1.0 ? 0 : x;\n\n  double x4 = x2 * x2, x8 = x4 * x4, x16 = x8 * x8;\n  u = POLY12(x2, x4, x8, x16,\n\t     +0.3161587650653934628e-1,\n\t     -0.1581918243329996643e-1,\n\t     +0.1929045477267910674e-1,\n\t     +0.6606077476277170610e-2,\n\t     +0.1215360525577377331e-1,\n\t     +0.1388715184501609218e-1,\n\t     +0.1735956991223614604e-1,\n\t     +0.2237176181932048341e-1,\n\t     +0.3038195928038132237e-1,\n\t     +0.4464285681377102438e-1,\n\t     +0.7500000000378581611e-1,\n\t     +0.1666666666666497543e+0);\n\n  u *= x * x2;\n  \n  double y = 3.1415926535897932/2 - (mulsign(x, d) + mulsign(u, d));\n  x += u;\n  double r = o ? 
y : (x*2);\n  if (!o && d < 0) r = ddadd_d2_d2_d(dd(3.141592653589793116, 1.2246467991473532072e-16), -r).x;\n\n  return r;\n}\n\nEXPORT CONST double xatan(double s) {\n  double t, u;\n  int q = 0;\n\n  if (sign(s) == -1) { s = -s; q = 2; }\n  if (s > 1) { s = 1.0 / s; q |= 1; }\n\n  t = s * s;\n\n  double t2 = t * t, t4 = t2 * t2, t8 = t4 * t4, t16 = t8 * t8;\n  u = POLY19(t, t2, t4, t8, t16,\n\t     -1.88796008463073496563746e-05,\n\t     0.000209850076645816976906797,\n\t     -0.00110611831486672482563471,\n\t     0.00370026744188713119232403,\n\t     -0.00889896195887655491740809,\n\t     0.016599329773529201970117,\n\t     -0.0254517624932312641616861,\n\t     0.0337852580001353069993897,\n\t     -0.0407629191276836500001934,\n\t     0.0466667150077840625632675,\n\t     -0.0523674852303482457616113,\n\t     0.0587666392926673580854313,\n\t     -0.0666573579361080525984562,\n\t     0.0769219538311769618355029,\n\t     -0.090908995008245008229153,\n\t     0.111111105648261418443745,\n\t     -0.14285714266771329383765,\n\t     0.199999999996591265594148,\n\t     -0.333333333333311110369124);\n\n  t = s + s * (t * u);\n\n  if ((q & 1) != 0) t = 1.570796326794896557998982 - t;\n  if ((q & 2) != 0) t = -t;\n\n  return t;\n}\n\nstatic Sleef_double2 atan2k_u1(Sleef_double2 y, Sleef_double2 x) {\n  double u;\n  Sleef_double2 s, t;\n  int q = 0;\n\n  if (x.x < 0) { x.x = -x.x; x.y = -x.y; q = -2; }\n  if (y.x > x.x) { t = x; x = y; y.x = -t.x; y.y = -t.y; q += 1; }\n\n  s = dddiv_d2_d2_d2(y, x);\n  t = ddsqu_d2_d2(s);\n  t = ddnormalize_d2_d2(t);\n\n  double t2 = t.x * t.x, t4 = t2 * t2, t8 = t4 * t4, t16 = t8 * t8;\n  u = POLY16(t.x, t2, t4, t8,\n\t     1.06298484191448746607415e-05,\n\t     -0.000125620649967286867384336,\n\t     0.00070557664296393412389774,\n\t     -0.00251865614498713360352999,\n\t     0.00646262899036991172313504,\n\t     -0.0128281333663399031014274,\n\t     0.0208024799924145797902497,\n\t     -0.0289002344784740315686289,\n\t     
0.0359785005035104590853656,\n\t     -0.041848579703592507506027,\n\t     0.0470843011653283988193763,\n\t     -0.0524914210588448421068719,\n\t     0.0587946590969581003860434,\n\t     -0.0666620884778795497194182,\n\t     0.0769225330296203768654095,\n\t     -0.0909090442773387574781907);\n  u = mla(u, t.x, 0.111111108376896236538123);\n  u = mla(u, t.x, -0.142857142756268568062339);\n  u = mla(u, t.x, 0.199999999997977351284817);\n  u = mla(u, t.x, -0.333333333333317605173818);\n\n  t = ddadd_d2_d2_d2(s, ddmul_d2_d2_d(ddmul_d2_d2_d2(s, t), u));\n\n  if (fabsk(s.x) < 1e-200) t = s;\n  t = ddadd2_d2_d2_d2(ddmul_d2_d2_d(dd(1.570796326794896557998982, 6.12323399573676603586882e-17), q), t);\n  \n  return t;\n}\n\nEXPORT CONST double xatan2_u1(double y, double x) {\n  if (fabsk(x) < 5.5626846462680083984e-309) { y *= (UINT64_C(1) << 53); x *= (UINT64_C(1) << 53); } // nexttoward((1.0 / DBL_MAX), 1)\n  Sleef_double2 d = atan2k_u1(dd(fabsk(y), 0), dd(x, 0));\n  double r = d.x + d.y;\n\n  r = mulsign(r, x);\n  if (xisinf(x) || x == 0) r = M_PI/2 - (xisinf(x) ? (sign(x) * (M_PI  /2)) : 0);\n  if (xisinf(y)          ) r = M_PI/2 - (xisinf(x) ? (sign(x) * (M_PI*1/4)) : 0);\n  if (             y == 0) r = (sign(x) == -1 ? M_PI : 0);\n\n  return xisnan(x) || xisnan(y) ? SLEEF_NAN : mulsign(r, y);\n}\n\nEXPORT CONST double xasin_u1(double d) {\n  int o = fabsk(d) < 0.5;\n  double x2 = o ? (d*d) : ((1-fabsk(d))*0.5), u;\n  Sleef_double2 x = o ? dd(fabsk(d), 0) : ddsqrt_d2_d(x2);\n  x = fabsk(d) == 1.0 ? 
dd(0, 0) : x;\n\n  double x4 = x2 * x2, x8 = x4 * x4, x16 = x8 * x8;\n  u = POLY12(x2, x4, x8, x16,\n\t     +0.3161587650653934628e-1,\n\t     -0.1581918243329996643e-1,\n\t     +0.1929045477267910674e-1,\n\t     +0.6606077476277170610e-2,\n\t     +0.1215360525577377331e-1,\n\t     +0.1388715184501609218e-1,\n\t     +0.1735956991223614604e-1,\n\t     +0.2237176181932048341e-1,\n\t     +0.3038195928038132237e-1,\n\t     +0.4464285681377102438e-1,\n\t     +0.7500000000378581611e-1,\n\t     +0.1666666666666497543e+0);\n\n  u *= x2 * x.x;\n  \n  Sleef_double2 y = ddadd_d2_d2_d(ddsub_d2_d2_d2(dd(3.141592653589793116/4, 1.2246467991473532072e-16/4), x), -u);\n  double r = o ? (u + x.x) : ((y.x + y.y)*2);\n  r = mulsign(r, d);\n\n  return r;\n}\n\nEXPORT CONST double xacos_u1(double d) {\n  int o = fabsk(d) < 0.5;\n  double x2 = o ? (d*d) : ((1-fabsk(d))*0.5), u;\n  Sleef_double2 x = o ? dd(fabsk(d), 0) : ddsqrt_d2_d(x2), w;\n  x = fabsk(d) == 1.0 ? dd(0, 0) : x;\n\n  double x4 = x2 * x2, x8 = x4 * x4, x16 = x8 * x8;\n  u = POLY12(x2, x4, x8, x16,\n\t     +0.3161587650653934628e-1,\n\t     -0.1581918243329996643e-1,\n\t     +0.1929045477267910674e-1,\n\t     +0.6606077476277170610e-2,\n\t     +0.1215360525577377331e-1,\n\t     +0.1388715184501609218e-1,\n\t     +0.1735956991223614604e-1,\n\t     +0.2237176181932048341e-1,\n\t     +0.3038195928038132237e-1,\n\t     +0.4464285681377102438e-1,\n\t     +0.7500000000378581611e-1,\n\t     +0.1666666666666497543e+0);\n  \n  u *= x.x * x2;\n\n  Sleef_double2 y = ddsub_d2_d2_d2(dd(3.141592653589793116/2, 1.2246467991473532072e-16/2),\n\t\t\t\t   ddadd_d2_d_d(mulsign(x.x, d), mulsign(u, d)));\n  x = ddadd_d2_d2_d(x, u);\n  y = o ? 
y : ddscale_d2_d2_d(x, 2);\n  if (!o && d < 0) y = ddsub_d2_d2_d2(dd(3.141592653589793116, 1.2246467991473532072e-16), y);\n  \n  return y.x + y.y;\n}\n\nEXPORT CONST double xatan_u1(double d) {\n  Sleef_double2 d2 = atan2k_u1(dd(fabsk(d), 0), dd(1, 0));\n  double r = d2.x + d2.y;\n  if (xisinf(d)) r = 1.570796326794896557998982;\n  return mulsign(r, d);\n}\n\ntypedef struct {\n  double d;\n  int32_t i;\n} di_t;\n\ntypedef struct {\n  Sleef_double2 dd;\n  int32_t i;\n} ddi_t;\n\nstatic INLINE CONST double orsign(double x, double y) {\n  return longBitsToDouble(doubleToRawLongBits(x) | (doubleToRawLongBits(y) & (INT64_C(1) << 63)));\n}\n\nstatic CONST di_t rempisub(double x) {\n  // This function is equivalent to :\n  // di_t ret = { x - rint(4 * x) * 0.25, (int32_t)(rint(4 * x) - rint(x) * 4) };\n  di_t ret;\n  double c = mulsign(INT64_C(1) << 52, x);\n  double rint4x = fabsk(4*x) > INT64_C(1) << 52 ? (4*x) : orsign(mla(4, x, c) - c, x);\n  double rintx  = fabsk(  x) > INT64_C(1) << 52 ?   x   : orsign(x + c - c       , x);\n  ret.d = mla(-0.25, rint4x,      x);\n  ret.i = mla(-4   , rintx , rint4x);\n  return ret;\n}\n\n// Payne-Hanek like argument reduction\nstatic CONST ddi_t rempi(double a) {\n  Sleef_double2 x, y, z;\n  di_t di;\n  double t;\n  int ex = ilogb2k(a) - 55, q = ex > (700-55) ? -64 : 0;\n  a = ldexp3k(a, q);\n  if (ex < 0) ex = 0;\n  ex *= 4;\n  x = ddmul_d2_d_d(a, Sleef_rempitabdp[ex]);\n  di = rempisub(x.x);\n  q = di.i;\n  x.x = di.d;\n  x = ddnormalize_d2_d2(x);\n  y = ddmul_d2_d_d(a, Sleef_rempitabdp[ex+1]);\n  x = ddadd2_d2_d2_d2(x, y);\n  di = rempisub(x.x);\n  q += di.i;\n  x.x = di.d;\n  x = ddnormalize_d2_d2(x);\n  y = ddmul_d2_d2_d(dd(Sleef_rempitabdp[ex+2], Sleef_rempitabdp[ex+3]), a);\n  x = ddadd2_d2_d2_d2(x, y);\n  x = ddnormalize_d2_d2(x);\n  x = ddmul_d2_d2_d2(x, dd(3.141592653589793116*2, 1.2246467991473532072e-16*2));\n  ddi_t ret = { fabsk(a) < 0.7 ? 
dd(a, 0) : x, q };\n  return ret;\n}\n\nEXPORT CONST double xsin(double d) {\n  double u, s, t = d;\n  int ql;\n\n  if (fabsk(d) < TRIGRANGEMAX2) {\n    ql = rintk(d * M_1_PI);\n    d = mla(ql, -PI_A2, d);\n    d = mla(ql, -PI_B2, d);\n  } else if (fabsk(d) < TRIGRANGEMAX) {\n    double dqh = trunck(d * (M_1_PI / (1 << 24))) * (double)(1 << 24);\n    ql = rintk(mla(d, M_1_PI, -dqh));\n\n    d = mla(dqh, -PI_A, d);\n    d = mla( ql, -PI_A, d);\n    d = mla(dqh, -PI_B, d);\n    d = mla( ql, -PI_B, d);\n    d = mla(dqh, -PI_C, d);\n    d = mla( ql, -PI_C, d);\n    d = mla(dqh + ql, -PI_D, d);\n  } else {\n    ddi_t ddi = rempi(t);\n    ql = ((ddi.i & 3) * 2 + (ddi.dd.x > 0) + 1) >> 2;\n    if ((ddi.i & 1) != 0) {\n      ddi.dd = ddadd2_d2_d2_d2(ddi.dd, dd(mulsign(3.141592653589793116*-0.5, ddi.dd.x),\n\t\t\t\t\t  mulsign(1.2246467991473532072e-16*-0.5, ddi.dd.x)));\n    }\n    d = ddi.dd.x + ddi.dd.y;\n    if (xisinf(t) || xisnan(t)) d = SLEEF_NAN;\n  }\n\n  s = d * d;\n\n  if ((ql & 1) != 0) d = -d;\n\n  double s2 = s * s, s4 = s2 * s2;\n  u = POLY8(s, s2, s4,\n\t    -7.97255955009037868891952e-18,\n\t    2.81009972710863200091251e-15,\n\t    -7.64712219118158833288484e-13,\n\t    1.60590430605664501629054e-10,\n\t    -2.50521083763502045810755e-08,\n\t    2.75573192239198747630416e-06,\n\t    -0.000198412698412696162806809,\n\t    0.00833333333333332974823815);\n  u = mla(u, s, -0.166666666666666657414808);\n\n  u = mla(s, u * d, d);\n\n  if (xisnegzero(t)) u = t;\n\n  return u;\n}\n\nEXPORT CONST double xsin_u1(double d) {\n  double u;\n  Sleef_double2 s, t, x;\n  int ql;\n  \n  if (fabsk(d) < TRIGRANGEMAX2) {\n    ql = rintk(d * M_1_PI);\n    u = mla(ql, -PI_A2, d);\n    s = ddadd_d2_d_d (u,  ql * -PI_B2);\n  } else if (fabsk(d) < TRIGRANGEMAX) {\n    const double dqh = trunck(d * (M_1_PI / (1 << 24))) * (double)(1 << 24);\n    ql = rintk(mla(d, M_1_PI, -dqh));\n\n    u = mla(dqh, -PI_A, d);\n    s = ddadd_d2_d_d  (u,  ql * -PI_A);\n    s = ddadd2_d2_d2_d(s, dqh * 
-PI_B);\n    s = ddadd2_d2_d2_d(s,  ql * -PI_B);\n    s = ddadd2_d2_d2_d(s, dqh * -PI_C);\n    s = ddadd2_d2_d2_d(s,  ql * -PI_C);\n    s = ddadd_d2_d2_d (s, (dqh + ql) * -PI_D);\n  } else {\n    ddi_t ddi = rempi(d);\n    ql = ((ddi.i & 3) * 2 + (ddi.dd.x > 0) + 1) >> 2;\n    if ((ddi.i & 1) != 0) {\n      ddi.dd = ddadd2_d2_d2_d2(ddi.dd, dd(mulsign(3.141592653589793116*-0.5, ddi.dd.x),\n\t\t\t\t\t  mulsign(1.2246467991473532072e-16*-0.5, ddi.dd.x)));\n    }\n    s = ddnormalize_d2_d2(ddi.dd);\n    if (xisinf(d) || xisnan(d)) s.x = SLEEF_NAN;\n  }\n\n  t = s;\n  s = ddsqu_d2_d2(s);\n\n  double s2 = s.x * s.x, s4 = s2 * s2;\n  u = POLY6(s.x, s2, s4,\n\t    2.72052416138529567917983e-15,\n\t    -7.6429259411395447190023e-13,\n\t    1.60589370117277896211623e-10,\n\t    -2.5052106814843123359368e-08,\n\t    2.75573192104428224777379e-06,\n\t    -0.000198412698412046454654947);\n  u = mla(u, s.x, 0.00833333333333318056201922);\n\n  x = ddadd_d2_d_d2(1, ddmul_d2_d2_d2(ddadd_d2_d_d(-0.166666666666666657414808, u * s.x), s));\n  u = ddmul_d_d2_d2(t, x);\n  \n  if ((ql & 1) != 0) u = -u;\n  if (xisnegzero(d)) u = d;\n  \n  return u;\n}\n\nEXPORT CONST double xcos(double d) {\n  double u, s, t = d;\n  int ql;\n\n  if (fabsk(d) < TRIGRANGEMAX2) {\n    ql = mla(2, rintk(d * M_1_PI - 0.5), 1);\n    d = mla(ql, -PI_A2*0.5, d);\n    d = mla(ql, -PI_B2*0.5, d);\n  } else if (fabsk(d) < TRIGRANGEMAX) {\n    double dqh = trunck(d * (M_1_PI / (INT64_C(1) << 23)) - 0.5 * (M_1_PI / (INT64_C(1) << 23)));\n    ql = 2*rintk(d * M_1_PI - 0.5 - dqh * (double)(INT64_C(1) << 23))+1;\n    dqh *= 1 << 24;\n\n    d = mla(dqh, -PI_A*0.5, d);\n    d = mla( ql, -PI_A*0.5, d);\n    d = mla(dqh, -PI_B*0.5, d);\n    d = mla( ql, -PI_B*0.5, d);\n    d = mla(dqh, -PI_C*0.5, d);\n    d = mla( ql, -PI_C*0.5, d);\n    d = mla(dqh + ql , -PI_D*0.5, d);\n  } else {\n    ddi_t ddi = rempi(t);\n    ql = ((ddi.i & 3) * 2 + (ddi.dd.x > 0) + 7) >> 1;\n    if ((ddi.i & 1) == 0) {\n      ddi.dd = 
ddadd2_d2_d2_d2(ddi.dd, dd(mulsign(3.141592653589793116*-0.5, ddi.dd.x > 0 ? 1 : -1),\n\t\t\t\t\t  mulsign(1.2246467991473532072e-16*-0.5, ddi.dd.x > 0 ? 1 : -1)));\n    }\n    d = ddi.dd.x + ddi.dd.y;\n    if (xisinf(t) || xisnan(t)) d = SLEEF_NAN;\n  }\n  \n  s = d * d;\n\n  if ((ql & 2) == 0) d = -d;\n\n  double s2 = s * s, s4 = s2 * s2;\n  u = POLY8(s, s2, s4,\n\t    -7.97255955009037868891952e-18,\n\t    2.81009972710863200091251e-15,\n\t    -7.64712219118158833288484e-13,\n\t    1.60590430605664501629054e-10,\n\t    -2.50521083763502045810755e-08,\n\t    2.75573192239198747630416e-06,\n\t    -0.000198412698412696162806809,\n\t    0.00833333333333332974823815);\n  u = mla(u, s, -0.166666666666666657414808);\n\n  u = mla(s, u * d, d);\n\n  return u;\n}\n\nEXPORT CONST double xcos_u1(double d) {\n  double u;\n  Sleef_double2 s, t, x;\n  int ql;\n  \n  d = fabsk(d);\n\n  if (d < TRIGRANGEMAX2) {\n    ql = mla(2, rintk(d * M_1_PI - 0.5), 1);\n    s = ddadd2_d2_d_d(d, ql * (-PI_A2*0.5));\n    s = ddadd_d2_d2_d(s, ql * (-PI_B2*0.5));\n  } else if (d < TRIGRANGEMAX) {\n    double dqh = trunck(d * (M_1_PI / (INT64_C(1) << 23)) - 0.5 * (M_1_PI / (INT64_C(1) << 23)));\n    ql = 2*rintk(d * M_1_PI - 0.5 - dqh * (double)(INT64_C(1) << 23))+1;\n    dqh *= 1 << 24;\n\n    u = mla(dqh, -PI_A*0.5, d);\n    s = ddadd2_d2_d_d (u,  ql * (-PI_A*0.5));\n    s = ddadd2_d2_d2_d(s, dqh * (-PI_B*0.5));\n    s = ddadd2_d2_d2_d(s,  ql * (-PI_B*0.5));\n    s = ddadd2_d2_d2_d(s, dqh * (-PI_C*0.5));\n    s = ddadd2_d2_d2_d(s,  ql * (-PI_C*0.5));\n    s = ddadd_d2_d2_d(s, (dqh + ql) * (-PI_D*0.5));\n  } else {\n    ddi_t ddi = rempi(d);\n    ql = ((ddi.i & 3) * 2 + (ddi.dd.x > 0) + 7) >> 1;\n    if ((ddi.i & 1) == 0) {\n      ddi.dd = ddadd2_d2_d2_d2(ddi.dd, dd(mulsign(3.141592653589793116*-0.5, ddi.dd.x > 0 ? 1 : -1),\n\t\t\t\t\t  mulsign(1.2246467991473532072e-16*-0.5, ddi.dd.x > 0 ? 
1 : -1)));\n    }\n    s = ddnormalize_d2_d2(ddi.dd);\n    if (xisinf(d) || xisnan(d)) s.x = SLEEF_NAN;\n  }\n  \n  t = s;\n  s = ddsqu_d2_d2(s);\n\n  double s2 = s.x * s.x, s4 = s2 * s2;\n  u = POLY6(s.x, s2, s4,\n\t    2.72052416138529567917983e-15,\n\t    -7.6429259411395447190023e-13,\n\t    1.60589370117277896211623e-10,\n\t    -2.5052106814843123359368e-08,\n\t    2.75573192104428224777379e-06,\n\t    -0.000198412698412046454654947);\n  u = mla(u, s.x, 0.00833333333333318056201922);\n\n  x = ddadd_d2_d_d2(1, ddmul_d2_d2_d2(ddadd_d2_d_d(-0.166666666666666657414808, u * s.x), s));\n  u = ddmul_d_d2_d2(t, x);\n  \n  if ((((int)ql) & 2) == 0) u = -u;\n\n  return u;\n}\n\nEXPORT CONST Sleef_double2 xsincos(double d) {\n  double u, s, t;\n  Sleef_double2 r;\n  int ql;\n\n  s = d;\n\n  if (fabsk(d) < TRIGRANGEMAX2) {\n    ql = rintk(s * (2 * M_1_PI));\n    s = mla(ql, -PI_A2*0.5, s);\n    s = mla(ql, -PI_B2*0.5, s);\n  } else if (fabsk(d) < TRIGRANGEMAX) {\n    double dqh = trunck(d * ((2 * M_1_PI) / (1 << 24))) * (double)(1 << 24);\n    ql = rintk(d * (2 * M_1_PI) - dqh);\n\n    s = mla(dqh, -PI_A * 0.5, s);\n    s = mla( ql, -PI_A * 0.5, s);\n    s = mla(dqh, -PI_B * 0.5, s);\n    s = mla( ql, -PI_B * 0.5, s);\n    s = mla(dqh, -PI_C * 0.5, s);\n    s = mla( ql, -PI_C * 0.5, s);\n    s = mla(dqh + ql, -PI_D * 0.5, s);\n  } else {\n    ddi_t ddi = rempi(d);\n    ql = ddi.i;\n    s = ddi.dd.x + ddi.dd.y;\n    if (xisinf(d) || xisnan(d)) s = SLEEF_NAN;\n  }  \n\n  t = s;\n\n  s = s * s;\n  \n  u = 1.58938307283228937328511e-10;\n  u = mla(u, s, -2.50506943502539773349318e-08);\n  u = mla(u, s, 2.75573131776846360512547e-06);\n  u = mla(u, s, -0.000198412698278911770864914);\n  u = mla(u, s, 0.0083333333333191845961746);\n  u = mla(u, s, -0.166666666666666130709393);\n  u = u * s * t;\n\n  r.x = t + u;\n\n  if (xisnegzero(d)) r.x = -0.0;\n\n  u = -1.13615350239097429531523e-11;\n  u = mla(u, s, 2.08757471207040055479366e-09);\n  u = mla(u, s, 
-2.75573144028847567498567e-07);\n  u = mla(u, s, 2.48015872890001867311915e-05);\n  u = mla(u, s, -0.00138888888888714019282329);\n  u = mla(u, s, 0.0416666666666665519592062);\n  u = mla(u, s, -0.5);\n\n  r.y = u * s + 1;\n\n  if ((ql & 1) != 0) { s = r.y; r.y = r.x; r.x = s; }\n  if ((ql & 2) != 0) { r.x = -r.x; }\n  if (((ql+1) & 2) != 0) { r.y = -r.y; }\n\n  return r;\n}\n\nEXPORT CONST Sleef_double2 xsincos_u1(double d) {\n  double u;\n  Sleef_double2 r, s, t, x;\n  int ql;\n  \n  if (fabsk(d) < TRIGRANGEMAX2) {\n    ql = rintk(d * (2 * M_1_PI));\n    u = mla(ql, -PI_A2*0.5, d);\n    s = ddadd_d2_d_d (u,  ql * (-PI_B2*0.5));\n  } else if (fabsk(d) < TRIGRANGEMAX) {\n    const double dqh = trunck(d * ((2 * M_1_PI) / (1 << 24))) * (double)(1 << 24);\n    ql = rintk(d * (2 * M_1_PI) - dqh);\n\n    u = mla(dqh, -PI_A*0.5, d);\n    s = ddadd_d2_d_d(u, ql * (-PI_A*0.5));\n    s = ddadd2_d2_d2_d(s, dqh * (-PI_B*0.5));\n    s = ddadd2_d2_d2_d(s, ql * (-PI_B*0.5));\n    s = ddadd2_d2_d2_d(s, dqh * (-PI_C*0.5));\n    s = ddadd2_d2_d2_d(s, ql * (-PI_C*0.5));\n    s = ddadd_d2_d2_d(s, (dqh + ql) * (-PI_D*0.5));\n  } else {\n    ddi_t ddi = rempi(d);\n    ql = ddi.i;\n    s = ddi.dd;\n    if (xisinf(d) || xisnan(d)) s = dd(SLEEF_NAN, SLEEF_NAN);\n  }\n  \n  t = s;\n\n  s.x = ddsqu_d_d2(s);\n  \n  u = 1.58938307283228937328511e-10;\n  u = mla(u, s.x, -2.50506943502539773349318e-08);\n  u = mla(u, s.x, 2.75573131776846360512547e-06);\n  u = mla(u, s.x, -0.000198412698278911770864914);\n  u = mla(u, s.x, 0.0083333333333191845961746);\n  u = mla(u, s.x, -0.166666666666666130709393);\n\n  u *= s.x * t.x;\n\n  x = ddadd_d2_d2_d(t, u);\n  r.x = x.x + x.y;\n  \n  if (xisnegzero(d)) r.x = -0.0;\n\n  u = -1.13615350239097429531523e-11;\n  u = mla(u, s.x, 2.08757471207040055479366e-09);\n  u = mla(u, s.x, -2.75573144028847567498567e-07);\n  u = mla(u, s.x, 2.48015872890001867311915e-05);\n  u = mla(u, s.x, -0.00138888888888714019282329);\n  u = mla(u, s.x, 
0.0416666666666665519592062);\n  u = mla(u, s.x, -0.5);\n\n  x = ddadd_d2_d_d2(1, ddmul_d2_d_d(s.x, u));\n  r.y = x.x + x.y;\n  \n  if ((ql & 1) != 0) { u = r.y; r.y = r.x; r.x = u; }\n  if ((ql & 2) != 0) { r.x = -r.x; }\n  if (((ql+1) & 2) != 0) { r.y = -r.y; }\n\n  return r;\n}\n\nEXPORT CONST Sleef_double2 xsincospi_u05(double d) {\n  double u, s, t;\n  Sleef_double2 r, x, s2;\n\n  u = d * 4;\n  int q = ceilk(u) & ~(int)1;\n  \n  s = u - (double)q;\n  t = s;\n  s = s * s;\n  s2 = ddmul_d2_d_d(t, t);\n  \n  //\n  \n  u = -2.02461120785182399295868e-14;\n  u = mla(u, s, 6.94821830580179461327784e-12);\n  u = mla(u, s, -1.75724749952853179952664e-09);\n  u = mla(u, s, 3.13361688966868392878422e-07);\n  u = mla(u, s, -3.6576204182161551920361e-05);\n  u = mla(u, s, 0.00249039457019271850274356);\n  x = ddadd2_d2_d_d2(u * s, dd(-0.0807455121882807852484731, 3.61852475067037104849987e-18));\n  x = ddadd2_d2_d2_d2(ddmul_d2_d2_d2(s2, x), dd(0.785398163397448278999491, 3.06287113727155002607105e-17));\n\n  x = ddmul_d2_d2_d(x, t);\n  r.x = x.x + x.y;\n  \n  if (xisnegzero(d)) r.x = -0.0;\n  \n  //\n\n  u = 9.94480387626843774090208e-16;\n  u = mla(u, s, -3.89796226062932799164047e-13);\n  u = mla(u, s, 1.15011582539996035266901e-10);\n  u = mla(u, s, -2.4611369501044697495359e-08);\n  u = mla(u, s, 3.59086044859052754005062e-06);\n  u = mla(u, s, -0.000325991886927389905997954);\n  x = ddadd2_d2_d_d2(u * s, dd(0.0158543442438155018914259, -1.04693272280631521908845e-18));\n  x = ddadd2_d2_d2_d2(ddmul_d2_d2_d2(s2, x), dd(-0.308425137534042437259529, -1.95698492133633550338345e-17));\n\n  x = ddadd2_d2_d2_d(ddmul_d2_d2_d2(x, s2), 1);\n  r.y = x.x + x.y;\n  \n  //\n\n  if ((q & 2) != 0) { s = r.y; r.y = r.x; r.x = s; }\n  if ((q & 4) != 0) { r.x = -r.x; }\n  if (((q+2) & 4) != 0) { r.y = -r.y; }\n\n  if (fabsk(d) > TRIGRANGEMAX3/4) { r.x = 0; r.y = 1; }\n  if (xisinf(d)) { r.x = r.y = SLEEF_NAN; }\n\n  return r;\n}\n\nEXPORT CONST Sleef_double2 xsincospi_u35(double d) {\n  
double u, s, t;\n  Sleef_double2 r;\n\n  u = d * 4;\n  int q = ceilk(u) & ~(int)1;\n  \n  s = u - (double)q;\n  t = s;\n  s = s * s;\n  \n  //\n  \n  u = +0.6880638894766060136e-11;\n  u = mla(u, s, -0.1757159564542310199e-8);\n  u = mla(u, s, +0.3133616327257867311e-6);\n  u = mla(u, s, -0.3657620416388486452e-4);\n  u = mla(u, s, +0.2490394570189932103e-2);\n  u = mla(u, s, -0.8074551218828056320e-1);\n  u = mla(u, s, +0.7853981633974482790e+0);\n  \n  r.x = u * t;\n\n  //\n\n  u = -0.3860141213683794352e-12;\n  u = mla(u, s, +0.1150057888029681415e-9);\n  u = mla(u, s, -0.2461136493006663553e-7);\n  u = mla(u, s, +0.3590860446623516713e-5);\n  u = mla(u, s, -0.3259918869269435942e-3);\n  u = mla(u, s, +0.1585434424381541169e-1);\n  u = mla(u, s, -0.3084251375340424373e+0);\n  u = mla(u, s, 1);\n\n  r.y = u;\n\n  //\n  \n  if ((q & 2) != 0) { s = r.y; r.y = r.x; r.x = s; }\n  if ((q & 4) != 0) { r.x = -r.x; }\n  if (((q+2) & 4) != 0) { r.y = -r.y; }\n\n  if (fabsk(d) > TRIGRANGEMAX3/4) { r.x = 0; r.y = 1; }\n  if (xisinf(d)) { r.x = r.y = SLEEF_NAN; }\n\n  return r;\n}\n\nstatic INLINE CONST Sleef_double2 sinpik(double d) {\n  double u, s, t;\n  Sleef_double2 x, s2;\n\n  u = d * 4;\n  int q = ceilk(u) & ~1;\n  int o = (q & 2) != 0;\n  \n  s = u - (double)q;\n  t = s;\n  s = s * s;\n  s2 = ddmul_d2_d_d(t, t);\n  \n  //\n  \n  u = o ? 9.94480387626843774090208e-16 : -2.02461120785182399295868e-14;\n  u = mla(u, s, o ? -3.89796226062932799164047e-13 : 6.94821830580179461327784e-12);\n  u = mla(u, s, o ? 1.15011582539996035266901e-10 : -1.75724749952853179952664e-09);\n  u = mla(u, s, o ? -2.4611369501044697495359e-08 : 3.13361688966868392878422e-07);\n  u = mla(u, s, o ? 3.59086044859052754005062e-06 : -3.6576204182161551920361e-05);\n  u = mla(u, s, o ? -0.000325991886927389905997954 : 0.00249039457019271850274356);\n  x = ddadd2_d2_d_d2(u * s, o ? 
dd(0.0158543442438155018914259, -1.04693272280631521908845e-18) :\n\t\t     dd(-0.0807455121882807852484731, 3.61852475067037104849987e-18));\n  x = ddadd2_d2_d2_d2(ddmul_d2_d2_d2(s2, x), o ? dd(-0.308425137534042437259529, -1.95698492133633550338345e-17) :\n\t\t      dd(0.785398163397448278999491, 3.06287113727155002607105e-17));\n\n  x = ddmul_d2_d2_d2(x, o ? s2 : dd(t, 0));\n  x = o ? ddadd2_d2_d2_d(x, 1) : x;\n  \n  //\n\n  if ((q & 4) != 0) { x.x = -x.x; x.y = -x.y; }\n\n  return x;\n}\n\nEXPORT CONST double xsinpi_u05(double d) {\n  Sleef_double2 x = sinpik(d);\n  double r = x.x + x.y;\n\n  if (xisnegzero(d)) r = -0.0;\n  if (fabsk(d) > TRIGRANGEMAX3/4) r = 0; \n  if (xisinf(d)) r = SLEEF_NAN;\n\n  return r;\n}\n\nstatic INLINE CONST Sleef_double2 cospik(double d) {\n  double u, s, t;\n  Sleef_double2 x, s2;\n\n  u = d * 4;\n  int q = ceilk(u) & ~1;\n  int o = (q & 2) == 0;\n  \n  s = u - (double)q;\n  t = s;\n  s = s * s;\n  s2 = ddmul_d2_d_d(t, t);\n  \n  //\n  \n  u = o ? 9.94480387626843774090208e-16 : -2.02461120785182399295868e-14;\n  u = mla(u, s, o ? -3.89796226062932799164047e-13 : 6.94821830580179461327784e-12);\n  u = mla(u, s, o ? 1.15011582539996035266901e-10 : -1.75724749952853179952664e-09);\n  u = mla(u, s, o ? -2.4611369501044697495359e-08 : 3.13361688966868392878422e-07);\n  u = mla(u, s, o ? 3.59086044859052754005062e-06 : -3.6576204182161551920361e-05);\n  u = mla(u, s, o ? -0.000325991886927389905997954 : 0.00249039457019271850274356);\n  x = ddadd2_d2_d_d2(u * s, o ? dd(0.0158543442438155018914259, -1.04693272280631521908845e-18) :\n\t\t     dd(-0.0807455121882807852484731, 3.61852475067037104849987e-18));\n  x = ddadd2_d2_d2_d2(ddmul_d2_d2_d2(s2, x), o ? dd(-0.308425137534042437259529, -1.95698492133633550338345e-17) :\n\t\t      dd(0.785398163397448278999491, 3.06287113727155002607105e-17));\n\n  x = ddmul_d2_d2_d2(x, o ? s2 : dd(t, 0));\n  x = o ? 
ddadd2_d2_d2_d(x, 1) : x;\n  \n  //\n\n  if (((q+2) & 4) != 0) { x.x = -x.x; x.y = -x.y; }\n\n  return x;\n}\n\nEXPORT CONST double xcospi_u05(double d) {\n  Sleef_double2 x = cospik(d);\n  double r = x.x + x.y;\n\n  if (fabsk(d) > TRIGRANGEMAX3/4) r = 1; \n  if (xisinf(d)) r = SLEEF_NAN;\n\n  return r;\n}\n\nEXPORT CONST double xtan(double d) {\n  double u, s, x, y;\n  int ql;\n\n  if (fabsk(d) < TRIGRANGEMAX2) {\n    ql = rintk(d * (2 * M_1_PI));\n    x = mla(ql, -PI_A2*0.5, d);\n    x = mla(ql, -PI_B2*0.5, x);\n  } else if (fabsk(d) < 1e+6) {\n    double dqh = trunck(d * ((2 * M_1_PI) / (1 << 24))) * (double)(1 << 24);\n    ql = rintk(d * (2 * M_1_PI) - dqh);\n\n    x = mla(dqh, -PI_A * 0.5, d);\n    x = mla( ql, -PI_A * 0.5, x);\n    x = mla(dqh, -PI_B * 0.5, x);\n    x = mla( ql, -PI_B * 0.5, x);\n    x = mla(dqh, -PI_C * 0.5, x);\n    x = mla( ql, -PI_C * 0.5, x);\n    x = mla(dqh + ql, -PI_D * 0.5, x);\n  } else {\n    ddi_t ddi = rempi(d);\n    ql = ddi.i;\n    x = ddi.dd.x + ddi.dd.y;\n    if (xisinf(d) || xisnan(d)) x = SLEEF_NAN;\n  }\n  \n  x *= 0.5;\n  s = x * x;\n\n  double s2 = s * s, s4 = s2 * s2;\n  u = POLY8(s, s2, s4,\n\t    +0.3245098826639276316e-3,\n\t    +0.5619219738114323735e-3,\n\t    +0.1460781502402784494e-2,\n\t    +0.3591611540792499519e-2,\n\t    +0.8863268409563113126e-2,\n\t    +0.2186948728185535498e-1,\n\t    +0.5396825399517272970e-1,\n\t    +0.1333333333330500581e+0);\n\n  u = mla(u, s, +0.3333333333333343695e+0);\n  u = mla(s, u * x, x);\n\n  y = mla(u, u, -1);\n  x = -2 * u;\n\n  if ((ql & 1) != 0) { double t = x; x = y; y = -t; }\n\n  u = x / y;\n\n  return u;\n}\n\nEXPORT CONST double xtan_u1(double d) {\n  double u;\n  Sleef_double2 s, t, x, y;\n  int ql;\n  \n  if (fabsk(d) < TRIGRANGEMAX2) {\n    ql = rintk(d * (2 * M_1_PI));\n    u = mla(ql, -PI_A2*0.5, d);\n    s = ddadd_d2_d_d(u,  ql * (-PI_B2*0.5));\n  } else if (fabsk(d) < TRIGRANGEMAX) {\n    const double dqh = trunck(d * (M_2_PI / (1 << 24))) * (double)(1 << 24);\n 
   s = ddadd2_d2_d2_d(ddmul_d2_d2_d(dd(M_2_PI_H, M_2_PI_L), d), (d < 0 ? -0.5 : 0.5) - dqh);\n    ql = s.x + s.y;\n\n    u = mla(dqh, -PI_A*0.5, d);\n    s = ddadd_d2_d_d  (u,  ql * (-PI_A*0.5));\n    s = ddadd2_d2_d2_d(s, dqh * (-PI_B*0.5));\n    s = ddadd2_d2_d2_d(s,  ql * (-PI_B*0.5));\n    s = ddadd2_d2_d2_d(s, dqh * (-PI_C*0.5));\n    s = ddadd2_d2_d2_d(s,  ql * (-PI_C*0.5));\n    s = ddadd_d2_d2_d(s, (dqh + ql) * (-PI_D*0.5));\n  } else {\n    ddi_t ddi = rempi(d);\n    ql = ddi.i;\n    s = ddi.dd;\n    if (xisinf(d) || xisnan(d)) s.x = SLEEF_NAN;\n  }\n  \n  t = ddscale_d2_d2_d(s, 0.5);\n  s = ddsqu_d2_d2(t);\n\n  double s2 = s.x * s.x, s4 = s2 * s2;\n  u = POLY8(s.x, s2, s4,\n\t    +0.3245098826639276316e-3,\n\t    +0.5619219738114323735e-3,\n\t    +0.1460781502402784494e-2,\n\t    +0.3591611540792499519e-2,\n\t    +0.8863268409563113126e-2,\n\t    +0.2186948728185535498e-1,\n\t    +0.5396825399517272970e-1,\n\t    +0.1333333333330500581e+0);\n\n  u = mla(u, s.x, +0.3333333333333343695e+0);\n  x = ddadd_d2_d2_d2(t, ddmul_d2_d2_d(ddmul_d2_d2_d2(s, t), u));\n\n  y = ddadd_d2_d_d2(-1, ddsqu_d2_d2(x));\n  x = ddscale_d2_d2_d(x, -2);\n\n  if ((ql & 1) != 0) { t = x; x = y; y = ddneg_d2_d2(t); }\n\n  x = dddiv_d2_d2_d2(x, y);\n\n  u = x.x + x.y;\n\n  if (xisnegzero(d)) u = d;\n  \n  return u;\n}\n\nEXPORT CONST double xlog(double d) {\n  double x, x2, t, m;\n  int e;\n\n  int o = d < DBL_MIN;\n  if (o) d *= (double)(INT64_C(1) << 32) * (double)(INT64_C(1) << 32);\n  \n  e = ilogb2k(d * (1.0/0.75));\n  m = ldexp3k(d, -e);\n\n  if (o) e -= 64;\n  \n  x = (m-1) / (m+1);\n  x2 = x * x;\n\n  double x4 = x2 * x2, x8 = x4 * x4;\n\n  t = POLY7(x2, x4, x8,\n\t    0.153487338491425068243146,\n\t    0.152519917006351951593857,\n\t    0.181863266251982985677316,\n\t    0.222221366518767365905163,\n\t    0.285714294746548025383248,\n\t    0.399999999950799600689777,\n\t    0.6666666666667778740063);\n\n  x = x * 2 + 0.693147180559945286226764 * e + x * x2 * t;\n  \n  if 
(xisinf(d)) x = SLEEF_INFINITY;\n  if (d < 0 || xisnan(d)) x = SLEEF_NAN;\n  if (d == 0) x = -SLEEF_INFINITY;\n\n  return x;\n}\n\nEXPORT CONST double xexp(double d) {\n  int q = (int)rintk(d * R_LN2);\n  double s, u;\n\n  s = mla(q, -L2U, d);\n  s = mla(q, -L2L, s);\n\n  double s2 = s * s, s4 = s2 * s2, s8 = s4 * s4;\n  u = POLY10(s, s2, s4, s8,\n\t     2.08860621107283687536341e-09,\n\t     2.51112930892876518610661e-08,\n\t     2.75573911234900471893338e-07,\n\t     2.75572362911928827629423e-06,\n\t     2.4801587159235472998791e-05,\n\t     0.000198412698960509205564975,\n\t     0.00138888888889774492207962,\n\t     0.00833333333331652721664984,\n\t     0.0416666666666665047591422,\n\t     0.166666666666666851703837);\n  u = mla(u, s, +0.5);\n\n  u = s * s * u + s + 1;\n  u = ldexp2k(u, q);\n\n  if (d > 709.78271114955742909217217426) u = SLEEF_INFINITY;\n  if (d < -1000) u = 0;\n  \n  return u;\n}\n\nstatic INLINE CONST double expm1k(double d) {\n  int q = (int)rintk(d * R_LN2);\n  double s, u;\n\n  s = mla(q, -L2U, d);\n  s = mla(q, -L2L, s);\n  \n  double s2 = s * s, s4 = s2 * s2, s8 = s4 * s4;\n  u = POLY10(s, s2, s4, s8,\n\t     2.08860621107283687536341e-09,\n\t     2.51112930892876518610661e-08,\n\t     2.75573911234900471893338e-07,\n\t     2.75572362911928827629423e-06,\n\t     2.4801587159235472998791e-05,\n\t     0.000198412698960509205564975,\n\t     0.00138888888889774492207962,\n\t     0.00833333333331652721664984,\n\t     0.0416666666666665047591422,\n\t     0.166666666666666851703837);\n\n  u = mla(s2, 0.5, s2 * s * u) + s;\n\n  if (q != 0) u = ldexp2k(u + 1, q) - 1;\n  \n  return u;\n}\n\nstatic INLINE CONST Sleef_double2 logk(double d) {\n  Sleef_double2 x, x2, s;\n  double m, t;\n  int e;\n\n  int o = d < DBL_MIN;\n  if (o) d *= (double)(INT64_C(1) << 32) * (double)(INT64_C(1) << 32);\n  \n  e = ilogb2k(d * (1.0/0.75));\n  m = ldexp3k(d, -e);\n\n  if (o) e -= 64;\n  \n  x = dddiv_d2_d2_d2(ddadd2_d2_d_d(-1, m), ddadd2_d2_d_d(1, m));\n  x2 = 
ddsqu_d2_d2(x);\n\n  double x4 = x2.x * x2.x, x8 = x4 * x4, x16 = x8 * x8;\n  t = POLY9(x2.x, x4, x8, x16,\n\t    0.116255524079935043668677,\n\t    0.103239680901072952701192,\n\t    0.117754809412463995466069,\n\t    0.13332981086846273921509,\n\t    0.153846227114512262845736,\n\t    0.181818180850050775676507,\n\t    0.222222222230083560345903,\n\t    0.285714285714249172087875,\n\t    0.400000000000000077715612);\n\n  Sleef_double2 c = dd(0.666666666666666629659233, 3.80554962542412056336616e-17);\n  s = ddmul_d2_d2_d(dd(0.693147180559945286226764, 2.319046813846299558417771e-17), e);\n  s = ddadd_d2_d2_d2(s, ddscale_d2_d2_d(x, 2));\n  x = ddmul_d2_d2_d2(x2, x);\n  s = ddadd_d2_d2_d2(s, ddmul_d2_d2_d2(x, c));\n  x = ddmul_d2_d2_d2(x2, x);\n  s = ddadd_d2_d2_d2(s, ddmul_d2_d2_d(x, t));\n\n  return s;\n}\n\nEXPORT CONST double xlog_u1(double d) {\n  Sleef_double2 x, s;\n  double m, t, x2;\n  int e;\n\n  int o = d < DBL_MIN;\n  if (o) d *= (double)(INT64_C(1) << 32) * (double)(INT64_C(1) << 32);\n      \n  e = ilogb2k(d * (1.0/0.75));\n  m = ldexp3k(d, -e);\n\n  if (o) e -= 64;\n  \n  x = dddiv_d2_d2_d2(ddadd2_d2_d_d(-1, m), ddadd2_d2_d_d(1, m));\n  x2 = x.x * x.x;\n\n  double x4 = x2 * x2, x8 = x4 * x4;\n  t = POLY7(x2, x4, x8,\n\t    0.1532076988502701353e+0,\n\t    0.1525629051003428716e+0,\n\t    0.1818605932937785996e+0,\n\t    0.2222214519839380009e+0,\n\t    0.2857142932794299317e+0,\n\t    0.3999999999635251990e+0,\n\t    0.6666666666667333541e+0);\n\n  s = ddmul_d2_d2_d(dd(0.693147180559945286226764, 2.319046813846299558417771e-17), (double)e);\n  s = ddadd_d2_d2_d2(s, ddscale_d2_d2_d(x, 2));\n  s = ddadd_d2_d2_d(s, x2 * x.x * t);\n\n  double r = s.x + s.y;\n  \n  if (xisinf(d)) r = SLEEF_INFINITY;\n  if (d < 0 || xisnan(d)) r = SLEEF_NAN;\n  if (d == 0) r = -SLEEF_INFINITY;\n\n  return r;\n}\n\nstatic INLINE CONST double expk(Sleef_double2 d) {\n  int q = (int)rintk((d.x + d.y) * R_LN2);\n  Sleef_double2 s, t;\n  double u;\n\n  s = ddadd2_d2_d2_d(d, q * 
-L2U);\n  s = ddadd2_d2_d2_d(s, q * -L2L);\n\n  s = ddnormalize_d2_d2(s);\n\n  double s2 = s.x * s.x, s4 = s2 * s2, s8 = s4 * s4;\n  u = POLY10(s.x, s2, s4, s8,\n\t     2.51069683420950419527139e-08,\n\t     2.76286166770270649116855e-07,\n\t     2.75572496725023574143864e-06,\n\t     2.48014973989819794114153e-05,\n\t     0.000198412698809069797676111,\n\t     0.0013888888939977128960529,\n\t     0.00833333333332371417601081,\n\t     0.0416666666665409524128449,\n\t     0.166666666666666740681535,\n\t     0.500000000000000999200722);\n\n  t = ddadd_d2_d_d2(1, s);\n  t = ddadd_d2_d2_d2(t, ddmul_d2_d2_d(ddsqu_d2_d2(s), u));\n\n  u = ldexpk(t.x + t.y, q);\n\n  if (d.x < -1000) u = 0;\n\n  return u;\n}\n\nEXPORT CONST double xpow(double x, double y) {\n  int yisint = xisint(y);\n  int yisodd = yisint && xisodd(y);\n\n  Sleef_double2 d = ddmul_d2_d2_d(logk(fabsk(x)), y);\n  double result = expk(d);\n  if (d.x > 709.78271114955742909217217426) result = SLEEF_INFINITY;\n\n  result = xisnan(result) ? SLEEF_INFINITY : result;\n  result *= (x > 0 ? 1 : (!yisint ? SLEEF_NAN : (yisodd ? -1 : 1)));\n\n  double efx = mulsign(fabsk(x) - 1, y);\n  if (xisinf(y)) result = efx < 0 ? 0.0 : (efx == 0 ? 1.0 : SLEEF_INFINITY);\n  if (xisinf(x) || x == 0) result = (yisodd ? sign(x) : 1) * ((x == 0 ? -y : y) < 0 ? 
0 : SLEEF_INFINITY);\n  if (xisnan(x) || xisnan(y)) result = SLEEF_NAN;\n  if (y == 0 || x == 1) result = 1;\n\n  return result;\n}\n\nstatic INLINE CONST Sleef_double2 expk2(Sleef_double2 d) {\n  int q = (int)rintk((d.x + d.y) * R_LN2);\n  Sleef_double2 s, t;\n  double u;\n\n  s = ddadd2_d2_d2_d(d, q * -L2U);\n  s = ddadd2_d2_d2_d(s, q * -L2L);\n\n  u = +0.1602472219709932072e-9;\n  u = mla(u, s.x, +0.2092255183563157007e-8);\n  u = mla(u, s.x, +0.2505230023782644465e-7);\n  u = mla(u, s.x, +0.2755724800902135303e-6);\n  u = mla(u, s.x, +0.2755731892386044373e-5);\n  u = mla(u, s.x, +0.2480158735605815065e-4);\n  u = mla(u, s.x, +0.1984126984148071858e-3);\n  u = mla(u, s.x, +0.1388888888886763255e-2);\n  u = mla(u, s.x, +0.8333333333333347095e-2);\n  u = mla(u, s.x, +0.4166666666666669905e-1);\n\n  t = ddadd2_d2_d2_d(ddmul_d2_d2_d(s, u), +0.1666666666666666574e+0);\n  t = ddadd2_d2_d2_d(ddmul_d2_d2_d2(s, t), 0.5);\n  t = ddadd2_d2_d2_d2(s, ddmul_d2_d2_d2(ddsqu_d2_d2(s), t));\n\n  t = ddadd2_d2_d_d2(1, t);\n\n  t.x = ldexp2k(t.x, q);\n  t.y = ldexp2k(t.y, q);\n\n  return d.x < -1000 ? dd(0, 0) : t;\n}\n\nEXPORT CONST double xsinh(double x) {\n  double y = fabsk(x);\n  Sleef_double2 d = expk2(dd(y, 0));\n  d = ddsub_d2_d2_d2(d, ddrec_d2_d2(d));\n  y = (d.x + d.y) * 0.5;\n\n  y = fabsk(x) > 710 ? SLEEF_INFINITY : y;\n  y = xisnan(y) ? SLEEF_INFINITY : y;\n  y = mulsign(y, x);\n  y = xisnan(x) ? SLEEF_NAN : y;\n\n  return y;\n}\n\nEXPORT CONST double xcosh(double x) {\n  double y = fabsk(x);\n  Sleef_double2 d = expk2(dd(y, 0));\n  d = ddadd_d2_d2_d2(d, ddrec_d2_d2(d));\n  y = (d.x + d.y) * 0.5;\n\n  y = fabsk(x) > 710 ? SLEEF_INFINITY : y;\n  y = xisnan(y) ? SLEEF_INFINITY : y;\n  y = xisnan(x) ? 
SLEEF_NAN : y;\n\n  return y;\n}\n\nEXPORT CONST double xtanh(double x) {\n  double y = fabsk(x);\n  Sleef_double2 d = expk2(dd(y, 0));\n  Sleef_double2 e = ddrec_d2_d2(d);\n  d = dddiv_d2_d2_d2(ddsub_d2_d2_d2(d, e), ddadd_d2_d2_d2(d, e));\n  y = d.x + d.y;\n\n  y = fabsk(x) > 18.714973875 ? 1.0 : y;\n  y = xisnan(y) ? 1.0 : y;\n  y = mulsign(y, x);\n  y = xisnan(x) ? SLEEF_NAN : y;\n\n  return y;\n}\n\nEXPORT CONST double xsinh_u35(double x) {\n  double e = expm1k(fabsk(x));\n  double y = (e + 2) / (e + 1) * (0.5 * e);\n\n  y = fabsk(x) > 709 ? SLEEF_INFINITY : y;\n  y = xisnan(y) ? SLEEF_INFINITY : y;\n  y = mulsign(y, x);\n  y = xisnan(x) ? SLEEF_NAN : y;\n\n  return y;\n}\n\nEXPORT CONST double xcosh_u35(double x) {\n  double e = xexp(fabsk(x));\n  double y = 0.5 / e + 0.5 * e;\n\n  y = fabsk(x) > 709 ? SLEEF_INFINITY : y;\n  y = xisnan(y) ? SLEEF_INFINITY : y;\n  y = xisnan(x) ? SLEEF_NAN : y;\n\n  return y;\n}\n\nEXPORT CONST double xtanh_u35(double x) {\n  double y = fabsk(x);\n  double d = expm1k(2*y);\n  y = d / (d + 2);\n\n  y = fabsk(x) > 18.714973875 ? 1.0 : y;\n  y = xisnan(y) ? 1.0 : y;\n  y = mulsign(y, x);\n  y = xisnan(x) ? 
SLEEF_NAN : y;\n\n  return y;\n}\n\nstatic INLINE CONST Sleef_double2 logk2(Sleef_double2 d) {\n  Sleef_double2 x, x2, m, s;\n  double t;\n  int e;\n  \n  e = ilogbk(d.x * (1.0/0.75));\n\n  m.x = ldexp2k(d.x, -e);\n  m.y = ldexp2k(d.y, -e);\n\n  x = dddiv_d2_d2_d2(ddadd2_d2_d2_d(m, -1), ddadd2_d2_d2_d(m, 1));\n  x2 = ddsqu_d2_d2(x);\n\n  double x4 = x2.x * x2.x, x8 = x4 * x4;\n  t = POLY7(x2.x, x4, x8,\n\t    0.13860436390467167910856,\n\t    0.131699838841615374240845,\n\t    0.153914168346271945653214,\n\t    0.181816523941564611721589,\n\t    0.22222224632662035403996,\n\t    0.285714285511134091777308,\n\t    0.400000000000914013309483);\n  t = mla(t, x2.x, 0.666666666666664853302393);\n\n  s = ddmul_d2_d2_d(dd(0.693147180559945286226764, 2.319046813846299558417771e-17), e);\n  s = ddadd_d2_d2_d2(s, ddscale_d2_d2_d(x, 2));\n  s = ddadd_d2_d2_d2(s, ddmul_d2_d2_d(ddmul_d2_d2_d2(x2, x), t));\n\n  return s;\n}\n\nEXPORT CONST double xasinh(double x) {\n  double y = fabsk(x);\n  Sleef_double2 d;\n\n  d = y > 1 ? ddrec_d2_d(x) : dd(y, 0);\n  d = ddsqrt_d2_d2(ddadd2_d2_d2_d(ddsqu_d2_d2(d), 1));\n  d = y > 1 ? ddmul_d2_d2_d(d, y) : d;\n  \n  d = logk2(ddnormalize_d2_d2(ddadd_d2_d2_d(d, x)));\n  y = d.x + d.y;\n\n  y = (fabsk(x) > SQRT_DBL_MAX || xisnan(y)) ? mulsign(SLEEF_INFINITY, x) : y;\n  y = xisnan(x) ? SLEEF_NAN : y;\n  y = xisnegzero(x) ? -0.0 : y;\n  \n  return y;\n}\n\nEXPORT CONST double xacosh(double x) {\n  Sleef_double2 d = logk2(ddadd2_d2_d2_d(ddmul_d2_d2_d2(ddsqrt_d2_d2(ddadd2_d2_d_d(x, 1)), ddsqrt_d2_d2(ddadd2_d2_d_d(x, -1))), x));\n  double y = d.x + d.y;\n\n  y = (x > SQRT_DBL_MAX || xisnan(y)) ? SLEEF_INFINITY : y;\n  y = x == 1.0 ? 0.0 : y;\n  y = x < 1.0 ? SLEEF_NAN : y;\n  y = xisnan(x) ? SLEEF_NAN : y;\n\n  return y;\n}\n\nEXPORT CONST double xatanh(double x) {\n  double y = fabsk(x);\n  Sleef_double2 d = logk2(dddiv_d2_d2_d2(ddadd2_d2_d_d(1, y), ddadd2_d2_d_d(1, -y)));\n  y = y > 1.0 ? SLEEF_NAN : (y == 1.0 ? 
SLEEF_INFINITY : (d.x + d.y) * 0.5);\n\n  y = mulsign(y, x);\n  y = (xisinf(x) || xisnan(y)) ? SLEEF_NAN : y;\n\n  return y;\n}\n\n//\n\nEXPORT CONST double xcbrt(double d) { // max error : 2 ulps\n  double x, y, q = 1.0;\n  int e, r;\n\n  e = ilogbk(fabsk(d))+1;\n  d = ldexp2k(d, -e);\n  r = (e + 6144) % 3;\n  q = (r == 1) ? 1.2599210498948731647672106 : q;\n  q = (r == 2) ? 1.5874010519681994747517056 : q;\n  q = ldexp2k(q, (e + 6144) / 3 - 2048);\n\n  q = mulsign(q, d);\n  d = fabsk(d);\n\n  x = -0.640245898480692909870982;\n  x = mla(x, d, 2.96155103020039511818595);\n  x = mla(x, d, -5.73353060922947843636166);\n  x = mla(x, d, 6.03990368989458747961407);\n  x = mla(x, d, -3.85841935510444988821632);\n  x = mla(x, d, 2.2307275302496609725722);\n\n  y = x * x; y = y * y; x -= (d * y - x) * (1.0 / 3.0);\n  y = d * x * x;\n  y = (y - (2.0 / 3.0) * y * (y * x - 1)) * q;\n\n  return y;\n}\n\nEXPORT CONST double xcbrt_u1(double d) {\n  double x, y, z;\n  Sleef_double2 q2 = dd(1, 0), u, v;\n  int e, r;\n\n  e = ilogbk(fabsk(d))+1;\n  d = ldexp2k(d, -e);\n  r = (e + 6144) % 3;\n  q2 = (r == 1) ? dd(1.2599210498948731907, -2.5899333753005069177e-17) : q2;\n  q2 = (r == 2) ? 
dd(1.5874010519681995834, -1.0869008194197822986e-16) : q2;\n\n  q2.x = mulsign(q2.x, d); q2.y = mulsign(q2.y, d);\n  d = fabsk(d);\n\n  x = -0.640245898480692909870982;\n  x = mla(x, d, 2.96155103020039511818595);\n  x = mla(x, d, -5.73353060922947843636166);\n  x = mla(x, d, 6.03990368989458747961407);\n  x = mla(x, d, -3.85841935510444988821632);\n  x = mla(x, d, 2.2307275302496609725722);\n\n  y = x * x; y = y * y; x -= (d * y - x) * (1.0 / 3.0);\n\n  z = x;\n\n  u = ddmul_d2_d_d(x, x);\n  u = ddmul_d2_d2_d2(u, u);\n  u = ddmul_d2_d2_d(u, d);\n  u = ddadd2_d2_d2_d(u, -x);\n  y = u.x + u.y;\n\n  y = -2.0 / 3.0 * y * z;\n  v = ddadd2_d2_d2_d(ddmul_d2_d_d(z, z), y);\n  v = ddmul_d2_d2_d(v, d);\n  v = ddmul_d2_d2_d2(v, q2);\n  z = ldexp2k(v.x + v.y, (e + 6144) / 3 - 2048);\n\n  if (xisinf(d)) { z = mulsign(SLEEF_INFINITY, q2.x); }\n  if (d == 0) { z = mulsign(0, q2.x); }\n\n  return z;\n}\n\nEXPORT CONST double xexp2(double d) {\n  int q = (int)rintk(d);\n  double s, u;\n\n  s = d - q;\n\n  double s2 = s * s, s4 = s2 * s2, s8 = s4 * s4;\n  u = POLY10(s, s2, s4, s8,\n\t     +0.4434359082926529454e-9,\n\t     +0.7073164598085707425e-8,\n\t     +0.1017819260921760451e-6,\n\t     +0.1321543872511327615e-5,\n\t     +0.1525273353517584730e-4,\n\t     +0.1540353045101147808e-3,\n\t     +0.1333355814670499073e-2,\n\t     +0.9618129107597600536e-2,\n\t     +0.5550410866482046596e-1,\n\t     +0.2402265069591012214e+0);\n  u = mla(u, s, +0.6931471805599452862e+0);\n\n  u = ddnormalize_d2_d2(ddadd_d2_d_d2(1, ddmul_d2_d_d(u, s))).x;\n\n  u = ldexp2k(u, q);\n\n  if (d >= 1024) u = SLEEF_INFINITY;\n  if (d < -2000) u = 0;\n  \n  return u;\n}\n\nEXPORT CONST double xexp2_u35(double d) {\n  int q = (int)rintk(d);\n  double s, u;\n\n  s = d - q;\n\n  u = +0.4434359082926529454e-9;\n  u = mla(u, s, +0.7073164598085707425e-8);\n  u = mla(u, s, +0.1017819260921760451e-6);\n  u = mla(u, s, +0.1321543872511327615e-5);\n  u = mla(u, s, +0.1525273353517584730e-4);\n  u = mla(u, s, 
+0.1540353045101147808e-3);\n  u = mla(u, s, +0.1333355814670499073e-2);\n  u = mla(u, s, +0.9618129107597600536e-2);\n  u = mla(u, s, +0.5550410866482046596e-1);\n  u = mla(u, s, +0.2402265069591012214e+0);\n  u = mla(u, s, +0.6931471805599452862e+0);\n  u = mla(u, s, +0.1000000000000000000e+1);\n\n  u = ldexp2k(u, q);\n\n  if (d >= 1024) u = SLEEF_INFINITY;\n  if (d < -2000) u = 0;\n  \n  return u;\n}\n\nEXPORT CONST double xexp10(double d) {\n  int q = (int)rintk(d * LOG10_2);\n  double s, u;\n  \n  s = mla(q, -L10U, d);\n  s = mla(q, -L10L, s);\n  \n  u = +0.2411463498334267652e-3;\n  u = mla(u, s, +0.1157488415217187375e-2);\n  u = mla(u, s, +0.5013975546789733659e-2);\n  u = mla(u, s, +0.1959762320720533080e-1);\n  u = mla(u, s, +0.6808936399446784138e-1);\n  u = mla(u, s, +0.2069958494722676234e+0);\n  u = mla(u, s, +0.5393829292058536229e+0);\n  u = mla(u, s, +0.1171255148908541655e+1);\n  u = mla(u, s, +0.2034678592293432953e+1);\n  u = mla(u, s, +0.2650949055239205876e+1);\n  u = mla(u, s, +0.2302585092994045901e+1);\n\n  u = ddnormalize_d2_d2(ddadd_d2_d_d2(1, ddmul_d2_d_d(u, s))).x;\n  \n  u = ldexp2k(u, q);\n  \n  if (d > 308.25471555991671) u = SLEEF_INFINITY; // log10(DBL_MAX)\n  if (d < -350) u = 0;\n  \n  return u;\n}\n\nEXPORT CONST double xexp10_u35(double d) {\n  int q = (int)rintk(d * LOG10_2);\n  double s, u;\n  \n  s = mla(q, -L10U, d);\n  s = mla(q, -L10L, s);\n  \n  u = +0.2411463498334267652e-3;\n  u = mla(u, s, +0.1157488415217187375e-2);\n  u = mla(u, s, +0.5013975546789733659e-2);\n  u = mla(u, s, +0.1959762320720533080e-1);\n  u = mla(u, s, +0.6808936399446784138e-1);\n  u = mla(u, s, +0.2069958494722676234e+0);\n  u = mla(u, s, +0.5393829292058536229e+0);\n  u = mla(u, s, +0.1171255148908541655e+1);\n  u = mla(u, s, +0.2034678592293432953e+1);\n  u = mla(u, s, +0.2650949055239205876e+1);\n  u = mla(u, s, +0.2302585092994045901e+1);\n  u = mla(u, s, +0.1000000000000000000e+1);\n  \n  u = ldexp2k(u, q);\n  \n  if (d > 308.25471555991671) 
u = SLEEF_INFINITY;\n  if (d < -350) u = 0;\n  \n  return u;\n}\n\nEXPORT CONST double xexpm1(double a) {\n  Sleef_double2 d = ddadd2_d2_d2_d(expk2(dd(a, 0)), -1.0);\n  double x = d.x + d.y;\n  if (a > 709.782712893383996732223) x = SLEEF_INFINITY; // log(DBL_MAX)\n  if (a < -36.736800569677101399113302437) x = -1; // log(1 - nexttoward(1, 0))\n  if (xisnegzero(a)) x = -0.0;\n  return x;\n}\n\nEXPORT CONST double xlog10(double d) {\n  Sleef_double2 x, s;\n  double m, t, x2;\n  int e;\n\n  int o = d < DBL_MIN;\n  if (o) d *= (double)(INT64_C(1) << 32) * (double)(INT64_C(1) << 32);\n      \n  e = ilogb2k(d * (1.0/0.75));\n  m = ldexp3k(d, -e);\n\n  if (o) e -= 64;\n\n  x = dddiv_d2_d2_d2(ddadd2_d2_d_d(-1, m), ddadd2_d2_d_d(1, m));\n  x2 = x.x * x.x;\n\n  double x4 = x2 * x2, x8 = x4 * x4;\n  t = POLY7(x2, x4, x8,\n\t    +0.6653725819576758460e-1,\n\t    +0.6625722782820833712e-1,\n\t    +0.7898105214313944078e-1,\n\t    +0.9650955035715275132e-1,\n\t    +0.1240841409721444993e+0,\n\t    +0.1737177927454605086e+0,\n\t    +0.2895296546021972617e+0);\n  \n  s = ddmul_d2_d2_d(dd(0.30102999566398119802, -2.803728127785170339e-18), (double)e);\n  s = ddadd_d2_d2_d2(s, ddmul_d2_d2_d2(x, dd(0.86858896380650363334, 1.1430059694096389311e-17)));\n  s = ddadd_d2_d2_d(s, x2 * x.x * t);\n\n  double r = s.x + s.y;\n  \n  if (xisinf(d)) r = SLEEF_INFINITY;\n  if (d < 0 || xisnan(d)) r = SLEEF_NAN;\n  if (d == 0) r = -SLEEF_INFINITY;\n\n  return r;\n}\n\nEXPORT CONST double xlog2(double d) {\n  Sleef_double2 x, s;\n  double m, t, x2;\n  int e;\n\n  int o = d < DBL_MIN;\n  if (o) d *= (double)(INT64_C(1) << 32) * (double)(INT64_C(1) << 32);\n      \n  e = ilogb2k(d * (1.0/0.75));\n  m = ldexp3k(d, -e);\n\n  if (o) e -= 64;\n\n  x = dddiv_d2_d2_d2(ddadd2_d2_d_d(-1, m), ddadd2_d2_d_d(1, m));\n  x2 = x.x * x.x;\n\n  double x4 = x2 * x2, x8 = x4 * x4;\n  t = POLY7(x2, x4, x8,\n\t    +0.2211941750456081490e+0,\n\t    +0.2200768693152277689e+0,\n\t    +0.2623708057488514656e+0,\n\t    
+0.3205977477944495502e+0,\n\t    +0.4121985945485324709e+0,\n\t    +0.5770780162997058982e+0,\n\t    +0.96179669392608091449);\n\n  s = ddadd2_d2_d_d2(e, ddmul_d2_d2_d2(x, dd(2.885390081777926774, 6.0561604995516736434e-18)));\n  s = ddadd2_d2_d2_d(s, x2 * x.x * t);\n  \n  double r = s.x + s.y;\n  \n  if (xisinf(d)) r = SLEEF_INFINITY;\n  if (d < 0 || xisnan(d)) r = SLEEF_NAN;\n  if (d == 0) r = -SLEEF_INFINITY;\n\n  return r;\n}\n\nEXPORT CONST double xlog2_u35(double d) {\n  double m, t, x, x2;\n  int e;\n\n  int o = d < DBL_MIN;\n  if (o) d *= (double)(INT64_C(1) << 32) * (double)(INT64_C(1) << 32);\n      \n  e = ilogb2k(d * (1.0/0.75));\n  m = ldexp3k(d, -e);\n\n  if (o) e -= 64;\n\n  x = (m - 1) / (m + 1);\n  x2 = x * x;\n\n  t = +0.2211941750456081490e+0;\n  t = mla(t, x2, +0.2200768693152277689e+0);\n  t = mla(t, x2, +0.2623708057488514656e+0);\n  t = mla(t, x2, +0.3205977477944495502e+0);\n  t = mla(t, x2, +0.4121985945485324709e+0);\n  t = mla(t, x2, +0.5770780162997058982e+0);\n  t = mla(t, x2, +0.96179669392608091449  );\n\n  Sleef_double2 s = ddadd_d2_d_d2(e, ddmul_d2_d_d(2.885390081777926774, x));\n  double r = mla(t, x * x2, s.x + s.y);\n  \n  if (xisinf(d)) r = SLEEF_INFINITY;\n  if (d < 0 || xisnan(d)) r = SLEEF_NAN;\n  if (d == 0) r = -SLEEF_INFINITY;\n\n  return r;\n}\n\nEXPORT CONST double xlog1p(double d) {\n  Sleef_double2 x, s;\n  double m, t, x2;\n  int e;\n\n  double dp1 = d + 1;\n  \n  int o = dp1 < DBL_MIN;\n  if (o) dp1 *= (double)(INT64_C(1) << 32) * (double)(INT64_C(1) << 32);\n      \n  e = ilogb2k(dp1 * (1.0/0.75));\n\n  t = ldexp3k(1, -e);\n  m = mla(d, t, t - 1);\n  \n  if (o) e -= 64;\n\n  x = dddiv_d2_d2_d2(dd(m, 0), ddadd_d2_d_d(2, m));\n  x2 = x.x * x.x;\n\n  double x4 = x2 * x2, x8 = x4 * x4;\n  t = POLY7(x2, x4, x8,\n\t    0.1532076988502701353e+0,\n\t    0.1525629051003428716e+0,\n\t    0.1818605932937785996e+0,\n\t    0.2222214519839380009e+0,\n\t    0.2857142932794299317e+0,\n\t    0.3999999999635251990e+0,\n\t    
0.6666666666667333541e+0);\n\n  s = ddmul_d2_d2_d(dd(0.693147180559945286226764, 2.319046813846299558417771e-17), (double)e);\n  s = ddadd_d2_d2_d2(s, ddscale_d2_d2_d(x, 2));\n  s = ddadd_d2_d2_d(s, x2 * x.x * t);\n\n  double r = s.x + s.y;\n  \n  if (d > 1e+307) r = SLEEF_INFINITY;\n  if (d < -1 || xisnan(d)) r = SLEEF_NAN;\n  if (d == -1) r = -SLEEF_INFINITY;\n  if (xisnegzero(d)) r = -0.0;\n\n  return r;\n}\n\n//\n\nEXPORT CONST double xfma(double x, double y, double z) {\n  double h2 = x * y + z, q = 1;\n  if (fabsk(h2) < 1e-300) {\n    const double c0 = UINT64_C(1) << 54, c1 = c0 * c0, c2 = c1 * c1;\n    x *= c1;\n    y *= c1;\n    z *= c2;\n    q = 1.0 / c2;\n  }\n  if (fabsk(h2) > 1e+299) {\n    const double c0 = UINT64_C(1) << 54, c1 = c0 * c0, c2 = c1 * c1;\n    x *= 1.0 / c1;\n    y *= 1.0 / c1;\n    z *= 1. / c2;\n    q = c2;\n  }\n  Sleef_double2 d = ddmul_d2_d_d(x, y);\n  d = ddadd2_d2_d2_d(d, z);\n  double ret = (x == 0 || y == 0) ? z : (d.x + d.y);\n  if ((xisinf(z) && !xisinf(x) && !xisnan(x) && !xisinf(y) && !xisnan(y))) h2 = z;\n  return (xisinf(h2) || xisnan(h2)) ? h2 : ret*q;\n}\n\nEXPORT CONST double xsqrt_u05(double d) {\n  double q = 0.5;\n\n  d = d < 0 ? SLEEF_NAN : d;\n  \n  if (d < 8.636168555094445E-78) {\n    d *= 1.157920892373162E77;\n    q = 2.9387358770557188E-39 * 0.5;\n  }\n\n  if (d > 1.3407807929942597e+154) {\n    d *= 7.4583407312002070e-155;\n    q = 1.1579208923731620e+77 * 0.5;\n  }\n  \n  // http://en.wikipedia.org/wiki/Fast_inverse_square_root\n  double x = longBitsToDouble(0x5fe6ec85e7de30da - (doubleToRawLongBits(d + 1e-320) >> 1));\n\n  x = x * (1.5 - 0.5 * d * x * x);\n  x = x * (1.5 - 0.5 * d * x * x);\n  x = x * (1.5 - 0.5 * d * x * x) * d;\n\n  Sleef_double2 d2 = ddmul_d2_d2_d2(ddadd2_d2_d_d2(d, ddmul_d2_d_d(x, x)), ddrec_d2_d(x));\n\n  double ret = (d2.x + d2.y) * q;\n\n  ret = d == SLEEF_INFINITY ? SLEEF_INFINITY : ret;\n  ret = d == 0 ? 
d : ret;\n\n  return ret;\n}\n\nEXPORT CONST double xsqrt_u35(double d) { return xsqrt_u05(d); }\nEXPORT CONST double xsqrt(double d) { return SQRT(d); }\n\nEXPORT CONST double xfabs(double x) { return fabsk(x); }\n\nEXPORT CONST double xcopysign(double x, double y) { return copysignk(x, y); }\n\nEXPORT CONST double xfmax(double x, double y) {\n  return y != y ? x : (x > y ? x : y);\n}\n\nEXPORT CONST double xfmin(double x, double y) {\n  return y != y ? x : (x < y ? x : y);\n}\n\nEXPORT CONST double xfdim(double x, double y) {\n  double ret = x - y;\n  if (ret < 0 || x == y) ret = 0;\n  return ret;\n}\n\nEXPORT CONST double xtrunc(double x) {\n  double fr = x - (double)(INT64_C(1) << 31) * (int32_t)(x * (1.0 / (INT64_C(1) << 31)));\n  fr = fr - (int32_t)fr;\n  return (xisinf(x) || fabsk(x) >= (double)(INT64_C(1) << 52)) ? x : copysignk(x - fr, x);\n}\n\nEXPORT CONST double xfloor(double x) {\n  double fr = x - (double)(INT64_C(1) << 31) * (int32_t)(x * (1.0 / (INT64_C(1) << 31)));\n  fr = fr - (int32_t)fr;\n  fr = fr < 0 ? fr+1.0 : fr;\n  return (xisinf(x) || fabsk(x) >= (double)(INT64_C(1) << 52)) ? x : copysignk(x - fr, x);\n}\n\nEXPORT CONST double xceil(double x) {\n  double fr = x - (double)(INT64_C(1) << 31) * (int32_t)(x * (1.0 / (INT64_C(1) << 31)));\n  fr = fr - (int32_t)fr;\n  fr = fr <= 0 ? fr : fr-1.0;\n  return (xisinf(x) || fabsk(x) >= (double)(INT64_C(1) << 52)) ? x : copysignk(x - fr, x);\n}\n\nEXPORT CONST double xround(double d) {\n  double x = d + 0.5;\n  double fr = x - (double)(INT64_C(1) << 31) * (int32_t)(x * (1.0 / (INT64_C(1) << 31)));\n  fr = fr - (int32_t)fr;\n  if (fr == 0 && x <= 0) x--;\n  fr = fr < 0 ? fr+1.0 : fr;\n  x = d == 0.49999999999999994449 ? 0 : x;  // nextafter(0.5, 0)\n  return (xisinf(d) || fabsk(d) >= (double)(INT64_C(1) << 52)) ? d : copysignk(x - fr, d);\n}\n\nEXPORT CONST double xrint(double d) {\n  double c = mulsign(INT64_C(1) << 52, d);\n  return fabsk(d) > INT64_C(1) << 52 ? 
d : orsign(d + c - c, d);\n}\n\nEXPORT CONST double xhypot_u05(double x, double y) {\n  x = fabsk(x);\n  y = fabsk(y);\n  double min = fmink(x, y), n = min;\n  double max = fmaxk(x, y), d = max;\n\n  if (max < DBL_MIN) { n *= UINT64_C(1) << 54; d *= UINT64_C(1) << 54; }\n  Sleef_double2 t = dddiv_d2_d2_d2(dd(n, 0), dd(d, 0));\n  t = ddmul_d2_d2_d(ddsqrt_d2_d2(ddadd2_d2_d2_d(ddsqu_d2_d2(t), 1)), max);\n  double ret = t.x + t.y;\n  if (xisnan(ret)) ret = SLEEF_INFINITY;\n  if (min == 0) ret = max;\n  if (xisnan(x) || xisnan(y)) ret = SLEEF_NAN;\n  if (x == SLEEF_INFINITY || y == SLEEF_INFINITY) ret = SLEEF_INFINITY;\n  return ret;\n}\n\nEXPORT CONST double xhypot_u35(double x, double y) {\n  x = fabsk(x);\n  y = fabsk(y);\n  double min = fmink(x, y);\n  double max = fmaxk(x, y);\n\n  double t = min / max;\n  double ret = max * SQRT(1 + t*t);\n  if (min == 0) ret = max;\n  if (xisnan(x) || xisnan(y)) ret = SLEEF_NAN;\n  if (x == SLEEF_INFINITY || y == SLEEF_INFINITY) ret = SLEEF_INFINITY;\n  return ret;\n}\n\nEXPORT CONST double xnextafter(double x, double y) {\n  union {\n    double f;\n    int64_t i;\n  } cx;\n\n  x = x == 0 ? 
mulsign(0, y) : x;\n  cx.f = x;\n  int c = (cx.i < 0) == (y < x);\n  if (c) cx.i = -(cx.i ^ (UINT64_C(1) << 63));\n\n  if (x != y) cx.i--;\n\n  if (c) cx.i = -(cx.i ^ (UINT64_C(1) << 63));\n\n  if (cx.f == 0 && x != 0) cx.f = mulsign(0, x);\n  if (x == 0 && y == 0) cx.f = y;\n  if (xisnan(x) || xisnan(y)) cx.f = SLEEF_NAN;\n  \n  return cx.f;\n}\n\nEXPORT CONST double xfrfrexp(double x) {\n  union {\n    double f;\n    uint64_t u;\n  } cx;\n\n  if (fabsk(x) < DBL_MIN) x *= (UINT64_C(1) << 63);\n  \n  cx.f = x;\n  cx.u &= ~UINT64_C(0x7ff0000000000000);\n  cx.u |=  UINT64_C(0x3fe0000000000000);\n\n  if (xisinf(x)) cx.f = mulsign(SLEEF_INFINITY, x);\n  if (x == 0) cx.f = x;\n  \n  return cx.f;\n}\n\nEXPORT CONST int xexpfrexp(double x) {\n  union {\n    double f;\n    uint64_t u;\n  } cx;\n\n  int ret = 0;\n  \n  if (fabsk(x) < DBL_MIN) { x *= (UINT64_C(1) << 63); ret = -63; }\n  \n  cx.f = x;\n  ret += (int32_t)(((cx.u >> 52) & 0x7ff)) - 0x3fe;\n\n  if (x == 0 || xisnan(x) || xisinf(x)) ret = 0;\n  \n  return ret;\n}\n\nstatic INLINE CONST double toward0(double d) {\n  return d == 0 ? 0 : longBitsToDouble(doubleToRawLongBits(d)-1);\n}\n\nstatic INLINE CONST double removelsb(double d) {\n  return longBitsToDouble(doubleToRawLongBits(d) & INT64_C(0xfffffffffffffffe));\n}\n\nstatic INLINE CONST double ptrunc(double x) {\n  double fr = mla(-(double)(INT64_C(1) << 31), (int32_t)(x * (1.0 / (INT64_C(1) << 31))), x);\n  return fabsk(x) >= (double)(INT64_C(1) << 52) ? x : (x - (fr - (int32_t)fr));\n}\n\nEXPORT CONST double xfmod(double x, double y) {\n  double n = fabsk(x), d = fabsk(y), s = 1, q;\n  if (d < DBL_MIN) { n *= UINT64_C(1) << 54; d *= UINT64_C(1) << 54; s = 1.0 / (UINT64_C(1) << 54); }\n  Sleef_double2 r = dd(n, 0);\n  double rd = toward0(1.0 / d);\n  \n  for(int i=0;i < 21;i++) { // ceil(log2(DBL_MAX) / 52)\n    q = removelsb(ptrunc(toward0(r.x) * rd));\n    q = (3*d > r.x && r.x > d) ? 2 : q;\n    q = (2*d > r.x && r.x > d) ? 1 : q;\n    q = r.x == d ? 
(r.y >= 0 ? 1 : 0) : q;\n    r = ddnormalize_d2_d2(ddadd2_d2_d2_d2(r, ddmul_d2_d_d(q, -d)));\n    if (r.x < d) break;\n  }\n  \n  double ret = r.x * s;\n  if (r.x + r.y == d) ret = 0;\n  ret = mulsign(ret, x);\n  if (n < d) ret = x;\n  if (d == 0) ret = SLEEF_NAN;\n  \n  return ret;\n}\n\nstatic INLINE CONST double rintk2(double d) {\n  double c = mulsign(INT64_C(1) << 52, d);\n  return fabsk(d) > INT64_C(1) << 52 ? d : orsign(d + c - c, d);\n}\n\nEXPORT CONST double xremainder(double x, double y) {\n  double n = fabsk(x), d = fabsk(y), s = 1, q;\n  if (d < DBL_MIN*2) { n *= UINT64_C(1) << 54; d *= UINT64_C(1) << 54; s = 1.0 / (UINT64_C(1) << 54); }\n  double rd = 1.0 / d;\n  Sleef_double2 r = dd(n, 0);\n  int qisodd = 0;\n  \n  for(int i=0;i < 21;i++) { // ceil(log2(DBL_MAX) / 52)\n    q = removelsb(rintk2(r.x * rd));\n    if (fabsk(r.x) < 1.5 * d) q = r.x < 0 ? -1 : 1;\n    if (fabsk(r.x) < 0.5 * d || (fabsk(r.x) == 0.5 * d && !qisodd)) q = 0;\n    if (q == 0) break;\n    if (xisinf(q * -d)) q = q + mulsign(-1, r.x);\n    qisodd ^= xisodd(q);\n    r = ddnormalize_d2_d2(ddadd2_d2_d2_d2(r, ddmul_d2_d_d(q, -d)));\n  }\n  \n  double ret = r.x * s;\n  ret = mulsign(ret, x);\n  if (xisinf(y)) ret = xisinf(x) ? SLEEF_NAN : x;\n  if (d == 0) ret = SLEEF_NAN;\n\n  return ret;\n}\n\nEXPORT CONST Sleef_double2 xmodf(double x) {\n  double fr = x - (double)(INT64_C(1) << 31) * (int32_t)(x * (1.0 / (INT64_C(1) << 31)));\n  fr = fr - (int32_t)fr;\n  fr = fabsk(x) >= (double)(INT64_C(1) << 52) ? 0 : fr;\n  Sleef_double2 ret = { copysignk(fr, x), copysignk(x - fr, x) };\n  return ret;\n}\n\ntypedef struct {\n  Sleef_double2 a, b;\n} dd2;\n\nstatic CONST dd2 gammak(double a) {\n  Sleef_double2 clc = dd(0, 0), clln = dd(1, 0), clld = dd(1, 0), v = dd(1, 0), x, y, z;\n  double t, u;\n\n  int otiny = fabsk(a) < 1e-306, oref = a < 0.5;\n\n  x = otiny ? dd(0, 0) : (oref ? 
ddadd2_d2_d_d(1, -a) : dd(a, 0));\n\n  int o0 = (0.5 <= x.x && x.x <= 1.1), o2 = 2.3 < x.x;\n\n  y = ddnormalize_d2_d2(ddmul_d2_d2_d2(ddadd2_d2_d2_d(x, 1), x));\n  y = ddnormalize_d2_d2(ddmul_d2_d2_d2(ddadd2_d2_d2_d(x, 2), y));\n  y = ddnormalize_d2_d2(ddmul_d2_d2_d2(ddadd2_d2_d2_d(x, 3), y));\n  y = ddnormalize_d2_d2(ddmul_d2_d2_d2(ddadd2_d2_d2_d(x, 4), y));\n\n  clln = (o2 && x.x <= 7) ? y : clln;\n\n  x = (o2 && x.x <= 7) ? ddadd2_d2_d2_d(x, 5) : x;\n  t = o2 ? (1.0 / x.x) : ddnormalize_d2_d2(ddadd2_d2_d2_d(x, o0 ? -1 : -2)).x;\n  \n  u = o2 ? -156.801412704022726379848862 : (o0 ? +0.2947916772827614196e+2 : +0.7074816000864609279e-7);\n  u = mla(u, t, o2 ? +1.120804464289911606838558160000 : (o0 ? +0.1281459691827820109e+3 : +0.4009244333008730443e-6));\n  u = mla(u, t, o2 ? +13.39798545514258921833306020000 : (o0 ? +0.2617544025784515043e+3 : +0.1040114641628246946e-5));\n  u = mla(u, t, o2 ? -0.116546276599463200848033357000 : (o0 ? +0.3287022855685790432e+3 : +0.1508349150733329167e-5));\n  u = mla(u, t, o2 ? -1.391801093265337481495562410000 : (o0 ? +0.2818145867730348186e+3 : +0.1288143074933901020e-5));\n  u = mla(u, t, o2 ? +0.015056113040026424412918973400 : (o0 ? +0.1728670414673559605e+3 : +0.4744167749884993937e-6));\n  u = mla(u, t, o2 ? +0.179540117061234856098844714000 : (o0 ? +0.7748735764030416817e+2 : -0.6554816306542489902e-7));\n  u = mla(u, t, o2 ? -0.002481743600264997730942489280 : (o0 ? +0.2512856643080930752e+2 : -0.3189252471452599844e-6));\n  u = mla(u, t, o2 ? -0.029527880945699120504851034100 : (o0 ? +0.5766792106140076868e+1 : +0.1358883821470355377e-6));\n  u = mla(u, t, o2 ? +0.000540164767892604515196325186 : (o0 ? +0.7270275473996180571e+0 : -0.4343931277157336040e-6));\n  u = mla(u, t, o2 ? +0.006403362833808069794787256200 : (o0 ? +0.8396709124579147809e-1 : +0.9724785897406779555e-6));\n  u = mla(u, t, o2 ? -0.000162516262783915816896611252 : (o0 ? -0.8211558669746804595e-1 : -0.2036886057225966011e-5));\n  u = mla(u, t, o2 ? 
-0.001914438498565477526465972390 : (o0 ? +0.6828831828341884458e-1 : +0.4373363141819725815e-5));\n  u = mla(u, t, o2 ? +7.20489541602001055898311517e-05 : (o0 ? -0.7712481339961671511e-1 : -0.9439951268304008677e-5));\n  u = mla(u, t, o2 ? +0.000839498720672087279971000786 : (o0 ? +0.8337492023017314957e-1 : +0.2050727030376389804e-4));\n  u = mla(u, t, o2 ? -5.17179090826059219329394422e-05 : (o0 ? -0.9094964931456242518e-1 : -0.4492620183431184018e-4));\n  u = mla(u, t, o2 ? -0.000592166437353693882857342347 : (o0 ? +0.1000996313575929358e+0 : +0.9945751236071875931e-4));\n  u = mla(u, t, o2 ? +6.97281375836585777403743539e-05 : (o0 ? -0.1113342861544207724e+0 : -0.2231547599034983196e-3));\n  u = mla(u, t, o2 ? +0.000784039221720066627493314301 : (o0 ? +0.1255096673213020875e+0 : +0.5096695247101967622e-3));\n  u = mla(u, t, o2 ? -0.000229472093621399176949318732 : (o0 ? -0.1440498967843054368e+0 : -0.1192753911667886971e-2));\n  u = mla(u, t, o2 ? -0.002681327160493827160473958490 : (o0 ? +0.1695571770041949811e+0 : +0.2890510330742210310e-2));\n  u = mla(u, t, o2 ? +0.003472222222222222222175164840 : (o0 ? -0.2073855510284092762e+0 : -0.7385551028674461858e-2));\n  u = mla(u, t, o2 ? +0.083333333333333333335592087900 : (o0 ? +0.2705808084277815939e+0 : +0.2058080842778455335e-1));\n\n  y = ddmul_d2_d2_d2(ddadd2_d2_d2_d(x, -0.5), logk2(x));\n  y = ddadd2_d2_d2_d2(y, ddneg_d2_d2(x));\n  y = ddadd2_d2_d2_d2(y, dd(0.91893853320467278056, -3.8782941580672414498e-17)); // 0.5*log(2*M_PI)\n\n  z = ddadd2_d2_d2_d(ddmul_d2_d_d (u, t), o0 ? -0.4006856343865314862e+0 : -0.6735230105319810201e-1);\n  z = ddadd2_d2_d2_d(ddmul_d2_d2_d(z, t), o0 ? +0.8224670334241132030e+0 : +0.3224670334241132030e+0);\n  z = ddadd2_d2_d2_d(ddmul_d2_d2_d(z, t), o0 ? -0.5772156649015328655e+0 : +0.4227843350984671345e+0);\n  z = ddmul_d2_d2_d(z, t);\n\n  clc = o2 ? y : z;\n  \n  clld = o2 ? ddadd2_d2_d2_d(ddmul_d2_d_d(u, t), 1) : clld;\n  \n  y = clln;\n\n  clc = otiny ? 
dd(83.1776616671934334590333, 3.67103459631568507221878e-15) : // log(2^120)\n    (oref ? ddadd2_d2_d2_d2(dd(1.1447298858494001639, 1.026595116270782638e-17), ddneg_d2_d2(clc)) : clc); // log(M_PI)\n  clln = otiny ? dd(1, 0) : (oref ? clln : clld);\n\n  if (oref) x = ddmul_d2_d2_d2(clld, sinpik(a - (double)(INT64_C(1) << 28) * (int32_t)(a * (1.0 / (INT64_C(1) << 28)))));\n\n  clld = otiny ? dd(a*((INT64_C(1) << 60)*(double)(INT64_C(1) << 60)), 0) : (oref ? x : y);\n\n  dd2 ret = { clc, dddiv_d2_d2_d2(clln, clld) };\n\n  return ret;\n}\n\nEXPORT CONST double xtgamma_u1(double a) {\n  dd2 d = gammak(a);\n  Sleef_double2 y = ddmul_d2_d2_d2(expk2(d.a), d.b);\n  double r = y.x + y.y;\n  r = (a == -SLEEF_INFINITY || (a < 0 && xisint(a)) || (xisnumber(a) && a < 0 && xisnan(r))) ? SLEEF_NAN : r;\n  r = ((a == SLEEF_INFINITY || xisnumber(a)) && a >= -DBL_MIN && (a == 0 || a > 200 || xisnan(r))) ? mulsign(SLEEF_INFINITY, a) : r;\n  return r;\n}\n\nEXPORT CONST double xlgamma_u1(double a) {\n  dd2 d = gammak(a);\n  Sleef_double2 y = ddadd2_d2_d2_d2(d.a, logk2(ddabs_d2_d2(d.b)));\n  double r = y.x + y.y;\n  r = (xisinf(a) || (a <= 0 && xisint(a)) || (xisnumber(a) && xisnan(r))) ? SLEEF_INFINITY : r;\n  return r;\n}\n\nEXPORT CONST double xerf_u1(double a) {\n  double s = a, t, u;\n  Sleef_double2 d;\n\n  a = fabsk(a);\n  int o0 = a < 1.0, o1 = a < 3.7, o2 = a < 6.0;\n  u = o0 ? (a*a) : a;\n  \n  t = o0 ? +0.6801072401395392157e-20 : o1 ? +0.2830954522087717660e-13 : -0.5846750404269610493e-17;\n  t = mla(t, u, o0 ? -0.2161766247570056391e-18 : o1 ? -0.1509491946179481940e-11 : +0.6076691048812607898e-15);\n  t = mla(t, u, o0 ? +0.4695919173301598752e-17 : o1 ? +0.3827857177807173152e-10 : -0.3007518609604893831e-13);\n  t = mla(t, u, o0 ? -0.9049140419888010819e-16 : o1 ? -0.6139733921558987241e-09 : +0.9427906260824646063e-12);\n  t = mla(t, u, o0 ? +0.1634018903557411517e-14 : o1 ? +0.6985387934608038824e-08 : -0.2100110908269393629e-10);\n  t = mla(t, u, o0 ? 
-0.2783485786333455216e-13 : o1 ? -0.5988224513034371474e-07 : +0.3534639523461223473e-09);\n  t = mla(t, u, o0 ? +0.4463221276786412722e-12 : o1 ? +0.4005716952355346640e-06 : -0.4664967728285395926e-08);\n  t = mla(t, u, o0 ? -0.6711366622850138987e-11 : o1 ? -0.2132190104575784400e-05 : +0.4943823283769000532e-07);\n  t = mla(t, u, o0 ? +0.9422759050232658346e-10 : o1 ? +0.9092461304042630325e-05 : -0.4271203394761148254e-06);\n  t = mla(t, u, o0 ? -0.1229055530100228477e-08 : o1 ? -0.3079188080966205457e-04 : +0.3034067677404915895e-05);\n  t = mla(t, u, o0 ? +0.1480719281585085023e-07 : o1 ? +0.7971413443082370762e-04 : -0.1776295289066871135e-04);\n  t = mla(t, u, o0 ? -0.1636584469123402714e-06 : o1 ? -0.1387853215225442864e-03 : +0.8524547630559505050e-04);\n  t = mla(t, u, o0 ? +0.1646211436588923363e-05 : o1 ? +0.6469678026257590965e-04 : -0.3290582944961784398e-03);\n  t = mla(t, u, o0 ? -0.1492565035840624866e-04 : o1 ? +0.4996645280372945860e-03 : +0.9696966068789101157e-03);\n  t = mla(t, u, o0 ? +0.1205533298178966496e-03 : o1 ? -0.1622802482842520535e-02 : -0.1812527628046986137e-02);\n  t = mla(t, u, o0 ? -0.8548327023450851166e-03 : o1 ? +0.1615320557049377171e-03 : -0.4725409828123619017e-03);\n  t = mla(t, u, o0 ? +0.5223977625442188799e-02 : o1 ? +0.1915262325574875607e-01 : +0.2090315427924229266e-01);\n  t = mla(t, u, o0 ? -0.2686617064513125569e-01 : o1 ? -0.1027818298486033455e+00 : -0.1052041921842776645e+00);\n  t = mla(t, u, o0 ? +0.1128379167095512753e+00 : o1 ? -0.6366172819842503827e+00 : -0.6345351808766568347e+00);\n  t = mla(t, u, o0 ? -0.3761263890318375380e+00 : o1 ? -0.1128379590648910469e+01 : -0.1129442929103524396e+01);\n  d = ddmul_d2_d_d(t, u);\n  d = ddadd2_d2_d2_d2(d, o0 ? dd(1.1283791670955125586, 1.5335459613165822674e-17) :\n\t\t      o1 ? dd(3.4110644736196137587e-08, -2.4875650708323294246e-24) :\n\t\t      dd(0.00024963035690526438285, -5.4362665034856259795e-21));\n  d = o0 ? 
ddmul_d2_d2_d(d, a) : ddadd_d2_d_d2(1.0, ddneg_d2_d2(expk2(d)));\n  u = mulsign(o2 ? (d.x + d.y) : 1, s);\n  u = xisnan(a) ? SLEEF_NAN : u;\n  return u;\n}\n\nEXPORT CONST double xerfc_u15(double a) {\n  double s = a, r = 0, t;\n  Sleef_double2 u, d, x;\n  a = fabsk(a);\n  int o0 = a < 1.0, o1 = a < 2.2, o2 = a < 4.2, o3 = a < 27.3;\n  u = o0 ? ddmul_d2_d_d(a, a) : o1 ? dd(a, 0) : dddiv_d2_d2_d2(dd(1, 0), dd(a, 0));\n\n  t = o0 ? +0.6801072401395386139e-20 : o1 ? +0.3438010341362585303e-12 : o2 ? -0.5757819536420710449e+2 : +0.2334249729638701319e+5;\n  t = mla(t, u.x, o0 ? -0.2161766247570055669e-18 : o1 ? -0.1237021188160598264e-10 : o2 ? +0.4669289654498104483e+3 : -0.4695661044933107769e+5);\n  t = mla(t, u.x, o0 ? +0.4695919173301595670e-17 : o1 ? +0.2117985839877627852e-09 : o2 ? -0.1796329879461355858e+4 : +0.3173403108748643353e+5);\n  t = mla(t, u.x, o0 ? -0.9049140419888007122e-16 : o1 ? -0.2290560929177369506e-08 : o2 ? +0.4355892193699575728e+4 : +0.3242982786959573787e+4);\n  t = mla(t, u.x, o0 ? +0.1634018903557410728e-14 : o1 ? +0.1748931621698149538e-07 : o2 ? -0.7456258884965764992e+4 : -0.2014717999760347811e+5);\n  t = mla(t, u.x, o0 ? -0.2783485786333451745e-13 : o1 ? -0.9956602606623249195e-07 : o2 ? +0.9553977358167021521e+4 : +0.1554006970967118286e+5);\n  t = mla(t, u.x, o0 ? +0.4463221276786415752e-12 : o1 ? +0.4330010240640327080e-06 : o2 ? -0.9470019905444229153e+4 : -0.6150874190563554293e+4);\n  t = mla(t, u.x, o0 ? -0.6711366622850136563e-11 : o1 ? -0.1435050600991763331e-05 : o2 ? +0.7387344321849855078e+4 : +0.1240047765634815732e+4);\n  t = mla(t, u.x, o0 ? +0.9422759050232662223e-10 : o1 ? +0.3460139479650695662e-05 : o2 ? -0.4557713054166382790e+4 : -0.8210325475752699731e+2);\n  t = mla(t, u.x, o0 ? -0.1229055530100229098e-08 : o1 ? -0.4988908180632898173e-05 : o2 ? +0.2207866967354055305e+4 : +0.3242443880839930870e+2);\n  t = mla(t, u.x, o0 ? +0.1480719281585086512e-07 : o1 ? -0.1308775976326352012e-05 : o2 ? 
-0.8217975658621754746e+3 : -0.2923418863833160586e+2);\n  t = mla(t, u.x, o0 ? -0.1636584469123399803e-06 : o1 ? +0.2825086540850310103e-04 : o2 ? +0.2268659483507917400e+3 : +0.3457461732814383071e+0);\n  t = mla(t, u.x, o0 ? +0.1646211436588923575e-05 : o1 ? -0.6393913713069986071e-04 : o2 ? -0.4633361260318560682e+2 : +0.5489730155952392998e+1);\n  t = mla(t, u.x, o0 ? -0.1492565035840623511e-04 : o1 ? -0.2566436514695078926e-04 : o2 ? +0.9557380123733945965e+1 : +0.1559934132251294134e-2);\n  t = mla(t, u.x, o0 ? +0.1205533298178967851e-03 : o1 ? +0.5895792375659440364e-03 : o2 ? -0.2958429331939661289e+1 : -0.1541741566831520638e+1);\n  t = mla(t, u.x, o0 ? -0.8548327023450850081e-03 : o1 ? -0.1695715579163588598e-02 : o2 ? +0.1670329508092765480e+0 : +0.2823152230558364186e-5);\n  t = mla(t, u.x, o0 ? +0.5223977625442187932e-02 : o1 ? +0.2089116434918055149e-03 : o2 ? +0.6096615680115419211e+0 : +0.6249999184195342838e+0);\n  t = mla(t, u.x, o0 ? -0.2686617064513125222e-01 : o1 ? +0.1912855949584917753e-01 : o2 ? +0.1059212443193543585e-2 : +0.1741749416408701288e-8);\n  \n  d = ddmul_d2_d2_d(u, t);\n  d = ddadd2_d2_d2_d2(d, o0 ? dd(0.11283791670955126141, -4.0175691625932118483e-18) :\n\t\t      o1 ? dd(-0.10277263343147646779, -6.2338714083404900225e-18) :\n\t\t      o2 ? dd(-0.50005180473999022439, 2.6362140569041995803e-17) :\n\t\t      dd(-0.5000000000258444377, -4.0074044712386992281e-17));\n  d = ddmul_d2_d2_d2(d, u);\n  d = ddadd2_d2_d2_d2(d, o0 ? dd(-0.37612638903183753802, 1.3391897206042552387e-17) :\n\t\t      o1 ? dd(-0.63661976742916359662, 7.6321019159085724662e-18) :\n\t\t      o2 ? dd(1.601106273924963368e-06, 1.1974001857764476775e-23) :\n\t\t      dd(2.3761973137523364792e-13, -1.1670076950531026582e-29));\n  d = ddmul_d2_d2_d2(d, u);\n  d = ddadd2_d2_d2_d2(d, o0 ? dd(1.1283791670955125586, 1.5335459613165822674e-17) :\n\t\t      o1 ? dd(-1.1283791674717296161, 8.0896847755965377194e-17) :\n\t\t      o2 ? 
dd(-0.57236496645145429341, 3.0704553245872027258e-17) :\n\t\t      dd(-0.57236494292470108114, -2.3984352208056898003e-17));\n\n  x = ddmul_d2_d2_d(o1 ? d : dd(-a, 0), a);\n  x = o1 ? x : ddadd2_d2_d2_d2(x, d);\n  x = o0 ? ddsub_d2_d2_d2(dd(1, 0), x) : expk2(x);\n  x = o1 ? x : ddmul_d2_d2_d2(x, u);\n\n  r = o3 ? (x.x + x.y) : 0;\n  if (s < 0) r = 2 - r;\n  r = xisnan(s) ? SLEEF_NAN : r;\n  return r;\n}\n\n#ifdef ENABLE_MAIN\n// gcc -w -DENABLE_MAIN -I../common sleefdp.c rempitab.c -lm\n#include <stdlib.h>\nint main(int argc, char **argv) {\n  double d1 = atof(argv[1]);\n  printf(\"arg1 = %.20g\\n\", d1);\n  //int i1 = atoi(argv[1]);\n  //double d2 = atof(argv[2]);\n  //printf(\"arg2 = %.20g\\n\", d2);\n  //printf(\"%d\\n\", (int)d2);\n#if 0\n  double d3 = atof(argv[3]);\n  printf(\"arg3 = %.20g\\n\", d3);\n#endif\n  //printf(\"%g\\n\", pow2i(i1));\n  //int exp = xexpfrexp(d1);\n  //double r = xnextafter(d1, d2);\n  //double r = xfma(d1, d2, d3);\n  printf(\"test = %.20g\\n\", xcos_u1(d1));\n  //printf(\"test = %.20g\\n\", xlog(d1));\n  //r = nextafter(d1, d2);\n  printf(\"corr = %.20g\\n\", cos(d1));\n  //printf(\"%.20g %.20g\\n\", xround(d1), xrint(d1));\n  //Sleef_double2 r = xsincospi_u35(d);\n  //printf(\"%g, %g\\n\", (double)r.x, (double)r.y);\n}\n#endif\n"
  },
  {
    "path": "src/sleefsimddp.c",
    "content": "//   Copyright Naoki Shibata and contributors 2010 - 2020.\n// Distributed under the Boost Software License, Version 1.0.\n//    (See accompanying file LICENSE.txt or copy at\n//          http://www.boost.org/LICENSE_1_0.txt)\n\n// Always use -ffp-contract=off option to compile SLEEF.\n\n#if !defined(SLEEF_GENHEADER)\n#include <stdint.h>\n#include <assert.h>\n#include <limits.h>\n#include <float.h>\n#endif\n\n#include \"misc.h\"\n\nextern const double Sleef_rempitabdp[];\n\n#define __SLEEFSIMDDP_C__\n\n#if (defined(_MSC_VER))\n#pragma fp_contract (off)\n#endif\n\n// Intel\n\n#ifdef ENABLE_SSE2\n#define CONFIG 2\n#include \"helpersse2.h\"\n#ifdef DORENAME\n#ifdef ENABLE_GNUABI\n#include \"renamesse2_gnuabi.h\"\n#else\n#include \"renamesse2.h\"\n#endif\n#endif\n#endif\n\n#ifdef ENABLE_SSE4\n#define CONFIG 4\n#include \"helpersse2.h\"\n#ifdef DORENAME\n#include \"renamesse4.h\"\n#endif\n#endif\n\n#ifdef ENABLE_AVX\n#define CONFIG 1\n#include \"helperavx.h\"\n#ifdef DORENAME\n#ifdef ENABLE_GNUABI\n#include \"renameavx_gnuabi.h\"\n#else\n#include \"renameavx.h\"\n#endif\n#endif\n#endif\n\n#ifdef ENABLE_FMA4\n#define CONFIG 4\n#include \"helperavx.h\"\n#ifdef DORENAME\n#ifdef ENABLE_GNUABI\n#include \"renamefma4_gnuabi.h\"\n#else\n#include \"renamefma4.h\"\n#endif\n#endif\n#endif\n\n#ifdef ENABLE_AVX2\n#define CONFIG 1\n#include \"helperavx2.h\"\n#ifdef DORENAME\n#ifdef ENABLE_GNUABI\n#include \"renameavx2_gnuabi.h\"\n#else\n#include \"renameavx2.h\"\n#endif\n#endif\n#endif\n\n#ifdef ENABLE_AVX2128\n#define CONFIG 1\n#include \"helperavx2_128.h\"\n#ifdef DORENAME\n#include \"renameavx2128.h\"\n#endif\n#endif\n\n#ifdef ENABLE_AVX512F\n#define CONFIG 1\n#include \"helperavx512f.h\"\n#ifdef DORENAME\n#ifdef ENABLE_GNUABI\n#include \"renameavx512f_gnuabi.h\"\n#else\n#include \"renameavx512f.h\"\n#endif\n#endif\n#endif\n\n#ifdef ENABLE_AVX512FNOFMA\n#define CONFIG 2\n#include \"helperavx512f.h\"\n#ifdef DORENAME\n#include 
\"renameavx512fnofma.h\"\n#endif\n#endif\n\n// Arm\n\n#ifdef ENABLE_ADVSIMD\n#define CONFIG 1\n#include \"helperadvsimd.h\"\n#ifdef DORENAME\n#ifdef ENABLE_GNUABI\n#include \"renameadvsimd_gnuabi.h\"\n#else\n#include \"renameadvsimd.h\"\n#endif\n#endif\n#endif\n\n#ifdef ENABLE_ADVSIMDNOFMA\n#define CONFIG 2\n#include \"helperadvsimd.h\"\n#ifdef DORENAME\n#include \"renameadvsimdnofma.h\"\n#endif\n#endif\n\n#ifdef ENABLE_SVE\n#define CONFIG 1\n#include \"helpersve.h\"\n#ifdef DORENAME\n#ifdef ENABLE_GNUABI\n#include \"renamesve_gnuabi.h\"\n#else\n#include \"renamesve.h\"\n#endif /* ENABLE_GNUABI */\n#endif /* DORENAME */\n#endif /* ENABLE_SVE */\n\n#ifdef ENABLE_SVENOFMA\n#define CONFIG 2\n#include \"helpersve.h\"\n#ifdef DORENAME\n#include \"renamesvenofma.h\"\n#endif /* DORENAME */\n#endif /* ENABLE_SVE */\n\n// IBM\n\n#ifdef ENABLE_VSX\n#define CONFIG 1\n#include \"helperpower_128.h\"\n#ifdef DORENAME\n#include \"renamevsx.h\"\n#endif\n#endif\n\n#ifdef ENABLE_VSXNOFMA\n#define CONFIG 2\n#include \"helperpower_128.h\"\n#ifdef DORENAME\n#include \"renamevsxnofma.h\"\n#endif\n#endif\n\n#ifdef ENABLE_ZVECTOR2\n#define CONFIG 140\n#include \"helpers390x_128.h\"\n#ifdef DORENAME\n#include \"renamezvector2.h\"\n#endif\n#endif\n\n#ifdef ENABLE_ZVECTOR2NOFMA\n#define CONFIG 141\n#include \"helpers390x_128.h\"\n#ifdef DORENAME\n#include \"renamezvector2nofma.h\"\n#endif\n#endif\n\n// Generic\n\n#ifdef ENABLE_VECEXT\n#define CONFIG 1\n#include \"helpervecext.h\"\n#ifdef DORENAME\n#include \"renamevecext.h\"\n#endif\n#endif\n\n#ifdef ENABLE_PUREC\n#define CONFIG 1\n#include \"helperpurec.h\"\n#ifdef DORENAME\n#include \"renamepurec.h\"\n#endif\n#endif\n\n#ifdef ENABLE_PUREC_SCALAR\n#define CONFIG 1\n#include \"helperpurec_scalar.h\"\n#ifdef DORENAME\n#include \"renamepurec_scalar.h\"\n#endif\n#endif\n\n#ifdef ENABLE_PURECFMA_SCALAR\n#define CONFIG 2\n#include \"helperpurec_scalar.h\"\n#ifdef DORENAME\n#include \"renamepurecfma_scalar.h\"\n#endif\n#endif\n\n//\n\n#define 
MLA(x, y, z) vmla_vd_vd_vd_vd((x), (y), (z))\n#define C2V(c) vcast_vd_d(c)\n#include \"estrin.h\"\n\n//\n\n#include \"dd.h\"\n\n//\n\nstatic INLINE VECTOR_CC vopmask vnot_vo64_vo64(vopmask x) {\n  return vxor_vo_vo_vo(x, veq64_vo_vm_vm(vcast_vm_i_i(0, 0), vcast_vm_i_i(0, 0)));\n}\n\nstatic INLINE CONST VECTOR_CC vopmask vsignbit_vo_vd(vdouble d) {\n  return veq64_vo_vm_vm(vand_vm_vm_vm(vreinterpret_vm_vd(d), vreinterpret_vm_vd(vcast_vd_d(-0.0))), vreinterpret_vm_vd(vcast_vd_d(-0.0)));\n}\n\n// return d0 < d1 ? x : y\nstatic INLINE CONST VECTOR_CC vint vsel_vi_vd_vd_vi_vi(vdouble d0, vdouble d1, vint x, vint y) { return vsel_vi_vo_vi_vi(vcast_vo32_vo64(vlt_vo_vd_vd(d0, d1)), x, y); } \n\n// return d0 < 0 ? x : 0\nstatic INLINE CONST VECTOR_CC vint vsel_vi_vd_vi(vdouble d, vint x) { return vand_vi_vo_vi(vcast_vo32_vo64(vsignbit_vo_vd(d)), x); }\n\nstatic INLINE CONST VECTOR_CC vopmask visnegzero_vo_vd(vdouble d) {\n  return veq64_vo_vm_vm(vreinterpret_vm_vd(d), vreinterpret_vm_vd(vcast_vd_d(-0.0)));\n}\n\nstatic INLINE CONST VECTOR_CC vopmask visnumber_vo_vd(vdouble x) {\n  return vandnot_vo_vo_vo(visinf_vo_vd(x), veq_vo_vd_vd(x, x));\n}\n\nstatic INLINE CONST VECTOR_CC vmask vsignbit_vm_vd(vdouble d) {\n  return vand_vm_vm_vm(vreinterpret_vm_vd(d), vreinterpret_vm_vd(vcast_vd_d(-0.0)));\n}\n\nstatic INLINE CONST VECTOR_CC vdouble vmulsign_vd_vd_vd(vdouble x, vdouble y) {\n  return vreinterpret_vd_vm(vxor_vm_vm_vm(vreinterpret_vm_vd(x), vsignbit_vm_vd(y)));\n}\n\nstatic INLINE CONST VECTOR_CC vdouble vcopysign_vd_vd_vd(vdouble x, vdouble y) {\n  return vreinterpret_vd_vm(vxor_vm_vm_vm(vandnot_vm_vm_vm(vreinterpret_vm_vd(vcast_vd_d(-0.0)), vreinterpret_vm_vd(x)), \n\t\t\t\t\t  vand_vm_vm_vm   (vreinterpret_vm_vd(vcast_vd_d(-0.0)), vreinterpret_vm_vd(y))));\n}\n\nstatic INLINE CONST VECTOR_CC vdouble vsign_vd_vd(vdouble d) {\n  return vmulsign_vd_vd_vd(vcast_vd_d(1.0), d);\n}\n\nstatic INLINE CONST VECTOR_CC vdouble vpow2i_vd_vi(vint q) {\n  q = 
vadd_vi_vi_vi(vcast_vi_i(0x3ff), q);\n  vint2 r = vcastu_vi2_vi(q);\n  return vreinterpret_vd_vi2(vsll_vi2_vi2_i(r, 20));\n}\n\nstatic INLINE CONST VECTOR_CC vdouble vldexp_vd_vd_vi(vdouble x, vint q) {\n  vint m = vsra_vi_vi_i(q, 31);\n  m = vsll_vi_vi_i(vsub_vi_vi_vi(vsra_vi_vi_i(vadd_vi_vi_vi(m, q), 9), m), 7);\n  q = vsub_vi_vi_vi(q, vsll_vi_vi_i(m, 2));\n  m = vadd_vi_vi_vi(vcast_vi_i(0x3ff), m);\n  m = vandnot_vi_vo_vi(vgt_vo_vi_vi(vcast_vi_i(0), m), m);\n  m = vsel_vi_vo_vi_vi(vgt_vo_vi_vi(m, vcast_vi_i(0x7ff)), vcast_vi_i(0x7ff), m);\n  vint2 r = vcastu_vi2_vi(m);\n  vdouble y = vreinterpret_vd_vi2(vsll_vi2_vi2_i(r, 20));\n  return vmul_vd_vd_vd(vmul_vd_vd_vd(vmul_vd_vd_vd(vmul_vd_vd_vd(vmul_vd_vd_vd(x, y), y), y), y), vpow2i_vd_vi(q));\n}\n\nstatic INLINE CONST VECTOR_CC vdouble vldexp2_vd_vd_vi(vdouble d, vint e) {\n  return vmul_vd_vd_vd(vmul_vd_vd_vd(d, vpow2i_vd_vi(vsra_vi_vi_i(e, 1))), vpow2i_vd_vi(vsub_vi_vi_vi(e, vsra_vi_vi_i(e, 1))));\n}\n\nstatic INLINE CONST VECTOR_CC vdouble vldexp3_vd_vd_vi(vdouble d, vint q) {\n  return vreinterpret_vd_vi2(vadd_vi2_vi2_vi2(vreinterpret_vi2_vd(d), vsll_vi2_vi2_i(vcastu_vi2_vi(q), 20)));\n}\n\n#if !defined(ENABLE_AVX512F) && !defined(ENABLE_AVX512FNOFMA)\nstatic INLINE CONST VECTOR_CC vint vilogbk_vi_vd(vdouble d) {\n  vopmask o = vlt_vo_vd_vd(d, vcast_vd_d(4.9090934652977266E-91));\n  d = vsel_vd_vo_vd_vd(o, vmul_vd_vd_vd(vcast_vd_d(2.037035976334486E90), d), d);\n  vint q = vcastu_vi_vi2(vreinterpret_vi2_vd(d));\n  q = vand_vi_vi_vi(q, vcast_vi_i(((1 << 12)-1) << 20));\n  q = vsrl_vi_vi_i(q, 20);\n  q = vsub_vi_vi_vi(q, vsel_vi_vo_vi_vi(vcast_vo32_vo64(o), vcast_vi_i(300 + 0x3ff), vcast_vi_i(0x3ff)));\n  return q;\n}\n\nstatic INLINE CONST VECTOR_CC vint vilogb2k_vi_vd(vdouble d) {\n  vint q = vcastu_vi_vi2(vreinterpret_vi2_vd(d));\n  q = vsrl_vi_vi_i(q, 20);\n  q = vand_vi_vi_vi(q, vcast_vi_i(0x7ff));\n  q = vsub_vi_vi_vi(q, vcast_vi_i(0x3ff));\n  return q;\n}\n#endif\n\nstatic INLINE CONST VECTOR_CC vopmask 
visint_vo_vd(vdouble d) {\n#ifdef FULL_FP_ROUNDING\n  return veq_vo_vd_vd(vtruncate_vd_vd(d), d);\n#else\n  vdouble x = vtruncate_vd_vd(vmul_vd_vd_vd(d, vcast_vd_d(1.0 / (INT64_C(1) << 31))));\n  x = vmla_vd_vd_vd_vd(vcast_vd_d(-(double)(INT64_C(1) << 31)), x, d);\n  return vor_vo_vo_vo(veq_vo_vd_vd(vtruncate_vd_vd(x), x),\n\t\t      vgt_vo_vd_vd(vabs_vd_vd(d), vcast_vd_d(INT64_C(1) << 53)));\n#endif\n}\n\nstatic INLINE CONST VECTOR_CC vopmask visodd_vo_vd(vdouble d) {\n#ifdef FULL_FP_ROUNDING\n  vdouble x = vmul_vd_vd_vd(d, vcast_vd_d(0.5));\n  return vneq_vo_vd_vd(vtruncate_vd_vd(x), x);\n#else\n  vdouble x = vtruncate_vd_vd(vmul_vd_vd_vd(d, vcast_vd_d(1.0 / (INT64_C(1) << 31))));\n  x = vmla_vd_vd_vd_vd(vcast_vd_d(-(double)(INT64_C(1) << 31)), x, d);\n\n  return vand_vo_vo_vo(vcast_vo64_vo32(veq_vo_vi_vi(vand_vi_vi_vi(vtruncate_vi_vd(x), vcast_vi_i(1)), vcast_vi_i(1))),\n\t\t       vlt_vo_vd_vd(vabs_vd_vd(d), vcast_vd_d(INT64_C(1) << 53)));\n#endif\n}\n\n//\n\nEXPORT CONST VECTOR_CC vdouble xldexp(vdouble x, vint q) { return vldexp_vd_vd_vi(x, q); }\n\nEXPORT CONST VECTOR_CC vint xilogb(vdouble d) {\n  vdouble e = vcast_vd_vi(vilogbk_vi_vd(vabs_vd_vd(d)));\n  e = vsel_vd_vo_vd_vd(veq_vo_vd_vd(d, vcast_vd_d(0)), vcast_vd_d(SLEEF_FP_ILOGB0), e);\n  e = vsel_vd_vo_vd_vd(visnan_vo_vd(d), vcast_vd_d(SLEEF_FP_ILOGBNAN), e);\n  e = vsel_vd_vo_vd_vd(visinf_vo_vd(d), vcast_vd_d(INT_MAX), e);\n  return vrint_vi_vd(e);\n}\n\n#if !(defined(ENABLE_SVE) || defined(ENABLE_SVENOFMA))\ntypedef struct {\n  vdouble d;\n  vint i;\n} di_t;\n\nstatic vdouble digetd_vd_di(di_t d) { return d.d; }\nstatic vint digeti_vi_di(di_t d) { return d.i; }\nstatic di_t disetdi_di_vd_vi(vdouble d, vint i) {\n  di_t r = { d, i };\n  return r;\n}\n\ntypedef struct {\n  vdouble2 dd;\n  vint i;\n} ddi_t;\n\nstatic vdouble2 ddigetdd_vd2_ddi(ddi_t d) { return d.dd; }\nstatic vint ddigeti_vi_ddi(ddi_t d) { return d.i; }\nstatic ddi_t ddisetddi_ddi_vd2_vi(vdouble2 v, vint i) {\n  ddi_t r = { v, i };\n  
return r;\n}\nstatic ddi_t ddisetdd_ddi_ddi_vd2(ddi_t ddi, vdouble2 v) {\n  ddi.dd = v;\n  return ddi;\n}\n#endif\n\nstatic INLINE CONST VECTOR_CC vdouble vorsign_vd_vd_vd(vdouble x, vdouble y) {\n  return vreinterpret_vd_vm(vor_vm_vm_vm(vreinterpret_vm_vd(x), vsignbit_vm_vd(y)));\n}\n\nstatic INLINE CONST di_t rempisub(vdouble x) {\n#ifdef FULL_FP_ROUNDING\n  vdouble y = vrint_vd_vd(vmul_vd_vd_vd(x, vcast_vd_d(4)));\n  vint vi = vtruncate_vi_vd(vsub_vd_vd_vd(y, vmul_vd_vd_vd(vrint_vd_vd(x), vcast_vd_d(4))));\n  return disetdi_di_vd_vi(vsub_vd_vd_vd(x, vmul_vd_vd_vd(y, vcast_vd_d(0.25))), vi);\n#else\n  vdouble c = vmulsign_vd_vd_vd(vcast_vd_d(INT64_C(1) << 52), x);\n  vdouble rint4x = vsel_vd_vo_vd_vd(vgt_vo_vd_vd(vabs_vd_vd(vmul_vd_vd_vd(vcast_vd_d(4), x)), vcast_vd_d(INT64_C(1) << 52)),\n\t\t\t\t    vmul_vd_vd_vd(vcast_vd_d(4), x),\n\t\t\t\t    vorsign_vd_vd_vd(vsub_vd_vd_vd(vmla_vd_vd_vd_vd(vcast_vd_d(4), x, c), c), x));\n  vdouble rintx  = vsel_vd_vo_vd_vd(vgt_vo_vd_vd(vabs_vd_vd(x), vcast_vd_d(INT64_C(1) << 52)),\n\t\t\t\t    x, vorsign_vd_vd_vd(vsub_vd_vd_vd(vadd_vd_vd_vd(x, c), c), x));\n  return disetdi_di_vd_vi(vmla_vd_vd_vd_vd(vcast_vd_d(-0.25), rint4x, x),\n\t\t\t  vtruncate_vi_vd(vmla_vd_vd_vd_vd(vcast_vd_d(-4), rintx, rint4x)));\n#endif\n}\n\nstatic INLINE CONST ddi_t rempi(vdouble a) {\n  vdouble2 x, y, z;\n  vint ex = vilogb2k_vi_vd(a);\n#if defined(ENABLE_AVX512F) || defined(ENABLE_AVX512FNOFMA)\n  ex = vandnot_vi_vi_vi(vsra_vi_vi_i(ex, 31), ex);\n  ex = vand_vi_vi_vi(ex, vcast_vi_i(1023));\n#endif\n  ex = vsub_vi_vi_vi(ex, vcast_vi_i(55));\n  vint q = vand_vi_vo_vi(vgt_vo_vi_vi(ex, vcast_vi_i(700-55)), vcast_vi_i(-64));\n  a = vldexp3_vd_vd_vi(a, q);\n  ex = vandnot_vi_vi_vi(vsra_vi_vi_i(ex, 31), ex);\n  ex = vsll_vi_vi_i(ex, 2);\n  x = ddmul_vd2_vd_vd(a, vgather_vd_p_vi(Sleef_rempitabdp, ex));\n  di_t di = rempisub(vd2getx_vd_vd2(x));\n  q = digeti_vi_di(di);\n  x = vd2setx_vd2_vd2_vd(x, digetd_vd_di(di));\n  x = ddnormalize_vd2_vd2(x);\n  y = 
ddmul_vd2_vd_vd(a, vgather_vd_p_vi(Sleef_rempitabdp+1, ex));\n  x = ddadd2_vd2_vd2_vd2(x, y);\n  di = rempisub(vd2getx_vd_vd2(x));\n  q = vadd_vi_vi_vi(q, digeti_vi_di(di));\n  x = vd2setx_vd2_vd2_vd(x, digetd_vd_di(di));\n  x = ddnormalize_vd2_vd2(x);\n  y = vcast_vd2_vd_vd(vgather_vd_p_vi(Sleef_rempitabdp+2, ex), vgather_vd_p_vi(Sleef_rempitabdp+3, ex));\n  y = ddmul_vd2_vd2_vd(y, a);\n  x = ddadd2_vd2_vd2_vd2(x, y);\n  x = ddnormalize_vd2_vd2(x);\n  x = ddmul_vd2_vd2_vd2(x, vcast_vd2_d_d(3.141592653589793116*2, 1.2246467991473532072e-16*2));\n  vopmask o = vlt_vo_vd_vd(vabs_vd_vd(a), vcast_vd_d(0.7));\n  x = vd2setx_vd2_vd2_vd(x, vsel_vd_vo_vd_vd(o, a, vd2getx_vd_vd2(x)));\n  x = vd2sety_vd2_vd2_vd(x, vreinterpret_vd_vm(vandnot_vm_vo64_vm(o, vreinterpret_vm_vd(vd2gety_vd_vd2(x)))));\n  return ddisetddi_ddi_vd2_vi(x, q);\n}\n\nEXPORT CONST VECTOR_CC vdouble xsin(vdouble d) {\n#if !defined(DETERMINISTIC)\n// The SIMD source files(sleefsimd?p.c) are compiled twice for each\n// vector extension, with DETERMINISTIC macro turned on and off.\n// Below is the normal(faster) implementation of sin function.  
The\n// function name xsin will be renamed to Sleef_sind2_u35sse2 with\n// renamesse2.h, for example.\n\n  vdouble u, s, r = d;\n  vint ql;\n\n  if (LIKELY(vtestallones_i_vo64(vlt_vo_vd_vd(vabs_vd_vd(d), vcast_vd_d(TRIGRANGEMAX2))))) {\n    vdouble dql = vrint_vd_vd(vmul_vd_vd_vd(d, vcast_vd_d(M_1_PI)));\n    ql = vrint_vi_vd(dql);\n    d = vmla_vd_vd_vd_vd(dql, vcast_vd_d(-PI_A2), d);\n    d = vmla_vd_vd_vd_vd(dql, vcast_vd_d(-PI_B2), d);\n  } else if (LIKELY(vtestallones_i_vo64(vlt_vo_vd_vd(vabs_vd_vd(d), vcast_vd_d(TRIGRANGEMAX))))) {\n    vdouble dqh = vtruncate_vd_vd(vmul_vd_vd_vd(d, vcast_vd_d(M_1_PI / (1 << 24))));\n    dqh = vmul_vd_vd_vd(dqh, vcast_vd_d(1 << 24));\n    vdouble dql = vrint_vd_vd(vmlapn_vd_vd_vd_vd(d, vcast_vd_d(M_1_PI), dqh));\n    ql = vrint_vi_vd(dql);\n\n    d = vmla_vd_vd_vd_vd(dqh, vcast_vd_d(-PI_A), d);\n    d = vmla_vd_vd_vd_vd(dql, vcast_vd_d(-PI_A), d);\n    d = vmla_vd_vd_vd_vd(dqh, vcast_vd_d(-PI_B), d);\n    d = vmla_vd_vd_vd_vd(dql, vcast_vd_d(-PI_B), d);\n    d = vmla_vd_vd_vd_vd(dqh, vcast_vd_d(-PI_C), d);\n    d = vmla_vd_vd_vd_vd(dql, vcast_vd_d(-PI_C), d);\n    d = vmla_vd_vd_vd_vd(vadd_vd_vd_vd(dqh, dql), vcast_vd_d(-PI_D), d);\n  } else {\n    ddi_t ddi = rempi(d);\n    ql = vand_vi_vi_vi(ddigeti_vi_ddi(ddi), vcast_vi_i(3));\n    ql = vadd_vi_vi_vi(vadd_vi_vi_vi(ql, ql), vsel_vi_vo_vi_vi(vcast_vo32_vo64(vgt_vo_vd_vd(vd2getx_vd_vd2(ddigetdd_vd2_ddi(ddi)), vcast_vd_d(0))), vcast_vi_i(2), vcast_vi_i(1)));\n    ql = vsra_vi_vi_i(ql, 2);\n    vopmask o = veq_vo_vi_vi(vand_vi_vi_vi(ddigeti_vi_ddi(ddi), vcast_vi_i(1)), vcast_vi_i(1));\n    vdouble2 x = vcast_vd2_vd_vd(vmulsign_vd_vd_vd(vcast_vd_d(-3.141592653589793116 * 0.5), vd2getx_vd_vd2(ddigetdd_vd2_ddi(ddi))), \n\t\t\t\t vmulsign_vd_vd_vd(vcast_vd_d(-1.2246467991473532072e-16 * 0.5), vd2getx_vd_vd2(ddigetdd_vd2_ddi(ddi))));\n    x = ddadd2_vd2_vd2_vd2(ddigetdd_vd2_ddi(ddi), x);\n    ddi = ddisetdd_ddi_ddi_vd2(ddi, vsel_vd2_vo_vd2_vd2(vcast_vo64_vo32(o), x, 
ddigetdd_vd2_ddi(ddi)));\n    d = vadd_vd_vd_vd(vd2getx_vd_vd2(ddigetdd_vd2_ddi(ddi)), vd2gety_vd_vd2(ddigetdd_vd2_ddi(ddi)));\n    d = vreinterpret_vd_vm(vor_vm_vo64_vm(vor_vo_vo_vo(visinf_vo_vd(r), visnan_vo_vd(r)), vreinterpret_vm_vd(d)));\n  }\n\n  s = vmul_vd_vd_vd(d, d);\n\n  d = vreinterpret_vd_vm(vxor_vm_vm_vm(vand_vm_vo64_vm(vcast_vo64_vo32(veq_vo_vi_vi(vand_vi_vi_vi(ql, vcast_vi_i(1)), vcast_vi_i(1))), vreinterpret_vm_vd(vcast_vd_d(-0.0))), vreinterpret_vm_vd(d)));\n\n  vdouble s2 = vmul_vd_vd_vd(s, s), s4 = vmul_vd_vd_vd(s2, s2);\n  u = POLY8(s, s2, s4,\n\t    -7.97255955009037868891952e-18,\n\t    2.81009972710863200091251e-15,\n\t    -7.64712219118158833288484e-13,\n\t    1.60590430605664501629054e-10,\n\t    -2.50521083763502045810755e-08,\n\t    2.75573192239198747630416e-06,\n\t    -0.000198412698412696162806809,\n\t    0.00833333333333332974823815);\n  u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(-0.166666666666666657414808));\n\n  u = vadd_vd_vd_vd(vmul_vd_vd_vd(s, vmul_vd_vd_vd(u, d)), d);\n\n  u = vsel_vd_vo_vd_vd(visnegzero_vo_vd(r), r, u);\n  \n  return u;\n\n#else // #if !defined(DETERMINISTIC)\n\n// This is the deterministic implementation of sin function. Returned\n// values from deterministic functions are bitwise consistent across\n// all platforms. The function name xsin will be renamed to\n// Sleef_cinz_sind2_u35sse2 with renamesse2.h, for example. 
The\n// renaming by rename*.h is switched according to DETERMINISTIC macro.\n  vdouble u, s, r = d;\n  vint ql;\n\n  vdouble dql = vrint_vd_vd(vmul_vd_vd_vd(d, vcast_vd_d(M_1_PI)));\n  ql = vrint_vi_vd(dql);\n  d = vmla_vd_vd_vd_vd(dql, vcast_vd_d(-PI_A2), d);\n  d = vmla_vd_vd_vd_vd(dql, vcast_vd_d(-PI_B2), d);\n  vopmask g = vlt_vo_vd_vd(vabs_vd_vd(r), vcast_vd_d(TRIGRANGEMAX2));\n\n  if (!LIKELY(vtestallones_i_vo64(g))) {\n    vdouble dqh = vtruncate_vd_vd(vmul_vd_vd_vd(r, vcast_vd_d(M_1_PI / (1 << 24))));\n    dqh = vmul_vd_vd_vd(dqh, vcast_vd_d(1 << 24));\n    vdouble dql = vrint_vd_vd(vmlapn_vd_vd_vd_vd(r, vcast_vd_d(M_1_PI), dqh));\n\n    u = vmla_vd_vd_vd_vd(dqh, vcast_vd_d(-PI_A), r);\n    u = vmla_vd_vd_vd_vd(dql, vcast_vd_d(-PI_A), u);\n    u = vmla_vd_vd_vd_vd(dqh, vcast_vd_d(-PI_B), u);\n    u = vmla_vd_vd_vd_vd(dql, vcast_vd_d(-PI_B), u);\n    u = vmla_vd_vd_vd_vd(dqh, vcast_vd_d(-PI_C), u);\n    u = vmla_vd_vd_vd_vd(dql, vcast_vd_d(-PI_C), u);\n    u = vmla_vd_vd_vd_vd(vadd_vd_vd_vd(dqh, dql), vcast_vd_d(-PI_D), u);\n\n    ql = vsel_vi_vo_vi_vi(vcast_vo32_vo64(g), ql, vrint_vi_vd(dql));\n    d = vsel_vd_vo_vd_vd(g, d, u);\n    g = vlt_vo_vd_vd(vabs_vd_vd(r), vcast_vd_d(TRIGRANGEMAX));\n\n    if (!LIKELY(vtestallones_i_vo64(g))) {\n      ddi_t ddi = rempi(r);\n      vint ql2 = vand_vi_vi_vi(ddigeti_vi_ddi(ddi), vcast_vi_i(3));\n      ql2 = vadd_vi_vi_vi(vadd_vi_vi_vi(ql2, ql2), vsel_vi_vo_vi_vi(vcast_vo32_vo64(vgt_vo_vd_vd(vd2getx_vd_vd2(ddigetdd_vd2_ddi(ddi)), vcast_vd_d(0))), vcast_vi_i(2), vcast_vi_i(1)));\n      ql2 = vsra_vi_vi_i(ql2, 2);\n      vopmask o = veq_vo_vi_vi(vand_vi_vi_vi(ddigeti_vi_ddi(ddi), vcast_vi_i(1)), vcast_vi_i(1));\n      vdouble2 x = vcast_vd2_vd_vd(vmulsign_vd_vd_vd(vcast_vd_d(-3.141592653589793116 * 0.5), vd2getx_vd_vd2(ddigetdd_vd2_ddi(ddi))), \n\t\t\t\t   vmulsign_vd_vd_vd(vcast_vd_d(-1.2246467991473532072e-16 * 0.5), vd2getx_vd_vd2(ddigetdd_vd2_ddi(ddi))));\n      x = ddadd2_vd2_vd2_vd2(ddigetdd_vd2_ddi(ddi), x);\n      
ddi = ddisetdd_ddi_ddi_vd2(ddi, vsel_vd2_vo_vd2_vd2(vcast_vo64_vo32(o), x, ddigetdd_vd2_ddi(ddi)));\n      u = vadd_vd_vd_vd(vd2getx_vd_vd2(ddigetdd_vd2_ddi(ddi)), vd2gety_vd_vd2(ddigetdd_vd2_ddi(ddi)));\n      ql = vsel_vi_vo_vi_vi(vcast_vo32_vo64(g), ql, ql2);\n      d = vsel_vd_vo_vd_vd(g, d, u);\n      d = vreinterpret_vd_vm(vor_vm_vo64_vm(vor_vo_vo_vo(visinf_vo_vd(r), visnan_vo_vd(r)), vreinterpret_vm_vd(d)));\n    }\n  }\n\n  s = vmul_vd_vd_vd(d, d);\n\n  d = vreinterpret_vd_vm(vxor_vm_vm_vm(vand_vm_vo64_vm(vcast_vo64_vo32(veq_vo_vi_vi(vand_vi_vi_vi(ql, vcast_vi_i(1)), vcast_vi_i(1))), vreinterpret_vm_vd(vcast_vd_d(-0.0))), vreinterpret_vm_vd(d)));\n\n  vdouble s2 = vmul_vd_vd_vd(s, s), s4 = vmul_vd_vd_vd(s2, s2);\n  u = POLY8(s, s2, s4,\n\t    -7.97255955009037868891952e-18,\n\t    2.81009972710863200091251e-15,\n\t    -7.64712219118158833288484e-13,\n\t    1.60590430605664501629054e-10,\n\t    -2.50521083763502045810755e-08,\n\t    2.75573192239198747630416e-06,\n\t    -0.000198412698412696162806809,\n\t    0.00833333333333332974823815);\n  u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(-0.166666666666666657414808));\n\n  u = vadd_vd_vd_vd(vmul_vd_vd_vd(s, vmul_vd_vd_vd(u, d)), d);\n\n  u = vsel_vd_vo_vd_vd(visnegzero_vo_vd(r), r, u);\n  \n  return u;\n#endif // #if !defined(DETERMINISTIC)\n}\n\nEXPORT CONST VECTOR_CC vdouble xsin_u1(vdouble d) {\n#if !defined(DETERMINISTIC)\n  vdouble u;\n  vdouble2 s, t, x;\n  vint ql;\n  \n  if (LIKELY(vtestallones_i_vo64(vlt_vo_vd_vd(vabs_vd_vd(d), vcast_vd_d(TRIGRANGEMAX2))))) {\n    const vdouble dql = vrint_vd_vd(vmul_vd_vd_vd(d, vcast_vd_d(M_1_PI)));\n    ql = vrint_vi_vd(dql);\n    u = vmla_vd_vd_vd_vd(dql, vcast_vd_d(-PI_A2), d);\n    s = ddadd_vd2_vd_vd (u, vmul_vd_vd_vd(dql, vcast_vd_d(-PI_B2)));\n  } else if (LIKELY(vtestallones_i_vo64(vlt_vo_vd_vd(vabs_vd_vd(d), vcast_vd_d(TRIGRANGEMAX))))) {\n    vdouble dqh = vtruncate_vd_vd(vmul_vd_vd_vd(d, vcast_vd_d(M_1_PI / (1 << 24))));\n    dqh = vmul_vd_vd_vd(dqh, vcast_vd_d(1 
<< 24));\n    const vdouble dql = vrint_vd_vd(vmlapn_vd_vd_vd_vd(d, vcast_vd_d(M_1_PI), dqh));\n    ql = vrint_vi_vd(dql);\n\n    u = vmla_vd_vd_vd_vd(dqh, vcast_vd_d(-PI_A), d);\n    s = ddadd_vd2_vd_vd  (u, vmul_vd_vd_vd(dql, vcast_vd_d(-PI_A)));\n    s = ddadd2_vd2_vd2_vd(s, vmul_vd_vd_vd(dqh, vcast_vd_d(-PI_B)));\n    s = ddadd2_vd2_vd2_vd(s, vmul_vd_vd_vd(dql, vcast_vd_d(-PI_B)));\n    s = ddadd2_vd2_vd2_vd(s, vmul_vd_vd_vd(dqh, vcast_vd_d(-PI_C)));\n    s = ddadd2_vd2_vd2_vd(s, vmul_vd_vd_vd(dql, vcast_vd_d(-PI_C)));\n    s = ddadd_vd2_vd2_vd(s, vmul_vd_vd_vd(vadd_vd_vd_vd(dqh, dql), vcast_vd_d(-PI_D)));\n  } else {\n    ddi_t ddi = rempi(d);\n    ql = vand_vi_vi_vi(ddigeti_vi_ddi(ddi), vcast_vi_i(3));\n    ql = vadd_vi_vi_vi(vadd_vi_vi_vi(ql, ql), vsel_vi_vo_vi_vi(vcast_vo32_vo64(vgt_vo_vd_vd(vd2getx_vd_vd2(ddigetdd_vd2_ddi(ddi)), vcast_vd_d(0))), vcast_vi_i(2), vcast_vi_i(1)));\n    ql = vsra_vi_vi_i(ql, 2);\n    vopmask o = veq_vo_vi_vi(vand_vi_vi_vi(ddigeti_vi_ddi(ddi), vcast_vi_i(1)), vcast_vi_i(1));\n    vdouble2 x = vcast_vd2_vd_vd(vmulsign_vd_vd_vd(vcast_vd_d(-3.141592653589793116 * 0.5), vd2getx_vd_vd2(ddigetdd_vd2_ddi(ddi))), \n\t\t\t\t vmulsign_vd_vd_vd(vcast_vd_d(-1.2246467991473532072e-16 * 0.5), vd2getx_vd_vd2(ddigetdd_vd2_ddi(ddi))));\n    x = ddadd2_vd2_vd2_vd2(ddigetdd_vd2_ddi(ddi), x);\n    ddi = ddisetdd_ddi_ddi_vd2(ddi, vsel_vd2_vo_vd2_vd2(vcast_vo64_vo32(o), x, ddigetdd_vd2_ddi(ddi)));\n    s = ddnormalize_vd2_vd2(ddigetdd_vd2_ddi(ddi));\n    s = vd2setx_vd2_vd2_vd(s, vreinterpret_vd_vm(vor_vm_vo64_vm(vor_vo_vo_vo(visinf_vo_vd(d), visnan_vo_vd(d)), vreinterpret_vm_vd(vd2getx_vd_vd2(s)))));\n  }\n  \n  t = s;\n  s = ddsqu_vd2_vd2(s);\n\n  vdouble s2 = vmul_vd_vd_vd(vd2getx_vd_vd2(s), vd2getx_vd_vd2(s)), s4 = vmul_vd_vd_vd(s2, s2);\n  u = POLY6(vd2getx_vd_vd2(s), s2, s4,\n\t    2.72052416138529567917983e-15,\n\t    -7.6429259411395447190023e-13,\n\t    1.60589370117277896211623e-10,\n\t    -2.5052106814843123359368e-08,\n\t    
2.75573192104428224777379e-06,\n\t    -0.000198412698412046454654947);\n  u = vmla_vd_vd_vd_vd(u, vd2getx_vd_vd2(s), vcast_vd_d(0.00833333333333318056201922));\n\n  x = ddadd_vd2_vd_vd2(vcast_vd_d(1), ddmul_vd2_vd2_vd2(ddadd_vd2_vd_vd(vcast_vd_d(-0.166666666666666657414808), vmul_vd_vd_vd(u, vd2getx_vd_vd2(s))), s));\n  u = ddmul_vd_vd2_vd2(t, x);\n  \n  u = vreinterpret_vd_vm(vxor_vm_vm_vm(vand_vm_vo64_vm(vcast_vo64_vo32(veq_vo_vi_vi(vand_vi_vi_vi(ql, vcast_vi_i(1)), vcast_vi_i(1))),\n\t\t\t\t\t\t       vreinterpret_vm_vd(vcast_vd_d(-0.0))), vreinterpret_vm_vd(u)));\n  u = vsel_vd_vo_vd_vd(veq_vo_vd_vd(d, vcast_vd_d(0)), d, u);\n  \n  return u;\n\n#else // #if !defined(DETERMINISTIC)\n\n  vdouble u;\n  vdouble2 s, t, x;\n  vint ql;\n\n  vopmask g = vlt_vo_vd_vd(vabs_vd_vd(d), vcast_vd_d(TRIGRANGEMAX2));\n  vdouble dql = vrint_vd_vd(vmul_vd_vd_vd(d, vcast_vd_d(M_1_PI)));\n  ql = vrint_vi_vd(dql);\n  u = vmla_vd_vd_vd_vd(dql, vcast_vd_d(-PI_A2), d);\n  x = ddadd_vd2_vd_vd (u, vmul_vd_vd_vd(dql, vcast_vd_d(-PI_B2)));\n\n  if (!LIKELY(vtestallones_i_vo64(g))) {\n    vdouble dqh = vtruncate_vd_vd(vmul_vd_vd_vd(d, vcast_vd_d(M_1_PI / (1 << 24))));\n    dqh = vmul_vd_vd_vd(dqh, vcast_vd_d(1 << 24));\n    const vdouble dql = vrint_vd_vd(vmlapn_vd_vd_vd_vd(d, vcast_vd_d(M_1_PI), dqh));\n\n    u = vmla_vd_vd_vd_vd(dqh, vcast_vd_d(-PI_A), d);\n    s = ddadd_vd2_vd_vd  (u, vmul_vd_vd_vd(dql, vcast_vd_d(-PI_A)));\n    s = ddadd2_vd2_vd2_vd(s, vmul_vd_vd_vd(dqh, vcast_vd_d(-PI_B)));\n    s = ddadd2_vd2_vd2_vd(s, vmul_vd_vd_vd(dql, vcast_vd_d(-PI_B)));\n    s = ddadd2_vd2_vd2_vd(s, vmul_vd_vd_vd(dqh, vcast_vd_d(-PI_C)));\n    s = ddadd2_vd2_vd2_vd(s, vmul_vd_vd_vd(dql, vcast_vd_d(-PI_C)));\n    s = ddadd_vd2_vd2_vd(s, vmul_vd_vd_vd(vadd_vd_vd_vd(dqh, dql), vcast_vd_d(-PI_D)));\n\n    ql = vsel_vi_vo_vi_vi(vcast_vo32_vo64(g), ql, vrint_vi_vd(dql));\n    x = vsel_vd2_vo_vd2_vd2(g, x, s);\n    g = vlt_vo_vd_vd(vabs_vd_vd(d), vcast_vd_d(TRIGRANGEMAX));\n\n    if 
(!LIKELY(vtestallones_i_vo64(g))) {\n      ddi_t ddi = rempi(d);\n      vint ql2 = vand_vi_vi_vi(ddigeti_vi_ddi(ddi), vcast_vi_i(3));\n      ql2 = vadd_vi_vi_vi(vadd_vi_vi_vi(ql2, ql2), vsel_vi_vo_vi_vi(vcast_vo32_vo64(vgt_vo_vd_vd(vd2getx_vd_vd2(ddigetdd_vd2_ddi(ddi)), vcast_vd_d(0))), vcast_vi_i(2), vcast_vi_i(1)));\n      ql2 = vsra_vi_vi_i(ql2, 2);\n      vopmask o = veq_vo_vi_vi(vand_vi_vi_vi(ddigeti_vi_ddi(ddi), vcast_vi_i(1)), vcast_vi_i(1));\n      vdouble2 t = vcast_vd2_vd_vd(vmulsign_vd_vd_vd(vcast_vd_d(-3.141592653589793116 * 0.5), vd2getx_vd_vd2(ddigetdd_vd2_ddi(ddi))), \n\t\t\t\t   vmulsign_vd_vd_vd(vcast_vd_d(-1.2246467991473532072e-16 * 0.5), vd2getx_vd_vd2(ddigetdd_vd2_ddi(ddi))));\n      t = ddadd2_vd2_vd2_vd2(ddigetdd_vd2_ddi(ddi), t);\n      ddi = ddisetdd_ddi_ddi_vd2(ddi, vsel_vd2_vo_vd2_vd2(vcast_vo64_vo32(o), t, ddigetdd_vd2_ddi(ddi)));\n      s = ddnormalize_vd2_vd2(ddigetdd_vd2_ddi(ddi));\n      ql = vsel_vi_vo_vi_vi(vcast_vo32_vo64(g), ql, ql2);\n      x = vsel_vd2_vo_vd2_vd2(g, x, s);\n      x = vd2setx_vd2_vd2_vd(x, vreinterpret_vd_vm(vor_vm_vo64_vm(vor_vo_vo_vo(visinf_vo_vd(d), visnan_vo_vd(d)), vreinterpret_vm_vd(vd2getx_vd_vd2(x)))));\n    }\n  }\n  \n  t = x;\n  s = ddsqu_vd2_vd2(x);\n\n  vdouble s2 = vmul_vd_vd_vd(vd2getx_vd_vd2(s), vd2getx_vd_vd2(s)), s4 = vmul_vd_vd_vd(s2, s2);\n  u = POLY6(vd2getx_vd_vd2(s), s2, s4,\n\t    2.72052416138529567917983e-15,\n\t    -7.6429259411395447190023e-13,\n\t    1.60589370117277896211623e-10,\n\t    -2.5052106814843123359368e-08,\n\t    2.75573192104428224777379e-06,\n\t    -0.000198412698412046454654947);\n  u = vmla_vd_vd_vd_vd(u, vd2getx_vd_vd2(s), vcast_vd_d(0.00833333333333318056201922));\n\n  x = ddadd_vd2_vd_vd2(vcast_vd_d(1), ddmul_vd2_vd2_vd2(ddadd_vd2_vd_vd(vcast_vd_d(-0.166666666666666657414808), vmul_vd_vd_vd(u, vd2getx_vd_vd2(s))), s));\n  u = ddmul_vd_vd2_vd2(t, x);\n\n  u = vreinterpret_vd_vm(vxor_vm_vm_vm(vand_vm_vo64_vm(vcast_vo64_vo32(veq_vo_vi_vi(vand_vi_vi_vi(ql, 
vcast_vi_i(1)), vcast_vi_i(1))),\n\t\t\t\t\t\t       vreinterpret_vm_vd(vcast_vd_d(-0.0))), vreinterpret_vm_vd(u)));\n\n  u = vsel_vd_vo_vd_vd(veq_vo_vd_vd(d, vcast_vd_d(0)), d, u);\n  \n  return u;\n#endif // #if !defined(DETERMINISTIC)\n}\n\nEXPORT CONST VECTOR_CC vdouble xcos(vdouble d) {\n#if !defined(DETERMINISTIC)\n  vdouble u, s, r = d;\n  vint ql;\n\n  if (LIKELY(vtestallones_i_vo64(vlt_vo_vd_vd(vabs_vd_vd(d), vcast_vd_d(TRIGRANGEMAX2))))) {\n    vdouble dql = vmla_vd_vd_vd_vd(vcast_vd_d(2),\n\t\t\t\t   vrint_vd_vd(vmla_vd_vd_vd_vd(d, vcast_vd_d(M_1_PI), vcast_vd_d(-0.5))),\n\t\t\t\t   vcast_vd_d(1));\n    ql = vrint_vi_vd(dql);\n    d = vmla_vd_vd_vd_vd(dql, vcast_vd_d(-PI_A2 * 0.5), d);\n    d = vmla_vd_vd_vd_vd(dql, vcast_vd_d(-PI_B2 * 0.5), d);\n  } else if (LIKELY(vtestallones_i_vo64(vlt_vo_vd_vd(vabs_vd_vd(d), vcast_vd_d(TRIGRANGEMAX))))) {\n    vdouble dqh = vtruncate_vd_vd(vmla_vd_vd_vd_vd(d, vcast_vd_d(M_1_PI / (1 << 23)), vcast_vd_d(-M_1_PI / (1 << 24))));\n    ql = vrint_vi_vd(vadd_vd_vd_vd(vmul_vd_vd_vd(d, vcast_vd_d(M_1_PI)),\n\t\t\t\t   vmla_vd_vd_vd_vd(dqh, vcast_vd_d(-(1 << 23)), vcast_vd_d(-0.5))));\n    dqh = vmul_vd_vd_vd(dqh, vcast_vd_d(1 << 24));\n    ql = vadd_vi_vi_vi(vadd_vi_vi_vi(ql, ql), vcast_vi_i(1));\n    vdouble dql = vcast_vd_vi(ql);\n\n    d = vmla_vd_vd_vd_vd(dqh, vcast_vd_d(-PI_A * 0.5), d);\n    d = vmla_vd_vd_vd_vd(dql, vcast_vd_d(-PI_A * 0.5), d);\n    d = vmla_vd_vd_vd_vd(dqh, vcast_vd_d(-PI_B * 0.5), d);\n    d = vmla_vd_vd_vd_vd(dql, vcast_vd_d(-PI_B * 0.5), d);\n    d = vmla_vd_vd_vd_vd(dqh, vcast_vd_d(-PI_C * 0.5), d);\n    d = vmla_vd_vd_vd_vd(dql, vcast_vd_d(-PI_C * 0.5), d);\n    d = vmla_vd_vd_vd_vd(vadd_vd_vd_vd(dqh, dql), vcast_vd_d(-PI_D * 0.5), d);\n  } else {\n    ddi_t ddi = rempi(d);\n    ql = vand_vi_vi_vi(ddigeti_vi_ddi(ddi), vcast_vi_i(3));\n    ql = vadd_vi_vi_vi(vadd_vi_vi_vi(ql, ql), vsel_vi_vo_vi_vi(vcast_vo32_vo64(vgt_vo_vd_vd(vd2getx_vd_vd2(ddigetdd_vd2_ddi(ddi)), vcast_vd_d(0))), vcast_vi_i(8), 
vcast_vi_i(7)));\n    ql = vsra_vi_vi_i(ql, 1);\n    vopmask o = veq_vo_vi_vi(vand_vi_vi_vi(ddigeti_vi_ddi(ddi), vcast_vi_i(1)), vcast_vi_i(0));\n    vdouble y = vsel_vd_vo_vd_vd(vgt_vo_vd_vd(vd2getx_vd_vd2(ddigetdd_vd2_ddi(ddi)), vcast_vd_d(0)), vcast_vd_d(0), vcast_vd_d(-1));\n    vdouble2 x = vcast_vd2_vd_vd(vmulsign_vd_vd_vd(vcast_vd_d(-3.141592653589793116 * 0.5), y), \n\t\t\t\t vmulsign_vd_vd_vd(vcast_vd_d(-1.2246467991473532072e-16 * 0.5), y));\n    x = ddadd2_vd2_vd2_vd2(ddigetdd_vd2_ddi(ddi), x);\n    ddi = ddisetdd_ddi_ddi_vd2(ddi, vsel_vd2_vo_vd2_vd2(vcast_vo64_vo32(o), x, ddigetdd_vd2_ddi(ddi)));\n    d = vadd_vd_vd_vd(vd2getx_vd_vd2(ddigetdd_vd2_ddi(ddi)), vd2gety_vd_vd2(ddigetdd_vd2_ddi(ddi)));\n    d = vreinterpret_vd_vm(vor_vm_vo64_vm(vor_vo_vo_vo(visinf_vo_vd(r), visnan_vo_vd(r)), vreinterpret_vm_vd(d)));\n  }\n\n  s = vmul_vd_vd_vd(d, d);\n\n  d = vreinterpret_vd_vm(vxor_vm_vm_vm(vand_vm_vo64_vm(vcast_vo64_vo32(veq_vo_vi_vi(vand_vi_vi_vi(ql, vcast_vi_i(2)), vcast_vi_i(0))), vreinterpret_vm_vd(vcast_vd_d(-0.0))), vreinterpret_vm_vd(d)));\n\n  vdouble s2 = vmul_vd_vd_vd(s, s), s4 = vmul_vd_vd_vd(s2, s2);\n  u = POLY8(s, s2, s4,\n\t    -7.97255955009037868891952e-18,\n\t    2.81009972710863200091251e-15,\n\t    -7.64712219118158833288484e-13,\n\t    1.60590430605664501629054e-10,\n\t    -2.50521083763502045810755e-08,\n\t    2.75573192239198747630416e-06,\n\t    -0.000198412698412696162806809,\n\t    0.00833333333333332974823815);\n  u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(-0.166666666666666657414808));\n\n  u = vadd_vd_vd_vd(vmul_vd_vd_vd(s, vmul_vd_vd_vd(u, d)), d);\n  \n  return u;\n\n#else // #if !defined(DETERMINISTIC)\n\n  vdouble u, s, r = d;\n  vint ql;\n\n  vopmask g = vlt_vo_vd_vd(vabs_vd_vd(d), vcast_vd_d(TRIGRANGEMAX2));\n  vdouble dql = vmla_vd_vd_vd_vd(vcast_vd_d(2),\n\t\t\t\t vrint_vd_vd(vmla_vd_vd_vd_vd(d, vcast_vd_d(M_1_PI), vcast_vd_d(-0.5))),\n\t\t\t\t vcast_vd_d(1));\n  ql = vrint_vi_vd(dql);\n  d = vmla_vd_vd_vd_vd(dql, 
vcast_vd_d(-PI_A2 * 0.5), d);\n  d = vmla_vd_vd_vd_vd(dql, vcast_vd_d(-PI_B2 * 0.5), d);\n\n  if (!LIKELY(vtestallones_i_vo64(g))) {\n    vdouble dqh = vtruncate_vd_vd(vmla_vd_vd_vd_vd(r, vcast_vd_d(M_1_PI / (1 << 23)), vcast_vd_d(-M_1_PI / (1 << 24))));\n    vint ql2 = vrint_vi_vd(vadd_vd_vd_vd(vmul_vd_vd_vd(r, vcast_vd_d(M_1_PI)),\n\t\t\t\t\t vmla_vd_vd_vd_vd(dqh, vcast_vd_d(-(1 << 23)), vcast_vd_d(-0.5))));\n    dqh = vmul_vd_vd_vd(dqh, vcast_vd_d(1 << 24));\n    ql2 = vadd_vi_vi_vi(vadd_vi_vi_vi(ql2, ql2), vcast_vi_i(1));\n    vdouble dql = vcast_vd_vi(ql2);\n\n    u = vmla_vd_vd_vd_vd(dqh, vcast_vd_d(-PI_A * 0.5), r);\n    u = vmla_vd_vd_vd_vd(dql, vcast_vd_d(-PI_A * 0.5), u);\n    u = vmla_vd_vd_vd_vd(dqh, vcast_vd_d(-PI_B * 0.5), u);\n    u = vmla_vd_vd_vd_vd(dql, vcast_vd_d(-PI_B * 0.5), u);\n    u = vmla_vd_vd_vd_vd(dqh, vcast_vd_d(-PI_C * 0.5), u);\n    u = vmla_vd_vd_vd_vd(dql, vcast_vd_d(-PI_C * 0.5), u);\n    u = vmla_vd_vd_vd_vd(vadd_vd_vd_vd(dqh, dql), vcast_vd_d(-PI_D * 0.5), u);\n\n    ql = vsel_vi_vo_vi_vi(vcast_vo32_vo64(g), ql, ql2);\n    d = vsel_vd_vo_vd_vd(g, d, u);\n    g = vlt_vo_vd_vd(vabs_vd_vd(r), vcast_vd_d(TRIGRANGEMAX));\n\n    if (!LIKELY(vtestallones_i_vo64(g))) {\n      ddi_t ddi = rempi(r);\n      vint ql2 = vand_vi_vi_vi(ddigeti_vi_ddi(ddi), vcast_vi_i(3));\n      ql2 = vadd_vi_vi_vi(vadd_vi_vi_vi(ql2, ql2), vsel_vi_vo_vi_vi(vcast_vo32_vo64(vgt_vo_vd_vd(vd2getx_vd_vd2(ddigetdd_vd2_ddi(ddi)), vcast_vd_d(0))), vcast_vi_i(8), vcast_vi_i(7)));\n      ql2 = vsra_vi_vi_i(ql2, 1);\n      vopmask o = veq_vo_vi_vi(vand_vi_vi_vi(ddigeti_vi_ddi(ddi), vcast_vi_i(1)), vcast_vi_i(0));\n      vdouble y = vsel_vd_vo_vd_vd(vgt_vo_vd_vd(vd2getx_vd_vd2(ddigetdd_vd2_ddi(ddi)), vcast_vd_d(0)), vcast_vd_d(0), vcast_vd_d(-1));\n      vdouble2 x = vcast_vd2_vd_vd(vmulsign_vd_vd_vd(vcast_vd_d(-3.141592653589793116 * 0.5), y), \n\t\t\t\t   vmulsign_vd_vd_vd(vcast_vd_d(-1.2246467991473532072e-16 * 0.5), y));\n      x = 
ddadd2_vd2_vd2_vd2(ddigetdd_vd2_ddi(ddi), x);\n      ddi = ddisetdd_ddi_ddi_vd2(ddi, vsel_vd2_vo_vd2_vd2(vcast_vo64_vo32(o), x, ddigetdd_vd2_ddi(ddi)));\n      u = vadd_vd_vd_vd(vd2getx_vd_vd2(ddigetdd_vd2_ddi(ddi)), vd2gety_vd_vd2(ddigetdd_vd2_ddi(ddi)));\n      ql = vsel_vi_vo_vi_vi(vcast_vo32_vo64(g), ql, ql2);\n      d = vsel_vd_vo_vd_vd(g, d, u);\n      d = vreinterpret_vd_vm(vor_vm_vo64_vm(vor_vo_vo_vo(visinf_vo_vd(r), visnan_vo_vd(r)), vreinterpret_vm_vd(d)));\n    }\n  }\n\n  s = vmul_vd_vd_vd(d, d);\n\n  d = vreinterpret_vd_vm(vxor_vm_vm_vm(vand_vm_vo64_vm(vcast_vo64_vo32(veq_vo_vi_vi(vand_vi_vi_vi(ql, vcast_vi_i(2)), vcast_vi_i(0))), vreinterpret_vm_vd(vcast_vd_d(-0.0))), vreinterpret_vm_vd(d)));\n\n  vdouble s2 = vmul_vd_vd_vd(s, s), s4 = vmul_vd_vd_vd(s2, s2);\n  u = POLY8(s, s2, s4,\n\t    -7.97255955009037868891952e-18,\n\t    2.81009972710863200091251e-15,\n\t    -7.64712219118158833288484e-13,\n\t    1.60590430605664501629054e-10,\n\t    -2.50521083763502045810755e-08,\n\t    2.75573192239198747630416e-06,\n\t    -0.000198412698412696162806809,\n\t    0.00833333333333332974823815);\n  u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(-0.166666666666666657414808));\n\n  u = vadd_vd_vd_vd(vmul_vd_vd_vd(s, vmul_vd_vd_vd(u, d)), d);\n  \n  return u;\n#endif // #if !defined(DETERMINISTIC)\n}\n\nEXPORT CONST VECTOR_CC vdouble xcos_u1(vdouble d) {\n#if !defined(DETERMINISTIC)\n  vdouble u;\n  vdouble2 s, t, x;\n  vint ql;\n  \n  if (LIKELY(vtestallones_i_vo64(vlt_vo_vd_vd(vabs_vd_vd(d), vcast_vd_d(TRIGRANGEMAX2))))) {\n    vdouble dql = vrint_vd_vd(vmla_vd_vd_vd_vd(d, vcast_vd_d(M_1_PI), vcast_vd_d(-0.5)));\n    dql = vmla_vd_vd_vd_vd(vcast_vd_d(2), dql, vcast_vd_d(1));\n    ql = vrint_vi_vd(dql);\n    s = ddadd2_vd2_vd_vd(d, vmul_vd_vd_vd(dql, vcast_vd_d(-PI_A2*0.5)));\n    s = ddadd_vd2_vd2_vd(s, vmul_vd_vd_vd(dql, vcast_vd_d(-PI_B2*0.5)));\n  } else if (LIKELY(vtestallones_i_vo64(vlt_vo_vd_vd(vabs_vd_vd(d), vcast_vd_d(TRIGRANGEMAX))))) {\n    vdouble dqh = 
vtruncate_vd_vd(vmla_vd_vd_vd_vd(d, vcast_vd_d(M_1_PI / (1 << 23)), vcast_vd_d(-M_1_PI / (1 << 24))));\n    ql = vrint_vi_vd(vadd_vd_vd_vd(vmul_vd_vd_vd(d, vcast_vd_d(M_1_PI)),\n\t\t\t\t\tvmla_vd_vd_vd_vd(dqh, vcast_vd_d(-(1 << 23)), vcast_vd_d(-0.5))));\n    dqh = vmul_vd_vd_vd(dqh, vcast_vd_d(1 << 24));\n    ql = vadd_vi_vi_vi(vadd_vi_vi_vi(ql, ql), vcast_vi_i(1));\n    const vdouble dql = vcast_vd_vi(ql);\n\n    u = vmla_vd_vd_vd_vd(dqh, vcast_vd_d(-PI_A * 0.5), d);\n    s = ddadd2_vd2_vd_vd(u, vmul_vd_vd_vd(dql, vcast_vd_d(-PI_A*0.5)));\n    s = ddadd2_vd2_vd2_vd(s, vmul_vd_vd_vd(dqh, vcast_vd_d(-PI_B*0.5)));\n    s = ddadd2_vd2_vd2_vd(s, vmul_vd_vd_vd(dql, vcast_vd_d(-PI_B*0.5)));\n    s = ddadd2_vd2_vd2_vd(s, vmul_vd_vd_vd(dqh, vcast_vd_d(-PI_C*0.5)));\n    s = ddadd2_vd2_vd2_vd(s, vmul_vd_vd_vd(dql, vcast_vd_d(-PI_C*0.5)));\n    s = ddadd_vd2_vd2_vd(s, vmul_vd_vd_vd(vadd_vd_vd_vd(dqh, dql), vcast_vd_d(-PI_D*0.5)));\n  } else {\n    ddi_t ddi = rempi(d);\n    ql = vand_vi_vi_vi(ddigeti_vi_ddi(ddi), vcast_vi_i(3));\n    ql = vadd_vi_vi_vi(vadd_vi_vi_vi(ql, ql), vsel_vi_vo_vi_vi(vcast_vo32_vo64(vgt_vo_vd_vd(vd2getx_vd_vd2(ddigetdd_vd2_ddi(ddi)), vcast_vd_d(0))), vcast_vi_i(8), vcast_vi_i(7)));\n    ql = vsra_vi_vi_i(ql, 1);\n    vopmask o = veq_vo_vi_vi(vand_vi_vi_vi(ddigeti_vi_ddi(ddi), vcast_vi_i(1)), vcast_vi_i(0));\n    vdouble y = vsel_vd_vo_vd_vd(vgt_vo_vd_vd(vd2getx_vd_vd2(ddigetdd_vd2_ddi(ddi)), vcast_vd_d(0)), vcast_vd_d(0), vcast_vd_d(-1));\n    vdouble2 x = vcast_vd2_vd_vd(vmulsign_vd_vd_vd(vcast_vd_d(-3.141592653589793116 * 0.5), y), \n\t\t\t\t vmulsign_vd_vd_vd(vcast_vd_d(-1.2246467991473532072e-16 * 0.5), y));\n    x = ddadd2_vd2_vd2_vd2(ddigetdd_vd2_ddi(ddi), x);\n    ddi = ddisetdd_ddi_ddi_vd2(ddi, vsel_vd2_vo_vd2_vd2(vcast_vo64_vo32(o), x, ddigetdd_vd2_ddi(ddi)));\n    s = ddnormalize_vd2_vd2(ddigetdd_vd2_ddi(ddi));\n    s = vd2setx_vd2_vd2_vd(s, vreinterpret_vd_vm(vor_vm_vo64_vm(vor_vo_vo_vo(visinf_vo_vd(d), visnan_vo_vd(d)), 
vreinterpret_vm_vd(vd2getx_vd_vd2(s)))));\n  }\n  \n  t = s;\n  s = ddsqu_vd2_vd2(s);\n\n  vdouble s2 = vmul_vd_vd_vd(vd2getx_vd_vd2(s), vd2getx_vd_vd2(s)), s4 = vmul_vd_vd_vd(s2, s2);\n  u = POLY6(vd2getx_vd_vd2(s), s2, s4,\n\t    2.72052416138529567917983e-15,\n\t    -7.6429259411395447190023e-13,\n\t    1.60589370117277896211623e-10,\n\t    -2.5052106814843123359368e-08,\n\t    2.75573192104428224777379e-06,\n\t    -0.000198412698412046454654947);\n  u = vmla_vd_vd_vd_vd(u, vd2getx_vd_vd2(s), vcast_vd_d(0.00833333333333318056201922));\n\n  x = ddadd_vd2_vd_vd2(vcast_vd_d(1), ddmul_vd2_vd2_vd2(ddadd_vd2_vd_vd(vcast_vd_d(-0.166666666666666657414808), vmul_vd_vd_vd(u, vd2getx_vd_vd2(s))), s));\n  u = ddmul_vd_vd2_vd2(t, x);\n  \n  u = vreinterpret_vd_vm(vxor_vm_vm_vm(vand_vm_vo64_vm(vcast_vo64_vo32(veq_vo_vi_vi(vand_vi_vi_vi(ql, vcast_vi_i(2)), vcast_vi_i(0))), vreinterpret_vm_vd(vcast_vd_d(-0.0))), vreinterpret_vm_vd(u)));\n  \n  return u;\n\n#else // #if !defined(DETERMINISTIC)\n\n  vdouble u;\n  vdouble2 s, t, x;\n  vint ql;\n\n  vopmask g = vlt_vo_vd_vd(vabs_vd_vd(d), vcast_vd_d(TRIGRANGEMAX2));\n  vdouble dql = vrint_vd_vd(vmla_vd_vd_vd_vd(d, vcast_vd_d(M_1_PI), vcast_vd_d(-0.5)));\n  dql = vmla_vd_vd_vd_vd(vcast_vd_d(2), dql, vcast_vd_d(1));\n  ql = vrint_vi_vd(dql);\n  x = ddadd2_vd2_vd_vd(d, vmul_vd_vd_vd(dql, vcast_vd_d(-PI_A2*0.5)));\n  x = ddadd_vd2_vd2_vd(x, vmul_vd_vd_vd(dql, vcast_vd_d(-PI_B2*0.5)));\n\n  if (!LIKELY(vtestallones_i_vo64(g))) {\n    vdouble dqh = vtruncate_vd_vd(vmla_vd_vd_vd_vd(d, vcast_vd_d(M_1_PI / (1 << 23)), vcast_vd_d(-M_1_PI / (1 << 24))));\n    vint ql2 = vrint_vi_vd(vadd_vd_vd_vd(vmul_vd_vd_vd(d, vcast_vd_d(M_1_PI)),\n\t\t\t\t\t vmla_vd_vd_vd_vd(dqh, vcast_vd_d(-(1 << 23)), vcast_vd_d(-0.5))));\n    dqh = vmul_vd_vd_vd(dqh, vcast_vd_d(1 << 24));\n    ql2 = vadd_vi_vi_vi(vadd_vi_vi_vi(ql2, ql2), vcast_vi_i(1));\n    const vdouble dql = vcast_vd_vi(ql2);\n\n    u = vmla_vd_vd_vd_vd(dqh, vcast_vd_d(-PI_A * 0.5), d);\n    s = 
ddadd2_vd2_vd_vd(u, vmul_vd_vd_vd(dql, vcast_vd_d(-PI_A*0.5)));\n    s = ddadd2_vd2_vd2_vd(s, vmul_vd_vd_vd(dqh, vcast_vd_d(-PI_B*0.5)));\n    s = ddadd2_vd2_vd2_vd(s, vmul_vd_vd_vd(dql, vcast_vd_d(-PI_B*0.5)));\n    s = ddadd2_vd2_vd2_vd(s, vmul_vd_vd_vd(dqh, vcast_vd_d(-PI_C*0.5)));\n    s = ddadd2_vd2_vd2_vd(s, vmul_vd_vd_vd(dql, vcast_vd_d(-PI_C*0.5)));\n    s = ddadd_vd2_vd2_vd(s, vmul_vd_vd_vd(vadd_vd_vd_vd(dqh, dql), vcast_vd_d(-PI_D*0.5)));\n\n    ql = vsel_vi_vo_vi_vi(vcast_vo32_vo64(g), ql, ql2);\n    x = vsel_vd2_vo_vd2_vd2(g, x, s);\n    g = vlt_vo_vd_vd(vabs_vd_vd(d), vcast_vd_d(TRIGRANGEMAX));\n\n    if (!LIKELY(vtestallones_i_vo64(g))) {\n      ddi_t ddi = rempi(d);\n      vint ql2 = vand_vi_vi_vi(ddigeti_vi_ddi(ddi), vcast_vi_i(3));\n      ql2 = vadd_vi_vi_vi(vadd_vi_vi_vi(ql2, ql2), vsel_vi_vo_vi_vi(vcast_vo32_vo64(vgt_vo_vd_vd(vd2getx_vd_vd2(ddigetdd_vd2_ddi(ddi)), vcast_vd_d(0))), vcast_vi_i(8), vcast_vi_i(7)));\n      ql2 = vsra_vi_vi_i(ql2, 1);\n      vopmask o = veq_vo_vi_vi(vand_vi_vi_vi(ddigeti_vi_ddi(ddi), vcast_vi_i(1)), vcast_vi_i(0));\n      vdouble y = vsel_vd_vo_vd_vd(vgt_vo_vd_vd(vd2getx_vd_vd2(ddigetdd_vd2_ddi(ddi)), vcast_vd_d(0)), vcast_vd_d(0), vcast_vd_d(-1));\n      vdouble2 t = vcast_vd2_vd_vd(vmulsign_vd_vd_vd(vcast_vd_d(-3.141592653589793116 * 0.5), y), \n\t\t\t\t   vmulsign_vd_vd_vd(vcast_vd_d(-1.2246467991473532072e-16 * 0.5), y));\n      t = ddadd2_vd2_vd2_vd2(ddigetdd_vd2_ddi(ddi), t);\n      ddi = ddisetdd_ddi_ddi_vd2(ddi, vsel_vd2_vo_vd2_vd2(vcast_vo64_vo32(o), t, ddigetdd_vd2_ddi(ddi)));\n      s = ddnormalize_vd2_vd2(ddigetdd_vd2_ddi(ddi));\n      ql = vsel_vi_vo_vi_vi(vcast_vo32_vo64(g), ql, ql2);\n      x = vsel_vd2_vo_vd2_vd2(g, x, s);\n      x = vd2setx_vd2_vd2_vd(x, vreinterpret_vd_vm(vor_vm_vo64_vm(vor_vo_vo_vo(visinf_vo_vd(d), visnan_vo_vd(d)), vreinterpret_vm_vd(vd2getx_vd_vd2(x)))));\n    }\n  }\n  \n  t = x;\n  s = ddsqu_vd2_vd2(x);\n\n  vdouble s2 = vmul_vd_vd_vd(vd2getx_vd_vd2(s), vd2getx_vd_vd2(s)), s4 = 
vmul_vd_vd_vd(s2, s2);\n  u = POLY6(vd2getx_vd_vd2(s), s2, s4,\n\t    2.72052416138529567917983e-15,\n\t    -7.6429259411395447190023e-13,\n\t    1.60589370117277896211623e-10,\n\t    -2.5052106814843123359368e-08,\n\t    2.75573192104428224777379e-06,\n\t    -0.000198412698412046454654947);\n  u = vmla_vd_vd_vd_vd(u, vd2getx_vd_vd2(s), vcast_vd_d(0.00833333333333318056201922));\n\n  x = ddadd_vd2_vd_vd2(vcast_vd_d(1), ddmul_vd2_vd2_vd2(ddadd_vd2_vd_vd(vcast_vd_d(-0.166666666666666657414808), vmul_vd_vd_vd(u, vd2getx_vd_vd2(s))), s));\n  u = ddmul_vd_vd2_vd2(t, x);\n  \n  u = vreinterpret_vd_vm(vxor_vm_vm_vm(vand_vm_vo64_vm(vcast_vo64_vo32(veq_vo_vi_vi(vand_vi_vi_vi(ql, vcast_vi_i(2)), vcast_vi_i(0))), vreinterpret_vm_vd(vcast_vd_d(-0.0))), vreinterpret_vm_vd(u)));\n  \n  return u;\n#endif // #if !defined(DETERMINISTIC)\n}\n\n#ifdef ENABLE_GNUABI\n#define TYPE2_FUNCATR static INLINE CONST \n#define TYPE6_FUNCATR static INLINE CONST \n#define SQRTU05_FUNCATR static INLINE CONST \n#define XSINCOS sincosk\n#define XSINCOS_U1 sincosk_u1\n#define XSINCOSPI_U05 sincospik_u05\n#define XSINCOSPI_U35 sincospik_u35\n#define XMODF modfk\n#else\n#define TYPE2_FUNCATR EXPORT\n#define TYPE6_FUNCATR EXPORT CONST\n#define SQRTU05_FUNCATR EXPORT CONST\n#define XSINCOS xsincos\n#define XSINCOS_U1 xsincos_u1\n#define XSINCOSPI_U05 xsincospi_u05\n#define XSINCOSPI_U35 xsincospi_u35\n#define XMODF xmodf\n#endif\n\nTYPE2_FUNCATR VECTOR_CC vdouble2 XSINCOS(vdouble d) {\n#if !defined(DETERMINISTIC)\n  vopmask o;\n  vdouble u, t, rx, ry, s;\n  vdouble2 r;\n  vint ql;\n\n  if (LIKELY(vtestallones_i_vo64(vlt_vo_vd_vd(vabs_vd_vd(d), vcast_vd_d(TRIGRANGEMAX2))))) {\n    vdouble dql = vrint_vd_vd(vmul_vd_vd_vd(d, vcast_vd_d(2 * M_1_PI)));\n    ql = vrint_vi_vd(dql);\n    s = vmla_vd_vd_vd_vd(dql, vcast_vd_d(-PI_A2 * 0.5), d);\n    s = vmla_vd_vd_vd_vd(dql, vcast_vd_d(-PI_B2 * 0.5), s);\n  } else if (LIKELY(vtestallones_i_vo64(vlt_vo_vd_vd(vabs_vd_vd(d), vcast_vd_d(TRIGRANGEMAX))))) {\n    
vdouble dqh = vtruncate_vd_vd(vmul_vd_vd_vd(d, vcast_vd_d(2*M_1_PI / (1 << 24))));\n    dqh = vmul_vd_vd_vd(dqh, vcast_vd_d(1 << 24));\n    vdouble dql = vrint_vd_vd(vsub_vd_vd_vd(vmul_vd_vd_vd(d, vcast_vd_d(2*M_1_PI)), dqh));\n    ql = vrint_vi_vd(dql);\n\n    s = vmla_vd_vd_vd_vd(dqh, vcast_vd_d(-PI_A * 0.5), d);\n    s = vmla_vd_vd_vd_vd(dql, vcast_vd_d(-PI_A * 0.5), s);\n    s = vmla_vd_vd_vd_vd(dqh, vcast_vd_d(-PI_B * 0.5), s);\n    s = vmla_vd_vd_vd_vd(dql, vcast_vd_d(-PI_B * 0.5), s);\n    s = vmla_vd_vd_vd_vd(dqh, vcast_vd_d(-PI_C * 0.5), s);\n    s = vmla_vd_vd_vd_vd(dql, vcast_vd_d(-PI_C * 0.5), s);\n    s = vmla_vd_vd_vd_vd(vadd_vd_vd_vd(dqh, dql), vcast_vd_d(-PI_D * 0.5), s);\n  } else {\n    ddi_t ddi = rempi(d);\n    ql = ddigeti_vi_ddi(ddi);\n    s = vadd_vd_vd_vd(vd2getx_vd_vd2(ddigetdd_vd2_ddi(ddi)), vd2gety_vd_vd2(ddigetdd_vd2_ddi(ddi)));\n    s = vreinterpret_vd_vm(vor_vm_vo64_vm(vor_vo_vo_vo(visinf_vo_vd(d), visnan_vo_vd(d)), vreinterpret_vm_vd(s)));\n  }\n  \n  t = s;\n\n  s = vmul_vd_vd_vd(s, s);\n\n  u = vcast_vd_d(1.58938307283228937328511e-10);\n  u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(-2.50506943502539773349318e-08));\n  u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(2.75573131776846360512547e-06));\n  u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(-0.000198412698278911770864914));\n  u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(0.0083333333333191845961746));\n  u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(-0.166666666666666130709393));\n\n  rx = vmla_vd_vd_vd_vd(vmul_vd_vd_vd(u, s), t, t);\n  rx = vsel_vd_vo_vd_vd(visnegzero_vo_vd(d), vcast_vd_d(-0.0), rx);\n\n  u = vcast_vd_d(-1.13615350239097429531523e-11);\n  u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(2.08757471207040055479366e-09));\n  u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(-2.75573144028847567498567e-07));\n  u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(2.48015872890001867311915e-05));\n  u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(-0.00138888888888714019282329));\n  u = vmla_vd_vd_vd_vd(u, s, 
vcast_vd_d(0.0416666666666665519592062));\n  u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(-0.5));\n\n  ry = vmla_vd_vd_vd_vd(s, u, vcast_vd_d(1));\n\n  o = vcast_vo64_vo32(veq_vo_vi_vi(vand_vi_vi_vi(ql, vcast_vi_i(1)), vcast_vi_i(0)));\n  r = vd2setxy_vd2_vd_vd(vsel_vd_vo_vd_vd(o, rx, ry), vsel_vd_vo_vd_vd(o, ry, rx));\n\n  o = vcast_vo64_vo32(veq_vo_vi_vi(vand_vi_vi_vi(ql, vcast_vi_i(2)), vcast_vi_i(2)));\n  r = vd2setx_vd2_vd2_vd(r, vreinterpret_vd_vm(vxor_vm_vm_vm(vand_vm_vo64_vm(o, vreinterpret_vm_vd(vcast_vd_d(-0.0))), vreinterpret_vm_vd(vd2getx_vd_vd2(r)))));\n\n  o = vcast_vo64_vo32(veq_vo_vi_vi(vand_vi_vi_vi(vadd_vi_vi_vi(ql, vcast_vi_i(1)), vcast_vi_i(2)), vcast_vi_i(2)));\n  r = vd2sety_vd2_vd2_vd(r, vreinterpret_vd_vm(vxor_vm_vm_vm(vand_vm_vo64_vm(o, vreinterpret_vm_vd(vcast_vd_d(-0.0))), vreinterpret_vm_vd(vd2gety_vd_vd2(r)))));\n  \n  return r;\n\n#else // #if !defined(DETERMINISTIC)\n\n  vopmask o;\n  vdouble u, t, rx, ry, s = d;\n  vdouble2 r;\n  vint ql;\n\n  vdouble dql = vrint_vd_vd(vmul_vd_vd_vd(s, vcast_vd_d(2 * M_1_PI)));\n  ql = vrint_vi_vd(dql);\n  s = vmla_vd_vd_vd_vd(dql, vcast_vd_d(-PI_A2 * 0.5), s);\n  s = vmla_vd_vd_vd_vd(dql, vcast_vd_d(-PI_B2 * 0.5), s);\n  vopmask g = vlt_vo_vd_vd(vabs_vd_vd(d), vcast_vd_d(TRIGRANGEMAX2));\n\n  if (!LIKELY(vtestallones_i_vo64(g))) {\n    vdouble dqh = vtruncate_vd_vd(vmul_vd_vd_vd(d, vcast_vd_d(2*M_1_PI / (1 << 24))));\n    dqh = vmul_vd_vd_vd(dqh, vcast_vd_d(1 << 24));\n    vdouble dql = vrint_vd_vd(vsub_vd_vd_vd(vmul_vd_vd_vd(d, vcast_vd_d(2*M_1_PI)), dqh));\n\n    u = vmla_vd_vd_vd_vd(dqh, vcast_vd_d(-PI_A * 0.5), d);\n    u = vmla_vd_vd_vd_vd(dql, vcast_vd_d(-PI_A * 0.5), u);\n    u = vmla_vd_vd_vd_vd(dqh, vcast_vd_d(-PI_B * 0.5), u);\n    u = vmla_vd_vd_vd_vd(dql, vcast_vd_d(-PI_B * 0.5), u);\n    u = vmla_vd_vd_vd_vd(dqh, vcast_vd_d(-PI_C * 0.5), u);\n    u = vmla_vd_vd_vd_vd(dql, vcast_vd_d(-PI_C * 0.5), u);\n    u = vmla_vd_vd_vd_vd(vadd_vd_vd_vd(dqh, dql), vcast_vd_d(-PI_D * 0.5), u);\n\n    ql = 
vsel_vi_vo_vi_vi(vcast_vo32_vo64(g), ql, vrint_vi_vd(dql));\n    s = vsel_vd_vo_vd_vd(g, s, u);\n    g = vlt_vo_vd_vd(vabs_vd_vd(d), vcast_vd_d(TRIGRANGEMAX));\n\n    if (!LIKELY(vtestallones_i_vo64(g))) {\n      ddi_t ddi = rempi(d);\n      u = vadd_vd_vd_vd(vd2getx_vd_vd2(ddigetdd_vd2_ddi(ddi)), vd2gety_vd_vd2(ddigetdd_vd2_ddi(ddi)));\n      u = vreinterpret_vd_vm(vor_vm_vo64_vm(vor_vo_vo_vo(visinf_vo_vd(d), visnan_vo_vd(d)), vreinterpret_vm_vd(u)));\n\n      ql = vsel_vi_vo_vi_vi(vcast_vo32_vo64(g), ql, ddigeti_vi_ddi(ddi));\n      s = vsel_vd_vo_vd_vd(g, s, u);\n    }\n  }\n  \n  t = s;\n\n  s = vmul_vd_vd_vd(s, s);\n\n  u = vcast_vd_d(1.58938307283228937328511e-10);\n  u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(-2.50506943502539773349318e-08));\n  u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(2.75573131776846360512547e-06));\n  u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(-0.000198412698278911770864914));\n  u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(0.0083333333333191845961746));\n  u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(-0.166666666666666130709393));\n\n  rx = vmla_vd_vd_vd_vd(vmul_vd_vd_vd(u, s), t, t);\n  rx = vsel_vd_vo_vd_vd(visnegzero_vo_vd(d), vcast_vd_d(-0.0), rx);\n\n  u = vcast_vd_d(-1.13615350239097429531523e-11);\n  u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(2.08757471207040055479366e-09));\n  u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(-2.75573144028847567498567e-07));\n  u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(2.48015872890001867311915e-05));\n  u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(-0.00138888888888714019282329));\n  u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(0.0416666666666665519592062));\n  u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(-0.5));\n\n  ry = vmla_vd_vd_vd_vd(s, u, vcast_vd_d(1));\n\n  o = vcast_vo64_vo32(veq_vo_vi_vi(vand_vi_vi_vi(ql, vcast_vi_i(1)), vcast_vi_i(0)));\n  r = vd2setxy_vd2_vd_vd(vsel_vd_vo_vd_vd(o, rx, ry), vsel_vd_vo_vd_vd(o, ry, rx));\n\n  o = vcast_vo64_vo32(veq_vo_vi_vi(vand_vi_vi_vi(ql, vcast_vi_i(2)), vcast_vi_i(2)));\n  r = vd2setx_vd2_vd2_vd(r, 
vreinterpret_vd_vm(vxor_vm_vm_vm(vand_vm_vo64_vm(o, vreinterpret_vm_vd(vcast_vd_d(-0.0))), vreinterpret_vm_vd(vd2getx_vd_vd2(r)))));

  // Negate the cosine lane when ((ql+1) & 2) == 2 (quadrant sign fix-up).
  o = vcast_vo64_vo32(veq_vo_vi_vi(vand_vi_vi_vi(vadd_vi_vi_vi(ql, vcast_vi_i(1)), vcast_vi_i(2)), vcast_vi_i(2)));
  r = vd2sety_vd2_vd2_vd(r, vreinterpret_vd_vm(vxor_vm_vm_vm(vand_vm_vo64_vm(o, vreinterpret_vm_vd(vcast_vd_d(-0.0))), vreinterpret_vm_vd(vd2gety_vd_vd2(r)))));

  return r;
#endif // #if !defined(DETERMINISTIC)
}

// Simultaneous sine/cosine, higher-accuracy ("_u1") variant: argument
// reduction and the final corrections are carried in double-double
// (vdouble2) precision.  The x lane of the result receives the
// sine-series value and the y lane the cosine-series value, swapped and
// sign-fixed per quadrant ql at the end.
TYPE2_FUNCATR VECTOR_CC vdouble2 XSINCOS_U1(vdouble d) {
#if !defined(DETERMINISTIC)
  vopmask o;
  vdouble u, rx, ry;
  vdouble2 r, s, t, x;
  vint ql;

  if (LIKELY(vtestallones_i_vo64(vlt_vo_vd_vd(vabs_vd_vd(d), vcast_vd_d(TRIGRANGEMAX2))))) {
    // All lanes small: single-stage Cody-Waite reduction with the
    // two-piece constant PI_A2/PI_B2.
    const vdouble dql = vrint_vd_vd(vmul_vd_vd_vd(d, vcast_vd_d(2 * M_1_PI)));
    ql = vrint_vi_vd(dql);
    u = vmla_vd_vd_vd_vd(dql, vcast_vd_d(-PI_A2*0.5), d);
    s = ddadd_vd2_vd_vd (u, vmul_vd_vd_vd(dql, vcast_vd_d(-PI_B2*0.5)));
  } else if (LIKELY(vtestallones_i_vo64(vlt_vo_vd_vd(vabs_vd_vd(d), vcast_vd_d(TRIGRANGEMAX))))) {
    // Medium range: split the quotient into a high part (multiple of 2^24)
    // and a low part so each product against the 4-piece PI_A..PI_D
    // constant stays exact.
    vdouble dqh = vtruncate_vd_vd(vmul_vd_vd_vd(d, vcast_vd_d(2*M_1_PI / (1 << 24))));
    dqh = vmul_vd_vd_vd(dqh, vcast_vd_d(1 << 24));
    const vdouble dql = vrint_vd_vd(vsub_vd_vd_vd(vmul_vd_vd_vd(d, vcast_vd_d(2*M_1_PI)), dqh));
    ql = vrint_vi_vd(dql);

    u = vmla_vd_vd_vd_vd(dqh, vcast_vd_d(-PI_A * 0.5), d);
    s = ddadd_vd2_vd_vd(u, vmul_vd_vd_vd(dql, vcast_vd_d(-PI_A*0.5)));
    s = ddadd2_vd2_vd2_vd(s, vmul_vd_vd_vd(dqh, vcast_vd_d(-PI_B*0.5)));
    s = ddadd2_vd2_vd2_vd(s, vmul_vd_vd_vd(dql, vcast_vd_d(-PI_B*0.5)));
    s = ddadd2_vd2_vd2_vd(s, vmul_vd_vd_vd(dqh, vcast_vd_d(-PI_C*0.5)));
    s = ddadd2_vd2_vd2_vd(s, vmul_vd_vd_vd(dql, vcast_vd_d(-PI_C*0.5)));
    s = ddadd_vd2_vd2_vd(s, vmul_vd_vd_vd(vadd_vd_vd_vd(dqh, dql), vcast_vd_d(-PI_D*0.5)));
  } else {
    // Huge or non-finite arguments: full-range reduction helper rempi();
    // the Inf/NaN mask below forces NaN into both lanes for those inputs.
    ddi_t ddi = rempi(d);
    ql = ddigeti_vi_ddi(ddi);
    s = ddigetdd_vd2_ddi(ddi);
    o = vor_vo_vo_vo(visinf_vo_vd(d), visnan_vo_vd(d));
    s = vd2setxy_vd2_vd_vd(vreinterpret_vd_vm(vor_vm_vo64_vm(o, vreinterpret_vm_vd(vd2getx_vd_vd2(s)))),
                           vreinterpret_vd_vm(vor_vm_vo64_vm(o, vreinterpret_vm_vd(vd2gety_vd_vd2(s)))));
  }

  t = s;

  // s.x <- s^2 (only the high part is needed for the polynomial).
  s = vd2setx_vd2_vd2_vd(s, ddsqu_vd_vd2(s));

  // Sine polynomial (odd series, evaluated in s^2 by Horner's scheme).
  u = vcast_vd_d(1.58938307283228937328511e-10);
  u = vmla_vd_vd_vd_vd(u, vd2getx_vd_vd2(s), vcast_vd_d(-2.50506943502539773349318e-08));
  u = vmla_vd_vd_vd_vd(u, vd2getx_vd_vd2(s), vcast_vd_d(2.75573131776846360512547e-06));
  u = vmla_vd_vd_vd_vd(u, vd2getx_vd_vd2(s), vcast_vd_d(-0.000198412698278911770864914));
  u = vmla_vd_vd_vd_vd(u, vd2getx_vd_vd2(s), vcast_vd_d(0.0083333333333191845961746));
  u = vmla_vd_vd_vd_vd(u, vd2getx_vd_vd2(s), vcast_vd_d(-0.166666666666666130709393));

  u = vmul_vd_vd_vd(u, vmul_vd_vd_vd(vd2getx_vd_vd2(s), vd2getx_vd_vd2(t)));

  x = ddadd_vd2_vd2_vd(t, u);
  rx = vadd_vd_vd_vd(vd2getx_vd_vd2(x), vd2gety_vd_vd2(x));

  // Preserve the sign of -0.0 in the sine lane.
  rx = vsel_vd_vo_vd_vd(visnegzero_vo_vd(d), vcast_vd_d(-0.0), rx);

  // Cosine polynomial (even series).
  u = vcast_vd_d(-1.13615350239097429531523e-11);
  u = vmla_vd_vd_vd_vd(u, vd2getx_vd_vd2(s), vcast_vd_d(2.08757471207040055479366e-09));
  u = vmla_vd_vd_vd_vd(u, vd2getx_vd_vd2(s), vcast_vd_d(-2.75573144028847567498567e-07));
  u = vmla_vd_vd_vd_vd(u, vd2getx_vd_vd2(s), vcast_vd_d(2.48015872890001867311915e-05));
  u = vmla_vd_vd_vd_vd(u, vd2getx_vd_vd2(s), vcast_vd_d(-0.00138888888888714019282329));
  u = vmla_vd_vd_vd_vd(u, vd2getx_vd_vd2(s), vcast_vd_d(0.0416666666666665519592062));
  u = vmla_vd_vd_vd_vd(u, vd2getx_vd_vd2(s), vcast_vd_d(-0.5));

  x = ddadd_vd2_vd_vd2(vcast_vd_d(1), ddmul_vd2_vd_vd(vd2getx_vd_vd2(s), u));
  ry = vadd_vd_vd_vd(vd2getx_vd_vd2(x), vd2gety_vd_vd2(x));

  // Quadrant handling: even ql keeps (sin, cos) order, odd ql swaps them.
  o = vcast_vo64_vo32(veq_vo_vi_vi(vand_vi_vi_vi(ql, vcast_vi_i(1)), vcast_vi_i(0)));
  r = vd2setxy_vd2_vd_vd(vsel_vd_vo_vd_vd(o, rx, ry), vsel_vd_vo_vd_vd(o, ry, rx));

  o = vcast_vo64_vo32(veq_vo_vi_vi(vand_vi_vi_vi(ql,
vcast_vi_i(2)), vcast_vi_i(2)));
  r = vd2setx_vd2_vd2_vd(r, vreinterpret_vd_vm(vxor_vm_vm_vm(vand_vm_vo64_vm(o, vreinterpret_vm_vd(vcast_vd_d(-0.0))), vreinterpret_vm_vd(vd2getx_vd_vd2(r)))));

  o = vcast_vo64_vo32(veq_vo_vi_vi(vand_vi_vi_vi(vadd_vi_vi_vi(ql, vcast_vi_i(1)), vcast_vi_i(2)), vcast_vi_i(2)));
  r = vd2sety_vd2_vd2_vd(r, vreinterpret_vd_vm(vxor_vm_vm_vm(vand_vm_vo64_vm(o, vreinterpret_vm_vd(vcast_vd_d(-0.0))), vreinterpret_vm_vd(vd2gety_vd_vd2(r)))));

  return r;

#else // #if !defined(DETERMINISTIC)

  // Deterministic variant: branch-free data flow — always perform the
  // cheap reduction, then blend in the wider-range results only for lanes
  // that need them (mask g).
  vopmask o;
  vdouble u, rx, ry;
  vdouble2 r, s, t, x;
  vint ql;

  const vdouble dql = vrint_vd_vd(vmul_vd_vd_vd(d, vcast_vd_d(2 * M_1_PI)));
  ql = vrint_vi_vd(dql);
  u = vmla_vd_vd_vd_vd(dql, vcast_vd_d(-PI_A2*0.5), d);
  s = ddadd_vd2_vd_vd (u, vmul_vd_vd_vd(dql, vcast_vd_d(-PI_B2*0.5)));
  vopmask g = vlt_vo_vd_vd(vabs_vd_vd(d), vcast_vd_d(TRIGRANGEMAX2));

  if (!LIKELY(vtestallones_i_vo64(g))) {
    vdouble dqh = vtruncate_vd_vd(vmul_vd_vd_vd(d, vcast_vd_d(2*M_1_PI / (1 << 24))));
    dqh = vmul_vd_vd_vd(dqh, vcast_vd_d(1 << 24));
    const vdouble dql = vrint_vd_vd(vsub_vd_vd_vd(vmul_vd_vd_vd(d, vcast_vd_d(2*M_1_PI)), dqh));

    u = vmla_vd_vd_vd_vd(dqh, vcast_vd_d(-PI_A * 0.5), d);
    x = ddadd_vd2_vd_vd(u, vmul_vd_vd_vd(dql, vcast_vd_d(-PI_A*0.5)));
    x = ddadd2_vd2_vd2_vd(x, vmul_vd_vd_vd(dqh, vcast_vd_d(-PI_B*0.5)));
    x = ddadd2_vd2_vd2_vd(x, vmul_vd_vd_vd(dql, vcast_vd_d(-PI_B*0.5)));
    x = ddadd2_vd2_vd2_vd(x, vmul_vd_vd_vd(dqh, vcast_vd_d(-PI_C*0.5)));
    x = ddadd2_vd2_vd2_vd(x, vmul_vd_vd_vd(dql, vcast_vd_d(-PI_C*0.5)));
    x = ddadd_vd2_vd2_vd(x, vmul_vd_vd_vd(vadd_vd_vd_vd(dqh, dql), vcast_vd_d(-PI_D*0.5)));

    ql = vsel_vi_vo_vi_vi(vcast_vo32_vo64(g), ql, vrint_vi_vd(dql));
    s = vsel_vd2_vo_vd2_vd2(g, s, x);
    g = vlt_vo_vd_vd(vabs_vd_vd(d), vcast_vd_d(TRIGRANGEMAX));

    if (!LIKELY(vtestallones_i_vo64(g))) {
      ddi_t ddi = rempi(d);
      x =
ddigetdd_vd2_ddi(ddi);
      // Inf/NaN inputs: force NaN into both lanes of the reduced argument.
      o = vor_vo_vo_vo(visinf_vo_vd(d), visnan_vo_vd(d));
      x = vd2setx_vd2_vd2_vd(x, vreinterpret_vd_vm(vor_vm_vo64_vm(o, vreinterpret_vm_vd(vd2getx_vd_vd2(x)))));
      x = vd2sety_vd2_vd2_vd(x, vreinterpret_vd_vm(vor_vm_vo64_vm(o, vreinterpret_vm_vd(vd2gety_vd_vd2(x)))));

      ql = vsel_vi_vo_vi_vi(vcast_vo32_vo64(g), ql, ddigeti_vi_ddi(ddi));
      s = vsel_vd2_vo_vd2_vd2(g, s, x);
    }
  }

  t = s;

  // s.x <- s^2 (high part only) for the polynomial evaluations below.
  s = vd2setx_vd2_vd2_vd(s, ddsqu_vd_vd2(s));

  // Sine polynomial (Horner, in s^2).
  u = vcast_vd_d(1.58938307283228937328511e-10);
  u = vmla_vd_vd_vd_vd(u, vd2getx_vd_vd2(s), vcast_vd_d(-2.50506943502539773349318e-08));
  u = vmla_vd_vd_vd_vd(u, vd2getx_vd_vd2(s), vcast_vd_d(2.75573131776846360512547e-06));
  u = vmla_vd_vd_vd_vd(u, vd2getx_vd_vd2(s), vcast_vd_d(-0.000198412698278911770864914));
  u = vmla_vd_vd_vd_vd(u, vd2getx_vd_vd2(s), vcast_vd_d(0.0083333333333191845961746));
  u = vmla_vd_vd_vd_vd(u, vd2getx_vd_vd2(s), vcast_vd_d(-0.166666666666666130709393));

  u = vmul_vd_vd_vd(u, vmul_vd_vd_vd(vd2getx_vd_vd2(s), vd2getx_vd_vd2(t)));

  x = ddadd_vd2_vd2_vd(t, u);
  rx = vadd_vd_vd_vd(vd2getx_vd_vd2(x), vd2gety_vd_vd2(x));

  // Keep -0.0 in the sine lane for negative-zero input.
  rx = vsel_vd_vo_vd_vd(visnegzero_vo_vd(d), vcast_vd_d(-0.0), rx);

  // Cosine polynomial (Horner, in s^2).
  u = vcast_vd_d(-1.13615350239097429531523e-11);
  u = vmla_vd_vd_vd_vd(u, vd2getx_vd_vd2(s), vcast_vd_d(2.08757471207040055479366e-09));
  u = vmla_vd_vd_vd_vd(u, vd2getx_vd_vd2(s), vcast_vd_d(-2.75573144028847567498567e-07));
  u = vmla_vd_vd_vd_vd(u, vd2getx_vd_vd2(s), vcast_vd_d(2.48015872890001867311915e-05));
  u = vmla_vd_vd_vd_vd(u, vd2getx_vd_vd2(s), vcast_vd_d(-0.00138888888888714019282329));
  u = vmla_vd_vd_vd_vd(u, vd2getx_vd_vd2(s), vcast_vd_d(0.0416666666666665519592062));
  u = vmla_vd_vd_vd_vd(u, vd2getx_vd_vd2(s), vcast_vd_d(-0.5));

  x = ddadd_vd2_vd_vd2(vcast_vd_d(1), ddmul_vd2_vd_vd(vd2getx_vd_vd2(s), u));
  ry = vadd_vd_vd_vd(vd2getx_vd_vd2(x), vd2gety_vd_vd2(x));

  // Quadrant swap and sign fix-ups, as in the non-deterministic branch.
  o =
vcast_vo64_vo32(veq_vo_vi_vi(vand_vi_vi_vi(ql, vcast_vi_i(1)), vcast_vi_i(0)));
  r = vd2setxy_vd2_vd_vd(vsel_vd_vo_vd_vd(o, rx, ry), vsel_vd_vo_vd_vd(o, ry, rx));

  o = vcast_vo64_vo32(veq_vo_vi_vi(vand_vi_vi_vi(ql, vcast_vi_i(2)), vcast_vi_i(2)));
  r = vd2setx_vd2_vd2_vd(r, vreinterpret_vd_vm(vxor_vm_vm_vm(vand_vm_vo64_vm(o, vreinterpret_vm_vd(vcast_vd_d(-0.0))), vreinterpret_vm_vd(vd2getx_vd_vd2(r)))));

  o = vcast_vo64_vo32(veq_vo_vi_vi(vand_vi_vi_vi(vadd_vi_vi_vi(ql, vcast_vi_i(1)), vcast_vi_i(2)), vcast_vi_i(2)));
  r = vd2sety_vd2_vd2_vd(r, vreinterpret_vd_vm(vxor_vm_vm_vm(vand_vm_vo64_vm(o, vreinterpret_vm_vd(vcast_vd_d(-0.0))), vreinterpret_vm_vd(vd2gety_vd_vd2(r)))));

  return r;
#endif // #if !defined(DETERMINISTIC)
}

#if !defined(DETERMINISTIC)
// Simultaneous sin(pi*d)/cos(pi*d), 0.5-ULP-class variant.  The octant is
// chosen by rounding 4*d to an even integer; polynomial corrections are
// accumulated in double-double (vdouble2) precision.
TYPE2_FUNCATR VECTOR_CC vdouble2 XSINCOSPI_U05(vdouble d) {
  vopmask o;
  vdouble u, s, t, rx, ry;
  vdouble2 r, x, s2;

  u = vmul_vd_vd_vd(d, vcast_vd_d(4.0));
  vint q = vtruncate_vi_vd(u);
  // Round q away from zero to the nearest even integer (the ~1 mask
  // clears the low bit after the sign-dependent +/-1 adjustment).
  q = vand_vi_vi_vi(vadd_vi_vi_vi(q, vxor_vi_vi_vi(vsrl_vi_vi_i(q, 31), vcast_vi_i(1))), vcast_vi_i(~1));
  s = vsub_vd_vd_vd(u, vcast_vd_vi(q));

  t = s;
  s = vmul_vd_vd_vd(s, s);
  s2 = ddmul_vd2_vd_vd(t, t);

  // Sine-of-(pi/4)s polynomial; tail terms carried in double-double.

  u = vcast_vd_d(-2.02461120785182399295868e-14);
  u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(6.94821830580179461327784e-12));
  u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(-1.75724749952853179952664e-09));
  u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(3.13361688966868392878422e-07));
  u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(-3.6576204182161551920361e-05));
  u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(0.00249039457019271850274356));
  x = ddadd2_vd2_vd_vd2(vmul_vd_vd_vd(u, s), vcast_vd2_d_d(-0.0807455121882807852484731, 3.61852475067037104849987e-18));
  x = ddadd2_vd2_vd2_vd2(ddmul_vd2_vd2_vd2(s2, x), vcast_vd2_d_d(0.785398163397448278999491, 3.06287113727155002607105e-17));

  x = ddmul_vd2_vd2_vd(x, t);
  rx = vadd_vd_vd_vd(vd2getx_vd_vd2(x), vd2gety_vd_vd2(x));

  rx =
vsel_vd_vo_vd_vd(visnegzero_vo_vd(d), vcast_vd_d(-0.0), rx);

  // Cosine polynomial, double-double tail.

  u = vcast_vd_d(9.94480387626843774090208e-16);
  u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(-3.89796226062932799164047e-13));
  u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(1.15011582539996035266901e-10));
  u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(-2.4611369501044697495359e-08));
  u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(3.59086044859052754005062e-06));
  u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(-0.000325991886927389905997954));
  x = ddadd2_vd2_vd_vd2(vmul_vd_vd_vd(u, s), vcast_vd2_d_d(0.0158543442438155018914259, -1.04693272280631521908845e-18));
  x = ddadd2_vd2_vd2_vd2(ddmul_vd2_vd2_vd2(s2, x), vcast_vd2_d_d(-0.308425137534042437259529, -1.95698492133633550338345e-17));

  x = ddadd2_vd2_vd2_vd(ddmul_vd2_vd2_vd2(x, s2), vcast_vd_d(1));
  ry = vadd_vd_vd_vd(vd2getx_vd_vd2(x), vd2gety_vd_vd2(x));

  // Octant-based swap and sign corrections (q is a multiple of 2).

  o = vcast_vo64_vo32(veq_vo_vi_vi(vand_vi_vi_vi(q, vcast_vi_i(2)), vcast_vi_i(0)));
  r = vd2setxy_vd2_vd_vd(vsel_vd_vo_vd_vd(o, rx, ry), vsel_vd_vo_vd_vd(o, ry, rx));

  o = vcast_vo64_vo32(veq_vo_vi_vi(vand_vi_vi_vi(q, vcast_vi_i(4)), vcast_vi_i(4)));
  r = vd2setx_vd2_vd2_vd(r, vreinterpret_vd_vm(vxor_vm_vm_vm(vand_vm_vo64_vm(o, vreinterpret_vm_vd(vcast_vd_d(-0.0))), vreinterpret_vm_vd(vd2getx_vd_vd2(r)))));

  o = vcast_vo64_vo32(veq_vo_vi_vi(vand_vi_vi_vi(vadd_vi_vi_vi(q, vcast_vi_i(2)), vcast_vi_i(4)), vcast_vi_i(4)));
  r = vd2sety_vd2_vd2_vd(r, vreinterpret_vd_vm(vxor_vm_vm_vm(vand_vm_vo64_vm(o, vreinterpret_vm_vd(vcast_vd_d(-0.0))), vreinterpret_vm_vd(vd2gety_vd_vd2(r)))));

  // Out-of-range |d|: sin lane -> 0, cos lane -> 1.
  o = vgt_vo_vd_vd(vabs_vd_vd(d), vcast_vd_d(TRIGRANGEMAX3/4));
  r = vd2setx_vd2_vd2_vd(r, vreinterpret_vd_vm(vandnot_vm_vo64_vm(o, vreinterpret_vm_vd(vd2getx_vd_vd2(r)))));
  r = vd2sety_vd2_vd2_vd(r, vsel_vd_vo_vd_vd(o, vcast_vd_d(1), vd2gety_vd_vd2(r)));

  // Infinite input: NaN in both lanes.
  o = visinf_vo_vd(d);
  r = vd2setx_vd2_vd2_vd(r, vreinterpret_vd_vm(vor_vm_vo64_vm(o, vreinterpret_vm_vd(vd2getx_vd_vd2(r)))));
  r =
vd2sety_vd2_vd2_vd(r, vreinterpret_vd_vm(vor_vm_vo64_vm(o, vreinterpret_vm_vd(vd2gety_vd_vd2(r)))));

  return r;
}

// Simultaneous sin(pi*d)/cos(pi*d), faster 3.5-ULP-class variant: plain
// double precision polynomials, no double-double corrections.
TYPE2_FUNCATR VECTOR_CC vdouble2 XSINCOSPI_U35(vdouble d) {
  vopmask o;
  vdouble u, s, t, rx, ry;
  vdouble2 r;

  u = vmul_vd_vd_vd(d, vcast_vd_d(4.0));
  vint q = vtruncate_vi_vd(u);
  // Round q away from zero to the nearest even integer.
  q = vand_vi_vi_vi(vadd_vi_vi_vi(q, vxor_vi_vi_vi(vsrl_vi_vi_i(q, 31), vcast_vi_i(1))), vcast_vi_i(~1));
  s = vsub_vd_vd_vd(u, vcast_vd_vi(q));

  t = s;
  s = vmul_vd_vd_vd(s, s);

  // Sine polynomial.

  u = vcast_vd_d(+0.6880638894766060136e-11);
  u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(-0.1757159564542310199e-8));
  u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(+0.3133616327257867311e-6));
  u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(-0.3657620416388486452e-4));
  u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(+0.2490394570189932103e-2));
  u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(-0.8074551218828056320e-1));
  u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(+0.7853981633974482790e+0));

  rx = vmul_vd_vd_vd(u, t);

  // Cosine polynomial.

  u = vcast_vd_d(-0.3860141213683794352e-12);
  u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(+0.1150057888029681415e-9));
  u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(-0.2461136493006663553e-7));
  u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(+0.3590860446623516713e-5));
  u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(-0.3259918869269435942e-3));
  u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(+0.1585434424381541169e-1));
  u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(-0.3084251375340424373e+0));
  u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(1));

  ry = u;

  // Octant-based swap and sign corrections.

  o = vcast_vo64_vo32(veq_vo_vi_vi(vand_vi_vi_vi(q, vcast_vi_i(2)), vcast_vi_i(0)));
  r = vd2setxy_vd2_vd_vd(vsel_vd_vo_vd_vd(o, rx, ry), vsel_vd_vo_vd_vd(o, ry, rx));

  o = vcast_vo64_vo32(veq_vo_vi_vi(vand_vi_vi_vi(q, vcast_vi_i(4)), vcast_vi_i(4)));
  r = vd2setx_vd2_vd2_vd(r, vreinterpret_vd_vm(vxor_vm_vm_vm(vand_vm_vo64_vm(o, vreinterpret_vm_vd(vcast_vd_d(-0.0))), vreinterpret_vm_vd(vd2getx_vd_vd2(r)))));

  o =
vcast_vo64_vo32(veq_vo_vi_vi(vand_vi_vi_vi(vadd_vi_vi_vi(q, vcast_vi_i(2)), vcast_vi_i(4)), vcast_vi_i(4)));
  r = vd2sety_vd2_vd2_vd(r, vreinterpret_vd_vm(vxor_vm_vm_vm(vand_vm_vo64_vm(o, vreinterpret_vm_vd(vcast_vd_d(-0.0))), vreinterpret_vm_vd(vd2gety_vd_vd2(r)))));

  // Out-of-range |d|: zero both lanes (note: unlike the U05 variant,
  // the cosine lane is zeroed here too).
  o = vgt_vo_vd_vd(vabs_vd_vd(d), vcast_vd_d(TRIGRANGEMAX3/4));
  r = vd2setx_vd2_vd2_vd(r, vreinterpret_vd_vm(vandnot_vm_vo64_vm(o, vreinterpret_vm_vd(vd2getx_vd_vd2(r)))));
  r = vd2sety_vd2_vd2_vd(r, vreinterpret_vd_vm(vandnot_vm_vo64_vm(o, vreinterpret_vm_vd(vd2gety_vd_vd2(r)))));

  // Infinite input: NaN in both lanes.
  o = visinf_vo_vd(d);
  r = vd2setx_vd2_vd2_vd(r, vreinterpret_vd_vm(vor_vm_vo64_vm(o, vreinterpret_vm_vd(vd2getx_vd_vd2(r)))));
  r = vd2sety_vd2_vd2_vd(r, vreinterpret_vd_vm(vor_vm_vo64_vm(o, vreinterpret_vm_vd(vd2gety_vd_vd2(r)))));

  return r;
}

// Vector modf(): splits x into fractional (result .x) and integral
// (result .y) parts, both carrying the sign of x.
TYPE6_FUNCATR VECTOR_CC vdouble2 XMODF(vdouble x) {
  // Remove multiples of 2^31 first so the remainder fits the 32-bit
  // integer truncation path used on the next line.
  vdouble fr = vsub_vd_vd_vd(x, vmul_vd_vd_vd(vcast_vd_d(INT64_C(1) << 31), vcast_vd_vi(vtruncate_vi_vd(vmul_vd_vd_vd(x, vcast_vd_d(1.0 / (INT64_C(1) << 31)))))));
  fr = vsub_vd_vd_vd(fr, vcast_vd_vi(vtruncate_vi_vd(fr)));
  // |x| > 2^52 is already integral: fractional part is exactly 0.
  fr = vsel_vd_vo_vd_vd(vgt_vo_vd_vd(vabs_vd_vd(x), vcast_vd_d(INT64_C(1) << 52)), vcast_vd_d(0), fr);

  vdouble2 ret;

  // copysign keeps -0.0 behavior consistent with scalar modf().
  ret = vd2setxy_vd2_vd_vd(vcopysign_vd_vd_vd(fr, x), vcopysign_vd_vd_vd(vsub_vd_vd_vd(x, fr), x));

  return ret;
}

#ifdef ENABLE_GNUABI
// GNU vector-ABI entry points: unpack the (sin, cos) pair into the two
// destination buffers via unaligned stores.
EXPORT VECTOR_CC void xsincos(vdouble a, double *ps, double *pc) {
  vdouble2 r = sincosk(a);
  vstoreu_v_p_vd(ps, vd2getx_vd_vd2(r));
  vstoreu_v_p_vd(pc, vd2gety_vd_vd2(r));
}

EXPORT VECTOR_CC void xsincos_u1(vdouble a, double *ps, double *pc) {
  vdouble2 r = sincosk_u1(a);
  vstoreu_v_p_vd(ps, vd2getx_vd_vd2(r));
  vstoreu_v_p_vd(pc, vd2gety_vd_vd2(r));
}

EXPORT VECTOR_CC void xsincospi_u05(vdouble a, double *ps, double *pc) {
  vdouble2 r = sincospik_u05(a);
  vstoreu_v_p_vd(ps, vd2getx_vd_vd2(r));
  vstoreu_v_p_vd(pc, vd2gety_vd_vd2(r));
}

EXPORT VECTOR_CC void xsincospi_u35(vdouble a,
double *ps, double *pc) {
  vdouble2 r = sincospik_u35(a);
  vstoreu_v_p_vd(ps, vd2getx_vd_vd2(r));
  vstoreu_v_p_vd(pc, vd2gety_vd_vd2(r));
}

// GNU vector-ABI modf: integral part is stored through iptr, the
// fractional part is returned.
EXPORT CONST VECTOR_CC vdouble xmodf(vdouble a, double *iptr) {
  vdouble2 r = modfk(a);
  vstoreu_v_p_vd(iptr, vd2gety_vd_vd2(r));
  return vd2getx_vd_vd2(r);
}
#endif // #ifdef ENABLE_GNUABI
#endif // #if !defined(DETERMINISTIC)

// Kernel for sin(pi*d) in double-double precision.  A single coefficient
// table is shared with the cosine series: mask o ((q & 2) == 2) selects,
// per lane, either the cosine-branch or sine-branch coefficient of each
// vsel pair.
static INLINE CONST VECTOR_CC vdouble2 sinpik(vdouble d) {
  vopmask o;
  vdouble u, s, t;
  vdouble2 x, s2;

  u = vmul_vd_vd_vd(d, vcast_vd_d(4.0));
  vint q = vtruncate_vi_vd(u);
  // Round q away from zero to the nearest even integer (octant index).
  q = vand_vi_vi_vi(vadd_vi_vi_vi(q, vxor_vi_vi_vi(vsrl_vi_vi_i(q, 31), vcast_vi_i(1))), vcast_vi_i(~1));
  o = vcast_vo64_vo32(veq_vo_vi_vi(vand_vi_vi_vi(q, vcast_vi_i(2)), vcast_vi_i(2)));

  s = vsub_vd_vd_vd(u, vcast_vd_vi(q));
  t = s;
  s = vmul_vd_vd_vd(s, s);
  s2 = ddmul_vd2_vd_vd(t, t);

  // Merged sine/cosine polynomial (per-lane coefficient selection).

  u = vsel_vd_vo_d_d(o, 9.94480387626843774090208e-16, -2.02461120785182399295868e-14);
  u = vmla_vd_vd_vd_vd(u, s, vsel_vd_vo_d_d(o, -3.89796226062932799164047e-13, 6.948218305801794613277840e-12));
  u = vmla_vd_vd_vd_vd(u, s, vsel_vd_vo_d_d(o, 1.150115825399960352669010e-10, -1.75724749952853179952664e-09));
  u = vmla_vd_vd_vd_vd(u, s, vsel_vd_vo_d_d(o, -2.46113695010446974953590e-08, 3.133616889668683928784220e-07));
  u = vmla_vd_vd_vd_vd(u, s, vsel_vd_vo_d_d(o, 3.590860448590527540050620e-06, -3.65762041821615519203610e-05));
  u = vmla_vd_vd_vd_vd(u, s, vsel_vd_vo_d_d(o, -0.000325991886927389905997954, 0.0024903945701927185027435600));
  x = ddadd2_vd2_vd_vd2(vmul_vd_vd_vd(u, s),
                        vsel_vd2_vo_d_d_d_d(o, 0.0158543442438155018914259, -1.04693272280631521908845e-18,
                                            -0.0807455121882807852484731, 3.61852475067037104849987e-18));
  x = ddadd2_vd2_vd2_vd2(ddmul_vd2_vd2_vd2(s2, x),
                         vsel_vd2_vo_d_d_d_d(o, -0.308425137534042437259529, -1.95698492133633550338345e-17,
                                             0.785398163397448278999491, 3.06287113727155002607105e-17));

  x =
ddmul_vd2_vd2_vd2(x, vsel_vd2_vo_vd2_vd2(o, s2, vcast_vd2_vd_vd(t, vcast_vd_d(0))));
  // Cosine lanes get the trailing "+ 1" of the even series.
  x = vsel_vd2_vo_vd2_vd2(o, ddadd2_vd2_vd2_vd(x, vcast_vd_d(1)), x);

  // Negate lanes in the second half-period ((q & 4) == 4).
  o = vcast_vo64_vo32(veq_vo_vi_vi(vand_vi_vi_vi(q, vcast_vi_i(4)), vcast_vi_i(4)));
  x = vd2setx_vd2_vd2_vd(x, vreinterpret_vd_vm(vxor_vm_vm_vm(vand_vm_vo64_vm(o, vreinterpret_vm_vd(vcast_vd_d(-0.0))), vreinterpret_vm_vd(vd2getx_vd_vd2(x)))));
  x = vd2sety_vd2_vd2_vd(x, vreinterpret_vd_vm(vxor_vm_vm_vm(vand_vm_vo64_vm(o, vreinterpret_vm_vd(vcast_vd_d(-0.0))), vreinterpret_vm_vd(vd2gety_vd_vd2(x)))));

  return x;
}

// sin(pi*d), 0.5-ULP-class: collapse the sinpik double-double result and
// patch the special cases (-0.0, out-of-range -> 0, Inf -> NaN).
EXPORT CONST VECTOR_CC vdouble xsinpi_u05(vdouble d) {
  vdouble2 x = sinpik(d);
  vdouble r = vadd_vd_vd_vd(vd2getx_vd_vd2(x), vd2gety_vd_vd2(x));

  r = vsel_vd_vo_vd_vd(visnegzero_vo_vd(d), vcast_vd_d(-0.0), r);
  r = vreinterpret_vd_vm(vandnot_vm_vo64_vm(vgt_vo_vd_vd(vabs_vd_vd(d), vcast_vd_d(TRIGRANGEMAX3/4)), vreinterpret_vm_vd(r)));
  r = vreinterpret_vd_vm(vor_vm_vo64_vm(visinf_vo_vd(d), vreinterpret_vm_vd(r)));

  return r;
}

// Kernel for cos(pi*d); identical structure to sinpik but with the
// sin/cos lane-selection mask inverted ((q & 2) == 0).
static INLINE CONST VECTOR_CC vdouble2 cospik(vdouble d) {
  vopmask o;
  vdouble u, s, t;
  vdouble2 x, s2;

  u = vmul_vd_vd_vd(d, vcast_vd_d(4.0));
  vint q = vtruncate_vi_vd(u);
  // Round q away from zero to the nearest even integer (octant index).
  q = vand_vi_vi_vi(vadd_vi_vi_vi(q, vxor_vi_vi_vi(vsrl_vi_vi_i(q, 31), vcast_vi_i(1))), vcast_vi_i(~1));
  o = vcast_vo64_vo32(veq_vo_vi_vi(vand_vi_vi_vi(q, vcast_vi_i(2)), vcast_vi_i(0)));

  s = vsub_vd_vd_vd(u, vcast_vd_vi(q));
  t = s;
  s = vmul_vd_vd_vd(s, s);
  s2 = ddmul_vd2_vd_vd(t, t);

  // Merged sine/cosine polynomial (per-lane coefficient selection).

  u = vsel_vd_vo_d_d(o, 9.94480387626843774090208e-16, -2.02461120785182399295868e-14);
  u = vmla_vd_vd_vd_vd(u, s, vsel_vd_vo_d_d(o, -3.89796226062932799164047e-13, 6.948218305801794613277840e-12));
  u = vmla_vd_vd_vd_vd(u, s, vsel_vd_vo_d_d(o, 1.150115825399960352669010e-10, -1.75724749952853179952664e-09));
  u = vmla_vd_vd_vd_vd(u, s, vsel_vd_vo_d_d(o, -2.46113695010446974953590e-08, 3.133616889668683928784220e-07));
  u =
vmla_vd_vd_vd_vd(u, s, vsel_vd_vo_d_d(o, 3.590860448590527540050620e-06, -3.65762041821615519203610e-05));
  u = vmla_vd_vd_vd_vd(u, s, vsel_vd_vo_d_d(o, -0.000325991886927389905997954, 0.0024903945701927185027435600));
  x = ddadd2_vd2_vd_vd2(vmul_vd_vd_vd(u, s),
                        vsel_vd2_vo_d_d_d_d(o, 0.0158543442438155018914259, -1.04693272280631521908845e-18,
                                            -0.0807455121882807852484731, 3.61852475067037104849987e-18));
  x = ddadd2_vd2_vd2_vd2(ddmul_vd2_vd2_vd2(s2, x),
                         vsel_vd2_vo_d_d_d_d(o, -0.308425137534042437259529, -1.95698492133633550338345e-17,
                                             0.785398163397448278999491, 3.06287113727155002607105e-17));

  x = ddmul_vd2_vd2_vd2(x, vsel_vd2_vo_vd2_vd2(o, s2, vcast_vd2_vd_vd(t, vcast_vd_d(0))));
  // Cosine lanes get the trailing "+ 1" of the even series.
  x = vsel_vd2_vo_vd2_vd2(o, ddadd2_vd2_vd2_vd(x, vcast_vd_d(1)), x);

  // Negate lanes where ((q + 2) & 4) == 4 (half-period sign flip).
  o = vcast_vo64_vo32(veq_vo_vi_vi(vand_vi_vi_vi(vadd_vi_vi_vi(q, vcast_vi_i(2)), vcast_vi_i(4)), vcast_vi_i(4)));
  x = vd2setx_vd2_vd2_vd(x, vreinterpret_vd_vm(vxor_vm_vm_vm(vand_vm_vo64_vm(o, vreinterpret_vm_vd(vcast_vd_d(-0.0))), vreinterpret_vm_vd(vd2getx_vd_vd2(x)))));
  x = vd2sety_vd2_vd2_vd(x, vreinterpret_vd_vm(vxor_vm_vm_vm(vand_vm_vo64_vm(o, vreinterpret_vm_vd(vcast_vd_d(-0.0))), vreinterpret_vm_vd(vd2gety_vd_vd2(x)))));

  return x;
}

// cos(pi*d), 0.5-ULP-class: collapse cospik and patch the special cases
// (out-of-range -> 1, Inf -> NaN).
EXPORT CONST VECTOR_CC vdouble xcospi_u05(vdouble d) {
  vdouble2 x = cospik(d);
  vdouble r = vadd_vd_vd_vd(vd2getx_vd_vd2(x), vd2gety_vd_vd2(x));

  r = vsel_vd_vo_vd_vd(vgt_vo_vd_vd(vabs_vd_vd(d), vcast_vd_d(TRIGRANGEMAX3/4)), vcast_vd_d(1), r);
  r = vreinterpret_vd_vm(vor_vm_vo64_vm(visinf_vo_vd(d), vreinterpret_vm_vd(r)));

  return r;
}

// Vector tangent, fast variant.  Reduces d modulo pi/2, then evaluates
// t = tan(reduced/2) by polynomial and reconstructs via the half-angle
// identity tan = 2t / (1 - t^2), with a reciprocal form for odd quadrants.
EXPORT CONST VECTOR_CC vdouble xtan(vdouble d) {
#if !defined(DETERMINISTIC)
  vdouble u, s, x, y;
  vopmask o;
  vint ql;

  if (LIKELY(vtestallones_i_vo64(vlt_vo_vd_vd(vabs_vd_vd(d), vcast_vd_d(TRIGRANGEMAX2))))) {
    // All lanes small: two-piece Cody-Waite reduction.
    vdouble dql = vrint_vd_vd(vmul_vd_vd_vd(d, vcast_vd_d(2 * M_1_PI)));
    ql = vrint_vi_vd(dql);
    x = vmla_vd_vd_vd_vd(dql,
vcast_vd_d(-PI_A2 * 0.5), d);
    x = vmla_vd_vd_vd_vd(dql, vcast_vd_d(-PI_B2 * 0.5), x);
  } else if (LIKELY(vtestallones_i_vo64(vlt_vo_vd_vd(vabs_vd_vd(d), vcast_vd_d(1e+6))))) {
    // Medium range (|d| < 1e6): split-quotient 4-piece Cody-Waite.
    vdouble dqh = vtruncate_vd_vd(vmul_vd_vd_vd(d, vcast_vd_d(2*M_1_PI / (1 << 24))));
    dqh = vmul_vd_vd_vd(dqh, vcast_vd_d(1 << 24));
    vdouble dql = vrint_vd_vd(vsub_vd_vd_vd(vmul_vd_vd_vd(d, vcast_vd_d(2*M_1_PI)), dqh));
    ql = vrint_vi_vd(dql);

    x = vmla_vd_vd_vd_vd(dqh, vcast_vd_d(-PI_A * 0.5), d);
    x = vmla_vd_vd_vd_vd(dql, vcast_vd_d(-PI_A * 0.5), x);
    x = vmla_vd_vd_vd_vd(dqh, vcast_vd_d(-PI_B * 0.5), x);
    x = vmla_vd_vd_vd_vd(dql, vcast_vd_d(-PI_B * 0.5), x);
    x = vmla_vd_vd_vd_vd(dqh, vcast_vd_d(-PI_C * 0.5), x);
    x = vmla_vd_vd_vd_vd(dql, vcast_vd_d(-PI_C * 0.5), x);
    x = vmla_vd_vd_vd_vd(vadd_vd_vd_vd(dqh, dql), vcast_vd_d(-PI_D * 0.5), x);
  } else {
    // Huge or non-finite arguments: full-range reduction via rempi().
    ddi_t ddi = rempi(d);
    ql = ddigeti_vi_ddi(ddi);
    x = vadd_vd_vd_vd(vd2getx_vd_vd2(ddigetdd_vd2_ddi(ddi)), vd2gety_vd_vd2(ddigetdd_vd2_ddi(ddi)));
    // NOTE(review): the next line is subsumed by the line after it (which
    // ORs the same Inf mask plus the NaN mask); kept as-is for fidelity.
    x = vreinterpret_vd_vm(vor_vm_vo64_vm(visinf_vo_vd(d), vreinterpret_vm_vd(x)));
    x = vreinterpret_vd_vm(vor_vm_vo64_vm(vor_vo_vo_vo(visinf_vo_vd(d), visnan_vo_vd(d)), vreinterpret_vm_vd(x)));
  }

  // Half-angle: work with x/2 so the polynomial argument stays small.
  x = vmul_vd_vd_vd(x, vcast_vd_d(0.5));
  s = vmul_vd_vd_vd(x, x);

  vdouble s2 = vmul_vd_vd_vd(s, s), s4 = vmul_vd_vd_vd(s2, s2);
  u = POLY8(s, s2, s4,
             +0.3245098826639276316e-3,
             +0.5619219738114323735e-3,
             +0.1460781502402784494e-2,
             +0.3591611540792499519e-2,
             +0.8863268409563113126e-2,
             +0.2186948728185535498e-1,
             +0.5396825399517272970e-1,
             +0.1333333333330500581e+0);

  u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(+0.3333333333333343695e+0));
  u = vmla_vd_vd_vd_vd(s, vmul_vd_vd_vd(u, x), x);

  // tan = x/y = 2u/(1 - u^2); odd quadrants use -y/x (the reciprocal).
  y = vmla_vd_vd_vd_vd(u, u, vcast_vd_d(-1));
  x = vmul_vd_vd_vd(u, vcast_vd_d(-2));

  o = vcast_vo64_vo32(veq_vo_vi_vi(vand_vi_vi_vi(ql, vcast_vi_i(1)),
vcast_vi_i(1)));
  u = vdiv_vd_vd_vd(vsel_vd_vo_vd_vd(o, vneg_vd_vd(y), x),
                    vsel_vd_vo_vd_vd(o, x, y));
  // tan(+/-0) = +/-0 exactly.
  u = vsel_vd_vo_vd_vd(veq_vo_vd_vd(d, vcast_vd_d(0)), d, u);

  return u;

#else // #if !defined(DETERMINISTIC)

  // Deterministic variant: always run the cheap reduction, then blend in
  // wider-range results per lane via mask g.
  vdouble u, s, x, y;
  vopmask o;
  vint ql;

  vdouble dql = vrint_vd_vd(vmul_vd_vd_vd(d, vcast_vd_d(2 * M_1_PI)));
  ql = vrint_vi_vd(dql);
  s = vmla_vd_vd_vd_vd(dql, vcast_vd_d(-PI_A2 * 0.5), d);
  s = vmla_vd_vd_vd_vd(dql, vcast_vd_d(-PI_B2 * 0.5), s);
  vopmask g = vlt_vo_vd_vd(vabs_vd_vd(d), vcast_vd_d(TRIGRANGEMAX2));

  if (!LIKELY(vtestallones_i_vo64(g))) {
    vdouble dqh = vtruncate_vd_vd(vmul_vd_vd_vd(d, vcast_vd_d(2*M_1_PI / (1 << 24))));
    dqh = vmul_vd_vd_vd(dqh, vcast_vd_d(1 << 24));
    vdouble dql = vrint_vd_vd(vsub_vd_vd_vd(vmul_vd_vd_vd(d, vcast_vd_d(2*M_1_PI)), dqh));

    u = vmla_vd_vd_vd_vd(dqh, vcast_vd_d(-PI_A * 0.5), d);
    u = vmla_vd_vd_vd_vd(dql, vcast_vd_d(-PI_A * 0.5), u);
    u = vmla_vd_vd_vd_vd(dqh, vcast_vd_d(-PI_B * 0.5), u);
    u = vmla_vd_vd_vd_vd(dql, vcast_vd_d(-PI_B * 0.5), u);
    u = vmla_vd_vd_vd_vd(dqh, vcast_vd_d(-PI_C * 0.5), u);
    u = vmla_vd_vd_vd_vd(dql, vcast_vd_d(-PI_C * 0.5), u);
    u = vmla_vd_vd_vd_vd(vadd_vd_vd_vd(dqh, dql), vcast_vd_d(-PI_D * 0.5), u);

    ql = vsel_vi_vo_vi_vi(vcast_vo32_vo64(g), ql, vrint_vi_vd(dql));
    s = vsel_vd_vo_vd_vd(g, s, u);
    g = vlt_vo_vd_vd(vabs_vd_vd(d), vcast_vd_d(1e+6));

    if (!LIKELY(vtestallones_i_vo64(g))) {
      ddi_t ddi = rempi(d);
      vint ql2 = ddigeti_vi_ddi(ddi);
      u = vadd_vd_vd_vd(vd2getx_vd_vd2(ddigetdd_vd2_ddi(ddi)), vd2gety_vd_vd2(ddigetdd_vd2_ddi(ddi)));
      u = vreinterpret_vd_vm(vor_vm_vo64_vm(vor_vo_vo_vo(visinf_vo_vd(d), visnan_vo_vd(d)), vreinterpret_vm_vd(u)));

      ql = vsel_vi_vo_vi_vi(vcast_vo32_vo64(g), ql, ql2);
      s = vsel_vd_vo_vd_vd(g, s, u);
    }
  }

  // Half-angle polynomial + reconstruction, same as above.
  x = vmul_vd_vd_vd(s, vcast_vd_d(0.5));
  s = vmul_vd_vd_vd(x, x);

  vdouble s2 =
vmul_vd_vd_vd(s, s), s4 = vmul_vd_vd_vd(s2, s2);
  u = POLY8(s, s2, s4,
             +0.3245098826639276316e-3,
             +0.5619219738114323735e-3,
             +0.1460781502402784494e-2,
             +0.3591611540792499519e-2,
             +0.8863268409563113126e-2,
             +0.2186948728185535498e-1,
             +0.5396825399517272970e-1,
             +0.1333333333330500581e+0);

  u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(+0.3333333333333343695e+0));
  u = vmla_vd_vd_vd_vd(s, vmul_vd_vd_vd(u, x), x);

  // tan = 2u/(1 - u^2); odd quadrants take the negated reciprocal.
  y = vmla_vd_vd_vd_vd(u, u, vcast_vd_d(-1));
  x = vmul_vd_vd_vd(u, vcast_vd_d(-2));

  o = vcast_vo64_vo32(veq_vo_vi_vi(vand_vi_vi_vi(ql, vcast_vi_i(1)), vcast_vi_i(1)));
  u = vdiv_vd_vd_vd(vsel_vd_vo_vd_vd(o, vneg_vd_vd(y), x),
                    vsel_vd_vo_vd_vd(o, x, y));
  // tan(+/-0) = +/-0 exactly.
  u = vsel_vd_vo_vd_vd(veq_vo_vd_vd(d, vcast_vd_d(0)), d, u);

  return u;
#endif // #if !defined(DETERMINISTIC)
}

// Vector tangent, higher-accuracy ("_u1") variant: reduction and
// reconstruction carried in double-double (vdouble2) precision.
EXPORT CONST VECTOR_CC vdouble xtan_u1(vdouble d) {
#if !defined(DETERMINISTIC)
  vdouble u;
  vdouble2 s, t, x, y;
  vopmask o;
  vint ql;

  if (LIKELY(vtestallones_i_vo64(vlt_vo_vd_vd(vabs_vd_vd(d), vcast_vd_d(TRIGRANGEMAX2))))) {
    // All lanes small: two-piece Cody-Waite reduction into a dd value.
    vdouble dql = vrint_vd_vd(vmul_vd_vd_vd(d, vcast_vd_d(2 * M_1_PI)));
    ql = vrint_vi_vd(dql);
    u = vmla_vd_vd_vd_vd(dql, vcast_vd_d(-PI_A2*0.5), d);
    s = ddadd_vd2_vd_vd (u, vmul_vd_vd_vd(dql, vcast_vd_d(-PI_B2*0.5)));
  } else if (LIKELY(vtestallones_i_vo64(vlt_vo_vd_vd(vabs_vd_vd(d), vcast_vd_d(TRIGRANGEMAX))))) {
    // Medium range: quotient computed in dd precision with the split
    // 2/pi constant (M_2_PI_H/M_2_PI_L) before the 4-piece reduction.
    vdouble dqh = vtruncate_vd_vd(vmul_vd_vd_vd(d, vcast_vd_d(2*M_1_PI / (1 << 24))));
    dqh = vmul_vd_vd_vd(dqh, vcast_vd_d(1 << 24));
    s = ddadd2_vd2_vd2_vd(ddmul_vd2_vd2_vd(vcast_vd2_d_d(M_2_PI_H, M_2_PI_L), d),
                          vsub_vd_vd_vd(vsel_vd_vo_vd_vd(vlt_vo_vd_vd(d, vcast_vd_d(0)),
                                                         vcast_vd_d(-0.5), vcast_vd_d(0.5)), dqh));
    const vdouble dql = vtruncate_vd_vd(vadd_vd_vd_vd(vd2getx_vd_vd2(s), vd2gety_vd_vd2(s)));
    ql = vrint_vi_vd(dql);

    u = vmla_vd_vd_vd_vd(dqh, vcast_vd_d(-PI_A * 0.5), d);
 s = ddadd_vd2_vd_vd(u, vmul_vd_vd_vd(dql, vcast_vd_d(-PI_A*0.5            )));
    s = ddadd2_vd2_vd2_vd(s, vmul_vd_vd_vd(dqh, vcast_vd_d(-PI_B*0.5)));
    s = ddadd2_vd2_vd2_vd(s, vmul_vd_vd_vd(dql, vcast_vd_d(-PI_B*0.5            )));
    s = ddadd2_vd2_vd2_vd(s, vmul_vd_vd_vd(dqh, vcast_vd_d(-PI_C*0.5)));
    s = ddadd2_vd2_vd2_vd(s, vmul_vd_vd_vd(dql, vcast_vd_d(-PI_C*0.5            )));
    s = ddadd_vd2_vd2_vd(s, vmul_vd_vd_vd(vadd_vd_vd_vd(dqh, dql), vcast_vd_d(-PI_D*0.5)));
  } else {
    // Huge or non-finite arguments: full-range reduction via rempi();
    // Inf/NaN lanes are forced to NaN.
    ddi_t ddi = rempi(d);
    ql = ddigeti_vi_ddi(ddi);
    s = ddigetdd_vd2_ddi(ddi);
    o = vor_vo_vo_vo(visinf_vo_vd(d), visnan_vo_vd(d));
    s = vd2setx_vd2_vd2_vd(s, vreinterpret_vd_vm(vor_vm_vo64_vm(o, vreinterpret_vm_vd(vd2getx_vd_vd2(s)))));
    s = vd2sety_vd2_vd2_vd(s, vreinterpret_vd_vm(vor_vm_vo64_vm(o, vreinterpret_vm_vd(vd2gety_vd_vd2(s)))));
  }

  // Half-angle in dd precision: t = s/2, s = t^2.
  t = ddscale_vd2_vd2_vd(s, vcast_vd_d(0.5));
  s = ddsqu_vd2_vd2(t);

  vdouble s2 = vmul_vd_vd_vd(vd2getx_vd_vd2(s), vd2getx_vd_vd2(s)), s4 = vmul_vd_vd_vd(s2, s2);
  u = POLY8(vd2getx_vd_vd2(s), s2, s4,
             +0.3245098826639276316e-3,
             +0.5619219738114323735e-3,
             +0.1460781502402784494e-2,
             +0.3591611540792499519e-2,
             +0.8863268409563113126e-2,
             +0.2186948728185535498e-1,
             +0.5396825399517272970e-1,
             +0.1333333333330500581e+0);

  u = vmla_vd_vd_vd_vd(u, vd2getx_vd_vd2(s), vcast_vd_d(+0.3333333333333343695e+0));
  x = ddadd_vd2_vd2_vd2(t, ddmul_vd2_vd2_vd(ddmul_vd2_vd2_vd2(s, t), u));

  // tan = -2x / (x^2 - 1) in dd; odd quadrants take the negated reciprocal.
  y = ddadd_vd2_vd_vd2(vcast_vd_d(-1), ddsqu_vd2_vd2(x));
  x = ddscale_vd2_vd2_vd(x, vcast_vd_d(-2));

  o = vcast_vo64_vo32(veq_vo_vi_vi(vand_vi_vi_vi(ql, vcast_vi_i(1)), vcast_vi_i(1)));

  x = dddiv_vd2_vd2_vd2(vsel_vd2_vo_vd2_vd2(o, ddneg_vd2_vd2(y), x),
                        vsel_vd2_vo_vd2_vd2(o, x, y));

  u = vadd_vd_vd_vd(vd2getx_vd_vd2(x), vd2gety_vd_vd2(x));

  // tan(+/-0) = +/-0 exactly.
  u = vsel_vd_vo_vd_vd(veq_vo_vd_vd(d, vcast_vd_d(0)), d, u);

  return u;

#else // #if
!defined(DETERMINISTIC)

  // Deterministic variant: branch-free blending, as in xtan above.
  vdouble u;
  vdouble2 s, t, x, y;
  vopmask o;
  vint ql;

  const vdouble dql = vrint_vd_vd(vmul_vd_vd_vd(d, vcast_vd_d(2 * M_1_PI)));
  ql = vrint_vi_vd(dql);
  u = vmla_vd_vd_vd_vd(dql, vcast_vd_d(-PI_A2*0.5), d);
  s = ddadd_vd2_vd_vd (u, vmul_vd_vd_vd(dql, vcast_vd_d(-PI_B2*0.5)));
  vopmask g = vlt_vo_vd_vd(vabs_vd_vd(d), vcast_vd_d(TRIGRANGEMAX2));

  if (!LIKELY(vtestallones_i_vo64(g))) {
    vdouble dqh = vtruncate_vd_vd(vmul_vd_vd_vd(d, vcast_vd_d(2*M_1_PI / (1 << 24))));
    dqh = vmul_vd_vd_vd(dqh, vcast_vd_d(1 << 24));
    x = ddadd2_vd2_vd2_vd(ddmul_vd2_vd2_vd(vcast_vd2_d_d(M_2_PI_H, M_2_PI_L), d),
                          vsub_vd_vd_vd(vsel_vd_vo_vd_vd(vlt_vo_vd_vd(d, vcast_vd_d(0)),
                                                         vcast_vd_d(-0.5), vcast_vd_d(0.5)), dqh));
    const vdouble dql = vtruncate_vd_vd(vadd_vd_vd_vd(vd2getx_vd_vd2(x), vd2gety_vd_vd2(x)));

    u = vmla_vd_vd_vd_vd(dqh, vcast_vd_d(-PI_A * 0.5), d);
    x = ddadd_vd2_vd_vd(u, vmul_vd_vd_vd(dql, vcast_vd_d(-PI_A*0.5            )));
    x = ddadd2_vd2_vd2_vd(x, vmul_vd_vd_vd(dqh, vcast_vd_d(-PI_B*0.5)));
    x = ddadd2_vd2_vd2_vd(x, vmul_vd_vd_vd(dql, vcast_vd_d(-PI_B*0.5            )));
    x = ddadd2_vd2_vd2_vd(x, vmul_vd_vd_vd(dqh, vcast_vd_d(-PI_C*0.5)));
    x = ddadd2_vd2_vd2_vd(x, vmul_vd_vd_vd(dql, vcast_vd_d(-PI_C*0.5            )));
    x = ddadd_vd2_vd2_vd(x, vmul_vd_vd_vd(vadd_vd_vd_vd(dqh, dql), vcast_vd_d(-PI_D*0.5)));

    ql = vsel_vi_vo_vi_vi(vcast_vo32_vo64(g), ql, vrint_vi_vd(dql));
    s = vsel_vd2_vo_vd2_vd2(g, s, x);
    g = vlt_vo_vd_vd(vabs_vd_vd(d), vcast_vd_d(TRIGRANGEMAX));

    if (!LIKELY(vtestallones_i_vo64(g))) {
      ddi_t ddi = rempi(d);
      x = ddigetdd_vd2_ddi(ddi);
      o = vor_vo_vo_vo(visinf_vo_vd(d), visnan_vo_vd(d));
      x = vd2setx_vd2_vd2_vd(x, vreinterpret_vd_vm(vor_vm_vo64_vm(o, vreinterpret_vm_vd(vd2getx_vd_vd2(x)))));
      x = vd2sety_vd2_vd2_vd(x, vreinterpret_vd_vm(vor_vm_vo64_vm(o,
vreinterpret_vm_vd(vd2gety_vd_vd2(x)))));

      ql = vsel_vi_vo_vi_vi(vcast_vo32_vo64(g), ql, ddigeti_vi_ddi(ddi));
      s = vsel_vd2_vo_vd2_vd2(g, s, x);
    }
  }

  // Half-angle in dd precision, polynomial, and reconstruction — same
  // as the non-deterministic branch of xtan_u1.
  t = ddscale_vd2_vd2_vd(s, vcast_vd_d(0.5));
  s = ddsqu_vd2_vd2(t);

  vdouble s2 = vmul_vd_vd_vd(vd2getx_vd_vd2(s), vd2getx_vd_vd2(s)), s4 = vmul_vd_vd_vd(s2, s2);
  u = POLY8(vd2getx_vd_vd2(s), s2, s4,
             +0.3245098826639276316e-3,
             +0.5619219738114323735e-3,
             +0.1460781502402784494e-2,
             +0.3591611540792499519e-2,
             +0.8863268409563113126e-2,
             +0.2186948728185535498e-1,
             +0.5396825399517272970e-1,
             +0.1333333333330500581e+0);

  u = vmla_vd_vd_vd_vd(u, vd2getx_vd_vd2(s), vcast_vd_d(+0.3333333333333343695e+0));
  x = ddadd_vd2_vd2_vd2(t, ddmul_vd2_vd2_vd(ddmul_vd2_vd2_vd2(s, t), u));

  y = ddadd_vd2_vd_vd2(vcast_vd_d(-1), ddsqu_vd2_vd2(x));
  x = ddscale_vd2_vd2_vd(x, vcast_vd_d(-2));

  o = vcast_vo64_vo32(veq_vo_vi_vi(vand_vi_vi_vi(ql, vcast_vi_i(1)), vcast_vi_i(1)));

  x = dddiv_vd2_vd2_vd2(vsel_vd2_vo_vd2_vd2(o, ddneg_vd2_vd2(y), x),
                        vsel_vd2_vo_vd2_vd2(o, x, y));

  u = vadd_vd_vd_vd(vd2getx_vd_vd2(x), vd2gety_vd_vd2(x));

  u = vsel_vd_vo_vd_vd(veq_vo_vd_vd(d, vcast_vd_d(0)), d, u);

  return u;
#endif // #if !defined(DETERMINISTIC)
}

// atan2 kernel (fast path): maps (y, x) into the first octant, evaluates
// a degree-19 odd polynomial of s = min/max, and adds q*(pi/2) where q
// encodes the quadrant adjustments accumulated below.
static INLINE CONST VECTOR_CC vdouble atan2k(vdouble y, vdouble x) {
  vdouble s, t, u;
  vint q;
  vopmask p;

  // q <- -2 for lanes where x is negative (helper semantics assumed from
  // its use here — vsel_vi_vd_vi is defined elsewhere; confirm).
  q = vsel_vi_vd_vi(x, vcast_vi_i(-2));
  x = vabs_vd_vd(x);

  // If x < y, swap the operands (s gets -x) and bump q by 1.
  q = vsel_vi_vd_vd_vi_vi(x, y, vadd_vi_vi_vi(q, vcast_vi_i(1)), q);
  p = vlt_vo_vd_vd(x, y);
  s = vsel_vd_vo_vd_vd(p, vneg_vd_vd(x), y);
  t = vmax_vd_vd_vd(x, y);

  s = vdiv_vd_vd_vd(s, t);
  t = vmul_vd_vd_vd(s, s);

  vdouble t2 = vmul_vd_vd_vd(t, t), t4 = vmul_vd_vd_vd(t2, t2), t8 = vmul_vd_vd_vd(t4, t4), t16 = vmul_vd_vd_vd(t8, t8);
  u = POLY19(t, t2, t4, t8, t16,
             -1.88796008463073496563746e-05,
             0.000209850076645816976906797,
-0.00110611831486672482563471,
             0.00370026744188713119232403,
             -0.00889896195887655491740809,
             0.016599329773529201970117,
             -0.0254517624932312641616861,
             0.0337852580001353069993897,
             -0.0407629191276836500001934,
             0.0466667150077840625632675,
             -0.0523674852303482457616113,
             0.0587666392926673580854313,
             -0.0666573579361080525984562,
             0.0769219538311769618355029,
             -0.090908995008245008229153,
             0.111111105648261418443745,
             -0.14285714266771329383765,
             0.199999999996591265594148,
             -0.333333333333311110369124);

  t = vmla_vd_vd_vd_vd(s, vmul_vd_vd_vd(t, u), s);
  // Final quadrant correction: + q * pi/2.
  t = vmla_vd_vd_vd_vd(vcast_vd_vi(q), vcast_vd_d(M_PI/2), t);

  return t;
}

// atan2 kernel, higher-accuracy variant: same octant mapping but all
// arithmetic in double-double precision, with a degree-16 polynomial plus
// four extra Horner steps, and the final q*(pi/2) added as a dd constant.
static INLINE CONST VECTOR_CC vdouble2 atan2k_u1(vdouble2 y, vdouble2 x) {
  vdouble u;
  vdouble2 s, t;
  vint q;
  vopmask p;

  q = vsel_vi_vd_vi(vd2getx_vd_vd2(x), vcast_vi_i(-2));
  // |x| computed by explicitly clearing the sign bit on both dd halves.
  p = vlt_vo_vd_vd(vd2getx_vd_vd2(x), vcast_vd_d(0));
  vmask b = vand_vm_vo64_vm(p, vreinterpret_vm_vd(vcast_vd_d(-0.0)));
  x = vd2setx_vd2_vd2_vd(x, vreinterpret_vd_vm(vxor_vm_vm_vm(b, vreinterpret_vm_vd(vd2getx_vd_vd2(x)))));
  x = vd2sety_vd2_vd2_vd(x, vreinterpret_vd_vm(vxor_vm_vm_vm(b, vreinterpret_vm_vd(vd2gety_vd_vd2(x)))));

  // If x < y, swap operands (s gets -x) and bump q by 1.
  q = vsel_vi_vd_vd_vi_vi(vd2getx_vd_vd2(x), vd2getx_vd_vd2(y), vadd_vi_vi_vi(q, vcast_vi_i(1)), q);
  p = vlt_vo_vd_vd(vd2getx_vd_vd2(x), vd2getx_vd_vd2(y));
  s = vsel_vd2_vo_vd2_vd2(p, ddneg_vd2_vd2(x), y);
  t = vsel_vd2_vo_vd2_vd2(p, y, x);

  s = dddiv_vd2_vd2_vd2(s, t);
  t = ddsqu_vd2_vd2(s);
  t = ddnormalize_vd2_vd2(t);

  vdouble t2 = vmul_vd_vd_vd(vd2getx_vd_vd2(t), vd2getx_vd_vd2(t)), t4 = vmul_vd_vd_vd(t2, t2), t8 = vmul_vd_vd_vd(t4, t4), t16 = vmul_vd_vd_vd(t8, t8);
  u = POLY16(vd2getx_vd_vd2(t), t2, t4, t8,
             1.06298484191448746607415e-05,
             -0.000125620649967286867384336,
             0.00070557664296393412389774,
-0.00251865614498713360352999,\n\t     0.00646262899036991172313504,\n\t     -0.0128281333663399031014274,\n\t     0.0208024799924145797902497,\n\t     -0.0289002344784740315686289,\n\t     0.0359785005035104590853656,\n\t     -0.041848579703592507506027,\n\t     0.0470843011653283988193763,\n\t     -0.0524914210588448421068719,\n\t     0.0587946590969581003860434,\n\t     -0.0666620884778795497194182,\n\t     0.0769225330296203768654095,\n\t     -0.0909090442773387574781907);\n  u = vmla_vd_vd_vd_vd(u, vd2getx_vd_vd2(t), vcast_vd_d(0.111111108376896236538123));\n  u = vmla_vd_vd_vd_vd(u, vd2getx_vd_vd2(t), vcast_vd_d(-0.142857142756268568062339));\n  u = vmla_vd_vd_vd_vd(u, vd2getx_vd_vd2(t), vcast_vd_d(0.199999999997977351284817));\n  u = vmla_vd_vd_vd_vd(u, vd2getx_vd_vd2(t), vcast_vd_d(-0.333333333333317605173818));\n\n  t = ddadd_vd2_vd2_vd2(s, ddmul_vd2_vd2_vd(ddmul_vd2_vd2_vd2(s, t), u));\n  \n  t = ddadd_vd2_vd2_vd2(ddmul_vd2_vd2_vd(vcast_vd2_d_d(1.570796326794896557998982, 6.12323399573676603586882e-17), vcast_vd_vi(q)), t);\n\n  return t;\n}\n\nstatic INLINE CONST VECTOR_CC vdouble visinf2_vd_vd_vd(vdouble d, vdouble m) {\n  return vreinterpret_vd_vm(vand_vm_vo64_vm(visinf_vo_vd(d), vor_vm_vm_vm(vand_vm_vm_vm(vreinterpret_vm_vd(d), vreinterpret_vm_vd(vcast_vd_d(-0.0))), vreinterpret_vm_vd(m))));\n}\n\nEXPORT CONST VECTOR_CC vdouble xatan2(vdouble y, vdouble x) {\n  vdouble r = atan2k(vabs_vd_vd(y), x);\n\n  r = vmulsign_vd_vd_vd(r, x);\n  r = vsel_vd_vo_vd_vd(vor_vo_vo_vo(visinf_vo_vd(x), veq_vo_vd_vd(x, vcast_vd_d(0))), vsub_vd_vd_vd(vcast_vd_d(M_PI/2), visinf2_vd_vd_vd(x, vmulsign_vd_vd_vd(vcast_vd_d(M_PI/2), x))), r);\n  r = vsel_vd_vo_vd_vd(visinf_vo_vd(y), vsub_vd_vd_vd(vcast_vd_d(M_PI/2), visinf2_vd_vd_vd(x, vmulsign_vd_vd_vd(vcast_vd_d(M_PI/4), x))), r);\n  r = vsel_vd_vo_vd_vd(veq_vo_vd_vd(y, vcast_vd_d(0.0)), vreinterpret_vd_vm(vand_vm_vo64_vm(vsignbit_vo_vd(x), vreinterpret_vm_vd(vcast_vd_d(M_PI)))), r);\n\n  r = 
vreinterpret_vd_vm(vor_vm_vo64_vm(vor_vo_vo_vo(visnan_vo_vd(x), visnan_vo_vd(y)), vreinterpret_vm_vd(vmulsign_vd_vd_vd(r, y))));\n  return r;\n}\n\nEXPORT CONST VECTOR_CC vdouble xatan2_u1(vdouble y, vdouble x) {\n  vopmask o = vlt_vo_vd_vd(vabs_vd_vd(x), vcast_vd_d(5.5626846462680083984e-309)); // nexttoward((1.0 / DBL_MAX), 1)\n  x = vsel_vd_vo_vd_vd(o, vmul_vd_vd_vd(x, vcast_vd_d(UINT64_C(1) << 53)), x);\n  y = vsel_vd_vo_vd_vd(o, vmul_vd_vd_vd(y, vcast_vd_d(UINT64_C(1) << 53)), y);\n\n  vdouble2 d = atan2k_u1(vcast_vd2_vd_vd(vabs_vd_vd(y), vcast_vd_d(0)), vcast_vd2_vd_vd(x, vcast_vd_d(0)));\n  vdouble r = vadd_vd_vd_vd(vd2getx_vd_vd2(d), vd2gety_vd_vd2(d));\n\n  r = vmulsign_vd_vd_vd(r, x);\n  r = vsel_vd_vo_vd_vd(vor_vo_vo_vo(visinf_vo_vd(x), veq_vo_vd_vd(x, vcast_vd_d(0))), vsub_vd_vd_vd(vcast_vd_d(M_PI/2), visinf2_vd_vd_vd(x, vmulsign_vd_vd_vd(vcast_vd_d(M_PI/2), x))), r);\n  r = vsel_vd_vo_vd_vd(visinf_vo_vd(y), vsub_vd_vd_vd(vcast_vd_d(M_PI/2), visinf2_vd_vd_vd(x, vmulsign_vd_vd_vd(vcast_vd_d(M_PI/4), x))), r);\n  r = vsel_vd_vo_vd_vd(veq_vo_vd_vd(y, vcast_vd_d(0.0)), vreinterpret_vd_vm(vand_vm_vo64_vm(vsignbit_vo_vd(x), vreinterpret_vm_vd(vcast_vd_d(M_PI)))), r);\n\n  r = vreinterpret_vd_vm(vor_vm_vo64_vm(vor_vo_vo_vo(visnan_vo_vd(x), visnan_vo_vd(y)), vreinterpret_vm_vd(vmulsign_vd_vd_vd(r, y))));\n  return r;\n}\n\nEXPORT CONST VECTOR_CC vdouble xasin(vdouble d) {\n  vopmask o = vlt_vo_vd_vd(vabs_vd_vd(d), vcast_vd_d(0.5));\n  vdouble x2 = vsel_vd_vo_vd_vd(o, vmul_vd_vd_vd(d, d), vmul_vd_vd_vd(vsub_vd_vd_vd(vcast_vd_d(1), vabs_vd_vd(d)), vcast_vd_d(0.5)));\n  vdouble x = vsel_vd_vo_vd_vd(o, vabs_vd_vd(d), vsqrt_vd_vd(x2)), u;\n\n  vdouble x4 = vmul_vd_vd_vd(x2, x2), x8 = vmul_vd_vd_vd(x4, x4), x16 = vmul_vd_vd_vd(x8, x8);\n  u = POLY12(x2, x4, x8, x16,\n\t     +0.3161587650653934628e-1,\n\t     -0.1581918243329996643e-1,\n\t     +0.1929045477267910674e-1,\n\t     +0.6606077476277170610e-2,\n\t     +0.1215360525577377331e-1,\n\t     
+0.1388715184501609218e-1,\n\t     +0.1735956991223614604e-1,\n\t     +0.2237176181932048341e-1,\n\t     +0.3038195928038132237e-1,\n\t     +0.4464285681377102438e-1,\n\t     +0.7500000000378581611e-1,\n\t     +0.1666666666666497543e+0);\n\n  u = vmla_vd_vd_vd_vd(u, vmul_vd_vd_vd(x, x2), x);\n  \n  vdouble r = vsel_vd_vo_vd_vd(o, u, vmla_vd_vd_vd_vd(u, vcast_vd_d(-2), vcast_vd_d(M_PI/2)));\n  return vmulsign_vd_vd_vd(r, d);\n}\n\nEXPORT CONST VECTOR_CC vdouble xasin_u1(vdouble d) {\n  vopmask o = vlt_vo_vd_vd(vabs_vd_vd(d), vcast_vd_d(0.5));\n  vdouble x2 = vsel_vd_vo_vd_vd(o, vmul_vd_vd_vd(d, d), vmul_vd_vd_vd(vsub_vd_vd_vd(vcast_vd_d(1), vabs_vd_vd(d)), vcast_vd_d(0.5))), u;\n  vdouble2 x = vsel_vd2_vo_vd2_vd2(o, vcast_vd2_vd_vd(vabs_vd_vd(d), vcast_vd_d(0)), ddsqrt_vd2_vd(x2));\n  x = vsel_vd2_vo_vd2_vd2(veq_vo_vd_vd(vabs_vd_vd(d), vcast_vd_d(1.0)), vcast_vd2_d_d(0, 0), x);\n\n  vdouble x4 = vmul_vd_vd_vd(x2, x2), x8 = vmul_vd_vd_vd(x4, x4), x16 = vmul_vd_vd_vd(x8, x8);\n  u = POLY12(x2, x4, x8, x16,\n\t     +0.3161587650653934628e-1,\n\t     -0.1581918243329996643e-1,\n\t     +0.1929045477267910674e-1,\n\t     +0.6606077476277170610e-2,\n\t     +0.1215360525577377331e-1,\n\t     +0.1388715184501609218e-1,\n\t     +0.1735956991223614604e-1,\n\t     +0.2237176181932048341e-1,\n\t     +0.3038195928038132237e-1,\n\t     +0.4464285681377102438e-1,\n\t     +0.7500000000378581611e-1,\n\t     +0.1666666666666497543e+0);\n\n  u = vmul_vd_vd_vd(u, vmul_vd_vd_vd(x2, vd2getx_vd_vd2(x)));\n\n  vdouble2 y = ddsub_vd2_vd2_vd(ddsub_vd2_vd2_vd2(vcast_vd2_d_d(3.141592653589793116/4, 1.2246467991473532072e-16/4), x), u);\n  \n  vdouble r = vsel_vd_vo_vd_vd(o, vadd_vd_vd_vd(u, vd2getx_vd_vd2(x)),\n\t\t\t       vmul_vd_vd_vd(vadd_vd_vd_vd(vd2getx_vd_vd2(y), vd2gety_vd_vd2(y)), vcast_vd_d(2)));\n  return vmulsign_vd_vd_vd(r, d);\n}\n\nEXPORT CONST VECTOR_CC vdouble xacos(vdouble d) {\n  vopmask o = vlt_vo_vd_vd(vabs_vd_vd(d), vcast_vd_d(0.5));\n  vdouble x2 = vsel_vd_vo_vd_vd(o, 
vmul_vd_vd_vd(d, d),\n\t\t\t\tvmul_vd_vd_vd(vsub_vd_vd_vd(vcast_vd_d(1), vabs_vd_vd(d)), vcast_vd_d(0.5))), u;\n  vdouble x = vsel_vd_vo_vd_vd(o, vabs_vd_vd(d), vsqrt_vd_vd(x2));\n  x = vsel_vd_vo_vd_vd(veq_vo_vd_vd(vabs_vd_vd(d), vcast_vd_d(1.0)), vcast_vd_d(0), x);\n\n  vdouble x4 = vmul_vd_vd_vd(x2, x2), x8 = vmul_vd_vd_vd(x4, x4), x16 = vmul_vd_vd_vd(x8, x8);\n  u = POLY12(x2, x4, x8, x16,\n\t     +0.3161587650653934628e-1,\n\t     -0.1581918243329996643e-1,\n\t     +0.1929045477267910674e-1,\n\t     +0.6606077476277170610e-2,\n\t     +0.1215360525577377331e-1,\n\t     +0.1388715184501609218e-1,\n\t     +0.1735956991223614604e-1,\n\t     +0.2237176181932048341e-1,\n\t     +0.3038195928038132237e-1,\n\t     +0.4464285681377102438e-1,\n\t     +0.7500000000378581611e-1,\n\t     +0.1666666666666497543e+0);\n\n  u = vmul_vd_vd_vd(u, vmul_vd_vd_vd(x2, x));\n\n  vdouble y = vsub_vd_vd_vd(vcast_vd_d(M_PI/2), vadd_vd_vd_vd(vmulsign_vd_vd_vd(x, d), vmulsign_vd_vd_vd(u, d)));\n  x = vadd_vd_vd_vd(x, u);\n  vdouble r = vsel_vd_vo_vd_vd(o, y, vmul_vd_vd_vd(x, vcast_vd_d(2)));\n  return vsel_vd_vo_vd_vd(vandnot_vo_vo_vo(o, vlt_vo_vd_vd(d, vcast_vd_d(0))),\n\t\t\t  vd2getx_vd_vd2(ddadd_vd2_vd2_vd(vcast_vd2_d_d(3.141592653589793116, 1.2246467991473532072e-16),\n\t\t\t\t\t\t\t  vneg_vd_vd(r))), r);\n}\n\nEXPORT CONST VECTOR_CC vdouble xacos_u1(vdouble d) {\n  vopmask o = vlt_vo_vd_vd(vabs_vd_vd(d), vcast_vd_d(0.5));\n  vdouble x2 = vsel_vd_vo_vd_vd(o, vmul_vd_vd_vd(d, d), vmul_vd_vd_vd(vsub_vd_vd_vd(vcast_vd_d(1), vabs_vd_vd(d)), vcast_vd_d(0.5))), u;\n  vdouble2 x = vsel_vd2_vo_vd2_vd2(o, vcast_vd2_vd_vd(vabs_vd_vd(d), vcast_vd_d(0)), ddsqrt_vd2_vd(x2));\n  x = vsel_vd2_vo_vd2_vd2(veq_vo_vd_vd(vabs_vd_vd(d), vcast_vd_d(1.0)), vcast_vd2_d_d(0, 0), x);\n\n  vdouble x4 = vmul_vd_vd_vd(x2, x2), x8 = vmul_vd_vd_vd(x4, x4), x16 = vmul_vd_vd_vd(x8, x8);\n  u = POLY12(x2, x4, x8, x16,\n\t     +0.3161587650653934628e-1,\n\t     -0.1581918243329996643e-1,\n\t     
+0.1929045477267910674e-1,\n\t     +0.6606077476277170610e-2,\n\t     +0.1215360525577377331e-1,\n\t     +0.1388715184501609218e-1,\n\t     +0.1735956991223614604e-1,\n\t     +0.2237176181932048341e-1,\n\t     +0.3038195928038132237e-1,\n\t     +0.4464285681377102438e-1,\n\t     +0.7500000000378581611e-1,\n\t     +0.1666666666666497543e+0);\n\n  u = vmul_vd_vd_vd(u, vmul_vd_vd_vd(x2, vd2getx_vd_vd2(x)));\n\n  vdouble2 y = ddsub_vd2_vd2_vd2(vcast_vd2_d_d(3.141592653589793116/2, 1.2246467991473532072e-16/2),\n\t\t\t\t ddadd_vd2_vd_vd(vmulsign_vd_vd_vd(vd2getx_vd_vd2(x), d), vmulsign_vd_vd_vd(u, d)));\n  x = ddadd_vd2_vd2_vd(x, u);\n  \n  y = vsel_vd2_vo_vd2_vd2(o, y, ddscale_vd2_vd2_vd(x, vcast_vd_d(2)));\n  \n  y = vsel_vd2_vo_vd2_vd2(vandnot_vo_vo_vo(o, vlt_vo_vd_vd(d, vcast_vd_d(0))),\n\t\t\t  ddsub_vd2_vd2_vd2(vcast_vd2_d_d(3.141592653589793116, 1.2246467991473532072e-16), y), y);\n\n  return vadd_vd_vd_vd(vd2getx_vd_vd2(y), vd2gety_vd_vd2(y));\n}\n\nEXPORT CONST VECTOR_CC vdouble xatan_u1(vdouble d) {\n  vdouble2 d2 = atan2k_u1(vcast_vd2_vd_vd(vabs_vd_vd(d), vcast_vd_d(0)), vcast_vd2_d_d(1, 0));\n  vdouble r = vadd_vd_vd_vd(vd2getx_vd_vd2(d2), vd2gety_vd_vd2(d2));\n  r = vsel_vd_vo_vd_vd(visinf_vo_vd(d), vcast_vd_d(1.570796326794896557998982), r);\n  return vmulsign_vd_vd_vd(r, d);\n}\n\nEXPORT CONST VECTOR_CC vdouble xatan(vdouble s) {\n  vdouble t, u;\n  vint q;\n#if defined(__INTEL_COMPILER) && defined(ENABLE_PURECFMA_SCALAR)\n  vdouble w = s;\n#endif\n\n  q = vsel_vi_vd_vi(s, vcast_vi_i(2));\n  s = vabs_vd_vd(s);\n\n  q = vsel_vi_vd_vd_vi_vi(vcast_vd_d(1), s, vadd_vi_vi_vi(q, vcast_vi_i(1)), q);\n  s = vsel_vd_vo_vd_vd(vlt_vo_vd_vd(vcast_vd_d(1), s), vrec_vd_vd(s), s);\n\n  t = vmul_vd_vd_vd(s, s);\n\n  vdouble t2 = vmul_vd_vd_vd(t, t), t4 = vmul_vd_vd_vd(t2, t2), t8 = vmul_vd_vd_vd(t4, t4), t16 = vmul_vd_vd_vd(t8, t8);\n  u = POLY19(t, t2, t4, t8, t16,\n\t     -1.88796008463073496563746e-05,\n\t     0.000209850076645816976906797,\n\t     
-0.00110611831486672482563471,\n\t     0.00370026744188713119232403,\n\t     -0.00889896195887655491740809,\n\t     0.016599329773529201970117,\n\t     -0.0254517624932312641616861,\n\t     0.0337852580001353069993897,\n\t     -0.0407629191276836500001934,\n\t     0.0466667150077840625632675,\n\t     -0.0523674852303482457616113,\n\t     0.0587666392926673580854313,\n\t     -0.0666573579361080525984562,\n\t     0.0769219538311769618355029,\n\t     -0.090908995008245008229153,\n\t     0.111111105648261418443745,\n\t     -0.14285714266771329383765,\n\t     0.199999999996591265594148,\n\t     -0.333333333333311110369124);\n  \n  t = vmla_vd_vd_vd_vd(s, vmul_vd_vd_vd(t, u), s);\n\n  t = vsel_vd_vo_vd_vd(vcast_vo64_vo32(veq_vo_vi_vi(vand_vi_vi_vi(q, vcast_vi_i(1)), vcast_vi_i(1))), vsub_vd_vd_vd(vcast_vd_d(M_PI/2), t), t);\n  t = vreinterpret_vd_vm(vxor_vm_vm_vm(vand_vm_vo64_vm(vcast_vo64_vo32(veq_vo_vi_vi(vand_vi_vi_vi(q, vcast_vi_i(2)), vcast_vi_i(2))), vreinterpret_vm_vd(vcast_vd_d(-0.0))), vreinterpret_vm_vd(t)));\n\n#if defined(__INTEL_COMPILER) && defined(ENABLE_PURECFMA_SCALAR)\n  t = vsel_vd_vo_vd_vd(veq_vo_vd_vd(w, vcast_vd_d(0)), w, t);\n#endif\n\n  return t;\n}\n\n#if !defined(DETERMINISTIC)\nEXPORT CONST VECTOR_CC vdouble xlog(vdouble d) {\n  vdouble x, x2;\n  vdouble t, m;\n  \n#if !defined(ENABLE_AVX512F) && !defined(ENABLE_AVX512FNOFMA)\n  vopmask o = vlt_vo_vd_vd(d, vcast_vd_d(DBL_MIN));\n  d = vsel_vd_vo_vd_vd(o, vmul_vd_vd_vd(d, vcast_vd_d((double)(INT64_C(1) << 32) * (double)(INT64_C(1) << 32))), d);\n  vint e = vilogb2k_vi_vd(vmul_vd_vd_vd(d, vcast_vd_d(1.0/0.75)));\n  m = vldexp3_vd_vd_vi(d, vneg_vi_vi(e));\n  e = vsel_vi_vo_vi_vi(vcast_vo32_vo64(o), vsub_vi_vi_vi(e, vcast_vi_i(64)), e);\n#else\n  vdouble e = vgetexp_vd_vd(vmul_vd_vd_vd(d, vcast_vd_d(1.0/0.75)));\n  e = vsel_vd_vo_vd_vd(vispinf_vo_vd(e), vcast_vd_d(1024.0), e);\n  m = vgetmant_vd_vd(d);\n#endif\n  \n  x = vdiv_vd_vd_vd(vsub_vd_vd_vd(m, vcast_vd_d(1)), vadd_vd_vd_vd(vcast_vd_d(1), 
m));\n  x2 = vmul_vd_vd_vd(x, x);\n\n  vdouble x4 = vmul_vd_vd_vd(x2, x2), x8 = vmul_vd_vd_vd(x4, x4), x3 = vmul_vd_vd_vd(x, x2);\n  t = POLY7(x2, x4, x8,\n\t    0.153487338491425068243146,\n\t    0.152519917006351951593857,\n\t    0.181863266251982985677316,\n\t    0.222221366518767365905163,\n\t    0.285714294746548025383248,\n\t    0.399999999950799600689777,\n\t    0.6666666666667778740063);\n\n#if !defined(ENABLE_AVX512F) && !defined(ENABLE_AVX512FNOFMA)\n  x = vmla_vd_vd_vd_vd(x, vcast_vd_d(2), vmul_vd_vd_vd(vcast_vd_d(0.693147180559945286226764), vcast_vd_vi(e)));\n  x = vmla_vd_vd_vd_vd(x3, t, x);\n\n  x = vsel_vd_vo_vd_vd(vispinf_vo_vd(d), vcast_vd_d(SLEEF_INFINITY), x);\n  x = vsel_vd_vo_vd_vd(vor_vo_vo_vo(vlt_vo_vd_vd(d, vcast_vd_d(0)), visnan_vo_vd(d)), vcast_vd_d(SLEEF_NAN), x);\n  x = vsel_vd_vo_vd_vd(veq_vo_vd_vd(d, vcast_vd_d(0)), vcast_vd_d(-SLEEF_INFINITY), x);\n#else\n  x = vmla_vd_vd_vd_vd(x, vcast_vd_d(2), vmul_vd_vd_vd(vcast_vd_d(0.693147180559945286226764), e));\n  x = vmla_vd_vd_vd_vd(x3, t, x);\n\n  x = vfixup_vd_vd_vd_vi2_i(x, d, vcast_vi2_i((5 << (5*4))), 0);\n#endif\n\n  return x;\n}\n#endif // #if !defined(DETERMINISTIC)\n\nEXPORT CONST VECTOR_CC vdouble xexp(vdouble d) {\n  vdouble u = vrint_vd_vd(vmul_vd_vd_vd(d, vcast_vd_d(R_LN2))), s;\n  vint q = vrint_vi_vd(u);\n\n  s = vmla_vd_vd_vd_vd(u, vcast_vd_d(-L2U), d);\n  s = vmla_vd_vd_vd_vd(u, vcast_vd_d(-L2L), s);\n\n#ifdef ENABLE_FMA_DP\n  vdouble s2 = vmul_vd_vd_vd(s, s), s4 = vmul_vd_vd_vd(s2, s2), s8 = vmul_vd_vd_vd(s4, s4);\n  u = POLY10(s, s2, s4, s8, \n\t     +0.2081276378237164457e-8,\n\t     +0.2511210703042288022e-7,\n\t     +0.2755762628169491192e-6,\n\t     +0.2755723402025388239e-5,\n\t     +0.2480158687479686264e-4,\n\t     +0.1984126989855865850e-3,\n\t     +0.1388888888914497797e-2,\n\t     +0.8333333333314938210e-2,\n\t     +0.4166666666666602598e-1,\n\t     +0.1666666666666669072e+0);\n  u = vfma_vd_vd_vd_vd(u, s, vcast_vd_d(+0.5000000000000000000e+0));\n  u = 
vfma_vd_vd_vd_vd(u, s, vcast_vd_d(+0.1000000000000000000e+1));\n  u = vfma_vd_vd_vd_vd(u, s, vcast_vd_d(+0.1000000000000000000e+1));\n#else // #ifdef ENABLE_FMA_DP\n  vdouble s2 = vmul_vd_vd_vd(s, s), s4 = vmul_vd_vd_vd(s2, s2), s8 = vmul_vd_vd_vd(s4, s4);\n  u = POLY10(s, s2, s4, s8,\n\t     2.08860621107283687536341e-09,\n\t     2.51112930892876518610661e-08,\n\t     2.75573911234900471893338e-07,\n\t     2.75572362911928827629423e-06,\n\t     2.4801587159235472998791e-05,\n\t     0.000198412698960509205564975,\n\t     0.00138888888889774492207962,\n\t     0.00833333333331652721664984,\n\t     0.0416666666666665047591422,\n\t     0.166666666666666851703837);\n  u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(+0.5000000000000000000e+0));\n\n  u = vadd_vd_vd_vd(vcast_vd_d(1), vmla_vd_vd_vd_vd(vmul_vd_vd_vd(s, s), u, s));\n#endif // #ifdef ENABLE_FMA_DP\n  \n  u = vldexp2_vd_vd_vi(u, q);\n\n  u = vsel_vd_vo_vd_vd(vgt_vo_vd_vd(d, vcast_vd_d(709.78271114955742909217217426)), vcast_vd_d(SLEEF_INFINITY), u);\n  u = vreinterpret_vd_vm(vandnot_vm_vo64_vm(vlt_vo_vd_vd(d, vcast_vd_d(-1000)), vreinterpret_vm_vd(u)));\n\n  return u;\n}\n\nstatic INLINE CONST VECTOR_CC vdouble expm1k(vdouble d) {\n  vdouble u = vrint_vd_vd(vmul_vd_vd_vd(d, vcast_vd_d(R_LN2))), s;\n  vint q = vrint_vi_vd(u);\n\n  s = vmla_vd_vd_vd_vd(u, vcast_vd_d(-L2U), d);\n  s = vmla_vd_vd_vd_vd(u, vcast_vd_d(-L2L), s);\n\n  vdouble s2 = vmul_vd_vd_vd(s, s), s4 = vmul_vd_vd_vd(s2, s2), s8 = vmul_vd_vd_vd(s4, s4);\n  u = POLY10(s, s2, s4, s8,\n\t     2.08860621107283687536341e-09,\n\t     2.51112930892876518610661e-08,\n\t     2.75573911234900471893338e-07,\n\t     2.75572362911928827629423e-06,\n\t     2.4801587159235472998791e-05,\n\t     0.000198412698960509205564975,\n\t     0.00138888888889774492207962,\n\t     0.00833333333331652721664984,\n\t     0.0416666666666665047591422,\n\t     0.166666666666666851703837);\n\n  u = vadd_vd_vd_vd(vmla_vd_vd_vd_vd(s2, vcast_vd_d(0.5), vmul_vd_vd_vd(vmul_vd_vd_vd(s2, s), u)), 
s);\n  \n  u = vsel_vd_vo_vd_vd(vcast_vo64_vo32(veq_vo_vi_vi(q, vcast_vi_i(0))), u,\n\t\t       vsub_vd_vd_vd(vldexp2_vd_vd_vi(vadd_vd_vd_vd(u, vcast_vd_d(1)), q), vcast_vd_d(1)));\n\n  return u;\n}\n\nstatic INLINE CONST VECTOR_CC vdouble2 logk(vdouble d) {\n  vdouble2 x, x2, s;\n  vdouble t, m;\n\n#if !defined(ENABLE_AVX512F) && !defined(ENABLE_AVX512FNOFMA)\n  vopmask o = vlt_vo_vd_vd(d, vcast_vd_d(DBL_MIN));\n  d = vsel_vd_vo_vd_vd(o, vmul_vd_vd_vd(d, vcast_vd_d((double)(INT64_C(1) << 32) * (double)(INT64_C(1) << 32))), d);\n  vint e = vilogb2k_vi_vd(vmul_vd_vd_vd(d, vcast_vd_d(1.0/0.75)));\n  m = vldexp3_vd_vd_vi(d, vneg_vi_vi(e));\n  e = vsel_vi_vo_vi_vi(vcast_vo32_vo64(o), vsub_vi_vi_vi(e, vcast_vi_i(64)), e);\n#else\n  vdouble e = vgetexp_vd_vd(vmul_vd_vd_vd(d, vcast_vd_d(1.0/0.75)));\n  e = vsel_vd_vo_vd_vd(vispinf_vo_vd(e), vcast_vd_d(1024.0), e);\n  m = vgetmant_vd_vd(d);\n#endif\n\n  x = dddiv_vd2_vd2_vd2(ddadd2_vd2_vd_vd(vcast_vd_d(-1), m), ddadd2_vd2_vd_vd(vcast_vd_d(1), m));\n  x2 = ddsqu_vd2_vd2(x);\n\n  vdouble x4 = vmul_vd_vd_vd(vd2getx_vd_vd2(x2), vd2getx_vd_vd2(x2)), x8 = vmul_vd_vd_vd(x4, x4), x16 = vmul_vd_vd_vd(x8, x8);\n  t = POLY9(vd2getx_vd_vd2(x2), x4, x8, x16,\n\t    0.116255524079935043668677,\n\t    0.103239680901072952701192,\n\t    0.117754809412463995466069,\n\t    0.13332981086846273921509,\n\t    0.153846227114512262845736,\n\t    0.181818180850050775676507,\n\t    0.222222222230083560345903,\n\t    0.285714285714249172087875,\n\t    0.400000000000000077715612);\n\n  vdouble2 c = vcast_vd2_d_d(0.666666666666666629659233, 3.80554962542412056336616e-17);\n#if !defined(ENABLE_AVX512F) && !defined(ENABLE_AVX512FNOFMA)\n  s = ddmul_vd2_vd2_vd(vcast_vd2_d_d(0.693147180559945286226764, 2.319046813846299558417771e-17), vcast_vd_vi(e));\n#else\n  s = ddmul_vd2_vd2_vd(vcast_vd2_d_d(0.693147180559945286226764, 2.319046813846299558417771e-17), e);\n#endif\n  s = ddadd_vd2_vd2_vd2(s, ddscale_vd2_vd2_vd(x, vcast_vd_d(2)));\n  x = 
ddmul_vd2_vd2_vd2(x2, x);\n  s = ddadd_vd2_vd2_vd2(s, ddmul_vd2_vd2_vd2(x, c));\n  x = ddmul_vd2_vd2_vd2(x2, x);\n  s = ddadd_vd2_vd2_vd2(s, ddmul_vd2_vd2_vd(x, t));\n\n  return s;\n}\n\n#if !defined(DETERMINISTIC)\nEXPORT CONST VECTOR_CC vdouble xlog_u1(vdouble d) {\n  vdouble2 x;\n  vdouble t, m, x2;\n\n#if !defined(ENABLE_AVX512F) && !defined(ENABLE_AVX512FNOFMA)\n  vopmask o = vlt_vo_vd_vd(d, vcast_vd_d(DBL_MIN));\n  d = vsel_vd_vo_vd_vd(o, vmul_vd_vd_vd(d, vcast_vd_d((double)(INT64_C(1) << 32) * (double)(INT64_C(1) << 32))), d);\n  vint e = vilogb2k_vi_vd(vmul_vd_vd_vd(d, vcast_vd_d(1.0/0.75)));\n  m = vldexp3_vd_vd_vi(d, vneg_vi_vi(e));\n  e = vsel_vi_vo_vi_vi(vcast_vo32_vo64(o), vsub_vi_vi_vi(e, vcast_vi_i(64)), e);\n#else\n  vdouble e = vgetexp_vd_vd(vmul_vd_vd_vd(d, vcast_vd_d(1.0/0.75)));\n  e = vsel_vd_vo_vd_vd(vispinf_vo_vd(e), vcast_vd_d(1024.0), e);\n  m = vgetmant_vd_vd(d);\n#endif\n\n  x = dddiv_vd2_vd2_vd2(ddadd2_vd2_vd_vd(vcast_vd_d(-1), m), ddadd2_vd2_vd_vd(vcast_vd_d(1), m));\n  x2 = vmul_vd_vd_vd(vd2getx_vd_vd2(x), vd2getx_vd_vd2(x));\n\n  vdouble x4 = vmul_vd_vd_vd(x2, x2), x8 = vmul_vd_vd_vd(x4, x4);\n  t = POLY7(x2, x4, x8,\n\t    0.1532076988502701353e+0,\n\t    0.1525629051003428716e+0,\n\t    0.1818605932937785996e+0,\n\t    0.2222214519839380009e+0,\n\t    0.2857142932794299317e+0,\n\t    0.3999999999635251990e+0,\n\t    0.6666666666667333541e+0);\n\n#if !defined(ENABLE_AVX512F) && !defined(ENABLE_AVX512FNOFMA)\n  vdouble2 s = ddmul_vd2_vd2_vd(vcast_vd2_d_d(0.693147180559945286226764, 2.319046813846299558417771e-17), vcast_vd_vi(e));\n#else\n  vdouble2 s = ddmul_vd2_vd2_vd(vcast_vd2_d_d(0.693147180559945286226764, 2.319046813846299558417771e-17), e);\n#endif\n\n  s = ddadd_vd2_vd2_vd2(s, ddscale_vd2_vd2_vd(x, vcast_vd_d(2)));\n  s = ddadd_vd2_vd2_vd(s, vmul_vd_vd_vd(vmul_vd_vd_vd(x2, vd2getx_vd_vd2(x)), t));\n\n  vdouble r = vadd_vd_vd_vd(vd2getx_vd_vd2(s), vd2gety_vd_vd2(s));\n\n#if !defined(ENABLE_AVX512F) && 
!defined(ENABLE_AVX512FNOFMA)\n  r = vsel_vd_vo_vd_vd(vispinf_vo_vd(d), vcast_vd_d(SLEEF_INFINITY), r);\n  r = vsel_vd_vo_vd_vd(vor_vo_vo_vo(vlt_vo_vd_vd(d, vcast_vd_d(0)), visnan_vo_vd(d)), vcast_vd_d(SLEEF_NAN), r);\n  r = vsel_vd_vo_vd_vd(veq_vo_vd_vd(d, vcast_vd_d(0)), vcast_vd_d(-SLEEF_INFINITY), r);\n#else\n  r = vfixup_vd_vd_vd_vi2_i(r, d, vcast_vi2_i((4 << (2*4)) | (3 << (4*4)) | (5 << (5*4)) | (2 << (6*4))), 0);\n#endif\n  \n  return r;\n}\n#endif // #if !defined(DETERMINISTIC)\n\nstatic INLINE CONST VECTOR_CC vdouble expk(vdouble2 d) {\n  vdouble u = vmul_vd_vd_vd(vadd_vd_vd_vd(vd2getx_vd_vd2(d), vd2gety_vd_vd2(d)), vcast_vd_d(R_LN2));\n  vdouble dq = vrint_vd_vd(u);\n  vint q = vrint_vi_vd(dq);\n  vdouble2 s, t;\n\n  s = ddadd2_vd2_vd2_vd(d, vmul_vd_vd_vd(dq, vcast_vd_d(-L2U)));\n  s = ddadd2_vd2_vd2_vd(s, vmul_vd_vd_vd(dq, vcast_vd_d(-L2L)));\n\n  s = ddnormalize_vd2_vd2(s);\n\n  vdouble s2 = vmul_vd_vd_vd(vd2getx_vd_vd2(s), vd2getx_vd_vd2(s)), s4 = vmul_vd_vd_vd(s2, s2), s8 = vmul_vd_vd_vd(s4, s4);\n  u = POLY10(vd2getx_vd_vd2(s), s2, s4, s8,\n\t     2.51069683420950419527139e-08,\n\t     2.76286166770270649116855e-07,\n\t     2.75572496725023574143864e-06,\n\t     2.48014973989819794114153e-05,\n\t     0.000198412698809069797676111,\n\t     0.0013888888939977128960529,\n\t     0.00833333333332371417601081,\n\t     0.0416666666665409524128449,\n\t     0.166666666666666740681535,\n\t     0.500000000000000999200722);\n\n  t = ddadd_vd2_vd_vd2(vcast_vd_d(1), s);\n  t = ddadd_vd2_vd2_vd2(t, ddmul_vd2_vd2_vd(ddsqu_vd2_vd2(s), u));\n\n  u = vadd_vd_vd_vd(vd2getx_vd_vd2(t), vd2gety_vd_vd2(t));\n  u = vldexp2_vd_vd_vi(u, q);\n\n  u = vreinterpret_vd_vm(vandnot_vm_vo64_vm(vlt_vo_vd_vd(vd2getx_vd_vd2(d), vcast_vd_d(-1000)), vreinterpret_vm_vd(u)));\n  \n  return u;\n}\n\n#if !defined(DETERMINISTIC)\nEXPORT CONST VECTOR_CC vdouble xpow(vdouble x, vdouble y) {\n#if 1\n  vopmask yisint = visint_vo_vd(y);\n  vopmask yisodd = vand_vo_vo_vo(visodd_vo_vd(y), 
yisint);\n\n  vdouble2 d = ddmul_vd2_vd2_vd(logk(vabs_vd_vd(x)), y);\n  vdouble result = expk(d);\n  result = vsel_vd_vo_vd_vd(vgt_vo_vd_vd(vd2getx_vd_vd2(d), vcast_vd_d(709.78271114955742909217217426)), vcast_vd_d(SLEEF_INFINITY), result);\n\n  result = vmul_vd_vd_vd(result,\n\t\t\t vsel_vd_vo_vd_vd(vgt_vo_vd_vd(x, vcast_vd_d(0)),\n\t\t\t\t\t  vcast_vd_d(1),\n\t\t\t\t\t  vsel_vd_vo_vd_vd(yisint, vsel_vd_vo_vd_vd(yisodd, vcast_vd_d(-1.0), vcast_vd_d(1)), vcast_vd_d(SLEEF_NAN))));\n\n  vdouble efx = vmulsign_vd_vd_vd(vsub_vd_vd_vd(vabs_vd_vd(x), vcast_vd_d(1)), y);\n\n  result = vsel_vd_vo_vd_vd(visinf_vo_vd(y),\n\t\t\t    vreinterpret_vd_vm(vandnot_vm_vo64_vm(vlt_vo_vd_vd(efx, vcast_vd_d(0.0)),\n\t\t\t\t\t\t\t\t  vreinterpret_vm_vd(vsel_vd_vo_vd_vd(veq_vo_vd_vd(efx, vcast_vd_d(0.0)),\n\t\t\t\t\t\t\t\t\t\t\t\t      vcast_vd_d(1.0),\n\t\t\t\t\t\t\t\t\t\t\t\t      vcast_vd_d(SLEEF_INFINITY))))),\n\t\t\t    result);\n\n  result = vsel_vd_vo_vd_vd(vor_vo_vo_vo(visinf_vo_vd(x), veq_vo_vd_vd(x, vcast_vd_d(0.0))),\n\t\t\t    vmul_vd_vd_vd(vsel_vd_vo_vd_vd(yisodd, vsign_vd_vd(x), vcast_vd_d(1.0)),\n\t\t\t\t\t  vreinterpret_vd_vm(vandnot_vm_vo64_vm(vlt_vo_vd_vd(vsel_vd_vo_vd_vd(veq_vo_vd_vd(x, vcast_vd_d(0.0)), vneg_vd_vd(y), y), vcast_vd_d(0.0)),\n\t\t\t\t\t\t\t\t\t\tvreinterpret_vm_vd(vcast_vd_d(SLEEF_INFINITY))))),\n\t\t\t    result);\n\n  result = vreinterpret_vd_vm(vor_vm_vo64_vm(vor_vo_vo_vo(visnan_vo_vd(x), visnan_vo_vd(y)), vreinterpret_vm_vd(result)));\n\n  result = vsel_vd_vo_vd_vd(vor_vo_vo_vo(veq_vo_vd_vd(y, vcast_vd_d(0)), veq_vo_vd_vd(x, vcast_vd_d(1))), vcast_vd_d(1), result);\n\n  return result;\n#else\n  return expk(ddmul_vd2_vd2_vd(logk(x), y));\n#endif\n}\n#endif // #if !defined(DETERMINISTIC)\n\nstatic INLINE CONST VECTOR_CC vdouble2 expk2(vdouble2 d) {\n  vdouble u = vmul_vd_vd_vd(vadd_vd_vd_vd(vd2getx_vd_vd2(d), vd2gety_vd_vd2(d)), vcast_vd_d(R_LN2));\n  vdouble dq = vrint_vd_vd(u);\n  vint q = vrint_vi_vd(dq);\n  vdouble2 s, t;\n\n  s = 
ddadd2_vd2_vd2_vd(d, vmul_vd_vd_vd(dq, vcast_vd_d(-L2U)));\n  s = ddadd2_vd2_vd2_vd(s, vmul_vd_vd_vd(dq, vcast_vd_d(-L2L)));\n\n  vdouble2 s2 = ddsqu_vd2_vd2(s), s4 = ddsqu_vd2_vd2(s2);\n  vdouble s8 = vmul_vd_vd_vd(vd2getx_vd_vd2(s4), vd2getx_vd_vd2(s4));\n  u = POLY10(vd2getx_vd_vd2(s), vd2getx_vd_vd2(s2), vd2getx_vd_vd2(s4), s8,\n\t     +0.1602472219709932072e-9,\n\t     +0.2092255183563157007e-8,\n\t     +0.2505230023782644465e-7,\n\t     +0.2755724800902135303e-6,\n\t     +0.2755731892386044373e-5,\n\t     +0.2480158735605815065e-4,\n\t     +0.1984126984148071858e-3,\n\t     +0.1388888888886763255e-2,\n\t     +0.8333333333333347095e-2,\n\t     +0.4166666666666669905e-1);\n\n  t = ddadd_vd2_vd_vd2(vcast_vd_d(0.5), ddmul_vd2_vd2_vd(s, vcast_vd_d(+0.1666666666666666574e+0)));\n  t = ddadd_vd2_vd_vd2(vcast_vd_d(1.0), ddmul_vd2_vd2_vd2(t, s));\n  t = ddadd_vd2_vd_vd2(vcast_vd_d(1.0), ddmul_vd2_vd2_vd2(t, s));\n  t = ddadd_vd2_vd2_vd2(t, ddmul_vd2_vd2_vd(s4, u));\n\n  t = vd2setx_vd2_vd2_vd(t, vldexp2_vd_vd_vi(vd2getx_vd_vd2(t), q));\n  t = vd2sety_vd2_vd2_vd(t, vldexp2_vd_vd_vi(vd2gety_vd_vd2(t), q));\n\n  t = vd2setx_vd2_vd2_vd(t, vreinterpret_vd_vm(vandnot_vm_vo64_vm(vlt_vo_vd_vd(vd2getx_vd_vd2(d), vcast_vd_d(-1000)), vreinterpret_vm_vd(vd2getx_vd_vd2(t)))));\n  t = vd2sety_vd2_vd2_vd(t, vreinterpret_vd_vm(vandnot_vm_vo64_vm(vlt_vo_vd_vd(vd2getx_vd_vd2(d), vcast_vd_d(-1000)), vreinterpret_vm_vd(vd2gety_vd_vd2(t)))));\n\n  return t;\n}\n\n#if !defined(DETERMINISTIC)\nEXPORT CONST VECTOR_CC vdouble xsinh(vdouble x) {\n  vdouble y = vabs_vd_vd(x);\n  vdouble2 d = expk2(vcast_vd2_vd_vd(y, vcast_vd_d(0)));\n  d = ddsub_vd2_vd2_vd2(d, ddrec_vd2_vd2(d));\n  y = vmul_vd_vd_vd(vadd_vd_vd_vd(vd2getx_vd_vd2(d), vd2gety_vd_vd2(d)), vcast_vd_d(0.5));\n\n  y = vsel_vd_vo_vd_vd(vor_vo_vo_vo(vgt_vo_vd_vd(vabs_vd_vd(x), vcast_vd_d(710)), visnan_vo_vd(y)), vcast_vd_d(SLEEF_INFINITY), y);\n  y = vmulsign_vd_vd_vd(y, x);\n  y = vreinterpret_vd_vm(vor_vm_vo64_vm(visnan_vo_vd(x), 
vreinterpret_vm_vd(y)));\n\n  return y;\n}\n\nEXPORT CONST VECTOR_CC vdouble xcosh(vdouble x) {\n  vdouble y = vabs_vd_vd(x);\n  vdouble2 d = expk2(vcast_vd2_vd_vd(y, vcast_vd_d(0)));\n  d = ddadd_vd2_vd2_vd2(d, ddrec_vd2_vd2(d));\n  y = vmul_vd_vd_vd(vadd_vd_vd_vd(vd2getx_vd_vd2(d), vd2gety_vd_vd2(d)), vcast_vd_d(0.5));\n\n  y = vsel_vd_vo_vd_vd(vor_vo_vo_vo(vgt_vo_vd_vd(vabs_vd_vd(x), vcast_vd_d(710)), visnan_vo_vd(y)), vcast_vd_d(SLEEF_INFINITY), y);\n  y = vreinterpret_vd_vm(vor_vm_vo64_vm(visnan_vo_vd(x), vreinterpret_vm_vd(y)));\n\n  return y;\n}\n\nEXPORT CONST VECTOR_CC vdouble xtanh(vdouble x) {\n  vdouble y = vabs_vd_vd(x);\n  vdouble2 d = expk2(vcast_vd2_vd_vd(y, vcast_vd_d(0)));\n  vdouble2 e = ddrec_vd2_vd2(d);\n  d = dddiv_vd2_vd2_vd2(ddadd2_vd2_vd2_vd2(d, ddneg_vd2_vd2(e)), ddadd2_vd2_vd2_vd2(d, e));\n  y = vadd_vd_vd_vd(vd2getx_vd_vd2(d), vd2gety_vd_vd2(d));\n\n  y = vsel_vd_vo_vd_vd(vor_vo_vo_vo(vgt_vo_vd_vd(vabs_vd_vd(x), vcast_vd_d(18.714973875)), visnan_vo_vd(y)), vcast_vd_d(1.0), y);\n  y = vmulsign_vd_vd_vd(y, x);\n  y = vreinterpret_vd_vm(vor_vm_vo64_vm(visnan_vo_vd(x), vreinterpret_vm_vd(y)));\n\n  return y;\n}\n\nEXPORT CONST VECTOR_CC vdouble xsinh_u35(vdouble x) {\n  vdouble e = expm1k(vabs_vd_vd(x));\n\n  vdouble y = vdiv_vd_vd_vd(vadd_vd_vd_vd(e, vcast_vd_d(2)), vadd_vd_vd_vd(e, vcast_vd_d(1)));\n  y = vmul_vd_vd_vd(y, vmul_vd_vd_vd(vcast_vd_d(0.5), e));\n\n  y = vsel_vd_vo_vd_vd(vor_vo_vo_vo(vgt_vo_vd_vd(vabs_vd_vd(x), vcast_vd_d(709)), visnan_vo_vd(y)), vcast_vd_d(SLEEF_INFINITY), y);\n  y = vmulsign_vd_vd_vd(y, x);\n  y = vreinterpret_vd_vm(vor_vm_vo64_vm(visnan_vo_vd(x), vreinterpret_vm_vd(y)));\n\n  return y;\n}\n\nEXPORT CONST VECTOR_CC vdouble xcosh_u35(vdouble x) {\n  vdouble e = xexp(vabs_vd_vd(x));\n  vdouble y = vmla_vd_vd_vd_vd(vcast_vd_d(0.5), e, vdiv_vd_vd_vd(vcast_vd_d(0.5), e));\n\n  y = vsel_vd_vo_vd_vd(vor_vo_vo_vo(vgt_vo_vd_vd(vabs_vd_vd(x), vcast_vd_d(709)), visnan_vo_vd(y)), vcast_vd_d(SLEEF_INFINITY), y);\n  y = 
vreinterpret_vd_vm(vor_vm_vo64_vm(visnan_vo_vd(x), vreinterpret_vm_vd(y)));\n\n  return y;\n}\n\nEXPORT CONST VECTOR_CC vdouble xtanh_u35(vdouble x) {\n  vdouble d = expm1k(vmul_vd_vd_vd(vcast_vd_d(2), vabs_vd_vd(x)));\n  vdouble y = vdiv_vd_vd_vd(d, vadd_vd_vd_vd(vcast_vd_d(2), d));\n\n  y = vsel_vd_vo_vd_vd(vor_vo_vo_vo(vgt_vo_vd_vd(vabs_vd_vd(x), vcast_vd_d(18.714973875)), visnan_vo_vd(y)), vcast_vd_d(1.0), y);\n  y = vmulsign_vd_vd_vd(y, x);\n  y = vreinterpret_vd_vm(vor_vm_vo64_vm(visnan_vo_vd(x), vreinterpret_vm_vd(y)));\n\n  return y;\n}\n\nstatic INLINE CONST VECTOR_CC vdouble2 logk2(vdouble2 d) {\n  vdouble2 x, x2, m, s;\n  vdouble t;\n  vint e;\n  \n  e = vilogbk_vi_vd(vmul_vd_vd_vd(vd2getx_vd_vd2(d), vcast_vd_d(1.0/0.75)));\n\n  m = vd2setxy_vd2_vd_vd(vldexp2_vd_vd_vi(vd2getx_vd_vd2(d), vneg_vi_vi(e)), \n\t\t\t vldexp2_vd_vd_vi(vd2gety_vd_vd2(d), vneg_vi_vi(e)));\n\n  x = dddiv_vd2_vd2_vd2(ddadd2_vd2_vd2_vd(m, vcast_vd_d(-1)), ddadd2_vd2_vd2_vd(m, vcast_vd_d(1)));\n  x2 = ddsqu_vd2_vd2(x);\n\n  vdouble x4 = vmul_vd_vd_vd(vd2getx_vd_vd2(x2), vd2getx_vd_vd2(x2)), x8 = vmul_vd_vd_vd(x4, x4);\n  t = POLY7(vd2getx_vd_vd2(x2), x4, x8,\n\t    0.13860436390467167910856,\n\t    0.131699838841615374240845,\n\t    0.153914168346271945653214,\n\t    0.181816523941564611721589,\n\t    0.22222224632662035403996,\n\t    0.285714285511134091777308,\n\t    0.400000000000914013309483);\n  t = vmla_vd_vd_vd_vd(t, vd2getx_vd_vd2(x2), vcast_vd_d(0.666666666666664853302393));\n\n  s = ddmul_vd2_vd2_vd(vcast_vd2_d_d(0.693147180559945286226764, 2.319046813846299558417771e-17), vcast_vd_vi(e));\n  s = ddadd_vd2_vd2_vd2(s, ddscale_vd2_vd2_vd(x, vcast_vd_d(2)));\n  s = ddadd_vd2_vd2_vd2(s, ddmul_vd2_vd2_vd(ddmul_vd2_vd2_vd2(x2, x), t));\n\n  return  s;\n}\n\nEXPORT CONST VECTOR_CC vdouble xasinh(vdouble x) {\n  vdouble y = vabs_vd_vd(x);\n  vopmask o = vgt_vo_vd_vd(y, vcast_vd_d(1));\n  vdouble2 d;\n  \n  d = vsel_vd2_vo_vd2_vd2(o, ddrec_vd2_vd(x), vcast_vd2_vd_vd(y, 
vcast_vd_d(0)));\n  d = ddsqrt_vd2_vd2(ddadd2_vd2_vd2_vd(ddsqu_vd2_vd2(d), vcast_vd_d(1)));\n  d = vsel_vd2_vo_vd2_vd2(o, ddmul_vd2_vd2_vd(d, y), d);\n\n  d = logk2(ddnormalize_vd2_vd2(ddadd2_vd2_vd2_vd(d, x)));\n  y = vadd_vd_vd_vd(vd2getx_vd_vd2(d), vd2gety_vd_vd2(d));\n  \n  y = vsel_vd_vo_vd_vd(vor_vo_vo_vo(vgt_vo_vd_vd(vabs_vd_vd(x), vcast_vd_d(SQRT_DBL_MAX)),\n\t\t\t\t    visnan_vo_vd(y)),\n\t\t       vmulsign_vd_vd_vd(vcast_vd_d(SLEEF_INFINITY), x), y);\n\n  y = vreinterpret_vd_vm(vor_vm_vo64_vm(visnan_vo_vd(x), vreinterpret_vm_vd(y)));\n  y = vsel_vd_vo_vd_vd(visnegzero_vo_vd(x), vcast_vd_d(-0.0), y);\n  \n  return y;\n}\n\nEXPORT CONST VECTOR_CC vdouble xacosh(vdouble x) {\n  vdouble2 d = logk2(ddadd2_vd2_vd2_vd(ddmul_vd2_vd2_vd2(ddsqrt_vd2_vd2(ddadd2_vd2_vd_vd(x, vcast_vd_d(1))), ddsqrt_vd2_vd2(ddadd2_vd2_vd_vd(x, vcast_vd_d(-1)))), x));\n  vdouble y = vadd_vd_vd_vd(vd2getx_vd_vd2(d), vd2gety_vd_vd2(d));\n\n  y = vsel_vd_vo_vd_vd(vor_vo_vo_vo(vgt_vo_vd_vd(vabs_vd_vd(x), vcast_vd_d(SQRT_DBL_MAX)),\n\t\t\t\t    visnan_vo_vd(y)),\n\t\t       vcast_vd_d(SLEEF_INFINITY), y);\n  y = vreinterpret_vd_vm(vandnot_vm_vo64_vm(veq_vo_vd_vd(x, vcast_vd_d(1.0)), vreinterpret_vm_vd(y)));\n\n  y = vreinterpret_vd_vm(vor_vm_vo64_vm(vlt_vo_vd_vd(x, vcast_vd_d(1.0)), vreinterpret_vm_vd(y)));\n  y = vreinterpret_vd_vm(vor_vm_vo64_vm(visnan_vo_vd(x), vreinterpret_vm_vd(y)));\n  \n  return y;\n}\n\nEXPORT CONST VECTOR_CC vdouble xatanh(vdouble x) {\n  vdouble y = vabs_vd_vd(x);\n  vdouble2 d = logk2(dddiv_vd2_vd2_vd2(ddadd2_vd2_vd_vd(vcast_vd_d(1), y), ddadd2_vd2_vd_vd(vcast_vd_d(1), vneg_vd_vd(y))));\n  y = vreinterpret_vd_vm(vor_vm_vo64_vm(vgt_vo_vd_vd(y, vcast_vd_d(1.0)), vreinterpret_vm_vd(vsel_vd_vo_vd_vd(veq_vo_vd_vd(y, vcast_vd_d(1.0)), vcast_vd_d(SLEEF_INFINITY), vmul_vd_vd_vd(vadd_vd_vd_vd(vd2getx_vd_vd2(d), vd2gety_vd_vd2(d)), vcast_vd_d(0.5))))));\n\n  y = vmulsign_vd_vd_vd(y, x);\n  y = vreinterpret_vd_vm(vor_vm_vo64_vm(vor_vo_vo_vo(visinf_vo_vd(x), 
// Continuation of xatanh(): infinite x or NaN y -> NaN, then NaN in -> NaN out.
visnan_vo_vd(y)), vreinterpret_vm_vd(y)));
  y = vreinterpret_vd_vm(vor_vm_vo64_vm(visnan_vo_vd(x), vreinterpret_vm_vd(y)));

  return y;
}

// cbrt(d), faster variant.  Splits d = 2^e * m, divides the exponent by 3
// (the +6144 bias keeps the intermediate positive for vtruncate), maps the
// remainder {1,2} to the scalars 2^(1/3), 2^(2/3), and polishes a polynomial
// first guess of m^(-1/3) with a Newton step.
EXPORT CONST VECTOR_CC vdouble xcbrt(vdouble d) {
  vdouble x, y, q = vcast_vd_d(1.0);
  vint e, qu, re;
  vdouble t;

#if defined(ENABLE_AVX512F) || defined(ENABLE_AVX512FNOFMA)
  vdouble s = d;  // keep the unscaled input for the final inf/0 fixups
#endif
  e = vadd_vi_vi_vi(vilogbk_vi_vd(vabs_vd_vd(d)), vcast_vi_i(1));
  d = vldexp2_vd_vd_vi(d, vneg_vi_vi(e));

  t = vadd_vd_vd_vd(vcast_vd_vi(e), vcast_vd_d(6144));
  qu = vtruncate_vi_vd(vmul_vd_vd_vd(t, vcast_vd_d(1.0/3.0)));  // (e+6144)/3, truncated
  re = vtruncate_vi_vd(vsub_vd_vd_vd(t, vmul_vd_vd_vd(vcast_vd_vi(qu), vcast_vd_d(3))));  // remainder in {0,1,2}

  q = vsel_vd_vo_vd_vd(vcast_vo64_vo32(veq_vo_vi_vi(re, vcast_vi_i(1))), vcast_vd_d(1.2599210498948731647672106), q);  // 2^(1/3)
  q = vsel_vd_vo_vd_vd(vcast_vo64_vo32(veq_vo_vi_vi(re, vcast_vi_i(2))), vcast_vd_d(1.5874010519681994747517056), q);  // 2^(2/3)
  q = vldexp2_vd_vd_vi(q, vsub_vi_vi_vi(qu, vcast_vi_i(2048)));  // 2048 == 6144/3 removes the bias

  q = vmulsign_vd_vd_vd(q, d);  // cbrt is odd: carry d's sign in q

  d = vabs_vd_vd(d);

  // Polynomial first guess for d^(-1/3) on the reduced range.
  x = vcast_vd_d(-0.640245898480692909870982);
  x = vmla_vd_vd_vd_vd(x, d, vcast_vd_d(2.96155103020039511818595));
  x = vmla_vd_vd_vd_vd(x, d, vcast_vd_d(-5.73353060922947843636166));
  x = vmla_vd_vd_vd_vd(x, d, vcast_vd_d(6.03990368989458747961407));
  x = vmla_vd_vd_vd_vd(x, d, vcast_vd_d(-3.85841935510444988821632));
  x = vmla_vd_vd_vd_vd(x, d, vcast_vd_d(2.2307275302496609725722));

  // Newton step on x ~ d^(-1/3), then y = d*x^2 ~ d^(1/3) with one more
  // correction term, all scaled by q.
  y = vmul_vd_vd_vd(x, x); y = vmul_vd_vd_vd(y, y); x = vsub_vd_vd_vd(x, vmul_vd_vd_vd(vmlapn_vd_vd_vd_vd(d, y, x), vcast_vd_d(1.0 / 3.0)));
  y = vmul_vd_vd_vd(vmul_vd_vd_vd(d, x), x);
  y = vmul_vd_vd_vd(vsub_vd_vd_vd(y, vmul_vd_vd_vd(vmul_vd_vd_vd(vcast_vd_d(2.0 / 3.0), y), vmla_vd_vd_vd_vd(y, x, vcast_vd_d(-1.0)))), q);

#if defined(ENABLE_AVX512F) || defined(ENABLE_AVX512FNOFMA)
  // Restore inf and +/-0 results from the saved unscaled input on this path.
  y = vsel_vd_vo_vd_vd(visinf_vo_vd(s), vmulsign_vd_vd_vd(vcast_vd_d(SLEEF_INFINITY), s), y);
  y = vsel_vd_vo_vd_vd(veq_vo_vd_vd(s, vcast_vd_d(0)),
// Continuation of xcbrt(): +/-0 input -> zero with matching sign.
vmulsign_vd_vd_vd(vcast_vd_d(0), s), y);
#endif

  return y;
}

// cbrt(d), higher-accuracy variant.  Same exponent-splitting scheme as
// xcbrt(), but the 2^(k/3) factors and the final correction are carried in
// double-double (vdouble2) precision.
EXPORT CONST VECTOR_CC vdouble xcbrt_u1(vdouble d) {
  vdouble x, y, z, t;
  vdouble2 q2 = vcast_vd2_d_d(1, 0), u, v;
  vint e, qu, re;

#if defined(ENABLE_AVX512F) || defined(ENABLE_AVX512FNOFMA)
  vdouble s = d;  // keep the unscaled input for the final inf/0 fixups
#endif
  e = vadd_vi_vi_vi(vilogbk_vi_vd(vabs_vd_vd(d)), vcast_vi_i(1));
  d = vldexp2_vd_vd_vi(d, vneg_vi_vi(e));

  t = vadd_vd_vd_vd(vcast_vd_vi(e), vcast_vd_d(6144));
  qu = vtruncate_vi_vd(vmul_vd_vd_vd(t, vcast_vd_d(1.0/3.0)));
  re = vtruncate_vi_vd(vsub_vd_vd_vd(t, vmul_vd_vd_vd(vcast_vd_vi(qu), vcast_vd_d(3))));

  // 2^(1/3) and 2^(2/3) as double-double constants.
  q2 = vsel_vd2_vo_vd2_vd2(vcast_vo64_vo32(veq_vo_vi_vi(re, vcast_vi_i(1))), vcast_vd2_d_d(1.2599210498948731907, -2.5899333753005069177e-17), q2);
  q2 = vsel_vd2_vo_vd2_vd2(vcast_vo64_vo32(veq_vo_vi_vi(re, vcast_vi_i(2))), vcast_vd2_d_d(1.5874010519681995834, -1.0869008194197822986e-16), q2);

  q2 = vd2setxy_vd2_vd_vd(vmulsign_vd_vd_vd(vd2getx_vd_vd2(q2), d), vmulsign_vd_vd_vd(vd2gety_vd_vd2(q2), d));
  d = vabs_vd_vd(d);

  // Polynomial first guess for d^(-1/3) (same coefficients as xcbrt()).
  x = vcast_vd_d(-0.640245898480692909870982);
  x = vmla_vd_vd_vd_vd(x, d, vcast_vd_d(2.96155103020039511818595));
  x = vmla_vd_vd_vd_vd(x, d, vcast_vd_d(-5.73353060922947843636166));
  x = vmla_vd_vd_vd_vd(x, d, vcast_vd_d(6.03990368989458747961407));
  x = vmla_vd_vd_vd_vd(x, d, vcast_vd_d(-3.85841935510444988821632));
  x = vmla_vd_vd_vd_vd(x, d, vcast_vd_d(2.2307275302496609725722));

  y = vmul_vd_vd_vd(x, x); y = vmul_vd_vd_vd(y, y); x = vsub_vd_vd_vd(x, vmul_vd_vd_vd(vmlapn_vd_vd_vd_vd(d, y, x), vcast_vd_d(1.0 / 3.0)));

  z = x;

  // Residual u = d*x^4 - x in double-double; then the corrected value
  // v = (z^2 - (2/3)*z*(u.hi+u.lo)) * d * q2.
  u = ddmul_vd2_vd_vd(x, x);
  u = ddmul_vd2_vd2_vd2(u, u);
  u = ddmul_vd2_vd2_vd(u, d);
  u = ddadd2_vd2_vd2_vd(u, vneg_vd_vd(x));
  y = vadd_vd_vd_vd(vd2getx_vd_vd2(u), vd2gety_vd_vd2(u));

  y = vmul_vd_vd_vd(vmul_vd_vd_vd(vcast_vd_d(-2.0 / 3.0), y), z);
  v = ddadd2_vd2_vd2_vd(ddmul_vd2_vd_vd(z, z), y);
  v = ddmul_vd2_vd2_vd(v, d);
  v = ddmul_vd2_vd2_vd2(v,
// Continuation of xcbrt_u1(): multiply by the double-double 2^(k/3) factor,
// then reapply the unbiased exponent.
q2);
  z = vldexp2_vd_vd_vi(vadd_vd_vd_vd(vd2getx_vd_vd2(v), vd2gety_vd_vd2(v)), vsub_vi_vi_vi(qu, vcast_vi_i(2048)));

#if !defined(ENABLE_AVX512F) && !defined(ENABLE_AVX512FNOFMA)
  // Non-AVX512 path: restore inf/0 from the scaled d, sign taken from q2.
  z = vsel_vd_vo_vd_vd(visinf_vo_vd(d), vmulsign_vd_vd_vd(vcast_vd_d(SLEEF_INFINITY), vd2getx_vd_vd2(q2)), z);
  z = vsel_vd_vo_vd_vd(veq_vo_vd_vd(d, vcast_vd_d(0)), vreinterpret_vd_vm(vsignbit_vm_vd(vd2getx_vd_vd2(q2))), z);
#else
  // AVX512 path: restore inf/0 from the saved unscaled input s.
  z = vsel_vd_vo_vd_vd(visinf_vo_vd(s), vmulsign_vd_vd_vd(vcast_vd_d(SLEEF_INFINITY), s), z);
  z = vsel_vd_vo_vd_vd(veq_vo_vd_vd(s, vcast_vd_d(0)), vmulsign_vd_vd_vd(vcast_vd_d(0), s), z);
#endif

  return z;
}
#endif // #if !defined(DETERMINISTIC)

// 2^d.  Splits d = q + s with q = rint(d) (so |s| <= 0.5), evaluates a
// polynomial approximation of (2^s - 1)/s (note the ln(2) term folded in at
// the end), finishes with 1 + u*s in FMA or double-double, and scales by 2^q.
// d >= 1024 -> +inf; d < -2000 -> +0 (bits cleared via andnot).
EXPORT CONST VECTOR_CC vdouble xexp2(vdouble d) {
  vdouble u = vrint_vd_vd(d), s;
  vint q = vrint_vi_vd(u);

  s = vsub_vd_vd_vd(d, u);

  vdouble s2 = vmul_vd_vd_vd(s, s), s4 = vmul_vd_vd_vd(s2, s2), s8 = vmul_vd_vd_vd(s4, s4);
  u = POLY10(s, s2, s4, s8,
             +0.4434359082926529454e-9,
             +0.7073164598085707425e-8,
             +0.1017819260921760451e-6,
             +0.1321543872511327615e-5,
             +0.1525273353517584730e-4,
             +0.1540353045101147808e-3,
             +0.1333355814670499073e-2,
             +0.9618129107597600536e-2,
             +0.5550410866482046596e-1,
             +0.2402265069591012214e+0);
  u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(+0.6931471805599452862e+0));  // ln(2)

#ifdef ENABLE_FMA_DP
  u = vfma_vd_vd_vd_vd(u, s, vcast_vd_d(1));
#else
  // Without a hardware FMA, compute the final 1 + u*s in double-double.
  u = vd2getx_vd_vd2(ddnormalize_vd2_vd2(ddadd_vd2_vd_vd2(vcast_vd_d(1), ddmul_vd2_vd_vd(u, s))));
#endif

  u = vldexp2_vd_vd_vi(u, q);

  u = vsel_vd_vo_vd_vd(vge_vo_vd_vd(d, vcast_vd_d(1024)), vcast_vd_d(SLEEF_INFINITY), u);
  u = vreinterpret_vd_vm(vandnot_vm_vo64_vm(vlt_vo_vd_vd(d, vcast_vd_d(-2000)), vreinterpret_vm_vd(u)));

  return u;
}

// 2^d, relaxed-accuracy variant: same scheme as xexp2() but the final +1
// step is a plain multiply-add even without FMA.
EXPORT CONST VECTOR_CC vdouble xexp2_u35(vdouble d) {
  vdouble u = vrint_vd_vd(d), s;
  vint q = vrint_vi_vd(u);

  s = vsub_vd_vd_vd(d, u);

  vdouble s2 = vmul_vd_vd_vd(s, s), s4
// Continuation of xexp2_u35().
= vmul_vd_vd_vd(s2, s2), s8 = vmul_vd_vd_vd(s4, s4);
  u = POLY10(s, s2, s4, s8,
             +0.4434359082926529454e-9,
             +0.7073164598085707425e-8,
             +0.1017819260921760451e-6,
             +0.1321543872511327615e-5,
             +0.1525273353517584730e-4,
             +0.1540353045101147808e-3,
             +0.1333355814670499073e-2,
             +0.9618129107597600536e-2,
             +0.5550410866482046596e-1,
             +0.2402265069591012214e+0);
  u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(+0.6931471805599452862e+0));  // ln(2)

  u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(1));  // plain final step (no dd here)

  u = vldexp2_vd_vd_vi(u, q);

  u = vsel_vd_vo_vd_vd(vge_vo_vd_vd(d, vcast_vd_d(1024)), vcast_vd_d(SLEEF_INFINITY), u);
  u = vreinterpret_vd_vm(vandnot_vm_vo64_vm(vlt_vo_vd_vd(d, vcast_vd_d(-2000)), vreinterpret_vm_vd(u)));

  return u;
}

// 10^d.  Range reduction u = rint(d * LOG10_2), s = d - u*(L10U + L10L)
// (the reduction constant is split into two doubles for accuracy), then a
// Horner polynomial in s whose low-order coefficient 2.302585... is ln(10);
// finishes with 1 + u*s and a scale by 2^q.
EXPORT CONST VECTOR_CC vdouble xexp10(vdouble d) {
  vdouble u = vrint_vd_vd(vmul_vd_vd_vd(d, vcast_vd_d(LOG10_2))), s;
  vint q = vrint_vi_vd(u);

  s = vmla_vd_vd_vd_vd(u, vcast_vd_d(-L10U), d);
  s = vmla_vd_vd_vd_vd(u, vcast_vd_d(-L10L), s);

  u = vcast_vd_d(+0.2411463498334267652e-3);
  u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(+0.1157488415217187375e-2));
  u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(+0.5013975546789733659e-2));
  u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(+0.1959762320720533080e-1));
  u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(+0.6808936399446784138e-1));
  u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(+0.2069958494722676234e+0));
  u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(+0.5393829292058536229e+0));
  u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(+0.1171255148908541655e+1));
  u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(+0.2034678592293432953e+1));
  u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(+0.2650949055239205876e+1));
  u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(+0.2302585092994045901e+1));  // ln(10)

#ifdef ENABLE_FMA_DP
  u = vfma_vd_vd_vd_vd(u, s, vcast_vd_d(1));
#else
  // Without a hardware FMA, compute the final 1 + u*s in double-double.
  u = vd2getx_vd_vd2(ddnormalize_vd2_vd2(ddadd_vd2_vd_vd2(vcast_vd_d(1), ddmul_vd2_vd_vd(u, s))));
#endif
 u = vldexp2_vd_vd_vi(u, q);  // continuation of xexp10(): scale by 2^q

  // 308.2547... ~= log10(DBL_MAX): overflow -> +inf;  d < -350 -> +0.
  u = vsel_vd_vo_vd_vd(vgt_vo_vd_vd(d, vcast_vd_d(308.25471555991671)), vcast_vd_d(SLEEF_INFINITY), u);
  u = vreinterpret_vd_vm(vandnot_vm_vo64_vm(vlt_vo_vd_vd(d, vcast_vd_d(-350)), vreinterpret_vm_vd(u)));

  return u;
}

// 10^d, relaxed-accuracy variant: same reduction as xexp10(), Estrin-style
// polynomial (POLY11) and a plain multiply-add finish.
EXPORT CONST VECTOR_CC vdouble xexp10_u35(vdouble d) {
  vdouble u = vrint_vd_vd(vmul_vd_vd_vd(d, vcast_vd_d(LOG10_2))), s;
  vint q = vrint_vi_vd(u);

  s = vmla_vd_vd_vd_vd(u, vcast_vd_d(-L10U), d);
  s = vmla_vd_vd_vd_vd(u, vcast_vd_d(-L10L), s);

  vdouble s2 = vmul_vd_vd_vd(s, s), s4 = vmul_vd_vd_vd(s2, s2), s8 = vmul_vd_vd_vd(s4, s4);
  u = POLY11(s, s2, s4, s8,
             +0.2411463498334267652e-3,
             +0.1157488415217187375e-2,
             +0.5013975546789733659e-2,
             +0.1959762320720533080e-1,
             +0.6808936399446784138e-1,
             +0.2069958494722676234e+0,
             +0.5393829292058536229e+0,
             +0.1171255148908541655e+1,
             +0.2034678592293432953e+1,
             +0.2650949055239205876e+1,
             +0.2302585092994045901e+1);

  u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(1));

  u = vldexp2_vd_vd_vi(u, q);

  u = vsel_vd_vo_vd_vd(vgt_vo_vd_vd(d, vcast_vd_d(308.25471555991671)), vcast_vd_d(SLEEF_INFINITY), u);
  u = vreinterpret_vd_vm(vandnot_vm_vo64_vm(vlt_vo_vd_vd(d, vcast_vd_d(-350)), vreinterpret_vm_vd(u)));

  return u;
}

#if !defined(DETERMINISTIC)
// expm1(a) = exp(a) - 1, via the double-double exp kernel expk2().
// a > 709.78... (~ln(DBL_MAX)) -> +inf;  a < -36.7368... -> exactly -1;
// -0 is preserved.
EXPORT CONST VECTOR_CC vdouble xexpm1(vdouble a) {
  vdouble2 d = ddadd2_vd2_vd2_vd(expk2(vcast_vd2_vd_vd(a, vcast_vd_d(0))), vcast_vd_d(-1.0));
  vdouble x = vadd_vd_vd_vd(vd2getx_vd_vd2(d), vd2gety_vd_vd2(d));
  x = vsel_vd_vo_vd_vd(vgt_vo_vd_vd(a, vcast_vd_d(709.782712893383996732223)), vcast_vd_d(SLEEF_INFINITY), x);
  x = vsel_vd_vo_vd_vd(vlt_vo_vd_vd(a, vcast_vd_d(-36.736800569677101399113302437)), vcast_vd_d(-1), x);
  x = vsel_vd_vo_vd_vd(visnegzero_vo_vd(a), vcast_vd_d(-0.0), x);
  return x;
}

// log10(d).  Decomposes d = 2^e * m (m centered near 1 by the 1/0.75
// factor), forms x = (m-1)/(m+1) in double-double, and evaluates
// log10(d) = e*log10(2) + (2*log10(e))*x + x^3*poly(x^2).
// Subnormal inputs are prescaled by 2^64 on the generic path; the AVX512
// path uses getexp/getmant plus a vfixup table for the special cases.
EXPORT CONST VECTOR_CC vdouble xlog10(vdouble d) {
  vdouble2 x;
  vdouble t, m, x2;

#if !defined(ENABLE_AVX512F) && !defined(ENABLE_AVX512FNOFMA)
  vopmask o = vlt_vo_vd_vd(d, vcast_vd_d(DBL_MIN));  // subnormal input?
  d = vsel_vd_vo_vd_vd(o, vmul_vd_vd_vd(d, vcast_vd_d((double)(INT64_C(1) << 32) * (double)(INT64_C(1) << 32))), d);  // prescale by 2^64
  vint e = vilogb2k_vi_vd(vmul_vd_vd_vd(d, vcast_vd_d(1.0/0.75)));
  m = vldexp3_vd_vd_vi(d, vneg_vi_vi(e));
  e = vsel_vi_vo_vi_vi(vcast_vo32_vo64(o), vsub_vi_vi_vi(e, vcast_vi_i(64)), e);  // undo the prescale in e
#else
  vdouble e = vgetexp_vd_vd(vmul_vd_vd_vd(d, vcast_vd_d(1.0/0.75)));
  e = vsel_vd_vo_vd_vd(vispinf_vo_vd(e), vcast_vd_d(1024.0), e);
  m = vgetmant_vd_vd(d);
#endif

  x = dddiv_vd2_vd2_vd2(ddadd2_vd2_vd_vd(vcast_vd_d(-1), m), ddadd2_vd2_vd_vd(vcast_vd_d(1), m));
  x2 = vmul_vd_vd_vd(vd2getx_vd_vd2(x), vd2getx_vd_vd2(x));

  vdouble x4 = vmul_vd_vd_vd(x2, x2), x8 = vmul_vd_vd_vd(x4, x4);
  t = POLY7(x2, x4, x8,
            +0.6653725819576758460e-1,
            +0.6625722782820833712e-1,
            +0.7898105214313944078e-1,
            +0.9650955035715275132e-1,
            +0.1240841409721444993e+0,
            +0.1737177927454605086e+0,
            +0.2895296546021972617e+0);

#if !defined(ENABLE_AVX512F) && !defined(ENABLE_AVX512FNOFMA)
  vdouble2 s = ddmul_vd2_vd2_vd(vcast_vd2_d_d(0.30102999566398119802, -2.803728127785170339e-18), vcast_vd_vi(e));  // e * log10(2) (hi/lo split)
#else
  vdouble2 s = ddmul_vd2_vd2_vd(vcast_vd2_d_d(0.30102999566398119802, -2.803728127785170339e-18), e);
#endif

  s = ddadd_vd2_vd2_vd2(s, ddmul_vd2_vd2_vd2(x, vcast_vd2_d_d(0.86858896380650363334, 1.1430059694096389311e-17)));  // + x * 2*log10(e)
  s = ddadd_vd2_vd2_vd(s, vmul_vd_vd_vd(vmul_vd_vd_vd(x2, vd2getx_vd_vd2(x)), t));  // + x^3 * t

  vdouble r = vadd_vd_vd_vd(vd2getx_vd_vd2(s), vd2gety_vd_vd2(s));

#if !defined(ENABLE_AVX512F) && !defined(ENABLE_AVX512FNOFMA)
  // Specials: +inf -> +inf; negative or NaN -> NaN; 0 -> -inf (below).
  r = vsel_vd_vo_vd_vd(vispinf_vo_vd(d), vcast_vd_d(SLEEF_INFINITY), r);
  r = vsel_vd_vo_vd_vd(vor_vo_vo_vo(vlt_vo_vd_vd(d, vcast_vd_d(0)), visnan_vo_vd(d)), vcast_vd_d(SLEEF_NAN), r);
  r = vsel_vd_vo_vd_vd(veq_vo_vd_vd(d, vcast_vd_d(0)),
// Continuation of xlog10(): log10(0) = -inf; the AVX512 path patches all
// special cases at once through a vfixup table.
vcast_vd_d(-SLEEF_INFINITY), r);
#else
  r = vfixup_vd_vd_vd_vi2_i(r, d, vcast_vi2_i((4 << (2*4)) | (3 << (4*4)) | (5 << (5*4)) | (2 << (6*4))), 0);
#endif

  return r;
}

// log2(d).  Same decomposition as xlog10():
// log2(d) = e + (2*log2(e))*x + x^3*poly(x^2), x = (m-1)/(m+1) in
// double-double; 2.885390... is 2*log2(e) (hi part of the dd constant).
EXPORT CONST VECTOR_CC vdouble xlog2(vdouble d) {
  vdouble2 x;
  vdouble t, m, x2;

#if !defined(ENABLE_AVX512F) && !defined(ENABLE_AVX512FNOFMA)
  vopmask o = vlt_vo_vd_vd(d, vcast_vd_d(DBL_MIN));  // subnormal input?
  d = vsel_vd_vo_vd_vd(o, vmul_vd_vd_vd(d, vcast_vd_d((double)(INT64_C(1) << 32) * (double)(INT64_C(1) << 32))), d);  // prescale by 2^64
  vint e = vilogb2k_vi_vd(vmul_vd_vd_vd(d, vcast_vd_d(1.0/0.75)));
  m = vldexp3_vd_vd_vi(d, vneg_vi_vi(e));
  e = vsel_vi_vo_vi_vi(vcast_vo32_vo64(o), vsub_vi_vi_vi(e, vcast_vi_i(64)), e);  // undo the prescale in e
#else
  vdouble e = vgetexp_vd_vd(vmul_vd_vd_vd(d, vcast_vd_d(1.0/0.75)));
  e = vsel_vd_vo_vd_vd(vispinf_vo_vd(e), vcast_vd_d(1024.0), e);
  m = vgetmant_vd_vd(d);
#endif

  x = dddiv_vd2_vd2_vd2(ddadd2_vd2_vd_vd(vcast_vd_d(-1), m), ddadd2_vd2_vd_vd(vcast_vd_d(1), m));
  x2 = vmul_vd_vd_vd(vd2getx_vd_vd2(x), vd2getx_vd_vd2(x));

  vdouble x4 = vmul_vd_vd_vd(x2, x2), x8 = vmul_vd_vd_vd(x4, x4);
  t = POLY7(x2, x4, x8,
            +0.2211941750456081490e+0,
            +0.2200768693152277689e+0,
            +0.2623708057488514656e+0,
            +0.3205977477944495502e+0,
            +0.4121985945485324709e+0,
            +0.5770780162997058982e+0,
            +0.96179669392608091449);

#if !defined(ENABLE_AVX512F) && !defined(ENABLE_AVX512FNOFMA)
  vdouble2 s = ddadd2_vd2_vd_vd2(vcast_vd_vi(e),
                                 ddmul_vd2_vd2_vd2(x, vcast_vd2_d_d(2.885390081777926774, 6.0561604995516736434e-18)));
#else
  vdouble2 s = ddadd2_vd2_vd_vd2(e,
                                 ddmul_vd2_vd2_vd2(x, vcast_vd2_d_d(2.885390081777926774, 6.0561604995516736434e-18)));
#endif

  s = ddadd2_vd2_vd2_vd(s, vmul_vd_vd_vd(vmul_vd_vd_vd(x2, vd2getx_vd_vd2(x)), t));  // + x^3 * t

  vdouble r = vadd_vd_vd_vd(vd2getx_vd_vd2(s), vd2gety_vd_vd2(s));

#if !defined(ENABLE_AVX512F) && !defined(ENABLE_AVX512FNOFMA)
  r = vsel_vd_vo_vd_vd(vispinf_vo_vd(d),
// Continuation of xlog2(): special cases (selects on the generic path,
// vfixup table on AVX512).
vcast_vd_d(SLEEF_INFINITY), r);
  r = vsel_vd_vo_vd_vd(vor_vo_vo_vo(vlt_vo_vd_vd(d, vcast_vd_d(0)), visnan_vo_vd(d)), vcast_vd_d(SLEEF_NAN), r);
  r = vsel_vd_vo_vd_vd(veq_vo_vd_vd(d, vcast_vd_d(0)), vcast_vd_d(-SLEEF_INFINITY), r);
#else
  r = vfixup_vd_vd_vd_vi2_i(r, d, vcast_vi2_i((4 << (2*4)) | (3 << (4*4)) | (5 << (5*4)) | (2 << (6*4))), 0);
#endif

  return r;
}

// log2(d), relaxed-accuracy variant: x = (m-1)/(m+1) kept in plain doubles,
// Horner polynomial, and only the leading (2*log2(e))*x term carried in
// double-double.
EXPORT CONST VECTOR_CC vdouble xlog2_u35(vdouble d) {
  vdouble m, t, x, x2;

#if !defined(ENABLE_AVX512F) && !defined(ENABLE_AVX512FNOFMA)
  vopmask o = vlt_vo_vd_vd(d, vcast_vd_d(DBL_MIN));  // subnormal input?
  d = vsel_vd_vo_vd_vd(o, vmul_vd_vd_vd(d, vcast_vd_d((double)(INT64_C(1) << 32) * (double)(INT64_C(1) << 32))), d);  // prescale by 2^64
  vint e = vilogb2k_vi_vd(vmul_vd_vd_vd(d, vcast_vd_d(1.0/0.75)));
  m = vldexp3_vd_vd_vi(d, vneg_vi_vi(e));
  e = vsel_vi_vo_vi_vi(vcast_vo32_vo64(o), vsub_vi_vi_vi(e, vcast_vi_i(64)), e);  // undo the prescale in e
#else
  vdouble e = vgetexp_vd_vd(vmul_vd_vd_vd(d, vcast_vd_d(1.0/0.75)));
  e = vsel_vd_vo_vd_vd(vispinf_vo_vd(e), vcast_vd_d(1024.0), e);
  m = vgetmant_vd_vd(d);
#endif

  x = vdiv_vd_vd_vd(vsub_vd_vd_vd(m, vcast_vd_d(1)), vadd_vd_vd_vd(m, vcast_vd_d(1)));
  x2 = vmul_vd_vd_vd(x, x);

  t = vcast_vd_d(+0.2211941750456081490e+0);
  t = vmla_vd_vd_vd_vd(t, x2, vcast_vd_d(+0.2200768693152277689e+0));
  t = vmla_vd_vd_vd_vd(t, x2, vcast_vd_d(+0.2623708057488514656e+0));
  t = vmla_vd_vd_vd_vd(t, x2, vcast_vd_d(+0.3205977477944495502e+0));
  t = vmla_vd_vd_vd_vd(t, x2, vcast_vd_d(+0.4121985945485324709e+0));
  t = vmla_vd_vd_vd_vd(t, x2, vcast_vd_d(+0.5770780162997058982e+0));
  t = vmla_vd_vd_vd_vd(t, x2, vcast_vd_d(+0.96179669392608091449));

#if !defined(ENABLE_AVX512F) && !defined(ENABLE_AVX512FNOFMA)
  vdouble2 s = ddadd_vd2_vd_vd2(vcast_vd_vi(e),
                                ddmul_vd2_vd_vd(x, vcast_vd_d(2.885390081777926774)));  // e + x*2*log2(e)
#else
  vdouble2 s = ddadd_vd2_vd_vd2(e,
                                ddmul_vd2_vd_vd(x, vcast_vd_d(2.885390081777926774)));
#endif

  vdouble r = vmla_vd_vd_vd_vd(t,
// Continuation of xlog2_u35(): r = t*x^3 + (s.hi + s.lo), then specials.
vmul_vd_vd_vd(x, x2), vadd_vd_vd_vd(vd2getx_vd_vd2(s), vd2gety_vd_vd2(s)));

#if !defined(ENABLE_AVX512F) && !defined(ENABLE_AVX512FNOFMA)
  r = vsel_vd_vo_vd_vd(vispinf_vo_vd(d), vcast_vd_d(SLEEF_INFINITY), r);
  r = vsel_vd_vo_vd_vd(vor_vo_vo_vo(vlt_vo_vd_vd(d, vcast_vd_d(0)), visnan_vo_vd(d)), vcast_vd_d(SLEEF_NAN), r);
  r = vsel_vd_vo_vd_vd(veq_vo_vd_vd(d, vcast_vd_d(0)), vcast_vd_d(-SLEEF_INFINITY), r);
#else
  r = vfixup_vd_vd_vd_vi2_i(r, d, vcast_vi2_i((4 << (2*4)) | (3 << (4*4)) | (5 << (5*4)) | (2 << (6*4))), 0);
#endif

  return r;
}

// log1p(d) = log(1 + d).  Uses dp1 = d + 1 only to find the exponent e;
// the reduced mantissa is rebuilt as m = d*t + (t - 1) with t = 2^-e so the
// low bits of d are not lost to the rounding in d + 1.
EXPORT CONST VECTOR_CC vdouble xlog1p(vdouble d) {
  vdouble2 x;
  vdouble t, m, x2;

  vdouble dp1 = vadd_vd_vd_vd(d, vcast_vd_d(1));

#if !defined(ENABLE_AVX512F) && !defined(ENABLE_AVX512FNOFMA)
  vopmask o = vlt_vo_vd_vd(dp1, vcast_vd_d(DBL_MIN));  // subnormal 1 + d?
  dp1 = vsel_vd_vo_vd_vd(o, vmul_vd_vd_vd(dp1, vcast_vd_d((double)(INT64_C(1) << 32) * (double)(INT64_C(1) << 32))), dp1);  // prescale by 2^64
  vint e = vilogb2k_vi_vd(vmul_vd_vd_vd(dp1, vcast_vd_d(1.0/0.75)));
  t = vldexp3_vd_vd_vi(vcast_vd_d(1), vneg_vi_vi(e));
  m = vmla_vd_vd_vd_vd(d, t, vsub_vd_vd_vd(t, vcast_vd_d(1)));  // m = d*2^-e + (2^-e - 1)
  e = vsel_vi_vo_vi_vi(vcast_vo32_vo64(o), vsub_vi_vi_vi(e, vcast_vi_i(64)), e);
  vdouble2 s = ddmul_vd2_vd2_vd(vcast_vd2_d_d(0.693147180559945286226764, 2.319046813846299558417771e-17), vcast_vd_vi(e));  // e * ln(2)
#else
  vdouble e = vgetexp_vd_vd(vmul_vd_vd_vd(dp1, vcast_vd_d(1.0/0.75)));
  e = vsel_vd_vo_vd_vd(vispinf_vo_vd(e), vcast_vd_d(1024.0), e);
  t = vldexp3_vd_vd_vi(vcast_vd_d(1), vneg_vi_vi(vrint_vi_vd(e)));
  m = vmla_vd_vd_vd_vd(d, t, vsub_vd_vd_vd(t, vcast_vd_d(1)));
  vdouble2 s = ddmul_vd2_vd2_vd(vcast_vd2_d_d(0.693147180559945286226764, 2.319046813846299558417771e-17), e);
#endif

  x = dddiv_vd2_vd2_vd2(vcast_vd2_vd_vd(m, vcast_vd_d(0)), ddadd_vd2_vd_vd(vcast_vd_d(2), m));  // x = m / (2 + m)
  x2 = vmul_vd_vd_vd(vd2getx_vd_vd2(x), vd2getx_vd_vd2(x));

  vdouble x4 = vmul_vd_vd_vd(x2, x2), x8 = vmul_vd_vd_vd(x4, x4);
  t = POLY7(x2, x4, x8,
// Continuation of xlog1p(): series-tail coefficients (close to 2/(2k+1)).
   0.1532076988502701353e+0,
            0.1525629051003428716e+0,
            0.1818605932937785996e+0,
            0.2222214519839380009e+0,
            0.2857142932794299317e+0,
            0.3999999999635251990e+0,
            0.6666666666667333541e+0);

  s = ddadd_vd2_vd2_vd2(s, ddscale_vd2_vd2_vd(x, vcast_vd_d(2)));  // + 2x
  s = ddadd_vd2_vd2_vd(s, vmul_vd_vd_vd(vmul_vd_vd_vd(x2, vd2getx_vd_vd2(x)), t));  // + x^3 * t

  vdouble r = vadd_vd_vd_vd(vd2getx_vd_vd2(s), vd2gety_vd_vd2(s));

  // Specials: d > 1e307 -> +inf; d < -1 or NaN -> NaN; d == -1 -> -inf;
  // -0 preserved.
  r = vsel_vd_vo_vd_vd(vgt_vo_vd_vd(d, vcast_vd_d(1e+307)), vcast_vd_d(SLEEF_INFINITY), r);
  r = vsel_vd_vo_vd_vd(vor_vo_vo_vo(vlt_vo_vd_vd(d, vcast_vd_d(-1)), visnan_vo_vd(d)), vcast_vd_d(SLEEF_NAN), r);
  r = vsel_vd_vo_vd_vd(veq_vo_vd_vd(d, vcast_vd_d(-1)), vcast_vd_d(-SLEEF_INFINITY), r);
  r = vsel_vd_vo_vd_vd(visnegzero_vo_vd(d), vcast_vd_d(-0.0), r);

  return r;
}

// ---- Utility / IEEE-754 manipulation functions ----

// Helper: build a vint2 from two 32-bit values via vcast_vm_i_i (per-lane
// layout follows that helper's convention).
static INLINE CONST VECTOR_CC vint2 vcast_vi2_i_i(int i0, int i1) { return vcast_vi2_vm(vcast_vm_i_i(i0, i1)); }

// fabs(x).
EXPORT CONST VECTOR_CC vdouble xfabs(vdouble x) { return vabs_vd_vd(x); }

// copysign(x, y).
EXPORT CONST VECTOR_CC vdouble xcopysign(vdouble x, vdouble y) { return vcopysign_vd_vd_vd(x, y); }

// fmax(x, y): returns x when y is NaN.
EXPORT CONST VECTOR_CC vdouble xfmax(vdouble x, vdouble y) {
#if (defined(__x86_64__) || defined(__i386__)) && !defined(ENABLE_VECEXT) && !defined(ENABLE_PUREC)
  return vsel_vd_vo_vd_vd(visnan_vo_vd(y), x, vmax_vd_vd_vd(x, y));
#else
  return vsel_vd_vo_vd_vd(visnan_vo_vd(y), x, vsel_vd_vo_vd_vd(vgt_vo_vd_vd(x, y), x, y));
#endif
}

// fmin(x, y): returns x when y is NaN.
EXPORT CONST VECTOR_CC vdouble xfmin(vdouble x, vdouble y) {
#if (defined(__x86_64__) || defined(__i386__)) && !defined(ENABLE_VECEXT) && !defined(ENABLE_PUREC)
  return vsel_vd_vo_vd_vd(visnan_vo_vd(y), x, vmin_vd_vd_vd(x, y));
#else
  return vsel_vd_vo_vd_vd(visnan_vo_vd(y), x, vsel_vd_vo_vd_vd(vgt_vo_vd_vd(y, x), x, y));
#endif
}

// fdim(x, y): the positive difference, max(x - y, 0).
EXPORT CONST VECTOR_CC vdouble xfdim(vdouble x, vdouble y) {
  vdouble ret = vsub_vd_vd_vd(x, y);
  ret = vsel_vd_vo_vd_vd(vor_vo_vo_vo(vlt_vo_vd_vd(ret,
// Continuation of xfdim(): negative difference or x == y -> +0.
vcast_vd_d(0)), veq_vo_vd_vd(x, y)), vcast_vd_d(0), ret);
  return ret;
}

// trunc(x).  Without full FP rounding support: peel off a multiple of 2^31
// so the remainder fits the 32-bit vtruncate_vi_vd, truncate it, and
// recombine.  inf and |x| >= 2^52 (already integral) pass through unchanged.
EXPORT CONST VECTOR_CC vdouble xtrunc(vdouble x) {
#ifdef FULL_FP_ROUNDING
  return vtruncate_vd_vd(x);
#else
  vdouble fr = vsub_vd_vd_vd(x, vmul_vd_vd_vd(vcast_vd_d(INT64_C(1) << 31), vcast_vd_vi(vtruncate_vi_vd(vmul_vd_vd_vd(x, vcast_vd_d(1.0 / (INT64_C(1) << 31)))))));
  fr = vsub_vd_vd_vd(fr, vcast_vd_vi(vtruncate_vi_vd(fr)));
  return vsel_vd_vo_vd_vd(vor_vo_vo_vo(visinf_vo_vd(x), vge_vo_vd_vd(vabs_vd_vd(x), vcast_vd_d(INT64_C(1) << 52))), x, vcopysign_vd_vd_vd(vsub_vd_vd_vd(x, fr), x));
#endif
}

// floor(x): same 2^31-splitting as xtrunc(), with negative fractional parts
// bumped by 1 so the result rounds toward -infinity.
EXPORT CONST VECTOR_CC vdouble xfloor(vdouble x) {
  vdouble fr = vsub_vd_vd_vd(x, vmul_vd_vd_vd(vcast_vd_d(INT64_C(1) << 31), vcast_vd_vi(vtruncate_vi_vd(vmul_vd_vd_vd(x, vcast_vd_d(1.0 / (INT64_C(1) << 31)))))));
  fr = vsub_vd_vd_vd(fr, vcast_vd_vi(vtruncate_vi_vd(fr)));
  fr = vsel_vd_vo_vd_vd(vlt_vo_vd_vd(fr, vcast_vd_d(0)), vadd_vd_vd_vd(fr, vcast_vd_d(1.0)), fr);
  return vsel_vd_vo_vd_vd(vor_vo_vo_vo(visinf_vo_vd(x), vge_vo_vd_vd(vabs_vd_vd(x), vcast_vd_d(INT64_C(1) << 52))), x, vcopysign_vd_vd_vd(vsub_vd_vd_vd(x, fr), x));
}

// ceil(x): mirror of xfloor() (positive fractional parts reduced by 1).
EXPORT CONST VECTOR_CC vdouble xceil(vdouble x) {
  vdouble fr = vsub_vd_vd_vd(x, vmul_vd_vd_vd(vcast_vd_d(INT64_C(1) << 31), vcast_vd_vi(vtruncate_vi_vd(vmul_vd_vd_vd(x, vcast_vd_d(1.0 / (INT64_C(1) << 31)))))));
  fr = vsub_vd_vd_vd(fr, vcast_vd_vi(vtruncate_vi_vd(fr)));
  fr = vsel_vd_vo_vd_vd(vle_vo_vd_vd(fr, vcast_vd_d(0)), fr, vsub_vd_vd_vd(fr, vcast_vd_d(1.0)));
  return vsel_vd_vo_vd_vd(vor_vo_vo_vo(visinf_vo_vd(x), vge_vo_vd_vd(vabs_vd_vd(x), vcast_vd_d(INT64_C(1) << 52))), x, vcopysign_vd_vd_vd(vsub_vd_vd_vd(x, fr), x));
}

// round(d): nearest integer computed from d + 0.5 with the same splitting
// trick, plus corrections for exact-integer negative inputs and for the
// largest double below 0.5 (continues in the next chunk).
EXPORT CONST VECTOR_CC vdouble xround(vdouble d) {
  vdouble x = vadd_vd_vd_vd(d, vcast_vd_d(0.5));
  vdouble fr = vsub_vd_vd_vd(x, vmul_vd_vd_vd(vcast_vd_d(INT64_C(1) << 31), vcast_vd_vi(vtruncate_vi_vd(vmul_vd_vd_vd(x, vcast_vd_d(1.0 / (INT64_C(1) << 31)))))));
  fr = vsub_vd_vd_vd(fr,
// Continuation of xround().
vcast_vd_vi(vtruncate_vi_vd(fr)));
  // x <= 0 with a zero fraction: d + 0.5 landed exactly on an integer, so
  // step the integer part back by one.
  x = vsel_vd_vo_vd_vd(vand_vo_vo_vo(vle_vo_vd_vd(x, vcast_vd_d(0)), veq_vo_vd_vd(fr, vcast_vd_d(0))), vsub_vd_vd_vd(x, vcast_vd_d(1.0)), x);
  fr = vsel_vd_vo_vd_vd(vlt_vo_vd_vd(fr, vcast_vd_d(0)), vadd_vd_vd_vd(fr, vcast_vd_d(1.0)), fr);
  // 0.49999999999999994... is the largest double < 0.5: d + 0.5 rounds up
  // to 1.0 there, so force the result back to 0.
  x = vsel_vd_vo_vd_vd(veq_vo_vd_vd(d, vcast_vd_d(0.49999999999999994449)), vcast_vd_d(0), x);
  return vsel_vd_vo_vd_vd(vor_vo_vo_vo(visinf_vo_vd(d), vge_vo_vd_vd(vabs_vd_vd(d), vcast_vd_d(INT64_C(1) << 52))), d, vcopysign_vd_vd_vd(vsub_vd_vd_vd(x, fr), d));
}

// rint(d): round to nearest integer by adding and subtracting 2^52 with d's
// sign, letting the hardware rounding mode do the work; |d| > 2^52 is
// already integral and passes through.
EXPORT CONST VECTOR_CC vdouble xrint(vdouble d) {
#ifdef FULL_FP_ROUNDING
  return vrint_vd_vd(d);
#else
  vdouble c = vmulsign_vd_vd_vd(vcast_vd_d(INT64_C(1) << 52), d);
  return vsel_vd_vo_vd_vd(vgt_vo_vd_vd(vabs_vd_vd(d), vcast_vd_d(INT64_C(1) << 52)),
                          d, vorsign_vd_vd_vd(vsub_vd_vd_vd(vadd_vd_vd_vd(d, c), c), d));
#endif
}

// nextafter(x, y): step x's 64-bit pattern one unit toward y.  The 64-bit
// increment/decrement is emulated on 32-bit lanes (vint2); each
// veq/vand/vrev21 trio detects a wrapped low half and moves a carry or
// borrow into the adjacent high half.
EXPORT CONST VECTOR_CC vdouble xnextafter(vdouble x, vdouble y) {
  x = vsel_vd_vo_vd_vd(veq_vo_vd_vd(x, vcast_vd_d(0)), vmulsign_vd_vd_vd(vcast_vd_d(0), y), x);  // zero takes y's sign
  vint2 t, xi2 = vreinterpret_vi2_vd(x);
  vopmask c = vxor_vo_vo_vo(vsignbit_vo_vd(x), vge_vo_vd_vd(y, x));  // direction of the step

  // Where c is set: flip the low 63 bits of the pattern and add one (maps
  // the sign-magnitude float ordering onto a monotonic integer ordering).
  t = vadd_vi2_vi2_vi2(vxor_vi2_vi2_vi2(xi2, vcast_vi2_i_i(0x7fffffff, 0xffffffff)), vcast_vi2_i_i(0, 1));
  t = vadd_vi2_vi2_vi2(t, vrev21_vi2_vi2(vand_vi2_vi2_vi2(vcast_vi2_i_i(0, 1), veq_vi2_vi2_vi2(t, vcast_vi2_i_i(-1, 0)))));
  xi2 = vreinterpret_vi2_vd(vsel_vd_vo_vd_vd(c, vreinterpret_vd_vi2(t), vreinterpret_vd_vi2(xi2)));

  // Subtract one from the pattern when x != y (no step when already equal).
  xi2 = vsub_vi2_vi2_vi2(xi2, vcast_vi2_vm(vand_vm_vo64_vm(vneq_vo_vd_vd(x, y), vcast_vm_i_i(0, 1))));

  // Propagate the borrow into the high 32-bit half where the low wrapped.
  xi2 = vreinterpret_vi2_vd(vsel_vd_vo_vd_vd(vneq_vo_vd_vd(x, y),
                                             vreinterpret_vd_vi2(vadd_vi2_vi2_vi2(xi2, vrev21_vi2_vi2(vand_vi2_vi2_vi2(vcast_vi2_i_i(0, -1), veq_vi2_vi2_vi2(xi2, vcast_vi2_i_i(0, -1)))))),
                                             vreinterpret_vd_vi2(xi2)));

  // Undo the conditional bit-flip mapping (same transform is its own inverse
  // up to the carry handling below, applied where c is set).
  t = vadd_vi2_vi2_vi2(vxor_vi2_vi2_vi2(xi2, vcast_vi2_i_i(0x7fffffff, 0xffffffff)), vcast_vi2_i_i(0, 1));
  t =
// Continuation of xnextafter(): final carry propagation and fixups.
vadd_vi2_vi2_vi2(t, vrev21_vi2_vi2(vand_vi2_vi2_vi2(vcast_vi2_i_i(0, 1), veq_vi2_vi2_vi2(t, vcast_vi2_i_i(-1, 0)))));
  xi2 = vreinterpret_vi2_vd(vsel_vd_vo_vd_vd(c, vreinterpret_vd_vi2(t), vreinterpret_vd_vi2(xi2)));

  vdouble ret = vreinterpret_vd_vi2(xi2);

  // Stepping across zero from a nonzero x: keep a zero carrying x's sign.
  ret = vsel_vd_vo_vd_vd(vand_vo_vo_vo(veq_vo_vd_vd(ret, vcast_vd_d(0)), vneq_vo_vd_vd(x, vcast_vd_d(0))),
                         vmulsign_vd_vd_vd(vcast_vd_d(0), x), ret);

  // Both x and y zero: return y (preserving y's zero sign).
  ret = vsel_vd_vo_vd_vd(vand_vo_vo_vo(veq_vo_vd_vd(x, vcast_vd_d(0)), veq_vo_vd_vd(y, vcast_vd_d(0))), y, ret);

  ret = vsel_vd_vo_vd_vd(vor_vo_vo_vo(visnan_vo_vd(x), visnan_vo_vd(y)), vcast_vd_d(SLEEF_NAN), ret);

  return ret;
}

// frexp()'s fraction part: clear the exponent field and install 0x3fe (the
// biased exponent of 0.5), yielding a value in [0.5, 1).  Subnormals are
// prescaled by 2^63 first; inf keeps its sign and 0 passes through.
EXPORT CONST VECTOR_CC vdouble xfrfrexp(vdouble x) {
  x = vsel_vd_vo_vd_vd(vlt_vo_vd_vd(vabs_vd_vd(x), vcast_vd_d(DBL_MIN)), vmul_vd_vd_vd(x, vcast_vd_d(UINT64_C(1) << 63)), x);

  vmask xm = vreinterpret_vm_vd(x);
  xm = vand_vm_vm_vm(xm, vcast_vm_i_i(~0x7ff00000, ~0));
  xm = vor_vm_vm_vm (xm, vcast_vm_i_i( 0x3fe00000,  0));

  vdouble ret = vreinterpret_vd_vm(xm);

  ret = vsel_vd_vo_vd_vd(visinf_vo_vd(x), vmulsign_vd_vd_vd(vcast_vd_d(SLEEF_INFINITY), x), ret);
  ret = vsel_vd_vo_vd_vd(veq_vo_vd_vd(x, vcast_vd_d(0)), x, ret);

  return ret;
}

// frexp()'s exponent part: biased exponent bits (>> 20, mask 0x7ff) minus
// 0x3fe.  0, NaN and inf all report exponent 0; subnormals prescaled by 2^63.
EXPORT CONST VECTOR_CC vint xexpfrexp(vdouble x) {
  x = vsel_vd_vo_vd_vd(vlt_vo_vd_vd(vabs_vd_vd(x), vcast_vd_d(DBL_MIN)), vmul_vd_vd_vd(x, vcast_vd_d(UINT64_C(1) << 63)), x);

  vint ret = vcastu_vi_vi2(vreinterpret_vi2_vd(x));
  ret = vsub_vi_vi_vi(vand_vi_vi_vi(vsrl_vi_vi_i(ret, 20), vcast_vi_i(0x7ff)), vcast_vi_i(0x3fe));

  ret = vsel_vi_vo_vi_vi(vor_vo_vo_vo(vor_vo_vo_vo(veq_vo_vd_vd(x, vcast_vd_d(0)), visnan_vo_vd(x)), visinf_vo_vd(x)), vcast_vi_i(0), ret);

  return ret;
}

// fma(x, y, z) without a hardware FMA: the product and sum are formed in
// double-double.  To dodge intermediate overflow/underflow, operands are
// rescaled by 2^108 / 2^216 when the naive estimate h2 = x*y + z is tiny
// (< 1e-300) or huge (> 1e+300); the scale is undone through q at the end.
EXPORT CONST VECTOR_CC vdouble xfma(vdouble x, vdouble y, vdouble z) {
#ifdef ENABLE_FMA_DP
  return vfma_vd_vd_vd_vd(x, y, z);
#else
  vdouble h2 = vadd_vd_vd_vd(vmul_vd_vd_vd(x, y), z), q = vcast_vd_d(1);
  vopmask o = vlt_vo_vd_vd(vabs_vd_vd(h2),
// Continuation of xfma(): rescale tiny/huge cases, multiply in
// double-double, and defer to IEEE behavior for inf/NaN operands.
vcast_vd_d(1e-300));
  {
    const double c0 = UINT64_C(1) << 54, c1 = c0 * c0, c2 = c1 * c1;  // 2^54, 2^108, 2^216
    x = vsel_vd_vo_vd_vd(o, vmul_vd_vd_vd(x, vcast_vd_d(c1)), x);
    y = vsel_vd_vo_vd_vd(o, vmul_vd_vd_vd(y, vcast_vd_d(c1)), y);
    z = vsel_vd_vo_vd_vd(o, vmul_vd_vd_vd(z, vcast_vd_d(c2)), z);
    q = vsel_vd_vo_vd_vd(o, vcast_vd_d(1.0 / c2), q);
  }
  o = vgt_vo_vd_vd(vabs_vd_vd(h2), vcast_vd_d(1e+300));
  {
    const double c0 = UINT64_C(1) << 54, c1 = c0 * c0, c2 = c1 * c1;
    x = vsel_vd_vo_vd_vd(o, vmul_vd_vd_vd(x, vcast_vd_d(1.0 / c1)), x);
    y = vsel_vd_vo_vd_vd(o, vmul_vd_vd_vd(y, vcast_vd_d(1.0 / c1)), y);
    z = vsel_vd_vo_vd_vd(o, vmul_vd_vd_vd(z, vcast_vd_d(1.0 / c2)), z);
    q = vsel_vd_vo_vd_vd(o, vcast_vd_d(c2), q);
  }
  vdouble2 d = ddmul_vd2_vd_vd(x, y);
  d = ddadd2_vd2_vd2_vd(d, z);
  // If x*y == 0 the result is exactly z.
  vdouble ret = vsel_vd_vo_vd_vd(vor_vo_vo_vo(veq_vo_vd_vd(x, vcast_vd_d(0)), veq_vo_vd_vd(y, vcast_vd_d(0))), z, vadd_vd_vd_vd(vd2getx_vd_vd2(d), vd2gety_vd_vd2(d)));
  // Finite x and y with infinite z: the result is z.
  o = visinf_vo_vd(z);
  o = vandnot_vo_vo_vo(visinf_vo_vd(x), o);
  o = vandnot_vo_vo_vo(visnan_vo_vd(x), o);
  o = vandnot_vo_vo_vo(visinf_vo_vd(y), o);
  o = vandnot_vo_vo_vo(visnan_vo_vd(y), o);
  h2 = vsel_vd_vo_vd_vd(o, z, h2);

  // Infinite or NaN estimate: return it (IEEE rules); otherwise unscale.
  o = vor_vo_vo_vo(visinf_vo_vd(h2), visnan_vo_vd(h2));

  return vsel_vd_vo_vd_vd(o, h2, vmul_vd_vd_vd(ret, q));
#endif
}

// sqrt(d), accurate variant (the "_u05" suffix -- presumably 0.5 ULP;
// confirm against SLEEF docs).
SQRTU05_FUNCATR VECTOR_CC vdouble xsqrt_u05(vdouble d) {
#if defined(ENABLE_FMA_DP)
  // FMA path: bit-trick reciprocal-sqrt seed (0x5fe6ec85e7de30da), coupled
  // Newton iterations on x ~ sqrt(d) and w ~ 0.5/sqrt(d), then an exact
  // FMA-based correction round.
  vdouble q, w, x, y, z;

  d = vsel_vd_vo_vd_vd(vlt_vo_vd_vd(d, vcast_vd_d(0)), vcast_vd_d(SLEEF_NAN), d);

  // d < 2^-256: prescale by 2^256, compensate with q = 2^-128.
  vopmask o = vlt_vo_vd_vd(d, vcast_vd_d(8.636168555094445E-78));
  d = vsel_vd_vo_vd_vd(o, vmul_vd_vd_vd(d, vcast_vd_d(1.157920892373162E77)), d);
  q = vsel_vd_vo_vd_vd(o, vcast_vd_d(2.9387358770557188E-39), vcast_vd_d(1));

  y = vreinterpret_vd_vi2(vsub_vi2_vi2_vi2(vcast_vi2_i_i(0x5fe6ec85, 0xe7de30da), vsrl_vi2_vi2_i(vreinterpret_vi2_vd(d), 1)));

  x = vmul_vd_vd_vd(d, y);         w =
// Continuation of xsqrt_u05() (FMA path): three coupled Newton iterations.
vmul_vd_vd_vd(vcast_vd_d(0.5), y);
  y = vfmanp_vd_vd_vd_vd(x, w, vcast_vd_d(0.5));
  x = vfma_vd_vd_vd_vd(x, y, x);   w = vfma_vd_vd_vd_vd(w, y, w);
  y = vfmanp_vd_vd_vd_vd(x, w, vcast_vd_d(0.5));
  x = vfma_vd_vd_vd_vd(x, y, x);   w = vfma_vd_vd_vd_vd(w, y, w);
  y = vfmanp_vd_vd_vd_vd(x, w, vcast_vd_d(0.5));
  x = vfma_vd_vd_vd_vd(x, y, x);   w = vfma_vd_vd_vd_vd(w, y, w);

  // Final correction round using exact FMA residuals.
  y = vfmanp_vd_vd_vd_vd(x, w, vcast_vd_d(1.5));  w = vadd_vd_vd_vd(w, w);
  w = vmul_vd_vd_vd(w, y);
  x = vmul_vd_vd_vd(w, d);
  y = vfmapn_vd_vd_vd_vd(w, d, x); z = vfmanp_vd_vd_vd_vd(w, x, vcast_vd_d(1));

  z = vfmanp_vd_vd_vd_vd(w, y, z); w = vmul_vd_vd_vd(vcast_vd_d(0.5), x);
  w = vfma_vd_vd_vd_vd(w, z, y);
  w = vadd_vd_vd_vd(w, x);

  w = vmul_vd_vd_vd(w, q);  // undo the subnormal-range prescale

  // sqrt(0) = 0 and sqrt(+inf) = +inf pass the input through; negatives
  // were already mapped to NaN above, re-forced here.
  w = vsel_vd_vo_vd_vd(vor_vo_vo_vo(veq_vo_vd_vd(d, vcast_vd_d(0)),
                                    veq_vo_vd_vd(d, vcast_vd_d(SLEEF_INFINITY))), d, w);

  w = vsel_vd_vo_vd_vd(vlt_vo_vd_vd(d, vcast_vd_d(0)), vcast_vd_d(SLEEF_NAN), w);

  return w;
#else
  // Non-FMA path: bit-trick rsqrt seed, three plain Newton steps
  // (x *= 1.5 - 0.5*d*x^2), then one double-double correction
  // (d + x^2) * (1/x), with the final 0.5 factor folded into q.
  vdouble q;
  vopmask o;

  d = vsel_vd_vo_vd_vd(vlt_vo_vd_vd(d, vcast_vd_d(0)), vcast_vd_d(SLEEF_NAN), d);

  // d < 2^-256: prescale up by 2^256 (compensation 2^-128 folded into q).
  o = vlt_vo_vd_vd(d, vcast_vd_d(8.636168555094445E-78));
  d = vsel_vd_vo_vd_vd(o, vmul_vd_vd_vd(d, vcast_vd_d(1.157920892373162E77)), d);
  q = vsel_vd_vo_vd_vd(o, vcast_vd_d(2.9387358770557188E-39*0.5), vcast_vd_d(0.5));

  // d > 2^512: prescale down by 2^-512 (compensation 2^256 folded into q).
  o = vgt_vo_vd_vd(d, vcast_vd_d(1.3407807929942597e+154));
  d = vsel_vd_vo_vd_vd(o, vmul_vd_vd_vd(d, vcast_vd_d(7.4583407312002070e-155)), d);
  q = vsel_vd_vo_vd_vd(o, vcast_vd_d(1.1579208923731620e+77*0.5), q);

  // NOTE(review): the 1e-320 addend presumably keeps the seed usable for
  // d == 0 / tiny d -- confirm against the SLEEF upstream sources.
  vdouble x = vreinterpret_vd_vi2(vsub_vi2_vi2_vi2(vcast_vi2_i_i(0x5fe6ec86, 0), vsrl_vi2_vi2_i(vreinterpret_vi2_vd(vadd_vd_vd_vd(d, vcast_vd_d(1e-320))), 1)));

  x = vmul_vd_vd_vd(x, vsub_vd_vd_vd(vcast_vd_d(1.5), vmul_vd_vd_vd(vmul_vd_vd_vd(vmul_vd_vd_vd(vcast_vd_d(0.5), d), x), x)));
  x = vmul_vd_vd_vd(x, vsub_vd_vd_vd(vcast_vd_d(1.5), vmul_vd_vd_vd(vmul_vd_vd_vd(vmul_vd_vd_vd(vcast_vd_d(0.5), d), x),
// Continuation of xsqrt_u05() (non-FMA path): third Newton step, the
// double-double correction, and special cases.
x)));
  x = vmul_vd_vd_vd(x, vsub_vd_vd_vd(vcast_vd_d(1.5), vmul_vd_vd_vd(vmul_vd_vd_vd(vmul_vd_vd_vd(vcast_vd_d(0.5), d), x), x)));
  x = vmul_vd_vd_vd(x, d);  // x ~ 1/sqrt(d) becomes x ~ sqrt(d)

  // (d + x^2) * (1/x) in double-double; the 0.5 factor lives in q.
  vdouble2 d2 = ddmul_vd2_vd2_vd2(ddadd2_vd2_vd_vd2(d, ddmul_vd2_vd_vd(x, x)), ddrec_vd2_vd(x));

  x = vmul_vd_vd_vd(vadd_vd_vd_vd(vd2getx_vd_vd2(d2), vd2gety_vd_vd2(d2)), q);

  x = vsel_vd_vo_vd_vd(vispinf_vo_vd(d), vcast_vd_d(SLEEF_INFINITY), x);
  x = vsel_vd_vo_vd_vd(veq_vo_vd_vd(d, vcast_vd_d(0)), d, x);  // keep signed zero

  return x;
#endif
}

// sqrt(d): vsqrt_vd_vd (native sqrt) when ACCURATE_SQRT is defined,
// otherwise the software path above.
EXPORT CONST VECTOR_CC vdouble xsqrt(vdouble d) {
#if defined(ACCURATE_SQRT)
  return vsqrt_vd_vd(d);
#else
  // fall back to approximation if ACCURATE_SQRT is undefined
  return xsqrt_u05(d);
#endif
}

// Relaxed-accuracy sqrt shares the accurate implementation.
EXPORT CONST VECTOR_CC vdouble xsqrt_u35(vdouble d) { return xsqrt_u05(d); }

// hypot(x, y), accurate variant: max * sqrt(1 + (min/max)^2) in
// double-double, with a 2^54 prescale when max is below DBL_MIN.
EXPORT CONST VECTOR_CC vdouble xhypot_u05(vdouble x, vdouble y) {
  x = vabs_vd_vd(x);
  y = vabs_vd_vd(y);
  vdouble min = vmin_vd_vd_vd(x, y), n = min;
  vdouble max = vmax_vd_vd_vd(x, y), d = max;

  vopmask o = vlt_vo_vd_vd(max, vcast_vd_d(DBL_MIN));
  n = vsel_vd_vo_vd_vd(o, vmul_vd_vd_vd(n, vcast_vd_d(UINT64_C(1) << 54)), n);
  d = vsel_vd_vo_vd_vd(o, vmul_vd_vd_vd(d, vcast_vd_d(UINT64_C(1) << 54)), d);

  // t = min/max in double-double; result = sqrt(1 + t^2) * max.
  vdouble2 t = dddiv_vd2_vd2_vd2(vcast_vd2_vd_vd(n, vcast_vd_d(0)), vcast_vd2_vd_vd(d, vcast_vd_d(0)));
  t = ddmul_vd2_vd2_vd(ddsqrt_vd2_vd2(ddadd2_vd2_vd2_vd(ddsqu_vd2_vd2(t), vcast_vd_d(1))), max);
  vdouble ret = vadd_vd_vd_vd(vd2getx_vd_vd2(t), vd2gety_vd_vd2(t));
  ret = vsel_vd_vo_vd_vd(visnan_vo_vd(ret), vcast_vd_d(SLEEF_INFINITY), ret);
  ret = vsel_vd_vo_vd_vd(veq_vo_vd_vd(min, vcast_vd_d(0)), max, ret);  // hypot(x, 0) = |x|
  ret = vsel_vd_vo_vd_vd(vor_vo_vo_vo(visnan_vo_vd(x), visnan_vo_vd(y)), vcast_vd_d(SLEEF_NAN), ret);
  // An infinite input dominates, even over a NaN partner (applied last).
  ret = vsel_vd_vo_vd_vd(vor_vo_vo_vo(veq_vo_vd_vd(x, vcast_vd_d(SLEEF_INFINITY)), veq_vo_vd_vd(y, vcast_vd_d(SLEEF_INFINITY))), vcast_vd_d(SLEEF_INFINITY), ret);

  return ret;
}

// hypot(x, y), relaxed-accuracy variant: plain-double min/max formulation.
EXPORT CONST VECTOR_CC vdouble xhypot_u35(vdouble x,
// Continuation of xhypot_u35()'s parameter list.
vdouble y) {
  x = vabs_vd_vd(x);
  y = vabs_vd_vd(y);
  vdouble min = vmin_vd_vd_vd(x, y);
  vdouble max = vmax_vd_vd_vd(x, y);

  // max * sqrt(1 + (min/max)^2), entirely in plain doubles.
  vdouble t = vdiv_vd_vd_vd(min, max);
  vdouble ret = vmul_vd_vd_vd(max, vsqrt_vd_vd(vmla_vd_vd_vd_vd(t, t, vcast_vd_d(1))));
  ret = vsel_vd_vo_vd_vd(veq_vo_vd_vd(min, vcast_vd_d(0)), max, ret);  // hypot(x, 0) = |x|
  ret = vsel_vd_vo_vd_vd(vor_vo_vo_vo(visnan_vo_vd(x), visnan_vo_vd(y)), vcast_vd_d(SLEEF_NAN), ret);
  // An infinite input dominates, even over a NaN partner (applied last).
  ret = vsel_vd_vo_vd_vd(vor_vo_vo_vo(veq_vo_vd_vd(x, vcast_vd_d(SLEEF_INFINITY)), veq_vo_vd_vd(y, vcast_vd_d(SLEEF_INFINITY))), vcast_vd_d(SLEEF_INFINITY), ret);

  return ret;
}

static INLINE CONST VECTOR_CC vdouble vtoward0(vdouble x) { // returns nextafter(x, 0)
  // Subtracting 1 from the 64-bit pattern moves the value one step toward
  // zero; 0 itself maps to 0.
  vdouble t = vreinterpret_vd_vm(vadd64_vm_vm_vm(vreinterpret_vm_vd(x), vcast_vm_i_i(-1, -1)));
  return vsel_vd_vo_vd_vd(veq_vo_vd_vd(x, vcast_vd_d(0)), vcast_vd_d(0), t);
}

static INLINE CONST VECTOR_CC vdouble vptrunc(vdouble x) { // round to integer toward 0, positive argument only
#ifdef FULL_FP_ROUNDING
  return vtruncate_vd_vd(x);
#else
  // Same 2^31-splitting trick as xtrunc(), without the sign handling.
  vdouble fr = vmla_vd_vd_vd_vd(vcast_vd_d(-(double)(INT64_C(1) << 31)), vcast_vd_vi(vtruncate_vi_vd(vmul_vd_vd_vd(x, vcast_vd_d(1.0 / (INT64_C(1) << 31))))), x);
  fr = vsub_vd_vd_vd(fr, vcast_vd_vi(vtruncate_vi_vd(fr)));
  return vsel_vd_vo_vd_vd(vge_vo_vd_vd(vabs_vd_vd(x), vcast_vd_d(INT64_C(1) << 52)), x, vsub_vd_vd_vd(x, fr));
#endif
}

/* TODO AArch64: potential optimization by using `vfmad_lane_f64` */
// fmod(x, y): long-division-style remainder on |x| and |y|.  Subnormal
// divisors are prescaled by 2^54 (undone through s); quotient digits are
// biased low by the toward-zero roundings (vtoward0) used in the loop.
EXPORT CONST VECTOR_CC vdouble xfmod(vdouble x, vdouble y) {
  vdouble n = vabs_vd_vd(x), d = vabs_vd_vd(y), s = vcast_vd_d(1), q;
  vopmask o = vlt_vo_vd_vd(d, vcast_vd_d(DBL_MIN));
  n = vsel_vd_vo_vd_vd(o, vmul_vd_vd_vd(n, vcast_vd_d(UINT64_C(1) << 54)), n);
  d = vsel_vd_vo_vd_vd(o, vmul_vd_vd_vd(d, vcast_vd_d(UINT64_C(1) << 54)), d);
  s  = vsel_vd_vo_vd_vd(o, vmul_vd_vd_vd(s , vcast_vd_d(1.0 / (UINT64_C(1) << 54))), s);
  vdouble2 r = vcast_vd2_vd_vd(n, vcast_vd_d(0));  // running remainder (dd)
  vdouble rd =
vtoward0(vrec_vd_vd(d));\n\n  for(int i=0;i<21;i++) { // ceil(log2(DBL_MAX) / 52)\n    q = vptrunc(vmul_vd_vd_vd(vtoward0(vd2getx_vd_vd2(r)), rd));\n#ifndef ENABLE_FMA_DP\n    q = vreinterpret_vd_vm(vand_vm_vm_vm(vreinterpret_vm_vd(q), vcast_vm_i_i(0xffffffff, 0xfffffffe)));\n#endif\n    q = vsel_vd_vo_vd_vd(vand_vo_vo_vo(vgt_vo_vd_vd(vmul_vd_vd_vd(vcast_vd_d(3), d), vd2getx_vd_vd2(r)),\n\t\t\t\t       vge_vo_vd_vd(vd2getx_vd_vd2(r), d)),\n\t\t\t vcast_vd_d(2), q);\n    q = vsel_vd_vo_vd_vd(vand_vo_vo_vo(vgt_vo_vd_vd(vadd_vd_vd_vd(d, d), vd2getx_vd_vd2(r)),\n\t\t\t\t       vge_vo_vd_vd(vd2getx_vd_vd2(r), d)),\n\t\t\t vcast_vd_d(1), q);\n    r = ddnormalize_vd2_vd2(ddadd2_vd2_vd2_vd2(r, ddmul_vd2_vd_vd(q, vneg_vd_vd(d))));\n    if (vtestallones_i_vo64(vlt_vo_vd_vd(vd2getx_vd_vd2(r), d))) break;\n  }\n  \n  vdouble ret = vmul_vd_vd_vd(vd2getx_vd_vd2(r), s);\n  ret = vsel_vd_vo_vd_vd(veq_vo_vd_vd(vadd_vd_vd_vd(vd2getx_vd_vd2(r), vd2gety_vd_vd2(r)), d), vcast_vd_d(0), ret);\n\n  ret = vmulsign_vd_vd_vd(ret, x);\n\n  ret = vsel_vd_vo_vd_vd(vlt_vo_vd_vd(n, d), x, ret);\n  ret = vsel_vd_vo_vd_vd(veq_vo_vd_vd(d, vcast_vd_d(0)), vcast_vd_d(SLEEF_NAN), ret);\n\n  return ret;\n}\n\nstatic INLINE VECTOR_CC vdouble vrintk2_vd_vd(vdouble d) {\n#ifdef FULL_FP_ROUNDING\n  return vrint_vd_vd(d);\n#else\n  vdouble c = vmulsign_vd_vd_vd(vcast_vd_d(INT64_C(1) << 52), d);\n  return vsel_vd_vo_vd_vd(vgt_vo_vd_vd(vabs_vd_vd(d), vcast_vd_d(INT64_C(1) << 52)),\n\t\t\t  d, vorsign_vd_vd_vd(vsub_vd_vd_vd(vadd_vd_vd_vd(d, c), c), d));\n#endif\n}\n\nEXPORT CONST VECTOR_CC vdouble xremainder(vdouble x, vdouble y) {\n  vdouble n = vabs_vd_vd(x), d = vabs_vd_vd(y), s = vcast_vd_d(1), q;\n  vopmask o = vlt_vo_vd_vd(d, vcast_vd_d(DBL_MIN*2));\n  n = vsel_vd_vo_vd_vd(o, vmul_vd_vd_vd(n, vcast_vd_d(UINT64_C(1) << 54)), n);\n  d = vsel_vd_vo_vd_vd(o, vmul_vd_vd_vd(d, vcast_vd_d(UINT64_C(1) << 54)), d);\n  s  = vsel_vd_vo_vd_vd(o, vmul_vd_vd_vd(s , vcast_vd_d(1.0 / (UINT64_C(1) << 54))), s);\n  vdouble 
rd = vrec_vd_vd(d);\n  vdouble2 r = vcast_vd2_vd_vd(n, vcast_vd_d(0));\n  vopmask qisodd = vneq_vo_vd_vd(vcast_vd_d(0), vcast_vd_d(0));\n\n  for(int i=0;i<21;i++) { // ceil(log2(DBL_MAX) / 52)\n    q = vrintk2_vd_vd(vmul_vd_vd_vd(vd2getx_vd_vd2(r), rd));\n#ifndef ENABLE_FMA_DP\n    q = vreinterpret_vd_vm(vand_vm_vm_vm(vreinterpret_vm_vd(q), vcast_vm_i_i(0xffffffff, 0xfffffffe)));\n#endif\n    q = vsel_vd_vo_vd_vd(vlt_vo_vd_vd(vabs_vd_vd(vd2getx_vd_vd2(r)), vmul_vd_vd_vd(d, vcast_vd_d(1.5))), vmulsign_vd_vd_vd(vcast_vd_d(1.0), vd2getx_vd_vd2(r)), q);\n    q = vsel_vd_vo_vd_vd(vor_vo_vo_vo(vlt_vo_vd_vd(vabs_vd_vd(vd2getx_vd_vd2(r)), vmul_vd_vd_vd(d, vcast_vd_d(0.5))),\n\t\t\t\t      vandnot_vo_vo_vo(qisodd, veq_vo_vd_vd(vabs_vd_vd(vd2getx_vd_vd2(r)), vmul_vd_vd_vd(d, vcast_vd_d(0.5))))),\n\t\t\t vcast_vd_d(0.0), q);\n    if (vtestallones_i_vo64(veq_vo_vd_vd(q, vcast_vd_d(0)))) break;\n    q = vsel_vd_vo_vd_vd(visinf_vo_vd(vmul_vd_vd_vd(q, vneg_vd_vd(d))), vadd_vd_vd_vd(q, vmulsign_vd_vd_vd(vcast_vd_d(-1), vd2getx_vd_vd2(r))), q);\n    qisodd = vxor_vo_vo_vo(qisodd, visodd_vo_vd(q));\n    r = ddnormalize_vd2_vd2(ddadd2_vd2_vd2_vd2(r, ddmul_vd2_vd_vd(q, vneg_vd_vd(d))));\n  }\n  \n  vdouble ret = vmul_vd_vd_vd(vd2getx_vd_vd2(r), s);\n  ret = vmulsign_vd_vd_vd(ret, x);\n  ret = vsel_vd_vo_vd_vd(visinf_vo_vd(y), vsel_vd_vo_vd_vd(visinf_vo_vd(x), vcast_vd_d(SLEEF_NAN), x), ret);\n  ret = vsel_vd_vo_vd_vd(veq_vo_vd_vd(d, vcast_vd_d(0)), vcast_vd_d(SLEEF_NAN), ret);\n  return ret;\n}\n\n#if !(defined(ENABLE_SVE) || defined(ENABLE_SVENOFMA))\n  typedef struct {\n    vdouble2 a, b;\n  } dd2;\n\nstatic dd2 dd2setab_dd2_vd2_vd2(vdouble2 a, vdouble2 b) {\n  dd2 r = { a, b };\n  return r;\n}\nstatic vdouble2 dd2geta_vd2_dd2(dd2 d) { return d.a; }\nstatic vdouble2 dd2getb_vd2_dd2(dd2 d) { return d.b; }\n#endif\n\n/* TODO AArch64: potential optimization by using `vfmad_lane_f64` */\nstatic CONST dd2 gammak(vdouble a) {\n  vdouble2 clc = vcast_vd2_d_d(0, 0), clln = vcast_vd2_d_d(1, 
0), clld = vcast_vd2_d_d(1, 0);\n  vdouble2 v = vcast_vd2_d_d(1, 0), x, y, z;\n  vdouble t, u;\n\n  vopmask otiny = vlt_vo_vd_vd(vabs_vd_vd(a), vcast_vd_d(1e-306)), oref = vlt_vo_vd_vd(a, vcast_vd_d(0.5));\n\n  x = vsel_vd2_vo_vd2_vd2(otiny, vcast_vd2_d_d(0, 0),\n\t\t\t  vsel_vd2_vo_vd2_vd2(oref, ddadd2_vd2_vd_vd(vcast_vd_d(1), vneg_vd_vd(a)),\n\t\t\t\t\t      vcast_vd2_vd_vd(a, vcast_vd_d(0))));\n\n  vopmask o0 = vand_vo_vo_vo(vle_vo_vd_vd(vcast_vd_d(0.5), vd2getx_vd_vd2(x)), vle_vo_vd_vd(vd2getx_vd_vd2(x), vcast_vd_d(1.1)));\n  vopmask o2 = vle_vo_vd_vd(vcast_vd_d(2.3), vd2getx_vd_vd2(x));\n  \n  y = ddnormalize_vd2_vd2(ddmul_vd2_vd2_vd2(ddadd2_vd2_vd2_vd(x, vcast_vd_d(1)), x));\n  y = ddnormalize_vd2_vd2(ddmul_vd2_vd2_vd2(ddadd2_vd2_vd2_vd(x, vcast_vd_d(2)), y));\n  y = ddnormalize_vd2_vd2(ddmul_vd2_vd2_vd2(ddadd2_vd2_vd2_vd(x, vcast_vd_d(3)), y));\n  y = ddnormalize_vd2_vd2(ddmul_vd2_vd2_vd2(ddadd2_vd2_vd2_vd(x, vcast_vd_d(4)), y));\n\n  vopmask o = vand_vo_vo_vo(o2, vle_vo_vd_vd(vd2getx_vd_vd2(x), vcast_vd_d(7)));\n  clln = vsel_vd2_vo_vd2_vd2(o, y, clln);\n\n  x = vsel_vd2_vo_vd2_vd2(o, ddadd2_vd2_vd2_vd(x, vcast_vd_d(5)), x);\n  \n  t = vsel_vd_vo_vd_vd(o2, vrec_vd_vd(vd2getx_vd_vd2(x)), vd2getx_vd_vd2(ddnormalize_vd2_vd2(ddadd2_vd2_vd2_vd(x, vsel_vd_vo_d_d(o0, -1, -2)))));\n\n  u = vsel_vd_vo_vo_d_d_d(o2, o0, -156.801412704022726379848862, +0.2947916772827614196e+2, +0.7074816000864609279e-7);\n  u = vmla_vd_vd_vd_vd(u, t, vsel_vd_vo_vo_d_d_d(o2, o0, +1.120804464289911606838558160000, +0.1281459691827820109e+3, +0.4009244333008730443e-6));\n  u = vmla_vd_vd_vd_vd(u, t, vsel_vd_vo_vo_d_d_d(o2, o0, +13.39798545514258921833306020000, +0.2617544025784515043e+3, +0.1040114641628246946e-5));\n  u = vmla_vd_vd_vd_vd(u, t, vsel_vd_vo_vo_d_d_d(o2, o0, -0.116546276599463200848033357000, +0.3287022855685790432e+3, +0.1508349150733329167e-5));\n  u = vmla_vd_vd_vd_vd(u, t, vsel_vd_vo_vo_d_d_d(o2, o0, -1.391801093265337481495562410000, +0.2818145867730348186e+3, 
+0.1288143074933901020e-5));\n  u = vmla_vd_vd_vd_vd(u, t, vsel_vd_vo_vo_d_d_d(o2, o0, +0.015056113040026424412918973400, +0.1728670414673559605e+3, +0.4744167749884993937e-6));\n  u = vmla_vd_vd_vd_vd(u, t, vsel_vd_vo_vo_d_d_d(o2, o0, +0.179540117061234856098844714000, +0.7748735764030416817e+2, -0.6554816306542489902e-7));\n  u = vmla_vd_vd_vd_vd(u, t, vsel_vd_vo_vo_d_d_d(o2, o0, -0.002481743600264997730942489280, +0.2512856643080930752e+2, -0.3189252471452599844e-6));\n  u = vmla_vd_vd_vd_vd(u, t, vsel_vd_vo_vo_d_d_d(o2, o0, -0.029527880945699120504851034100, +0.5766792106140076868e+1, +0.1358883821470355377e-6));\n  u = vmla_vd_vd_vd_vd(u, t, vsel_vd_vo_vo_d_d_d(o2, o0, +0.000540164767892604515196325186, +0.7270275473996180571e+0, -0.4343931277157336040e-6));\n  u = vmla_vd_vd_vd_vd(u, t, vsel_vd_vo_vo_d_d_d(o2, o0, +0.006403362833808069794787256200, +0.8396709124579147809e-1, +0.9724785897406779555e-6));\n  u = vmla_vd_vd_vd_vd(u, t, vsel_vd_vo_vo_d_d_d(o2, o0, -0.000162516262783915816896611252, -0.8211558669746804595e-1, -0.2036886057225966011e-5));\n  u = vmla_vd_vd_vd_vd(u, t, vsel_vd_vo_vo_d_d_d(o2, o0, -0.001914438498565477526465972390, +0.6828831828341884458e-1, +0.4373363141819725815e-5));\n  u = vmla_vd_vd_vd_vd(u, t, vsel_vd_vo_vo_d_d_d(o2, o0, +7.20489541602001055898311517e-05, -0.7712481339961671511e-1, -0.9439951268304008677e-5));\n  u = vmla_vd_vd_vd_vd(u, t, vsel_vd_vo_vo_d_d_d(o2, o0, +0.000839498720672087279971000786, +0.8337492023017314957e-1, +0.2050727030376389804e-4));\n  u = vmla_vd_vd_vd_vd(u, t, vsel_vd_vo_vo_d_d_d(o2, o0, -5.17179090826059219329394422e-05, -0.9094964931456242518e-1, -0.4492620183431184018e-4));\n  u = vmla_vd_vd_vd_vd(u, t, vsel_vd_vo_vo_d_d_d(o2, o0, -0.000592166437353693882857342347, +0.1000996313575929358e+0, +0.9945751236071875931e-4));\n  u = vmla_vd_vd_vd_vd(u, t, vsel_vd_vo_vo_d_d_d(o2, o0, +6.97281375836585777403743539e-05, -0.1113342861544207724e+0, -0.2231547599034983196e-3));\n  u = vmla_vd_vd_vd_vd(u, t, 
vsel_vd_vo_vo_d_d_d(o2, o0, +0.000784039221720066627493314301, +0.1255096673213020875e+0, +0.5096695247101967622e-3));\n  u = vmla_vd_vd_vd_vd(u, t, vsel_vd_vo_vo_d_d_d(o2, o0, -0.000229472093621399176949318732, -0.1440498967843054368e+0, -0.1192753911667886971e-2));\n  u = vmla_vd_vd_vd_vd(u, t, vsel_vd_vo_vo_d_d_d(o2, o0, -0.002681327160493827160473958490, +0.1695571770041949811e+0, +0.2890510330742210310e-2));\n  u = vmla_vd_vd_vd_vd(u, t, vsel_vd_vo_vo_d_d_d(o2, o0, +0.003472222222222222222175164840, -0.2073855510284092762e+0, -0.7385551028674461858e-2));\n  u = vmla_vd_vd_vd_vd(u, t, vsel_vd_vo_vo_d_d_d(o2, o0, +0.083333333333333333335592087900, +0.2705808084277815939e+0, +0.2058080842778455335e-1));\n\n  y = ddmul_vd2_vd2_vd2(ddadd2_vd2_vd2_vd(x, vcast_vd_d(-0.5)), logk2(x));\n  y = ddadd2_vd2_vd2_vd2(y, ddneg_vd2_vd2(x));\n  y = ddadd2_vd2_vd2_vd2(y, vcast_vd2_d_d(0.91893853320467278056, -3.8782941580672414498e-17)); // 0.5*log(2*M_PI)\n\n  z = ddadd2_vd2_vd2_vd(ddmul_vd2_vd_vd (u, t), vsel_vd_vo_d_d(o0, -0.4006856343865314862e+0, -0.6735230105319810201e-1));\n  z = ddadd2_vd2_vd2_vd(ddmul_vd2_vd2_vd(z, t), vsel_vd_vo_d_d(o0, +0.8224670334241132030e+0, +0.3224670334241132030e+0));\n  z = ddadd2_vd2_vd2_vd(ddmul_vd2_vd2_vd(z, t), vsel_vd_vo_d_d(o0, -0.5772156649015328655e+0, +0.4227843350984671345e+0));\n  z = ddmul_vd2_vd2_vd(z, t);\n\n  clc = vsel_vd2_vo_vd2_vd2(o2, y, z);\n  \n  clld = vsel_vd2_vo_vd2_vd2(o2, ddadd2_vd2_vd2_vd(ddmul_vd2_vd_vd(u, t), vcast_vd_d(1)), clld);\n  \n  y = clln;\n\n  clc = vsel_vd2_vo_vd2_vd2(otiny, vcast_vd2_d_d(83.1776616671934334590333, 3.67103459631568507221878e-15), // log(2^120)\n\t\t\t    vsel_vd2_vo_vd2_vd2(oref, ddadd2_vd2_vd2_vd2(vcast_vd2_d_d(1.1447298858494001639, 1.026595116270782638e-17), ddneg_vd2_vd2(clc)), clc)); // log(M_PI)\n  clln = vsel_vd2_vo_vd2_vd2(otiny, vcast_vd2_d_d(1, 0), vsel_vd2_vo_vd2_vd2(oref, clln, clld));\n\n  if (!vtestallones_i_vo64(vnot_vo64_vo64(oref))) {\n    t = vsub_vd_vd_vd(a, 
vmul_vd_vd_vd(vcast_vd_d(INT64_C(1) << 28), vcast_vd_vi(vtruncate_vi_vd(vmul_vd_vd_vd(a, vcast_vd_d(1.0 / (INT64_C(1) << 28)))))));\n    x = ddmul_vd2_vd2_vd2(clld, sinpik(t));\n  }\n  \n  clld = vsel_vd2_vo_vd2_vd2(otiny, vcast_vd2_vd_vd(vmul_vd_vd_vd(a, vcast_vd_d((INT64_C(1) << 60)*(double)(INT64_C(1) << 60))), vcast_vd_d(0)),\n\t\t\t     vsel_vd2_vo_vd2_vd2(oref, x, y));\n\n  return dd2setab_dd2_vd2_vd2(clc, dddiv_vd2_vd2_vd2(clln, clld));\n}\n\nEXPORT CONST VECTOR_CC vdouble xtgamma_u1(vdouble a) {\n  dd2 d = gammak(a);\n  vdouble2 y = ddmul_vd2_vd2_vd2(expk2(dd2geta_vd2_dd2(d)), dd2getb_vd2_dd2(d));\n  vdouble r = vadd_vd_vd_vd(vd2getx_vd_vd2(y), vd2gety_vd_vd2(y));\n  vopmask o;\n\n  o = vor_vo_vo_vo(vor_vo_vo_vo(veq_vo_vd_vd(a, vcast_vd_d(-SLEEF_INFINITY)),\n\t\t\t\tvand_vo_vo_vo(vlt_vo_vd_vd(a, vcast_vd_d(0)), visint_vo_vd(a))),\n\t\t   vand_vo_vo_vo(vand_vo_vo_vo(visnumber_vo_vd(a), vlt_vo_vd_vd(a, vcast_vd_d(0))), visnan_vo_vd(r)));\n  r = vsel_vd_vo_vd_vd(o, vcast_vd_d(SLEEF_NAN), r);\n\n  o = vand_vo_vo_vo(vand_vo_vo_vo(vor_vo_vo_vo(veq_vo_vd_vd(a, vcast_vd_d(SLEEF_INFINITY)), visnumber_vo_vd(a)),\n\t\t\t\t  vge_vo_vd_vd(a, vcast_vd_d(-DBL_MIN))),\n\t\t    vor_vo_vo_vo(vor_vo_vo_vo(veq_vo_vd_vd(a, vcast_vd_d(0)), vgt_vo_vd_vd(a, vcast_vd_d(200))), visnan_vo_vd(r)));\n  r = vsel_vd_vo_vd_vd(o, vmulsign_vd_vd_vd(vcast_vd_d(SLEEF_INFINITY), a), r);\n  \n  return r;\n}\n\nEXPORT CONST VECTOR_CC vdouble xlgamma_u1(vdouble a) {\n  dd2 d = gammak(a);\n  vdouble2 y = ddadd2_vd2_vd2_vd2(dd2geta_vd2_dd2(d), logk2(ddabs_vd2_vd2(dd2getb_vd2_dd2(d))));\n  vdouble r = vadd_vd_vd_vd(vd2getx_vd_vd2(y), vd2gety_vd_vd2(y));\n  vopmask o;\n\n  o = vor_vo_vo_vo(visinf_vo_vd(a),\n\t\t   vor_vo_vo_vo(vand_vo_vo_vo(vle_vo_vd_vd(a, vcast_vd_d(0)), visint_vo_vd(a)),\n\t\t\t\tvand_vo_vo_vo(visnumber_vo_vd(a), visnan_vo_vd(r))));\n  r = vsel_vd_vo_vd_vd(o, vcast_vd_d(SLEEF_INFINITY), r);\n\n  return r;\n}\n\n/* TODO AArch64: potential optimization by using `vfmad_lane_f64` 
*/\nEXPORT CONST VECTOR_CC vdouble xerf_u1(vdouble a) {\n  vdouble s = a, t, u;\n  vdouble2 d;\n\n  a = vabs_vd_vd(a);\n  vopmask o0 = vlt_vo_vd_vd(a, vcast_vd_d(1.0));\n  vopmask o1 = vlt_vo_vd_vd(a, vcast_vd_d(3.7));\n  vopmask o2 = vlt_vo_vd_vd(a, vcast_vd_d(6.0));\n  u = vsel_vd_vo_vd_vd(o0, vmul_vd_vd_vd(a, a), a);\n  \n  t = vsel_vd_vo_vo_d_d_d(o0, o1, +0.6801072401395392157e-20, +0.2830954522087717660e-13, -0.5846750404269610493e-17);\n  t = vmla_vd_vd_vd_vd(t, u, vsel_vd_vo_vo_d_d_d(o0, o1, -0.2161766247570056391e-18, -0.1509491946179481940e-11, +0.6076691048812607898e-15));\n  t = vmla_vd_vd_vd_vd(t, u, vsel_vd_vo_vo_d_d_d(o0, o1, +0.4695919173301598752e-17, +0.3827857177807173152e-10, -0.3007518609604893831e-13));\n  t = vmla_vd_vd_vd_vd(t, u, vsel_vd_vo_vo_d_d_d(o0, o1, -0.9049140419888010819e-16, -0.6139733921558987241e-09, +0.9427906260824646063e-12));\n  t = vmla_vd_vd_vd_vd(t, u, vsel_vd_vo_vo_d_d_d(o0, o1, +0.1634018903557411517e-14, +0.6985387934608038824e-08, -0.2100110908269393629e-10));\n  t = vmla_vd_vd_vd_vd(t, u, vsel_vd_vo_vo_d_d_d(o0, o1, -0.2783485786333455216e-13, -0.5988224513034371474e-07, +0.3534639523461223473e-09));\n  t = vmla_vd_vd_vd_vd(t, u, vsel_vd_vo_vo_d_d_d(o0, o1, +0.4463221276786412722e-12, +0.4005716952355346640e-06, -0.4664967728285395926e-08));\n  t = vmla_vd_vd_vd_vd(t, u, vsel_vd_vo_vo_d_d_d(o0, o1, -0.6711366622850138987e-11, -0.2132190104575784400e-05, +0.4943823283769000532e-07));\n  t = vmla_vd_vd_vd_vd(t, u, vsel_vd_vo_vo_d_d_d(o0, o1, +0.9422759050232658346e-10, +0.9092461304042630325e-05, -0.4271203394761148254e-06));\n  t = vmla_vd_vd_vd_vd(t, u, vsel_vd_vo_vo_d_d_d(o0, o1, -0.1229055530100228477e-08, -0.3079188080966205457e-04, +0.3034067677404915895e-05));\n  t = vmla_vd_vd_vd_vd(t, u, vsel_vd_vo_vo_d_d_d(o0, o1, +0.1480719281585085023e-07, +0.7971413443082370762e-04, -0.1776295289066871135e-04));\n  t = vmla_vd_vd_vd_vd(t, u, vsel_vd_vo_vo_d_d_d(o0, o1, -0.1636584469123402714e-06, -0.1387853215225442864e-03, 
+0.8524547630559505050e-04));\n  t = vmla_vd_vd_vd_vd(t, u, vsel_vd_vo_vo_d_d_d(o0, o1, +0.1646211436588923363e-05, +0.6469678026257590965e-04, -0.3290582944961784398e-03));\n  t = vmla_vd_vd_vd_vd(t, u, vsel_vd_vo_vo_d_d_d(o0, o1, -0.1492565035840624866e-04, +0.4996645280372945860e-03, +0.9696966068789101157e-03));\n  t = vmla_vd_vd_vd_vd(t, u, vsel_vd_vo_vo_d_d_d(o0, o1, +0.1205533298178966496e-03, -0.1622802482842520535e-02, -0.1812527628046986137e-02));\n  t = vmla_vd_vd_vd_vd(t, u, vsel_vd_vo_vo_d_d_d(o0, o1, -0.8548327023450851166e-03, +0.1615320557049377171e-03, -0.4725409828123619017e-03));\n  t = vmla_vd_vd_vd_vd(t, u, vsel_vd_vo_vo_d_d_d(o0, o1, +0.5223977625442188799e-02, +0.1915262325574875607e-01, +0.2090315427924229266e-01));\n  t = vmla_vd_vd_vd_vd(t, u, vsel_vd_vo_vo_d_d_d(o0, o1, -0.2686617064513125569e-01, -0.1027818298486033455e+00, -0.1052041921842776645e+00));\n  t = vmla_vd_vd_vd_vd(t, u, vsel_vd_vo_vo_d_d_d(o0, o1, +0.1128379167095512753e+00, -0.6366172819842503827e+00, -0.6345351808766568347e+00));\n  t = vmla_vd_vd_vd_vd(t, u, vsel_vd_vo_vo_d_d_d(o0, o1, -0.3761263890318375380e+00, -0.1128379590648910469e+01, -0.1129442929103524396e+01));\n  d = ddmul_vd2_vd_vd(t, u);\n\n  d = ddadd2_vd2_vd2_vd2(d, vcast_vd2_vd_vd(vsel_vd_vo_vo_d_d_d(o0, o1, 1.1283791670955125586, 3.4110644736196137587e-08, 0.00024963035690526438285),\n\t\t\t\t\t    vsel_vd_vo_vo_d_d_d(o0, o1, 1.5335459613165822674e-17, -2.4875650708323294246e-24, -5.4362665034856259795e-21)));\n  d = vsel_vd2_vo_vd2_vd2(o0, ddmul_vd2_vd2_vd(d, a), ddadd_vd2_vd_vd2(vcast_vd_d(1.0), ddneg_vd2_vd2(expk2(d))));\n\n  u = vmulsign_vd_vd_vd(vsel_vd_vo_vd_vd(o2, vadd_vd_vd_vd(vd2getx_vd_vd2(d), vd2gety_vd_vd2(d)), vcast_vd_d(1)), s);\n  u = vsel_vd_vo_vd_vd(visnan_vo_vd(a), vcast_vd_d(SLEEF_NAN), u);\n\n  return u;\n}\n\n/* TODO AArch64: potential optimization by using `vfmad_lane_f64` */\nEXPORT CONST VECTOR_CC vdouble xerfc_u15(vdouble a) {\n  vdouble s = a, r = vcast_vd_d(0), t;\n  vdouble2 u, 
d, x;\n  a = vabs_vd_vd(a);\n  vopmask o0 = vlt_vo_vd_vd(a, vcast_vd_d(1.0));\n  vopmask o1 = vlt_vo_vd_vd(a, vcast_vd_d(2.2));\n  vopmask o2 = vlt_vo_vd_vd(a, vcast_vd_d(4.2));\n  vopmask o3 = vlt_vo_vd_vd(a, vcast_vd_d(27.3));\n\n  u = vsel_vd2_vo_vd2_vd2(o0, ddmul_vd2_vd_vd(a, a), vsel_vd2_vo_vd2_vd2(o1, vcast_vd2_vd_vd(a, vcast_vd_d(0)), dddiv_vd2_vd2_vd2(vcast_vd2_d_d(1, 0), vcast_vd2_vd_vd(a, vcast_vd_d(0)))));\n\n  t = vsel_vd_vo_vo_vo_d_d_d_d(o0, o1, o2, +0.6801072401395386139e-20, +0.3438010341362585303e-12, -0.5757819536420710449e+2, +0.2334249729638701319e+5);\n  t = vmla_vd_vd_vd_vd(t, vd2getx_vd_vd2(u), vsel_vd_vo_vo_vo_d_d_d_d(o0, o1, o2, -0.2161766247570055669e-18, -0.1237021188160598264e-10, +0.4669289654498104483e+3, -0.4695661044933107769e+5));\n  t = vmla_vd_vd_vd_vd(t, vd2getx_vd_vd2(u), vsel_vd_vo_vo_vo_d_d_d_d(o0, o1, o2, +0.4695919173301595670e-17, +0.2117985839877627852e-09, -0.1796329879461355858e+4, +0.3173403108748643353e+5));\n  t = vmla_vd_vd_vd_vd(t, vd2getx_vd_vd2(u), vsel_vd_vo_vo_vo_d_d_d_d(o0, o1, o2, -0.9049140419888007122e-16, -0.2290560929177369506e-08, +0.4355892193699575728e+4, +0.3242982786959573787e+4));\n  t = vmla_vd_vd_vd_vd(t, vd2getx_vd_vd2(u), vsel_vd_vo_vo_vo_d_d_d_d(o0, o1, o2, +0.1634018903557410728e-14, +0.1748931621698149538e-07, -0.7456258884965764992e+4, -0.2014717999760347811e+5));\n  t = vmla_vd_vd_vd_vd(t, vd2getx_vd_vd2(u), vsel_vd_vo_vo_vo_d_d_d_d(o0, o1, o2, -0.2783485786333451745e-13, -0.9956602606623249195e-07, +0.9553977358167021521e+4, +0.1554006970967118286e+5));\n  t = vmla_vd_vd_vd_vd(t, vd2getx_vd_vd2(u), vsel_vd_vo_vo_vo_d_d_d_d(o0, o1, o2, +0.4463221276786415752e-12, +0.4330010240640327080e-06, -0.9470019905444229153e+4, -0.6150874190563554293e+4));\n  t = vmla_vd_vd_vd_vd(t, vd2getx_vd_vd2(u), vsel_vd_vo_vo_vo_d_d_d_d(o0, o1, o2, -0.6711366622850136563e-11, -0.1435050600991763331e-05, +0.7387344321849855078e+4, +0.1240047765634815732e+4));\n  t = vmla_vd_vd_vd_vd(t, vd2getx_vd_vd2(u), 
vsel_vd_vo_vo_vo_d_d_d_d(o0, o1, o2, +0.9422759050232662223e-10, +0.3460139479650695662e-05, -0.4557713054166382790e+4, -0.8210325475752699731e+2));\n  t = vmla_vd_vd_vd_vd(t, vd2getx_vd_vd2(u), vsel_vd_vo_vo_vo_d_d_d_d(o0, o1, o2, -0.1229055530100229098e-08, -0.4988908180632898173e-05, +0.2207866967354055305e+4, +0.3242443880839930870e+2));\n  t = vmla_vd_vd_vd_vd(t, vd2getx_vd_vd2(u), vsel_vd_vo_vo_vo_d_d_d_d(o0, o1, o2, +0.1480719281585086512e-07, -0.1308775976326352012e-05, -0.8217975658621754746e+3, -0.2923418863833160586e+2));\n  t = vmla_vd_vd_vd_vd(t, vd2getx_vd_vd2(u), vsel_vd_vo_vo_vo_d_d_d_d(o0, o1, o2, -0.1636584469123399803e-06, +0.2825086540850310103e-04, +0.2268659483507917400e+3, +0.3457461732814383071e+0));\n  t = vmla_vd_vd_vd_vd(t, vd2getx_vd_vd2(u), vsel_vd_vo_vo_vo_d_d_d_d(o0, o1, o2, +0.1646211436588923575e-05, -0.6393913713069986071e-04, -0.4633361260318560682e+2, +0.5489730155952392998e+1));\n  t = vmla_vd_vd_vd_vd(t, vd2getx_vd_vd2(u), vsel_vd_vo_vo_vo_d_d_d_d(o0, o1, o2, -0.1492565035840623511e-04, -0.2566436514695078926e-04, +0.9557380123733945965e+1, +0.1559934132251294134e-2));\n  t = vmla_vd_vd_vd_vd(t, vd2getx_vd_vd2(u), vsel_vd_vo_vo_vo_d_d_d_d(o0, o1, o2, +0.1205533298178967851e-03, +0.5895792375659440364e-03, -0.2958429331939661289e+1, -0.1541741566831520638e+1));\n  t = vmla_vd_vd_vd_vd(t, vd2getx_vd_vd2(u), vsel_vd_vo_vo_vo_d_d_d_d(o0, o1, o2, -0.8548327023450850081e-03, -0.1695715579163588598e-02, +0.1670329508092765480e+0, +0.2823152230558364186e-5));\n  t = vmla_vd_vd_vd_vd(t, vd2getx_vd_vd2(u), vsel_vd_vo_vo_vo_d_d_d_d(o0, o1, o2, +0.5223977625442187932e-02, +0.2089116434918055149e-03, +0.6096615680115419211e+0, +0.6249999184195342838e+0));\n  t = vmla_vd_vd_vd_vd(t, vd2getx_vd_vd2(u), vsel_vd_vo_vo_vo_d_d_d_d(o0, o1, o2, -0.2686617064513125222e-01, +0.1912855949584917753e-01, +0.1059212443193543585e-2, +0.1741749416408701288e-8));\n\n  d = ddmul_vd2_vd2_vd(u, t);\n  d = ddadd2_vd2_vd2_vd2(d, 
vcast_vd2_vd_vd(vsel_vd_vo_vo_vo_d_d_d_d(o0, o1, o2, 0.11283791670955126141, -0.10277263343147646779, -0.50005180473999022439, -0.5000000000258444377),\n\t\t\t\t\t    vsel_vd_vo_vo_vo_d_d_d_d(o0, o1, o2, -4.0175691625932118483e-18, -6.2338714083404900225e-18, 2.6362140569041995803e-17, -4.0074044712386992281e-17)));\n  d = ddmul_vd2_vd2_vd2(d, u);\n  d = ddadd2_vd2_vd2_vd2(d, vcast_vd2_vd_vd(vsel_vd_vo_vo_vo_d_d_d_d(o0, o1, o2, -0.37612638903183753802, -0.63661976742916359662, 1.601106273924963368e-06, 2.3761973137523364792e-13),\n\t\t\t\t\t    vsel_vd_vo_vo_vo_d_d_d_d(o0, o1, o2, 1.3391897206042552387e-17, 7.6321019159085724662e-18, 1.1974001857764476775e-23, -1.1670076950531026582e-29)));\n  d = ddmul_vd2_vd2_vd2(d, u);\n  d = ddadd2_vd2_vd2_vd2(d, vcast_vd2_vd_vd(vsel_vd_vo_vo_vo_d_d_d_d(o0, o1, o2, 1.1283791670955125586, -1.1283791674717296161, -0.57236496645145429341, -0.57236494292470108114),\n\t\t\t\t\t    vsel_vd_vo_vo_vo_d_d_d_d(o0, o1, o2, 1.5335459613165822674e-17, 8.0896847755965377194e-17, 3.0704553245872027258e-17, -2.3984352208056898003e-17)));\n  \n  x = ddmul_vd2_vd2_vd(vsel_vd2_vo_vd2_vd2(o1, d, vcast_vd2_vd_vd(vneg_vd_vd(a), vcast_vd_d(0))), a);\n  x = vsel_vd2_vo_vd2_vd2(o1, x, ddadd2_vd2_vd2_vd2(x, d));\n  x = vsel_vd2_vo_vd2_vd2(o0, ddsub_vd2_vd2_vd2(vcast_vd2_d_d(1, 0), x), expk2(x));\n  x = vsel_vd2_vo_vd2_vd2(o1, x, ddmul_vd2_vd2_vd2(x, u));\n\n  r = vsel_vd_vo_vd_vd(o3, vadd_vd_vd_vd(vd2getx_vd_vd2(x), vd2gety_vd_vd2(x)), vcast_vd_d(0));\n  r = vsel_vd_vo_vd_vd(vsignbit_vo_vd(s), vsub_vd_vd_vd(vcast_vd_d(2), r), r);\n  r = vsel_vd_vo_vd_vd(visnan_vo_vd(s), vcast_vd_d(SLEEF_NAN), r);\n  return r;\n}\n#endif // #if !defined(DETERMINISTIC)\n\n#if !defined(DETERMINISTIC) && !defined(ENABLE_GNUABI) && !defined(SLEEF_GENHEADER)\n// The normal and deterministic versions of implementations are common\n// for the functions like sincospi_u05. Aliases are defined by\n// DALIAS_* macros for such functions. The defined aliases\n// (e.g. 
ysincospi_u05) are renamed(e.g. to\n// Sleef_cinz_sincospid2_u05sse2) by rename*.h.\n\n#ifdef ENABLE_ALIAS\n#define DALIAS_vd_vd(FUNC) EXPORT CONST VECTOR_CC vdouble y ## FUNC(vdouble) __attribute__((alias( stringify(x ## FUNC) )));\n#define DALIAS_vd2_vd(FUNC) EXPORT CONST VECTOR_CC vdouble2 y ## FUNC(vdouble) __attribute__((alias( stringify(x ## FUNC) )));\n#define DALIAS_vi_vd(FUNC) EXPORT CONST VECTOR_CC vint y ## FUNC(vdouble) __attribute__((alias( stringify(x ## FUNC) )));\n#define DALIAS_vd_vd_vd(FUNC) EXPORT CONST VECTOR_CC vdouble y ## FUNC(vdouble, vdouble) __attribute__((alias( stringify(x ## FUNC) )));\n#define DALIAS_vd_vd_vd_vd(FUNC) EXPORT CONST VECTOR_CC vdouble y ## FUNC(vdouble, vdouble, vdouble) __attribute__((alias( stringify(x ## FUNC) )));\n#else\n#define DALIAS_vd_vd(FUNC) EXPORT CONST VECTOR_CC vdouble y ## FUNC(vdouble d) { return x ## FUNC (d); }\n#define DALIAS_vd2_vd(FUNC) EXPORT CONST VECTOR_CC vdouble2 y ## FUNC(vdouble d) { return x ## FUNC (d); }\n#define DALIAS_vi_vd(FUNC) EXPORT CONST VECTOR_CC vint y ## FUNC(vdouble d) { return x ## FUNC (d); }\n#define DALIAS_vd_vd_vd(FUNC) EXPORT CONST VECTOR_CC vdouble y ## FUNC(vdouble x, vdouble y) { return x ## FUNC (x, y); }\n#define DALIAS_vd_vd_vd_vd(FUNC) EXPORT CONST VECTOR_CC vdouble y ## FUNC(vdouble x, vdouble y, vdouble z) { return x ## FUNC (x, y, z); }\n#endif\n\n/* DALIAS_vd2_vd(sincospi_u05) */\n/* DALIAS_vd2_vd(sincospi_u35) */\n/* DALIAS_vd2_vd(modf) */\n/* DALIAS_vd_vd(log) */\n/* DALIAS_vd_vd(log_u1) */\n/* DALIAS_vd_vd_vd(pow) */\n/* DALIAS_vd_vd(sinh) */\n/* DALIAS_vd_vd(cosh) */\n/* DALIAS_vd_vd(tanh) */\n/* DALIAS_vd_vd(sinh_u35) */\n/* DALIAS_vd_vd(cosh_u35) */\n/* DALIAS_vd_vd(tanh_u35) */\n/* DALIAS_vd_vd(asinh) */\n/* DALIAS_vd_vd(acosh) */\n/* DALIAS_vd_vd(atanh) */\n/* DALIAS_vd_vd(cbrt) */\n/* DALIAS_vd_vd(cbrt_u1) */\n/* DALIAS_vd_vd(expm1) */\n/* DALIAS_vd_vd(log10) */\n/* DALIAS_vd_vd(log2) */\n/* DALIAS_vd_vd(log2_u35) */\n/* DALIAS_vd_vd(log1p) */\n/* 
DALIAS_vd_vd(fabs) */\n/* DALIAS_vd_vd_vd(copysign) */\n/* DALIAS_vd_vd_vd(fmax) */\n/* DALIAS_vd_vd_vd(fmin) */\n/* DALIAS_vd_vd_vd(fdim) */\n/* DALIAS_vd_vd(trunc) */\n/* DALIAS_vd_vd(floor) */\n/* DALIAS_vd_vd(ceil) */\n/* DALIAS_vd_vd(round) */\n/* DALIAS_vd_vd(rint) */\n/* DALIAS_vd_vd_vd(nextafter) */\n/* DALIAS_vd_vd(frfrexp) */\n/* DALIAS_vi_vd(expfrexp) */\n/* DALIAS_vd_vd_vd_vd(fma) */\n/* DALIAS_vd_vd(sqrt_u05) */\n/* DALIAS_vd_vd(sqrt_u35) */\n/* DALIAS_vd_vd_vd(hypot_u05) */\n/* DALIAS_vd_vd_vd(hypot_u35) */\n/* DALIAS_vd_vd_vd(fmod) */\n/* DALIAS_vd_vd_vd(remainder) */\n/* DALIAS_vd_vd(tgamma_u1) */\n/* DALIAS_vd_vd(lgamma_u1) */\n/* DALIAS_vd_vd(erf_u1) */\n/* DALIAS_vd_vd(erfc_u15) */\n#endif // #if !defined(DETERMINISTIC) && !defined(ENABLE_GNUABI) && !defined(SLEEF_GENHEADER)\n\n#if !defined(ENABLE_GNUABI) && !defined(SLEEF_GENHEADER)\nEXPORT CONST int xgetInt(int name) {\n  if (1 <= name && name <= 10) return vavailability_i(name);\n  return 0;\n}\n\nEXPORT CONST void *xgetPtr(int name) {\n  if (name == 0) return ISANAME;\n  return (void *)0;\n}\n#endif\n\n#if defined(ALIAS_NO_EXT_SUFFIX) && !defined(DETERMINISTIC)\n#include ALIAS_NO_EXT_SUFFIX\n#endif\n\n#ifdef ENABLE_MAIN\n// gcc -DENABLE_MAIN -Wno-attributes -I../common -I../arch -DENABLE_AVX2 -mavx2 -mfma sleefsimddp.c rempitab.c ../common/common.c -lm\n#include <stdio.h>\n#include <stdlib.h>\n#include <math.h>\nint main(int argc, char **argv) {\n  vdouble d1 = vcast_vd_d(atof(argv[1]));\n  vdouble d2 = vcast_vd_d(atof(argv[2]));\n  //vdouble d3 = vcast_vd_d(atof(argv[3]));\n  //vdouble r = xnextafter(d1, d2);\n  //int i;\n  //double fr = frexp(atof(argv[1]), &i);\n  //printf(\"%.20g\\n\", xfma(d1, d2, d3)[0]);;\n  //printf(\"test %.20g\\n\", xtgamma_u1(d1)[0]);\n  //printf(\"corr %.20g\\n\", tgamma(d1[0]));\n  //printf(\"test %.20g\\n\", xerf_u1(d1)[0]);\n  //printf(\"corr %.20g\\n\", erf(d1[0]));\n  //printf(\"test %.20g\\n\", xerfc_u15(d1)[0]);\n  //printf(\"corr %.20g\\n\", 
erfc(d1[0]));\n  //printf(\"%.20g\\n\", nextafter(d1[0], d2[0]));;\n  //printf(\"%.20g\\n\", vcast_d_vd(xhypot_u05(d1, d2)));\n  //printf(\"%.20g\\n\", fr);\n  printf(\"%.20g\\n\", fmod(atof(argv[1]), atof(argv[2])));\n  printf(\"%.20g\\n\", xfmod(d1, d2)[0]);\n  //vdouble2 r = xsincospi_u35(a);\n  //printf(\"%g, %g\\n\", vcast_d_vd(r.x), vcast_d_vd(r.y));\n}\n#endif\n\n#ifdef ENABLE_GNUABI\n/* \"finite\" aliases for compatibility with GLIBC */\nEXPORT CONST VECTOR_CC vdouble __acos_finite     (vdouble)          __attribute__((weak, alias(str_xacos     )));\nEXPORT CONST VECTOR_CC vdouble __acosh_finite    (vdouble)          __attribute__((weak, alias(str_xacosh    )));\nEXPORT CONST VECTOR_CC vdouble __asin_finite     (vdouble)          __attribute__((weak, alias(str_xasin_u1  )));\nEXPORT CONST VECTOR_CC vdouble __atan2_finite    (vdouble, vdouble) __attribute__((weak, alias(str_xatan2_u1 )));\nEXPORT CONST VECTOR_CC vdouble __atanh_finite    (vdouble)          __attribute__((weak, alias(str_xatanh    )));\nEXPORT CONST VECTOR_CC vdouble __cosh_finite     (vdouble)          __attribute__((weak, alias(str_xcosh     )));\nEXPORT CONST VECTOR_CC vdouble __exp10_finite    (vdouble)          __attribute__((weak, alias(str_xexp10    )));\nEXPORT CONST VECTOR_CC vdouble __exp2_finite     (vdouble)          __attribute__((weak, alias(str_xexp2     )));\nEXPORT CONST VECTOR_CC vdouble __exp_finite      (vdouble)          __attribute__((weak, alias(str_xexp      )));\nEXPORT CONST VECTOR_CC vdouble __fmod_finite     (vdouble, vdouble) __attribute__((weak, alias(str_xfmod     )));\nEXPORT CONST VECTOR_CC vdouble __remainder_finite(vdouble, vdouble) __attribute__((weak, alias(str_xremainder)));\nEXPORT CONST VECTOR_CC vdouble __modf_finite     (vdouble, vdouble *) __attribute__((weak, alias(str_xmodf   )));\nEXPORT CONST VECTOR_CC vdouble __hypot_u05_finite(vdouble, vdouble) __attribute__((weak, alias(str_xhypot_u05)));\nEXPORT CONST VECTOR_CC vdouble 
__lgamma_u1_finite(vdouble)          __attribute__((weak, alias(str_xlgamma_u1)));\nEXPORT CONST VECTOR_CC vdouble __log10_finite    (vdouble)          __attribute__((weak, alias(str_xlog10    )));\nEXPORT CONST VECTOR_CC vdouble __log_finite      (vdouble)          __attribute__((weak, alias(str_xlog_u1   )));\nEXPORT CONST VECTOR_CC vdouble __pow_finite      (vdouble, vdouble) __attribute__((weak, alias(str_xpow      )));\nEXPORT CONST VECTOR_CC vdouble __sinh_finite     (vdouble)          __attribute__((weak, alias(str_xsinh     )));\nEXPORT CONST VECTOR_CC vdouble __sqrt_finite     (vdouble)          __attribute__((weak, alias(str_xsqrt     )));\nEXPORT CONST VECTOR_CC vdouble __tgamma_u1_finite(vdouble)          __attribute__((weak, alias(str_xtgamma_u1)));\n\n#ifdef HEADER_MASKED\n#include HEADER_MASKED\n#endif\n#endif /* #ifdef ENABLE_GNUABI */\n"
  },
  {
    "path": "src/sleefsimddp_emulation.c",
    "content": "/*\n\nCopyright (c) 2021 Agenium Scale\n\nPermission is hereby granted, free of charge, to any person obtaining a copy\nof this software and associated documentation files (the \"Software\"), to deal\nin the Software without restriction, including without limitation the rights\nto use, copy, modify, merge, publish, distribute, sublicense, and/or sell\ncopies of the Software, and to permit persons to whom the Software is\nfurnished to do so, subject to the following conditions:\n\nThe above copyright notice and this permission notice shall be included in all\ncopies or substantial portions of the Software.\n\nTHE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\nIMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\nFITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\nAUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\nLIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\nOUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\nSOFTWARE.\n\n*/\n\n#include <nsimd/nsimd.h>\n\n#ifdef ENABLE_NEON32\n#include \"renameneon32.h\"\n#define nsimd_vec_f64 nsimd_neon128_vf64\n#endif\n\n#ifdef ENABLE_VSX\n#include \"renamevsx.h\"\n#define nsimd_vec_f64 nsimd_vmx_vf64\n#endif\n\n\nnsimd_vec_f64 xsin(nsimd_vec_f64 a0_) {\n  nsimd_vec_f64 ret_;\n  nsimd_cpu_vf64 a0, ret;\n  a0.v0 = a0_.v0;\n  a0.v1 = a0_.v1;\n  ret = nsimd_sin_u35_cpu_f64(a0);\n  ret_.v0 = ret.v0;\n  ret_.v1 = ret.v1;\n  return ret_;\n}\n\n\nnsimd_vec_f64 xcos(nsimd_vec_f64 a0_) {\n  nsimd_vec_f64 ret_;\n  nsimd_cpu_vf64 a0, ret;\n  a0.v0 = a0_.v0;\n  a0.v1 = a0_.v1;\n  ret = nsimd_cos_u35_cpu_f64(a0);\n  ret_.v0 = ret.v0;\n  ret_.v1 = ret.v1;\n  return ret_;\n}\n\nnsimd_vec_f64 xtan(nsimd_vec_f64 a0_) {\n  nsimd_vec_f64 ret_;\n  nsimd_cpu_vf64 a0, ret;\n  a0.v0 = a0_.v0;\n  a0.v1 = a0_.v1;\n  ret = nsimd_tan_u35_cpu_f64(a0);\n  ret_.v0 = ret.v0;\n  ret_.v1 = 
ret.v1;\n  return ret_;\n}\n\nnsimd_vec_f64 xasin(nsimd_vec_f64 a0_) {\n  nsimd_vec_f64 ret_;\n  nsimd_cpu_vf64 a0, ret;\n  a0.v0 = a0_.v0;\n  a0.v1 = a0_.v1;\n  ret = nsimd_asin_u35_cpu_f64(a0);\n  ret_.v0 = ret.v0;\n  ret_.v1 = ret.v1;\n  return ret_;\n}\n\nnsimd_vec_f64 xacos(nsimd_vec_f64 a0_) {\n  nsimd_vec_f64 ret_;\n  nsimd_cpu_vf64 a0, ret;\n  a0.v0 = a0_.v0;\n  a0.v1 = a0_.v1;\n  ret = nsimd_acos_u35_cpu_f64(a0);\n  ret_.v0 = ret.v0;\n  ret_.v1 = ret.v1;\n  return ret_;\n}\n\nnsimd_vec_f64 xatan(nsimd_vec_f64 a0_) {\n  nsimd_vec_f64 ret_;\n  nsimd_cpu_vf64 a0, ret;\n  a0.v0 = a0_.v0;\n  a0.v1 = a0_.v1;\n  ret = nsimd_atan_u35_cpu_f64(a0);\n  ret_.v0 = ret.v0;\n  ret_.v1 = ret.v1;\n  return ret_;\n}\n\nnsimd_vec_f64 xatan2(nsimd_vec_f64 a0_, nsimd_vec_f64 a1_) {\n  nsimd_vec_f64 ret_;\n  nsimd_cpu_vf64 a0, a1, ret;\n  a0.v0 = a0_.v0;\n  a0.v1 = a0_.v1;\n  a1.v0 = a1_.v0;\n  a1.v1 = a1_.v1;\n  ret = nsimd_atan2_u35_cpu_f64(a0, a1);\n  ret_.v0 = ret.v0;\n  ret_.v1 = ret.v1;\n  return ret_;\n}\n\nnsimd_vec_f64 xlog(nsimd_vec_f64 a0_) {\n  nsimd_vec_f64 ret_;\n  nsimd_cpu_vf64 a0, ret;\n  a0.v0 = a0_.v0;\n  a0.v1 = a0_.v1;\n  ret = nsimd_log_u35_cpu_f64(a0);\n  ret_.v0 = ret.v0;\n  ret_.v1 = ret.v1;\n  return ret_;\n}\n\nnsimd_vec_f64 xcbrt(nsimd_vec_f64 a0_) {\n  nsimd_vec_f64 ret_;\n  nsimd_cpu_vf64 a0, ret;\n  a0.v0 = a0_.v0;\n  a0.v1 = a0_.v1;\n  ret = nsimd_cbrt_u35_cpu_f64(a0);\n  ret_.v0 = ret.v0;\n  ret_.v1 = ret.v1;\n  return ret_;\n}\n\nnsimd_vec_f64 xsin_u1(nsimd_vec_f64 a0_) {\n  nsimd_vec_f64 ret_;\n  nsimd_cpu_vf64 a0, ret;\n  a0.v0 = a0_.v0;\n  a0.v1 = a0_.v1;\n  ret = nsimd_sin_u10_cpu_f64(a0);\n  ret_.v0 = ret.v0;\n  ret_.v1 = ret.v1;\n  return ret_;\n}\n\nnsimd_vec_f64 xcos_u1(nsimd_vec_f64 a0_) {\n  nsimd_vec_f64 ret_;\n  nsimd_cpu_vf64 a0, ret;\n  a0.v0 = a0_.v0;\n  a0.v1 = a0_.v1;\n  ret = nsimd_cos_u10_cpu_f64(a0);\n  ret_.v0 = ret.v0;\n  ret_.v1 = ret.v1;\n  return ret_;\n}\n\nnsimd_vec_f64 xtan_u1(nsimd_vec_f64 a0_) {\n  nsimd_vec_f64 
ret_;\n  nsimd_cpu_vf64 a0, ret;\n  a0.v0 = a0_.v0;\n  a0.v1 = a0_.v1;\n  ret = nsimd_tan_u10_cpu_f64(a0);\n  ret_.v0 = ret.v0;\n  ret_.v1 = ret.v1;\n  return ret_;\n}\n\nnsimd_vec_f64 xasin_u1(nsimd_vec_f64 a0_) {\n  nsimd_vec_f64 ret_;\n  nsimd_cpu_vf64 a0, ret;\n  a0.v0 = a0_.v0;\n  a0.v1 = a0_.v1;\n  ret = nsimd_asin_u10_cpu_f64(a0);\n  ret_.v0 = ret.v0;\n  ret_.v1 = ret.v1;\n  return ret_;\n}\n\nnsimd_vec_f64 xacos_u1(nsimd_vec_f64 a0_) {\n  nsimd_vec_f64 ret_;\n  nsimd_cpu_vf64 a0, ret;\n  a0.v0 = a0_.v0;\n  a0.v1 = a0_.v1;\n  ret = nsimd_acos_u10_cpu_f64(a0);\n  ret_.v0 = ret.v0;\n  ret_.v1 = ret.v1;\n  return ret_;\n}\n\nnsimd_vec_f64 xatan_u1(nsimd_vec_f64 a0_) {\n  nsimd_vec_f64 ret_;\n  nsimd_cpu_vf64 a0, ret;\n  a0.v0 = a0_.v0;\n  a0.v1 = a0_.v1;\n  ret = nsimd_atan_u10_cpu_f64(a0);\n  ret_.v0 = ret.v0;\n  ret_.v1 = ret.v1;\n  return ret_;\n}\n\nnsimd_vec_f64 xatan2_u1(nsimd_vec_f64 a0_, nsimd_vec_f64 a1_) {\n  nsimd_vec_f64 ret_;\n  nsimd_cpu_vf64 a0, a1, ret;\n  a0.v0 = a0_.v0;\n  a0.v1 = a0_.v1;\n  a1.v0 = a1_.v0;\n  a1.v1 = a1_.v1;\n  ret = nsimd_atan2_u10_cpu_f64(a0, a1);\n  ret_.v0 = ret.v0;\n  ret_.v1 = ret.v1;\n  return ret_;\n}\n\nnsimd_vec_f64 xlog_u1(nsimd_vec_f64 a0_) {\n  nsimd_vec_f64 ret_;\n  nsimd_cpu_vf64 a0, ret;\n  a0.v0 = a0_.v0;\n  a0.v1 = a0_.v1;\n  ret = nsimd_log_u10_cpu_f64(a0);\n  ret_.v0 = ret.v0;\n  ret_.v1 = ret.v1;\n  return ret_;\n}\n\nnsimd_vec_f64 xcbrt_u1(nsimd_vec_f64 a0_) {\n  nsimd_vec_f64 ret_;\n  nsimd_cpu_vf64 a0, ret;\n  a0.v0 = a0_.v0;\n  a0.v1 = a0_.v1;\n  ret = nsimd_cbrt_u10_cpu_f64(a0);\n  ret_.v0 = ret.v0;\n  ret_.v1 = ret.v1;\n  return ret_;\n}\n\nnsimd_vec_f64 xexp(nsimd_vec_f64 a0_) {\n  nsimd_vec_f64 ret_;\n  nsimd_cpu_vf64 a0, ret;\n  a0.v0 = a0_.v0;\n  a0.v1 = a0_.v1;\n  ret = nsimd_exp_u10_cpu_f64(a0);\n  ret_.v0 = ret.v0;\n  ret_.v1 = ret.v1;\n  return ret_;\n}\n\nnsimd_vec_f64 xpow(nsimd_vec_f64 a0_, nsimd_vec_f64 a1_) {\n  nsimd_vec_f64 ret_;\n  nsimd_cpu_vf64 a0, a1, ret;\n  a0.v0 = a0_.v0;\n  
a0.v1 = a0_.v1;\n  a1.v0 = a1_.v0;\n  a1.v1 = a1_.v1;\n  ret = nsimd_pow_u10_cpu_f64(a0, a1);\n  ret_.v0 = ret.v0;\n  ret_.v1 = ret.v1;\n  return ret_;\n}\n\nnsimd_vec_f64 xsinh(nsimd_vec_f64 a0_) {\n  nsimd_vec_f64 ret_;\n  nsimd_cpu_vf64 a0, ret;\n  a0.v0 = a0_.v0;\n  a0.v1 = a0_.v1;\n  ret = nsimd_sinh_u10_cpu_f64(a0);\n  ret_.v0 = ret.v0;\n  ret_.v1 = ret.v1;\n  return ret_;\n}\n\nnsimd_vec_f64 xcosh(nsimd_vec_f64 a0_) {\n  nsimd_vec_f64 ret_;\n  nsimd_cpu_vf64 a0, ret;\n  a0.v0 = a0_.v0;\n  a0.v1 = a0_.v1;\n  ret = nsimd_cosh_u10_cpu_f64(a0);\n  ret_.v0 = ret.v0;\n  ret_.v1 = ret.v1;\n  return ret_;\n}\n\nnsimd_vec_f64 xtanh(nsimd_vec_f64 a0_) {\n  nsimd_vec_f64 ret_;\n  nsimd_cpu_vf64 a0, ret;\n  a0.v0 = a0_.v0;\n  a0.v1 = a0_.v1;\n  ret = nsimd_tanh_u10_cpu_f64(a0);\n  ret_.v0 = ret.v0;\n  ret_.v1 = ret.v1;\n  return ret_;\n}\n\nnsimd_vec_f64 xsinh_u35(nsimd_vec_f64 a0_) {\n  nsimd_vec_f64 ret_;\n  nsimd_cpu_vf64 a0, ret;\n  a0.v0 = a0_.v0;\n  a0.v1 = a0_.v1;\n  ret = nsimd_sinh_u35_cpu_f64(a0);\n  ret_.v0 = ret.v0;\n  ret_.v1 = ret.v1;\n  return ret_;\n}\n\nnsimd_vec_f64 xcosh_u35(nsimd_vec_f64 a0_) {\n  nsimd_vec_f64 ret_;\n  nsimd_cpu_vf64 a0, ret;\n  a0.v0 = a0_.v0;\n  a0.v1 = a0_.v1;\n  ret = nsimd_cosh_u35_cpu_f64(a0);\n  ret_.v0 = ret.v0;\n  ret_.v1 = ret.v1;\n  return ret_;\n}\n\nnsimd_vec_f64 xtanh_u35(nsimd_vec_f64 a0_) {\n  nsimd_vec_f64 ret_;\n  nsimd_cpu_vf64 a0, ret;\n  a0.v0 = a0_.v0;\n  a0.v1 = a0_.v1;\n  ret = nsimd_tanh_u35_cpu_f64(a0);\n  ret_.v0 = ret.v0;\n  ret_.v1 = ret.v1;\n  return ret_;\n}\n\nnsimd_vec_f64 xasinh(nsimd_vec_f64 a0_) {\n  nsimd_vec_f64 ret_;\n  nsimd_cpu_vf64 a0, ret;\n  a0.v0 = a0_.v0;\n  a0.v1 = a0_.v1;\n  ret = nsimd_asinh_u10_cpu_f64(a0);\n  ret_.v0 = ret.v0;\n  ret_.v1 = ret.v1;\n  return ret_;\n}\n\nnsimd_vec_f64 xacosh(nsimd_vec_f64 a0_) {\n  nsimd_vec_f64 ret_;\n  nsimd_cpu_vf64 a0, ret;\n  a0.v0 = a0_.v0;\n  a0.v1 = a0_.v1;\n  ret = nsimd_acosh_u10_cpu_f64(a0);\n  ret_.v0 = ret.v0;\n  ret_.v1 = ret.v1;\n  
return ret_;\n}\n\nnsimd_vec_f64 xatanh(nsimd_vec_f64 a0_) {\n  nsimd_vec_f64 ret_;\n  nsimd_cpu_vf64 a0, ret;\n  a0.v0 = a0_.v0;\n  a0.v1 = a0_.v1;\n  ret = nsimd_atanh_u10_cpu_f64(a0);\n  ret_.v0 = ret.v0;\n  ret_.v1 = ret.v1;\n  return ret_;\n}\n\nnsimd_vec_f64 xexp2(nsimd_vec_f64 a0_) {\n  nsimd_vec_f64 ret_;\n  nsimd_cpu_vf64 a0, ret;\n  a0.v0 = a0_.v0;\n  a0.v1 = a0_.v1;\n  ret = nsimd_exp2_u10_cpu_f64(a0);\n  ret_.v0 = ret.v0;\n  ret_.v1 = ret.v1;\n  return ret_;\n}\n\nnsimd_vec_f64 xexp2_u35(nsimd_vec_f64 a0_) {\n  nsimd_vec_f64 ret_;\n  nsimd_cpu_vf64 a0, ret;\n  a0.v0 = a0_.v0;\n  a0.v1 = a0_.v1;\n  ret = nsimd_exp2_u35_cpu_f64(a0);\n  ret_.v0 = ret.v0;\n  ret_.v1 = ret.v1;\n  return ret_;\n}\n\nnsimd_vec_f64 xexp10(nsimd_vec_f64 a0_) {\n  nsimd_vec_f64 ret_;\n  nsimd_cpu_vf64 a0, ret;\n  a0.v0 = a0_.v0;\n  a0.v1 = a0_.v1;\n  ret = nsimd_exp10_u10_cpu_f64(a0);\n  ret_.v0 = ret.v0;\n  ret_.v1 = ret.v1;\n  return ret_;\n}\n\nnsimd_vec_f64 xexp10_u35(nsimd_vec_f64 a0_) {\n  nsimd_vec_f64 ret_;\n  nsimd_cpu_vf64 a0, ret;\n  a0.v0 = a0_.v0;\n  a0.v1 = a0_.v1;\n  ret = nsimd_exp10_u35_cpu_f64(a0);\n  ret_.v0 = ret.v0;\n  ret_.v1 = ret.v1;\n  return ret_;\n}\n\nnsimd_vec_f64 xexpm1(nsimd_vec_f64 a0_) {\n  nsimd_vec_f64 ret_;\n  nsimd_cpu_vf64 a0, ret;\n  a0.v0 = a0_.v0;\n  a0.v1 = a0_.v1;\n  ret = nsimd_expm1_u10_cpu_f64(a0);\n  ret_.v0 = ret.v0;\n  ret_.v1 = ret.v1;\n  return ret_;\n}\n\nnsimd_vec_f64 xlog10(nsimd_vec_f64 a0_) {\n  nsimd_vec_f64 ret_;\n  nsimd_cpu_vf64 a0, ret;\n  a0.v0 = a0_.v0;\n  a0.v1 = a0_.v1;\n  ret = nsimd_log10_u10_cpu_f64(a0);\n  ret_.v0 = ret.v0;\n  ret_.v1 = ret.v1;\n  return ret_;\n}\n\nnsimd_vec_f64 xlog2(nsimd_vec_f64 a0_) {\n  nsimd_vec_f64 ret_;\n  nsimd_cpu_vf64 a0, ret;\n  a0.v0 = a0_.v0;\n  a0.v1 = a0_.v1;\n  ret = nsimd_log2_u10_cpu_f64(a0);\n  ret_.v0 = ret.v0;\n  ret_.v1 = ret.v1;\n  return ret_;\n}\n\nnsimd_vec_f64 xlog2_u35(nsimd_vec_f64 a0_) {\n  nsimd_vec_f64 ret_;\n  nsimd_cpu_vf64 a0, ret;\n  a0.v0 = a0_.v0;\n  a0.v1 
= a0_.v1;\n  ret = nsimd_log2_u35_cpu_f64(a0);\n  ret_.v0 = ret.v0;\n  ret_.v1 = ret.v1;\n  return ret_;\n}\n\nnsimd_vec_f64 xlog1p(nsimd_vec_f64 a0_) {\n  nsimd_vec_f64 ret_;\n  nsimd_cpu_vf64 a0, ret;\n  a0.v0 = a0_.v0;\n  a0.v1 = a0_.v1;\n  ret = nsimd_log1p_u10_cpu_f64(a0);\n  ret_.v0 = ret.v0;\n  ret_.v1 = ret.v1;\n  return ret_;\n}\n\nnsimd_vec_f64 xsinpi_u05(nsimd_vec_f64 a0_) {\n  nsimd_vec_f64 ret_;\n  nsimd_cpu_vf64 a0, ret;\n  a0.v0 = a0_.v0;\n  a0.v1 = a0_.v1;\n  ret = nsimd_sinpi_u05_cpu_f64(a0);\n  ret_.v0 = ret.v0;\n  ret_.v1 = ret.v1;\n  return ret_;\n}\n\nnsimd_vec_f64 xcospi_u05(nsimd_vec_f64 a0_) {\n  nsimd_vec_f64 ret_;\n  nsimd_cpu_vf64 a0, ret;\n  a0.v0 = a0_.v0;\n  a0.v1 = a0_.v1;\n  ret = nsimd_cospi_u05_cpu_f64(a0);\n  ret_.v0 = ret.v0;\n  ret_.v1 = ret.v1;\n  return ret_;\n}\n\nnsimd_vec_f64 xhypot_u05(nsimd_vec_f64 a0_, nsimd_vec_f64 a1_) {\n  nsimd_vec_f64 ret_;\n  nsimd_cpu_vf64 a0, a1, ret;\n  a0.v0 = a0_.v0;\n  a0.v1 = a0_.v1;\n  a1.v0 = a1_.v0;\n  a1.v1 = a1_.v1;\n  ret = nsimd_hypot_u05_cpu_f64(a0, a1);\n  ret_.v0 = ret.v0;\n  ret_.v1 = ret.v1;\n  return ret_;\n}\n\nnsimd_vec_f64 xhypot_u35(nsimd_vec_f64 a0_, nsimd_vec_f64 a1_) {\n  nsimd_vec_f64 ret_;\n  nsimd_cpu_vf64 a0, a1, ret;\n  a0.v0 = a0_.v0;\n  a0.v1 = a0_.v1;\n  a1.v0 = a1_.v0;\n  a1.v1 = a1_.v1;\n  ret = nsimd_hypot_u35_cpu_f64(a0, a1);\n  ret_.v0 = ret.v0;\n  ret_.v1 = ret.v1;\n  return ret_;\n}\n\nnsimd_vec_f64 xfmod(nsimd_vec_f64 a0_, nsimd_vec_f64 a1_) {\n  nsimd_vec_f64 ret_;\n  nsimd_cpu_vf64 a0, a1, ret;\n  a0.v0 = a0_.v0;\n  a0.v1 = a0_.v1;\n  a1.v0 = a1_.v0;\n  a1.v1 = a1_.v1;\n  ret = nsimd_fmod_cpu_f64(a0, a1);\n  ret_.v0 = ret.v0;\n  ret_.v1 = ret.v1;\n  return ret_;\n}\n\nnsimd_vec_f64 xremainder(nsimd_vec_f64 a0_, nsimd_vec_f64 a1_) {\n  nsimd_vec_f64 ret_;\n  nsimd_cpu_vf64 a0, a1, ret;\n  a0.v0 = a0_.v0;\n  a0.v1 = a0_.v1;\n  a1.v0 = a1_.v0;\n  a1.v1 = a1_.v1;\n  ret = nsimd_remainder_cpu_f64(a0, a1);\n  ret_.v0 = ret.v0;\n  ret_.v1 = ret.v1;\n  return 
ret_;\n}\n\nnsimd_vec_f64 xlgamma_u1(nsimd_vec_f64 a0_) {\n  nsimd_vec_f64 ret_;\n  nsimd_cpu_vf64 a0, ret;\n  a0.v0 = a0_.v0;\n  a0.v1 = a0_.v1;\n  ret = nsimd_lgamma_u10_cpu_f64(a0);\n  ret_.v0 = ret.v0;\n  ret_.v1 = ret.v1;\n  return ret_;\n}\n\nnsimd_vec_f64 xtgamma_u1(nsimd_vec_f64 a0_) {\n  nsimd_vec_f64 ret_;\n  nsimd_cpu_vf64 a0, ret;\n  a0.v0 = a0_.v0;\n  a0.v1 = a0_.v1;\n  ret = nsimd_tgamma_u10_cpu_f64(a0);\n  ret_.v0 = ret.v0;\n  ret_.v1 = ret.v1;\n  return ret_;\n}\n\nnsimd_vec_f64 xerf_u1(nsimd_vec_f64 a0_) {\n  nsimd_vec_f64 ret_;\n  nsimd_cpu_vf64 a0, ret;\n  a0.v0 = a0_.v0;\n  a0.v1 = a0_.v1;\n  ret = nsimd_erf_u10_cpu_f64(a0);\n  ret_.v0 = ret.v0;\n  ret_.v1 = ret.v1;\n  return ret_;\n}\n\nnsimd_vec_f64 xerfc_u15(nsimd_vec_f64 a0_) {\n  nsimd_vec_f64 ret_;\n  nsimd_cpu_vf64 a0, ret;\n  a0.v0 = a0_.v0;\n  a0.v1 = a0_.v1;\n  ret = nsimd_erfc_u15_cpu_f64(a0);\n  ret_.v0 = ret.v0;\n  ret_.v1 = ret.v1;\n  return ret_;\n}\n\n"
  },
  {
    "path": "src/sleefsimdsp.c",
    "content": "//   Copyright Naoki Shibata and contributors 2010 - 2020.\n// Distributed under the Boost Software License, Version 1.0.\n//    (See accompanying file LICENSE.txt or copy at\n//          http://www.boost.org/LICENSE_1_0.txt)\n\n// Always use -ffp-contract=off option to compile SLEEF.\n\n#if !defined(SLEEF_GENHEADER)\n#include <stdint.h>\n#include <assert.h>\n#include <limits.h>\n#include <float.h>\n#endif\n\n#include \"misc.h\"\n\nextern const float Sleef_rempitabsp[];\n\n#define __SLEEFSIMDSP_C__\n\n#if (defined(_MSC_VER))\n#pragma fp_contract (off)\n#endif\n\n// Intel\n\n#ifdef ENABLE_SSE2\n#define CONFIG 2\n#if !defined(SLEEF_GENHEADER)\n#include \"helpersse2.h\"\n#else\n#include \"macroonlySSE2.h\"\n#endif\n#ifdef DORENAME\n#ifdef ENABLE_GNUABI\n#include \"renamesse2_gnuabi.h\"\n#else\n#include \"renamesse2.h\"\n#endif\n#endif\n#endif\n\n#ifdef ENABLE_SSE4\n#define CONFIG 4\n#if !defined(SLEEF_GENHEADER)\n#include \"helpersse2.h\"\n#else\n#include \"macroonlySSE4.h\"\n#endif\n#ifdef DORENAME\n#include \"renamesse4.h\"\n#endif\n#endif\n\n#ifdef ENABLE_AVX\n#define CONFIG 1\n#if !defined(SLEEF_GENHEADER)\n#include \"helperavx.h\"\n#else\n#include \"macroonlyAVX.h\"\n#endif\n#ifdef DORENAME\n#ifdef ENABLE_GNUABI\n#include \"renameavx_gnuabi.h\"\n#else\n#include \"renameavx.h\"\n#endif\n#endif\n#endif\n\n#ifdef ENABLE_FMA4\n#define CONFIG 4\n#if !defined(SLEEF_GENHEADER)\n#include \"helperavx.h\"\n#else\n#include \"macroonlyFMA4.h\"\n#endif\n#ifdef DORENAME\n#ifdef ENABLE_GNUABI\n#include \"renamefma4_gnuabi.h\"\n#else\n#include \"renamefma4.h\"\n#endif\n#endif\n#endif\n\n#ifdef ENABLE_AVX2\n#define CONFIG 1\n#if !defined(SLEEF_GENHEADER)\n#include \"helperavx2.h\"\n#else\n#include \"macroonlyAVX2.h\"\n#endif\n#ifdef DORENAME\n#ifdef ENABLE_GNUABI\n#include \"renameavx2_gnuabi.h\"\n#else\n#include \"renameavx2.h\"\n#endif\n#endif\n#endif\n\n#ifdef ENABLE_AVX2128\n#define CONFIG 1\n#if !defined(SLEEF_GENHEADER)\n#include 
\"helperavx2_128.h\"\n#else\n#include \"macroonlyAVX2128.h\"\n#endif\n#ifdef DORENAME\n#include \"renameavx2128.h\"\n#endif\n#endif\n\n#ifdef ENABLE_AVX512F\n#define CONFIG 1\n#if !defined(SLEEF_GENHEADER)\n#include \"helperavx512f.h\"\n#else\n#include \"macroonlyAVX512F.h\"\n#endif\n#ifdef DORENAME\n#ifdef ENABLE_GNUABI\n#include \"renameavx512f_gnuabi.h\"\n#else\n#include \"renameavx512f.h\"\n#endif\n#endif\n#endif\n\n#ifdef ENABLE_AVX512FNOFMA\n#define CONFIG 2\n#if !defined(SLEEF_GENHEADER)\n#include \"helperavx512f.h\"\n#else\n#include \"macroonlyAVX512FNOFMA.h\"\n#endif\n#ifdef DORENAME\n#include \"renameavx512fnofma.h\"\n#endif\n#endif\n\n// Arm\n\n#ifdef ENABLE_ADVSIMD\n#define CONFIG 1\n#if !defined(SLEEF_GENHEADER)\n#include \"helperadvsimd.h\"\n#else\n#include \"macroonlyADVSIMD.h\"\n#endif\n#ifdef DORENAME\n#ifdef ENABLE_GNUABI\n#include \"renameadvsimd_gnuabi.h\"\n#else\n#include \"renameadvsimd.h\"\n#endif\n#endif\n#endif\n\n#ifdef ENABLE_ADVSIMDNOFMA\n#define CONFIG 2\n#if !defined(SLEEF_GENHEADER)\n#include \"helperadvsimd.h\"\n#else\n#include \"macroonlyADVSIMDNOFMA.h\"\n#endif\n#ifdef DORENAME\n#include \"renameadvsimdnofma.h\"\n#endif\n#endif\n\n#ifdef ENABLE_NEON32\n#define CONFIG 1\n#if !defined(SLEEF_GENHEADER)\n#include \"helperneon32.h\"\n#endif\n#ifdef DORENAME\n#include \"renameneon32.h\"\n#endif\n#endif\n\n#ifdef ENABLE_NEON32VFPV4\n#define CONFIG 4\n#if !defined(SLEEF_GENHEADER)\n#include \"helperneon32.h\"\n#endif\n#ifdef DORENAME\n#include \"renameneon32vfpv4.h\"\n#endif\n#endif\n\n#ifdef ENABLE_SVE\n#define CONFIG 1\n#if !defined(SLEEF_GENHEADER)\n#include \"helpersve.h\"\n#else\n#include \"macroonlySVE.h\"\n#endif\n#ifdef DORENAME\n#ifdef ENABLE_GNUABI\n#include \"renamesve_gnuabi.h\"\n#else\n#include \"renamesve.h\"\n#endif /* ENABLE_GNUABI */\n#endif /* DORENAME */\n#endif /* ENABLE_SVE */\n\n#ifdef ENABLE_SVENOFMA\n#define CONFIG 2\n#if !defined(SLEEF_GENHEADER)\n#include \"helpersve.h\"\n#else\n#include 
\"macroonlySVENOFMA.h\"\n#endif\n#ifdef DORENAME\n#include \"renamesvenofma.h\"\n#endif /* DORENAME */\n#endif /* ENABLE_SVE */\n\n// IBM\n\n#ifdef ENABLE_VSX\n#define CONFIG 1\n#if !defined(SLEEF_GENHEADER)\n#include \"helperpower_128.h\"\n#else\n#include \"macroonlyVSX.h\"\n#endif\n#ifdef DORENAME\n#include \"renamevsx.h\"\n#endif\n#endif\n\n#ifdef ENABLE_VSXNOFMA\n#define CONFIG 2\n#if !defined(SLEEF_GENHEADER)\n#include \"helperpower_128.h\"\n#else\n#include \"macroonlyVSXNOFMA.h\"\n#endif\n#ifdef DORENAME\n#include \"renamevsxnofma.h\"\n#endif\n#endif\n\n#ifdef ENABLE_ZVECTOR2\n#define CONFIG 140\n#if !defined(SLEEF_GENHEADER)\n#include \"helpers390x_128.h\"\n#else\n#include \"macroonlyZVECTOR2.h\"\n#endif\n#ifdef DORENAME\n#include \"renamezvector2.h\"\n#endif\n#endif\n\n#ifdef ENABLE_ZVECTOR2NOFMA\n#define CONFIG 141\n#if !defined(SLEEF_GENHEADER)\n#include \"helpers390x_128.h\"\n#else\n#include \"macroonlyZVECTOR2NOFMA.h\"\n#endif\n#ifdef DORENAME\n#include \"renamezvector2nofma.h\"\n#endif\n#endif\n\n// Generic\n\n#ifdef ENABLE_VECEXT\n#define CONFIG 1\n#if !defined(SLEEF_GENHEADER)\n#include \"helpervecext.h\"\n#endif\n#ifdef DORENAME\n#include \"renamevecext.h\"\n#endif\n#endif\n\n#ifdef ENABLE_PUREC\n#define CONFIG 1\n#if !defined(SLEEF_GENHEADER)\n#include \"helperpurec.h\"\n#endif\n#ifdef DORENAME\n#include \"renamepurec.h\"\n#endif\n#endif\n\n#ifdef ENABLE_PUREC_SCALAR\n#define CONFIG 1\n#if !defined(SLEEF_GENHEADER)\n#include \"helperpurec_scalar.h\"\n#else\n#include \"macroonlyPUREC_SCALAR.h\"\n#endif\n#ifdef DORENAME\n#include \"renamepurec_scalar.h\"\n#endif\n#endif\n\n#ifdef ENABLE_PURECFMA_SCALAR\n#define CONFIG 2\n#if !defined(SLEEF_GENHEADER)\n#include \"helperpurec_scalar.h\"\n#else\n#include \"macroonlyPURECFMA_SCALAR.h\"\n#endif\n#ifdef DORENAME\n#include \"renamepurecfma_scalar.h\"\n#endif\n#endif\n\n//\n\n#define MLA(x, y, z) vmla_vf_vf_vf_vf((x), (y), (z))\n#define C2V(c) vcast_vf_f(c)\n#include \"estrin.h\"\n\n//\n\n#include 
\"df.h\"\n\nstatic INLINE CONST VECTOR_CC vopmask visnegzero_vo_vf(vfloat d) {\n  return veq_vo_vi2_vi2(vreinterpret_vi2_vf(d), vreinterpret_vi2_vf(vcast_vf_f(-0.0)));\n}\n\nstatic INLINE VECTOR_CC vopmask vnot_vo32_vo32(vopmask x) {\n  return vxor_vo_vo_vo(x, veq_vo_vi2_vi2(vcast_vi2_i(0), vcast_vi2_i(0)));\n}\n\nstatic INLINE CONST VECTOR_CC vmask vsignbit_vm_vf(vfloat f) {\n  return vand_vm_vm_vm(vreinterpret_vm_vf(f), vreinterpret_vm_vf(vcast_vf_f(-0.0f)));\n}\n\nstatic INLINE CONST VECTOR_CC vfloat vmulsign_vf_vf_vf(vfloat x, vfloat y) {\n  return vreinterpret_vf_vm(vxor_vm_vm_vm(vreinterpret_vm_vf(x), vsignbit_vm_vf(y)));\n}\n\nstatic INLINE CONST VECTOR_CC vfloat vcopysign_vf_vf_vf(vfloat x, vfloat y) {\n  return vreinterpret_vf_vm(vxor_vm_vm_vm(vandnot_vm_vm_vm(vreinterpret_vm_vf(vcast_vf_f(-0.0f)), vreinterpret_vm_vf(x)), \n\t\t\t\t\t  vand_vm_vm_vm   (vreinterpret_vm_vf(vcast_vf_f(-0.0f)), vreinterpret_vm_vf(y))));\n}\n\nstatic INLINE CONST VECTOR_CC vfloat vsign_vf_vf(vfloat f) {\n  return vreinterpret_vf_vm(vor_vm_vm_vm(vreinterpret_vm_vf(vcast_vf_f(1.0f)), vand_vm_vm_vm(vreinterpret_vm_vf(vcast_vf_f(-0.0f)), vreinterpret_vm_vf(f))));\n}\n\nstatic INLINE CONST VECTOR_CC vopmask vsignbit_vo_vf(vfloat d) {\n  return veq_vo_vi2_vi2(vand_vi2_vi2_vi2(vreinterpret_vi2_vf(d), vcast_vi2_i(0x80000000)), vcast_vi2_i(0x80000000));\n}\n\nstatic INLINE CONST VECTOR_CC vint2 vsel_vi2_vf_vf_vi2_vi2(vfloat f0, vfloat f1, vint2 x, vint2 y) {\n  return vsel_vi2_vo_vi2_vi2(vlt_vo_vf_vf(f0, f1), x, y);\n}\n\nstatic INLINE CONST VECTOR_CC vint2 vsel_vi2_vf_vi2(vfloat d, vint2 x) {\n  return vand_vi2_vo_vi2(vsignbit_vo_vf(d), x);\n}\n\nstatic INLINE CONST VECTOR_CC vopmask visint_vo_vf(vfloat y) { return veq_vo_vf_vf(vtruncate_vf_vf(y), y); }\n\nstatic INLINE CONST VECTOR_CC vopmask visnumber_vo_vf(vfloat x) { return vnot_vo32_vo32(vor_vo_vo_vo(visinf_vo_vf(x), visnan_vo_vf(x))); }\n\n#if !defined(ENABLE_AVX512F) && !defined(ENABLE_AVX512FNOFMA)\nstatic INLINE CONST 
VECTOR_CC vint2 vilogbk_vi2_vf(vfloat d) {\n  vopmask o = vlt_vo_vf_vf(d, vcast_vf_f(5.421010862427522E-20f));\n  d = vsel_vf_vo_vf_vf(o, vmul_vf_vf_vf(vcast_vf_f(1.8446744073709552E19f), d), d);\n  vint2 q = vand_vi2_vi2_vi2(vsrl_vi2_vi2_i(vreinterpret_vi2_vf(d), 23), vcast_vi2_i(0xff));\n  q = vsub_vi2_vi2_vi2(q, vsel_vi2_vo_vi2_vi2(o, vcast_vi2_i(64 + 0x7f), vcast_vi2_i(0x7f)));\n  return q;\n}\n\nstatic INLINE CONST VECTOR_CC vint2 vilogb2k_vi2_vf(vfloat d) {\n  vint2 q = vreinterpret_vi2_vf(d);\n  q = vsrl_vi2_vi2_i(q, 23);\n  q = vand_vi2_vi2_vi2(q, vcast_vi2_i(0xff));\n  q = vsub_vi2_vi2_vi2(q, vcast_vi2_i(0x7f));\n  return q;\n}\n#endif\n\n//\n\nEXPORT CONST VECTOR_CC vint2 xilogbf(vfloat d) {\n  vint2 e = vilogbk_vi2_vf(vabs_vf_vf(d));\n  e = vsel_vi2_vo_vi2_vi2(veq_vo_vf_vf(d, vcast_vf_f(0.0f)), vcast_vi2_i(SLEEF_FP_ILOGB0), e);\n  e = vsel_vi2_vo_vi2_vi2(visnan_vo_vf(d), vcast_vi2_i(SLEEF_FP_ILOGBNAN), e);\n  e = vsel_vi2_vo_vi2_vi2(visinf_vo_vf(d), vcast_vi2_i(INT_MAX), e);\n  return e;\n}\n\nstatic INLINE CONST VECTOR_CC vfloat vpow2i_vf_vi2(vint2 q) {\n  return vreinterpret_vf_vi2(vsll_vi2_vi2_i(vadd_vi2_vi2_vi2(q, vcast_vi2_i(0x7f)), 23));\n}\n\nstatic INLINE CONST VECTOR_CC vfloat vldexp_vf_vf_vi2(vfloat x, vint2 q) {\n  vfloat u;\n  vint2 m = vsra_vi2_vi2_i(q, 31);\n  m = vsll_vi2_vi2_i(vsub_vi2_vi2_vi2(vsra_vi2_vi2_i(vadd_vi2_vi2_vi2(m, q), 6), m), 4);\n  q = vsub_vi2_vi2_vi2(q, vsll_vi2_vi2_i(m, 2));\n  m = vadd_vi2_vi2_vi2(m, vcast_vi2_i(0x7f));\n  m = vand_vi2_vi2_vi2(vgt_vi2_vi2_vi2(m, vcast_vi2_i(0)), m);\n  vint2 n = vgt_vi2_vi2_vi2(m, vcast_vi2_i(0xff));\n  m = vor_vi2_vi2_vi2(vandnot_vi2_vi2_vi2(n, m), vand_vi2_vi2_vi2(n, vcast_vi2_i(0xff)));\n  u = vreinterpret_vf_vi2(vsll_vi2_vi2_i(m, 23));\n  x = vmul_vf_vf_vf(vmul_vf_vf_vf(vmul_vf_vf_vf(vmul_vf_vf_vf(x, u), u), u), u);\n  u = vreinterpret_vf_vi2(vsll_vi2_vi2_i(vadd_vi2_vi2_vi2(q, vcast_vi2_i(0x7f)), 23));\n  return vmul_vf_vf_vf(x, u);\n}\n\nstatic INLINE CONST VECTOR_CC vfloat 
vldexp2_vf_vf_vi2(vfloat d, vint2 e) {\n  return vmul_vf_vf_vf(vmul_vf_vf_vf(d, vpow2i_vf_vi2(vsra_vi2_vi2_i(e, 1))), vpow2i_vf_vi2(vsub_vi2_vi2_vi2(e, vsra_vi2_vi2_i(e, 1))));\n}\n\nstatic INLINE CONST VECTOR_CC vfloat vldexp3_vf_vf_vi2(vfloat d, vint2 q) {\n  return vreinterpret_vf_vi2(vadd_vi2_vi2_vi2(vreinterpret_vi2_vf(d), vsll_vi2_vi2_i(q, 23)));\n}\n\nEXPORT CONST VECTOR_CC vfloat xldexpf(vfloat x, vint2 q) { return vldexp_vf_vf_vi2(x, q); }\n\n#if !(defined(ENABLE_SVE) || defined(ENABLE_SVENOFMA))\ntypedef struct {\n  vfloat d;\n  vint2 i;\n} fi_t;\n\nstatic vfloat figetd_vf_di(fi_t d) { return d.d; }\nstatic vint2 figeti_vi2_di(fi_t d) { return d.i; }\nstatic fi_t fisetdi_fi_vf_vi2(vfloat d, vint2 i) {\n  fi_t r = { d, i };\n  return r;\n}\n\ntypedef struct {\n  vfloat2 df;\n  vint2 i;\n} dfi_t;\n\nstatic vfloat2 dfigetdf_vf2_dfi(dfi_t d) { return d.df; }\nstatic vint2 dfigeti_vi2_dfi(dfi_t d) { return d.i; }\nstatic dfi_t dfisetdfi_dfi_vf2_vi2(vfloat2 v, vint2 i) {\n  dfi_t r = { v, i };\n  return r;\n}\nstatic dfi_t dfisetdf_dfi_dfi_vf2(dfi_t dfi, vfloat2 v) {\n  dfi.df = v;\n  return dfi;\n}\n#endif\n\nstatic INLINE CONST VECTOR_CC vfloat vorsign_vf_vf_vf(vfloat x, vfloat y) {\n  return vreinterpret_vf_vm(vor_vm_vm_vm(vreinterpret_vm_vf(x), vsignbit_vm_vf(y)));\n}\n\nstatic INLINE CONST fi_t rempisubf(vfloat x) {\n#ifdef FULL_FP_ROUNDING\n  vfloat y = vrint_vf_vf(vmul_vf_vf_vf(x, vcast_vf_f(4)));\n  vint2 vi = vtruncate_vi2_vf(vsub_vf_vf_vf(y, vmul_vf_vf_vf(vrint_vf_vf(x), vcast_vf_f(4))));\n  return fisetdi_fi_vf_vi2(vsub_vf_vf_vf(x, vmul_vf_vf_vf(y, vcast_vf_f(0.25))), vi);\n#else\n  vfloat c = vmulsign_vf_vf_vf(vcast_vf_f(1 << 23), x);\n  vfloat rint4x = vsel_vf_vo_vf_vf(vgt_vo_vf_vf(vabs_vf_vf(vmul_vf_vf_vf(vcast_vf_f(4), x)), vcast_vf_f(1 << 23)),\n\t\t\t\t   vmul_vf_vf_vf(vcast_vf_f(4), x),\n\t\t\t\t   vorsign_vf_vf_vf(vsub_vf_vf_vf(vmla_vf_vf_vf_vf(vcast_vf_f(4), x, c), c), x));\n  vfloat rintx  = vsel_vf_vo_vf_vf(vgt_vo_vf_vf(vabs_vf_vf(x), 
vcast_vf_f(1 << 23)),\n\t\t\t\t   x, vorsign_vf_vf_vf(vsub_vf_vf_vf(vadd_vf_vf_vf(x, c), c), x));\n  return fisetdi_fi_vf_vi2(vmla_vf_vf_vf_vf(vcast_vf_f(-0.25), rint4x, x),\n\t\t\t   vtruncate_vi2_vf(vmla_vf_vf_vf_vf(vcast_vf_f(-4), rintx, rint4x)));\n#endif\n}\n\nstatic INLINE CONST dfi_t rempif(vfloat a) {\n  vfloat2 x, y, z;\n  vint2 ex = vilogb2k_vi2_vf(a);\n#if defined(ENABLE_AVX512F) || defined(ENABLE_AVX512FNOFMA)\n  ex = vandnot_vi2_vi2_vi2(vsra_vi2_vi2_i(ex, 31), ex);\n  ex = vand_vi2_vi2_vi2(ex, vcast_vi2_i(127));\n#endif\n  ex = vsub_vi2_vi2_vi2(ex, vcast_vi2_i(25));\n  vint2 q = vand_vi2_vo_vi2(vgt_vo_vi2_vi2(ex, vcast_vi2_i(90-25)), vcast_vi2_i(-64));\n  a = vldexp3_vf_vf_vi2(a, q);\n  ex = vandnot_vi2_vi2_vi2(vsra_vi2_vi2_i(ex, 31), ex);\n  ex = vsll_vi2_vi2_i(ex, 2);\n  x = dfmul_vf2_vf_vf(a, vgather_vf_p_vi2(Sleef_rempitabsp, ex));\n  fi_t di = rempisubf(vf2getx_vf_vf2(x));\n  q = figeti_vi2_di(di);\n  x = vf2setx_vf2_vf2_vf(x, figetd_vf_di(di));\n  x = dfnormalize_vf2_vf2(x);\n  y = dfmul_vf2_vf_vf(a, vgather_vf_p_vi2(Sleef_rempitabsp+1, ex));\n  x = dfadd2_vf2_vf2_vf2(x, y);\n  di = rempisubf(vf2getx_vf_vf2(x));\n  q = vadd_vi2_vi2_vi2(q, figeti_vi2_di(di));\n  x = vf2setx_vf2_vf2_vf(x, figetd_vf_di(di));\n  x = dfnormalize_vf2_vf2(x);\n  y = vcast_vf2_vf_vf(vgather_vf_p_vi2(Sleef_rempitabsp+2, ex), vgather_vf_p_vi2(Sleef_rempitabsp+3, ex));\n  y = dfmul_vf2_vf2_vf(y, a);\n  x = dfadd2_vf2_vf2_vf2(x, y);\n  x = dfnormalize_vf2_vf2(x);\n  x = dfmul_vf2_vf2_vf2(x, vcast_vf2_f_f(3.1415927410125732422f*2, -8.7422776573475857731e-08f*2));\n  x = vsel_vf2_vo_vf2_vf2(vlt_vo_vf_vf(vabs_vf_vf(a), vcast_vf_f(0.7f)), vcast_vf2_vf_vf(a, vcast_vf_f(0)), x);\n  return dfisetdfi_dfi_vf2_vi2(x, q);\n}\n\nEXPORT CONST VECTOR_CC vfloat xsinf(vfloat d) {\n#if !defined(DETERMINISTIC)\n  vint2 q;\n  vfloat u, s, r = d;\n\n  if (LIKELY(vtestallones_i_vo32(vlt_vo_vf_vf(vabs_vf_vf(d), vcast_vf_f(TRIGRANGEMAX2f))))) {\n    q = vrint_vi2_vf(vmul_vf_vf_vf(d, 
vcast_vf_f((float)M_1_PI)));\n    u = vcast_vf_vi2(q);\n    d = vmla_vf_vf_vf_vf(u, vcast_vf_f(-PI_A2f), d);\n    d = vmla_vf_vf_vf_vf(u, vcast_vf_f(-PI_B2f), d);\n    d = vmla_vf_vf_vf_vf(u, vcast_vf_f(-PI_C2f), d);\n  } else if (LIKELY(vtestallones_i_vo32(vlt_vo_vf_vf(vabs_vf_vf(d), vcast_vf_f(TRIGRANGEMAXf))))) {\n    q = vrint_vi2_vf(vmul_vf_vf_vf(d, vcast_vf_f((float)M_1_PI)));\n    u = vcast_vf_vi2(q);\n    d = vmla_vf_vf_vf_vf(u, vcast_vf_f(-PI_Af), d);\n    d = vmla_vf_vf_vf_vf(u, vcast_vf_f(-PI_Bf), d);\n    d = vmla_vf_vf_vf_vf(u, vcast_vf_f(-PI_Cf), d);\n    d = vmla_vf_vf_vf_vf(u, vcast_vf_f(-PI_Df), d);\n  } else {\n    dfi_t dfi = rempif(d);\n    q = vand_vi2_vi2_vi2(dfigeti_vi2_dfi(dfi), vcast_vi2_i(3));\n    q = vadd_vi2_vi2_vi2(vadd_vi2_vi2_vi2(q, q), vsel_vi2_vo_vi2_vi2(vgt_vo_vf_vf(vf2getx_vf_vf2(dfigetdf_vf2_dfi(dfi)), vcast_vf_f(0)), vcast_vi2_i(2), vcast_vi2_i(1)));\n    q = vsra_vi2_vi2_i(q, 2);\n    vopmask o = veq_vo_vi2_vi2(vand_vi2_vi2_vi2(dfigeti_vi2_dfi(dfi), vcast_vi2_i(1)), vcast_vi2_i(1));\n    vfloat2 x = vcast_vf2_vf_vf(vmulsign_vf_vf_vf(vcast_vf_f(3.1415927410125732422f*-0.5), vf2getx_vf_vf2(dfigetdf_vf2_dfi(dfi))), \n\t\t\t\tvmulsign_vf_vf_vf(vcast_vf_f(-8.7422776573475857731e-08f*-0.5), vf2getx_vf_vf2(dfigetdf_vf2_dfi(dfi))));\n    x = dfadd2_vf2_vf2_vf2(dfigetdf_vf2_dfi(dfi), x);\n    dfi = dfisetdf_dfi_dfi_vf2(dfi, vsel_vf2_vo_vf2_vf2(o, x, dfigetdf_vf2_dfi(dfi)));\n    d = vadd_vf_vf_vf(vf2getx_vf_vf2(dfigetdf_vf2_dfi(dfi)), vf2gety_vf_vf2(dfigetdf_vf2_dfi(dfi)));\n\n    d = vreinterpret_vf_vm(vor_vm_vo32_vm(vor_vo_vo_vo(visinf_vo_vf(r), visnan_vo_vf(r)), vreinterpret_vm_vf(d)));\n  }\n\n  s = vmul_vf_vf_vf(d, d);\n\n  d = vreinterpret_vf_vm(vxor_vm_vm_vm(vand_vm_vo32_vm(veq_vo_vi2_vi2(vand_vi2_vi2_vi2(q, vcast_vi2_i(1)), vcast_vi2_i(1)), vreinterpret_vm_vf(vcast_vf_f(-0.0f))), vreinterpret_vm_vf(d)));\n\n  u = vcast_vf_f(2.6083159809786593541503e-06f);\n  u = vmla_vf_vf_vf_vf(u, s, 
vcast_vf_f(-0.0001981069071916863322258f));\n  u = vmla_vf_vf_vf_vf(u, s, vcast_vf_f(0.00833307858556509017944336f));\n  u = vmla_vf_vf_vf_vf(u, s, vcast_vf_f(-0.166666597127914428710938f));\n\n  u = vadd_vf_vf_vf(vmul_vf_vf_vf(s, vmul_vf_vf_vf(u, d)), d);\n\n  u = vsel_vf_vo_vf_vf(visnegzero_vo_vf(r), r, u);\n\n  return u;\n\n#else // #if !defined(DETERMINISTIC)\n\n  vint2 q;\n  vfloat u, s, r = d;\n\n  q = vrint_vi2_vf(vmul_vf_vf_vf(d, vcast_vf_f((float)M_1_PI)));\n  u = vcast_vf_vi2(q);\n  d = vmla_vf_vf_vf_vf(u, vcast_vf_f(-PI_A2f), d);\n  d = vmla_vf_vf_vf_vf(u, vcast_vf_f(-PI_B2f), d);\n  d = vmla_vf_vf_vf_vf(u, vcast_vf_f(-PI_C2f), d);\n  vopmask g = vlt_vo_vf_vf(vabs_vf_vf(r), vcast_vf_f(TRIGRANGEMAX2f));\n\n  if (!LIKELY(vtestallones_i_vo32(g))) {\n    s = vcast_vf_vi2(q);\n    u = vmla_vf_vf_vf_vf(s, vcast_vf_f(-PI_Af), r);\n    u = vmla_vf_vf_vf_vf(s, vcast_vf_f(-PI_Bf), u);\n    u = vmla_vf_vf_vf_vf(s, vcast_vf_f(-PI_Cf), u);\n    u = vmla_vf_vf_vf_vf(s, vcast_vf_f(-PI_Df), u);\n\n    d = vsel_vf_vo_vf_vf(g, d, u);\n    g = vlt_vo_vf_vf(vabs_vf_vf(r), vcast_vf_f(TRIGRANGEMAXf));\n\n    if (!LIKELY(vtestallones_i_vo32(g))) {\n      dfi_t dfi = rempif(r);\n      vint2 q2 = vand_vi2_vi2_vi2(dfigeti_vi2_dfi(dfi), vcast_vi2_i(3));\n      q2 = vadd_vi2_vi2_vi2(vadd_vi2_vi2_vi2(q2, q2), vsel_vi2_vo_vi2_vi2(vgt_vo_vf_vf(vf2getx_vf_vf2(dfigetdf_vf2_dfi(dfi)), vcast_vf_f(0)), vcast_vi2_i(2), vcast_vi2_i(1)));\n      q2 = vsra_vi2_vi2_i(q2, 2);\n      vopmask o = veq_vo_vi2_vi2(vand_vi2_vi2_vi2(dfigeti_vi2_dfi(dfi), vcast_vi2_i(1)), vcast_vi2_i(1));\n      vfloat2 x = vcast_vf2_vf_vf(vmulsign_vf_vf_vf(vcast_vf_f(3.1415927410125732422f*-0.5), vf2getx_vf_vf2(dfigetdf_vf2_dfi(dfi))), \n\t\t\t\t  vmulsign_vf_vf_vf(vcast_vf_f(-8.7422776573475857731e-08f*-0.5), vf2getx_vf_vf2(dfigetdf_vf2_dfi(dfi))));\n      x = dfadd2_vf2_vf2_vf2(dfigetdf_vf2_dfi(dfi), x);\n      dfi = dfisetdf_dfi_dfi_vf2(dfi, vsel_vf2_vo_vf2_vf2(o, x, dfigetdf_vf2_dfi(dfi)));\n      u = 
vadd_vf_vf_vf(vf2getx_vf_vf2(dfigetdf_vf2_dfi(dfi)), vf2gety_vf_vf2(dfigetdf_vf2_dfi(dfi)));\n\n      u = vreinterpret_vf_vm(vor_vm_vo32_vm(vor_vo_vo_vo(visinf_vo_vf(r), visnan_vo_vf(r)), vreinterpret_vm_vf(u)));\n\n      q = vsel_vi2_vo_vi2_vi2(g, q, q2);\n      d = vsel_vf_vo_vf_vf(g, d, u);\n    }\n  }\n\n  s = vmul_vf_vf_vf(d, d);\n\n  d = vreinterpret_vf_vm(vxor_vm_vm_vm(vand_vm_vo32_vm(veq_vo_vi2_vi2(vand_vi2_vi2_vi2(q, vcast_vi2_i(1)), vcast_vi2_i(1)), vreinterpret_vm_vf(vcast_vf_f(-0.0f))), vreinterpret_vm_vf(d)));\n\n  u = vcast_vf_f(2.6083159809786593541503e-06f);\n  u = vmla_vf_vf_vf_vf(u, s, vcast_vf_f(-0.0001981069071916863322258f));\n  u = vmla_vf_vf_vf_vf(u, s, vcast_vf_f(0.00833307858556509017944336f));\n  u = vmla_vf_vf_vf_vf(u, s, vcast_vf_f(-0.166666597127914428710938f));\n\n  u = vadd_vf_vf_vf(vmul_vf_vf_vf(s, vmul_vf_vf_vf(u, d)), d);\n\n  u = vsel_vf_vo_vf_vf(visnegzero_vo_vf(r), r, u);\n\n  return u;\n#endif // #if !defined(DETERMINISTIC)\n}\n\nEXPORT CONST VECTOR_CC vfloat xcosf(vfloat d) {\n#if !defined(DETERMINISTIC)\n  vint2 q;\n  vfloat u, s, r = d;\n\n  if (LIKELY(vtestallones_i_vo32(vlt_vo_vf_vf(vabs_vf_vf(d), vcast_vf_f(TRIGRANGEMAX2f))))) {\n    q = vrint_vi2_vf(vsub_vf_vf_vf(vmul_vf_vf_vf(d, vcast_vf_f((float)M_1_PI)), vcast_vf_f(0.5f)));\n    q = vadd_vi2_vi2_vi2(vadd_vi2_vi2_vi2(q, q), vcast_vi2_i(1));\n\n    u = vcast_vf_vi2(q);\n    d = vmla_vf_vf_vf_vf(u, vcast_vf_f(-PI_A2f*0.5f), d);\n    d = vmla_vf_vf_vf_vf(u, vcast_vf_f(-PI_B2f*0.5f), d);\n    d = vmla_vf_vf_vf_vf(u, vcast_vf_f(-PI_C2f*0.5f), d);\n  } else if (LIKELY(vtestallones_i_vo32(vlt_vo_vf_vf(vabs_vf_vf(d), vcast_vf_f(TRIGRANGEMAXf))))) {\n    q = vrint_vi2_vf(vsub_vf_vf_vf(vmul_vf_vf_vf(d, vcast_vf_f((float)M_1_PI)), vcast_vf_f(0.5f)));\n    q = vadd_vi2_vi2_vi2(vadd_vi2_vi2_vi2(q, q), vcast_vi2_i(1));\n\n    u = vcast_vf_vi2(q);\n    d = vmla_vf_vf_vf_vf(u, vcast_vf_f(-PI_Af*0.5f), d);\n    d = vmla_vf_vf_vf_vf(u, vcast_vf_f(-PI_Bf*0.5f), d);\n    d = 
vmla_vf_vf_vf_vf(u, vcast_vf_f(-PI_Cf*0.5f), d);
    d = vmla_vf_vf_vf_vf(u, vcast_vf_f(-PI_Df*0.5f), d);
  } else {
    // Huge |d|: delegate range reduction to rempif() (defined elsewhere);
    // dfi carries a double-float remainder plus an integer quadrant word.
    dfi_t dfi = rempif(d);
    q = vand_vi2_vi2_vi2(dfigeti_vi2_dfi(dfi), vcast_vi2_i(3));
    q = vadd_vi2_vi2_vi2(vadd_vi2_vi2_vi2(q, q), vsel_vi2_vo_vi2_vi2(vgt_vo_vf_vf(vf2getx_vf_vf2(dfigetdf_vf2_dfi(dfi)), vcast_vf_f(0)), vcast_vi2_i(8), vcast_vi2_i(7)));
    q = vsra_vi2_vi2_i(q, 1);
    vopmask o = veq_vo_vi2_vi2(vand_vi2_vi2_vi2(dfigeti_vi2_dfi(dfi), vcast_vi2_i(1)), vcast_vi2_i(0));
    vfloat y = vsel_vf_vo_vf_vf(vgt_vo_vf_vf(vf2getx_vf_vf2(dfigetdf_vf2_dfi(dfi)), vcast_vf_f(0)), vcast_vf_f(0), vcast_vf_f(-1));
    // -pi/2 split into hi/lo single-precision parts, sign-adjusted by y.
    vfloat2 x = vcast_vf2_vf_vf(vmulsign_vf_vf_vf(vcast_vf_f(3.1415927410125732422f*-0.5), y),
				vmulsign_vf_vf_vf(vcast_vf_f(-8.7422776573475857731e-08f*-0.5), y));
    x = dfadd2_vf2_vf2_vf2(dfigetdf_vf2_dfi(dfi), x);
    dfi = dfisetdf_dfi_dfi_vf2(dfi, vsel_vf2_vo_vf2_vf2(o, x, dfigetdf_vf2_dfi(dfi)));
    d = vadd_vf_vf_vf(vf2getx_vf_vf2(dfigetdf_vf2_dfi(dfi)), vf2gety_vf_vf2(dfigetdf_vf2_dfi(dfi)));

    // Inf/NaN lanes: OR all mask bits in so the result is NaN.
    d = vreinterpret_vf_vm(vor_vm_vo32_vm(vor_vo_vo_vo(visinf_vo_vf(r), visnan_vo_vf(r)), vreinterpret_vm_vf(d)));
  }

  s = vmul_vf_vf_vf(d, d);

  // Conditionally negate d (XOR with -0.0f) depending on quadrant bit 1 of q.
  d = vreinterpret_vf_vm(vxor_vm_vm_vm(vand_vm_vo32_vm(veq_vo_vi2_vi2(vand_vi2_vi2_vi2(q, vcast_vi2_i(2)), vcast_vi2_i(0)), vreinterpret_vm_vf(vcast_vf_f(-0.0f))), vreinterpret_vm_vf(d)));

  // Polynomial in s = d*d, applied as d + s*(u*d).
  u = vcast_vf_f(2.6083159809786593541503e-06f);
  u = vmla_vf_vf_vf_vf(u, s, vcast_vf_f(-0.0001981069071916863322258f));
  u = vmla_vf_vf_vf_vf(u, s, vcast_vf_f(0.00833307858556509017944336f));
  u = vmla_vf_vf_vf_vf(u, s, vcast_vf_f(-0.166666597127914428710938f));

  u = vadd_vf_vf_vf(vmul_vf_vf_vf(s, vmul_vf_vf_vf(u, d)), d);

  return u;

#else // #if !defined(DETERMINISTIC)

  // Deterministic variant: run the widest reduction cascade unconditionally
  // and merge per-lane, so results do not depend on which lanes share a
  // vector (same structure as the branchy path above).
  vint2 q;
  vfloat u, s, r = d;

  q = vrint_vi2_vf(vsub_vf_vf_vf(vmul_vf_vf_vf(d, vcast_vf_f((float)M_1_PI)), vcast_vf_f(0.5f)));
  q = vadd_vi2_vi2_vi2(vadd_vi2_vi2_vi2(q, q), vcast_vi2_i(1));
  u = vcast_vf_vi2(q);
  d = vmla_vf_vf_vf_vf(u, vcast_vf_f(-PI_A2f*0.5f), d);
  d = vmla_vf_vf_vf_vf(u, vcast_vf_f(-PI_B2f*0.5f), d);
  d = vmla_vf_vf_vf_vf(u, vcast_vf_f(-PI_C2f*0.5f), d);
  vopmask g = vlt_vo_vf_vf(vabs_vf_vf(r), vcast_vf_f(TRIGRANGEMAX2f));

  if (!LIKELY(vtestallones_i_vo32(g))) {
    // Some lane exceeded TRIGRANGEMAX2f: redo with the longer PI_Af..PI_Df
    // constant cascade and merge lanes under mask g.
    s = vcast_vf_vi2(q);
    u = vmla_vf_vf_vf_vf(s, vcast_vf_f(-PI_Af*0.5f), r);
    u = vmla_vf_vf_vf_vf(s, vcast_vf_f(-PI_Bf*0.5f), u);
    u = vmla_vf_vf_vf_vf(s, vcast_vf_f(-PI_Cf*0.5f), u);
    u = vmla_vf_vf_vf_vf(s, vcast_vf_f(-PI_Df*0.5f), u);

    d = vsel_vf_vo_vf_vf(g, d, u);
    g = vlt_vo_vf_vf(vabs_vf_vf(r), vcast_vf_f(TRIGRANGEMAXf));

    if (!LIKELY(vtestallones_i_vo32(g))) {
      // Still-larger lanes: rempif() reduction, as in the main path above.
      dfi_t dfi = rempif(r);
      vint2 q2 = vand_vi2_vi2_vi2(dfigeti_vi2_dfi(dfi), vcast_vi2_i(3));
      q2 = vadd_vi2_vi2_vi2(vadd_vi2_vi2_vi2(q2, q2), vsel_vi2_vo_vi2_vi2(vgt_vo_vf_vf(vf2getx_vf_vf2(dfigetdf_vf2_dfi(dfi)), vcast_vf_f(0)), vcast_vi2_i(8), vcast_vi2_i(7)));
      q2 = vsra_vi2_vi2_i(q2, 1);
      vopmask o = veq_vo_vi2_vi2(vand_vi2_vi2_vi2(dfigeti_vi2_dfi(dfi), vcast_vi2_i(1)), vcast_vi2_i(0));
      vfloat y = vsel_vf_vo_vf_vf(vgt_vo_vf_vf(vf2getx_vf_vf2(dfigetdf_vf2_dfi(dfi)), vcast_vf_f(0)), vcast_vf_f(0), vcast_vf_f(-1));
      vfloat2 x = vcast_vf2_vf_vf(vmulsign_vf_vf_vf(vcast_vf_f(3.1415927410125732422f*-0.5), y),
				  vmulsign_vf_vf_vf(vcast_vf_f(-8.7422776573475857731e-08f*-0.5), y));
      x = dfadd2_vf2_vf2_vf2(dfigetdf_vf2_dfi(dfi), x);
      dfi = dfisetdf_dfi_dfi_vf2(dfi, vsel_vf2_vo_vf2_vf2(o, x, dfigetdf_vf2_dfi(dfi)));
      u = vadd_vf_vf_vf(vf2getx_vf_vf2(dfigetdf_vf2_dfi(dfi)), vf2gety_vf_vf2(dfigetdf_vf2_dfi(dfi)));

      u = vreinterpret_vf_vm(vor_vm_vo32_vm(vor_vo_vo_vo(visinf_vo_vf(r), visnan_vo_vf(r)), vreinterpret_vm_vf(u)));

      q = vsel_vi2_vo_vi2_vi2(g, q, q2);
      d = vsel_vf_vo_vf_vf(g, d, u);
    }
  }

  s = vmul_vf_vf_vf(d, d);

  d = vreinterpret_vf_vm(vxor_vm_vm_vm(vand_vm_vo32_vm(veq_vo_vi2_vi2(vand_vi2_vi2_vi2(q, vcast_vi2_i(2)), vcast_vi2_i(0)), vreinterpret_vm_vf(vcast_vf_f(-0.0f))), vreinterpret_vm_vf(d)));

  u = vcast_vf_f(2.6083159809786593541503e-06f);
  u = vmla_vf_vf_vf_vf(u, s, vcast_vf_f(-0.0001981069071916863322258f));
  u = vmla_vf_vf_vf_vf(u, s, vcast_vf_f(0.00833307858556509017944336f));
  u = vmla_vf_vf_vf_vf(u, s, vcast_vf_f(-0.166666597127914428710938f));

  u = vadd_vf_vf_vf(vmul_vf_vf_vf(s, vmul_vf_vf_vf(u, d)), d);

  return u;
#endif // #if !defined(DETERMINISTIC)
}

// Vectorized single-precision tangent.  Reduces the argument by multiples of
// pi/2 (three cascades of increasing range: TRIGRANGEMAX2f/2, TRIGRANGEMAXf,
// then rempif()), evaluates a polynomial in x*x, and takes the reciprocal in
// odd quadrants.
EXPORT CONST VECTOR_CC vfloat xtanf(vfloat d) {
#if !defined(DETERMINISTIC)
  vint2 q;
  vopmask o;
  vfloat u, s, x;

  x = d;

  if (LIKELY(vtestallones_i_vo32(vlt_vo_vf_vf(vabs_vf_vf(d), vcast_vf_f(TRIGRANGEMAX2f*0.5f))))) {
    // All lanes small: short Cody-Waite style reduction by pi/2.
    q = vrint_vi2_vf(vmul_vf_vf_vf(d, vcast_vf_f((float)(2 * M_1_PI))));
    u = vcast_vf_vi2(q);
    x = vmla_vf_vf_vf_vf(u, vcast_vf_f(-PI_A2f*0.5f), x);
    x = vmla_vf_vf_vf_vf(u, vcast_vf_f(-PI_B2f*0.5f), x);
    x = vmla_vf_vf_vf_vf(u, vcast_vf_f(-PI_C2f*0.5f), x);
  } else if (LIKELY(vtestallones_i_vo32(vlt_vo_vf_vf(vabs_vf_vf(d), vcast_vf_f(TRIGRANGEMAXf))))) {
    // Medium range: longer four-constant reduction.
    q = vrint_vi2_vf(vmul_vf_vf_vf(d, vcast_vf_f((float)(2 * M_1_PI))));
    u = vcast_vf_vi2(q);
    x = vmla_vf_vf_vf_vf(u, vcast_vf_f(-PI_Af*0.5f), x);
    x = vmla_vf_vf_vf_vf(u, vcast_vf_f(-PI_Bf*0.5f), x);
    x = vmla_vf_vf_vf_vf(u, vcast_vf_f(-PI_Cf*0.5f), x);
    x = vmla_vf_vf_vf_vf(u, vcast_vf_f(-PI_Df*0.5f), x);
  } else {
    // Huge range: rempif() reduction; keep -0.0 and propagate Inf/NaN as NaN.
    dfi_t dfi = rempif(d);
    q = dfigeti_vi2_dfi(dfi);
    x = vadd_vf_vf_vf(vf2getx_vf_vf2(dfigetdf_vf2_dfi(dfi)), vf2gety_vf_vf2(dfigetdf_vf2_dfi(dfi)));
    x = vreinterpret_vf_vm(vor_vm_vo32_vm(vor_vo_vo_vo(visinf_vo_vf(d), visnan_vo_vf(d)), vreinterpret_vm_vf(x)));
    x = vsel_vf_vo_vf_vf(visnegzero_vo_vf(d), d, x);
  }

  s = vmul_vf_vf_vf(x, x);

  // Odd quadrant mask; negate the reduced argument there.
  o = veq_vo_vi2_vi2(vand_vi2_vi2_vi2(q, vcast_vi2_i(1)), vcast_vi2_i(1));
  x = vreinterpret_vf_vm(vxor_vm_vm_vm(vand_vm_vo32_vm(o, vreinterpret_vm_vf(vcast_vf_f(-0.0f))), vreinterpret_vm_vf(x)));

#if defined(ENABLE_NEON32)
  // NEON32: plain Horner evaluation (POLY6 variant below is used elsewhere).
  u = vcast_vf_f(0.00927245803177356719970703f);
  u = vmla_vf_vf_vf_vf(u, s, vcast_vf_f(0.00331984995864331722259521f));
  u = vmla_vf_vf_vf_vf(u, s, vcast_vf_f(0.0242998078465461730957031f));
  u = vmla_vf_vf_vf_vf(u, s, vcast_vf_f(0.0534495301544666290283203f));
  u = vmla_vf_vf_vf_vf(u, s, vcast_vf_f(0.133383005857467651367188f));
  u = vmla_vf_vf_vf_vf(u, s, vcast_vf_f(0.333331853151321411132812f));
#else
  vfloat s2 = vmul_vf_vf_vf(s, s), s4 = vmul_vf_vf_vf(s2, s2);
  u = POLY6(s, s2, s4,
	    0.00927245803177356719970703f,
	    0.00331984995864331722259521f,
	    0.0242998078465461730957031f,
	    0.0534495301544666290283203f,
	    0.133383005857467651367188f,
	    0.333331853151321411132812f);
#endif

  u = vmla_vf_vf_vf_vf(s, vmul_vf_vf_vf(u, x), x);

  // tan = 1/result in odd quadrants.
  u = vsel_vf_vo_vf_vf(o, vrec_vf_vf(u), u);

  return u;

#else // #if !defined(DETERMINISTIC)

  // Deterministic variant: unconditional cascades merged per-lane.
  vint2 q;
  vopmask o;
  vfloat u, s, x;

  q = vrint_vi2_vf(vmul_vf_vf_vf(d, vcast_vf_f((float)(2 * M_1_PI))));
  u = vcast_vf_vi2(q);
  x = vmla_vf_vf_vf_vf(u, vcast_vf_f(-PI_A2f*0.5f), d);
  x = vmla_vf_vf_vf_vf(u, vcast_vf_f(-PI_B2f*0.5f), x);
  x = vmla_vf_vf_vf_vf(u, vcast_vf_f(-PI_C2f*0.5f), x);
  vopmask g = vlt_vo_vf_vf(vabs_vf_vf(d), vcast_vf_f(TRIGRANGEMAX2f*0.5f));

  if (!LIKELY(vtestallones_i_vo32(g))) {
    // NOTE(review): q2 equals q here (same expression); s below deliberately
    // uses q, matching the behavior of the branchy path.
    vint2 q2 = vrint_vi2_vf(vmul_vf_vf_vf(d, vcast_vf_f((float)(2 * M_1_PI))));
    s = vcast_vf_vi2(q);
    u = vmla_vf_vf_vf_vf(s, vcast_vf_f(-PI_Af*0.5f), d);
    u = vmla_vf_vf_vf_vf(s, vcast_vf_f(-PI_Bf*0.5f), u);
    u = vmla_vf_vf_vf_vf(s, vcast_vf_f(-PI_Cf*0.5f), u);
    u = vmla_vf_vf_vf_vf(s, vcast_vf_f(-PI_Df*0.5f), u);

    q = vsel_vi2_vo_vi2_vi2(g, q, q2);
    x = vsel_vf_vo_vf_vf(g, x, u);
    g = vlt_vo_vf_vf(vabs_vf_vf(d), vcast_vf_f(TRIGRANGEMAXf));

    if (!LIKELY(vtestallones_i_vo32(g))) {
      dfi_t dfi = rempif(d);
      u = vadd_vf_vf_vf(vf2getx_vf_vf2(dfigetdf_vf2_dfi(dfi)), vf2gety_vf_vf2(dfigetdf_vf2_dfi(dfi)));
      u = vreinterpret_vf_vm(vor_vm_vo32_vm(vor_vo_vo_vo(visinf_vo_vf(d), visnan_vo_vf(d)), vreinterpret_vm_vf(u)));
      u = vsel_vf_vo_vf_vf(visnegzero_vo_vf(d), d, u);
      q = vsel_vi2_vo_vi2_vi2(g, q, dfigeti_vi2_dfi(dfi));
      x = vsel_vf_vo_vf_vf(g, x, u);
    }
  }

  s = vmul_vf_vf_vf(x, x);

  o = veq_vo_vi2_vi2(vand_vi2_vi2_vi2(q, vcast_vi2_i(1)), vcast_vi2_i(1));
  x = vreinterpret_vf_vm(vxor_vm_vm_vm(vand_vm_vo32_vm(o, vreinterpret_vm_vf(vcast_vf_f(-0.0f))), vreinterpret_vm_vf(x)));

#if defined(ENABLE_NEON32)
  u = vcast_vf_f(0.00927245803177356719970703f);
  u = vmla_vf_vf_vf_vf(u, s, vcast_vf_f(0.00331984995864331722259521f));
  u = vmla_vf_vf_vf_vf(u, s, vcast_vf_f(0.0242998078465461730957031f));
  u = vmla_vf_vf_vf_vf(u, s, vcast_vf_f(0.0534495301544666290283203f));
  u = vmla_vf_vf_vf_vf(u, s, vcast_vf_f(0.133383005857467651367188f));
  u = vmla_vf_vf_vf_vf(u, s, vcast_vf_f(0.333331853151321411132812f));
#else
  vfloat s2 = vmul_vf_vf_vf(s, s), s4 = vmul_vf_vf_vf(s2, s2);
  u = POLY6(s, s2, s4,
	    0.00927245803177356719970703f,
	    0.00331984995864331722259521f,
	    0.0242998078465461730957031f,
	    0.0534495301544666290283203f,
	    0.133383005857467651367188f,
	    0.333331853151321411132812f);
#endif

  u = vmla_vf_vf_vf_vf(s, vmul_vf_vf_vf(u, x), x);

  u = vsel_vf_vo_vf_vf(o, vrec_vf_vf(u), u);

  return u;
#endif // #if !defined(DETERMINISTIC)
}

// Vectorized single-precision sine, higher-accuracy ("_u1") variant using
// double-float (vfloat2) arithmetic for the reduction and final product.
EXPORT CONST VECTOR_CC vfloat xsinf_u1(vfloat d) {
#if !defined(DETERMINISTIC)
  vint2 q;
  vfloat u, v;
  vfloat2 s, t, x;

  if (LIKELY(vtestallones_i_vo32(vlt_vo_vf_vf(vabs_vf_vf(d), vcast_vf_f(TRIGRANGEMAX2f))))) {
    u = vrint_vf_vf(vmul_vf_vf_vf(d, vcast_vf_f(M_1_PI)));
    q = vrint_vi2_vf(u);
    v = vmla_vf_vf_vf_vf(u, vcast_vf_f(-PI_A2f), d);
    s = 
dfadd2_vf2_vf_vf(v, vmul_vf_vf_vf(u, vcast_vf_f(-PI_B2f)));
    s = dfadd_vf2_vf2_vf(s, vmul_vf_vf_vf(u, vcast_vf_f(-PI_C2f)));
  } else {
    // Huge |d|: rempif() reduction; quadrant from the low 2 bits plus the
    // sign of the double-float remainder.
    dfi_t dfi = rempif(d);
    q = vand_vi2_vi2_vi2(dfigeti_vi2_dfi(dfi), vcast_vi2_i(3));
    q = vadd_vi2_vi2_vi2(vadd_vi2_vi2_vi2(q, q), vsel_vi2_vo_vi2_vi2(vgt_vo_vf_vf(vf2getx_vf_vf2(dfigetdf_vf2_dfi(dfi)), vcast_vf_f(0)), vcast_vi2_i(2), vcast_vi2_i(1)));
    q = vsra_vi2_vi2_i(q, 2);
    vopmask o = veq_vo_vi2_vi2(vand_vi2_vi2_vi2(dfigeti_vi2_dfi(dfi), vcast_vi2_i(1)), vcast_vi2_i(1));
    // -pi/2 in hi/lo parts, sign taken from the remainder's leading word.
    vfloat2 x = vcast_vf2_vf_vf(vmulsign_vf_vf_vf(vcast_vf_f(3.1415927410125732422f*-0.5), vf2getx_vf_vf2(dfigetdf_vf2_dfi(dfi))), 
				vmulsign_vf_vf_vf(vcast_vf_f(-8.7422776573475857731e-08f*-0.5), vf2getx_vf_vf2(dfigetdf_vf2_dfi(dfi))));
    x = dfadd2_vf2_vf2_vf2(dfigetdf_vf2_dfi(dfi), x);
    dfi = dfisetdf_dfi_dfi_vf2(dfi, vsel_vf2_vo_vf2_vf2(o, x, dfigetdf_vf2_dfi(dfi)));
    s = dfnormalize_vf2_vf2(dfigetdf_vf2_dfi(dfi));

#if !defined(_MSC_VER)
    // Inf/NaN lanes: force the hi word to NaN.
    s = vf2setx_vf2_vf2_vf(s, vreinterpret_vf_vm(vor_vm_vo32_vm(vor_vo_vo_vo(visinf_vo_vf(d), visnan_vo_vf(d)), vreinterpret_vm_vf(vf2getx_vf_vf2(s)))));
#else
    s.x = vreinterpret_vf_vm(vor_vm_vo32_vm(vor_vo_vo_vo(visinf_vo_vf(d), visnan_vo_vf(d)), vreinterpret_vm_vf(s.x)));
#endif
  }

  t = s;
  s = dfsqu_vf2_vf2(s);

  // Polynomial in the double-float square; final multiply keeps the
  // double-float reduced argument t for accuracy.
  u = vcast_vf_f(2.6083159809786593541503e-06f);
  u = vmla_vf_vf_vf_vf(u, vf2getx_vf_vf2(s), vcast_vf_f(-0.0001981069071916863322258f));
  u = vmla_vf_vf_vf_vf(u, vf2getx_vf_vf2(s), vcast_vf_f(0.00833307858556509017944336f));

  x = dfadd_vf2_vf_vf2(vcast_vf_f(1), dfmul_vf2_vf2_vf2(dfadd_vf2_vf_vf(vcast_vf_f(-0.166666597127914428710938f), vmul_vf_vf_vf(u, vf2getx_vf_vf2(s))), s));

  u = dfmul_vf_vf2_vf2(t, x);

  // Negate in odd quadrants; preserve -0.0 input.
  u = vreinterpret_vf_vm(vxor_vm_vm_vm(vand_vm_vo32_vm(veq_vo_vi2_vi2(vand_vi2_vi2_vi2(q, vcast_vi2_i(1)), vcast_vi2_i(1)), vreinterpret_vm_vf(vcast_vf_f(-0.0))), vreinterpret_vm_vf(u)));

  u = vsel_vf_vo_vf_vf(visnegzero_vo_vf(d), d, u);

  return u;

#else // #if !defined(DETERMINISTIC)

  // Deterministic variant: unconditional short reduction, rempif() fallback
  // merged per-lane under mask g.
  vint2 q;
  vfloat u, v;
  vfloat2 s, t, x;

  u = vrint_vf_vf(vmul_vf_vf_vf(d, vcast_vf_f(M_1_PI)));
  q = vrint_vi2_vf(u);
  v = vmla_vf_vf_vf_vf(u, vcast_vf_f(-PI_A2f), d);
  s = dfadd2_vf2_vf_vf(v, vmul_vf_vf_vf(u, vcast_vf_f(-PI_B2f)));
  s = dfadd_vf2_vf2_vf(s, vmul_vf_vf_vf(u, vcast_vf_f(-PI_C2f)));
  vopmask g = vlt_vo_vf_vf(vabs_vf_vf(d), vcast_vf_f(TRIGRANGEMAX2f));

  if (!LIKELY(vtestallones_i_vo32(g))) {
    dfi_t dfi = rempif(d);
    vint2 q2 = vand_vi2_vi2_vi2(dfigeti_vi2_dfi(dfi), vcast_vi2_i(3));
    q2 = vadd_vi2_vi2_vi2(vadd_vi2_vi2_vi2(q2, q2), vsel_vi2_vo_vi2_vi2(vgt_vo_vf_vf(vf2getx_vf_vf2(dfigetdf_vf2_dfi(dfi)), vcast_vf_f(0)), vcast_vi2_i(2), vcast_vi2_i(1)));
    q2 = vsra_vi2_vi2_i(q2, 2);
    vopmask o = veq_vo_vi2_vi2(vand_vi2_vi2_vi2(dfigeti_vi2_dfi(dfi), vcast_vi2_i(1)), vcast_vi2_i(1));
    vfloat2 x = vcast_vf2_vf_vf(vmulsign_vf_vf_vf(vcast_vf_f(3.1415927410125732422f*-0.5), vf2getx_vf_vf2(dfigetdf_vf2_dfi(dfi))), 
				vmulsign_vf_vf_vf(vcast_vf_f(-8.7422776573475857731e-08f*-0.5), vf2getx_vf_vf2(dfigetdf_vf2_dfi(dfi))));
    x = dfadd2_vf2_vf2_vf2(dfigetdf_vf2_dfi(dfi), x);
    dfi = dfisetdf_dfi_dfi_vf2(dfi, vsel_vf2_vo_vf2_vf2(o, x, dfigetdf_vf2_dfi(dfi)));
    t = dfnormalize_vf2_vf2(dfigetdf_vf2_dfi(dfi));

    t = vf2setx_vf2_vf2_vf(t, vreinterpret_vf_vm(vor_vm_vo32_vm(vor_vo_vo_vo(visinf_vo_vf(d), visnan_vo_vf(d)), vreinterpret_vm_vf(vf2getx_vf_vf2(t)))));

    q = vsel_vi2_vo_vi2_vi2(g, q, q2);
    s = vsel_vf2_vo_vf2_vf2(g, s, t);
  }

  t = s;
  s = dfsqu_vf2_vf2(s);

  u = vcast_vf_f(2.6083159809786593541503e-06f);
  u = vmla_vf_vf_vf_vf(u, vf2getx_vf_vf2(s), vcast_vf_f(-0.0001981069071916863322258f));
  u = vmla_vf_vf_vf_vf(u, vf2getx_vf_vf2(s), vcast_vf_f(0.00833307858556509017944336f));

  x = dfadd_vf2_vf_vf2(vcast_vf_f(1), dfmul_vf2_vf2_vf2(dfadd_vf2_vf_vf(vcast_vf_f(-0.166666597127914428710938f), vmul_vf_vf_vf(u, vf2getx_vf_vf2(s))), s));

  u = dfmul_vf_vf2_vf2(t, x);

  u = vreinterpret_vf_vm(vxor_vm_vm_vm(vand_vm_vo32_vm(veq_vo_vi2_vi2(vand_vi2_vi2_vi2(q, vcast_vi2_i(1)), vcast_vi2_i(1)), vreinterpret_vm_vf(vcast_vf_f(-0.0))), vreinterpret_vm_vf(u)));

  u = vsel_vf_vo_vf_vf(visnegzero_vo_vf(d), d, u);

  return u;
#endif // #if !defined(DETERMINISTIC)
}

// Vectorized single-precision cosine, higher-accuracy ("_u1") variant.
// Same structure as xsinf_u1 but reduces by odd multiples of pi/2
// (dq = 2*round(d/pi - 0.5) + 1) so the sine polynomial yields cosine.
EXPORT CONST VECTOR_CC vfloat xcosf_u1(vfloat d) {
#if !defined(DETERMINISTIC)
  vint2 q;
  vfloat u;
  vfloat2 s, t, x;

  if (LIKELY(vtestallones_i_vo32(vlt_vo_vf_vf(vabs_vf_vf(d), vcast_vf_f(TRIGRANGEMAX2f))))) {
    vfloat dq = vmla_vf_vf_vf_vf(vrint_vf_vf(vmla_vf_vf_vf_vf(d, vcast_vf_f(M_1_PI), vcast_vf_f(-0.5f))),
				 vcast_vf_f(2), vcast_vf_f(1));
    q = vrint_vi2_vf(dq);
    s = dfadd2_vf2_vf_vf (d, vmul_vf_vf_vf(dq, vcast_vf_f(-PI_A2f*0.5f)));
    s = dfadd2_vf2_vf2_vf(s, vmul_vf_vf_vf(dq, vcast_vf_f(-PI_B2f*0.5f)));
    s = dfadd2_vf2_vf2_vf(s, vmul_vf_vf_vf(dq, vcast_vf_f(-PI_C2f*0.5f)));
  } else {
    // Huge |d|: rempif() reduction with half-quadrant adjustment.
    dfi_t dfi = rempif(d);
    q = vand_vi2_vi2_vi2(dfigeti_vi2_dfi(dfi), vcast_vi2_i(3));
    q = vadd_vi2_vi2_vi2(vadd_vi2_vi2_vi2(q, q), vsel_vi2_vo_vi2_vi2(vgt_vo_vf_vf(vf2getx_vf_vf2(dfigetdf_vf2_dfi(dfi)), vcast_vf_f(0)), vcast_vi2_i(8), vcast_vi2_i(7)));
    q = vsra_vi2_vi2_i(q, 1);
    vopmask o = veq_vo_vi2_vi2(vand_vi2_vi2_vi2(dfigeti_vi2_dfi(dfi), vcast_vi2_i(1)), vcast_vi2_i(0));
    vfloat y = vsel_vf_vo_vf_vf(vgt_vo_vf_vf(vf2getx_vf_vf2(dfigetdf_vf2_dfi(dfi)), vcast_vf_f(0)), vcast_vf_f(0), vcast_vf_f(-1));
    vfloat2 x = vcast_vf2_vf_vf(vmulsign_vf_vf_vf(vcast_vf_f(3.1415927410125732422f*-0.5), y),
				vmulsign_vf_vf_vf(vcast_vf_f(-8.7422776573475857731e-08f*-0.5), y));
    x = dfadd2_vf2_vf2_vf2(dfigetdf_vf2_dfi(dfi), x);
    dfi = dfisetdf_dfi_dfi_vf2(dfi, vsel_vf2_vo_vf2_vf2(o, x, dfigetdf_vf2_dfi(dfi)));
    s = 
vmla_vf_vf_vf_vf(vmul_vf_vf_vf(s, d), u, d);

  // Negate in odd quadrants (XOR with -0.0f).
  u = vreinterpret_vf_vm(vxor_vm_vm_vm(vand_vm_vo32_vm(veq_vo_vi2_vi2(vand_vi2_vi2_vi2(q, vcast_vi2_i(1)), vcast_vi2_i(1)), vreinterpret_vm_vf(vcast_vf_f(-0.0f))), vreinterpret_vm_vf(u)));

  // Fast path only valid for |t| < 30; fall back to xsinf for other lanes.
  vopmask g = vlt_vo_vf_vf(vabs_vf_vf(t), vcast_vf_f(30.0f));
  if (!LIKELY(vtestallones_i_vo32(g))) return vsel_vf_vo_vf_vf(g, u, xsinf(t));

  return u;
}

// Fast, lower-accuracy vectorized cosine ("u3500" variant): single-step
// reduction by multiples of pi shifted by pi/2, short polynomial, with an
// xcosf fallback for lanes where |d| >= 30.
EXPORT CONST VECTOR_CC vfloat xfastcosf_u3500(vfloat d) {
  vint2 q;
  vfloat u, s, t = d;

  s = vmla_vf_vf_vf_vf(d, vcast_vf_f((float)M_1_PI), vcast_vf_f(-0.5f));
  u = vrint_vf_vf(s);
  q = vrint_vi2_vf(s);
  d = vmla_vf_vf_vf_vf(u, vcast_vf_f(-(float)M_PI), vsub_vf_vf_vf(d, vcast_vf_f((float)M_PI * 0.5f)));

  s = vmul_vf_vf_vf(d, d);

  u = vcast_vf_f(-0.1881748176e-3);
  u = vmla_vf_vf_vf_vf(u, s, vcast_vf_f(+0.8323502727e-2));
  u = vmla_vf_vf_vf_vf(u, s, vcast_vf_f(-0.1666651368e+0));
  u = vmla_vf_vf_vf_vf(vmul_vf_vf_vf(s, d), u, d);

  // Sign flip in even quadrants (note: condition differs from the sine
  // variant above, which tests (q & 1) == 1).
  u = vreinterpret_vf_vm(vxor_vm_vm_vm(vand_vm_vo32_vm(veq_vo_vi2_vi2(vand_vi2_vi2_vi2(q, vcast_vi2_i(1)), vcast_vi2_i(0)), vreinterpret_vm_vf(vcast_vf_f(-0.0f))), vreinterpret_vm_vf(u)));

  vopmask g = vlt_vo_vf_vf(vabs_vf_vf(t), vcast_vf_f(30.0f));
  if (!LIKELY(vtestallones_i_vo32(g))) return vsel_vf_vo_vf_vf(g, u, xcosf(t));

  return u;
}

// Under ENABLE_GNUABI the sincos-family kernels are compiled as static
// helpers (sincosfk etc.) and wrapped later by GNUABI-conforming entry
// points that store through pointers; otherwise they are exported directly.
#ifdef ENABLE_GNUABI
#define TYPE2_FUNCATR static INLINE CONST 
#define TYPE6_FUNCATR static INLINE CONST 
#define SQRTFU05_FUNCATR static INLINE CONST 
#define XSINCOSF sincosfk
#define XSINCOSF_U1 sincosfk_u1
#define XSINCOSPIF_U05 sincospifk_u05
#define XSINCOSPIF_U35 sincospifk_u35
#define XMODFF modffk
#else
#define TYPE2_FUNCATR EXPORT CONST
#define TYPE6_FUNCATR EXPORT
#define SQRTFU05_FUNCATR EXPORT
#define XSINCOSF xsincosf
#define XSINCOSF_U1 xsincosf_u1
#define XSINCOSPIF_U05 xsincospif_u05
#define XSINCOSPIF_U35 xsincospif_u35
#define XMODFF xmodff
#endif

// Simultaneous sin and cos: returns {sin d, cos d} as a vfloat2 pair.
// Shares one reduction by pi/2, evaluates both polynomials, then swaps and
// sign-corrects per quadrant.
TYPE2_FUNCATR VECTOR_CC vfloat2 XSINCOSF(vfloat d) {
#if !defined(DETERMINISTIC)
  vint2 q;
  vopmask o;
  vfloat u, s, t, rx, ry;
  vfloat2 r;

  s = d;

  if (LIKELY(vtestallones_i_vo32(vlt_vo_vf_vf(vabs_vf_vf(d), vcast_vf_f(TRIGRANGEMAX2f))))) {
    q = vrint_vi2_vf(vmul_vf_vf_vf(d, vcast_vf_f((float)M_2_PI)));
    u = vcast_vf_vi2(q);
    s = vmla_vf_vf_vf_vf(u, vcast_vf_f(-PI_A2f*0.5f), s);
    s = vmla_vf_vf_vf_vf(u, vcast_vf_f(-PI_B2f*0.5f), s);
    s = vmla_vf_vf_vf_vf(u, vcast_vf_f(-PI_C2f*0.5f), s);
  } else if (LIKELY(vtestallones_i_vo32(vlt_vo_vf_vf(vabs_vf_vf(d), vcast_vf_f(TRIGRANGEMAXf))))) {
    q = vrint_vi2_vf(vmul_vf_vf_vf(d, vcast_vf_f((float)M_2_PI)));
    u = vcast_vf_vi2(q);
    s = vmla_vf_vf_vf_vf(u, vcast_vf_f(-PI_Af*0.5f), s);
    s = vmla_vf_vf_vf_vf(u, vcast_vf_f(-PI_Bf*0.5f), s);
    s = vmla_vf_vf_vf_vf(u, vcast_vf_f(-PI_Cf*0.5f), s);
    s = vmla_vf_vf_vf_vf(u, vcast_vf_f(-PI_Df*0.5f), s);
  } else {
    dfi_t dfi = rempif(d);
    q = dfigeti_vi2_dfi(dfi);
    s = vadd_vf_vf_vf(vf2getx_vf_vf2(dfigetdf_vf2_dfi(dfi)), vf2gety_vf_vf2(dfigetdf_vf2_dfi(dfi)));
    s = vreinterpret_vf_vm(vor_vm_vo32_vm(vor_vo_vo_vo(visinf_vo_vf(d), visnan_vo_vf(d)), vreinterpret_vm_vf(s)));
  }

  t = s;

  s = vmul_vf_vf_vf(s, s);

  // Sine polynomial (odd part).
  u = vcast_vf_f(-0.000195169282960705459117889f);
  u = vmla_vf_vf_vf_vf(u, s, vcast_vf_f(0.00833215750753879547119141f));
  u = vmla_vf_vf_vf_vf(u, s, vcast_vf_f(-0.166666537523269653320312f));

  rx = vmla_vf_vf_vf_vf(vmul_vf_vf_vf(u, s), t, t);
  rx = vsel_vf_vo_vf_vf(visnegzero_vo_vf(d), vcast_vf_f(-0.0f), rx);

  // Cosine polynomial (even part).
  u = vcast_vf_f(-2.71811842367242206819355e-07f);
  u = vmla_vf_vf_vf_vf(u, s, vcast_vf_f(2.47990446951007470488548e-05f));
  u = vmla_vf_vf_vf_vf(u, s, vcast_vf_f(-0.00138888787478208541870117f));
  u = vmla_vf_vf_vf_vf(u, s, vcast_vf_f(0.0416666641831398010253906f));
  u = vmla_vf_vf_vf_vf(u, s, vcast_vf_f(-0.5));

  ry = vmla_vf_vf_vf_vf(s, u, vcast_vf_f(1));

  // Swap sin/cos in odd quadrants, then fix the signs of both components.
  o = veq_vo_vi2_vi2(vand_vi2_vi2_vi2(q, vcast_vi2_i(1)), vcast_vi2_i(0));
  r = vf2setxy_vf2_vf_vf(vsel_vf_vo_vf_vf(o, rx, ry), vsel_vf_vo_vf_vf(o, ry, rx));

  o = veq_vo_vi2_vi2(vand_vi2_vi2_vi2(q, vcast_vi2_i(2)), vcast_vi2_i(2));
  r = vf2setx_vf2_vf2_vf(r, vreinterpret_vf_vm(vxor_vm_vm_vm(vand_vm_vo32_vm(o, vreinterpret_vm_vf(vcast_vf_f(-0.0))), vreinterpret_vm_vf(vf2getx_vf_vf2(r)))));

  o = veq_vo_vi2_vi2(vand_vi2_vi2_vi2(vadd_vi2_vi2_vi2(q, vcast_vi2_i(1)), vcast_vi2_i(2)), vcast_vi2_i(2));
  r = vf2sety_vf2_vf2_vf(r, vreinterpret_vf_vm(vxor_vm_vm_vm(vand_vm_vo32_vm(o, vreinterpret_vm_vf(vcast_vf_f(-0.0))), vreinterpret_vm_vf(vf2gety_vf_vf2(r)))));

  return r;

#else // #if !defined(DETERMINISTIC)

  // Deterministic variant: unconditional cascades merged per-lane.
  vint2 q;
  vopmask o;
  vfloat u, s, t, rx, ry;
  vfloat2 r;

  q = vrint_vi2_vf(vmul_vf_vf_vf(d, vcast_vf_f((float)M_2_PI)));
  u = vcast_vf_vi2(q);
  s = vmla_vf_vf_vf_vf(u, vcast_vf_f(-PI_A2f*0.5f), d);
  s = vmla_vf_vf_vf_vf(u, vcast_vf_f(-PI_B2f*0.5f), s);
  s = vmla_vf_vf_vf_vf(u, vcast_vf_f(-PI_C2f*0.5f), s);
  vopmask g = vlt_vo_vf_vf(vabs_vf_vf(d), vcast_vf_f(TRIGRANGEMAX2f));

  if (!LIKELY(vtestallones_i_vo32(g))) {
    vint2 q2 = vrint_vi2_vf(vmul_vf_vf_vf(d, vcast_vf_f((float)M_2_PI)));
    u = vcast_vf_vi2(q2);
    t = vmla_vf_vf_vf_vf(u, vcast_vf_f(-PI_Af*0.5f), d);
    t = vmla_vf_vf_vf_vf(u, vcast_vf_f(-PI_Bf*0.5f), t);
    t = vmla_vf_vf_vf_vf(u, vcast_vf_f(-PI_Cf*0.5f), t);
    t = vmla_vf_vf_vf_vf(u, vcast_vf_f(-PI_Df*0.5f), t);

    q = vsel_vi2_vo_vi2_vi2(g, q, q2);
    s = vsel_vf_vo_vf_vf(g, s, t);
    g = vlt_vo_vf_vf(vabs_vf_vf(d), vcast_vf_f(TRIGRANGEMAXf));

    if (!LIKELY(vtestallones_i_vo32(g))) {
      dfi_t dfi = rempif(d);
      t = vadd_vf_vf_vf(vf2getx_vf_vf2(dfigetdf_vf2_dfi(dfi)), vf2gety_vf_vf2(dfigetdf_vf2_dfi(dfi)));
      t = vreinterpret_vf_vm(vor_vm_vo32_vm(vor_vo_vo_vo(visinf_vo_vf(d), visnan_vo_vf(d)), vreinterpret_vm_vf(t)));

      q = vsel_vi2_vo_vi2_vi2(g, q, dfigeti_vi2_dfi(dfi));
      s = vsel_vf_vo_vf_vf(g, s, t);
    }
  }

  t = s;

  s = vmul_vf_vf_vf(s, s);

  u = vcast_vf_f(-0.000195169282960705459117889f);
  u = vmla_vf_vf_vf_vf(u, s, vcast_vf_f(0.00833215750753879547119141f));
  u = vmla_vf_vf_vf_vf(u, s, vcast_vf_f(-0.166666537523269653320312f));

  rx = vmla_vf_vf_vf_vf(vmul_vf_vf_vf(u, s), t, t);
  rx = vsel_vf_vo_vf_vf(visnegzero_vo_vf(d), vcast_vf_f(-0.0f), rx);

  u = vcast_vf_f(-2.71811842367242206819355e-07f);
  u = vmla_vf_vf_vf_vf(u, s, vcast_vf_f(2.47990446951007470488548e-05f));
  u = vmla_vf_vf_vf_vf(u, s, vcast_vf_f(-0.00138888787478208541870117f));
  u = vmla_vf_vf_vf_vf(u, s, vcast_vf_f(0.0416666641831398010253906f));
  u = vmla_vf_vf_vf_vf(u, s, vcast_vf_f(-0.5));

  ry = vmla_vf_vf_vf_vf(s, u, vcast_vf_f(1));

  o = veq_vo_vi2_vi2(vand_vi2_vi2_vi2(q, vcast_vi2_i(1)), vcast_vi2_i(0));
  r = vf2setxy_vf2_vf_vf(vsel_vf_vo_vf_vf(o, rx, ry), vsel_vf_vo_vf_vf(o, ry, rx));

  o = veq_vo_vi2_vi2(vand_vi2_vi2_vi2(q, vcast_vi2_i(2)), vcast_vi2_i(2));
  r = vf2setx_vf2_vf2_vf(r, vreinterpret_vf_vm(vxor_vm_vm_vm(vand_vm_vo32_vm(o, vreinterpret_vm_vf(vcast_vf_f(-0.0))), vreinterpret_vm_vf(vf2getx_vf_vf2(r)))));

  o = veq_vo_vi2_vi2(vand_vi2_vi2_vi2(vadd_vi2_vi2_vi2(q, vcast_vi2_i(1)), vcast_vi2_i(2)), vcast_vi2_i(2));
  r = vf2sety_vf2_vf2_vf(r, vreinterpret_vf_vm(vxor_vm_vm_vm(vand_vm_vo32_vm(o, vreinterpret_vm_vf(vcast_vf_f(-0.0))), vreinterpret_vm_vf(vf2gety_vf_vf2(r)))));

  return r;
#endif // #if !defined(DETERMINISTIC)
}

// Simultaneous sin and cos, higher-accuracy ("_u1") variant using
// double-float reduction; same swap/sign-fix scheme as XSINCOSF.
TYPE2_FUNCATR VECTOR_CC vfloat2 XSINCOSF_U1(vfloat d) {
#if !defined(DETERMINISTIC)
  vint2 q;
  vopmask o;
  vfloat u, v, rx, ry;
  vfloat2 r, s, t, x;

  if (LIKELY(vtestallones_i_vo32(vlt_vo_vf_vf(vabs_vf_vf(d), vcast_vf_f(TRIGRANGEMAX2f))))) {
    u = vrint_vf_vf(vmul_vf_vf_vf(d, vcast_vf_f(2 * M_1_PI)));
    q = vrint_vi2_vf(u);
    v = vmla_vf_vf_vf_vf(u, vcast_vf_f(-PI_A2f*0.5f), d);
    s = dfadd2_vf2_vf_vf(v, vmul_vf_vf_vf(u, vcast_vf_f(-PI_B2f*0.5f)));
    s = dfadd_vf2_vf2_vf(s, 
vmul_vf_vf_vf(u, vcast_vf_f(-PI_C2f*0.5f)));
  } else {
    // Huge |d|: rempif() provides both the double-float remainder and the
    // quadrant word; Inf/NaN lanes get NaN forced into the hi word.
    dfi_t dfi = rempif(d);
    q = dfigeti_vi2_dfi(dfi);
    s = dfigetdf_vf2_dfi(dfi);
    o = vor_vo_vo_vo(visinf_vo_vf(d), visnan_vo_vf(d));
    s = vf2setx_vf2_vf2_vf(s, vreinterpret_vf_vm(vor_vm_vo32_vm(o, vreinterpret_vm_vf(vf2getx_vf_vf2(s)))));
  }

  t = s;

  s = vf2setx_vf2_vf2_vf(s, dfsqu_vf_vf2(s));

  // Sine polynomial on the squared reduced argument.
  u = vcast_vf_f(-0.000195169282960705459117889f);
  u = vmla_vf_vf_vf_vf(u, vf2getx_vf_vf2(s), vcast_vf_f(0.00833215750753879547119141f));
  u = vmla_vf_vf_vf_vf(u, vf2getx_vf_vf2(s), vcast_vf_f(-0.166666537523269653320312f));

  u = vmul_vf_vf_vf(u, vmul_vf_vf_vf(vf2getx_vf_vf2(s), vf2getx_vf_vf2(t)));

  x = dfadd_vf2_vf2_vf(t, u);
  rx = vadd_vf_vf_vf(vf2getx_vf_vf2(x), vf2gety_vf_vf2(x));

  rx = vsel_vf_vo_vf_vf(visnegzero_vo_vf(d), vcast_vf_f(-0.0f), rx);

  // Cosine polynomial.
  u = vcast_vf_f(-2.71811842367242206819355e-07f);
  u = vmla_vf_vf_vf_vf(u, vf2getx_vf_vf2(s), vcast_vf_f(2.47990446951007470488548e-05f));
  u = vmla_vf_vf_vf_vf(u, vf2getx_vf_vf2(s), vcast_vf_f(-0.00138888787478208541870117f));
  u = vmla_vf_vf_vf_vf(u, vf2getx_vf_vf2(s), vcast_vf_f(0.0416666641831398010253906f));
  u = vmla_vf_vf_vf_vf(u, vf2getx_vf_vf2(s), vcast_vf_f(-0.5));

  x = dfadd_vf2_vf_vf2(vcast_vf_f(1), dfmul_vf2_vf_vf(vf2getx_vf_vf2(s), u));
  ry = vadd_vf_vf_vf(vf2getx_vf_vf2(x), vf2gety_vf_vf2(x));

  // Swap sin/cos per quadrant parity, then fix both signs.
  o = veq_vo_vi2_vi2(vand_vi2_vi2_vi2(q, vcast_vi2_i(1)), vcast_vi2_i(0));
  r = vf2setxy_vf2_vf_vf(vsel_vf_vo_vf_vf(o, rx, ry), vsel_vf_vo_vf_vf(o, ry, rx));

  o = veq_vo_vi2_vi2(vand_vi2_vi2_vi2(q, vcast_vi2_i(2)), vcast_vi2_i(2));
  r = vf2setx_vf2_vf2_vf(r, vreinterpret_vf_vm(vxor_vm_vm_vm(vand_vm_vo32_vm(o, vreinterpret_vm_vf(vcast_vf_f(-0.0))), vreinterpret_vm_vf(vf2getx_vf_vf2(r)))));

  o = veq_vo_vi2_vi2(vand_vi2_vi2_vi2(vadd_vi2_vi2_vi2(q, vcast_vi2_i(1)), vcast_vi2_i(2)), vcast_vi2_i(2));
  r = vf2sety_vf2_vf2_vf(r, vreinterpret_vf_vm(vxor_vm_vm_vm(vand_vm_vo32_vm(o, vreinterpret_vm_vf(vcast_vf_f(-0.0))), vreinterpret_vm_vf(vf2gety_vf_vf2(r)))));

  return r;

#else // #if !defined(DETERMINISTIC)

  // Deterministic variant: unconditional reduction, rempif() merge per-lane.
  vint2 q;
  vopmask o;
  vfloat u, v, rx, ry;
  vfloat2 r, s, t, x;

  u = vrint_vf_vf(vmul_vf_vf_vf(d, vcast_vf_f(2 * M_1_PI)));
  q = vrint_vi2_vf(u);
  v = vmla_vf_vf_vf_vf(u, vcast_vf_f(-PI_A2f*0.5f), d);
  s = dfadd2_vf2_vf_vf(v, vmul_vf_vf_vf(u, vcast_vf_f(-PI_B2f*0.5f)));
  s = dfadd_vf2_vf2_vf(s, vmul_vf_vf_vf(u, vcast_vf_f(-PI_C2f*0.5f)));
  vopmask g = vlt_vo_vf_vf(vabs_vf_vf(d), vcast_vf_f(TRIGRANGEMAX2f));

  if (!LIKELY(vtestallones_i_vo32(g))) {
    dfi_t dfi = rempif(d);
    t = dfigetdf_vf2_dfi(dfi);
    o = vor_vo_vo_vo(visinf_vo_vf(d), visnan_vo_vf(d));
    t = vf2setx_vf2_vf2_vf(t, vreinterpret_vf_vm(vor_vm_vo32_vm(o, vreinterpret_vm_vf(vf2getx_vf_vf2(t)))));
    q = vsel_vi2_vo_vi2_vi2(g, q, dfigeti_vi2_dfi(dfi));
    s = vsel_vf2_vo_vf2_vf2(g, s, t);
  }

  t = s;

  s = vf2setx_vf2_vf2_vf(s, dfsqu_vf_vf2(s));

  u = vcast_vf_f(-0.000195169282960705459117889f);
  u = vmla_vf_vf_vf_vf(u, vf2getx_vf_vf2(s), vcast_vf_f(0.00833215750753879547119141f));
  u = vmla_vf_vf_vf_vf(u, vf2getx_vf_vf2(s), vcast_vf_f(-0.166666537523269653320312f));

  u = vmul_vf_vf_vf(u, vmul_vf_vf_vf(vf2getx_vf_vf2(s), vf2getx_vf_vf2(t)));

  x = dfadd_vf2_vf2_vf(t, u);
  rx = vadd_vf_vf_vf(vf2getx_vf_vf2(x), vf2gety_vf_vf2(x));

  rx = vsel_vf_vo_vf_vf(visnegzero_vo_vf(d), vcast_vf_f(-0.0f), rx);

  u = vcast_vf_f(-2.71811842367242206819355e-07f);
  u = vmla_vf_vf_vf_vf(u, vf2getx_vf_vf2(s), vcast_vf_f(2.47990446951007470488548e-05f));
  u = vmla_vf_vf_vf_vf(u, vf2getx_vf_vf2(s), vcast_vf_f(-0.00138888787478208541870117f));
  u = vmla_vf_vf_vf_vf(u, vf2getx_vf_vf2(s), vcast_vf_f(0.0416666641831398010253906f));
  u = vmla_vf_vf_vf_vf(u, vf2getx_vf_vf2(s), vcast_vf_f(-0.5));

  x = dfadd_vf2_vf_vf2(vcast_vf_f(1), dfmul_vf2_vf_vf(vf2getx_vf_vf2(s), u));
  ry = vadd_vf_vf_vf(vf2getx_vf_vf2(x), vf2gety_vf_vf2(x));

  o = veq_vo_vi2_vi2(vand_vi2_vi2_vi2(q, vcast_vi2_i(1)), vcast_vi2_i(0));
  r = vf2setxy_vf2_vf_vf(vsel_vf_vo_vf_vf(o, rx, ry), vsel_vf_vo_vf_vf(o, ry, rx));

  o = veq_vo_vi2_vi2(vand_vi2_vi2_vi2(q, vcast_vi2_i(2)), vcast_vi2_i(2));
  r = vf2setx_vf2_vf2_vf(r, vreinterpret_vf_vm(vxor_vm_vm_vm(vand_vm_vo32_vm(o, vreinterpret_vm_vf(vcast_vf_f(-0.0))), vreinterpret_vm_vf(vf2getx_vf_vf2(r)))));

  o = veq_vo_vi2_vi2(vand_vi2_vi2_vi2(vadd_vi2_vi2_vi2(q, vcast_vi2_i(1)), vcast_vi2_i(2)), vcast_vi2_i(2));
  r = vf2sety_vf2_vf2_vf(r, vreinterpret_vf_vm(vxor_vm_vm_vm(vand_vm_vo32_vm(o, vreinterpret_vm_vf(vcast_vf_f(-0.0))), vreinterpret_vm_vf(vf2gety_vf_vf2(r)))));

  return r;
#endif // #if !defined(DETERMINISTIC)
}

#if !defined(DETERMINISTIC)
// {sin(pi*d), cos(pi*d)} with double-float polynomials ("u05" accuracy
// variant).  Reduction is exact: q = truncation of 4*d rounded to even.
TYPE2_FUNCATR VECTOR_CC vfloat2 XSINCOSPIF_U05(vfloat d) {
  vopmask o;
  vfloat u, s, t, rx, ry;
  vfloat2 r, x, s2;

  u = vmul_vf_vf_vf(d, vcast_vf_f(4));
  vint2 q = vtruncate_vi2_vf(u);
  q = vand_vi2_vi2_vi2(vadd_vi2_vi2_vi2(q, vxor_vi2_vi2_vi2(vsrl_vi2_vi2_i(q, 31), vcast_vi2_i(1))), vcast_vi2_i(~1));
  s = vsub_vf_vf_vf(u, vcast_vf_vi2(q));

  t = s;
  s = vmul_vf_vf_vf(s, s);
  s2 = dfmul_vf2_vf_vf(t, t);
  
  // sin(pi*x) part.

  u = vcast_vf_f(+0.3093842054e-6);
  u = vmla_vf_vf_vf_vf(u, s, vcast_vf_f(-0.3657307388e-4));
  u = vmla_vf_vf_vf_vf(u, s, vcast_vf_f(+0.2490393585e-2));
  x = dfadd2_vf2_vf_vf2(vmul_vf_vf_vf(u, s), vcast_vf2_f_f(-0.080745510756969451904, -1.3373665339076936258e-09));
  x = dfadd2_vf2_vf2_vf2(dfmul_vf2_vf2_vf2(s2, x), vcast_vf2_f_f(0.78539818525314331055, -2.1857338617566484855e-08));

  x = dfmul_vf2_vf2_vf(x, t);
  rx = vadd_vf_vf_vf(vf2getx_vf_vf2(x), vf2gety_vf_vf2(x));

  rx = vsel_vf_vo_vf_vf(visnegzero_vo_vf(d), vcast_vf_f(-0.0f), rx);
  
  // cos(pi*x) part.
  
  u = vcast_vf_f(-0.2430611801e-7);
  u = vmla_vf_vf_vf_vf(u, s, vcast_vf_f(+0.3590577080e-5));
  u = vmla_vf_vf_vf_vf(u, s, vcast_vf_f(-0.3259917721e-3));
  x = 
vcast_vf_f(-0.3600925265e-4);
  u = vmla_vf_vf_vf_vf(u, s, vcast_vf_f(+0.2490088111e-2));
  u = vmla_vf_vf_vf_vf(u, s, vcast_vf_f(-0.8074551076e-1));
  u = vmla_vf_vf_vf_vf(u, s, vcast_vf_f(+0.7853981853e+0));

  rx = vmul_vf_vf_vf(u, t);

  // cos(pi*x) part.
  
  u = vcast_vf_f(+0.3539815225e-5);
  u = vmla_vf_vf_vf_vf(u, s, vcast_vf_f(-0.3259574005e-3));
  u = vmla_vf_vf_vf_vf(u, s, vcast_vf_f(+0.1585431583e-1));
  u = vmla_vf_vf_vf_vf(u, s, vcast_vf_f(-0.3084251285e+0));
  u = vmla_vf_vf_vf_vf(u, s, vcast_vf_f(1));

  ry = u;

  // Swap and sign-fix per octant pair; zero out |d| > 1e+7 and force NaN on
  // Inf inputs.

  o = veq_vo_vi2_vi2(vand_vi2_vi2_vi2(q, vcast_vi2_i(2)), vcast_vi2_i(0));
  r = vf2setxy_vf2_vf_vf(vsel_vf_vo_vf_vf(o, rx, ry), vsel_vf_vo_vf_vf(o, ry, rx));

  o = veq_vo_vi2_vi2(vand_vi2_vi2_vi2(q, vcast_vi2_i(4)), vcast_vi2_i(4));
  r = vf2setx_vf2_vf2_vf(r, vreinterpret_vf_vm(vxor_vm_vm_vm(vand_vm_vo32_vm(o, vreinterpret_vm_vf(vcast_vf_f(-0.0))), vreinterpret_vm_vf(vf2getx_vf_vf2(r)))));

  o = veq_vo_vi2_vi2(vand_vi2_vi2_vi2(vadd_vi2_vi2_vi2(q, vcast_vi2_i(2)), vcast_vi2_i(4)), vcast_vi2_i(4));
  r = vf2sety_vf2_vf2_vf(r, vreinterpret_vf_vm(vxor_vm_vm_vm(vand_vm_vo32_vm(o, vreinterpret_vm_vf(vcast_vf_f(-0.0))), vreinterpret_vm_vf(vf2gety_vf_vf2(r)))));

  o = vgt_vo_vf_vf(vabs_vf_vf(d), vcast_vf_f(1e+7f));
  r = vf2setx_vf2_vf2_vf(r, vreinterpret_vf_vm(vandnot_vm_vo32_vm(o, vreinterpret_vm_vf(vf2getx_vf_vf2(r)))));
  r = vf2sety_vf2_vf2_vf(r, vreinterpret_vf_vm(vandnot_vm_vo32_vm(o, vreinterpret_vm_vf(vf2gety_vf_vf2(r)))));
  
  o = visinf_vo_vf(d);
  r = vf2setx_vf2_vf2_vf(r, vreinterpret_vf_vm(vor_vm_vo32_vm(o, vreinterpret_vm_vf(vf2getx_vf_vf2(r)))));
  r = vf2sety_vf2_vf2_vf(r, vreinterpret_vf_vm(vor_vm_vo32_vm(o, vreinterpret_vm_vf(vf2gety_vf_vf2(r)))));

  return r;
}

// Vectorized modff: splits x into {fractional part, integral part}, both
// carrying the sign of x.  For |x| >= 2^23 the value is already integral in
// single precision, so the fraction is forced to 0.
TYPE6_FUNCATR VECTOR_CC vfloat2 XMODFF(vfloat x) {
  vfloat fr = vsub_vf_vf_vf(x, vcast_vf_vi2(vtruncate_vi2_vf(x)));
  fr = vsel_vf_vo_vf_vf(vgt_vo_vf_vf(vabs_vf_vf(x), vcast_vf_f(INT64_C(1) << 23)), vcast_vf_f(0), fr);

  vfloat2 ret;

  ret = vf2setxy_vf2_vf_vf(vcopysign_vf_vf_vf(fr, x), vcopysign_vf_vf_vf(vsub_vf_vf_vf(x, fr), x));

  return ret;
}

// GNUABI entry points: thin wrappers around the static *fk kernels that
// store the two results through the output pointers (unaligned stores).
#ifdef ENABLE_GNUABI
EXPORT VECTOR_CC void xsincosf(vfloat a, float *ps, float *pc) {
  vfloat2 r = sincosfk(a);
  vstoreu_v_p_vf(ps, vf2getx_vf_vf2(r));
  vstoreu_v_p_vf(pc, vf2gety_vf_vf2(r));
}

EXPORT VECTOR_CC void xsincosf_u1(vfloat a, float *ps, float *pc) {
  vfloat2 r = sincosfk_u1(a);
  vstoreu_v_p_vf(ps, vf2getx_vf_vf2(r));
  vstoreu_v_p_vf(pc, vf2gety_vf_vf2(r));
}

EXPORT VECTOR_CC void xsincospif_u05(vfloat a, float *ps, float *pc) {
  vfloat2 r = sincospifk_u05(a);
  vstoreu_v_p_vf(ps, vf2getx_vf_vf2(r));
  vstoreu_v_p_vf(pc, vf2gety_vf_vf2(r));
}

EXPORT VECTOR_CC void xsincospif_u35(vfloat a, float *ps, float *pc) {
  vfloat2 r = sincospifk_u35(a);
  vstoreu_v_p_vf(ps, vf2getx_vf_vf2(r));
  vstoreu_v_p_vf(pc, vf2gety_vf_vf2(r));
}

EXPORT CONST VECTOR_CC vfloat xmodff(vfloat a, float *iptr) {
  vfloat2 r = modffk(a);
  vstoreu_v_p_vf(iptr, vf2gety_vf_vf2(r));
  return vf2getx_vf_vf2(r);
}
#endif // #ifdef ENABLE_GNUABI
#endif // #if !defined(DETERMINISTIC)

// Vectorized single-precision tangent, higher-accuracy ("_u1") variant.
// NOTE(review): this function continues beyond this chunk of the file.
EXPORT CONST VECTOR_CC vfloat xtanf_u1(vfloat d) {
#if !defined(DETERMINISTIC)
  vint2 q;
  vfloat u, v;
  vfloat2 s, t, x;
  vopmask o;

  if (LIKELY(vtestallones_i_vo32(vlt_vo_vf_vf(vabs_vf_vf(d), vcast_vf_f(TRIGRANGEMAX2f))))) {
    u = vrint_vf_vf(vmul_vf_vf_vf(d, vcast_vf_f(2 * M_1_PI)));
    q = vrint_vi2_vf(u);
    v = vmla_vf_vf_vf_vf(u, vcast_vf_f(-PI_A2f*0.5f), d);
    s = dfadd2_vf2_vf_vf(v, vmul_vf_vf_vf(u, vcast_vf_f(-PI_B2f*0.5f)));
    s = dfadd_vf2_vf2_vf(s, vmul_vf_vf_vf(u, vcast_vf_f(-PI_C2f*0.5f)));
  } else {
    dfi_t dfi = rempif(d);
    q = dfigeti_vi2_dfi(dfi);
    s = dfigetdf_vf2_dfi(dfi);
    o = vor_vo_vo_vo(visinf_vo_vf(d), visnan_vo_vf(d));
    s = vf2setx_vf2_vf2_vf(s, vreinterpret_vf_vm(vor_vm_vo32_vm(o, vreinterpret_vm_vf(vf2getx_vf_vf2(s)))));
    s = vf2sety_vf2_vf2_vf(s, vreinterpret_vf_vm(vor_vm_vo32_vm(o, vreinterpret_vm_vf(vf2gety_vf_vf2(s)))));
  }

  o = veq_vo_vi2_vi2(vand_vi2_vi2_vi2(q, vcast_vi2_i(1)), vcast_vi2_i(1));
  vmask n = vand_vm_vo32_vm(o, vreinterpret_vm_vf(vcast_vf_f(-0.0)));
#if !defined(_MSC_VER)
  s = vf2setx_vf2_vf2_vf(s, vreinterpret_vf_vm(vxor_vm_vm_vm(vreinterpret_vm_vf(vf2getx_vf_vf2(s)), n)));
  s = vf2sety_vf2_vf2_vf(s, vreinterpret_vf_vm(vxor_vm_vm_vm(vreinterpret_vm_vf(vf2gety_vf_vf2(s)), n)));
#else
  s.x = vreinterpret_vf_vm(vxor_vm_vm_vm(vreinterpret_vm_vf(s.x), n));
  s.y = vreinterpret_vf_vm(vxor_vm_vm_vm(vreinterpret_vm_vf(s.y), n));
#endif

  t = s;
  s = dfsqu_vf2_vf2(s);
  s = dfnormalize_vf2_vf2(s);

  u = vcast_vf_f(0.00446636462584137916564941f);
  u = vmla_vf_vf_vf_vf(u, vf2getx_vf_vf2(s), vcast_vf_f(-8.3920182078145444393158e-05f));
  u = vmla_vf_vf_vf_vf(u, vf2getx_vf_vf2(s), vcast_vf_f(0.0109639242291450500488281f));
  u = vmla_vf_vf_vf_vf(u, vf2getx_vf_vf2(s), vcast_vf_f(0.0212360303848981857299805f));
  u = vmla_vf_vf_vf_vf(u, vf2getx_vf_vf2(s), vcast_vf_f(0.0540687143802642822265625f));

  x = dfadd_vf2_vf_vf(vcast_vf_f(0.133325666189193725585938f), vmul_vf_vf_vf(u, vf2getx_vf_vf2(s)));
  x = dfadd_vf2_vf_vf2(vcast_vf_f(1), dfmul_vf2_vf2_vf2(dfadd_vf2_vf_vf2(vcast_vf_f(0.33333361148834228515625f), dfmul_vf2_vf2_vf2(s, x)), s));
  x = dfmul_vf2_vf2_vf2(t, x);

  // tan = 1/result in odd quadrants (double-float reciprocal).
  x = vsel_vf2_vo_vf2_vf2(o, dfrec_vf2_vf2(x), x);

  u = vadd_vf_vf_vf(vf2getx_vf_vf2(x), vf2gety_vf_vf2(x));

  u = vsel_vf_vo_vf_vf(visnegzero_vo_vf(d), d, u);
  
  return u;

#else // #if !defined(DETERMINISTIC)

  vint2 q;
  vfloat u, v;
  vfloat2 s, t, x;
  vopmask o;

  u = vrint_vf_vf(vmul_vf_vf_vf(d, vcast_vf_f(2 * M_1_PI)));
  q = vrint_vi2_vf(u);
  v = vmla_vf_vf_vf_vf(u, vcast_vf_f(-PI_A2f*0.5f), d);
  s = dfadd2_vf2_vf_vf(v, vmul_vf_vf_vf(u, vcast_vf_f(-PI_B2f*0.5f)));
  s = dfadd_vf2_vf2_vf(s, vmul_vf_vf_vf(u, vcast_vf_f(-PI_C2f*0.5f)));
  vopmask g = 
vlt_vo_vf_vf(vabs_vf_vf(d), vcast_vf_f(TRIGRANGEMAX2f));\n\n  if (!LIKELY(vtestallones_i_vo32(g))) {\n    dfi_t dfi = rempif(d);\n    t = dfigetdf_vf2_dfi(dfi);\n    o = vor_vo_vo_vo(visinf_vo_vf(d), visnan_vo_vf(d));\n    t = vf2setx_vf2_vf2_vf(t, vreinterpret_vf_vm(vor_vm_vo32_vm(o, vreinterpret_vm_vf(vf2getx_vf_vf2(t)))));\n    t = vf2sety_vf2_vf2_vf(t, vreinterpret_vf_vm(vor_vm_vo32_vm(o, vreinterpret_vm_vf(vf2gety_vf_vf2(t)))));\n    q = vsel_vi2_vo_vi2_vi2(g, q, dfigeti_vi2_dfi(dfi));\n    s = vsel_vf2_vo_vf2_vf2(g, s, t);\n  }\n\n  o = veq_vo_vi2_vi2(vand_vi2_vi2_vi2(q, vcast_vi2_i(1)), vcast_vi2_i(1));\n  vmask n = vand_vm_vo32_vm(o, vreinterpret_vm_vf(vcast_vf_f(-0.0)));\n  s = vf2setx_vf2_vf2_vf(s, vreinterpret_vf_vm(vxor_vm_vm_vm(vreinterpret_vm_vf(vf2getx_vf_vf2(s)), n)));\n  s = vf2sety_vf2_vf2_vf(s, vreinterpret_vf_vm(vxor_vm_vm_vm(vreinterpret_vm_vf(vf2gety_vf_vf2(s)), n)));\n\n  t = s;\n  s = dfsqu_vf2_vf2(s);\n  s = dfnormalize_vf2_vf2(s);\n\n  u = vcast_vf_f(0.00446636462584137916564941f);\n  u = vmla_vf_vf_vf_vf(u, vf2getx_vf_vf2(s), vcast_vf_f(-8.3920182078145444393158e-05f));\n  u = vmla_vf_vf_vf_vf(u, vf2getx_vf_vf2(s), vcast_vf_f(0.0109639242291450500488281f));\n  u = vmla_vf_vf_vf_vf(u, vf2getx_vf_vf2(s), vcast_vf_f(0.0212360303848981857299805f));\n  u = vmla_vf_vf_vf_vf(u, vf2getx_vf_vf2(s), vcast_vf_f(0.0540687143802642822265625f));\n\n  x = dfadd_vf2_vf_vf(vcast_vf_f(0.133325666189193725585938f), vmul_vf_vf_vf(u, vf2getx_vf_vf2(s)));\n  x = dfadd_vf2_vf_vf2(vcast_vf_f(1), dfmul_vf2_vf2_vf2(dfadd_vf2_vf_vf2(vcast_vf_f(0.33333361148834228515625f), dfmul_vf2_vf2_vf2(s, x)), s));\n  x = dfmul_vf2_vf2_vf2(t, x);\n\n  x = vsel_vf2_vo_vf2_vf2(o, dfrec_vf2_vf2(x), x);\n\n  u = vadd_vf_vf_vf(vf2getx_vf_vf2(x), vf2gety_vf_vf2(x));\n\n  u = vsel_vf_vo_vf_vf(visnegzero_vo_vf(d), d, u);\n  \n  return u;\n#endif // #if !defined(DETERMINISTIC)\n}\n\n#if !defined(DETERMINISTIC)\nEXPORT CONST VECTOR_CC vfloat xatanf(vfloat d) {\n  vfloat s, t, u;\n  vint2 
q;\n\n  q = vsel_vi2_vf_vi2(d, vcast_vi2_i(2));\n  s = vabs_vf_vf(d);\n\n  q = vsel_vi2_vf_vf_vi2_vi2(vcast_vf_f(1.0f), s, vadd_vi2_vi2_vi2(q, vcast_vi2_i(1)), q);\n  s = vsel_vf_vo_vf_vf(vlt_vo_vf_vf(vcast_vf_f(1.0f), s), vrec_vf_vf(s), s);\n\n  t = vmul_vf_vf_vf(s, s);\n\n  vfloat t2 = vmul_vf_vf_vf(t, t), t4 = vmul_vf_vf_vf(t2, t2);\n  u = POLY8(t, t2, t4,\n\t    0.00282363896258175373077393f,\n\t    -0.0159569028764963150024414f,\n\t    0.0425049886107444763183594f,\n\t    -0.0748900920152664184570312f,\n\t    0.106347933411598205566406f,\n\t    -0.142027363181114196777344f,\n\t    0.199926957488059997558594f,\n\t    -0.333331018686294555664062f);\n\n  t = vmla_vf_vf_vf_vf(s, vmul_vf_vf_vf(t, u), s);\n\n  t = vsel_vf_vo_vf_vf(veq_vo_vi2_vi2(vand_vi2_vi2_vi2(q, vcast_vi2_i(1)), vcast_vi2_i(1)), vsub_vf_vf_vf(vcast_vf_f((float)(M_PI/2)), t), t);\n\n  t = vreinterpret_vf_vm(vxor_vm_vm_vm(vand_vm_vo32_vm(veq_vo_vi2_vi2(vand_vi2_vi2_vi2(q, vcast_vi2_i(2)), vcast_vi2_i(2)), vreinterpret_vm_vf(vcast_vf_f(-0.0f))), vreinterpret_vm_vf(t)));\n\n#if defined(ENABLE_NEON32) || defined(ENABLE_NEON32VFPV4)\n  t = vsel_vf_vo_vf_vf(visinf_vo_vf(d), vmulsign_vf_vf_vf(vcast_vf_f(1.5874010519681994747517056f), d), t);\n#endif\n\n  return t;\n}\n#endif // #if !defined(DETERMINISTIC)\n\nstatic INLINE CONST VECTOR_CC vfloat atan2kf(vfloat y, vfloat x) {\n  vfloat s, t, u;\n  vint2 q;\n  vopmask p;\n\n  q = vsel_vi2_vf_vi2(x, vcast_vi2_i(-2));\n  x = vabs_vf_vf(x);\n\n  q = vsel_vi2_vf_vf_vi2_vi2(x, y, vadd_vi2_vi2_vi2(q, vcast_vi2_i(1)), q);\n  p = vlt_vo_vf_vf(x, y);\n  s = vsel_vf_vo_vf_vf(p, vneg_vf_vf(x), y);\n  t = vmax_vf_vf_vf(x, y);\n\n  s = vdiv_vf_vf_vf(s, t);\n  t = vmul_vf_vf_vf(s, s);\n\n  vfloat t2 = vmul_vf_vf_vf(t, t), t4 = vmul_vf_vf_vf(t2, t2);\n  u = POLY8(t, t2, t4,\n\t    0.00282363896258175373077393f,\n\t    -0.0159569028764963150024414f,\n\t    0.0425049886107444763183594f,\n\t    -0.0748900920152664184570312f,\n\t    0.106347933411598205566406f,\n\t    
-0.142027363181114196777344f,\n\t    0.199926957488059997558594f,\n\t    -0.333331018686294555664062f);\n\n  t = vmla_vf_vf_vf_vf(s, vmul_vf_vf_vf(t, u), s);\n  t = vmla_vf_vf_vf_vf(vcast_vf_vi2(q), vcast_vf_f((float)(M_PI/2)), t);\n\n  return t;\n}\n\nstatic INLINE CONST VECTOR_CC vfloat visinf2_vf_vf_vf(vfloat d, vfloat m) {\n  return vreinterpret_vf_vm(vand_vm_vo32_vm(visinf_vo_vf(d), vor_vm_vm_vm(vsignbit_vm_vf(d), vreinterpret_vm_vf(m))));\n}\n\n#if !defined(DETERMINISTIC)\nEXPORT CONST VECTOR_CC vfloat xatan2f(vfloat y, vfloat x) {\n  vfloat r = atan2kf(vabs_vf_vf(y), x);\n\n  r = vmulsign_vf_vf_vf(r, x);\n  r = vsel_vf_vo_vf_vf(vor_vo_vo_vo(visinf_vo_vf(x), veq_vo_vf_vf(x, vcast_vf_f(0.0f))), vsub_vf_vf_vf(vcast_vf_f((float)(M_PI/2)), visinf2_vf_vf_vf(x, vmulsign_vf_vf_vf(vcast_vf_f((float)(M_PI/2)), x))), r);\n  r = vsel_vf_vo_vf_vf(visinf_vo_vf(y), vsub_vf_vf_vf(vcast_vf_f((float)(M_PI/2)), visinf2_vf_vf_vf(x, vmulsign_vf_vf_vf(vcast_vf_f((float)(M_PI/4)), x))), r);\n\n  r = vsel_vf_vo_vf_vf(veq_vo_vf_vf(y, vcast_vf_f(0.0f)), vreinterpret_vf_vm(vand_vm_vo32_vm(vsignbit_vo_vf(x), vreinterpret_vm_vf(vcast_vf_f((float)M_PI)))), r);\n\n  r = vreinterpret_vf_vm(vor_vm_vo32_vm(vor_vo_vo_vo(visnan_vo_vf(x), visnan_vo_vf(y)), vreinterpret_vm_vf(vmulsign_vf_vf_vf(r, y))));\n  return r;\n}\n\nEXPORT CONST VECTOR_CC vfloat xasinf(vfloat d) {\n  vopmask o = vlt_vo_vf_vf(vabs_vf_vf(d), vcast_vf_f(0.5f));\n  vfloat x2 = vsel_vf_vo_vf_vf(o, vmul_vf_vf_vf(d, d), vmul_vf_vf_vf(vsub_vf_vf_vf(vcast_vf_f(1), vabs_vf_vf(d)), vcast_vf_f(0.5f)));\n  vfloat x = vsel_vf_vo_vf_vf(o, vabs_vf_vf(d), vsqrt_vf_vf(x2)), u;\n\n  u = vcast_vf_f(+0.4197454825e-1);\n  u = vmla_vf_vf_vf_vf(u, x2, vcast_vf_f(+0.2424046025e-1));\n  u = vmla_vf_vf_vf_vf(u, x2, vcast_vf_f(+0.4547423869e-1));\n  u = vmla_vf_vf_vf_vf(u, x2, vcast_vf_f(+0.7495029271e-1));\n  u = vmla_vf_vf_vf_vf(u, x2, vcast_vf_f(+0.1666677296e+0));\n  u = vmla_vf_vf_vf_vf(u, vmul_vf_vf_vf(x, x2), x);\n\n  vfloat r = 
vsel_vf_vo_vf_vf(o, u, vmla_vf_vf_vf_vf(u, vcast_vf_f(-2), vcast_vf_f(M_PIf/2)));\n  return vmulsign_vf_vf_vf(r, d);\n}\n\nEXPORT CONST VECTOR_CC vfloat xacosf(vfloat d) {\n  vopmask o = vlt_vo_vf_vf(vabs_vf_vf(d), vcast_vf_f(0.5f));\n  vfloat x2 = vsel_vf_vo_vf_vf(o, vmul_vf_vf_vf(d, d),\n\t\t\t\tvmul_vf_vf_vf(vsub_vf_vf_vf(vcast_vf_f(1), vabs_vf_vf(d)), vcast_vf_f(0.5f))), u;\n  vfloat x = vsel_vf_vo_vf_vf(o, vabs_vf_vf(d), vsqrt_vf_vf(x2));\n  x = vsel_vf_vo_vf_vf(veq_vo_vf_vf(vabs_vf_vf(d), vcast_vf_f(1.0f)), vcast_vf_f(0), x);\n\n  u = vcast_vf_f(+0.4197454825e-1);\n  u = vmla_vf_vf_vf_vf(u, x2, vcast_vf_f(+0.2424046025e-1));\n  u = vmla_vf_vf_vf_vf(u, x2, vcast_vf_f(+0.4547423869e-1));\n  u = vmla_vf_vf_vf_vf(u, x2, vcast_vf_f(+0.7495029271e-1));\n  u = vmla_vf_vf_vf_vf(u, x2, vcast_vf_f(+0.1666677296e+0));\n  u = vmul_vf_vf_vf(u, vmul_vf_vf_vf(x2, x));\n\n  vfloat y = vsub_vf_vf_vf(vcast_vf_f(3.1415926535897932f/2), vadd_vf_vf_vf(vmulsign_vf_vf_vf(x, d), vmulsign_vf_vf_vf(u, d)));\n  x = vadd_vf_vf_vf(x, u);\n  vfloat r = vsel_vf_vo_vf_vf(o, y, vmul_vf_vf_vf(x, vcast_vf_f(2)));\n  return vsel_vf_vo_vf_vf(vandnot_vo_vo_vo(o, vlt_vo_vf_vf(d, vcast_vf_f(0))),\n\t\t\t  vf2getx_vf_vf2(dfadd_vf2_vf2_vf(vcast_vf2_f_f(3.1415927410125732422f,-8.7422776573475857731e-08f),\n\t\t\t\t\t\t\t  vneg_vf_vf(r))), r);\n}\n#endif // #if !defined(DETERMINISTIC)\n\n//\n\nstatic INLINE CONST VECTOR_CC vfloat2 atan2kf_u1(vfloat2 y, vfloat2 x) {\n  vfloat u;\n  vfloat2 s, t;\n  vint2 q;\n  vopmask p;\n  vmask r;\n  \n  q = vsel_vi2_vf_vf_vi2_vi2(vf2getx_vf_vf2(x), vcast_vf_f(0), vcast_vi2_i(-2), vcast_vi2_i(0));\n  p = vlt_vo_vf_vf(vf2getx_vf_vf2(x), vcast_vf_f(0));\n  r = vand_vm_vo32_vm(p, vreinterpret_vm_vf(vcast_vf_f(-0.0)));\n  x = vf2setx_vf2_vf2_vf(x, vreinterpret_vf_vm(vxor_vm_vm_vm(vreinterpret_vm_vf(vf2getx_vf_vf2(x)), r)));\n  x = vf2sety_vf2_vf2_vf(x, vreinterpret_vf_vm(vxor_vm_vm_vm(vreinterpret_vm_vf(vf2gety_vf_vf2(x)), r)));\n\n  q = 
vsel_vi2_vf_vf_vi2_vi2(vf2getx_vf_vf2(x), vf2getx_vf_vf2(y), vadd_vi2_vi2_vi2(q, vcast_vi2_i(1)), q);\n  p = vlt_vo_vf_vf(vf2getx_vf_vf2(x), vf2getx_vf_vf2(y));\n  s = vsel_vf2_vo_vf2_vf2(p, dfneg_vf2_vf2(x), y);\n  t = vsel_vf2_vo_vf2_vf2(p, y, x);\n\n  s = dfdiv_vf2_vf2_vf2(s, t);\n  t = dfsqu_vf2_vf2(s);\n  t = dfnormalize_vf2_vf2(t);\n\n  u = vcast_vf_f(-0.00176397908944636583328247f);\n  u = vmla_vf_vf_vf_vf(u, vf2getx_vf_vf2(t), vcast_vf_f(0.0107900900766253471374512f));\n  u = vmla_vf_vf_vf_vf(u, vf2getx_vf_vf2(t), vcast_vf_f(-0.0309564601629972457885742f));\n  u = vmla_vf_vf_vf_vf(u, vf2getx_vf_vf2(t), vcast_vf_f(0.0577365085482597351074219f));\n  u = vmla_vf_vf_vf_vf(u, vf2getx_vf_vf2(t), vcast_vf_f(-0.0838950723409652709960938f));\n  u = vmla_vf_vf_vf_vf(u, vf2getx_vf_vf2(t), vcast_vf_f(0.109463557600975036621094f));\n  u = vmla_vf_vf_vf_vf(u, vf2getx_vf_vf2(t), vcast_vf_f(-0.142626821994781494140625f));\n  u = vmla_vf_vf_vf_vf(u, vf2getx_vf_vf2(t), vcast_vf_f(0.199983194470405578613281f));\n\n  t = dfmul_vf2_vf2_vf2(t, dfadd_vf2_vf_vf(vcast_vf_f(-0.333332866430282592773438f), vmul_vf_vf_vf(u, vf2getx_vf_vf2(t))));\n  t = dfmul_vf2_vf2_vf2(s, dfadd_vf2_vf_vf2(vcast_vf_f(1), t));\n  t = dfadd_vf2_vf2_vf2(dfmul_vf2_vf2_vf(vcast_vf2_f_f(1.5707963705062866211f, -4.3711388286737928865e-08f), vcast_vf_vi2(q)), t);\n\n  return t;\n}\n\n#if !defined(DETERMINISTIC)\nEXPORT CONST VECTOR_CC vfloat xatan2f_u1(vfloat y, vfloat x) {\n  vopmask o = vlt_vo_vf_vf(vabs_vf_vf(x), vcast_vf_f(2.9387372783541830947e-39f)); // nexttowardf((1.0 / FLT_MAX), 1)\n  x = vsel_vf_vo_vf_vf(o, vmul_vf_vf_vf(x, vcast_vf_f(1 << 24)), x);\n  y = vsel_vf_vo_vf_vf(o, vmul_vf_vf_vf(y, vcast_vf_f(1 << 24)), y);\n  \n  vfloat2 d = atan2kf_u1(vcast_vf2_vf_vf(vabs_vf_vf(y), vcast_vf_f(0)), vcast_vf2_vf_vf(x, vcast_vf_f(0)));\n  vfloat r = vadd_vf_vf_vf(vf2getx_vf_vf2(d), vf2gety_vf_vf2(d));\n\n  r = vmulsign_vf_vf_vf(r, x);\n  r = vsel_vf_vo_vf_vf(vor_vo_vo_vo(visinf_vo_vf(x), veq_vo_vf_vf(x, 
vcast_vf_f(0))), vsub_vf_vf_vf(vcast_vf_f(M_PI/2), visinf2_vf_vf_vf(x, vmulsign_vf_vf_vf(vcast_vf_f(M_PI/2), x))), r);\n  r = vsel_vf_vo_vf_vf(visinf_vo_vf(y), vsub_vf_vf_vf(vcast_vf_f(M_PI/2), visinf2_vf_vf_vf(x, vmulsign_vf_vf_vf(vcast_vf_f(M_PI/4), x))), r);\n  r = vsel_vf_vo_vf_vf(veq_vo_vf_vf(y, vcast_vf_f(0.0f)), vreinterpret_vf_vm(vand_vm_vo32_vm(vsignbit_vo_vf(x), vreinterpret_vm_vf(vcast_vf_f((float)M_PI)))), r);\n\n  r = vreinterpret_vf_vm(vor_vm_vo32_vm(vor_vo_vo_vo(visnan_vo_vf(x), visnan_vo_vf(y)), vreinterpret_vm_vf(vmulsign_vf_vf_vf(r, y))));\n  return r;\n}\n\nEXPORT CONST VECTOR_CC vfloat xasinf_u1(vfloat d) {\n  vopmask o = vlt_vo_vf_vf(vabs_vf_vf(d), vcast_vf_f(0.5f));\n  vfloat x2 = vsel_vf_vo_vf_vf(o, vmul_vf_vf_vf(d, d), vmul_vf_vf_vf(vsub_vf_vf_vf(vcast_vf_f(1), vabs_vf_vf(d)), vcast_vf_f(0.5f))), u;\n  vfloat2 x = vsel_vf2_vo_vf2_vf2(o, vcast_vf2_vf_vf(vabs_vf_vf(d), vcast_vf_f(0)), dfsqrt_vf2_vf(x2));\n  x = vsel_vf2_vo_vf2_vf2(veq_vo_vf_vf(vabs_vf_vf(d), vcast_vf_f(1.0f)), vcast_vf2_f_f(0, 0), x);\n\n  u = vcast_vf_f(+0.4197454825e-1);\n  u = vmla_vf_vf_vf_vf(u, x2, vcast_vf_f(+0.2424046025e-1));\n  u = vmla_vf_vf_vf_vf(u, x2, vcast_vf_f(+0.4547423869e-1));\n  u = vmla_vf_vf_vf_vf(u, x2, vcast_vf_f(+0.7495029271e-1));\n  u = vmla_vf_vf_vf_vf(u, x2, vcast_vf_f(+0.1666677296e+0));\n  u = vmul_vf_vf_vf(u, vmul_vf_vf_vf(x2, vf2getx_vf_vf2(x)));\n\n  vfloat2 y = dfsub_vf2_vf2_vf(dfsub_vf2_vf2_vf2(vcast_vf2_f_f(3.1415927410125732422f/4,-8.7422776573475857731e-08f/4), x), u);\n  \n  vfloat r = vsel_vf_vo_vf_vf(o, vadd_vf_vf_vf(u, vf2getx_vf_vf2(x)),\n\t\t\t       vmul_vf_vf_vf(vadd_vf_vf_vf(vf2getx_vf_vf2(y), vf2gety_vf_vf2(y)), vcast_vf_f(2)));\n  return vmulsign_vf_vf_vf(r, d);\n}\n\nEXPORT CONST VECTOR_CC vfloat xacosf_u1(vfloat d) {\n  vopmask o = vlt_vo_vf_vf(vabs_vf_vf(d), vcast_vf_f(0.5f));\n  vfloat x2 = vsel_vf_vo_vf_vf(o, vmul_vf_vf_vf(d, d), vmul_vf_vf_vf(vsub_vf_vf_vf(vcast_vf_f(1), vabs_vf_vf(d)), vcast_vf_f(0.5f))), u;\n  vfloat2 x 
= vsel_vf2_vo_vf2_vf2(o, vcast_vf2_vf_vf(vabs_vf_vf(d), vcast_vf_f(0)), dfsqrt_vf2_vf(x2));\n  x = vsel_vf2_vo_vf2_vf2(veq_vo_vf_vf(vabs_vf_vf(d), vcast_vf_f(1.0f)), vcast_vf2_f_f(0, 0), x);\n\n  u = vcast_vf_f(+0.4197454825e-1);\n  u = vmla_vf_vf_vf_vf(u, x2, vcast_vf_f(+0.2424046025e-1));\n  u = vmla_vf_vf_vf_vf(u, x2, vcast_vf_f(+0.4547423869e-1));\n  u = vmla_vf_vf_vf_vf(u, x2, vcast_vf_f(+0.7495029271e-1));\n  u = vmla_vf_vf_vf_vf(u, x2, vcast_vf_f(+0.1666677296e+0));\n  u = vmul_vf_vf_vf(u, vmul_vf_vf_vf(x2, vf2getx_vf_vf2(x)));\n\n  vfloat2 y = dfsub_vf2_vf2_vf2(vcast_vf2_f_f(3.1415927410125732422f/2, -8.7422776573475857731e-08f/2),\n\t\t\t\t dfadd_vf2_vf_vf(vmulsign_vf_vf_vf(vf2getx_vf_vf2(x), d), vmulsign_vf_vf_vf(u, d)));\n  x = dfadd_vf2_vf2_vf(x, u);\n\n  y = vsel_vf2_vo_vf2_vf2(o, y, dfscale_vf2_vf2_vf(x, vcast_vf_f(2)));\n  \n  y = vsel_vf2_vo_vf2_vf2(vandnot_vo_vo_vo(o, vlt_vo_vf_vf(d, vcast_vf_f(0))),\n\t\t\t  dfsub_vf2_vf2_vf2(vcast_vf2_f_f(3.1415927410125732422f, -8.7422776573475857731e-08f), y), y);\n\n  return vadd_vf_vf_vf(vf2getx_vf_vf2(y), vf2gety_vf_vf2(y));\n}\n\nEXPORT CONST VECTOR_CC vfloat xatanf_u1(vfloat d) {\n  vfloat2 d2 = atan2kf_u1(vcast_vf2_vf_vf(vabs_vf_vf(d), vcast_vf_f(0)), vcast_vf2_f_f(1, 0));\n  vfloat r = vadd_vf_vf_vf(vf2getx_vf_vf2(d2), vf2gety_vf_vf2(d2));\n  r = vsel_vf_vo_vf_vf(visinf_vo_vf(d), vcast_vf_f(1.570796326794896557998982), r);\n  return vmulsign_vf_vf_vf(r, d);\n}\n#endif // #if !defined(DETERMINISTIC)\n\n//\n\n#if !defined(DETERMINISTIC)\nEXPORT CONST VECTOR_CC vfloat xlogf(vfloat d) {\n  vfloat x, x2, t, m;\n\n#if !defined(ENABLE_AVX512F) && !defined(ENABLE_AVX512FNOFMA)\n  vopmask o = vlt_vo_vf_vf(d, vcast_vf_f(FLT_MIN));\n  d = vsel_vf_vo_vf_vf(o, vmul_vf_vf_vf(d, vcast_vf_f((float)(INT64_C(1) << 32) * (float)(INT64_C(1) << 32))), d);\n  vint2 e = vilogb2k_vi2_vf(vmul_vf_vf_vf(d, vcast_vf_f(1.0f/0.75f)));\n  m = vldexp3_vf_vf_vi2(d, vneg_vi2_vi2(e));\n  e = vsel_vi2_vo_vi2_vi2(o, vsub_vi2_vi2_vi2(e, 
vcast_vi2_i(64)), e);\n#else\n  vfloat e = vgetexp_vf_vf(vmul_vf_vf_vf(d, vcast_vf_f(1.0f/0.75f)));\n  e = vsel_vf_vo_vf_vf(vispinf_vo_vf(e), vcast_vf_f(128.0f), e);\n  m = vgetmant_vf_vf(d);\n#endif\n  \n  x = vdiv_vf_vf_vf(vsub_vf_vf_vf(m, vcast_vf_f(1.0f)), vadd_vf_vf_vf(vcast_vf_f(1.0f), m));\n  x2 = vmul_vf_vf_vf(x, x);\n\n  t = vcast_vf_f(0.2392828464508056640625f);\n  t = vmla_vf_vf_vf_vf(t, x2, vcast_vf_f(0.28518211841583251953125f));\n  t = vmla_vf_vf_vf_vf(t, x2, vcast_vf_f(0.400005877017974853515625f));\n  t = vmla_vf_vf_vf_vf(t, x2, vcast_vf_f(0.666666686534881591796875f));\n  t = vmla_vf_vf_vf_vf(t, x2, vcast_vf_f(2.0f));\n\n#if !defined(ENABLE_AVX512F) && !defined(ENABLE_AVX512FNOFMA)\n  x = vmla_vf_vf_vf_vf(x, t, vmul_vf_vf_vf(vcast_vf_f(0.693147180559945286226764f), vcast_vf_vi2(e)));\n  x = vsel_vf_vo_vf_vf(vispinf_vo_vf(d), vcast_vf_f(SLEEF_INFINITYf), x);\n  x = vsel_vf_vo_vf_vf(vor_vo_vo_vo(vlt_vo_vf_vf(d, vcast_vf_f(0)), visnan_vo_vf(d)), vcast_vf_f(SLEEF_NANf), x);\n  x = vsel_vf_vo_vf_vf(veq_vo_vf_vf(d, vcast_vf_f(0)), vcast_vf_f(-SLEEF_INFINITYf), x);\n#else\n  x = vmla_vf_vf_vf_vf(x, t, vmul_vf_vf_vf(vcast_vf_f(0.693147180559945286226764f), e));\n  x = vfixup_vf_vf_vf_vi2_i(x, d, vcast_vi2_i((5 << (5*4))), 0);\n#endif\n  \n  return x;\n}\n#endif // #if !defined(DETERMINISTIC)\n\n#if !defined(DETERMINISTIC)\nEXPORT CONST VECTOR_CC vfloat xexpf(vfloat d) {\n  vint2 q = vrint_vi2_vf(vmul_vf_vf_vf(d, vcast_vf_f(R_LN2f)));\n  vfloat s, u;\n\n  s = vmla_vf_vf_vf_vf(vcast_vf_vi2(q), vcast_vf_f(-L2Uf), d);\n  s = vmla_vf_vf_vf_vf(vcast_vf_vi2(q), vcast_vf_f(-L2Lf), s);\n\n  u = vcast_vf_f(0.000198527617612853646278381);\n  u = vmla_vf_vf_vf_vf(u, s, vcast_vf_f(0.00139304355252534151077271));\n  u = vmla_vf_vf_vf_vf(u, s, vcast_vf_f(0.00833336077630519866943359));\n  u = vmla_vf_vf_vf_vf(u, s, vcast_vf_f(0.0416664853692054748535156));\n  u = vmla_vf_vf_vf_vf(u, s, vcast_vf_f(0.166666671633720397949219));\n  u = vmla_vf_vf_vf_vf(u, s, 
vcast_vf_f(0.5));\n\n  u = vadd_vf_vf_vf(vcast_vf_f(1.0f), vmla_vf_vf_vf_vf(vmul_vf_vf_vf(s, s), u, s));\n\n  u = vldexp2_vf_vf_vi2(u, q);\n\n  u = vreinterpret_vf_vm(vandnot_vm_vo32_vm(vlt_vo_vf_vf(d, vcast_vf_f(-104)), vreinterpret_vm_vf(u)));\n  u = vsel_vf_vo_vf_vf(vlt_vo_vf_vf(vcast_vf_f(100), d), vcast_vf_f(SLEEF_INFINITYf), u);\n\n  return u;\n}\n#endif // #if !defined(DETERMINISTIC)\n\nstatic INLINE CONST VECTOR_CC vfloat expm1fk(vfloat d) {\n  vint2 q = vrint_vi2_vf(vmul_vf_vf_vf(d, vcast_vf_f(R_LN2f)));\n  vfloat s, u;\n\n  s = vmla_vf_vf_vf_vf(vcast_vf_vi2(q), vcast_vf_f(-L2Uf), d);\n  s = vmla_vf_vf_vf_vf(vcast_vf_vi2(q), vcast_vf_f(-L2Lf), s);\n\n  vfloat s2 = vmul_vf_vf_vf(s, s), s4 = vmul_vf_vf_vf(s2, s2);\n  u = POLY6(s, s2, s4,\n\t    0.000198527617612853646278381,\n\t    0.00139304355252534151077271,\n\t    0.00833336077630519866943359,\n\t    0.0416664853692054748535156,\n\t    0.166666671633720397949219,\n\t    0.5);\n\n  u = vmla_vf_vf_vf_vf(vmul_vf_vf_vf(s, s), u, s);\n\n  u = vsel_vf_vo_vf_vf(veq_vo_vi2_vi2(q, vcast_vi2_i(0)), u,\n\t\t       vsub_vf_vf_vf(vldexp2_vf_vf_vi2(vadd_vf_vf_vf(u, vcast_vf_f(1)), q), vcast_vf_f(1)));\n\n  return u;\n}\n\n#if defined(ENABLE_NEON32) || defined(ENABLE_NEON32VFPV4)\nEXPORT CONST VECTOR_CC vfloat xsqrtf_u35(vfloat d) {\n  vfloat e = vreinterpret_vf_vi2(vadd_vi2_vi2_vi2(vcast_vi2_i(0x20000000), vand_vi2_vi2_vi2(vcast_vi2_i(0x7f000000), vsrl_vi2_vi2_i(vreinterpret_vi2_vf(d), 1))));\n  vfloat m = vreinterpret_vf_vi2(vadd_vi2_vi2_vi2(vcast_vi2_i(0x3f000000), vand_vi2_vi2_vi2(vcast_vi2_i(0x01ffffff), vreinterpret_vi2_vf(d))));\n  float32x4_t x = vrsqrteq_f32(m);\n  x = vmulq_f32(x, vrsqrtsq_f32(m, vmulq_f32(x, x)));\n  float32x4_t u = vmulq_f32(x, m);\n  u = vmlaq_f32(u, vmlsq_f32(m, u, u), vmulq_f32(x, vdupq_n_f32(0.5)));\n  e = vreinterpret_vf_vm(vandnot_vm_vo32_vm(veq_vo_vf_vf(d, vcast_vf_f(0)), vreinterpret_vm_vf(e)));\n  u = vmul_vf_vf_vf(e, u);\n\n  u = vsel_vf_vo_vf_vf(visinf_vo_vf(d), 
vcast_vf_f(SLEEF_INFINITYf), u);\n  u = vreinterpret_vf_vm(vor_vm_vo32_vm(vor_vo_vo_vo(visnan_vo_vf(d), vlt_vo_vf_vf(d, vcast_vf_f(0))), vreinterpret_vm_vf(u)));\n  u = vmulsign_vf_vf_vf(u, d);\n\n  return u;\n}\n#elif defined(ENABLE_VECEXT)\nEXPORT CONST VECTOR_CC vfloat xsqrtf_u35(vfloat d) {\n  vfloat q = vsqrt_vf_vf(d);\n  q = vsel_vf_vo_vf_vf(visnegzero_vo_vf(d), vcast_vf_f(-0.0), q);\n  return vsel_vf_vo_vf_vf(vispinf_vo_vf(d), vcast_vf_f(SLEEF_INFINITYf), q);\n}\n#else\nEXPORT CONST VECTOR_CC vfloat xsqrtf_u35(vfloat d) { return vsqrt_vf_vf(d); }\n#endif\n\n#if !defined(DETERMINISTIC)\nEXPORT CONST VECTOR_CC vfloat xcbrtf(vfloat d) {\n  vfloat x, y, q = vcast_vf_f(1.0), t;\n  vint2 e, qu, re;\n\n#if defined(ENABLE_AVX512F) || defined(ENABLE_AVX512FNOFMA)\n  vfloat s = d;\n#endif\n  e = vadd_vi2_vi2_vi2(vilogbk_vi2_vf(vabs_vf_vf(d)), vcast_vi2_i(1));\n  d = vldexp2_vf_vf_vi2(d, vneg_vi2_vi2(e));\n\n  t = vadd_vf_vf_vf(vcast_vf_vi2(e), vcast_vf_f(6144));\n  qu = vtruncate_vi2_vf(vmul_vf_vf_vf(t, vcast_vf_f(1.0f/3.0f)));\n  re = vtruncate_vi2_vf(vsub_vf_vf_vf(t, vmul_vf_vf_vf(vcast_vf_vi2(qu), vcast_vf_f(3))));\n\n  q = vsel_vf_vo_vf_vf(veq_vo_vi2_vi2(re, vcast_vi2_i(1)), vcast_vf_f(1.2599210498948731647672106f), q);\n  q = vsel_vf_vo_vf_vf(veq_vo_vi2_vi2(re, vcast_vi2_i(2)), vcast_vf_f(1.5874010519681994747517056f), q);\n  q = vldexp2_vf_vf_vi2(q, vsub_vi2_vi2_vi2(qu, vcast_vi2_i(2048)));\n\n  q = vmulsign_vf_vf_vf(q, d);\n  d = vabs_vf_vf(d);\n\n  x = vcast_vf_f(-0.601564466953277587890625f);\n  x = vmla_vf_vf_vf_vf(x, d, vcast_vf_f(2.8208892345428466796875f));\n  x = vmla_vf_vf_vf_vf(x, d, vcast_vf_f(-5.532182216644287109375f));\n  x = vmla_vf_vf_vf_vf(x, d, vcast_vf_f(5.898262500762939453125f));\n  x = vmla_vf_vf_vf_vf(x, d, vcast_vf_f(-3.8095417022705078125f));\n  x = vmla_vf_vf_vf_vf(x, d, vcast_vf_f(2.2241256237030029296875f));\n\n  y = vmul_vf_vf_vf(vmul_vf_vf_vf(d, x), x);\n  y = vmul_vf_vf_vf(vsub_vf_vf_vf(y, 
vmul_vf_vf_vf(vmul_vf_vf_vf(vcast_vf_f(2.0f / 3.0f), y), vmla_vf_vf_vf_vf(y, x, vcast_vf_f(-1.0f)))), q);\n\n#if defined(ENABLE_AVX512F) || defined(ENABLE_AVX512FNOFMA)\n  y = vsel_vf_vo_vf_vf(visinf_vo_vf(s), vmulsign_vf_vf_vf(vcast_vf_f(SLEEF_INFINITYf), s), y);\n  y = vsel_vf_vo_vf_vf(veq_vo_vf_vf(s, vcast_vf_f(0)), vmulsign_vf_vf_vf(vcast_vf_f(0), s), y);\n#endif\n  \n  return y;\n}\n#endif // #if !defined(DETERMINISTIC)\n\n#if !defined(DETERMINISTIC)\nEXPORT CONST VECTOR_CC vfloat xcbrtf_u1(vfloat d) {\n  vfloat x, y, z, t;\n  vfloat2 q2 = vcast_vf2_f_f(1, 0), u, v;\n  vint2 e, qu, re;\n\n#if defined(ENABLE_AVX512F) || defined(ENABLE_AVX512FNOFMA)\n  vfloat s = d;\n#endif\n  e = vadd_vi2_vi2_vi2(vilogbk_vi2_vf(vabs_vf_vf(d)), vcast_vi2_i(1));\n  d = vldexp2_vf_vf_vi2(d, vneg_vi2_vi2(e));\n\n  t = vadd_vf_vf_vf(vcast_vf_vi2(e), vcast_vf_f(6144));\n  qu = vtruncate_vi2_vf(vmul_vf_vf_vf(t, vcast_vf_f(1.0/3.0)));\n  re = vtruncate_vi2_vf(vsub_vf_vf_vf(t, vmul_vf_vf_vf(vcast_vf_vi2(qu), vcast_vf_f(3))));\n\n  q2 = vsel_vf2_vo_vf2_vf2(veq_vo_vi2_vi2(re, vcast_vi2_i(1)), vcast_vf2_f_f(1.2599210739135742188f, -2.4018701694217270415e-08), q2);\n  q2 = vsel_vf2_vo_vf2_vf2(veq_vo_vi2_vi2(re, vcast_vi2_i(2)), vcast_vf2_f_f(1.5874010324478149414f,  1.9520385308169352356e-08), q2);\n\n  q2 = vf2setx_vf2_vf2_vf(q2, vmulsign_vf_vf_vf(vf2getx_vf_vf2(q2), d));\n  q2 = vf2sety_vf2_vf2_vf(q2, vmulsign_vf_vf_vf(vf2gety_vf_vf2(q2), d));\n  d = vabs_vf_vf(d);\n\n  x = vcast_vf_f(-0.601564466953277587890625f);\n  x = vmla_vf_vf_vf_vf(x, d, vcast_vf_f(2.8208892345428466796875f));\n  x = vmla_vf_vf_vf_vf(x, d, vcast_vf_f(-5.532182216644287109375f));\n  x = vmla_vf_vf_vf_vf(x, d, vcast_vf_f(5.898262500762939453125f));\n  x = vmla_vf_vf_vf_vf(x, d, vcast_vf_f(-3.8095417022705078125f));\n  x = vmla_vf_vf_vf_vf(x, d, vcast_vf_f(2.2241256237030029296875f));\n\n  y = vmul_vf_vf_vf(x, x); y = vmul_vf_vf_vf(y, y); x = vsub_vf_vf_vf(x, vmul_vf_vf_vf(vmlanp_vf_vf_vf_vf(d, y, x), vcast_vf_f(-1.0 
/ 3.0)));\n\n  z = x;\n\n  u = dfmul_vf2_vf_vf(x, x);\n  u = dfmul_vf2_vf2_vf2(u, u);\n  u = dfmul_vf2_vf2_vf(u, d);\n  u = dfadd2_vf2_vf2_vf(u, vneg_vf_vf(x));\n  y = vadd_vf_vf_vf(vf2getx_vf_vf2(u), vf2gety_vf_vf2(u));\n\n  y = vmul_vf_vf_vf(vmul_vf_vf_vf(vcast_vf_f(-2.0 / 3.0), y), z);\n  v = dfadd2_vf2_vf2_vf(dfmul_vf2_vf_vf(z, z), y);\n  v = dfmul_vf2_vf2_vf(v, d);\n  v = dfmul_vf2_vf2_vf2(v, q2);\n  z = vldexp2_vf_vf_vi2(vadd_vf_vf_vf(vf2getx_vf_vf2(v), vf2gety_vf_vf2(v)), vsub_vi2_vi2_vi2(qu, vcast_vi2_i(2048)));\n\n  z = vsel_vf_vo_vf_vf(visinf_vo_vf(d), vmulsign_vf_vf_vf(vcast_vf_f(SLEEF_INFINITYf), vf2getx_vf_vf2(q2)), z);\n  z = vsel_vf_vo_vf_vf(veq_vo_vf_vf(d, vcast_vf_f(0)), vreinterpret_vf_vm(vsignbit_vm_vf(vf2getx_vf_vf2(q2))), z);\n\n#if defined(ENABLE_AVX512F) || defined(ENABLE_AVX512FNOFMA)\n  z = vsel_vf_vo_vf_vf(visinf_vo_vf(s), vmulsign_vf_vf_vf(vcast_vf_f(SLEEF_INFINITYf), s), z);\n  z = vsel_vf_vo_vf_vf(veq_vo_vf_vf(s, vcast_vf_f(0)), vmulsign_vf_vf_vf(vcast_vf_f(0), s), z);\n#endif\n\n  return z;\n}\n#endif // #if !defined(DETERMINISTIC)\n\nstatic INLINE CONST VECTOR_CC vfloat2 logkf(vfloat d) {\n  vfloat2 x, x2;\n  vfloat t, m;\n\n#if !defined(ENABLE_AVX512F) && !defined(ENABLE_AVX512FNOFMA)\n  vopmask o = vlt_vo_vf_vf(d, vcast_vf_f(FLT_MIN));\n  d = vsel_vf_vo_vf_vf(o, vmul_vf_vf_vf(d, vcast_vf_f((float)(INT64_C(1) << 32) * (float)(INT64_C(1) << 32))), d);\n  vint2 e = vilogb2k_vi2_vf(vmul_vf_vf_vf(d, vcast_vf_f(1.0f/0.75f)));\n  m = vldexp3_vf_vf_vi2(d, vneg_vi2_vi2(e));\n  e = vsel_vi2_vo_vi2_vi2(o, vsub_vi2_vi2_vi2(e, vcast_vi2_i(64)), e);\n#else\n  vfloat e = vgetexp_vf_vf(vmul_vf_vf_vf(d, vcast_vf_f(1.0f/0.75f)));\n  e = vsel_vf_vo_vf_vf(vispinf_vo_vf(e), vcast_vf_f(128.0f), e);\n  m = vgetmant_vf_vf(d);\n#endif\n\n  x = dfdiv_vf2_vf2_vf2(dfadd2_vf2_vf_vf(vcast_vf_f(-1), m), dfadd2_vf2_vf_vf(vcast_vf_f(1), m));\n  x2 = dfsqu_vf2_vf2(x);\n\n  t = vcast_vf_f(0.240320354700088500976562);\n  t = vmla_vf_vf_vf_vf(t, vf2getx_vf_vf2(x2), 
vcast_vf_f(0.285112679004669189453125));\n  t = vmla_vf_vf_vf_vf(t, vf2getx_vf_vf2(x2), vcast_vf_f(0.400007992982864379882812));\n  vfloat2 c = vcast_vf2_f_f(0.66666662693023681640625f, 3.69183861259614332084311e-09f);\n\n#if !defined(ENABLE_AVX512F) && !defined(ENABLE_AVX512FNOFMA)\n  vfloat2 s = dfmul_vf2_vf2_vf(vcast_vf2_f_f(0.69314718246459960938f, -1.904654323148236017e-09f), vcast_vf_vi2(e));\n#else\n  vfloat2 s = dfmul_vf2_vf2_vf(vcast_vf2_f_f(0.69314718246459960938f, -1.904654323148236017e-09f), e);\n#endif\n\n  s = dfadd_vf2_vf2_vf2(s, dfscale_vf2_vf2_vf(x, vcast_vf_f(2)));\n  s = dfadd_vf2_vf2_vf2(s, dfmul_vf2_vf2_vf2(dfmul_vf2_vf2_vf2(x2, x),\n\t\t\t\t\t     dfadd2_vf2_vf2_vf2(dfmul_vf2_vf2_vf(x2, t), c)));\n  return s;\n}\n\nstatic INLINE CONST VECTOR_CC vfloat logk3f(vfloat d) {\n  vfloat x, x2, t, m;\n\n#if !defined(ENABLE_AVX512F) && !defined(ENABLE_AVX512FNOFMA)\n  vopmask o = vlt_vo_vf_vf(d, vcast_vf_f(FLT_MIN));\n  d = vsel_vf_vo_vf_vf(o, vmul_vf_vf_vf(d, vcast_vf_f((float)(INT64_C(1) << 32) * (float)(INT64_C(1) << 32))), d);\n  vint2 e = vilogb2k_vi2_vf(vmul_vf_vf_vf(d, vcast_vf_f(1.0f/0.75f)));\n  m = vldexp3_vf_vf_vi2(d, vneg_vi2_vi2(e));\n  e = vsel_vi2_vo_vi2_vi2(o, vsub_vi2_vi2_vi2(e, vcast_vi2_i(64)), e);\n#else\n  vfloat e = vgetexp_vf_vf(vmul_vf_vf_vf(d, vcast_vf_f(1.0f/0.75f)));\n  e = vsel_vf_vo_vf_vf(vispinf_vo_vf(e), vcast_vf_f(128.0f), e);\n  m = vgetmant_vf_vf(d);\n#endif\n\n  x = vdiv_vf_vf_vf(vsub_vf_vf_vf(m, vcast_vf_f(1.0f)), vadd_vf_vf_vf(vcast_vf_f(1.0f), m));\n  x2 = vmul_vf_vf_vf(x, x);\n\n  t = vcast_vf_f(0.2392828464508056640625f);\n  t = vmla_vf_vf_vf_vf(t, x2, vcast_vf_f(0.28518211841583251953125f));\n  t = vmla_vf_vf_vf_vf(t, x2, vcast_vf_f(0.400005877017974853515625f));\n  t = vmla_vf_vf_vf_vf(t, x2, vcast_vf_f(0.666666686534881591796875f));\n  t = vmla_vf_vf_vf_vf(t, x2, vcast_vf_f(2.0f));\n\n#if !defined(ENABLE_AVX512F) && !defined(ENABLE_AVX512FNOFMA)\n  x = vmla_vf_vf_vf_vf(x, t, 
vmul_vf_vf_vf(vcast_vf_f(0.693147180559945286226764f), vcast_vf_vi2(e)));\n#else\n  x = vmla_vf_vf_vf_vf(x, t, vmul_vf_vf_vf(vcast_vf_f(0.693147180559945286226764f), e));\n#endif\n\n  return x;\n}\n\n#if !defined(DETERMINISTIC)\nEXPORT CONST VECTOR_CC vfloat xlogf_u1(vfloat d) {\n  vfloat2 x;\n  vfloat t, m, x2;\n\n#if !defined(ENABLE_AVX512F) && !defined(ENABLE_AVX512FNOFMA)\n  vopmask o = vlt_vo_vf_vf(d, vcast_vf_f(FLT_MIN));\n  d = vsel_vf_vo_vf_vf(o, vmul_vf_vf_vf(d, vcast_vf_f((float)(INT64_C(1) << 32) * (float)(INT64_C(1) << 32))), d);\n  vint2 e = vilogb2k_vi2_vf(vmul_vf_vf_vf(d, vcast_vf_f(1.0f/0.75f)));\n  m = vldexp3_vf_vf_vi2(d, vneg_vi2_vi2(e));\n  e = vsel_vi2_vo_vi2_vi2(o, vsub_vi2_vi2_vi2(e, vcast_vi2_i(64)), e);\n  vfloat2 s = dfmul_vf2_vf2_vf(vcast_vf2_f_f(0.69314718246459960938f, -1.904654323148236017e-09f), vcast_vf_vi2(e));\n#else\n  vfloat e = vgetexp_vf_vf(vmul_vf_vf_vf(d, vcast_vf_f(1.0f/0.75f)));\n  e = vsel_vf_vo_vf_vf(vispinf_vo_vf(e), vcast_vf_f(128.0f), e);\n  m = vgetmant_vf_vf(d);\n  vfloat2 s = dfmul_vf2_vf2_vf(vcast_vf2_f_f(0.69314718246459960938f, -1.904654323148236017e-09f), e);\n#endif\n\n  x = dfdiv_vf2_vf2_vf2(dfadd2_vf2_vf_vf(vcast_vf_f(-1), m), dfadd2_vf2_vf_vf(vcast_vf_f(1), m));\n  x2 = vmul_vf_vf_vf(vf2getx_vf_vf2(x), vf2getx_vf_vf2(x));\n\n  t = vcast_vf_f(+0.3027294874e+0f);\n  t = vmla_vf_vf_vf_vf(t, x2, vcast_vf_f(+0.3996108174e+0f));\n  t = vmla_vf_vf_vf_vf(t, x2, vcast_vf_f(+0.6666694880e+0f));\n  \n  s = dfadd_vf2_vf2_vf2(s, dfscale_vf2_vf2_vf(x, vcast_vf_f(2)));\n  s = dfadd_vf2_vf2_vf(s, vmul_vf_vf_vf(vmul_vf_vf_vf(x2, vf2getx_vf_vf2(x)), t));\n\n  vfloat r = vadd_vf_vf_vf(vf2getx_vf_vf2(s), vf2gety_vf_vf2(s));\n\n#if !defined(ENABLE_AVX512F) && !defined(ENABLE_AVX512FNOFMA)\n  r = vsel_vf_vo_vf_vf(vispinf_vo_vf(d), vcast_vf_f(SLEEF_INFINITYf), r);\n  r = vsel_vf_vo_vf_vf(vor_vo_vo_vo(vlt_vo_vf_vf(d, vcast_vf_f(0)), visnan_vo_vf(d)), vcast_vf_f(SLEEF_NANf), r);\n  r = vsel_vf_vo_vf_vf(veq_vo_vf_vf(d, 
vcast_vf_f(0)), vcast_vf_f(-SLEEF_INFINITYf), r);\n#else\n  r = vfixup_vf_vf_vf_vi2_i(r, d, vcast_vi2_i((4 << (2*4)) | (3 << (4*4)) | (5 << (5*4)) | (2 << (6*4))), 0);\n#endif\n  \n  return r;\n}\n#endif // #if !defined(DETERMINISTIC)\n\nstatic INLINE CONST VECTOR_CC vfloat expkf(vfloat2 d) {\n  vfloat u = vmul_vf_vf_vf(vadd_vf_vf_vf(vf2getx_vf_vf2(d), vf2gety_vf_vf2(d)), vcast_vf_f(R_LN2f));\n  vint2 q = vrint_vi2_vf(u);\n  vfloat2 s, t;\n\n  s = dfadd2_vf2_vf2_vf(d, vmul_vf_vf_vf(vcast_vf_vi2(q), vcast_vf_f(-L2Uf)));\n  s = dfadd2_vf2_vf2_vf(s, vmul_vf_vf_vf(vcast_vf_vi2(q), vcast_vf_f(-L2Lf)));\n\n  s = dfnormalize_vf2_vf2(s);\n\n  u = vcast_vf_f(0.00136324646882712841033936f);\n  u = vmla_vf_vf_vf_vf(u, vf2getx_vf_vf2(s), vcast_vf_f(0.00836596917361021041870117f));\n  u = vmla_vf_vf_vf_vf(u, vf2getx_vf_vf2(s), vcast_vf_f(0.0416710823774337768554688f));\n  u = vmla_vf_vf_vf_vf(u, vf2getx_vf_vf2(s), vcast_vf_f(0.166665524244308471679688f));\n  u = vmla_vf_vf_vf_vf(u, vf2getx_vf_vf2(s), vcast_vf_f(0.499999850988388061523438f));\n\n  t = dfadd_vf2_vf2_vf2(s, dfmul_vf2_vf2_vf(dfsqu_vf2_vf2(s), u));\n\n  t = dfadd_vf2_vf_vf2(vcast_vf_f(1), t);\n  u = vadd_vf_vf_vf(vf2getx_vf_vf2(t), vf2gety_vf_vf2(t));\n  u = vldexp_vf_vf_vi2(u, q);\n\n  u = vreinterpret_vf_vm(vandnot_vm_vo32_vm(vlt_vo_vf_vf(vf2getx_vf_vf2(d), vcast_vf_f(-104)), vreinterpret_vm_vf(u)));\n  \n  return u;\n}\n\nstatic INLINE CONST VECTOR_CC vfloat expk3f(vfloat d) {\n  vint2 q = vrint_vi2_vf(vmul_vf_vf_vf(d, vcast_vf_f(R_LN2f)));\n  vfloat s, u;\n\n  s = vmla_vf_vf_vf_vf(vcast_vf_vi2(q), vcast_vf_f(-L2Uf), d);\n  s = vmla_vf_vf_vf_vf(vcast_vf_vi2(q), vcast_vf_f(-L2Lf), s);\n\n  u = vcast_vf_f(0.000198527617612853646278381);\n  u = vmla_vf_vf_vf_vf(u, s, vcast_vf_f(0.00139304355252534151077271));\n  u = vmla_vf_vf_vf_vf(u, s, vcast_vf_f(0.00833336077630519866943359));\n  u = vmla_vf_vf_vf_vf(u, s, vcast_vf_f(0.0416664853692054748535156));\n  u = vmla_vf_vf_vf_vf(u, s, 
vcast_vf_f(0.166666671633720397949219));\n  u = vmla_vf_vf_vf_vf(u, s, vcast_vf_f(0.5));\n\n  u = vmla_vf_vf_vf_vf(vmul_vf_vf_vf(s, s), u, vadd_vf_vf_vf(s, vcast_vf_f(1.0f)));\n  u = vldexp2_vf_vf_vi2(u, q);\n\n  u = vreinterpret_vf_vm(vandnot_vm_vo32_vm(vlt_vo_vf_vf(d, vcast_vf_f(-104)), vreinterpret_vm_vf(u)));\n  \n  return u;\n}\n\n#if !defined(DETERMINISTIC)\nEXPORT CONST VECTOR_CC vfloat xpowf(vfloat x, vfloat y) {\n#if 1\n  vopmask yisint = vor_vo_vo_vo(veq_vo_vf_vf(vtruncate_vf_vf(y), y), vgt_vo_vf_vf(vabs_vf_vf(y), vcast_vf_f(1 << 24)));\n  vopmask yisodd = vand_vo_vo_vo(vand_vo_vo_vo(veq_vo_vi2_vi2(vand_vi2_vi2_vi2(vtruncate_vi2_vf(y), vcast_vi2_i(1)), vcast_vi2_i(1)), yisint),\n\t\t\t\t vlt_vo_vf_vf(vabs_vf_vf(y), vcast_vf_f(1 << 24)));\n\n#if defined(ENABLE_NEON32) || defined(ENABLE_NEON32VFPV4)\n  yisodd = vandnot_vm_vo32_vm(visinf_vo_vf(y), yisodd);\n#endif\n\n  vfloat result = expkf(dfmul_vf2_vf2_vf(logkf(vabs_vf_vf(x)), y));\n\n  result = vsel_vf_vo_vf_vf(visnan_vo_vf(result), vcast_vf_f(SLEEF_INFINITYf), result);\n  \n  result = vmul_vf_vf_vf(result,\n\t\t\t vsel_vf_vo_vf_vf(vgt_vo_vf_vf(x, vcast_vf_f(0)),\n\t\t\t\t\t  vcast_vf_f(1),\n\t\t\t\t\t  vsel_vf_vo_vf_vf(yisint, vsel_vf_vo_vf_vf(yisodd, vcast_vf_f(-1.0f), vcast_vf_f(1)), vcast_vf_f(SLEEF_NANf))));\n\n  vfloat efx = vmulsign_vf_vf_vf(vsub_vf_vf_vf(vabs_vf_vf(x), vcast_vf_f(1)), y);\n\n  result = vsel_vf_vo_vf_vf(visinf_vo_vf(y),\n\t\t\t    vreinterpret_vf_vm(vandnot_vm_vo32_vm(vlt_vo_vf_vf(efx, vcast_vf_f(0.0f)),\n\t\t\t\t\t\t\t\t  vreinterpret_vm_vf(vsel_vf_vo_vf_vf(veq_vo_vf_vf(efx, vcast_vf_f(0.0f)),\n\t\t\t\t\t\t\t\t\t\t\t\t      vcast_vf_f(1.0f),\n\t\t\t\t\t\t\t\t\t\t\t\t      vcast_vf_f(SLEEF_INFINITYf))))),\n\t\t\t    result);\n\n  result = vsel_vf_vo_vf_vf(vor_vo_vo_vo(visinf_vo_vf(x), veq_vo_vf_vf(x, vcast_vf_f(0))),\n\t\t\t    vmul_vf_vf_vf(vsel_vf_vo_vf_vf(yisodd, vsign_vf_vf(x), vcast_vf_f(1)),\n\t\t\t\t\t  
vreinterpret_vf_vm(vandnot_vm_vo32_vm(vlt_vo_vf_vf(vsel_vf_vo_vf_vf(veq_vo_vf_vf(x, vcast_vf_f(0)), vneg_vf_vf(y), y), vcast_vf_f(0)),\n\t\t\t\t\t\t\t\t\t\tvreinterpret_vm_vf(vcast_vf_f(SLEEF_INFINITYf))))),\n\t\t\t    result);\n\n  result = vreinterpret_vf_vm(vor_vm_vo32_vm(vor_vo_vo_vo(visnan_vo_vf(x), visnan_vo_vf(y)), vreinterpret_vm_vf(result)));\n\n  result = vsel_vf_vo_vf_vf(vor_vo_vo_vo(veq_vo_vf_vf(y, vcast_vf_f(0)), veq_vo_vf_vf(x, vcast_vf_f(1))), vcast_vf_f(1), result);\n\n  return result;\n#else\n  return expkf(dfmul_vf2_vf2_vf(logkf(x), y));\n#endif\n}\n\nEXPORT CONST VECTOR_CC vfloat xfastpowf_u3500(vfloat x, vfloat y) {\n  vfloat result = expk3f(vmul_vf_vf_vf(logk3f(vabs_vf_vf(x)), y));\n  vopmask yisint = vor_vo_vo_vo(veq_vo_vf_vf(vtruncate_vf_vf(y), y), vgt_vo_vf_vf(vabs_vf_vf(y), vcast_vf_f(1 << 24)));\n  vopmask yisodd = vand_vo_vo_vo(vand_vo_vo_vo(veq_vo_vi2_vi2(vand_vi2_vi2_vi2(vtruncate_vi2_vf(y), vcast_vi2_i(1)), vcast_vi2_i(1)), yisint),\n\t\t\t\t vlt_vo_vf_vf(vabs_vf_vf(y), vcast_vf_f(1 << 24)));\n\n  result = vsel_vf_vo_vf_vf(vand_vo_vo_vo(vsignbit_vo_vf(x), yisodd), vneg_vf_vf(result), result);\n\n  result = vsel_vf_vo_vf_vf(veq_vo_vf_vf(x, vcast_vf_f(0)), vcast_vf_f(0), result);\n  result = vsel_vf_vo_vf_vf(veq_vo_vf_vf(y, vcast_vf_f(0)), vcast_vf_f(1), result);\n\n  return result;\n}\n#endif // #if !defined(DETERMINISTIC)\n\nstatic INLINE CONST VECTOR_CC vfloat2 expk2f(vfloat2 d) {\n  vfloat u = vmul_vf_vf_vf(vadd_vf_vf_vf(vf2getx_vf_vf2(d), vf2gety_vf_vf2(d)), vcast_vf_f(R_LN2f));\n  vint2 q = vrint_vi2_vf(u);\n  vfloat2 s, t;\n\n  s = dfadd2_vf2_vf2_vf(d, vmul_vf_vf_vf(vcast_vf_vi2(q), vcast_vf_f(-L2Uf)));\n  s = dfadd2_vf2_vf2_vf(s, vmul_vf_vf_vf(vcast_vf_vi2(q), vcast_vf_f(-L2Lf)));\n\n  u = vcast_vf_f(+0.1980960224e-3f);\n  u = vmla_vf_vf_vf_vf(u, vf2getx_vf_vf2(s), vcast_vf_f(+0.1394256484e-2f));\n  u = vmla_vf_vf_vf_vf(u, vf2getx_vf_vf2(s), vcast_vf_f(+0.8333456703e-2f));\n  u = vmla_vf_vf_vf_vf(u, vf2getx_vf_vf2(s), 
vcast_vf_f(+0.4166637361e-1f));\n\n  t = dfadd2_vf2_vf2_vf(dfmul_vf2_vf2_vf(s, u), vcast_vf_f(+0.166666659414234244790680580464e+0f));\n  t = dfadd2_vf2_vf2_vf(dfmul_vf2_vf2_vf2(s, t), vcast_vf_f(0.5));\n  t = dfadd2_vf2_vf2_vf2(s, dfmul_vf2_vf2_vf2(dfsqu_vf2_vf2(s), t));\n\n  t = dfadd_vf2_vf_vf2(vcast_vf_f(1), t);\n\n  t = vf2setx_vf2_vf2_vf(t, vldexp2_vf_vf_vi2(vf2getx_vf_vf2(t), q));\n  t = vf2sety_vf2_vf2_vf(t, vldexp2_vf_vf_vi2(vf2gety_vf_vf2(t), q));\n\n  t = vf2setx_vf2_vf2_vf(t, vreinterpret_vf_vm(vandnot_vm_vo32_vm(vlt_vo_vf_vf(vf2getx_vf_vf2(d), vcast_vf_f(-104)), vreinterpret_vm_vf(vf2getx_vf_vf2(t)))));\n  t = vf2sety_vf2_vf2_vf(t, vreinterpret_vf_vm(vandnot_vm_vo32_vm(vlt_vo_vf_vf(vf2getx_vf_vf2(d), vcast_vf_f(-104)), vreinterpret_vm_vf(vf2gety_vf_vf2(t)))));\n\n  return t;\n}\n\n#if !defined(DETERMINISTIC)\nEXPORT CONST VECTOR_CC vfloat xsinhf(vfloat x) {\n  vfloat y = vabs_vf_vf(x);\n  vfloat2 d = expk2f(vcast_vf2_vf_vf(y, vcast_vf_f(0)));\n  d = dfsub_vf2_vf2_vf2(d, dfrec_vf2_vf2(d));\n  y = vmul_vf_vf_vf(vadd_vf_vf_vf(vf2getx_vf_vf2(d), vf2gety_vf_vf2(d)), vcast_vf_f(0.5));\n\n  y = vsel_vf_vo_vf_vf(vor_vo_vo_vo(vgt_vo_vf_vf(vabs_vf_vf(x), vcast_vf_f(89)),\n\t\t\t\t    visnan_vo_vf(y)), vcast_vf_f(SLEEF_INFINITYf), y);\n  y = vmulsign_vf_vf_vf(y, x);\n  y = vreinterpret_vf_vm(vor_vm_vo32_vm(visnan_vo_vf(x), vreinterpret_vm_vf(y)));\n\n  return y;\n}\n\nEXPORT CONST VECTOR_CC vfloat xcoshf(vfloat x) {\n  vfloat y = vabs_vf_vf(x);\n  vfloat2 d = expk2f(vcast_vf2_vf_vf(y, vcast_vf_f(0)));\n  d = dfadd_vf2_vf2_vf2(d, dfrec_vf2_vf2(d));\n  y = vmul_vf_vf_vf(vadd_vf_vf_vf(vf2getx_vf_vf2(d), vf2gety_vf_vf2(d)), vcast_vf_f(0.5));\n\n  y = vsel_vf_vo_vf_vf(vor_vo_vo_vo(vgt_vo_vf_vf(vabs_vf_vf(x), vcast_vf_f(89)),\n\t\t\t\t    visnan_vo_vf(y)), vcast_vf_f(SLEEF_INFINITYf), y);\n  y = vreinterpret_vf_vm(vor_vm_vo32_vm(visnan_vo_vf(x), vreinterpret_vm_vf(y)));\n\n  return y;\n}\n\nEXPORT CONST VECTOR_CC vfloat xtanhf(vfloat x) {\n  vfloat y = vabs_vf_vf(x);\n 
 vfloat2 d = expk2f(vcast_vf2_vf_vf(y, vcast_vf_f(0)));\n  vfloat2 e = dfrec_vf2_vf2(d);\n  d = dfdiv_vf2_vf2_vf2(dfadd_vf2_vf2_vf2(d, dfneg_vf2_vf2(e)), dfadd_vf2_vf2_vf2(d, e));\n  y = vadd_vf_vf_vf(vf2getx_vf_vf2(d), vf2gety_vf_vf2(d));\n\n  y = vsel_vf_vo_vf_vf(vor_vo_vo_vo(vgt_vo_vf_vf(vabs_vf_vf(x), vcast_vf_f(8.664339742f)),\n\t\t\t\t    visnan_vo_vf(y)), vcast_vf_f(1.0f), y);\n  y = vmulsign_vf_vf_vf(y, x);\n  y = vreinterpret_vf_vm(vor_vm_vo32_vm(visnan_vo_vf(x), vreinterpret_vm_vf(y)));\n\n  return y;\n}\n\nEXPORT CONST VECTOR_CC vfloat xsinhf_u35(vfloat x) {\n  vfloat e = expm1fk(vabs_vf_vf(x));\n  vfloat y = vdiv_vf_vf_vf(vadd_vf_vf_vf(e, vcast_vf_f(2)), vadd_vf_vf_vf(e, vcast_vf_f(1)));\n  y = vmul_vf_vf_vf(y, vmul_vf_vf_vf(vcast_vf_f(0.5f), e));\n\n  y = vsel_vf_vo_vf_vf(vor_vo_vo_vo(vgt_vo_vf_vf(vabs_vf_vf(x), vcast_vf_f(88)),\n\t\t\t\t    visnan_vo_vf(y)), vcast_vf_f(SLEEF_INFINITYf), y);\n  y = vmulsign_vf_vf_vf(y, x);\n  y = vreinterpret_vf_vm(vor_vm_vo32_vm(visnan_vo_vf(x), vreinterpret_vm_vf(y)));\n\n  return y;\n}\n\nEXPORT CONST VECTOR_CC vfloat xcoshf_u35(vfloat x) {\n  vfloat e = xexpf(vabs_vf_vf(x));\n  vfloat y = vmla_vf_vf_vf_vf(vcast_vf_f(0.5f), e, vdiv_vf_vf_vf(vcast_vf_f(0.5), e));\n\n  y = vsel_vf_vo_vf_vf(vor_vo_vo_vo(vgt_vo_vf_vf(vabs_vf_vf(x), vcast_vf_f(88)),\n\t\t\t\t    visnan_vo_vf(y)), vcast_vf_f(SLEEF_INFINITYf), y);\n  y = vreinterpret_vf_vm(vor_vm_vo32_vm(visnan_vo_vf(x), vreinterpret_vm_vf(y)));\n\n  return y;\n}\n\nEXPORT CONST VECTOR_CC vfloat xtanhf_u35(vfloat x) {\n  vfloat d = expm1fk(vmul_vf_vf_vf(vcast_vf_f(2), vabs_vf_vf(x)));\n  vfloat y = vdiv_vf_vf_vf(d, vadd_vf_vf_vf(vcast_vf_f(2), d));\n\n  y = vsel_vf_vo_vf_vf(vor_vo_vo_vo(vgt_vo_vf_vf(vabs_vf_vf(x), vcast_vf_f(8.664339742f)),\n\t\t\t\t    visnan_vo_vf(y)), vcast_vf_f(1.0f), y);\n  y = vmulsign_vf_vf_vf(y, x);\n  y = vreinterpret_vf_vm(vor_vm_vo32_vm(visnan_vo_vf(x), vreinterpret_vm_vf(y)));\n\n  return y;\n}\n#endif // #if !defined(DETERMINISTIC)\n\nstatic 
INLINE CONST VECTOR_CC vfloat2 logk2f(vfloat2 d) {\n  vfloat2 x, x2, m, s;\n  vfloat t;\n  vint2 e;\n\n#if !defined(ENABLE_AVX512F) && !defined(ENABLE_AVX512FNOFMA)\n  e = vilogbk_vi2_vf(vmul_vf_vf_vf(vf2getx_vf_vf2(d), vcast_vf_f(1.0f/0.75f)));\n#else\n  e = vrint_vi2_vf(vgetexp_vf_vf(vmul_vf_vf_vf(vf2getx_vf_vf2(d), vcast_vf_f(1.0f/0.75f))));\n#endif\n  m = dfscale_vf2_vf2_vf(d, vpow2i_vf_vi2(vneg_vi2_vi2(e)));\n\n  x = dfdiv_vf2_vf2_vf2(dfadd2_vf2_vf2_vf(m, vcast_vf_f(-1)), dfadd2_vf2_vf2_vf(m, vcast_vf_f(1)));\n  x2 = dfsqu_vf2_vf2(x);\n\n  t = vcast_vf_f(0.2392828464508056640625f);\n  t = vmla_vf_vf_vf_vf(t, vf2getx_vf_vf2(x2), vcast_vf_f(0.28518211841583251953125f));\n  t = vmla_vf_vf_vf_vf(t, vf2getx_vf_vf2(x2), vcast_vf_f(0.400005877017974853515625f));\n  t = vmla_vf_vf_vf_vf(t, vf2getx_vf_vf2(x2), vcast_vf_f(0.666666686534881591796875f));\n\n  s = dfmul_vf2_vf2_vf(vcast_vf2_vf_vf(vcast_vf_f(0.69314718246459960938f), vcast_vf_f(-1.904654323148236017e-09f)), vcast_vf_vi2(e));\n  s = dfadd_vf2_vf2_vf2(s, dfscale_vf2_vf2_vf(x, vcast_vf_f(2)));\n  s = dfadd_vf2_vf2_vf2(s, dfmul_vf2_vf2_vf(dfmul_vf2_vf2_vf2(x2, x), t));\n\n  return s;\n}\n\n#if !defined(DETERMINISTIC)\nEXPORT CONST VECTOR_CC vfloat xasinhf(vfloat x) {\n  vfloat y = vabs_vf_vf(x);\n  vopmask o = vgt_vo_vf_vf(y, vcast_vf_f(1));\n  vfloat2 d;\n  \n  d = vsel_vf2_vo_vf2_vf2(o, dfrec_vf2_vf(x), vcast_vf2_vf_vf(y, vcast_vf_f(0)));\n  d = dfsqrt_vf2_vf2(dfadd2_vf2_vf2_vf(dfsqu_vf2_vf2(d), vcast_vf_f(1)));\n  d = vsel_vf2_vo_vf2_vf2(o, dfmul_vf2_vf2_vf(d, y), d);\n\n  d = logk2f(dfnormalize_vf2_vf2(dfadd2_vf2_vf2_vf(d, x)));\n  y = vadd_vf_vf_vf(vf2getx_vf_vf2(d), vf2gety_vf_vf2(d));\n\n  y = vsel_vf_vo_vf_vf(vor_vo_vo_vo(vgt_vo_vf_vf(vabs_vf_vf(x), vcast_vf_f(SQRT_FLT_MAX)),\n\t\t\t\t    visnan_vo_vf(y)),\n\t\t       vmulsign_vf_vf_vf(vcast_vf_f(SLEEF_INFINITYf), x), y);\n  y = vreinterpret_vf_vm(vor_vm_vo32_vm(visnan_vo_vf(x), vreinterpret_vm_vf(y)));\n  y = vsel_vf_vo_vf_vf(visnegzero_vo_vf(x), 
vcast_vf_f(-0.0), y);\n\n  return y;\n}\n\nEXPORT CONST VECTOR_CC vfloat xacoshf(vfloat x) {\n  vfloat2 d = logk2f(dfadd2_vf2_vf2_vf(dfmul_vf2_vf2_vf2(dfsqrt_vf2_vf2(dfadd2_vf2_vf_vf(x, vcast_vf_f(1))), dfsqrt_vf2_vf2(dfadd2_vf2_vf_vf(x, vcast_vf_f(-1)))), x));\n  vfloat y = vadd_vf_vf_vf(vf2getx_vf_vf2(d), vf2gety_vf_vf2(d));\n\n  y = vsel_vf_vo_vf_vf(vor_vo_vo_vo(vgt_vo_vf_vf(vabs_vf_vf(x), vcast_vf_f(SQRT_FLT_MAX)),\n\t\t\t\t    visnan_vo_vf(y)),\n\t\t       vcast_vf_f(SLEEF_INFINITYf), y);\n\n  y = vreinterpret_vf_vm(vandnot_vm_vo32_vm(veq_vo_vf_vf(x, vcast_vf_f(1.0f)), vreinterpret_vm_vf(y)));\n\n  y = vreinterpret_vf_vm(vor_vm_vo32_vm(vlt_vo_vf_vf(x, vcast_vf_f(1.0f)), vreinterpret_vm_vf(y)));\n  y = vreinterpret_vf_vm(vor_vm_vo32_vm(visnan_vo_vf(x), vreinterpret_vm_vf(y)));\n\n  return y;\n}\n\nEXPORT CONST VECTOR_CC vfloat xatanhf(vfloat x) {\n  vfloat y = vabs_vf_vf(x);\n  vfloat2 d = logk2f(dfdiv_vf2_vf2_vf2(dfadd2_vf2_vf_vf(vcast_vf_f(1), y), dfadd2_vf2_vf_vf(vcast_vf_f(1), vneg_vf_vf(y))));\n  y = vreinterpret_vf_vm(vor_vm_vo32_vm(vgt_vo_vf_vf(y, vcast_vf_f(1.0)), vreinterpret_vm_vf(vsel_vf_vo_vf_vf(veq_vo_vf_vf(y, vcast_vf_f(1.0)), vcast_vf_f(SLEEF_INFINITYf), vmul_vf_vf_vf(vadd_vf_vf_vf(vf2getx_vf_vf2(d), vf2gety_vf_vf2(d)), vcast_vf_f(0.5))))));\n\n  y = vreinterpret_vf_vm(vor_vm_vo32_vm(vor_vo_vo_vo(visinf_vo_vf(x), visnan_vo_vf(y)), vreinterpret_vm_vf(y)));\n  y = vmulsign_vf_vf_vf(y, x);\n  y = vreinterpret_vf_vm(vor_vm_vo32_vm(visnan_vo_vf(x), vreinterpret_vm_vf(y)));\n\n  return y;\n}\n#endif // #if !defined(DETERMINISTIC)\n\n#if !defined(DETERMINISTIC)\nEXPORT CONST VECTOR_CC vfloat xexp2f(vfloat d) {\n  vfloat u = vrint_vf_vf(d), s;\n  vint2 q = vrint_vi2_vf(u);\n\n  s = vsub_vf_vf_vf(d, u);\n\n  u = vcast_vf_f(+0.1535920892e-3);\n  u = vmla_vf_vf_vf_vf(u, s, vcast_vf_f(+0.1339262701e-2));\n  u = vmla_vf_vf_vf_vf(u, s, vcast_vf_f(+0.9618384764e-2));\n  u = vmla_vf_vf_vf_vf(u, s, vcast_vf_f(+0.5550347269e-1));\n  u = vmla_vf_vf_vf_vf(u, s, 
vcast_vf_f(+0.2402264476e+0));\n  u = vmla_vf_vf_vf_vf(u, s, vcast_vf_f(+0.6931471825e+0));\n\n#ifdef ENABLE_FMA_SP\n  u = vfma_vf_vf_vf_vf(u, s, vcast_vf_f(1));\n#else\n  u = vf2getx_vf_vf2(dfnormalize_vf2_vf2(dfadd_vf2_vf_vf2(vcast_vf_f(1), dfmul_vf2_vf_vf(u, s))));\n#endif\n  \n  u = vldexp2_vf_vf_vi2(u, q);\n\n  u = vsel_vf_vo_vf_vf(vge_vo_vf_vf(d, vcast_vf_f(128)), vcast_vf_f(SLEEF_INFINITY), u);\n  u = vreinterpret_vf_vm(vandnot_vm_vo32_vm(vlt_vo_vf_vf(d, vcast_vf_f(-150)), vreinterpret_vm_vf(u)));\n\n  return u;\n}\n\nEXPORT CONST VECTOR_CC vfloat xexp2f_u35(vfloat d) {\n  vfloat u = vrint_vf_vf(d), s;\n  vint2 q = vrint_vi2_vf(u);\n\n  s = vsub_vf_vf_vf(d, u);\n\n  u = vcast_vf_f(+0.1535920892e-3);\n  u = vmla_vf_vf_vf_vf(u, s, vcast_vf_f(+0.1339262701e-2));\n  u = vmla_vf_vf_vf_vf(u, s, vcast_vf_f(+0.9618384764e-2));\n  u = vmla_vf_vf_vf_vf(u, s, vcast_vf_f(+0.5550347269e-1));\n  u = vmla_vf_vf_vf_vf(u, s, vcast_vf_f(+0.2402264476e+0));\n  u = vmla_vf_vf_vf_vf(u, s, vcast_vf_f(+0.6931471825e+0));\n  u = vmla_vf_vf_vf_vf(u, s, vcast_vf_f(+0.1000000000e+1));\n  \n  u = vldexp2_vf_vf_vi2(u, q);\n\n  u = vsel_vf_vo_vf_vf(vge_vo_vf_vf(d, vcast_vf_f(128)), vcast_vf_f(SLEEF_INFINITY), u);\n  u = vreinterpret_vf_vm(vandnot_vm_vo32_vm(vlt_vo_vf_vf(d, vcast_vf_f(-150)), vreinterpret_vm_vf(u)));\n\n  return u;\n}\n\nEXPORT CONST VECTOR_CC vfloat xexp10f(vfloat d) {\n  vfloat u = vrint_vf_vf(vmul_vf_vf_vf(d, vcast_vf_f(LOG10_2))), s;\n  vint2 q = vrint_vi2_vf(u);\n\n  s = vmla_vf_vf_vf_vf(u, vcast_vf_f(-L10Uf), d);\n  s = vmla_vf_vf_vf_vf(u, vcast_vf_f(-L10Lf), s);\n\n  u = vcast_vf_f(+0.6802555919e-1);\n  u = vmla_vf_vf_vf_vf(u, s, vcast_vf_f(+0.2078080326e+0));\n  u = vmla_vf_vf_vf_vf(u, s, vcast_vf_f(+0.5393903852e+0));\n  u = vmla_vf_vf_vf_vf(u, s, vcast_vf_f(+0.1171245337e+1));\n  u = vmla_vf_vf_vf_vf(u, s, vcast_vf_f(+0.2034678698e+1));\n  u = vmla_vf_vf_vf_vf(u, s, vcast_vf_f(+0.2650949001e+1));\n  vfloat2 x = 
dfadd_vf2_vf2_vf(vcast_vf2_f_f(2.3025851249694824219, -3.1705172516493593157e-08), vmul_vf_vf_vf(u, s));\n  u = vf2getx_vf_vf2(dfnormalize_vf2_vf2(dfadd_vf2_vf_vf2(vcast_vf_f(1), dfmul_vf2_vf2_vf(x, s))));\n  \n  u = vldexp2_vf_vf_vi2(u, q);\n\n  u = vsel_vf_vo_vf_vf(vgt_vo_vf_vf(d, vcast_vf_f(38.5318394191036238941387f)), vcast_vf_f(SLEEF_INFINITYf), u);\n  u = vreinterpret_vf_vm(vandnot_vm_vo32_vm(vlt_vo_vf_vf(d, vcast_vf_f(-50)), vreinterpret_vm_vf(u)));\n\n  return u;\n}\n\nEXPORT CONST VECTOR_CC vfloat xexp10f_u35(vfloat d) {\n  vfloat u = vrint_vf_vf(vmul_vf_vf_vf(d, vcast_vf_f(LOG10_2))), s;\n  vint2 q = vrint_vi2_vf(u);\n\n  s = vmla_vf_vf_vf_vf(u, vcast_vf_f(-L10Uf), d);\n  s = vmla_vf_vf_vf_vf(u, vcast_vf_f(-L10Lf), s);\n\n  u = vcast_vf_f(+0.2064004987e+0);\n  u = vmla_vf_vf_vf_vf(u, s, vcast_vf_f(+0.5417877436e+0));\n  u = vmla_vf_vf_vf_vf(u, s, vcast_vf_f(+0.1171286821e+1));\n  u = vmla_vf_vf_vf_vf(u, s, vcast_vf_f(+0.2034656048e+1));\n  u = vmla_vf_vf_vf_vf(u, s, vcast_vf_f(+0.2650948763e+1));\n  u = vmla_vf_vf_vf_vf(u, s, vcast_vf_f(+0.2302585125e+1));\n  u = vmla_vf_vf_vf_vf(u, s, vcast_vf_f(+0.1000000000e+1));\n  \n  u = vldexp2_vf_vf_vi2(u, q);\n\n  u = vsel_vf_vo_vf_vf(vgt_vo_vf_vf(d, vcast_vf_f(38.5318394191036238941387f)), vcast_vf_f(SLEEF_INFINITYf), u);\n  u = vreinterpret_vf_vm(vandnot_vm_vo32_vm(vlt_vo_vf_vf(d, vcast_vf_f(-50)), vreinterpret_vm_vf(u)));\n\n  return u;\n}\n\nEXPORT CONST VECTOR_CC vfloat xexpm1f(vfloat a) {\n  vfloat2 d = dfadd2_vf2_vf2_vf(expk2f(vcast_vf2_vf_vf(a, vcast_vf_f(0))), vcast_vf_f(-1.0));\n  vfloat x = vadd_vf_vf_vf(vf2getx_vf_vf2(d), vf2gety_vf_vf2(d));\n  x = vsel_vf_vo_vf_vf(vgt_vo_vf_vf(a, vcast_vf_f(88.72283172607421875f)), vcast_vf_f(SLEEF_INFINITYf), x);\n  x = vsel_vf_vo_vf_vf(vlt_vo_vf_vf(a, vcast_vf_f(-16.635532333438687426013570f)), vcast_vf_f(-1), x);\n  x = vsel_vf_vo_vf_vf(visnegzero_vo_vf(a), vcast_vf_f(-0.0f), x);\n  return x;\n}\n#endif // #if !defined(DETERMINISTIC)\n\n#if 
!defined(DETERMINISTIC)\nEXPORT CONST VECTOR_CC vfloat xlog10f(vfloat d) {\n  vfloat2 x;\n  vfloat t, m, x2;\n\n#if !defined(ENABLE_AVX512F) && !defined(ENABLE_AVX512FNOFMA)\n  vopmask o = vlt_vo_vf_vf(d, vcast_vf_f(FLT_MIN));\n  d = vsel_vf_vo_vf_vf(o, vmul_vf_vf_vf(d, vcast_vf_f((float)(INT64_C(1) << 32) * (float)(INT64_C(1) << 32))), d);\n  vint2 e = vilogb2k_vi2_vf(vmul_vf_vf_vf(d, vcast_vf_f(1.0/0.75)));\n  m = vldexp3_vf_vf_vi2(d, vneg_vi2_vi2(e));\n  e = vsel_vi2_vo_vi2_vi2(o, vsub_vi2_vi2_vi2(e, vcast_vi2_i(64)), e);\n#else\n  vfloat e = vgetexp_vf_vf(vmul_vf_vf_vf(d, vcast_vf_f(1.0/0.75)));\n  e = vsel_vf_vo_vf_vf(vispinf_vo_vf(e), vcast_vf_f(128.0f), e);\n  m = vgetmant_vf_vf(d);\n#endif\n\n  x = dfdiv_vf2_vf2_vf2(dfadd2_vf2_vf_vf(vcast_vf_f(-1), m), dfadd2_vf2_vf_vf(vcast_vf_f(1), m));\n  x2 = vmul_vf_vf_vf(vf2getx_vf_vf2(x), vf2getx_vf_vf2(x));\n\n  t = vcast_vf_f(+0.1314289868e+0);\n  t = vmla_vf_vf_vf_vf(t, x2, vcast_vf_f( +0.1735493541e+0));\n  t = vmla_vf_vf_vf_vf(t, x2, vcast_vf_f( +0.2895309627e+0));\n  \n#if !defined(ENABLE_AVX512F) && !defined(ENABLE_AVX512FNOFMA)\n  vfloat2 s = dfmul_vf2_vf2_vf(vcast_vf2_f_f(0.30103001, -1.432098889e-08), vcast_vf_vi2(e));\n#else\n  vfloat2 s = dfmul_vf2_vf2_vf(vcast_vf2_f_f(0.30103001, -1.432098889e-08), e);\n#endif\n\n  s = dfadd_vf2_vf2_vf2(s, dfmul_vf2_vf2_vf2(x, vcast_vf2_f_f(0.868588984, -2.170757285e-08)));\n  s = dfadd_vf2_vf2_vf(s, vmul_vf_vf_vf(vmul_vf_vf_vf(x2, vf2getx_vf_vf2(x)), t));\n\n  vfloat r = vadd_vf_vf_vf(vf2getx_vf_vf2(s), vf2gety_vf_vf2(s));\n\n#if !defined(ENABLE_AVX512F) && !defined(ENABLE_AVX512FNOFMA)\n  r = vsel_vf_vo_vf_vf(vispinf_vo_vf(d), vcast_vf_f(SLEEF_INFINITY), r);\n  r = vsel_vf_vo_vf_vf(vor_vo_vo_vo(vlt_vo_vf_vf(d, vcast_vf_f(0)), visnan_vo_vf(d)), vcast_vf_f(SLEEF_NAN), r);\n  r = vsel_vf_vo_vf_vf(veq_vo_vf_vf(d, vcast_vf_f(0)), vcast_vf_f(-SLEEF_INFINITY), r);\n#else\n  r = vfixup_vf_vf_vf_vi2_i(r, d, vcast_vi2_i((4 << (2*4)) | (3 << (4*4)) | (5 << (5*4)) | (2 << (6*4))), 
0);\n#endif\n  \n  return r;\n}\n\nEXPORT CONST VECTOR_CC vfloat xlog2f(vfloat d) {\n  vfloat2 x;\n  vfloat t, m, x2;\n\n#if !defined(ENABLE_AVX512F) && !defined(ENABLE_AVX512FNOFMA)\n  vopmask o = vlt_vo_vf_vf(d, vcast_vf_f(FLT_MIN));\n  d = vsel_vf_vo_vf_vf(o, vmul_vf_vf_vf(d, vcast_vf_f((float)(INT64_C(1) << 32) * (float)(INT64_C(1) << 32))), d);\n  vint2 e = vilogb2k_vi2_vf(vmul_vf_vf_vf(d, vcast_vf_f(1.0/0.75)));\n  m = vldexp3_vf_vf_vi2(d, vneg_vi2_vi2(e));\n  e = vsel_vi2_vo_vi2_vi2(o, vsub_vi2_vi2_vi2(e, vcast_vi2_i(64)), e);\n#else\n  vfloat e = vgetexp_vf_vf(vmul_vf_vf_vf(d, vcast_vf_f(1.0/0.75)));\n  e = vsel_vf_vo_vf_vf(vispinf_vo_vf(e), vcast_vf_f(128.0f), e);\n  m = vgetmant_vf_vf(d);\n#endif\n\n  x = dfdiv_vf2_vf2_vf2(dfadd2_vf2_vf_vf(vcast_vf_f(-1), m), dfadd2_vf2_vf_vf(vcast_vf_f(1), m));\n  x2 = vmul_vf_vf_vf(vf2getx_vf_vf2(x), vf2getx_vf_vf2(x));\n\n  t = vcast_vf_f(+0.4374550283e+0f);\n  t = vmla_vf_vf_vf_vf(t, x2, vcast_vf_f(+0.5764790177e+0f));\n  t = vmla_vf_vf_vf_vf(t, x2, vcast_vf_f(+0.9618012905120f));\n  \n#if !defined(ENABLE_AVX512F) && !defined(ENABLE_AVX512FNOFMA)\n  vfloat2 s = dfadd2_vf2_vf_vf2(vcast_vf_vi2(e),\n\t\t\t\tdfmul_vf2_vf2_vf2(x, vcast_vf2_f_f(2.8853900432586669922, 3.2734474483568488616e-08)));\n#else\n  vfloat2 s = dfadd2_vf2_vf_vf2(e,\n\t\t\t\tdfmul_vf2_vf2_vf2(x, vcast_vf2_f_f(2.8853900432586669922, 3.2734474483568488616e-08)));\n#endif\n\n  s = dfadd2_vf2_vf2_vf(s, vmul_vf_vf_vf(vmul_vf_vf_vf(x2, vf2getx_vf_vf2(x)), t));\n\n  vfloat r = vadd_vf_vf_vf(vf2getx_vf_vf2(s), vf2gety_vf_vf2(s));\n\n#if !defined(ENABLE_AVX512F) && !defined(ENABLE_AVX512FNOFMA)\n  r = vsel_vf_vo_vf_vf(vispinf_vo_vf(d), vcast_vf_f(SLEEF_INFINITY), r);\n  r = vsel_vf_vo_vf_vf(vor_vo_vo_vo(vlt_vo_vf_vf(d, vcast_vf_f(0)), visnan_vo_vf(d)), vcast_vf_f(SLEEF_NAN), r);\n  r = vsel_vf_vo_vf_vf(veq_vo_vf_vf(d, vcast_vf_f(0)), vcast_vf_f(-SLEEF_INFINITY), r);\n#else\n  r = vfixup_vf_vf_vf_vi2_i(r, d, vcast_vi2_i((4 << (2*4)) | (3 << (4*4)) | (5 << 
(5*4)) | (2 << (6*4))), 0);\n#endif\n  \n  return r;\n}\n\nEXPORT CONST VECTOR_CC vfloat xlog2f_u35(vfloat d) {\n  vfloat m, t, x, x2;\n\n#if !defined(ENABLE_AVX512F) && !defined(ENABLE_AVX512FNOFMA)\n  vopmask o = vlt_vo_vf_vf(d, vcast_vf_f(FLT_MIN));\n  d = vsel_vf_vo_vf_vf(o, vmul_vf_vf_vf(d, vcast_vf_f((float)(INT64_C(1) << 32) * (float)(INT64_C(1) << 32))), d);\n  vint2 e = vilogb2k_vi2_vf(vmul_vf_vf_vf(d, vcast_vf_f(1.0/0.75)));\n  m = vldexp3_vf_vf_vi2(d, vneg_vi2_vi2(e));\n  e = vsel_vi2_vo_vi2_vi2(o, vsub_vi2_vi2_vi2(e, vcast_vi2_i(64)), e);\n#else\n  vfloat e = vgetexp_vf_vf(vmul_vf_vf_vf(d, vcast_vf_f(1.0/0.75)));\n  e = vsel_vf_vo_vf_vf(vispinf_vo_vf(e), vcast_vf_f(128.0f), e);\n  m = vgetmant_vf_vf(d);\n#endif\n\n  x = vdiv_vf_vf_vf(vsub_vf_vf_vf(m, vcast_vf_f(1)), vadd_vf_vf_vf(m, vcast_vf_f(1)));\n  x2 = vmul_vf_vf_vf(x, x);\n\n  t = vcast_vf_f(+0.4374088347e+0);\n  t = vmla_vf_vf_vf_vf(t, x2, vcast_vf_f(+0.5764843822e+0));\n  t = vmla_vf_vf_vf_vf(t, x2, vcast_vf_f(+0.9618024230e+0));\n  \n#if !defined(ENABLE_AVX512F) && !defined(ENABLE_AVX512FNOFMA)\n  vfloat r = vmla_vf_vf_vf_vf(vmul_vf_vf_vf(x2, x), t,\n\t\t\t      vmla_vf_vf_vf_vf(x, vcast_vf_f(+0.2885390043e+1), vcast_vf_vi2(e)));\n\n  r = vsel_vf_vo_vf_vf(vispinf_vo_vf(d), vcast_vf_f(SLEEF_INFINITY), r);\n  r = vsel_vf_vo_vf_vf(vor_vo_vo_vo(vlt_vo_vf_vf(d, vcast_vf_f(0)), visnan_vo_vf(d)), vcast_vf_f(SLEEF_NAN), r);\n  r = vsel_vf_vo_vf_vf(veq_vo_vf_vf(d, vcast_vf_f(0)), vcast_vf_f(-SLEEF_INFINITY), r);\n#else\n  vfloat r = vmla_vf_vf_vf_vf(vmul_vf_vf_vf(x2, x), t,\n\t\t\t      vmla_vf_vf_vf_vf(x, vcast_vf_f(+0.2885390043e+1), e));\n\n  r = vfixup_vf_vf_vf_vi2_i(r, d, vcast_vi2_i((4 << (2*4)) | (3 << (4*4)) | (5 << (5*4)) | (2 << (6*4))), 0);\n#endif\n\n  return r;\n}\n\nEXPORT CONST VECTOR_CC vfloat xlog1pf(vfloat d) {\n  vfloat2 x;\n  vfloat t, m, x2;\n\n  vfloat dp1 = vadd_vf_vf_vf(d, vcast_vf_f(1));\n\n#if !defined(ENABLE_AVX512F) && !defined(ENABLE_AVX512FNOFMA)\n  vopmask o = 
vlt_vo_vf_vf(dp1, vcast_vf_f(FLT_MIN));\n  dp1 = vsel_vf_vo_vf_vf(o, vmul_vf_vf_vf(dp1, vcast_vf_f((float)(INT64_C(1) << 32) * (float)(INT64_C(1) << 32))), dp1);\n  vint2 e = vilogb2k_vi2_vf(vmul_vf_vf_vf(dp1, vcast_vf_f(1.0f/0.75f)));\n  t = vldexp3_vf_vf_vi2(vcast_vf_f(1), vneg_vi2_vi2(e));\n  m = vmla_vf_vf_vf_vf(d, t, vsub_vf_vf_vf(t, vcast_vf_f(1)));\n  e = vsel_vi2_vo_vi2_vi2(o, vsub_vi2_vi2_vi2(e, vcast_vi2_i(64)), e);\n  vfloat2 s = dfmul_vf2_vf2_vf(vcast_vf2_f_f(0.69314718246459960938f, -1.904654323148236017e-09f), vcast_vf_vi2(e));\n#else\n  vfloat e = vgetexp_vf_vf(vmul_vf_vf_vf(dp1, vcast_vf_f(1.0f/0.75f)));\n  e = vsel_vf_vo_vf_vf(vispinf_vo_vf(e), vcast_vf_f(128.0f), e);\n  t = vldexp3_vf_vf_vi2(vcast_vf_f(1), vneg_vi2_vi2(vrint_vi2_vf(e)));\n  m = vmla_vf_vf_vf_vf(d, t, vsub_vf_vf_vf(t, vcast_vf_f(1)));\n  vfloat2 s = dfmul_vf2_vf2_vf(vcast_vf2_f_f(0.69314718246459960938f, -1.904654323148236017e-09f), e);\n#endif\n\n  x = dfdiv_vf2_vf2_vf2(vcast_vf2_vf_vf(m, vcast_vf_f(0)), dfadd_vf2_vf_vf(vcast_vf_f(2), m));\n  x2 = vmul_vf_vf_vf(vf2getx_vf_vf2(x), vf2getx_vf_vf2(x));\n\n  t = vcast_vf_f(+0.3027294874e+0f);\n  t = vmla_vf_vf_vf_vf(t, x2, vcast_vf_f(+0.3996108174e+0f));\n  t = vmla_vf_vf_vf_vf(t, x2, vcast_vf_f(+0.6666694880e+0f));\n  \n  s = dfadd_vf2_vf2_vf2(s, dfscale_vf2_vf2_vf(x, vcast_vf_f(2)));\n  s = dfadd_vf2_vf2_vf(s, vmul_vf_vf_vf(vmul_vf_vf_vf(x2, vf2getx_vf_vf2(x)), t));\n\n  vfloat r = vadd_vf_vf_vf(vf2getx_vf_vf2(s), vf2gety_vf_vf2(s));\n  \n  r = vsel_vf_vo_vf_vf(vgt_vo_vf_vf(d, vcast_vf_f(1e+38)), vcast_vf_f(SLEEF_INFINITYf), r);\n  r = vreinterpret_vf_vm(vor_vm_vo32_vm(vgt_vo_vf_vf(vcast_vf_f(-1), d), vreinterpret_vm_vf(r)));\n  r = vsel_vf_vo_vf_vf(veq_vo_vf_vf(d, vcast_vf_f(-1)), vcast_vf_f(-SLEEF_INFINITYf), r);\n  r = vsel_vf_vo_vf_vf(visnegzero_vo_vf(d), vcast_vf_f(-0.0f), r);\n\n  return r;\n}\n#endif // #if !defined(DETERMINISTIC)\n\n//\n\n#if !defined(DETERMINISTIC)\nEXPORT CONST VECTOR_CC vfloat xfabsf(vfloat x) { return 
vabs_vf_vf(x); }\n\nEXPORT CONST VECTOR_CC vfloat xcopysignf(vfloat x, vfloat y) { return vcopysign_vf_vf_vf(x, y); }\n\nEXPORT CONST VECTOR_CC vfloat xfmaxf(vfloat x, vfloat y) {\n#if (defined(__x86_64__) || defined(__i386__)) && !defined(ENABLE_VECEXT) && !defined(ENABLE_PUREC)\n  return vsel_vf_vo_vf_vf(visnan_vo_vf(y), x, vmax_vf_vf_vf(x, y));\n#else\n  return vsel_vf_vo_vf_vf(visnan_vo_vf(y), x, vsel_vf_vo_vf_vf(vgt_vo_vf_vf(x, y), x, y));\n#endif\n}\n\nEXPORT CONST VECTOR_CC vfloat xfminf(vfloat x, vfloat y) {\n#if (defined(__x86_64__) || defined(__i386__)) && !defined(ENABLE_VECEXT) && !defined(ENABLE_PUREC)\n  return vsel_vf_vo_vf_vf(visnan_vo_vf(y), x, vmin_vf_vf_vf(x, y));\n#else\n  return vsel_vf_vo_vf_vf(visnan_vo_vf(y), x, vsel_vf_vo_vf_vf(vgt_vo_vf_vf(y, x), x, y));\n#endif\n}\n\nEXPORT CONST VECTOR_CC vfloat xfdimf(vfloat x, vfloat y) {\n  vfloat ret = vsub_vf_vf_vf(x, y);\n  ret = vsel_vf_vo_vf_vf(vor_vo_vo_vo(vlt_vo_vf_vf(ret, vcast_vf_f(0)), veq_vo_vf_vf(x, y)), vcast_vf_f(0), ret);\n  return ret;\n}\n\nEXPORT CONST VECTOR_CC vfloat xtruncf(vfloat x) {\n#ifdef FULL_FP_ROUNDING\n  return vtruncate_vf_vf(x);\n#else\n  vfloat fr = vsub_vf_vf_vf(x, vcast_vf_vi2(vtruncate_vi2_vf(x)));\n  return vsel_vf_vo_vf_vf(vor_vo_vo_vo(visinf_vo_vf(x), vge_vo_vf_vf(vabs_vf_vf(x), vcast_vf_f(INT64_C(1) << 23))), x, vcopysign_vf_vf_vf(vsub_vf_vf_vf(x, fr), x));\n#endif\n}\n\nEXPORT CONST VECTOR_CC vfloat xfloorf(vfloat x) {\n  vfloat fr = vsub_vf_vf_vf(x, vcast_vf_vi2(vtruncate_vi2_vf(x)));\n  fr = vsel_vf_vo_vf_vf(vlt_vo_vf_vf(fr, vcast_vf_f(0)), vadd_vf_vf_vf(fr, vcast_vf_f(1.0f)), fr);\n  return vsel_vf_vo_vf_vf(vor_vo_vo_vo(visinf_vo_vf(x), vge_vo_vf_vf(vabs_vf_vf(x), vcast_vf_f(INT64_C(1) << 23))), x, vcopysign_vf_vf_vf(vsub_vf_vf_vf(x, fr), x));\n}\n\nEXPORT CONST VECTOR_CC vfloat xceilf(vfloat x) {\n  vfloat fr = vsub_vf_vf_vf(x, vcast_vf_vi2(vtruncate_vi2_vf(x)));\n  fr = vsel_vf_vo_vf_vf(vle_vo_vf_vf(fr, vcast_vf_f(0)), fr, vsub_vf_vf_vf(fr, 
vcast_vf_f(1.0f)));\n  return vsel_vf_vo_vf_vf(vor_vo_vo_vo(visinf_vo_vf(x), vge_vo_vf_vf(vabs_vf_vf(x), vcast_vf_f(INT64_C(1) << 23))), x, vcopysign_vf_vf_vf(vsub_vf_vf_vf(x, fr), x));\n}\n\nEXPORT CONST VECTOR_CC vfloat xroundf(vfloat d) {\n  vfloat x = vadd_vf_vf_vf(d, vcast_vf_f(0.5f));\n  vfloat fr = vsub_vf_vf_vf(x, vcast_vf_vi2(vtruncate_vi2_vf(x)));\n  x = vsel_vf_vo_vf_vf(vand_vo_vo_vo(vle_vo_vf_vf(x, vcast_vf_f(0)), veq_vo_vf_vf(fr, vcast_vf_f(0))), vsub_vf_vf_vf(x, vcast_vf_f(1.0f)), x);\n  fr = vsel_vf_vo_vf_vf(vlt_vo_vf_vf(fr, vcast_vf_f(0)), vadd_vf_vf_vf(fr, vcast_vf_f(1.0f)), fr);\n  x = vsel_vf_vo_vf_vf(veq_vo_vf_vf(d, vcast_vf_f(0.4999999701976776123f)), vcast_vf_f(0), x);\n  return vsel_vf_vo_vf_vf(vor_vo_vo_vo(visinf_vo_vf(d), vge_vo_vf_vf(vabs_vf_vf(d), vcast_vf_f(INT64_C(1) << 23))), d, vcopysign_vf_vf_vf(vsub_vf_vf_vf(x, fr), d));\n}\n\nEXPORT CONST VECTOR_CC vfloat xrintf(vfloat d) {\n#ifdef FULL_FP_ROUNDING\n  return vrint_vf_vf(d);\n#else\n  vfloat c = vmulsign_vf_vf_vf(vcast_vf_f(1 << 23), d);\n  return vsel_vf_vo_vf_vf(vgt_vo_vf_vf(vabs_vf_vf(d), vcast_vf_f(1 << 23)),\n\t\t\t  d, vorsign_vf_vf_vf(vsub_vf_vf_vf(vadd_vf_vf_vf(d, c), c), d));\n#endif\n}\n\nEXPORT CONST VECTOR_CC vfloat xfmaf(vfloat x, vfloat y, vfloat z) {\n#ifdef ENABLE_FMA_SP\n  return vfma_vf_vf_vf_vf(x, y, z);\n#else\n  vfloat h2 = vadd_vf_vf_vf(vmul_vf_vf_vf(x, y), z), q = vcast_vf_f(1);\n  vopmask o = vlt_vo_vf_vf(vabs_vf_vf(h2), vcast_vf_f(1e-38f));\n  {\n    const float c0 = UINT64_C(1) << 25, c1 = c0 * c0, c2 = c1 * c1;\n    x = vsel_vf_vo_vf_vf(o, vmul_vf_vf_vf(x, vcast_vf_f(c1)), x);\n    y = vsel_vf_vo_vf_vf(o, vmul_vf_vf_vf(y, vcast_vf_f(c1)), y);\n    z = vsel_vf_vo_vf_vf(o, vmul_vf_vf_vf(z, vcast_vf_f(c2)), z);\n    q = vsel_vf_vo_vf_vf(o, vcast_vf_f(1.0f / c2), q);\n  }\n  o = vgt_vo_vf_vf(vabs_vf_vf(h2), vcast_vf_f(1e+38f));\n  {\n    const float c0 = UINT64_C(1) << 25, c1 = c0 * c0, c2 = c1 * c1;\n    x = vsel_vf_vo_vf_vf(o, vmul_vf_vf_vf(x, 
vcast_vf_f(1.0f / c1)), x);\n    y = vsel_vf_vo_vf_vf(o, vmul_vf_vf_vf(y, vcast_vf_f(1.0f / c1)), y);\n    z = vsel_vf_vo_vf_vf(o, vmul_vf_vf_vf(z, vcast_vf_f(1.0f / c2)), z);\n    q = vsel_vf_vo_vf_vf(o, vcast_vf_f(c2), q);\n  }\n  vfloat2 d = dfmul_vf2_vf_vf(x, y);\n  d = dfadd2_vf2_vf2_vf(d, z);\n  vfloat ret = vsel_vf_vo_vf_vf(vor_vo_vo_vo(veq_vo_vf_vf(x, vcast_vf_f(0)), veq_vo_vf_vf(y, vcast_vf_f(0))), z, vadd_vf_vf_vf(vf2getx_vf_vf2(d), vf2gety_vf_vf2(d)));\n  o = visinf_vo_vf(z);\n  o = vandnot_vo_vo_vo(visinf_vo_vf(x), o);\n  o = vandnot_vo_vo_vo(visnan_vo_vf(x), o);\n  o = vandnot_vo_vo_vo(visinf_vo_vf(y), o);\n  o = vandnot_vo_vo_vo(visnan_vo_vf(y), o);\n  h2 = vsel_vf_vo_vf_vf(o, z, h2);\n\n  o = vor_vo_vo_vo(visinf_vo_vf(h2), visnan_vo_vf(h2));\n  \n  return vsel_vf_vo_vf_vf(o, h2, vmul_vf_vf_vf(ret, q));\n#endif\n}\n#endif // #if !defined(DETERMINISTIC)\n\n#if !defined(SLEEF_GENHEADER)\nstatic INLINE CONST VECTOR_CC vint2 vcast_vi2_i_i(int i0, int i1) { return vcast_vi2_vm(vcast_vm_i_i(i0, i1)); }\n#endif\n\nSQRTFU05_FUNCATR VECTOR_CC vfloat xsqrtf_u05(vfloat d) {\n#if defined(ENABLE_FMA_SP)\n  vfloat q, w, x, y, z;\n\n  d = vsel_vf_vo_vf_vf(vlt_vo_vf_vf(d, vcast_vf_f(0)), vcast_vf_f(SLEEF_NANf), d);\n\n  vopmask o = vlt_vo_vf_vf(d, vcast_vf_f(5.2939559203393770e-23f));\n  d = vsel_vf_vo_vf_vf(o, vmul_vf_vf_vf(d, vcast_vf_f(1.8889465931478580e+22f)), d);\n  q = vsel_vf_vo_vf_vf(o, vcast_vf_f(7.2759576141834260e-12f), vcast_vf_f(1.0f));\n\n  y = vreinterpret_vf_vi2(vsub_vi2_vi2_vi2(vcast_vi2_i(0x5f3759df), vsrl_vi2_vi2_i(vreinterpret_vi2_vf(d), 1)));\n\n  x = vmul_vf_vf_vf(d, y);         w = vmul_vf_vf_vf(vcast_vf_f(0.5), y);\n  y = vfmanp_vf_vf_vf_vf(x, w, vcast_vf_f(0.5));\n  x = vfma_vf_vf_vf_vf(x, y, x);   w = vfma_vf_vf_vf_vf(w, y, w);\n  y = vfmanp_vf_vf_vf_vf(x, w, vcast_vf_f(0.5));\n  x = vfma_vf_vf_vf_vf(x, y, x);   w = vfma_vf_vf_vf_vf(w, y, w);\n\n  y = vfmanp_vf_vf_vf_vf(x, w, vcast_vf_f(1.5));  w = vadd_vf_vf_vf(w, w);\n  w = 
vmul_vf_vf_vf(w, y);\n  x = vmul_vf_vf_vf(w, d);\n  y = vfmapn_vf_vf_vf_vf(w, d, x); z = vfmanp_vf_vf_vf_vf(w, x, vcast_vf_f(1));\n\n  z = vfmanp_vf_vf_vf_vf(w, y, z); w = vmul_vf_vf_vf(vcast_vf_f(0.5), x);\n  w = vfma_vf_vf_vf_vf(w, z, y);\n  w = vadd_vf_vf_vf(w, x);\n\n  w = vmul_vf_vf_vf(w, q);\n\n  w = vsel_vf_vo_vf_vf(vor_vo_vo_vo(veq_vo_vf_vf(d, vcast_vf_f(0)),\n\t\t\t\t    veq_vo_vf_vf(d, vcast_vf_f(SLEEF_INFINITYf))), d, w);\n\n  w = vsel_vf_vo_vf_vf(vlt_vo_vf_vf(d, vcast_vf_f(0)), vcast_vf_f(SLEEF_NANf), w);\n\n  return w;\n#else\n  vfloat q;\n  vopmask o;\n  \n  d = vsel_vf_vo_vf_vf(vlt_vo_vf_vf(d, vcast_vf_f(0)), vcast_vf_f(SLEEF_NANf), d);\n\n  o = vlt_vo_vf_vf(d, vcast_vf_f(5.2939559203393770e-23f));\n  d = vsel_vf_vo_vf_vf(o, vmul_vf_vf_vf(d, vcast_vf_f(1.8889465931478580e+22f)), d);\n  q = vsel_vf_vo_vf_vf(o, vcast_vf_f(7.2759576141834260e-12f*0.5f), vcast_vf_f(0.5f));\n\n  o = vgt_vo_vf_vf(d, vcast_vf_f(1.8446744073709552e+19f));\n  d = vsel_vf_vo_vf_vf(o, vmul_vf_vf_vf(d, vcast_vf_f(5.4210108624275220e-20f)), d);\n  q = vsel_vf_vo_vf_vf(o, vcast_vf_f(4294967296.0f * 0.5f), q);\n\n  vfloat x = vreinterpret_vf_vi2(vsub_vi2_vi2_vi2(vcast_vi2_i(0x5f375a86), vsrl_vi2_vi2_i(vreinterpret_vi2_vf(vadd_vf_vf_vf(d, vcast_vf_f(1e-45f))), 1)));\n\n  x = vmul_vf_vf_vf(x, vsub_vf_vf_vf(vcast_vf_f(1.5f), vmul_vf_vf_vf(vmul_vf_vf_vf(vmul_vf_vf_vf(vcast_vf_f(0.5f), d), x), x)));\n  x = vmul_vf_vf_vf(x, vsub_vf_vf_vf(vcast_vf_f(1.5f), vmul_vf_vf_vf(vmul_vf_vf_vf(vmul_vf_vf_vf(vcast_vf_f(0.5f), d), x), x)));\n  x = vmul_vf_vf_vf(x, vsub_vf_vf_vf(vcast_vf_f(1.5f), vmul_vf_vf_vf(vmul_vf_vf_vf(vmul_vf_vf_vf(vcast_vf_f(0.5f), d), x), x)));\n  x = vmul_vf_vf_vf(x, d);\n\n  vfloat2 d2 = dfmul_vf2_vf2_vf2(dfadd2_vf2_vf_vf2(d, dfmul_vf2_vf_vf(x, x)), dfrec_vf2_vf(x));\n\n  x = vmul_vf_vf_vf(vadd_vf_vf_vf(vf2getx_vf_vf2(d2), vf2gety_vf_vf2(d2)), q);\n\n  x = vsel_vf_vo_vf_vf(vispinf_vo_vf(d), vcast_vf_f(SLEEF_INFINITYf), x);\n  x = vsel_vf_vo_vf_vf(veq_vo_vf_vf(d, 
vcast_vf_f(0)), d, x);\n  \n  return x;\n#endif\n}\n\nEXPORT CONST VECTOR_CC vfloat xsqrtf(vfloat d) {\n#ifdef ACCURATE_SQRT\n  return vsqrt_vf_vf(d);\n#else\n  // fall back to approximation if ACCURATE_SQRT is undefined\n  return xsqrtf_u05(d);\n#endif\n}\n\n#if !defined(DETERMINISTIC)\nEXPORT CONST VECTOR_CC vfloat xhypotf_u05(vfloat x, vfloat y) {\n  x = vabs_vf_vf(x);\n  y = vabs_vf_vf(y);\n  vfloat min = vmin_vf_vf_vf(x, y), n = min;\n  vfloat max = vmax_vf_vf_vf(x, y), d = max;\n\n  vopmask o = vlt_vo_vf_vf(max, vcast_vf_f(FLT_MIN));\n  n = vsel_vf_vo_vf_vf(o, vmul_vf_vf_vf(n, vcast_vf_f(UINT64_C(1) << 24)), n);\n  d = vsel_vf_vo_vf_vf(o, vmul_vf_vf_vf(d, vcast_vf_f(UINT64_C(1) << 24)), d);\n\n  vfloat2 t = dfdiv_vf2_vf2_vf2(vcast_vf2_vf_vf(n, vcast_vf_f(0)), vcast_vf2_vf_vf(d, vcast_vf_f(0)));\n  t = dfmul_vf2_vf2_vf(dfsqrt_vf2_vf2(dfadd2_vf2_vf2_vf(dfsqu_vf2_vf2(t), vcast_vf_f(1))), max);\n  vfloat ret = vadd_vf_vf_vf(vf2getx_vf_vf2(t), vf2gety_vf_vf2(t));\n  ret = vsel_vf_vo_vf_vf(visnan_vo_vf(ret), vcast_vf_f(SLEEF_INFINITYf), ret);\n  ret = vsel_vf_vo_vf_vf(veq_vo_vf_vf(min, vcast_vf_f(0)), max, ret);\n  ret = vsel_vf_vo_vf_vf(vor_vo_vo_vo(visnan_vo_vf(x), visnan_vo_vf(y)), vcast_vf_f(SLEEF_NANf), ret);\n  ret = vsel_vf_vo_vf_vf(vor_vo_vo_vo(veq_vo_vf_vf(x, vcast_vf_f(SLEEF_INFINITYf)), veq_vo_vf_vf(y, vcast_vf_f(SLEEF_INFINITYf))), vcast_vf_f(SLEEF_INFINITYf), ret);\n\n  return ret;\n}\n\nEXPORT CONST VECTOR_CC vfloat xhypotf_u35(vfloat x, vfloat y) {\n  x = vabs_vf_vf(x);\n  y = vabs_vf_vf(y);\n  vfloat min = vmin_vf_vf_vf(x, y), n = min;\n  vfloat max = vmax_vf_vf_vf(x, y), d = max;\n\n  vfloat t = vdiv_vf_vf_vf(min, max);\n  vfloat ret = vmul_vf_vf_vf(max, vsqrt_vf_vf(vmla_vf_vf_vf_vf(t, t, vcast_vf_f(1))));\n  ret = vsel_vf_vo_vf_vf(veq_vo_vf_vf(min, vcast_vf_f(0)), max, ret);\n  ret = vsel_vf_vo_vf_vf(vor_vo_vo_vo(visnan_vo_vf(x), visnan_vo_vf(y)), vcast_vf_f(SLEEF_NANf), ret);\n  ret = vsel_vf_vo_vf_vf(vor_vo_vo_vo(veq_vo_vf_vf(x, 
vcast_vf_f(SLEEF_INFINITYf)), veq_vo_vf_vf(y, vcast_vf_f(SLEEF_INFINITYf))), vcast_vf_f(SLEEF_INFINITYf), ret);\n\n  return ret;\n}\n\nEXPORT CONST VECTOR_CC vfloat xnextafterf(vfloat x, vfloat y) {\n  x = vsel_vf_vo_vf_vf(veq_vo_vf_vf(x, vcast_vf_f(0)), vmulsign_vf_vf_vf(vcast_vf_f(0), y), x);\n  vint2 t, xi2 = vreinterpret_vi2_vf(x);\n  vopmask c = vxor_vo_vo_vo(vsignbit_vo_vf(x), vge_vo_vf_vf(y, x));\n\n  xi2 = vsel_vi2_vo_vi2_vi2(c, vsub_vi2_vi2_vi2(vcast_vi2_i(0), vxor_vi2_vi2_vi2(xi2, vcast_vi2_i(1 << 31))), xi2);\n\n  xi2 = vsel_vi2_vo_vi2_vi2(vneq_vo_vf_vf(x, y), vsub_vi2_vi2_vi2(xi2, vcast_vi2_i(1)), xi2);\n\n  xi2 = vsel_vi2_vo_vi2_vi2(c, vsub_vi2_vi2_vi2(vcast_vi2_i(0), vxor_vi2_vi2_vi2(xi2, vcast_vi2_i(1 << 31))), xi2);\n\n  vfloat ret = vreinterpret_vf_vi2(xi2);\n\n  ret = vsel_vf_vo_vf_vf(vand_vo_vo_vo(veq_vo_vf_vf(ret, vcast_vf_f(0)), vneq_vo_vf_vf(x, vcast_vf_f(0))), \n\t\t\t vmulsign_vf_vf_vf(vcast_vf_f(0), x), ret);\n\n  ret = vsel_vf_vo_vf_vf(vand_vo_vo_vo(veq_vo_vf_vf(x, vcast_vf_f(0)), veq_vo_vf_vf(y, vcast_vf_f(0))), y, ret);\n\n  ret = vsel_vf_vo_vf_vf(vor_vo_vo_vo(visnan_vo_vf(x), visnan_vo_vf(y)), vcast_vf_f(SLEEF_NANf), ret);\n  \n  return ret;\n}\n\nEXPORT CONST VECTOR_CC vfloat xfrfrexpf(vfloat x) {\n  x = vsel_vf_vo_vf_vf(vlt_vo_vf_vf(vabs_vf_vf(x), vcast_vf_f(FLT_MIN)), vmul_vf_vf_vf(x, vcast_vf_f(UINT64_C(1) << 30)), x);\n\n  vmask xm = vreinterpret_vm_vf(x);\n  xm = vand_vm_vm_vm(xm, vcast_vm_i_i(~0x7f800000U, ~0x7f800000U));\n  xm = vor_vm_vm_vm (xm, vcast_vm_i_i( 0x3f000000U,  0x3f000000U));\n\n  vfloat ret = vreinterpret_vf_vm(xm);\n\n  ret = vsel_vf_vo_vf_vf(visinf_vo_vf(x), vmulsign_vf_vf_vf(vcast_vf_f(SLEEF_INFINITYf), x), ret);\n  ret = vsel_vf_vo_vf_vf(veq_vo_vf_vf(x, vcast_vf_f(0)), x, ret);\n  \n  return ret;\n}\n#endif // #if !defined(DETERMINISTIC)\n\nEXPORT CONST VECTOR_CC vint2 xexpfrexpf(vfloat x) {\n  /*\n  x = vsel_vf_vo_vf_vf(vlt_vo_vf_vf(vabs_vf_vf(x), vcast_vf_f(FLT_MIN)), vmul_vf_vf_vf(x, vcast_vf_f(UINT64_C(1) 
<< 63)), x);\n\n  vint ret = vcastu_vi_vi2(vreinterpret_vi2_vf(x));\n  ret = vsub_vi_vi_vi(vand_vi_vi_vi(vsrl_vi_vi_i(ret, 20), vcast_vi_i(0x7ff)), vcast_vi_i(0x3fe));\n\n  ret = vsel_vi_vo_vi_vi(vor_vo_vo_vo(vor_vo_vo_vo(veq_vo_vf_vf(x, vcast_vf_f(0)), visnan_vo_vf(x)), visinf_vo_vf(x)), vcast_vi_i(0), ret);\n  \n  return ret;\n  */\n  return vcast_vi2_i(0);\n}\n\nstatic INLINE CONST VECTOR_CC vfloat vtoward0f(vfloat x) {\n  vfloat t = vreinterpret_vf_vi2(vsub_vi2_vi2_vi2(vreinterpret_vi2_vf(x), vcast_vi2_i(1)));\n  return vsel_vf_vo_vf_vf(veq_vo_vf_vf(x, vcast_vf_f(0)), vcast_vf_f(0), t);\n}\n\nstatic INLINE CONST VECTOR_CC vfloat vptruncf(vfloat x) {\n#ifdef FULL_FP_ROUNDING\n  return vtruncate_vf_vf(x);\n#else\n  vfloat fr = vsub_vf_vf_vf(x, vcast_vf_vi2(vtruncate_vi2_vf(x)));\n  return vsel_vf_vo_vf_vf(vge_vo_vf_vf(vabs_vf_vf(x), vcast_vf_f(INT64_C(1) << 23)), x, vsub_vf_vf_vf(x, fr));\n#endif\n}\n\n#if !defined(DETERMINISTIC)\nEXPORT CONST VECTOR_CC vfloat xfmodf(vfloat x, vfloat y) {\n  vfloat nu = vabs_vf_vf(x), de = vabs_vf_vf(y), s = vcast_vf_f(1), q;\n  vopmask o = vlt_vo_vf_vf(de, vcast_vf_f(FLT_MIN));\n  nu = vsel_vf_vo_vf_vf(o, vmul_vf_vf_vf(nu, vcast_vf_f(UINT64_C(1) << 25)), nu);\n  de = vsel_vf_vo_vf_vf(o, vmul_vf_vf_vf(de, vcast_vf_f(UINT64_C(1) << 25)), de);\n  s  = vsel_vf_vo_vf_vf(o, vmul_vf_vf_vf(s , vcast_vf_f(1.0f / (UINT64_C(1) << 25))), s);\n  vfloat rde = vtoward0f(vrec_vf_vf(de));\n#if defined(ENABLE_NEON32) || defined(ENABLE_NEON32VFPV4)\n  rde = vtoward0f(rde);\n#endif\n  vfloat2 r = vcast_vf2_vf_vf(nu, vcast_vf_f(0));\n\n  for(int i=0;i<8;i++) { // ceil(log2(FLT_MAX) / 22)+1\n    q = vptruncf(vmul_vf_vf_vf(vtoward0f(vf2getx_vf_vf2(r)), rde));\n    q = vsel_vf_vo_vf_vf(vand_vo_vo_vo(vgt_vo_vf_vf(vmul_vf_vf_vf(vcast_vf_f(3), de), vf2getx_vf_vf2(r)),\n\t\t\t\t       vge_vo_vf_vf(vf2getx_vf_vf2(r), de)),\n\t\t\t vcast_vf_f(2), q);\n    q = vsel_vf_vo_vf_vf(vand_vo_vo_vo(vgt_vo_vf_vf(vmul_vf_vf_vf(vcast_vf_f(2), de), 
vf2getx_vf_vf2(r)),\n\t\t\t\t       vge_vo_vf_vf(vf2getx_vf_vf2(r), de)),\n\t\t\t vcast_vf_f(1), q);\n    r = dfnormalize_vf2_vf2(dfadd2_vf2_vf2_vf2(r, dfmul_vf2_vf_vf(vptruncf(q), vneg_vf_vf(de))));\n    if (vtestallones_i_vo32(vlt_vo_vf_vf(vf2getx_vf_vf2(r), de))) break;\n  }\n  \n  vfloat ret = vmul_vf_vf_vf(vadd_vf_vf_vf(vf2getx_vf_vf2(r), vf2gety_vf_vf2(r)), s);\n  ret = vsel_vf_vo_vf_vf(veq_vo_vf_vf(vadd_vf_vf_vf(vf2getx_vf_vf2(r), vf2gety_vf_vf2(r)), de), vcast_vf_f(0), ret);\n\n  ret = vmulsign_vf_vf_vf(ret, x);\n\n  ret = vsel_vf_vo_vf_vf(vlt_vo_vf_vf(nu, de), x, ret);\n  ret = vsel_vf_vo_vf_vf(veq_vo_vf_vf(de, vcast_vf_f(0)), vcast_vf_f(SLEEF_NANf), ret);\n\n  return ret;\n}\n\nstatic INLINE CONST VECTOR_CC vfloat vrintfk2_vf_vf(vfloat d) {\n#ifdef FULL_FP_ROUNDING\n  return vrint_vf_vf(d);\n#else\n  vfloat c = vmulsign_vf_vf_vf(vcast_vf_f(1 << 23), d);\n  return vsel_vf_vo_vf_vf(vgt_vo_vf_vf(vabs_vf_vf(d), vcast_vf_f(1 << 23)),\n\t\t\t  d, vorsign_vf_vf_vf(vsub_vf_vf_vf(vadd_vf_vf_vf(d, c), c), d));\n#endif\n}\n\nEXPORT CONST VECTOR_CC vfloat xremainderf(vfloat x, vfloat y) {\n  vfloat n = vabs_vf_vf(x), d = vabs_vf_vf(y), s = vcast_vf_f(1), q;\n  vopmask o = vlt_vo_vf_vf(d, vcast_vf_f(FLT_MIN*2));\n  n = vsel_vf_vo_vf_vf(o, vmul_vf_vf_vf(n, vcast_vf_f(UINT64_C(1) << 25)), n);\n  d = vsel_vf_vo_vf_vf(o, vmul_vf_vf_vf(d, vcast_vf_f(UINT64_C(1) << 25)), d);\n  s  = vsel_vf_vo_vf_vf(o, vmul_vf_vf_vf(s , vcast_vf_f(1.0f / (UINT64_C(1) << 25))), s);\n  vfloat2 r = vcast_vf2_vf_vf(n, vcast_vf_f(0));\n  vfloat rd = vrec_vf_vf(d);\n  vopmask qisodd = vneq_vo_vf_vf(vcast_vf_f(0), vcast_vf_f(0));\n\n  for(int i=0;i<8;i++) { // ceil(log2(FLT_MAX) / 22)+1\n    q = vrintfk2_vf_vf(vmul_vf_vf_vf(vf2getx_vf_vf2(r), rd));\n    q = vsel_vf_vo_vf_vf(vlt_vo_vf_vf(vabs_vf_vf(vf2getx_vf_vf2(r)), vmul_vf_vf_vf(d, vcast_vf_f(1.5f))), vmulsign_vf_vf_vf(vcast_vf_f(1.0f), vf2getx_vf_vf2(r)), q);\n    q = vsel_vf_vo_vf_vf(vor_vo_vo_vo(vlt_vo_vf_vf(vabs_vf_vf(vf2getx_vf_vf2(r)), 
vmul_vf_vf_vf(d, vcast_vf_f(0.5f))),\n\t\t\t\t      vandnot_vo_vo_vo(qisodd, veq_vo_vf_vf(vabs_vf_vf(vf2getx_vf_vf2(r)), vmul_vf_vf_vf(d, vcast_vf_f(0.5f))))),\n\t\t\t vcast_vf_f(0.0), q);\n    if (vtestallones_i_vo32(veq_vo_vf_vf(q, vcast_vf_f(0)))) break;\n    q = vsel_vf_vo_vf_vf(visinf_vo_vf(vmul_vf_vf_vf(q, vneg_vf_vf(d))), vadd_vf_vf_vf(q, vmulsign_vf_vf_vf(vcast_vf_f(-1), vf2getx_vf_vf2(r))), q);\n    qisodd = vxor_vo_vo_vo(qisodd, vand_vo_vo_vo(veq_vo_vi2_vi2(vand_vi2_vi2_vi2(vtruncate_vi2_vf(q), vcast_vi2_i(1)), vcast_vi2_i(1)),\n\t\t\t\t\t\t vlt_vo_vf_vf(vabs_vf_vf(q), vcast_vf_f(1 << 24))));\n    r = dfnormalize_vf2_vf2(dfadd2_vf2_vf2_vf2(r, dfmul_vf2_vf_vf(q, vneg_vf_vf(d))));\n  }\n  \n  vfloat ret = vmul_vf_vf_vf(vadd_vf_vf_vf(vf2getx_vf_vf2(r), vf2gety_vf_vf2(r)), s);\n  ret = vmulsign_vf_vf_vf(ret, x);\n  ret = vsel_vf_vo_vf_vf(visinf_vo_vf(y), vsel_vf_vo_vf_vf(visinf_vo_vf(x), vcast_vf_f(SLEEF_NANf), x), ret);\n  ret = vsel_vf_vo_vf_vf(veq_vo_vf_vf(d, vcast_vf_f(0)), vcast_vf_f(SLEEF_NANf), ret);\n  return ret;\n}\n#endif // #if !defined(DETERMINISTIC)\n\n//\n\nstatic INLINE CONST VECTOR_CC vfloat2 sinpifk(vfloat d) {\n  vopmask o;\n  vfloat u, s, t;\n  vfloat2 x, s2;\n\n  u = vmul_vf_vf_vf(d, vcast_vf_f(4.0));\n  vint2 q = vtruncate_vi2_vf(u);\n  q = vand_vi2_vi2_vi2(vadd_vi2_vi2_vi2(q, vxor_vi2_vi2_vi2(vsrl_vi2_vi2_i(q, 31), vcast_vi2_i(1))), vcast_vi2_i(~1));\n  o = veq_vo_vi2_vi2(vand_vi2_vi2_vi2(q, vcast_vi2_i(2)), vcast_vi2_i(2));\n\n  s = vsub_vf_vf_vf(u, vcast_vf_vi2(q));\n  t = s;\n  s = vmul_vf_vf_vf(s, s);\n  s2 = dfmul_vf2_vf_vf(t, t);\n\n  //\n\n  u = vsel_vf_vo_f_f(o, -0.2430611801e-7f, +0.3093842054e-6f);\n  u = vmla_vf_vf_vf_vf(u, s, vsel_vf_vo_f_f(o, +0.3590577080e-5f, -0.3657307388e-4f));\n  u = vmla_vf_vf_vf_vf(u, s, vsel_vf_vo_f_f(o, -0.3259917721e-3f, +0.2490393585e-2f));\n  x = dfadd2_vf2_vf_vf2(vmul_vf_vf_vf(u, s),\n\t\t\tvsel_vf2_vo_f_f_f_f(o, 0.015854343771934509277, 4.4940051354032242811e-10,\n\t\t\t\t\t    
-0.080745510756969451904, -1.3373665339076936258e-09));\n  x = dfadd2_vf2_vf2_vf2(dfmul_vf2_vf2_vf2(s2, x),\n\t\t\t vsel_vf2_vo_f_f_f_f(o, -0.30842512845993041992, -9.0728339030733922277e-09,\n\t\t\t\t\t     0.78539818525314331055, -2.1857338617566484855e-08));\n\n  x = dfmul_vf2_vf2_vf2(x, vsel_vf2_vo_vf2_vf2(o, s2, vcast_vf2_vf_vf(t, vcast_vf_f(0))));\n  x = vsel_vf2_vo_vf2_vf2(o, dfadd2_vf2_vf2_vf(x, vcast_vf_f(1)), x);\n\n  o = veq_vo_vi2_vi2(vand_vi2_vi2_vi2(q, vcast_vi2_i(4)), vcast_vi2_i(4));\n  x = vf2setx_vf2_vf2_vf(x, vreinterpret_vf_vm(vxor_vm_vm_vm(vand_vm_vo32_vm(o, vreinterpret_vm_vf(vcast_vf_f(-0.0))), vreinterpret_vm_vf(vf2getx_vf_vf2(x)))));\n  x = vf2sety_vf2_vf2_vf(x, vreinterpret_vf_vm(vxor_vm_vm_vm(vand_vm_vo32_vm(o, vreinterpret_vm_vf(vcast_vf_f(-0.0))), vreinterpret_vm_vf(vf2gety_vf_vf2(x)))));\n\n  return x;\n}\n\n#if !defined(DETERMINISTIC)\nEXPORT CONST VECTOR_CC vfloat xsinpif_u05(vfloat d) {\n  vfloat2 x = sinpifk(d);\n  vfloat r = vadd_vf_vf_vf(vf2getx_vf_vf2(x), vf2gety_vf_vf2(x));\n\n  r = vsel_vf_vo_vf_vf(visnegzero_vo_vf(d), vcast_vf_f(-0.0), r);\n  r = vreinterpret_vf_vm(vandnot_vm_vo32_vm(vgt_vo_vf_vf(vabs_vf_vf(d), vcast_vf_f(TRIGRANGEMAX4f)), vreinterpret_vm_vf(r)));\n  r = vreinterpret_vf_vm(vor_vm_vo32_vm(visinf_vo_vf(d), vreinterpret_vm_vf(r)));\n  \n  return r;\n}\n#endif // #if !defined(DETERMINISTIC)\n\nstatic INLINE CONST VECTOR_CC vfloat2 cospifk(vfloat d) {\n  vopmask o;\n  vfloat u, s, t;\n  vfloat2 x, s2;\n\n  u = vmul_vf_vf_vf(d, vcast_vf_f(4.0));\n  vint2 q = vtruncate_vi2_vf(u);\n  q = vand_vi2_vi2_vi2(vadd_vi2_vi2_vi2(q, vxor_vi2_vi2_vi2(vsrl_vi2_vi2_i(q, 31), vcast_vi2_i(1))), vcast_vi2_i(~1));\n  o = veq_vo_vi2_vi2(vand_vi2_vi2_vi2(q, vcast_vi2_i(2)), vcast_vi2_i(0));\n\n  s = vsub_vf_vf_vf(u, vcast_vf_vi2(q));\n  t = s;\n  s = vmul_vf_vf_vf(s, s);\n  s2 = dfmul_vf2_vf_vf(t, t);\n  \n  //\n\n  u = vsel_vf_vo_f_f(o, -0.2430611801e-7f, +0.3093842054e-6f);\n  u = vmla_vf_vf_vf_vf(u, s, vsel_vf_vo_f_f(o, 
+0.3590577080e-5f, -0.3657307388e-4f));\n  u = vmla_vf_vf_vf_vf(u, s, vsel_vf_vo_f_f(o, -0.3259917721e-3f, +0.2490393585e-2f));\n  x = dfadd2_vf2_vf_vf2(vmul_vf_vf_vf(u, s),\n\t\t\tvsel_vf2_vo_f_f_f_f(o, 0.015854343771934509277, 4.4940051354032242811e-10,\n\t\t\t\t\t    -0.080745510756969451904, -1.3373665339076936258e-09));\n  x = dfadd2_vf2_vf2_vf2(dfmul_vf2_vf2_vf2(s2, x),\n\t\t\t vsel_vf2_vo_f_f_f_f(o, -0.30842512845993041992, -9.0728339030733922277e-09,\n\t\t\t\t\t     0.78539818525314331055, -2.1857338617566484855e-08));\n\n  x = dfmul_vf2_vf2_vf2(x, vsel_vf2_vo_vf2_vf2(o, s2, vcast_vf2_vf_vf(t, vcast_vf_f(0))));\n  x = vsel_vf2_vo_vf2_vf2(o, dfadd2_vf2_vf2_vf(x, vcast_vf_f(1)), x);\n\n  o = veq_vo_vi2_vi2(vand_vi2_vi2_vi2(vadd_vi2_vi2_vi2(q, vcast_vi2_i(2)), vcast_vi2_i(4)), vcast_vi2_i(4));\n  x = vf2setx_vf2_vf2_vf(x, vreinterpret_vf_vm(vxor_vm_vm_vm(vand_vm_vo32_vm(o, vreinterpret_vm_vf(vcast_vf_f(-0.0))), vreinterpret_vm_vf(vf2getx_vf_vf2(x)))));\n  x = vf2sety_vf2_vf2_vf(x, vreinterpret_vf_vm(vxor_vm_vm_vm(vand_vm_vo32_vm(o, vreinterpret_vm_vf(vcast_vf_f(-0.0))), vreinterpret_vm_vf(vf2gety_vf_vf2(x)))));\n\n  return x;\n}\n\n#if !defined(DETERMINISTIC)\nEXPORT CONST VECTOR_CC vfloat xcospif_u05(vfloat d) {\n  vfloat2 x = cospifk(d);\n  vfloat r = vadd_vf_vf_vf(vf2getx_vf_vf2(x), vf2gety_vf_vf2(x));\n\n  r = vsel_vf_vo_vf_vf(vgt_vo_vf_vf(vabs_vf_vf(d), vcast_vf_f(TRIGRANGEMAX4f)), vcast_vf_f(1), r);\n  r = vreinterpret_vf_vm(vor_vm_vo32_vm(visinf_vo_vf(d), vreinterpret_vm_vf(r)));\n  \n  return r;\n}\n#endif // #if !defined(DETERMINISTIC)\n\n#if !(defined(ENABLE_SVE) || defined(ENABLE_SVENOFMA))\n  typedef struct {\n    vfloat2 a, b;\n  } df2;\n\nstatic df2 df2setab_df2_vf2_vf2(vfloat2 a, vfloat2 b) {\n  df2 r = { a, b };\n  return r;\n}\nstatic vfloat2 df2geta_vf2_df2(df2 d) { return d.a; }\nstatic vfloat2 df2getb_vf2_df2(df2 d) { return d.b; }\n#endif\n\n/* TODO AArch64: potential optimization by using `vfmad_lane_f64` */\nstatic CONST df2 
gammafk(vfloat a) {\n  vfloat2 clc = vcast_vf2_f_f(0, 0), clln = vcast_vf2_f_f(1, 0), clld = vcast_vf2_f_f(1, 0);\n  vfloat2 v = vcast_vf2_f_f(1, 0), x, y, z;\n  vfloat t, u;\n\n  vopmask otiny = vlt_vo_vf_vf(vabs_vf_vf(a), vcast_vf_f(1e-30f)), oref = vlt_vo_vf_vf(a, vcast_vf_f(0.5));\n\n  x = vsel_vf2_vo_vf2_vf2(otiny, vcast_vf2_f_f(0, 0),\n\t\t\t  vsel_vf2_vo_vf2_vf2(oref, dfadd2_vf2_vf_vf(vcast_vf_f(1), vneg_vf_vf(a)),\n\t\t\t\t\t      vcast_vf2_vf_vf(a, vcast_vf_f(0))));\n\n  vopmask o0 = vand_vo_vo_vo(vle_vo_vf_vf(vcast_vf_f(0.5), vf2getx_vf_vf2(x)), vle_vo_vf_vf(vf2getx_vf_vf2(x), vcast_vf_f(1.2)));\n  vopmask o2 = vle_vo_vf_vf(vcast_vf_f(2.3), vf2getx_vf_vf2(x));\n  \n  y = dfnormalize_vf2_vf2(dfmul_vf2_vf2_vf2(dfadd2_vf2_vf2_vf(x, vcast_vf_f(1)), x));\n  y = dfnormalize_vf2_vf2(dfmul_vf2_vf2_vf2(dfadd2_vf2_vf2_vf(x, vcast_vf_f(2)), y));\n\n  vopmask o = vand_vo_vo_vo(o2, vle_vo_vf_vf(vf2getx_vf_vf2(x), vcast_vf_f(7)));\n  clln = vsel_vf2_vo_vf2_vf2(o, y, clln);\n\n  x = vsel_vf2_vo_vf2_vf2(o, dfadd2_vf2_vf2_vf(x, vcast_vf_f(3)), x);\n  t = vsel_vf_vo_vf_vf(o2, vrec_vf_vf(vf2getx_vf_vf2(x)), vf2getx_vf_vf2(dfnormalize_vf2_vf2(dfadd2_vf2_vf2_vf(x, vsel_vf_vo_f_f(o0, -1, -2)))));\n\n  u = vsel_vf_vo_vo_f_f_f(o2, o0, +0.000839498720672087279971000786, +0.9435157776e+0f, +0.1102489550e-3f);\n  u = vmla_vf_vf_vf_vf(u, t, vsel_vf_vo_vo_f_f_f(o2, o0, -5.17179090826059219329394422e-05, +0.8670063615e+0f, +0.8160019934e-4f));\n  u = vmla_vf_vf_vf_vf(u, t, vsel_vf_vo_vo_f_f_f(o2, o0, -0.000592166437353693882857342347, +0.4826702476e+0f, +0.1528468856e-3f));\n  u = vmla_vf_vf_vf_vf(u, t, vsel_vf_vo_vo_f_f_f(o2, o0, +6.97281375836585777403743539e-05, -0.8855129778e-1f, -0.2355068718e-3f));\n  u = vmla_vf_vf_vf_vf(u, t, vsel_vf_vo_vo_f_f_f(o2, o0, +0.000784039221720066627493314301, +0.1013825238e+0f, +0.4962242092e-3f));\n  u = vmla_vf_vf_vf_vf(u, t, vsel_vf_vo_vo_f_f_f(o2, o0, -0.000229472093621399176949318732, -0.1493408978e+0f, -0.1193488017e-2f));\n  u = 
vmla_vf_vf_vf_vf(u, t, vsel_vf_vo_vo_f_f_f(o2, o0, -0.002681327160493827160473958490, +0.1697509140e+0f, +0.2891599433e-2f));\n  u = vmla_vf_vf_vf_vf(u, t, vsel_vf_vo_vo_f_f_f(o2, o0, +0.003472222222222222222175164840, -0.2072454542e+0f, -0.7385451812e-2f));\n  u = vmla_vf_vf_vf_vf(u, t, vsel_vf_vo_vo_f_f_f(o2, o0, +0.083333333333333333335592087900, +0.2705872357e+0f, +0.2058077045e-1f));\n\n  y = dfmul_vf2_vf2_vf2(dfadd2_vf2_vf2_vf(x, vcast_vf_f(-0.5)), logk2f(x));\n  y = dfadd2_vf2_vf2_vf2(y, dfneg_vf2_vf2(x));\n  y = dfadd2_vf2_vf2_vf2(y, vcast_vf2_d(0.91893853320467278056)); // 0.5*log(2*M_PI)\n\n  z = dfadd2_vf2_vf2_vf(dfmul_vf2_vf_vf (u, t), vsel_vf_vo_f_f(o0, -0.400686534596170958447352690395e+0f, -0.673523028297382446749257758235e-1f));\n  z = dfadd2_vf2_vf2_vf(dfmul_vf2_vf2_vf(z, t), vsel_vf_vo_f_f(o0, +0.822466960142643054450325495997e+0f, +0.322467033928981157743538726901e+0f));\n  z = dfadd2_vf2_vf2_vf(dfmul_vf2_vf2_vf(z, t), vsel_vf_vo_f_f(o0, -0.577215665946766039837398973297e+0f, +0.422784335087484338986941629852e+0f));\n  z = dfmul_vf2_vf2_vf(z, t);\n\n  clc = vsel_vf2_vo_vf2_vf2(o2, y, z);\n  \n  clld = vsel_vf2_vo_vf2_vf2(o2, dfadd2_vf2_vf2_vf(dfmul_vf2_vf_vf(u, t), vcast_vf_f(1)), clld);\n  \n  y = clln;\n\n  clc = vsel_vf2_vo_vf2_vf2(otiny, vcast_vf2_d(41.58883083359671856503), // log(2^60)\n\t\t\t    vsel_vf2_vo_vf2_vf2(oref, dfadd2_vf2_vf2_vf2(vcast_vf2_d(1.1447298858494001639), dfneg_vf2_vf2(clc)), clc)); // log(M_PI)\n  clln = vsel_vf2_vo_vf2_vf2(otiny, vcast_vf2_f_f(1, 0), vsel_vf2_vo_vf2_vf2(oref, clln, clld));\n\n  if (!vtestallones_i_vo32(vnot_vo32_vo32(oref))) {\n    t = vsub_vf_vf_vf(a, vmul_vf_vf_vf(vcast_vf_f(INT64_C(1) << 12), vcast_vf_vi2(vtruncate_vi2_vf(vmul_vf_vf_vf(a, vcast_vf_f(1.0 / (INT64_C(1) << 12)))))));\n    x = dfmul_vf2_vf2_vf2(clld, sinpifk(t));\n  }\n  \n  clld = vsel_vf2_vo_vf2_vf2(otiny, vcast_vf2_vf_vf(vmul_vf_vf_vf(a, vcast_vf_f((INT64_C(1) << 30)*(float)(INT64_C(1) << 30))), vcast_vf_f(0)),\n\t\t\t     
vsel_vf2_vo_vf2_vf2(oref, x, y));\n\n  return df2setab_df2_vf2_vf2(clc, dfdiv_vf2_vf2_vf2(clln, clld));\n}\n\n#if !defined(DETERMINISTIC)\nEXPORT CONST VECTOR_CC vfloat xtgammaf_u1(vfloat a) {\n  df2 d = gammafk(a);\n  vfloat2 y = dfmul_vf2_vf2_vf2(expk2f(df2geta_vf2_df2(d)), df2getb_vf2_df2(d));\n  vfloat r = vadd_vf_vf_vf(vf2getx_vf_vf2(y), vf2gety_vf_vf2(y));\n  vopmask o;\n\n  o = vor_vo_vo_vo(vor_vo_vo_vo(veq_vo_vf_vf(a, vcast_vf_f(-SLEEF_INFINITYf)),\n\t\t\t\tvand_vo_vo_vo(vlt_vo_vf_vf(a, vcast_vf_f(0)), visint_vo_vf(a))),\n\t\t   vand_vo_vo_vo(vand_vo_vo_vo(visnumber_vo_vf(a), vlt_vo_vf_vf(a, vcast_vf_f(0))), visnan_vo_vf(r)));\n  r = vsel_vf_vo_vf_vf(o, vcast_vf_f(SLEEF_NANf), r);\n\n  o = vand_vo_vo_vo(vand_vo_vo_vo(vor_vo_vo_vo(veq_vo_vf_vf(a, vcast_vf_f(SLEEF_INFINITYf)), visnumber_vo_vf(a)),\n\t\t\t\t  vge_vo_vf_vf(a, vcast_vf_f(-FLT_MIN))),\n\t\t    vor_vo_vo_vo(vor_vo_vo_vo(veq_vo_vf_vf(a, vcast_vf_f(0)), vgt_vo_vf_vf(a, vcast_vf_f(36))), visnan_vo_vf(r)));\n  r = vsel_vf_vo_vf_vf(o, vmulsign_vf_vf_vf(vcast_vf_f(SLEEF_INFINITYf), a), r);\n  \n  return r;\n}\n\nEXPORT CONST VECTOR_CC vfloat xlgammaf_u1(vfloat a) {\n  df2 d = gammafk(a);\n  vfloat2 y = dfadd2_vf2_vf2_vf2(df2geta_vf2_df2(d), logk2f(dfabs_vf2_vf2(df2getb_vf2_df2(d))));\n  vfloat r = vadd_vf_vf_vf(vf2getx_vf_vf2(y), vf2gety_vf_vf2(y));\n  vopmask o;\n\n  o = vor_vo_vo_vo(visinf_vo_vf(a),\n\t\t   vor_vo_vo_vo(vand_vo_vo_vo(vle_vo_vf_vf(a, vcast_vf_f(0)), visint_vo_vf(a)),\n\t\t\t\tvand_vo_vo_vo(visnumber_vo_vf(a), visnan_vo_vf(r))));\n  r = vsel_vf_vo_vf_vf(o, vcast_vf_f(SLEEF_INFINITYf), r);\n\n  return r;\n}\n\n/* TODO AArch64: potential optimization by using `vfmad_lane_f64` */\nEXPORT CONST VECTOR_CC vfloat xerff_u1(vfloat a) {\n  vfloat s = a, t, u;\n  vfloat2 d;\n\n  a = vabs_vf_vf(a);\n  vopmask o0 = vlt_vo_vf_vf(a, vcast_vf_f(1.1));\n  vopmask o1 = vlt_vo_vf_vf(a, vcast_vf_f(2.4));\n  vopmask o2 = vlt_vo_vf_vf(a, vcast_vf_f(4.0));\n  u = vsel_vf_vo_vf_vf(o0, vmul_vf_vf_vf(a, a), 
a);\n  \n  t = vsel_vf_vo_vo_f_f_f(o0, o1, +0.7089292194e-4f, -0.1792667899e-4f, -0.9495757695e-5f);\n  t = vmla_vf_vf_vf_vf(t, u, vsel_vf_vo_vo_f_f_f(o0, o1, -0.7768311189e-3f, +0.3937633010e-3f, +0.2481465926e-3f));\n  t = vmla_vf_vf_vf_vf(t, u, vsel_vf_vo_vo_f_f_f(o0, o1, +0.5159463733e-2f, -0.3949181177e-2f, -0.2918176819e-2f));\n  t = vmla_vf_vf_vf_vf(t, u, vsel_vf_vo_vo_f_f_f(o0, o1, -0.2683781274e-1f, +0.2445474640e-1f, +0.2059706673e-1f));\n  t = vmla_vf_vf_vf_vf(t, u, vsel_vf_vo_vo_f_f_f(o0, o1, +0.1128318012e+0f, -0.1070996150e+0f, -0.9901899844e-1f));\n  d = dfmul_vf2_vf_vf(t, u);\n  d = dfadd2_vf2_vf2_vf2(d, vsel_vf2_vo_vo_d_d_d(o0, o1, -0.376125876000657465175213237214e+0, -0.634588905908410389971210809210e+0, -0.643598050547891613081201721633e+0));\n  d = dfmul_vf2_vf2_vf(d, u);\n  d = dfadd2_vf2_vf2_vf2(d, vsel_vf2_vo_vo_d_d_d(o0, o1, +0.112837916021059138255978217023e+1, -0.112879855826694507209862753992e+1, -0.112461487742845562801052956293e+1));\n  d = dfmul_vf2_vf2_vf(d, a);\n  d = vsel_vf2_vo_vf2_vf2(o0, d, dfadd_vf2_vf_vf2(vcast_vf_f(1.0), dfneg_vf2_vf2(expk2f(d))));\n  u = vmulsign_vf_vf_vf(vsel_vf_vo_vf_vf(o2, vadd_vf_vf_vf(vf2getx_vf_vf2(d), vf2gety_vf_vf2(d)), vcast_vf_f(1)), s);\n  u = vsel_vf_vo_vf_vf(visnan_vo_vf(a), vcast_vf_f(SLEEF_NANf), u);\n\n  return u;\n}\n\n/* TODO AArch64: potential optimization by using `vfmad_lane_f64` */\nEXPORT CONST VECTOR_CC vfloat xerfcf_u15(vfloat a) {\n  vfloat s = a, r = vcast_vf_f(0), t;\n  vfloat2 u, d, x;\n  a = vabs_vf_vf(a);\n  vopmask o0 = vlt_vo_vf_vf(a, vcast_vf_f(1.0));\n  vopmask o1 = vlt_vo_vf_vf(a, vcast_vf_f(2.2));\n  vopmask o2 = vlt_vo_vf_vf(a, vcast_vf_f(4.3));\n  vopmask o3 = vlt_vo_vf_vf(a, vcast_vf_f(10.1));\n\n  u = vsel_vf2_vo_vf2_vf2(o1, vcast_vf2_vf_vf(a, vcast_vf_f(0)), dfdiv_vf2_vf2_vf2(vcast_vf2_f_f(1, 0), vcast_vf2_vf_vf(a, vcast_vf_f(0))));\n\n  t = vsel_vf_vo_vo_vo_f_f_f_f(o0, o1, o2, -0.8638041618e-4f, -0.6236977242e-5f, -0.3869504035e+0f, +0.1115344167e+1f);\n  t = 
vmla_vf_vf_vf_vf(t, vf2getx_vf_vf2(u), vsel_vf_vo_vo_vo_f_f_f_f(o0, o1, o2, +0.6000166177e-3f, +0.5749821503e-4f, +0.1288077235e+1f, -0.9454904199e+0f));\n  t = vmla_vf_vf_vf_vf(t, vf2getx_vf_vf2(u), vsel_vf_vo_vo_vo_f_f_f_f(o0, o1, o2, -0.1665703603e-2f, +0.6002851478e-5f, -0.1816803217e+1f, -0.3667259514e+0f));\n  t = vmla_vf_vf_vf_vf(t, vf2getx_vf_vf2(u), vsel_vf_vo_vo_vo_f_f_f_f(o0, o1, o2, +0.1795156277e-3f, -0.2851036377e-2f, +0.1249150872e+1f, +0.7155663371e+0f));\n  t = vmla_vf_vf_vf_vf(t, vf2getx_vf_vf2(u), vsel_vf_vo_vo_vo_f_f_f_f(o0, o1, o2, +0.1914106123e-1f, +0.2260518074e-1f, -0.1328857988e+0f, -0.1262947265e-1f));\n\n  d = dfmul_vf2_vf2_vf(u, t);\n  d = dfadd2_vf2_vf2_vf2(d, vsel_vf2_vo_vo_vo_d_d_d_d(o0, o1, o2, -0.102775359343930288081655368891e+0, -0.105247583459338632253369014063e+0, -0.482365310333045318680618892669e+0, -0.498961546254537647970305302739e+0));\n  d = dfmul_vf2_vf2_vf2(d, u);\n  d = dfadd2_vf2_vf2_vf2(d, vsel_vf2_vo_vo_vo_d_d_d_d(o0, o1, o2, -0.636619483208481931303752546439e+0, -0.635609463574589034216723775292e+0, -0.134450203224533979217859332703e-2, -0.471199543422848492080722832666e-4));\n  d = dfmul_vf2_vf2_vf2(d, u);\n  d = dfadd2_vf2_vf2_vf2(d, vsel_vf2_vo_vo_vo_d_d_d_d(o0, o1, o2, -0.112837917790537404939545770596e+1, -0.112855987376668622084547028949e+1, -0.572319781150472949561786101080e+0, -0.572364030327966044425932623525e+0));\n  \n  x = dfmul_vf2_vf2_vf(vsel_vf2_vo_vf2_vf2(o1, d, vcast_vf2_vf_vf(vneg_vf_vf(a), vcast_vf_f(0))), a);\n  x = vsel_vf2_vo_vf2_vf2(o1, x, dfadd2_vf2_vf2_vf2(x, d));\n\n  x = expk2f(x);\n  x = vsel_vf2_vo_vf2_vf2(o1, x, dfmul_vf2_vf2_vf2(x, u));\n\n  r = vsel_vf_vo_vf_vf(o3, vadd_vf_vf_vf(vf2getx_vf_vf2(x), vf2gety_vf_vf2(x)), vcast_vf_f(0));\n  r = vsel_vf_vo_vf_vf(vsignbit_vo_vf(s), vsub_vf_vf_vf(vcast_vf_f(2), r), r);\n  r = vsel_vf_vo_vf_vf(visnan_vo_vf(s), vcast_vf_f(SLEEF_NANf), r);\n  return r;\n}\n#endif // #if !defined(DETERMINISTIC)\n\n#if !defined(DETERMINISTIC) && 
!defined(ENABLE_GNUABI) && !defined(SLEEF_GENHEADER)\n// See sleefsimddp.c for explanation of these macros\n\n#ifdef ENABLE_ALIAS\n#define DALIAS_vf_vf(FUNC) EXPORT CONST VECTOR_CC vfloat y ## FUNC(vfloat) __attribute__((alias( stringify(x ## FUNC) )));\n#define DALIAS_vf2_vf(FUNC) EXPORT CONST VECTOR_CC vfloat2 y ## FUNC(vfloat) __attribute__((alias( stringify(x ## FUNC) )));\n#define DALIAS_vf_vf_vf(FUNC) EXPORT CONST VECTOR_CC vfloat y ## FUNC(vfloat, vfloat) __attribute__((alias( stringify(x ## FUNC) )));\n#define DALIAS_vf_vf_vf_vf(FUNC) EXPORT CONST VECTOR_CC vfloat y ## FUNC(vfloat, vfloat, vfloat) __attribute__((alias( stringify(x ## FUNC) )));\n#else\n#define DALIAS_vf_vf(FUNC) EXPORT CONST VECTOR_CC vfloat y ## FUNC(vfloat d) { return x ## FUNC (d); }\n#define DALIAS_vf2_vf(FUNC) EXPORT CONST VECTOR_CC vfloat2 y ## FUNC(vfloat d) { return x ## FUNC (d); }\n#define DALIAS_vf_vf_vf(FUNC) EXPORT CONST VECTOR_CC vfloat y ## FUNC(vfloat x, vfloat y) { return x ## FUNC (x, y); }\n#define DALIAS_vf_vf_vf_vf(FUNC) EXPORT CONST VECTOR_CC vfloat y ## FUNC(vfloat x, vfloat y, vfloat z) { return x ## FUNC (x, y, z); }\n#endif\n\n/* DALIAS_vf2_vf(sincospif_u05) */\n/* DALIAS_vf2_vf(sincospif_u35) */\n/* DALIAS_vf2_vf(modff) */\n/* DALIAS_vf_vf(atanf) */\n/* DALIAS_vf_vf_vf(atan2f) */\n/* DALIAS_vf_vf(asinf) */\n/* DALIAS_vf_vf(acosf) */\n/* DALIAS_vf_vf_vf(atan2f_u1) */\n/* DALIAS_vf_vf(asinf_u1) */\n/* DALIAS_vf_vf(acosf_u1) */\n/* DALIAS_vf_vf(atanf_u1) */\n/* DALIAS_vf_vf(logf) */\n/* DALIAS_vf_vf(expf) */\n/* DALIAS_vf_vf(cbrtf) */\n/* DALIAS_vf_vf(cbrtf_u1) */\n/* DALIAS_vf_vf(logf_u1) */\n/* DALIAS_vf_vf_vf(powf) */\n/* DALIAS_vf_vf(sinhf) */\n/* DALIAS_vf_vf(coshf) */\n/* DALIAS_vf_vf(tanhf) */\n/* DALIAS_vf_vf(sinhf_u35) */\n/* DALIAS_vf_vf(coshf_u35) */\n/* DALIAS_vf_vf(tanhf_u35) */\n/* DALIAS_vf_vf(asinhf) */\n/* DALIAS_vf_vf(acoshf) */\n/* DALIAS_vf_vf(atanhf) */\n/* DALIAS_vf_vf(exp2f) */\n/* DALIAS_vf_vf(exp2f_u35) */\n/* DALIAS_vf_vf(exp10f) */\n/* 
DALIAS_vf_vf(exp10f_u35) */\n/* DALIAS_vf_vf(expm1f) */\n/* DALIAS_vf_vf(log10f) */\n/* DALIAS_vf_vf(log2f) */\n/* DALIAS_vf_vf(log2f_u35) */\n/* DALIAS_vf_vf(log1pf) */\n/* DALIAS_vf_vf(fabsf) */\n/* DALIAS_vf_vf_vf(copysignf) */\n/* DALIAS_vf_vf_vf(fmaxf) */\n/* DALIAS_vf_vf_vf(fminf) */\n/* DALIAS_vf_vf_vf(fdimf) */\n/* DALIAS_vf_vf(truncf) */\n/* DALIAS_vf_vf(floorf) */\n/* DALIAS_vf_vf(ceilf) */\n/* DALIAS_vf_vf(roundf) */\n/* DALIAS_vf_vf(rintf) */\n/* DALIAS_vf_vf_vf_vf(fmaf) */\n/* DALIAS_vf_vf_vf(hypotf_u05) */\n/* DALIAS_vf_vf_vf(hypotf_u35) */\n/* DALIAS_vf_vf_vf(nextafterf) */\n/* DALIAS_vf_vf(frfrexpf) */\n/* DALIAS_vf_vf_vf(fmodf) */\n/* DALIAS_vf_vf_vf(remainderf) */\n/* DALIAS_vf_vf(sinpif_u05) */\n/* DALIAS_vf_vf(cospif_u05) */\n/* DALIAS_vf_vf(tgammaf_u1) */\n/* DALIAS_vf_vf(lgammaf_u1) */\n/* DALIAS_vf_vf(erff_u1) */\n/* DALIAS_vf_vf(erfcf_u15) */\n/* DALIAS_vf_vf_vf(fastpowf_u3500) */\n#endif // #if !defined(DETERMINISTIC) && !defined(ENABLE_GNUABI) && !defined(SLEEF_GENHEADER)\n\n#if !defined(ENABLE_GNUABI) && !defined(SLEEF_GENHEADER)\nEXPORT CONST int xgetIntf(int name) {\n  if (1 <= name && name <= 10) return vavailability_i(name);\n  return 0;\n}\n\nEXPORT CONST void *xgetPtrf(int name) {\n  if (name == 0) return ISANAME;\n  return (void *)0;\n}\n#endif\n\n#if defined(ALIAS_NO_EXT_SUFFIX) && !defined(DETERMINISTIC)\n#include ALIAS_NO_EXT_SUFFIX\n#endif\n\n#ifdef ENABLE_GNUABI\nEXPORT CONST VECTOR_CC vfloat __acosf_finite     (vfloat)         __attribute__((weak, alias(str_xacosf_u1  )));\nEXPORT CONST VECTOR_CC vfloat __acoshf_finite    (vfloat)         __attribute__((weak, alias(str_xacoshf    )));\nEXPORT CONST VECTOR_CC vfloat __asinf_finite     (vfloat)         __attribute__((weak, alias(str_xasinf_u1  )));\nEXPORT CONST VECTOR_CC vfloat __atan2f_finite    (vfloat, vfloat) __attribute__((weak, alias(str_xatan2f_u1 )));\nEXPORT CONST VECTOR_CC vfloat __atanhf_finite    (vfloat)         __attribute__((weak, alias(str_xatanhf    
)));\nEXPORT CONST VECTOR_CC vfloat __coshf_finite     (vfloat)         __attribute__((weak, alias(str_xcoshf     )));\nEXPORT CONST VECTOR_CC vfloat __exp10f_finite    (vfloat)         __attribute__((weak, alias(str_xexp10f    )));\nEXPORT CONST VECTOR_CC vfloat __exp2f_finite     (vfloat)         __attribute__((weak, alias(str_xexp2f     )));\nEXPORT CONST VECTOR_CC vfloat __expf_finite      (vfloat)         __attribute__((weak, alias(str_xexpf      )));\nEXPORT CONST VECTOR_CC vfloat __fmodf_finite     (vfloat, vfloat) __attribute__((weak, alias(str_xfmodf     )));\nEXPORT CONST VECTOR_CC vfloat __remainderf_finite(vfloat, vfloat) __attribute__((weak, alias(str_xremainderf)));\nEXPORT CONST VECTOR_CC vfloat __modff_finite      (vfloat, vfloat *) __attribute__((weak, alias(str_xmodff  )));\nEXPORT CONST VECTOR_CC vfloat __hypotf_u05_finite(vfloat, vfloat) __attribute__((weak, alias(str_xhypotf_u05)));\nEXPORT CONST VECTOR_CC vfloat __lgammaf_u1_finite(vfloat)         __attribute__((weak, alias(str_xlgammaf_u1)));\nEXPORT CONST VECTOR_CC vfloat __log10f_finite    (vfloat)         __attribute__((weak, alias(str_xlog10f    )));\nEXPORT CONST VECTOR_CC vfloat __logf_finite      (vfloat)         __attribute__((weak, alias(str_xlogf_u1   )));\nEXPORT CONST VECTOR_CC vfloat __powf_finite      (vfloat, vfloat) __attribute__((weak, alias(str_xpowf      )));\nEXPORT CONST VECTOR_CC vfloat __sinhf_finite     (vfloat)         __attribute__((weak, alias(str_xsinhf     )));\nEXPORT CONST VECTOR_CC vfloat __sqrtf_finite     (vfloat)         __attribute__((weak, alias(str_xsqrtf     )));\nEXPORT CONST VECTOR_CC vfloat __tgammaf_u1_finite(vfloat)         __attribute__((weak, alias(str_xtgammaf_u1)));\n\n#ifdef HEADER_MASKED\n#include HEADER_MASKED\n#endif\n#endif /* #ifdef ENABLE_GNUABI */\n\n#ifdef ENABLE_MAIN\n// gcc -DENABLE_MAIN -Wno-attributes -I../common -I../arch -DENABLE_AVX2 -mavx2 -mfma sleefsimdsp.c rempitab.c ../common/common.c -lm\n#include <stdio.h>\n#include 
<stdlib.h>\n#include <math.h>\nint main(int argc, char **argv) {\n  vfloat vf1 = vcast_vf_f(atof(argv[1]));\n  //vfloat vf2 = vcast_vf_f(atof(argv[2]));\n\n  //vfloat r = xpowf(vf1, vf2);\n  //vfloat r = xsqrtf_u05(vf1);\n  //printf(\"%g\\n\", xnextafterf(vf1, vf2)[0]);\n  //printf(\"%g\\n\", nextafterf(atof(argv[1]), atof(argv[2])));\n  printf(\"t = %.20g\\n\", xlogf_u1(vf1)[0]);\n  printf(\"c = %.20g\\n\", logf(atof(argv[1])));\n  \n}\n#endif\n"
  },
  {
    "path": "src/sleefsimdsp_emulation.c",
    "content": "/*\n\nCopyright (c) 2021 Agenium Scale\n\nPermission is hereby granted, free of charge, to any person obtaining a copy\nof this software and associated documentation files (the \"Software\"), to deal\nin the Software without restriction, including without limitation the rights\nto use, copy, modify, merge, publish, distribute, sublicense, and/or sell\ncopies of the Software, and to permit persons to whom the Software is\nfurnished to do so, subject to the following conditions:\n\nThe above copyright notice and this permission notice shall be included in all\ncopies or substantial portions of the Software.\n\nTHE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\nIMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\nFITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\nAUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\nLIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\nOUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\nSOFTWARE.\n\n*/\n\n#include <nsimd/nsimd.h>\n\n#ifdef ENABLE_VSX\n#include \"renamevsx.h\"\n#define nsimd_vec_f32 nsimd_vmx_vf32\n#define get0(a) vec_extract(a, 0)\n#define get1(a) vec_extract(a, 1)\n#define get2(a) vec_extract(a, 2)\n#define get3(a) vec_extract(a, 3)\n#define set0(a, b) vec_splats(b)\n#define set1(a, b) vec_insert(b, a, 1)\n#define set2(a, b) vec_insert(b, a, 2)\n#define set3(a, b) vec_insert(b, a, 3)\n#endif\n\nnsimd_vec_f32 xsinf(nsimd_vec_f32 a0_) {\n  nsimd_vec_f32 ret_;\n  nsimd_cpu_vf32 a0, ret;\n  a0.v0 = get0(a0_);\n  a0.v1 = get1(a0_);\n  a0.v2 = get2(a0_);\n  a0.v3 = get3(a0_);\n  ret = nsimd_sin_u35_cpu_f32(a0);\n  ret_ = set0(ret_, ret.v0);\n  ret_ = set1(ret_, ret.v1);\n  ret_ = set2(ret_, ret.v2);\n  ret_ = set3(ret_, ret.v3);\n  return ret_;\n}\n\n\nnsimd_vec_f32 xcosf(nsimd_vec_f32 a0_) {\n  nsimd_vec_f32 ret_;\n  nsimd_cpu_vf32 a0, ret;\n  a0.v0 = get0(a0_);\n  
a0.v1 = get1(a0_);\n  a0.v2 = get2(a0_);\n  a0.v3 = get3(a0_);\n  ret = nsimd_cos_u35_cpu_f32(a0);\n  ret_ = set0(ret_, ret.v0);\n  ret_ = set1(ret_, ret.v1);\n  ret_ = set2(ret_, ret.v2);\n  ret_ = set3(ret_, ret.v3);\n  return ret_;\n}\n\nnsimd_vec_f32 xtanf(nsimd_vec_f32 a0_) {\n  nsimd_vec_f32 ret_;\n  nsimd_cpu_vf32 a0, ret;\n  a0.v0 = get0(a0_);\n  a0.v1 = get1(a0_);\n  a0.v2 = get2(a0_);\n  a0.v3 = get3(a0_);\n  ret = nsimd_tan_u35_cpu_f32(a0);\n  ret_ = set0(ret_, ret.v0);\n  ret_ = set1(ret_, ret.v1);\n  ret_ = set2(ret_, ret.v2);\n  ret_ = set3(ret_, ret.v3);\n  return ret_;\n}\n\nnsimd_vec_f32 xasinf(nsimd_vec_f32 a0_) {\n  nsimd_vec_f32 ret_;\n  nsimd_cpu_vf32 a0, ret;\n  a0.v0 = get0(a0_);\n  a0.v1 = get1(a0_);\n  a0.v2 = get2(a0_);\n  a0.v3 = get3(a0_);\n  ret = nsimd_asin_u35_cpu_f32(a0);\n  ret_ = set0(ret_, ret.v0);\n  ret_ = set1(ret_, ret.v1);\n  ret_ = set2(ret_, ret.v2);\n  ret_ = set3(ret_, ret.v3);\n  return ret_;\n}\n\nnsimd_vec_f32 xacosf(nsimd_vec_f32 a0_) {\n  nsimd_vec_f32 ret_;\n  nsimd_cpu_vf32 a0, ret;\n  a0.v0 = get0(a0_);\n  a0.v1 = get1(a0_);\n  a0.v2 = get2(a0_);\n  a0.v3 = get3(a0_);\n  ret = nsimd_acos_u35_cpu_f32(a0);\n  ret_ = set0(ret_, ret.v0);\n  ret_ = set1(ret_, ret.v1);\n  ret_ = set2(ret_, ret.v2);\n  ret_ = set3(ret_, ret.v3);\n  return ret_;\n}\n\nnsimd_vec_f32 xatanf(nsimd_vec_f32 a0_) {\n  nsimd_vec_f32 ret_;\n  nsimd_cpu_vf32 a0, ret;\n  a0.v0 = get0(a0_);\n  a0.v1 = get1(a0_);\n  a0.v2 = get2(a0_);\n  a0.v3 = get3(a0_);\n  ret = nsimd_atan_u35_cpu_f32(a0);\n  ret_ = set0(ret_, ret.v0);\n  ret_ = set1(ret_, ret.v1);\n  ret_ = set2(ret_, ret.v2);\n  ret_ = set3(ret_, ret.v3);\n  return ret_;\n}\n\nnsimd_vec_f32 xatan2f(nsimd_vec_f32 a0_, nsimd_vec_f32 a1_) {\n  nsimd_vec_f32 ret_;\n  nsimd_cpu_vf32 a0, a1, ret;\n  a0.v0 = get0(a0_);\n  a0.v1 = get1(a0_);\n  a0.v2 = get2(a0_);\n  a0.v3 = get3(a0_);\n  a1.v0 = get0(a1_);\n  a1.v1 = get1(a1_);\n  a1.v2 = get2(a1_);\n  a1.v3 = get3(a1_);\n  ret = 
nsimd_atan2_u35_cpu_f32(a0, a1);\n  ret_ = set0(ret_, ret.v0);\n  ret_ = set1(ret_, ret.v1);\n  ret_ = set2(ret_, ret.v2);\n  ret_ = set3(ret_, ret.v3);\n  return ret_;\n}\n\nnsimd_vec_f32 xlogf(nsimd_vec_f32 a0_) {\n  nsimd_vec_f32 ret_;\n  nsimd_cpu_vf32 a0, ret;\n  a0.v0 = get0(a0_);\n  a0.v1 = get1(a0_);\n  a0.v2 = get2(a0_);\n  a0.v3 = get3(a0_);\n  ret = nsimd_log_u35_cpu_f32(a0);\n  ret_ = set0(ret_, ret.v0);\n  ret_ = set1(ret_, ret.v1);\n  ret_ = set2(ret_, ret.v2);\n  ret_ = set3(ret_, ret.v3);\n  return ret_;\n}\n\nnsimd_vec_f32 xcbrtf(nsimd_vec_f32 a0_) {\n  nsimd_vec_f32 ret_;\n  nsimd_cpu_vf32 a0, ret;\n  a0.v0 = get0(a0_);\n  a0.v1 = get1(a0_);\n  a0.v2 = get2(a0_);\n  a0.v3 = get3(a0_);\n  ret = nsimd_cbrt_u35_cpu_f32(a0);\n  ret_ = set0(ret_, ret.v0);\n  ret_ = set1(ret_, ret.v1);\n  ret_ = set2(ret_, ret.v2);\n  ret_ = set3(ret_, ret.v3);\n  return ret_;\n}\n\nnsimd_vec_f32 xsinf_u1(nsimd_vec_f32 a0_) {\n  nsimd_vec_f32 ret_;\n  nsimd_cpu_vf32 a0, ret;\n  a0.v0 = get0(a0_);\n  a0.v1 = get1(a0_);\n  a0.v2 = get2(a0_);\n  a0.v3 = get3(a0_);\n  ret = nsimd_sin_u10_cpu_f32(a0);\n  ret_ = set0(ret_, ret.v0);\n  ret_ = set1(ret_, ret.v1);\n  ret_ = set2(ret_, ret.v2);\n  ret_ = set3(ret_, ret.v3);\n  return ret_;\n}\n\nnsimd_vec_f32 xcosf_u1(nsimd_vec_f32 a0_) {\n  nsimd_vec_f32 ret_;\n  nsimd_cpu_vf32 a0, ret;\n  a0.v0 = get0(a0_);\n  a0.v1 = get1(a0_);\n  a0.v2 = get2(a0_);\n  a0.v3 = get3(a0_);\n  ret = nsimd_cos_u10_cpu_f32(a0);\n  ret_ = set0(ret_, ret.v0);\n  ret_ = set1(ret_, ret.v1);\n  ret_ = set2(ret_, ret.v2);\n  ret_ = set3(ret_, ret.v3);\n  return ret_;\n}\n\nnsimd_vec_f32 xtanf_u1(nsimd_vec_f32 a0_) {\n  nsimd_vec_f32 ret_;\n  nsimd_cpu_vf32 a0, ret;\n  a0.v0 = get0(a0_);\n  a0.v1 = get1(a0_);\n  a0.v2 = get2(a0_);\n  a0.v3 = get3(a0_);\n  ret = nsimd_tan_u10_cpu_f32(a0);\n  ret_ = set0(ret_, ret.v0);\n  ret_ = set1(ret_, ret.v1);\n  ret_ = set2(ret_, ret.v2);\n  ret_ = set3(ret_, ret.v3);\n  return ret_;\n}\n\nnsimd_vec_f32 
xasinf_u1(nsimd_vec_f32 a0_) {\n  nsimd_vec_f32 ret_;\n  nsimd_cpu_vf32 a0, ret;\n  a0.v0 = get0(a0_);\n  a0.v1 = get1(a0_);\n  a0.v2 = get2(a0_);\n  a0.v3 = get3(a0_);\n  ret = nsimd_asin_u10_cpu_f32(a0);\n  ret_ = set0(ret_, ret.v0);\n  ret_ = set1(ret_, ret.v1);\n  ret_ = set2(ret_, ret.v2);\n  ret_ = set3(ret_, ret.v3);\n  return ret_;\n}\n\nnsimd_vec_f32 xacosf_u1(nsimd_vec_f32 a0_) {\n  nsimd_vec_f32 ret_;\n  nsimd_cpu_vf32 a0, ret;\n  a0.v0 = get0(a0_);\n  a0.v1 = get1(a0_);\n  a0.v2 = get2(a0_);\n  a0.v3 = get3(a0_);\n  ret = nsimd_acos_u10_cpu_f32(a0);\n  ret_ = set0(ret_, ret.v0);\n  ret_ = set1(ret_, ret.v1);\n  ret_ = set2(ret_, ret.v2);\n  ret_ = set3(ret_, ret.v3);\n  return ret_;\n}\n\nnsimd_vec_f32 xatanf_u1(nsimd_vec_f32 a0_) {\n  nsimd_vec_f32 ret_;\n  nsimd_cpu_vf32 a0, ret;\n  a0.v0 = get0(a0_);\n  a0.v1 = get1(a0_);\n  a0.v2 = get2(a0_);\n  a0.v3 = get3(a0_);\n  ret = nsimd_atan_u10_cpu_f32(a0);\n  ret_ = set0(ret_, ret.v0);\n  ret_ = set1(ret_, ret.v1);\n  ret_ = set2(ret_, ret.v2);\n  ret_ = set3(ret_, ret.v3);\n  return ret_;\n}\n\nnsimd_vec_f32 xatan2f_u1(nsimd_vec_f32 a0_, nsimd_vec_f32 a1_) {\n  nsimd_vec_f32 ret_;\n  nsimd_cpu_vf32 a0, a1, ret;\n  a0.v0 = get0(a0_);\n  a0.v1 = get1(a0_);\n  a0.v2 = get2(a0_);\n  a0.v3 = get3(a0_);\n  a1.v0 = get0(a1_);\n  a1.v1 = get1(a1_);\n  a1.v2 = get2(a1_);\n  a1.v3 = get3(a1_);\n  ret = nsimd_atan2_u10_cpu_f32(a0, a1);\n  ret_ = set0(ret_, ret.v0);\n  ret_ = set1(ret_, ret.v1);\n  ret_ = set2(ret_, ret.v2);\n  ret_ = set3(ret_, ret.v3);\n  return ret_;\n}\n\nnsimd_vec_f32 xlogf_u1(nsimd_vec_f32 a0_) {\n  nsimd_vec_f32 ret_;\n  nsimd_cpu_vf32 a0, ret;\n  a0.v0 = get0(a0_);\n  a0.v1 = get1(a0_);\n  a0.v2 = get2(a0_);\n  a0.v3 = get3(a0_);\n  ret = nsimd_log_u10_cpu_f32(a0);\n  ret_ = set0(ret_, ret.v0);\n  ret_ = set1(ret_, ret.v1);\n  ret_ = set2(ret_, ret.v2);\n  ret_ = set3(ret_, ret.v3);\n  return ret_;\n}\n\nnsimd_vec_f32 xcbrtf_u1(nsimd_vec_f32 a0_) {\n  nsimd_vec_f32 ret_;\n  nsimd_cpu_vf32 
a0, ret;\n  a0.v0 = get0(a0_);\n  a0.v1 = get1(a0_);\n  a0.v2 = get2(a0_);\n  a0.v3 = get3(a0_);\n  ret = nsimd_cbrt_u10_cpu_f32(a0);\n  ret_ = set0(ret_, ret.v0);\n  ret_ = set1(ret_, ret.v1);\n  ret_ = set2(ret_, ret.v2);\n  ret_ = set3(ret_, ret.v3);\n  return ret_;\n}\n\nnsimd_vec_f32 xexpf(nsimd_vec_f32 a0_) {\n  nsimd_vec_f32 ret_;\n  nsimd_cpu_vf32 a0, ret;\n  a0.v0 = get0(a0_);\n  a0.v1 = get1(a0_);\n  a0.v2 = get2(a0_);\n  a0.v3 = get3(a0_);\n  ret = nsimd_exp_u10_cpu_f32(a0);\n  ret_ = set0(ret_, ret.v0);\n  ret_ = set1(ret_, ret.v1);\n  ret_ = set2(ret_, ret.v2);\n  ret_ = set3(ret_, ret.v3);\n  return ret_;\n}\n\nnsimd_vec_f32 xpowf(nsimd_vec_f32 a0_, nsimd_vec_f32 a1_) {\n  nsimd_vec_f32 ret_;\n  nsimd_cpu_vf32 a0, a1, ret;\n  a0.v0 = get0(a0_);\n  a0.v1 = get1(a0_);\n  a0.v2 = get2(a0_);\n  a0.v3 = get3(a0_);\n  a1.v0 = get0(a1_);\n  a1.v1 = get1(a1_);\n  a1.v2 = get2(a1_);\n  a1.v3 = get3(a1_);\n  ret = nsimd_pow_u10_cpu_f32(a0, a1);\n  ret_ = set0(ret_, ret.v0);\n  ret_ = set1(ret_, ret.v1);\n  ret_ = set2(ret_, ret.v2);\n  ret_ = set3(ret_, ret.v3);\n  return ret_;\n}\n\nnsimd_vec_f32 xsinhf(nsimd_vec_f32 a0_) {\n  nsimd_vec_f32 ret_;\n  nsimd_cpu_vf32 a0, ret;\n  a0.v0 = get0(a0_);\n  a0.v1 = get1(a0_);\n  a0.v2 = get2(a0_);\n  a0.v3 = get3(a0_);\n  ret = nsimd_sinh_u10_cpu_f32(a0);\n  ret_ = set0(ret_, ret.v0);\n  ret_ = set1(ret_, ret.v1);\n  ret_ = set2(ret_, ret.v2);\n  ret_ = set3(ret_, ret.v3);\n  return ret_;\n}\n\nnsimd_vec_f32 xcoshf(nsimd_vec_f32 a0_) {\n  nsimd_vec_f32 ret_;\n  nsimd_cpu_vf32 a0, ret;\n  a0.v0 = get0(a0_);\n  a0.v1 = get1(a0_);\n  a0.v2 = get2(a0_);\n  a0.v3 = get3(a0_);\n  ret = nsimd_cosh_u10_cpu_f32(a0);\n  ret_ = set0(ret_, ret.v0);\n  ret_ = set1(ret_, ret.v1);\n  ret_ = set2(ret_, ret.v2);\n  ret_ = set3(ret_, ret.v3);\n  return ret_;\n}\n\nnsimd_vec_f32 xtanhf(nsimd_vec_f32 a0_) {\n  nsimd_vec_f32 ret_;\n  nsimd_cpu_vf32 a0, ret;\n  a0.v0 = get0(a0_);\n  a0.v1 = get1(a0_);\n  a0.v2 = get2(a0_);\n  a0.v3 = 
get3(a0_);\n  ret = nsimd_tanh_u10_cpu_f32(a0);\n  ret_ = set0(ret_, ret.v0);\n  ret_ = set1(ret_, ret.v1);\n  ret_ = set2(ret_, ret.v2);\n  ret_ = set3(ret_, ret.v3);\n  return ret_;\n}\n\nnsimd_vec_f32 xsinhf_u35(nsimd_vec_f32 a0_) {\n  nsimd_vec_f32 ret_;\n  nsimd_cpu_vf32 a0, ret;\n  a0.v0 = get0(a0_);\n  a0.v1 = get1(a0_);\n  a0.v2 = get2(a0_);\n  a0.v3 = get3(a0_);\n  ret = nsimd_sinh_u35_cpu_f32(a0);\n  ret_ = set0(ret_, ret.v0);\n  ret_ = set1(ret_, ret.v1);\n  ret_ = set2(ret_, ret.v2);\n  ret_ = set3(ret_, ret.v3);\n  return ret_;\n}\n\nnsimd_vec_f32 xcoshf_u35(nsimd_vec_f32 a0_) {\n  nsimd_vec_f32 ret_;\n  nsimd_cpu_vf32 a0, ret;\n  a0.v0 = get0(a0_);\n  a0.v1 = get1(a0_);\n  a0.v2 = get2(a0_);\n  a0.v3 = get3(a0_);\n  ret = nsimd_cosh_u35_cpu_f32(a0);\n  ret_ = set0(ret_, ret.v0);\n  ret_ = set1(ret_, ret.v1);\n  ret_ = set2(ret_, ret.v2);\n  ret_ = set3(ret_, ret.v3);\n  return ret_;\n}\n\nnsimd_vec_f32 xtanhf_u35(nsimd_vec_f32 a0_) {\n  nsimd_vec_f32 ret_;\n  nsimd_cpu_vf32 a0, ret;\n  a0.v0 = get0(a0_);\n  a0.v1 = get1(a0_);\n  a0.v2 = get2(a0_);\n  a0.v3 = get3(a0_);\n  ret = nsimd_tanh_u35_cpu_f32(a0);\n  ret_ = set0(ret_, ret.v0);\n  ret_ = set1(ret_, ret.v1);\n  ret_ = set2(ret_, ret.v2);\n  ret_ = set3(ret_, ret.v3);\n  return ret_;\n}\n\nnsimd_vec_f32 xasinhf(nsimd_vec_f32 a0_) {\n  nsimd_vec_f32 ret_;\n  nsimd_cpu_vf32 a0, ret;\n  a0.v0 = get0(a0_);\n  a0.v1 = get1(a0_);\n  a0.v2 = get2(a0_);\n  a0.v3 = get3(a0_);\n  ret = nsimd_asinh_u10_cpu_f32(a0);\n  ret_ = set0(ret_, ret.v0);\n  ret_ = set1(ret_, ret.v1);\n  ret_ = set2(ret_, ret.v2);\n  ret_ = set3(ret_, ret.v3);\n  return ret_;\n}\n\nnsimd_vec_f32 xacoshf(nsimd_vec_f32 a0_) {\n  nsimd_vec_f32 ret_;\n  nsimd_cpu_vf32 a0, ret;\n  a0.v0 = get0(a0_);\n  a0.v1 = get1(a0_);\n  a0.v2 = get2(a0_);\n  a0.v3 = get3(a0_);\n  ret = nsimd_acosh_u10_cpu_f32(a0);\n  ret_ = set0(ret_, ret.v0);\n  ret_ = set1(ret_, ret.v1);\n  ret_ = set2(ret_, ret.v2);\n  ret_ = set3(ret_, ret.v3);\n  return 
ret_;\n}\n\nnsimd_vec_f32 xatanhf(nsimd_vec_f32 a0_) {\n  nsimd_vec_f32 ret_;\n  nsimd_cpu_vf32 a0, ret;\n  a0.v0 = get0(a0_);\n  a0.v1 = get1(a0_);\n  a0.v2 = get2(a0_);\n  a0.v3 = get3(a0_);\n  ret = nsimd_atanh_u10_cpu_f32(a0);\n  ret_ = set0(ret_, ret.v0);\n  ret_ = set1(ret_, ret.v1);\n  ret_ = set2(ret_, ret.v2);\n  ret_ = set3(ret_, ret.v3);\n  return ret_;\n}\n\nnsimd_vec_f32 xexp2f(nsimd_vec_f32 a0_) {\n  nsimd_vec_f32 ret_;\n  nsimd_cpu_vf32 a0, ret;\n  a0.v0 = get0(a0_);\n  a0.v1 = get1(a0_);\n  a0.v2 = get2(a0_);\n  a0.v3 = get3(a0_);\n  ret = nsimd_exp2_u10_cpu_f32(a0);\n  ret_ = set0(ret_, ret.v0);\n  ret_ = set1(ret_, ret.v1);\n  ret_ = set2(ret_, ret.v2);\n  ret_ = set3(ret_, ret.v3);\n  return ret_;\n}\n\nnsimd_vec_f32 xexp2f_u35(nsimd_vec_f32 a0_) {\n  nsimd_vec_f32 ret_;\n  nsimd_cpu_vf32 a0, ret;\n  a0.v0 = get0(a0_);\n  a0.v1 = get1(a0_);\n  a0.v2 = get2(a0_);\n  a0.v3 = get3(a0_);\n  ret = nsimd_exp2_u35_cpu_f32(a0);\n  ret_ = set0(ret_, ret.v0);\n  ret_ = set1(ret_, ret.v1);\n  ret_ = set2(ret_, ret.v2);\n  ret_ = set3(ret_, ret.v3);\n  return ret_;\n}\n\nnsimd_vec_f32 xexp10f(nsimd_vec_f32 a0_) {\n  nsimd_vec_f32 ret_;\n  nsimd_cpu_vf32 a0, ret;\n  a0.v0 = get0(a0_);\n  a0.v1 = get1(a0_);\n  a0.v2 = get2(a0_);\n  a0.v3 = get3(a0_);\n  ret = nsimd_exp10_u10_cpu_f32(a0);\n  ret_ = set0(ret_, ret.v0);\n  ret_ = set1(ret_, ret.v1);\n  ret_ = set2(ret_, ret.v2);\n  ret_ = set3(ret_, ret.v3);\n  return ret_;\n}\n\nnsimd_vec_f32 xexp10f_u35(nsimd_vec_f32 a0_) {\n  nsimd_vec_f32 ret_;\n  nsimd_cpu_vf32 a0, ret;\n  a0.v0 = get0(a0_);\n  a0.v1 = get1(a0_);\n  a0.v2 = get2(a0_);\n  a0.v3 = get3(a0_);\n  ret = nsimd_exp10_u35_cpu_f32(a0);\n  ret_ = set0(ret_, ret.v0);\n  ret_ = set1(ret_, ret.v1);\n  ret_ = set2(ret_, ret.v2);\n  ret_ = set3(ret_, ret.v3);\n  return ret_;\n}\n\nnsimd_vec_f32 xexpm1f(nsimd_vec_f32 a0_) {\n  nsimd_vec_f32 ret_;\n  nsimd_cpu_vf32 a0, ret;\n  a0.v0 = get0(a0_);\n  a0.v1 = get1(a0_);\n  a0.v2 = get2(a0_);\n  a0.v3 = 
get3(a0_);\n  ret = nsimd_expm1_u10_cpu_f32(a0);\n  ret_ = set0(ret_, ret.v0);\n  ret_ = set1(ret_, ret.v1);\n  ret_ = set2(ret_, ret.v2);\n  ret_ = set3(ret_, ret.v3);\n  return ret_;\n}\n\nnsimd_vec_f32 xlog10f(nsimd_vec_f32 a0_) {\n  nsimd_vec_f32 ret_;\n  nsimd_cpu_vf32 a0, ret;\n  a0.v0 = get0(a0_);\n  a0.v1 = get1(a0_);\n  a0.v2 = get2(a0_);\n  a0.v3 = get3(a0_);\n  ret = nsimd_log10_u10_cpu_f32(a0);\n  ret_ = set0(ret_, ret.v0);\n  ret_ = set1(ret_, ret.v1);\n  ret_ = set2(ret_, ret.v2);\n  ret_ = set3(ret_, ret.v3);\n  return ret_;\n}\n\nnsimd_vec_f32 xlog2f(nsimd_vec_f32 a0_) {\n  nsimd_vec_f32 ret_;\n  nsimd_cpu_vf32 a0, ret;\n  a0.v0 = get0(a0_);\n  a0.v1 = get1(a0_);\n  a0.v2 = get2(a0_);\n  a0.v3 = get3(a0_);\n  ret = nsimd_log2_u10_cpu_f32(a0);\n  ret_ = set0(ret_, ret.v0);\n  ret_ = set1(ret_, ret.v1);\n  ret_ = set2(ret_, ret.v2);\n  ret_ = set3(ret_, ret.v3);\n  return ret_;\n}\n\nnsimd_vec_f32 xlog2f_u35(nsimd_vec_f32 a0_) {\n  nsimd_vec_f32 ret_;\n  nsimd_cpu_vf32 a0, ret;\n  a0.v0 = get0(a0_);\n  a0.v1 = get1(a0_);\n  a0.v2 = get2(a0_);\n  a0.v3 = get3(a0_);\n  ret = nsimd_log2_u35_cpu_f32(a0);\n  ret_ = set0(ret_, ret.v0);\n  ret_ = set1(ret_, ret.v1);\n  ret_ = set2(ret_, ret.v2);\n  ret_ = set3(ret_, ret.v3);\n  return ret_;\n}\n\nnsimd_vec_f32 xlog1pf(nsimd_vec_f32 a0_) {\n  nsimd_vec_f32 ret_;\n  nsimd_cpu_vf32 a0, ret;\n  a0.v0 = get0(a0_);\n  a0.v1 = get1(a0_);\n  a0.v2 = get2(a0_);\n  a0.v3 = get3(a0_);\n  ret = nsimd_log1p_u10_cpu_f32(a0);\n  ret_ = set0(ret_, ret.v0);\n  ret_ = set1(ret_, ret.v1);\n  ret_ = set2(ret_, ret.v2);\n  ret_ = set3(ret_, ret.v3);\n  return ret_;\n}\n\nnsimd_vec_f32 xsinpif_u05(nsimd_vec_f32 a0_) {\n  nsimd_vec_f32 ret_;\n  nsimd_cpu_vf32 a0, ret;\n  a0.v0 = get0(a0_);\n  a0.v1 = get1(a0_);\n  a0.v2 = get2(a0_);\n  a0.v3 = get3(a0_);\n  ret = nsimd_sinpi_u05_cpu_f32(a0);\n  ret_ = set0(ret_, ret.v0);\n  ret_ = set1(ret_, ret.v1);\n  ret_ = set2(ret_, ret.v2);\n  ret_ = set3(ret_, ret.v3);\n  return 
ret_;\n}\n\nnsimd_vec_f32 xcospif_u05(nsimd_vec_f32 a0_) {\n  nsimd_vec_f32 ret_;\n  nsimd_cpu_vf32 a0, ret;\n  a0.v0 = get0(a0_);\n  a0.v1 = get1(a0_);\n  a0.v2 = get2(a0_);\n  a0.v3 = get3(a0_);\n  ret = nsimd_cospi_u05_cpu_f32(a0);\n  ret_ = set0(ret_, ret.v0);\n  ret_ = set1(ret_, ret.v1);\n  ret_ = set2(ret_, ret.v2);\n  ret_ = set3(ret_, ret.v3);\n  return ret_;\n}\n\nnsimd_vec_f32 xhypotf_u05(nsimd_vec_f32 a0_, nsimd_vec_f32 a1_) {\n  nsimd_vec_f32 ret_;\n  nsimd_cpu_vf32 a0, a1, ret;\n  a0.v0 = get0(a0_);\n  a0.v1 = get1(a0_);\n  a0.v2 = get2(a0_);\n  a0.v3 = get3(a0_);\n  a1.v0 = get0(a1_);\n  a1.v1 = get1(a1_);\n  a1.v2 = get2(a1_);\n  a1.v3 = get3(a1_);\n  ret = nsimd_hypot_u05_cpu_f32(a0, a1);\n  ret_ = set0(ret_, ret.v0);\n  ret_ = set1(ret_, ret.v1);\n  ret_ = set2(ret_, ret.v2);\n  ret_ = set3(ret_, ret.v3);\n  return ret_;\n}\n\nnsimd_vec_f32 xhypotf_u35(nsimd_vec_f32 a0_, nsimd_vec_f32 a1_) {\n  nsimd_vec_f32 ret_;\n  nsimd_cpu_vf32 a0, a1, ret;\n  a0.v0 = get0(a0_);\n  a0.v1 = get1(a0_);\n  a0.v2 = get2(a0_);\n  a0.v3 = get3(a0_);\n  a1.v0 = get0(a1_);\n  a1.v1 = get1(a1_);\n  a1.v2 = get2(a1_);\n  a1.v3 = get3(a1_);\n  ret = nsimd_hypot_u35_cpu_f32(a0, a1);\n  ret_ = set0(ret_, ret.v0);\n  ret_ = set1(ret_, ret.v1);\n  ret_ = set2(ret_, ret.v2);\n  ret_ = set3(ret_, ret.v3);\n  return ret_;\n}\n\nnsimd_vec_f32 xfmodf(nsimd_vec_f32 a0_, nsimd_vec_f32 a1_) {\n  nsimd_vec_f32 ret_;\n  nsimd_cpu_vf32 a0, a1, ret;\n  a0.v0 = get0(a0_);\n  a0.v1 = get1(a0_);\n  a0.v2 = get2(a0_);\n  a0.v3 = get3(a0_);\n  a1.v0 = get0(a1_);\n  a1.v1 = get1(a1_);\n  a1.v2 = get2(a1_);\n  a1.v3 = get3(a1_);\n  ret = nsimd_fmod_cpu_f32(a0, a1);\n  ret_ = set0(ret_, ret.v0);\n  ret_ = set1(ret_, ret.v1);\n  ret_ = set2(ret_, ret.v2);\n  ret_ = set3(ret_, ret.v3);\n  return ret_;\n}\n\nnsimd_vec_f32 xremainderf(nsimd_vec_f32 a0_, nsimd_vec_f32 a1_) {\n  nsimd_vec_f32 ret_;\n  nsimd_cpu_vf32 a0, a1, ret;\n  a0.v0 = get0(a0_);\n  a0.v1 = get1(a0_);\n  a0.v2 = get2(a0_);\n  
a0.v3 = get3(a0_);\n  a1.v0 = get0(a1_);\n  a1.v1 = get1(a1_);\n  a1.v2 = get2(a1_);\n  a1.v3 = get3(a1_);\n  ret = nsimd_remainder_cpu_f32(a0, a1);\n  ret_ = set0(ret_, ret.v0);\n  ret_ = set1(ret_, ret.v1);\n  ret_ = set2(ret_, ret.v2);\n  ret_ = set3(ret_, ret.v3);\n  return ret_;\n}\n\nnsimd_vec_f32 xlgammaf_u1(nsimd_vec_f32 a0_) {\n  nsimd_vec_f32 ret_;\n  nsimd_cpu_vf32 a0, ret;\n  a0.v0 = get0(a0_);\n  a0.v1 = get1(a0_);\n  a0.v2 = get2(a0_);\n  a0.v3 = get3(a0_);\n  ret = nsimd_lgamma_u10_cpu_f32(a0);\n  ret_ = set0(ret_, ret.v0);\n  ret_ = set1(ret_, ret.v1);\n  ret_ = set2(ret_, ret.v2);\n  ret_ = set3(ret_, ret.v3);\n  return ret_;\n}\n\nnsimd_vec_f32 xtgammaf_u1(nsimd_vec_f32 a0_) {\n  nsimd_vec_f32 ret_;\n  nsimd_cpu_vf32 a0, ret;\n  a0.v0 = get0(a0_);\n  a0.v1 = get1(a0_);\n  a0.v2 = get2(a0_);\n  a0.v3 = get3(a0_);\n  ret = nsimd_tgamma_u10_cpu_f32(a0);\n  ret_ = set0(ret_, ret.v0);\n  ret_ = set1(ret_, ret.v1);\n  ret_ = set2(ret_, ret.v2);\n  ret_ = set3(ret_, ret.v3);\n  return ret_;\n}\n\nnsimd_vec_f32 xerff_u1(nsimd_vec_f32 a0_) {\n  nsimd_vec_f32 ret_;\n  nsimd_cpu_vf32 a0, ret;\n  a0.v0 = get0(a0_);\n  a0.v1 = get1(a0_);\n  a0.v2 = get2(a0_);\n  a0.v3 = get3(a0_);\n  ret = nsimd_erf_u10_cpu_f32(a0);\n  ret_ = set0(ret_, ret.v0);\n  ret_ = set1(ret_, ret.v1);\n  ret_ = set2(ret_, ret.v2);\n  ret_ = set3(ret_, ret.v3);\n  return ret_;\n}\n\nnsimd_vec_f32 xerfcf_u15(nsimd_vec_f32 a0_) {\n  nsimd_vec_f32 ret_;\n  nsimd_cpu_vf32 a0, ret;\n  a0.v0 = get0(a0_);\n  a0.v1 = get1(a0_);\n  a0.v2 = get2(a0_);\n  a0.v3 = get3(a0_);\n  ret = nsimd_erfc_u15_cpu_f32(a0);\n  ret_ = set0(ret_, ret.v0);\n  ret_ = set1(ret_, ret.v1);\n  ret_ = set2(ret_, ret.v2);\n  ret_ = set3(ret_, ret.v3);\n  return ret_;\n}\n\n"
  },
  {
    "path": "src/sleefsp.c",
    "content": "//   Copyright Naoki Shibata and contributors 2010 - 2020.\n// Distributed under the Boost Software License, Version 1.0.\n//    (See accompanying file LICENSE.txt or copy at\n//          http://www.boost.org/LICENSE_1_0.txt)\n\n// Always use -ffp-contract=off option to compile SLEEF.\n\n#include <stdio.h>\n#include <assert.h>\n#include <stdint.h>\n#include <limits.h>\n#include <float.h>\n\n#ifndef ENABLE_BUILTIN_MATH\n#include <math.h>\n#define SQRTF sqrtf\n#else\n#define SQRTF __builtin_sqrtf\n#endif\n\n#include \"misc.h\"\n\nextern const float Sleef_rempitabsp[];\n\n#ifdef DORENAME\n#include \"rename.h\"\n#endif\n\n#if (defined(_MSC_VER))\n#pragma fp_contract (off)\n#endif\n\n#define MLA mlaf\n#define C2V(x) (x)\n#include \"estrin.h\"\n\nstatic INLINE CONST int32_t floatToRawIntBits(float d) {\n  union {\n    float f;\n    int32_t i;\n  } tmp;\n  tmp.f = d;\n  return tmp.i;\n}\n\nstatic INLINE CONST float intBitsToFloat(int32_t i) {\n  union {\n    float f;\n    int32_t i;\n  } tmp;\n  tmp.i = i;\n  return tmp.f;\n}\n\nstatic INLINE CONST float fabsfk(float x) {\n  return intBitsToFloat(0x7fffffffL & floatToRawIntBits(x));\n}\n\nstatic INLINE CONST float mulsignf(float x, float y) {\n  return intBitsToFloat(floatToRawIntBits(x) ^ (floatToRawIntBits(y) & (1 << 31)));\n}\n\nstatic INLINE CONST float copysignfk(float x, float y) {\n  return intBitsToFloat((floatToRawIntBits(x) & ~(1 << 31)) ^ (floatToRawIntBits(y) & (1 << 31)));\n}\n\nstatic INLINE CONST float signf(float d) { return mulsignf(1, d); }\nstatic INLINE CONST float mlaf(float x, float y, float z) { return x * y + z; }\nstatic INLINE CONST float rintfk(float x) { return x < 0 ? (int)(x - 0.5f) : (int)(x + 0.5f); }\nstatic INLINE CONST int ceilfk(float x) { return (int)x + (x < 0 ? 0 : 1); }\nstatic INLINE CONST float fminfk(float x, float y) { return x < y ? x : y; }\nstatic INLINE CONST float fmaxfk(float x, float y) { return x > y ? 
x : y; }\nstatic INLINE CONST int xisintf(float x) { return (x == (int)x); }\n\nstatic INLINE CONST int xisnanf(float x) { return x != x; }\nstatic INLINE CONST int xisinff(float x) { return x == SLEEF_INFINITYf || x == -SLEEF_INFINITYf; }\nstatic INLINE CONST int xisminff(float x) { return x == -SLEEF_INFINITYf; }\nstatic INLINE CONST int xispinff(float x) { return x == SLEEF_INFINITYf; }\nstatic INLINE CONST int xisnegzerof(float x) { return floatToRawIntBits(x) == floatToRawIntBits(-0.0); }\nstatic INLINE CONST int xisnumberf(float x) { return !xisinff(x) && !xisnanf(x); }\n\nstatic INLINE CONST int ilogbkf(float d) {\n  int m = d < 5.421010862427522E-20f;\n  d = m ? 1.8446744073709552E19f * d : d;\n  int q = (floatToRawIntBits(d) >> 23) & 0xff;\n  q = m ? q - (64 + 0x7f) : q - 0x7f;\n  return q;\n}\n\n// vilogb2kf is similar to ilogbkf, but the argument has to be a\n// normalized FP value.\nstatic INLINE CONST int ilogb2kf(float d) {\n  return ((floatToRawIntBits(d) >> 23) & 0xff) - 0x7f;\n}\n\nEXPORT CONST int xilogbf(float d) {\n  int e = ilogbkf(fabsfk(d));\n  e = d == 0.0f  ? SLEEF_FP_ILOGB0 : e;\n  e = xisnanf(d) ? SLEEF_FP_ILOGBNAN : e;\n  e = xisinff(d) ? INT_MAX : e;\n  return e;\n}\n\nstatic INLINE CONST float pow2if(int q) {\n  return intBitsToFloat(((int32_t)(q + 0x7f)) << 23);\n}\n\nstatic INLINE CONST float ldexpkf(float x, int q) {\n  float u;\n  int m;\n  m = q >> 31;\n  m = (((m + q) >> 6) - m) << 4;\n  q = q - (m << 2);\n  m += 127;\n  m = m <   0 ?   0 : m;\n  m = m > 255 ? 
255 : m;\n  u = intBitsToFloat(((int32_t)m) << 23);\n  x = x * u * u * u * u;\n  u = intBitsToFloat(((int32_t)(q + 0x7f)) << 23);\n  return x * u;\n}\n\nstatic INLINE CONST float ldexp2kf(float d, int e) { // faster than ldexpkf, short reach\n  return d * pow2if(e >> 1) * pow2if(e - (e >> 1));\n}\n\nstatic INLINE CONST float ldexp3kf(float d, int e) { // very fast, no denormal\n  return intBitsToFloat(floatToRawIntBits(d) + (e << 23));\n}\n\n//\n\n#ifndef NDEBUG\nstatic int checkfp(float x) {\n  if (xisinff(x) || xisnanf(x)) return 1;\n  return 0;\n}\n#endif\n\nstatic INLINE CONST float upperf(float d) {\n  return intBitsToFloat(floatToRawIntBits(d) & 0xfffff000);\n}\n\nstatic INLINE CONST Sleef_float2 df(float h, float l) {\n  Sleef_float2 ret;\n  ret.x = h; ret.y = l;\n  return ret;\n}\n\nstatic INLINE CONST Sleef_float2 dfx(double d) {\n  Sleef_float2 ret;\n  ret.x = d; ret.y = d - ret.x;\n  return ret;\n}\n\nstatic INLINE CONST Sleef_float2 dfnormalize_f2_f2(Sleef_float2 t) {\n  Sleef_float2 s;\n\n  s.x = t.x + t.y;\n  s.y = t.x - s.x + t.y;\n\n  return s;\n}\n\nstatic INLINE CONST Sleef_float2 dfscale_f2_f2_f(Sleef_float2 d, float s) {\n  Sleef_float2 r;\n\n  r.x = d.x * s;\n  r.y = d.y * s;\n\n  return r;\n}\n\nstatic INLINE CONST Sleef_float2 dfneg_f2_f2(Sleef_float2 d) {\n  Sleef_float2 r;\n\n  r.x = -d.x;\n  r.y = -d.y;\n\n  return r;\n}\n\nstatic INLINE CONST Sleef_float2 dfabs_f2_f2(Sleef_float2 x) {\n  return df(x.x < 0 ? -x.x : x.x, x.x < 0 ? 
-x.y : x.y);\n}\n\nstatic INLINE CONST Sleef_float2 dfadd_f2_f_f(float x, float y) {\n  // |x| >= |y|\n\n  Sleef_float2 r;\n\n#ifndef NDEBUG\n  if (!(checkfp(x) || checkfp(y) || fabsfk(x) >= fabsfk(y))) fprintf(stderr, \"[dfadd_f2_f_f : %g, %g]\", x, y);\n#endif\n\n  r.x = x + y;\n  r.y = x - r.x + y;\n\n  return r;\n}\n\nstatic INLINE CONST Sleef_float2 dfadd2_f2_f_f(float x, float y) {\n  Sleef_float2 r;\n\n  r.x = x + y;\n  float v = r.x - x;\n  r.y = (x - (r.x - v)) + (y - v);\n\n  return r;\n}\n\nstatic INLINE CONST Sleef_float2 dfadd_f2_f2_f(Sleef_float2 x, float y) {\n  // |x| >= |y|\n\n  Sleef_float2 r;\n\n#ifndef NDEBUG\n  if (!(checkfp(x.x) || checkfp(y) || fabsfk(x.x) >= fabsfk(y))) fprintf(stderr, \"[dfadd_f2_f2_f : %g %g]\", x.x, y);\n#endif\n\n  r.x = x.x + y;\n  r.y = x.x - r.x + y + x.y;\n\n  return r;\n}\n\nstatic INLINE CONST Sleef_float2 dfadd_f2_f_f2(float x, Sleef_float2 y) {\n  // |x| >= |y|\n\n  Sleef_float2 r;\n\n#ifndef NDEBUG\n  if (!(checkfp(x) || checkfp(y.x) || fabsfk(x) >= fabsfk(y.x))) {\n    fprintf(stderr, \"[dfadd_f2_f_f2 : %g %g]\\n\", x, y.x);\n    fflush(stderr);\n  }\n#endif\n\n  r.x = x + y.x;\n  r.y = x - r.x + y.x + y.y;\n\n  return r;\n}\n\nstatic INLINE CONST Sleef_float2 dfadd2_f2_f2_f(Sleef_float2 x, float y) {\n  // |x| >= |y|\n\n  Sleef_float2 r;\n\n  r.x  = x.x + y;\n  float v = r.x - x.x;\n  r.y = (x.x - (r.x - v)) + (y - v);\n  r.y += x.y;\n\n  return r;\n}\n\nstatic INLINE CONST Sleef_float2 dfadd2_f2_f_f2(float x, Sleef_float2 y) {\n  Sleef_float2 r;\n\n  r.x  = x + y.x;\n  float v = r.x - x;\n  r.y = (x - (r.x - v)) + (y.x - v) + y.y;\n\n  return r;\n}\n\nstatic INLINE CONST Sleef_float2 dfadd_f2_f2_f2(Sleef_float2 x, Sleef_float2 y) {\n  // |x| >= |y|\n\n  Sleef_float2 r;\n\n#ifndef NDEBUG\n  if (!(checkfp(x.x) || checkfp(y.x) || fabsfk(x.x) >= fabsfk(y.x))) fprintf(stderr, \"[dfadd_f2_f2_f2 : %g %g]\", x.x, y.x);\n#endif\n\n  r.x = x.x + y.x;\n  r.y = x.x - r.x + y.x + x.y + y.y;\n\n  return r;\n}\n\nstatic 
INLINE CONST Sleef_float2 dfadd2_f2_f2_f2(Sleef_float2 x, Sleef_float2 y) {\n  Sleef_float2 r;\n\n  r.x  = x.x + y.x;\n  float v = r.x - x.x;\n  r.y = (x.x - (r.x - v)) + (y.x - v);\n  r.y += x.y + y.y;\n\n  return r;\n}\n\nstatic INLINE CONST Sleef_float2 dfsub_f2_f2_f2(Sleef_float2 x, Sleef_float2 y) {\n  // |x| >= |y|\n\n  Sleef_float2 r;\n\n#ifndef NDEBUG\n  if (!(checkfp(x.x) || checkfp(y.x) || fabsfk(x.x) >= fabsfk(y.x))) fprintf(stderr, \"[dfsub_f2_f2_f2 : %g %g]\", x.x, y.x);\n#endif\n\n  r.x = x.x - y.x;\n  r.y = x.x - r.x - y.x + x.y - y.y;\n\n  return r;\n}\n\nstatic INLINE CONST Sleef_float2 dfdiv_f2_f2_f2(Sleef_float2 n, Sleef_float2 d) {\n  float t = 1.0f / d.x;\n  float dh  = upperf(d.x), dl  = d.x - dh;\n  float th  = upperf(t  ), tl  = t   - th;\n  float nhh = upperf(n.x), nhl = n.x - nhh;\n\n  Sleef_float2 q;\n\n  q.x = n.x * t;\n\n  float u = -q.x + nhh * th + nhh * tl + nhl * th + nhl * tl +\n    q.x * (1 - dh * th - dh * tl - dl * th - dl * tl);\n\n  q.y = t * (n.y - q.x * d.y) + u;\n\n  return q;\n}\n\nstatic INLINE CONST Sleef_float2 dfmul_f2_f_f(float x, float y) {\n  float xh = upperf(x), xl = x - xh;\n  float yh = upperf(y), yl = y - yh;\n  Sleef_float2 r;\n\n  r.x = x * y;\n  r.y = xh * yh - r.x + xl * yh + xh * yl + xl * yl;\n\n  return r;\n}\n\nstatic INLINE CONST Sleef_float2 dfmul_f2_f2_f(Sleef_float2 x, float y) {\n  float xh = upperf(x.x), xl = x.x - xh;\n  float yh = upperf(y  ), yl = y   - yh;\n  Sleef_float2 r;\n\n  r.x = x.x * y;\n  r.y = xh * yh - r.x + xl * yh + xh * yl + xl * yl + x.y * y;\n\n  return r;\n}\n\nstatic INLINE CONST Sleef_float2 dfmul_f2_f2_f2(Sleef_float2 x, Sleef_float2 y) {\n  float xh = upperf(x.x), xl = x.x - xh;\n  float yh = upperf(y.x), yl = y.x - yh;\n  Sleef_float2 r;\n\n  r.x = x.x * y.x;\n  r.y = xh * yh - r.x + xl * yh + xh * yl + xl * yl + x.x * y.y + x.y * y.x;\n\n  return r;\n}\n\nstatic INLINE CONST float dfmul_f_f2_f2(Sleef_float2 x, Sleef_float2 y) {\n  float xh = upperf(x.x), xl = x.x - xh;\n 
 float yh = upperf(y.x), yl = y.x - yh;\n\n  return x.y * yh + xh * y.y + xl * yl + xh * yl + xl * yh + xh * yh;\n}\n\nstatic INLINE CONST Sleef_float2 dfsqu_f2_f2(Sleef_float2 x) {\n  float xh = upperf(x.x), xl = x.x - xh;\n  Sleef_float2 r;\n\n  r.x = x.x * x.x;\n  r.y = xh * xh - r.x + (xh + xh) * xl + xl * xl + x.x * (x.y + x.y);\n\n  return r;\n}\n\nstatic INLINE CONST float dfsqu_f_f2(Sleef_float2 x) {\n  float xh = upperf(x.x), xl = x.x - xh;\n\n  return xh * x.y + xh * x.y + xl * xl + (xh * xl + xh * xl) + xh * xh;\n}\n\nstatic INLINE CONST Sleef_float2 dfrec_f2_f(float d) {\n  float t = 1.0f / d;\n  float dh = upperf(d), dl = d - dh;\n  float th = upperf(t), tl = t - th;\n  Sleef_float2 q;\n\n  q.x = t;\n  q.y = t * (1 - dh * th - dh * tl - dl * th - dl * tl);\n\n  return q;\n}\n\nstatic INLINE CONST Sleef_float2 dfrec_f2_f2(Sleef_float2 d) {\n  float t = 1.0f / d.x;\n  float dh = upperf(d.x), dl = d.x - dh;\n  float th = upperf(t  ), tl = t   - th;\n  Sleef_float2 q;\n\n  q.x = t;\n  q.y = t * (1 - dh * th - dh * tl - dl * th - dl * tl - d.y * t);\n\n  return q;\n}\n\nstatic INLINE CONST Sleef_float2 dfsqrt_f2_f2(Sleef_float2 d) {\n  float t = SQRTF(d.x + d.y);\n  return dfscale_f2_f2_f(dfmul_f2_f2_f2(dfadd2_f2_f2_f2(d, dfmul_f2_f_f(t, t)), dfrec_f2_f(t)), 0.5f);\n}\n\nstatic INLINE CONST Sleef_float2 dfsqrt_f2_f(float d) {\n  float t = SQRTF(d);\n  return dfscale_f2_f2_f(dfmul_f2_f2_f2(dfadd2_f2_f_f2(d, dfmul_f2_f_f(t, t)), dfrec_f2_f(t)), 0.5);\n}\n\n//\n\ntypedef struct {\n  float d;\n  int32_t i;\n} fi_t;\n\ntypedef struct {\n  Sleef_float2 df;\n  int32_t i;\n} dfi_t;\n\nstatic CONST fi_t rempisubf(float x) {\n  fi_t ret;\n  float fr = x - (float)(INT64_C(1) << 10) * (int32_t)(x * (1.0f / (INT64_C(1) << 10)));\n  ret.i = ((7 & ((x > 0 ? 4 : 3) + (int32_t)(fr * 8))) - 3) >> 1;\n  fr = fr - 0.25f * (int32_t)(fr * 4 + mulsignf(0.5f, x));\n  fr = fabsfk(fr) > 0.125f ? (fr - mulsignf(0.5f, x)) : fr;\n  fr = fabsfk(fr) > 1e+10f ? 
0 : fr;\n  if (fabsfk(x) == 0.12499999254941940308f) { fr = x; ret.i = 0; }\n  ret.d = fr;\n  return ret;\n}\n\nstatic CONST dfi_t rempif(float a) {\n  Sleef_float2 x, y, z;\n  fi_t di;\n  float t;\n  int ex = ilogb2kf(a) - 25, q = ex > (90 - 25) ? -64 : 0;\n  a = ldexp3kf(a, q);\n  if (ex < 0) ex = 0;\n  ex *= 4;\n  x = dfmul_f2_f_f(a, Sleef_rempitabsp[ex]);\n  di = rempisubf(x.x);\n  q = di.i;\n  x.x = di.d;\n  x = dfnormalize_f2_f2(x);\n  y = dfmul_f2_f_f(a, Sleef_rempitabsp[ex+1]);\n  x = dfadd2_f2_f2_f2(x, y);\n  di = rempisubf(x.x);\n  q += di.i;\n  x.x = di.d;\n  x = dfnormalize_f2_f2(x);\n  y = dfmul_f2_f2_f(df(Sleef_rempitabsp[ex+2], Sleef_rempitabsp[ex+3]), a);\n  x = dfadd2_f2_f2_f2(x, y);\n  x = dfnormalize_f2_f2(x);\n  x = dfmul_f2_f2_f2(x, df(3.1415927410125732422f*2, -8.7422776573475857731e-08f*2));\n  dfi_t ret = { fabsfk(a) < 0.7f ? df(a, 0) : x, q };\n  return ret;\n}\n\nEXPORT CONST float xsinf(float d) {\n  int q;\n  float u, s, t = d;\n\n  if (fabsfk(d) < TRIGRANGEMAX2f) {\n    q = (int)rintfk(d * (float)M_1_PI);\n    d = mlaf(q, -PI_A2f, d);\n    d = mlaf(q, -PI_B2f, d);\n    d = mlaf(q, -PI_C2f, d);\n  } else if (fabsfk(d) < TRIGRANGEMAXf) {\n    q = (int)rintfk(d * (float)M_1_PI);\n    d = mlaf(q, -PI_Af, d);\n    d = mlaf(q, -PI_Bf, d);\n    d = mlaf(q, -PI_Cf, d);\n    d = mlaf(q, -PI_Df, d);\n  } else {\n    dfi_t dfi = rempif(t);\n    q = ((dfi.i & 3) * 2 + (dfi.df.x > 0) + 1) >> 2;\n    if ((dfi.i & 1) != 0) {\n      dfi.df = dfadd2_f2_f2_f2(dfi.df, df(mulsignf(3.1415927410125732422f*-0.5, dfi.df.x),\n\t\t\t\t\t  mulsignf(-8.7422776573475857731e-08f*-0.5, dfi.df.x)));\n    }\n    d = dfi.df.x + dfi.df.y;\n    if (xisinff(t) || xisnanf(t)) d = SLEEF_NANf;\n  }\n\n  s = d * d;\n\n  if ((q & 1) != 0) d = -d;\n\n  u = 2.6083159809786593541503e-06f;\n  u = mlaf(u, s, -0.0001981069071916863322258f);\n  u = mlaf(u, s, 0.00833307858556509017944336f);\n  u = mlaf(u, s, -0.166666597127914428710938f);\n\n  u = mlaf(s, u * d, d);\n\n  if 
(xisnegzerof(t)) u = -0.0f;\n\n  return u;\n}\n\nEXPORT CONST float xsinf_u1(float d) {\n  int q;\n  float u;\n  Sleef_float2 s, t, x;\n\n  if (fabsfk(d) < TRIGRANGEMAX2f) {\n    q = (int)rintfk(d * (float)M_1_PI);\n    u = mlaf(q, -PI_A2f, d);\n    s = dfadd2_f2_f_f(u, q * (-PI_B2f));\n    s = dfadd_f2_f2_f(s, q * (-PI_C2f));\n  } else {\n    dfi_t dfi = rempif(d);\n    q = ((dfi.i & 3) * 2 + (dfi.df.x > 0) + 1) >> 2;\n    if ((dfi.i & 1) != 0) {\n      dfi.df = dfadd2_f2_f2_f2(dfi.df, df(mulsignf(3.1415927410125732422f*-0.5, dfi.df.x),\n\t\t\t\t\t  mulsignf(-8.7422776573475857731e-08f*-0.5, dfi.df.x)));\n    }\n    s = dfnormalize_f2_f2(dfi.df);\n    if (xisinff(d) || xisnanf(d)) s.x = SLEEF_NANf;\n  }\n  \n  t = s;\n  s = dfsqu_f2_f2(s);\n\n  u = 2.6083159809786593541503e-06f;\n  u = mlaf(u, s.x, -0.0001981069071916863322258f);\n  u = mlaf(u, s.x, 0.00833307858556509017944336f);\n\n  x = dfadd_f2_f_f2(1, dfmul_f2_f2_f2(dfadd_f2_f_f(-0.166666597127914428710938f, u * s.x), s));\n\n  u = dfmul_f_f2_f2(t, x);\n\n  if ((q & 1) != 0) u = -u;\n  if (xisnegzerof(d)) u = d;\n\n  return u;\n}\n\nEXPORT CONST float xcosf(float d) {\n  int q;\n  float u, s, t = d;\n\n  if (fabsfk(d) < TRIGRANGEMAX2f) {\n    q = 1 + 2*(int)rintfk(d * (float)M_1_PI - 0.5f);\n    d = mlaf(q, -PI_A2f*0.5f, d);\n    d = mlaf(q, -PI_B2f*0.5f, d);\n    d = mlaf(q, -PI_C2f*0.5f, d);\n  } else if (fabsfk(d) < TRIGRANGEMAXf) {\n    q = 1 + 2*(int)rintfk(d * (float)M_1_PI - 0.5f);\n    d = mlaf(q, -PI_Af*0.5f, d);\n    d = mlaf(q, -PI_Bf*0.5f, d);\n    d = mlaf(q, -PI_Cf*0.5f, d);\n    d = mlaf(q, -PI_Df*0.5f, d);\n  } else {\n    dfi_t dfi = rempif(t);\n    q = ((dfi.i & 3) * 2 + (dfi.df.x > 0) + 7) >> 1;\n    if ((dfi.i & 1) == 0) {\n      dfi.df = dfadd2_f2_f2_f2(dfi.df, df(mulsignf(3.1415927410125732422f*-0.5, dfi.df.x > 0 ? 1 : -1),\n\t\t\t\t\t  mulsignf(-8.7422776573475857731e-08f*-0.5, dfi.df.x > 0 ? 
1 : -1)));\n    }\n    d = dfi.df.x + dfi.df.y;\n    if (xisinff(t) || xisnanf(t)) d = SLEEF_NANf;\n  }\n\n  s = d * d;\n\n  if ((q & 2) == 0) d = -d;\n\n  u = 2.6083159809786593541503e-06f;\n  u = mlaf(u, s, -0.0001981069071916863322258f);\n  u = mlaf(u, s, 0.00833307858556509017944336f);\n  u = mlaf(u, s, -0.166666597127914428710938f);\n\n  u = mlaf(s, u * d, d);\n  \n  return u;\n}\n\nEXPORT CONST float xcosf_u1(float d) {\n  float u;\n  Sleef_float2 s, t, x;\n  int q;\n\n  if (fabsfk(d) < TRIGRANGEMAX2f) {\n    d = fabsfk(d);\n    float dq = mlaf(rintfk(d * (float)M_1_PI - 0.5f), 2, 1);\n    q = (int)dq;\n    s = dfadd2_f2_f_f (d, dq * (-PI_A2f*0.5f));\n    s = dfadd2_f2_f2_f(s, dq * (-PI_B2f*0.5f));\n    s = dfadd2_f2_f2_f(s, dq * (-PI_C2f*0.5f));\n  } else {\n    dfi_t dfi = rempif(d);\n    q = ((dfi.i & 3) * 2 + (dfi.df.x > 0) + 7) >> 1;\n    if ((dfi.i & 1) == 0) {\n      dfi.df = dfadd2_f2_f2_f2(dfi.df, df(mulsignf(3.1415927410125732422f*-0.5, dfi.df.x > 0 ? 1 : -1),\n\t\t\t\t\t  mulsignf(-8.7422776573475857731e-08f*-0.5, dfi.df.x > 0 ? 
1 : -1)));\n    }\n    s = dfnormalize_f2_f2(dfi.df);\n    if (xisinff(d) || xisnanf(d)) s.x = SLEEF_NANf;\n  }\n  \n  t = s;\n  s = dfsqu_f2_f2(s);\n\n  u = 2.6083159809786593541503e-06f;\n  u = mlaf(u, s.x, -0.0001981069071916863322258f);\n  u = mlaf(u, s.x, 0.00833307858556509017944336f);\n\n  x = dfadd_f2_f_f2(1, dfmul_f2_f2_f2(dfadd_f2_f_f(-0.166666597127914428710938f, u * s.x), s));\n\n  u = dfmul_f_f2_f2(t, x);\n\n  if ((((int)q) & 2) == 0) u = -u;\n\n  return u;\n}\n\nEXPORT CONST float xfastsinf_u3500(float d) {\n  int q;\n  float u, s, t = d;\n\n  q = rintfk(d * (float)M_1_PI);\n  d = mlaf(q, -(float)M_PI, d);\n\n  s = d * d;\n\n  u = -0.1881748176e-3;\n  u = mlaf(u, s, +0.8323502727e-2);\n  u = mlaf(u, s, -0.1666651368e+0);\n  u = mlaf(s * d, u, d);\n\n  if ((q & 1) != 0) u = -u;\n\n  if (UNLIKELY(fabsfk(t) > 30.0f)) return xsinf(t);\n\n  return u;\n}\n\nEXPORT CONST float xfastcosf_u3500(float d) {\n  int q;\n  float u, s, t = d;\n\n  q = rintfk(mlaf(d, (float)M_1_PI, -0.5f));\n  d = mlaf(q, -(float)M_PI, d - (float)M_PI*0.5f);\n\n  s = d * d;\n\n  u = -0.1881748176e-3;\n  u = mlaf(u, s, +0.8323502727e-2);\n  u = mlaf(u, s, -0.1666651368e+0);\n  u = mlaf(s * d, u, d);\n\n  if ((q & 1) == 0) u = -u;\n\n  if (UNLIKELY(fabsfk(t) > 30.0f)) return xcosf(t);\n\n  return u;\n}\n\nEXPORT CONST Sleef_float2 xsincosf(float d) {\n  int q;\n  float u, s, t;\n  Sleef_float2 r;\n\n  s = d;\n\n  if (fabsfk(d) < TRIGRANGEMAX2f) {\n    q = (int)rintfk(d * ((float)(2 * M_1_PI)));\n    s = mlaf(q, -PI_A2f*0.5f, s);\n    s = mlaf(q, -PI_B2f*0.5f, s);\n    s = mlaf(q, -PI_C2f*0.5f, s);\n  } else if (fabsfk(d) < TRIGRANGEMAXf) {\n    q = (int)rintfk(d * ((float)(2 * M_1_PI)));\n    s = mlaf(q, -PI_Af*0.5f, s);\n    s = mlaf(q, -PI_Bf*0.5f, s);\n    s = mlaf(q, -PI_Cf*0.5f, s);\n    s = mlaf(q, -PI_Df*0.5f, s);\n  } else {\n    dfi_t dfi = rempif(d);\n    q = dfi.i;\n    s = dfi.df.x + dfi.df.y;\n    if (xisinff(d) || xisnanf(d)) s = SLEEF_NANf;\n  }\n\n  t = s;\n\n  s = s * 
s;\n\n  u = -0.000195169282960705459117889f;\n  u = mlaf(u, s, 0.00833215750753879547119141f);\n  u = mlaf(u, s, -0.166666537523269653320312f);\n  u = u * s * t;\n\n  r.x = t + u;\n\n  if (xisnegzerof(d)) r.x = -0.0f;\n  \n  u = -2.71811842367242206819355e-07f;\n  u = mlaf(u, s, 2.47990446951007470488548e-05f);\n  u = mlaf(u, s, -0.00138888787478208541870117f);\n  u = mlaf(u, s, 0.0416666641831398010253906f);\n  u = mlaf(u, s, -0.5f);\n\n  r.y = u * s + 1;\n\n  if ((q & 1) != 0) { s = r.y; r.y = r.x; r.x = s; }\n  if ((q & 2) != 0) { r.x = -r.x; }\n  if (((q+1) & 2) != 0) { r.y = -r.y; }\n\n  return r;\n}\n\nEXPORT CONST Sleef_float2 xsincosf_u1(float d) {\n  int q;\n  float u;\n  Sleef_float2 r, s, t, x;\n\n  if (fabsfk(d) < TRIGRANGEMAX2f) {\n    q = (int)rintfk(d * (float)(2 * M_1_PI));\n    u = mlaf(q, -PI_A2f*0.5f, d);\n    s = dfadd2_f2_f_f(u, q * (-PI_B2f*0.5f));\n    s = dfadd_f2_f2_f(s, q * (-PI_C2f*0.5f));\n  } else {\n    dfi_t dfi = rempif(d);\n    q = dfi.i;\n    s = dfi.df;\n    if (xisinff(d) || xisnanf(d)) s.x = SLEEF_NANf;\n  }\n  \n  t = s;\n  s.x = dfsqu_f_f2(s);\n\n  u = -0.000195169282960705459117889f;\n  u = mlaf(u, s.x, 0.00833215750753879547119141f);\n  u = mlaf(u, s.x, -0.166666537523269653320312f);\n\n  u *= s.x * t.x;\n\n  x = dfadd_f2_f2_f(t, u);\n  r.x = x.x + x.y;\n  if (xisnegzerof(d)) r.x = -0.0f;\n\n  u = -2.71811842367242206819355e-07f;\n  u = mlaf(u, s.x, 2.47990446951007470488548e-05f);\n  u = mlaf(u, s.x, -0.00138888787478208541870117f);\n  u = mlaf(u, s.x, 0.0416666641831398010253906f);\n  u = mlaf(u, s.x, -0.5f);\n\n  x = dfadd_f2_f_f2(1, dfmul_f2_f_f(s.x, u));\n  r.y = x.x + x.y;\n\n  if ((q & 1) != 0) { u = r.y; r.y = r.x; r.x = u; }\n  if ((q & 2) != 0) { r.x = -r.x; }\n  if (((q+1) & 2) != 0) { r.y = -r.y; }\n\n  return r;\n}\n\nEXPORT CONST Sleef_float2 xsincospif_u05(float d) {\n  float u, s, t;\n  Sleef_float2 r, x, s2;\n\n  u = d * 4;\n  int q = ceilfk(u) & ~(int)1;\n  \n  s = u - (float)q;\n  t = s;\n  s = s * s;\n  
s2 = dfmul_f2_f_f(t, t);\n\n  //\n\n  u = +0.3093842054e-6;\n  u = mlaf(u, s, -0.3657307388e-4);\n  u = mlaf(u, s, +0.2490393585e-2);\n  x = dfadd2_f2_f_f2(u * s, df(-0.080745510756969451904, -1.3373665339076936258e-09));\n  x = dfadd2_f2_f2_f2(dfmul_f2_f2_f2(s2, x), df(0.78539818525314331055, -2.1857338617566484855e-08));\n\n  x = dfmul_f2_f2_f(x, t);\n  r.x = x.x + x.y;\n  if (xisnegzerof(d)) r.x = -0.0f;\n\n  u = -0.2430611801e-7;\n  u = mlaf(u, s, +0.3590577080e-5);\n  u = mlaf(u, s, -0.3259917721e-3);\n  x = dfadd2_f2_f_f2(u * s, df(0.015854343771934509277, 4.4940051354032242811e-10));\n  x = dfadd2_f2_f2_f2(dfmul_f2_f2_f2(s2, x), df(-0.30842512845993041992, -9.0728339030733922277e-09));\n  \n  x = dfadd2_f2_f2_f(dfmul_f2_f2_f2(x, s2), 1);\n  r.y = x.x + x.y;\n\n  if ((q & 2) != 0) { s = r.y; r.y = r.x; r.x = s; }\n  if ((q & 4) != 0) { r.x = -r.x; }\n  if (((q+2) & 4) != 0) { r.y = -r.y; }\n\n  if (fabsfk(d) > 1e+7f) { r.x = 0; r.y = 1; }\n  if (xisinff(d)) { r.x = r.y = SLEEF_NANf; }\n\n  return r;\n}\n\nEXPORT CONST Sleef_float2 xsincospif_u35(float d) {\n  float u, s, t;\n  Sleef_float2 r;\n\n  u = d * 4;\n  int q = ceilfk(u) & ~(int)1;\n  \n  s = u - (float)q;\n  t = s;\n  s = s * s;\n\n  //\n\n  u = -0.3600925265e-4;\n  u = mlaf(u, s, +0.2490088111e-2);\n  u = mlaf(u, s, -0.8074551076e-1);\n  u = mlaf(u, s, +0.7853981853e+0);\n\n  r.x = u * t;\n\n  u = +0.3539815225e-5;\n  u = mlaf(u, s, -0.3259574005e-3);\n  u = mlaf(u, s, +0.1585431583e-1);\n  u = mlaf(u, s, -0.3084251285e+0);\n  u = mlaf(u, s, 1);\n\n  r.y = u;\n\n  if ((q & 2) != 0) { s = r.y; r.y = r.x; r.x = s; }\n  if ((q & 4) != 0) { r.x = -r.x; }\n  if (((q+2) & 4) != 0) { r.y = -r.y; }\n\n  if (fabsfk(d) > 1e+7f) { r.x = 0; r.y = 1; }\n  if (xisinff(d)) { r.x = r.y = SLEEF_NANf; }\n\n  return r;\n}\n\nEXPORT CONST float xtanf(float d) {\n  int q;\n  float u, s, x;\n\n  x = d;\n\n  if (fabsfk(d) < TRIGRANGEMAX2f*0.5f) {\n    q = (int)rintfk(d * (float)(2 * M_1_PI));\n    x = mlaf(q, 
-PI_A2f*0.5f, x);\n    x = mlaf(q, -PI_B2f*0.5f, x);\n    x = mlaf(q, -PI_C2f*0.5f, x);\n  } else if (fabsfk(d) < TRIGRANGEMAXf) {\n    q = (int)rintfk(d * (float)(2 * M_1_PI));\n    x = mlaf(q, -PI_Af*0.5f, x);\n    x = mlaf(q, -PI_Bf*0.5f, x);\n    x = mlaf(q, -PI_Cf*0.5f, x);\n    x = mlaf(q, -PI_Df*0.5f, x);\n  } else {\n    dfi_t dfi = rempif(d);\n    q = dfi.i;\n    x = dfi.df.x + dfi.df.y;\n    if (xisinff(d) || xisnanf(d)) x = SLEEF_NANf;\n  }\n\n  s = x * x;\n\n  if ((q & 1) != 0) x = -x;\n\n  float s2 = s * s, s4 = s2 * s2;\n  u = POLY6(s, s2, s4,\n\t    0.00927245803177356719970703f,\n\t    0.00331984995864331722259521f,\n\t    0.0242998078465461730957031f,\n\t    0.0534495301544666290283203f,\n\t    0.133383005857467651367188f,\n\t    0.333331853151321411132812f);\n\n  u = mlaf(s, u * x, x);\n\n  if ((q & 1) != 0) u = 1.0f / u;\n\n  return u;\n}\n\nEXPORT CONST float xtanf_u1(float d) {\n  int q;\n  float u;\n  Sleef_float2 s, t, x;\n\n  if (fabsfk(d) < TRIGRANGEMAX2f) {\n    q = (int)rintfk(d * (float)(2 * M_1_PI));\n    u = mlaf(q, -PI_A2f*0.5f, d);\n    s = dfadd2_f2_f_f(u, q * (-PI_B2f*0.5f));\n    s = dfadd_f2_f2_f(s, q * (-PI_C2f*0.5f));\n  } else {\n    dfi_t dfi = rempif(d);\n    q = dfi.i;\n    s = dfi.df;\n    if (xisinff(d) || xisnanf(d)) s.x = SLEEF_NANf;\n  }\n\n  if ((q & 1) != 0) s = dfneg_f2_f2(s);\n\n  t = s;\n  s = dfsqu_f2_f2(s);\n  s = dfnormalize_f2_f2(s);\n\n  u = 0.00446636462584137916564941f;\n  u = mlaf(u, s.x, -8.3920182078145444393158e-05f);\n  u = mlaf(u, s.x, 0.0109639242291450500488281f);\n  u = mlaf(u, s.x, 0.0212360303848981857299805f);\n  u = mlaf(u, s.x, 0.0540687143802642822265625f);\n\n  x = dfadd_f2_f_f(0.133325666189193725585938f, u * s.x);\n  x = dfadd_f2_f_f2(1, dfmul_f2_f2_f2(dfadd_f2_f_f2(0.33333361148834228515625f, dfmul_f2_f2_f2(s, x)), s));\n  x = dfmul_f2_f2_f2(t, x);\n\n  if ((q & 1) != 0) x = dfrec_f2_f2(x);\n\n  u = x.x + x.y;\n\n  if (xisnegzerof(d)) u = -0.0f;\n\n  return u;\n}\n\nEXPORT CONST float 
xatanf(float s) {\n  float t, u;\n  int q = 0;\n\n  if (signf(s) == -1) { s = -s; q = 2; }\n  if (s > 1) { s = 1.0f / s; q |= 1; }\n\n  t = s * s;\n\n  float t2 = t * t, t4 = t2 * t2;\n  u = POLY8(t, t2, t4,\n\t    0.00282363896258175373077393f,\n\t    -0.0159569028764963150024414f,\n\t    0.0425049886107444763183594f,\n\t    -0.0748900920152664184570312f,\n\t    0.106347933411598205566406f,\n\t    -0.142027363181114196777344f,\n\t    0.199926957488059997558594f,\n\t    -0.333331018686294555664062f);\n\n  t = s + s * (t * u);\n\n  if ((q & 1) != 0) t = 1.570796326794896557998982f - t;\n  if ((q & 2) != 0) t = -t;\n\n  return t;\n}\n\nstatic INLINE CONST float atan2kf(float y, float x) {\n  float s, t, u;\n  int q = 0;\n\n  if (x < 0) { x = -x; q = -2; }\n  if (y > x) { t = x; x = y; y = -t; q += 1; }\n\n  s = y / x;\n  t = s * s;\n\n  float t2 = t * t, t4 = t2 * t2;\n  u = POLY8(t, t2, t4,\n\t    0.00282363896258175373077393f,\n\t    -0.0159569028764963150024414f,\n\t    0.0425049886107444763183594f,\n\t    -0.0748900920152664184570312f,\n\t    0.106347933411598205566406f,\n\t    -0.142027363181114196777344f,\n\t    0.199926957488059997558594f,\n\t    -0.333331018686294555664062f);\n\n  t = u * t * s + s;\n  t = q * (float)(M_PI/2) + t;\n\n  return t;\n}\n\nEXPORT CONST float xatan2f(float y, float x) {\n  float r = atan2kf(fabsfk(y), x);\n\n  r = mulsignf(r, x);\n  if (xisinff(x) || x == 0) r = M_PIf/2 - (xisinff(x) ? (signf(x) * (float)(M_PI  /2)) : 0);\n  if (xisinff(y)          ) r = M_PIf/2 - (xisinff(x) ? (signf(x) * (float)(M_PI*1/4)) : 0);\n  if (              y == 0) r = (signf(x) == -1 ? M_PIf : 0);\n\n  return xisnanf(x) || xisnanf(y) ? SLEEF_NANf : mulsignf(r, y);\n}\n\nEXPORT CONST float xasinf(float d) {\n  int o = fabsfk(d) < 0.5f;\n  float x2 = o ? (d*d) : ((1-fabsfk(d))*0.5f), x = o ? 
fabsfk(d) : SQRTF(x2), u;\n\n  u = +0.4197454825e-1;\n  u = mlaf(u, x2, +0.2424046025e-1);\n  u = mlaf(u, x2, +0.4547423869e-1);\n  u = mlaf(u, x2, +0.7495029271e-1);\n  u = mlaf(u, x2, +0.1666677296e+0);\n  u = mlaf(u, x * x2, x);\n  \n  float r = o ? u : (M_PIf/2 - 2*u);\n  r = mulsignf(r, d);\n\n  return r;\n}\n\nEXPORT CONST float xacosf(float d) {\n  int o = fabsfk(d) < 0.5f;\n  float x2 = o ? (d*d) : ((1-fabsfk(d))*0.5f), u;\n  float x = o ? fabsfk(d) : SQRTF(x2);\n  x = fabsfk(d) == 1.0 ? 0 : x;\n\n  u = +0.4197454825e-1;\n  u = mlaf(u, x2, +0.2424046025e-1);\n  u = mlaf(u, x2, +0.4547423869e-1);\n  u = mlaf(u, x2, +0.7495029271e-1);\n  u = mlaf(u, x2, +0.1666677296e+0);\n\n  u *= x * x2;\n  \n  float y = 3.1415926535897932f/2 - (mulsignf(x, d) + mulsignf(u, d));\n  x += u;\n  float r = o ? y : (x*2);\n  if (!o && d < 0) r = dfadd_f2_f2_f(df(3.1415927410125732422f,-8.7422776573475857731e-08f), -r).x;\n\n  return r;\n}\n\nstatic Sleef_float2 atan2kf_u1(Sleef_float2 y, Sleef_float2 x) {\n  float u;\n  Sleef_float2 s, t;\n  int q = 0;\n\n  if (x.x < 0) { x.x = -x.x; x.y = -x.y; q = -2; }\n  if (y.x > x.x) { t = x; x = y; y.x = -t.x; y.y = -t.y; q += 1; }\n\n  s = dfdiv_f2_f2_f2(y, x);\n  t = dfsqu_f2_f2(s);\n  t = dfnormalize_f2_f2(t);\n\n  u = -0.00176397908944636583328247f;\n  u = mlaf(u, t.x, 0.0107900900766253471374512f);\n  u = mlaf(u, t.x, -0.0309564601629972457885742f);\n  u = mlaf(u, t.x, 0.0577365085482597351074219f);\n  u = mlaf(u, t.x, -0.0838950723409652709960938f);\n  u = mlaf(u, t.x, 0.109463557600975036621094f);\n  u = mlaf(u, t.x, -0.142626821994781494140625f);\n  u = mlaf(u, t.x, 0.199983194470405578613281f);\n\n  t = dfmul_f2_f2_f2(t, dfadd_f2_f_f(-0.333332866430282592773438f, u * t.x));\n  t = dfmul_f2_f2_f2(s, dfadd_f2_f_f2(1, t));\n  t = dfadd2_f2_f2_f2(dfmul_f2_f2_f(df(1.5707963705062866211f, -4.3711388286737928865e-08f), q), t);\n\n  return t;\n}\n\nEXPORT CONST float xatan2f_u1(float y, float x) {\n  if (fabsfk(x) < 
2.9387372783541830947e-39f) { y *= (UINT64_C(1) << 24); x *= (UINT64_C(1) << 24); } // nexttowardf((1.0 / FLT_MAX), 1)\n  Sleef_float2 d = atan2kf_u1(df(fabsfk(y), 0), df(x, 0));\n  float r = d.x + d.y;\n\n  r = mulsignf(r, x);\n  if (xisinff(x) || x == 0) r = (float)M_PI/2 - (xisinff(x) ? (signf(x) * (float)(M_PI  /2)) : 0.0f);\n  if (xisinff(y)          ) r = (float)M_PI/2 - (xisinff(x) ? (signf(x) * (float)(M_PI*1/4)) : 0.0f);\n  if (              y == 0) r = (signf(x) == -1 ? (float)M_PI : 0.0f);\n\n  return xisnanf(x) || xisnanf(y) ? SLEEF_NANf : mulsignf(r, y);\n}\n\nEXPORT CONST float xasinf_u1(float d) {\n  int o = fabsfk(d) < 0.5f;\n  float x2 = o ? (d*d) : ((1-fabsfk(d))*0.5f), u;\n  Sleef_float2 x = o ? df(fabsfk(d), 0) : dfsqrt_f2_f(x2);\n  x = fabsfk(d) == 1.0f ? df(0, 0) : x;\n\n  u = +0.4197454825e-1;\n  u = mlaf(u, x2, +0.2424046025e-1);\n  u = mlaf(u, x2, +0.4547423869e-1);\n  u = mlaf(u, x2, +0.7495029271e-1);\n  u = mlaf(u, x2, +0.1666677296e+0);\n  u *= x2 * x.x;\n  \n  Sleef_float2 y = dfadd_f2_f2_f(dfsub_f2_f2_f2(df(3.1415927410125732422f/4,-8.7422776573475857731e-08f/4), x), -u);\n  float r = o ? (u + x.x) : ((y.x + y.y)*2);\n  r = mulsignf(r, d);\n\n  return r;\n}\n\nEXPORT CONST float xacosf_u1(float d) {\n  int o = fabsfk(d) < 0.5f;\n  float x2 = o ? (d*d) : ((1-fabsfk(d))*0.5f), u;\n  Sleef_float2 x = o ? df(fabsfk(d), 0) : dfsqrt_f2_f(x2);\n  x = fabsfk(d) == 1.0 ? df(0, 0) : x;\n  \n  u = +0.4197454825e-1;\n  u = mlaf(u, x2, +0.2424046025e-1);\n  u = mlaf(u, x2, +0.4547423869e-1);\n  u = mlaf(u, x2, +0.7495029271e-1);\n  u = mlaf(u, x2, +0.1666677296e+0);\n\n  u = u * x.x * x2;\n\n  Sleef_float2 y = dfsub_f2_f2_f2(df(3.1415927410125732422f/2,-8.7422776573475857731e-08f/2),\n\t\t\t\t  dfadd_f2_f_f(mulsignf(x.x, d), mulsignf(u, d)));\n  x = dfadd_f2_f2_f(x, u);\n  y = o ? 
y : dfscale_f2_f2_f(x, 2);\n  if (!o && d < 0) y = dfsub_f2_f2_f2(df(3.1415927410125732422f,-8.7422776573475857731e-08f), y);\n  \n  return y.x + y.y;\n}\n\nEXPORT CONST float xatanf_u1(float d) {\n  Sleef_float2 d2 = atan2kf_u1(df(fabsfk(d), 0.0f), df(1.0f, 0.0f));\n  float r = d2.x + d2.y;\n  if (xisinff(d)) r = 1.570796326794896557998982f;\n  return mulsignf(r, d);\n}\n\nEXPORT CONST float xlogf(float d) {\n  float x, x2, t, m;\n  int e;\n\n  int o = d < FLT_MIN;\n  if (o) d *= (float)(INT64_C(1) << 32) * (float)(INT64_C(1) << 32);\n      \n  e = ilogb2kf(d * (1.0f/0.75f));\n  m = ldexp3kf(d, -e);\n\n  if (o) e -= 64;\n    \n  x = (m-1.0f) / (m+1.0f);\n  x2 = x * x;\n\n  t = 0.2392828464508056640625f;\n  t = mlaf(t, x2, 0.28518211841583251953125f);\n  t = mlaf(t, x2, 0.400005877017974853515625f);\n  t = mlaf(t, x2, 0.666666686534881591796875f);\n  t = mlaf(t, x2, 2.0f);\n\n  x = x * t + 0.693147180559945286226764f * e;\n  \n  if (xisinff(d)) x = SLEEF_INFINITYf;\n  if (d < 0 || xisnanf(d)) x = SLEEF_NANf;\n  if (d == 0) x = -SLEEF_INFINITYf;\n\n  return x;\n}\n\nEXPORT CONST float xexpf(float d) {\n  int q = (int)rintfk(d * R_LN2f);\n  float s, u;\n\n  s = mlaf(q, -L2Uf, d);\n  s = mlaf(q, -L2Lf, s);\n\n  u = 0.000198527617612853646278381;\n  u = mlaf(u, s, 0.00139304355252534151077271);\n  u = mlaf(u, s, 0.00833336077630519866943359);\n  u = mlaf(u, s, 0.0416664853692054748535156);\n  u = mlaf(u, s, 0.166666671633720397949219);\n  u = mlaf(u, s, 0.5);\n  \n  u = s * s * u + s + 1.0f;\n  u = ldexp2kf(u, q);\n\n  if (d < -104) u = 0;\n  if (d >  104) u = SLEEF_INFINITYf;\n\n  return u;\n}\n\nstatic INLINE CONST float expkf(Sleef_float2 d) {\n  int q = (int)rintfk((d.x + d.y) * R_LN2f);\n  Sleef_float2 s, t;\n  float u;\n\n  s = dfadd2_f2_f2_f(d, q * -L2Uf);\n  s = dfadd2_f2_f2_f(s, q * -L2Lf);\n\n  s = dfnormalize_f2_f2(s);\n\n  u = 0.00136324646882712841033936f;\n  u = mlaf(u, s.x, 0.00836596917361021041870117f);\n  u = mlaf(u, s.x, 
0.0416710823774337768554688f);\n  u = mlaf(u, s.x, 0.166665524244308471679688f);\n  u = mlaf(u, s.x, 0.499999850988388061523438f);\n\n  t = dfadd_f2_f2_f2(s, dfmul_f2_f2_f(dfsqu_f2_f2(s), u));\n\n  t = dfadd_f2_f_f2(1, t);\n\n  u = ldexpkf(t.x + t.y, q);\n\n  if (d.x < -104) u = 0;\n  \n  return u;\n}\n\nstatic INLINE CONST float expm1kf(float d) {\n  int q = (int)rintfk(d * R_LN2f);\n  float s, u;\n\n  s = mlaf(q, -L2Uf, d);\n  s = mlaf(q, -L2Lf, s);\n\n  float s2 = s * s, s4 = s2 * s2;\n  u = POLY6(s, s2, s4,\n\t    0.000198527617612853646278381,\n\t    0.00139304355252534151077271,\n\t    0.00833336077630519866943359,\n\t    0.0416664853692054748535156,\n\t    0.166666671633720397949219,\n\t    0.5);\n\n  u = s * s * u + s;\n\n  if (q != 0) u = ldexp2kf(u + 1, q) - 1;\n\n  return u;\n}\n\nstatic INLINE CONST Sleef_float2 logkf(float d) {\n  Sleef_float2 x, x2, s;\n  float m, t;\n  int e;\n\n  int o = d < FLT_MIN;\n  if (o) d *= (float)(INT64_C(1) << 32) * (float)(INT64_C(1) << 32);\n      \n  e = ilogb2kf(d * (1.0f/0.75f));\n  m = ldexp3kf(d, -e);\n\n  if (o) e -= 64;\n  \n  x = dfdiv_f2_f2_f2(dfadd2_f2_f_f(-1, m), dfadd2_f2_f_f(1, m));\n  x2 = dfsqu_f2_f2(x);\n  \n  t = 0.240320354700088500976562;\n  t = mlaf(t, x2.x, 0.285112679004669189453125);\n  t = mlaf(t, x2.x, 0.400007992982864379882812);\n  Sleef_float2 c = df(0.66666662693023681640625f, 3.69183861259614332084311e-09f);\n\n  s = dfmul_f2_f2_f(df(0.69314718246459960938f, -1.904654323148236017e-09f), e);\n  s = dfadd_f2_f2_f2(s, dfscale_f2_f2_f(x, 2));\n  s = dfadd_f2_f2_f2(s, dfmul_f2_f2_f2(dfmul_f2_f2_f2(x2, x),\n\t\t\t\t       dfadd2_f2_f2_f2(dfmul_f2_f2_f(x2, t), c)));\n  return s;\n}\n\nEXPORT CONST float xlogf_u1(float d) {\n  Sleef_float2 x, s;\n  float m, t, x2;\n  int e;\n\n  int o = d < FLT_MIN;\n  if (o) d *= (float)(INT64_C(1) << 32) * (float)(INT64_C(1) << 32);\n      \n  e = ilogb2kf(d * (1.0f/0.75f));\n  m = ldexp3kf(d, -e);\n\n  if (o) e -= 64;\n  \n  x = dfdiv_f2_f2_f2(dfadd2_f2_f_f(-1, 
m), dfadd2_f2_f_f(1, m));\n  x2 = x.x * x.x;\n\n  t = +0.3027294874e+0f;\n  t = mlaf(t, x2, +0.3996108174e+0f);\n  t = mlaf(t, x2, +0.6666694880e+0f);\n\n  s = dfmul_f2_f2_f(df(0.69314718246459960938f, -1.904654323148236017e-09f), (float)e);\n  s = dfadd_f2_f2_f2(s, dfscale_f2_f2_f(x, 2));\n  s = dfadd_f2_f2_f(s, x2 * x.x * t);\n\n  float r = s.x + s.y;\n  \n  if (xisinff(d)) r = SLEEF_INFINITYf;\n  if (d < 0 || xisnanf(d)) r = SLEEF_NANf;\n  if (d == 0) r = -SLEEF_INFINITYf;\n\n  return r;\n}\n\nstatic INLINE CONST Sleef_float2 expk2f(Sleef_float2 d) {\n  int q = (int)rintfk((d.x + d.y) * R_LN2f);\n  Sleef_float2 s, t;\n  float u;\n\n  s = dfadd2_f2_f2_f(d, q * -L2Uf);\n  s = dfadd2_f2_f2_f(s, q * -L2Lf);\n\n  u = +0.1980960224e-3f;\n  u = mlaf(u, s.x, +0.1394256484e-2f);\n  u = mlaf(u, s.x, +0.8333456703e-2f);\n  u = mlaf(u, s.x, +0.4166637361e-1f);\n\n  t = dfadd2_f2_f2_f(dfmul_f2_f2_f(s, u), +0.166666659414234244790680580464e+0f);\n  t = dfadd2_f2_f2_f(dfmul_f2_f2_f2(s, t), 0.5);\n  t = dfadd2_f2_f2_f2(s, dfmul_f2_f2_f2(dfsqu_f2_f2(s), t));\n\n  t = dfadd2_f2_f_f2(1, t);\n    \n  t.x = ldexp2kf(t.x, q);\n  t.y = ldexp2kf(t.y, q);\n  \n  return d.x < -104 ? df(0, 0) : t;\n}\n\nEXPORT CONST float xpowf(float x, float y) {\n  int yisint = (y == (int)y) || (fabsfk(y) >= (float)(INT64_C(1) << 24));\n  int yisodd = (1 & (int)y) != 0 && yisint && fabsfk(y) < (float)(INT64_C(1) << 24);\n\n  float result = expkf(dfmul_f2_f2_f(logkf(fabsfk(x)), y));\n\n  result = xisnanf(result) ? SLEEF_INFINITYf : result;\n  result *=  (x >= 0 ? 1 : (!yisint ? SLEEF_NANf : (yisodd ? -1 : 1)));\n\n  float efx = mulsignf(fabsfk(x) - 1, y);\n  if (xisinff(y)) result = efx < 0 ? 0.0f : (efx == 0 ? 1.0f : SLEEF_INFINITYf);\n  if (xisinff(x) || x == 0) result = (yisodd ? signf(x) : 1) * ((x == 0 ? -y : y) < 0 ? 
0 : SLEEF_INFINITYf);\n  if (xisnanf(x) || xisnanf(y)) result = SLEEF_NANf;\n  if (y == 0 || x == 1) result = 1;\n\n  return result;\n}\n\nstatic INLINE CONST float logk3f(float d) {\n  float x, x2, t, m;\n  int e;\n\n  int o = d < FLT_MIN;\n  if (o) d *= (float)(INT64_C(1) << 32) * (float)(INT64_C(1) << 32);\n      \n  e = ilogb2kf(d * (1.0f/0.75f));\n  m = ldexp3kf(d, -e);\n\n  if (o) e -= 64;\n  \n  x = (m-1) / (m+1);\n  x2 = x * x;\n  \n  t = 0.2392828464508056640625f;\n  t = mlaf(t, x2, 0.28518211841583251953125f);\n  t = mlaf(t, x2, 0.400005877017974853515625f);\n  t = mlaf(t, x2, 0.666666686534881591796875f);\n  t = mlaf(t, x2, 2.0f);\n\n  x = mlaf(x, t, 0.693147180559945286226764f * e);\n\n  return x;\n}\n\nstatic INLINE CONST float expk3f(float d) {\n  int q = (int)rintfk(d * R_LN2f);\n  float s, u;\n\n  s = mlaf(q, -L2Uf, d);\n  s = mlaf(q, -L2Lf, s);\n\n  u = 0.000198527617612853646278381;\n  u = mlaf(u, s, 0.00139304355252534151077271);\n  u = mlaf(u, s, 0.00833336077630519866943359);\n  u = mlaf(u, s, 0.0416664853692054748535156);\n  u = mlaf(u, s, 0.166666671633720397949219);\n  u = mlaf(u, s, 0.5);\n\n  u = mlaf(s * s, u, s + 1.0f);\n  u = ldexpkf(u, q);\n\n  if (d < -104) u = 0;\n  \n  return u;\n}\n\nEXPORT CONST float xfastpowf_u3500(float x, float y) {\n  float result = expk3f(logk3f(fabsfk(x)) * y);\n\n  int yisint = (y == (int)y) || (fabsfk(y) >= (float)(INT64_C(1) << 24));\n  int yisodd = (1 & (int)y) != 0 && yisint && fabsfk(y) < (float)(INT64_C(1) << 24);\n\n  result *= (x < 0 && yisodd) ? -1 : 1;\n  if (x == 0) result = 0;\n  if (y == 0) result = 1;\n\n  return result;\n}\n\nEXPORT CONST float xsinhf(float x) {\n  float y = fabsfk(x);\n  Sleef_float2 d = expk2f(df(y, 0));\n  d = dfsub_f2_f2_f2(d, dfrec_f2_f2(d));\n  y = (d.x + d.y) * 0.5f;\n\n  y = fabsfk(x) > 89 ? SLEEF_INFINITYf : y;\n  y = xisnanf(y) ? SLEEF_INFINITYf : y;\n  y = mulsignf(y, x);\n  y = xisnanf(x) ? 
SLEEF_NANf : y;\n\n  return y;\n}\n\nEXPORT CONST float xcoshf(float x) {\n  float y = fabsfk(x);\n  Sleef_float2 d = expk2f(df(y, 0));\n  d = dfadd_f2_f2_f2(d, dfrec_f2_f2(d));\n  y = (d.x + d.y) * 0.5f;\n\n  y = fabsfk(x) > 89 ? SLEEF_INFINITYf : y;\n  y = xisnanf(y) ? SLEEF_INFINITYf : y;\n  y = xisnanf(x) ? SLEEF_NANf : y;\n\n  return y;\n}\n\nEXPORT CONST float xtanhf(float x) {\n  float y = fabsfk(x);\n  Sleef_float2 d = expk2f(df(y, 0));\n  Sleef_float2 e = dfrec_f2_f2(d);\n  d = dfdiv_f2_f2_f2(dfsub_f2_f2_f2(d, e), dfadd_f2_f2_f2(d, e));\n  y = d.x + d.y;\n\n  y = fabsfk(x) > 18.714973875f ? 1.0f : y;\n  y = xisnanf(y) ? 1.0f : y;\n  y = mulsignf(y, x);\n  y = xisnanf(x) ? SLEEF_NANf : y;\n\n  return y;\n}\n\nEXPORT CONST float xsinhf_u35(float x) {\n  float e = expm1kf(fabsfk(x));\n  float y = (e + 2) / (e + 1) * (0.5f * e);\n\n  y = fabsfk(x) > 88 ? SLEEF_INFINITYf : y;\n  y = xisnanf(y) ? SLEEF_INFINITYf : y;\n  y = mulsignf(y, x);\n  y = xisnanf(x) ? SLEEF_NANf : y;\n\n  return y;\n}\n\nEXPORT CONST float xcoshf_u35(float x) {\n  float e = xexpf(fabsfk(x));\n  float y = 0.5f * e + 0.5f / e;\n\n  y = fabsfk(x) > 88 ? SLEEF_INFINITYf : y;\n  y = xisnanf(y) ? SLEEF_INFINITYf : y;\n  y = xisnanf(x) ? SLEEF_NANf : y;\n\n  return y;\n}\n\nEXPORT CONST float xtanhf_u35(float x) {\n  float y = fabsfk(x);\n  float d = expm1kf(2*y);\n  y = d / (d + 2);\n\n  y = fabsfk(x) > 18.714973875f ? 1.0f : y;\n  y = xisnanf(y) ? 1.0f : y;\n  y = mulsignf(y, x);\n  y = xisnanf(x) ? 
SLEEF_NANf : y;\n\n  return y;\n}\n\nstatic INLINE CONST Sleef_float2 logk2f(Sleef_float2 d) {\n  Sleef_float2 x, x2, m, s;\n  float t;\n  int e;\n\n  e = ilogbkf(d.x * (1.0f/0.75f));\n  m = dfscale_f2_f2_f(d, pow2if(-e));\n\n  x = dfdiv_f2_f2_f2(dfadd2_f2_f2_f(m, -1), dfadd2_f2_f2_f(m, 1));\n  x2 = dfsqu_f2_f2(x);\n\n  t = 0.2392828464508056640625f;\n  t = mlaf(t, x2.x, 0.28518211841583251953125f);\n  t = mlaf(t, x2.x, 0.400005877017974853515625f);\n  t = mlaf(t, x2.x, 0.666666686534881591796875f);\n\n  s = dfmul_f2_f2_f(df(0.69314718246459960938f, -1.904654323148236017e-09f), e);\n  s = dfadd_f2_f2_f2(s, dfscale_f2_f2_f(x, 2));\n  s = dfadd_f2_f2_f2(s, dfmul_f2_f2_f(dfmul_f2_f2_f2(x2, x), t));\n\n  return s;\n}\n\nEXPORT CONST float xasinhf(float x) {\n  float y = fabsfk(x);\n  Sleef_float2 d;\n\n  d = y > 1 ? dfrec_f2_f(x) : df(y, 0);\n  d = dfsqrt_f2_f2(dfadd2_f2_f2_f(dfsqu_f2_f2(d), 1));\n  d = y > 1 ? dfmul_f2_f2_f(d, y) : d;\n  \n  d = logk2f(dfnormalize_f2_f2(dfadd_f2_f2_f(d, x)));\n  y = d.x + d.y;\n  \n  y = (fabsfk(x) > SQRT_FLT_MAX || xisnanf(y)) ? mulsignf(SLEEF_INFINITYf, x) : y;\n  y = xisnanf(x) ? SLEEF_NANf : y;\n  y = xisnegzerof(x) ? -0.0f : y;\n\n  return y;\n}\n\nEXPORT CONST float xacoshf(float x) {\n  Sleef_float2 d = logk2f(dfadd2_f2_f2_f(dfmul_f2_f2_f2(dfsqrt_f2_f2(dfadd2_f2_f_f(x, 1)), dfsqrt_f2_f2(dfadd2_f2_f_f(x, -1))), x));\n  float y = d.x + d.y;\n\n  y = (x > SQRT_FLT_MAX || xisnanf(y)) ? SLEEF_INFINITYf : y;\n  y = x == 1.0f ? 0.0f : y;\n  y = x < 1.0f ? SLEEF_NANf : y;\n  y = xisnanf(x) ? SLEEF_NANf : y;\n\n  return y;\n}\n\nEXPORT CONST float xatanhf(float x) {\n  float y = fabsfk(x);\n  Sleef_float2 d = logk2f(dfdiv_f2_f2_f2(dfadd2_f2_f_f(1, y), dfadd2_f2_f_f(1, -y)));\n  y = y > 1.0f ? SLEEF_NANf : (y == 1.0f ? SLEEF_INFINITYf : (d.x + d.y) * 0.5f);\n\n  y = xisinff(x) || xisnanf(y) ? SLEEF_NANf : y;\n  y = mulsignf(y, x);\n  y = xisnanf(x) ? 
SLEEF_NANf : y;\n\n  return y;\n}\n\nEXPORT CONST float xexp2f(float d) {\n  int q = (int)rintfk(d);\n  float s, u;\n\n  s = d - q;\n\n  u = +0.1535920892e-3;\n  u = mlaf(u, s, +0.1339262701e-2);\n  u = mlaf(u, s, +0.9618384764e-2);\n  u = mlaf(u, s, +0.5550347269e-1);\n  u = mlaf(u, s, +0.2402264476e+0);\n  u = mlaf(u, s, +0.6931471825e+0);\n  u = dfnormalize_f2_f2(dfadd_f2_f_f2(1, dfmul_f2_f_f(u, s))).x;\n\n  u = ldexp2kf(u, q);\n\n  if (d >= 128) u = SLEEF_INFINITYf;\n  if (d < -150) u = 0;\n  \n  return u;\n}\n\nEXPORT CONST float xexp2f_u35(float d) {\n  int q = (int)rintfk(d);\n  float s, u;\n\n  s = d - q;\n\n  u = +0.1535920892e-3;\n  u = mlaf(u, s, +0.1339262701e-2);\n  u = mlaf(u, s, +0.9618384764e-2);\n  u = mlaf(u, s, +0.5550347269e-1);\n  u = mlaf(u, s, +0.2402264476e+0);\n  u = mlaf(u, s, +0.6931471825e+0);\n  u = mlaf(u, s, +0.1000000000e+1);\n\n  u = ldexp2kf(u, q);\n\n  if (d >= 128) u = SLEEF_INFINITYf;\n  if (d < -150) u = 0;\n  \n  return u;\n}\n\nEXPORT CONST float xexp10f(float d) {\n  int q = (int)rintfk(d * (float)LOG10_2);\n  float s, u;\n  \n  s = mlaf(q, -L10Uf, d);\n  s = mlaf(q, -L10Lf, s);\n  \n  u = +0.6802555919e-1;\n  u = mlaf(u, s, +0.2078080326e+0);\n  u = mlaf(u, s, +0.5393903852e+0);\n  u = mlaf(u, s, +0.1171245337e+1);\n  u = mlaf(u, s, +0.2034678698e+1);\n  u = mlaf(u, s, +0.2650949001e+1);\n  Sleef_float2 x = dfadd_f2_f2_f(df(2.3025851249694824219, -3.1705172516493593157e-08), u * s);\n  u = dfnormalize_f2_f2(dfadd_f2_f_f2(1, dfmul_f2_f2_f(x, s))).x;\n\n  u = ldexp2kf(u, q);\n\n  if (d > 38.5318394191036238941387f) u = SLEEF_INFINITYf; // log10(FLT_MAX)\n  if (d < -50) u = 0;\n  \n  return u;\n}\n\nEXPORT CONST float xexp10f_u35(float d) {\n  int q = (int)rintfk(d * (float)LOG10_2);\n  float s, u;\n  \n  s = mlaf(q, -L10Uf, d);\n  s = mlaf(q, -L10Lf, s);\n  \n  u = +0.2064004987e+0;\n  u = mlaf(u, s, +0.5417877436e+0);\n  u = mlaf(u, s, +0.1171286821e+1);\n  u = mlaf(u, s, +0.2034656048e+1);\n  u = mlaf(u, s, 
+0.2650948763e+1);\n  u = mlaf(u, s, +0.2302585125e+1);\n  u = mlaf(u, s, +0.1000000000e+1);\n\n  u = ldexp2kf(u, q);\n\n  if (d > 38.5318394191036238941387f) u = SLEEF_INFINITYf; // log10(FLT_MAX)\n  if (d < -50) u = 0;\n  \n  return u;\n}\n\nEXPORT CONST float xexpm1f(float a) {\n  Sleef_float2 d = dfadd2_f2_f2_f(expk2f(df(a, 0)), -1.0f);\n  float x = d.x + d.y;\n  if (a > 88.72283172607421875f) x = SLEEF_INFINITYf;\n  if (a < -16.635532333438687426013570f) x = -1;\n  if (xisnegzerof(a)) x = -0.0f;\n  return x;\n}\n\nEXPORT CONST float xlog10f(float d) {\n  Sleef_float2 x, s;\n  float m, t, x2;\n  int e;\n\n  int o = d < FLT_MIN;\n  if (o) d *= (float)(INT64_C(1) << 32) * (float)(INT64_C(1) << 32);\n      \n  e = ilogb2kf(d * (1.0f/0.75f));\n  m = ldexp3kf(d, -e);\n\n  if (o) e -= 64;\n\n  x = dfdiv_f2_f2_f2(dfadd2_f2_f_f(-1, m), dfadd2_f2_f_f(1, m));\n  x2 = x.x * x.x;\n\n  t = +0.1314289868e+0;\n  t = mlaf(t, x2, +0.1735493541e+0);\n  t = mlaf(t, x2, +0.2895309627e+0);\n    \n  s = dfmul_f2_f2_f(df(0.30103001, -1.432098889e-08), (float)e);\n  s = dfadd_f2_f2_f2(s, dfmul_f2_f2_f2(x, df(0.868588984, -2.170757285e-08)));\n  s = dfadd_f2_f2_f(s, x2 * x.x * t);\n\n  float r = s.x + s.y;\n  \n  if (xisinff(d)) r = SLEEF_INFINITYf;\n  if (d < 0 || xisnanf(d)) r = SLEEF_NANf;\n  if (d == 0) r = -SLEEF_INFINITYf;\n\n  return r;\n}\n\nEXPORT CONST float xlog2f(float d) {\n  Sleef_float2 x, s;\n  float m, t, x2;\n  int e;\n\n  int o = d < FLT_MIN;\n  if (o) d *= (float)(INT64_C(1) << 32) * (float)(INT64_C(1) << 32);\n      \n  e = ilogb2kf(d * (1.0f/0.75f));\n  m = ldexp3kf(d, -e);\n\n  if (o) e -= 64;\n\n  x = dfdiv_f2_f2_f2(dfadd2_f2_f_f(-1, m), dfadd2_f2_f_f(1, m));\n  x2 = x.x * x.x;\n\n  t = +0.4374550283e+0f;\n  t = mlaf(t, x2, +0.5764790177e+0f);\n  t = mlaf(t, x2, +0.9618012905120f);\n\n  s = dfadd2_f2_f_f2(e, dfmul_f2_f2_f2(x, df(2.8853900432586669922, 3.2734474483568488616e-08)));\n  s = dfadd2_f2_f2_f(s, x2 * x.x * t);\n  \n  float r = s.x + s.y;\n  \n  if 
(xisinff(d)) r = SLEEF_INFINITYf;\n  if (d < 0 || xisnanf(d)) r = SLEEF_NANf;\n  if (d == 0) r = -SLEEF_INFINITYf;\n\n  return r;\n}\n\nEXPORT CONST float xlog2f_u35(float d) {\n  float m, t, x, x2;\n  int e;\n\n  int o = d < FLT_MIN;\n  if (o) d *= (float)(INT64_C(1) << 32) * (float)(INT64_C(1) << 32);\n      \n  e = ilogb2kf(d * (1.0f/0.75f));\n  m = ldexp3kf(d, -e);\n\n  if (o) e -= 64;\n\n  x = (m - 1) / (m + 1);\n  x2 = x * x;\n\n  t = +0.4374088347e+0;\n  t = mlaf(t, x2, +0.5764843822e+0);\n  t = mlaf(t, x2, +0.9618024230e+0);\n\n  float r = mlaf(x2 * x, t, mlaf(x, +0.2885390043e+1, e));\n  \n  if (xisinff(d)) r = SLEEF_INFINITYf;\n  if (d < 0 || xisnanf(d)) r = SLEEF_NANf;\n  if (d == 0) r = -SLEEF_INFINITYf;\n\n  return r;\n}\n\nEXPORT CONST float xlog1pf(float d) {\n  Sleef_float2 x, s;\n  float m, t, x2;\n  int e;\n\n  float dp1 = d + 1;\n  \n  int o = dp1 < FLT_MIN;\n  if (o) dp1 *= (float)(INT64_C(1) << 32) * (float)(INT64_C(1) << 32);\n      \n  e = ilogb2kf(dp1 * (1.0f/0.75f));\n\n  t = ldexp3kf(1, -e);\n  m = mlaf(d, t, t-1);\n\n  if (o) e -= 64;\n  \n  x = dfdiv_f2_f2_f2(df(m, 0), dfadd_f2_f_f(2, m));\n  x2 = x.x * x.x;\n\n  t = +0.3027294874e+0f;\n  t = mlaf(t, x2, +0.3996108174e+0f);\n  t = mlaf(t, x2, +0.6666694880e+0f);\n\n  s = dfmul_f2_f2_f(df(0.69314718246459960938f, -1.904654323148236017e-09f), (float)e);\n  s = dfadd_f2_f2_f2(s, dfscale_f2_f2_f(x, 2));\n  s = dfadd_f2_f2_f(s, x2 * x.x * t);\n\n  float r = s.x + s.y;\n    \n  if (d > 1e+38) r = SLEEF_INFINITYf;\n  if (d < -1) r = SLEEF_NANf;\n  if (d == -1) r = -SLEEF_INFINITYf;\n  if (xisnegzerof(d)) r = -0.0f;\n\n  return r;\n}\n\nEXPORT CONST float xcbrtf(float d) {\n  float x, y, q = 1.0f;\n  int e, r;\n\n  e = ilogbkf(fabsfk(d))+1;\n  d = ldexp2kf(d, -e);\n  r = (e + 6144) % 3;\n  q = (r == 1) ? 1.2599210498948731647672106f : q;\n  q = (r == 2) ? 
1.5874010519681994747517056f : q;\n  q = ldexp2kf(q, (e + 6144) / 3 - 2048);\n\n  q = mulsignf(q, d);\n  d = fabsfk(d);\n\n  x = -0.601564466953277587890625f;\n  x = mlaf(x, d, 2.8208892345428466796875f);\n  x = mlaf(x, d, -5.532182216644287109375f);\n  x = mlaf(x, d, 5.898262500762939453125f);\n  x = mlaf(x, d, -3.8095417022705078125f);\n  x = mlaf(x, d, 2.2241256237030029296875f);\n\n  y = d * x * x;\n  y = (y - (2.0f / 3.0f) * y * (y * x - 1.0f)) * q;\n\n  return y;\n}\n\nEXPORT CONST float xcbrtf_u1(float d) {\n  float x, y, z;\n  Sleef_float2 q2 = df(1, 0), u, v;\n  int e, r;\n\n  e = ilogbkf(fabsfk(d))+1;\n  d = ldexp2kf(d, -e);\n  r = (e + 6144) % 3;\n  q2 = (r == 1) ? df(1.2599210739135742188, -2.4018701694217270415e-08) : q2;\n  q2 = (r == 2) ? df(1.5874010324478149414,  1.9520385308169352356e-08) : q2;\n\n  q2.x = mulsignf(q2.x, d); q2.y = mulsignf(q2.y, d);\n  d = fabsfk(d);\n\n  x = -0.601564466953277587890625f;\n  x = mlaf(x, d, 2.8208892345428466796875f);\n  x = mlaf(x, d, -5.532182216644287109375f);\n  x = mlaf(x, d, 5.898262500762939453125f);\n  x = mlaf(x, d, -3.8095417022705078125f);\n  x = mlaf(x, d, 2.2241256237030029296875f);\n\n  y = x * x; y = y * y; x -= (d * y - x) * (1.0 / 3.0f);\n\n  z = x;\n\n  u = dfmul_f2_f_f(x, x);\n  u = dfmul_f2_f2_f2(u, u);\n  u = dfmul_f2_f2_f(u, d);\n  u = dfadd2_f2_f2_f(u, -x);\n  y = u.x + u.y;\n\n  y = -2.0 / 3.0 * y * z;\n  v = dfadd2_f2_f2_f(dfmul_f2_f_f(z, z), y);\n  v = dfmul_f2_f2_f(v, d);\n  v = dfmul_f2_f2_f2(v, q2);\n  z = ldexp2kf(v.x + v.y, (e + 6144) / 3 - 2048);\n\n  if (xisinff(d)) { z = mulsignf(SLEEF_INFINITYf, q2.x); }\n  if (d == 0) { z = mulsignf(0, q2.x); }\n\n  return z;\n}\n\n//\n\nEXPORT CONST float xfabsf(float x) { return fabsfk(x); }\n\nEXPORT CONST float xcopysignf(float x, float y) { return copysignfk(x, y); }\n\nEXPORT CONST float xfmaxf(float x, float y) {\n  return y != y ? x : (x > y ? x : y);\n}\n\nEXPORT CONST float xfminf(float x, float y) {\n  return y != y ? x : (x < y ? 
x : y);\n}\n\nEXPORT CONST float xfdimf(float x, float y) {\n  float ret = x - y;\n  if (ret < 0 || x == y) ret = 0;\n  return ret;\n}\n\nEXPORT CONST float xtruncf(float x) {\n  float fr = x - (int32_t)x;\n  return (xisinff(x) || fabsfk(x) >= (float)(INT64_C(1) << 23)) ? x : copysignfk(x - fr, x);\n}\n\nEXPORT CONST float xfloorf(float x) {\n  float fr = x - (int32_t)x;\n  fr = fr < 0 ? fr+1.0f : fr;\n  return (xisinff(x) || fabsfk(x) >= (float)(INT64_C(1) << 23)) ? x : copysignfk(x - fr, x);\n}\n\nEXPORT CONST float xceilf(float x) {\n  float fr = x - (int32_t)x;\n  fr = fr <= 0 ? fr : fr-1.0f;\n  return (xisinff(x) || fabsfk(x) >= (float)(INT64_C(1) << 23)) ? x : copysignfk(x - fr, x);\n}\n\nEXPORT CONST float xroundf(float d) {\n  float x = d + 0.5f;\n  float fr = x - (int32_t)x;\n  if (fr == 0 && x <= 0) x--;\n  fr = fr < 0 ? fr+1.0f : fr;\n  x = d == 0.4999999701976776123f ? 0 : x;  // nextafterf(0.5, 0)\n  return (xisinff(d) || fabsfk(d) >= (float)(INT64_C(1) << 23)) ? d : copysignfk(x - fr, d);\n}\n\nEXPORT CONST float xrintf(float d) {\n  float x = d + 0.5f;\n  int32_t isodd = (1 & (int32_t)x) != 0;\n  float fr = x - (int32_t)x;\n  fr = (fr < 0 || (fr == 0 && isodd)) ? fr+1.0f : fr;\n  x = d == 0.50000005960464477539f ? 0 : x;  // nextafterf(0.5, 1)\n  return (xisinff(d) || fabsfk(d) >= (float)(INT64_C(1) << 23)) ? d : copysignfk(x - fr, d);\n}\n\nEXPORT CONST Sleef_float2 xmodff(float x) {\n  float fr = x - (int32_t)x;\n  fr = fabsfk(x) > (float)(INT64_C(1) << 23) ? 
0 : fr;\n  Sleef_float2 ret = { copysignfk(fr, x), copysignfk(x - fr, x) };\n  return ret;\n}\n\nEXPORT CONST float xldexpf(float x, int exp) {\n  if (exp >  300) exp =  300;\n  if (exp < -300) exp = -300;\n  \n  int e0 = exp >> 2;\n  if (exp < 0) e0++;\n  if (-50 < exp && exp < 50) e0 = 0;\n  int e1 = exp - (e0 << 2);\n  \n  float p = pow2if(e0);\n  float ret = x * pow2if(e1) * p * p * p * p;\n  \n  return ret;\n}\n\nEXPORT CONST float xnextafterf(float x, float y) {\n  union {\n    float f;\n    int32_t i;\n  } cx;\n\n  cx.f = x == 0 ? mulsignf(0, y) : x;\n  int c = (cx.i < 0) == (y < x);\n  if (c) cx.i = -(cx.i ^ (1 << 31));\n\n  if (x != y) cx.i--;\n\n  if (c) cx.i = -(cx.i ^ (1 << 31));\n\n  if (cx.f == 0 && x != 0) cx.f = mulsignf(0, x);\n  if (x == 0 && y == 0) cx.f = y;\n  if (xisnanf(x) || xisnanf(y)) cx.f = SLEEF_NANf;\n  \n  return cx.f;\n}\n\nEXPORT CONST float xfrfrexpf(float x) {\n  union {\n    float f;\n    int32_t u;\n  } cx;\n\n  if (fabsfk(x) < FLT_MIN) x *= (1 << 30);\n  \n  cx.f = x;\n  cx.u &= ~0x7f800000U;\n  cx.u |=  0x3f000000U;\n\n  if (xisinff(x)) cx.f = mulsignf(SLEEF_INFINITYf, x);\n  if (x == 0) cx.f = x;\n  \n  return cx.f;\n}\n\nEXPORT CONST int xexpfrexpf(float x) {\n  union {\n    float f;\n    uint32_t u;\n  } cx;\n\n  int ret = 0;\n  \n  if (fabsfk(x) < FLT_MIN) { x *= (1 << 30); ret = -30; }\n  \n  cx.f = x;\n  ret += (int32_t)(((cx.u >> 23) & 0xff)) - 0x7e;\n\n  if (x == 0 || xisnanf(x) || xisinff(x)) ret = 0;\n  \n  return ret;\n}\n\nEXPORT CONST float xhypotf_u05(float x, float y) {\n  x = fabsfk(x);\n  y = fabsfk(y);\n  float min = fminfk(x, y), n = min;\n  float max = fmaxfk(x, y), d = max;\n\n  if (max < FLT_MIN) { n *= UINT64_C(1) << 24; d *= UINT64_C(1) << 24; }\n  Sleef_float2 t = dfdiv_f2_f2_f2(df(n, 0), df(d, 0));\n  t = dfmul_f2_f2_f(dfsqrt_f2_f2(dfadd2_f2_f2_f(dfsqu_f2_f2(t), 1)), max);\n  float ret = t.x + t.y;\n  if (xisnanf(ret)) ret = SLEEF_INFINITYf;\n  if (min == 0) ret = max;\n  if (xisnanf(x) || xisnanf(y)) 
ret = SLEEF_NANf;\n  if (x == SLEEF_INFINITYf || y == SLEEF_INFINITYf) ret = SLEEF_INFINITYf;\n  return ret;\n}\n\nEXPORT CONST float xhypotf_u35(float x, float y) {\n  x = fabsfk(x);\n  y = fabsfk(y);\n  float min = fminfk(x, y);\n  float max = fmaxfk(x, y);\n  \n  float t = min / max;\n  float ret = max * SQRTF(1 + t*t);\n  if (min == 0) ret = max;\n  if (xisnanf(x) || xisnanf(y)) ret = SLEEF_NANf;\n  if (x == SLEEF_INFINITYf || y == SLEEF_INFINITYf) ret = SLEEF_INFINITYf;\n  return ret;\n}\n\nstatic INLINE CONST float toward0f(float d) {\n  return d == 0 ? 0 : intBitsToFloat(floatToRawIntBits(d)-1);\n}\n\nstatic INLINE CONST float ptruncf(float x) {\n  return fabsfk(x) >= (float)(INT64_C(1) << 23) ? x : (x - (x - (int32_t)x));\n}\n\nEXPORT CONST float xfmodf(float x, float y) {\n  float nu = fabsfk(x), de = fabsfk(y), s = 1, q;\n  if (de < FLT_MIN) { nu *= UINT64_C(1) << 25; de *= UINT64_C(1) << 25; s = 1.0f / (UINT64_C(1) << 25); }\n  Sleef_float2 r = df(nu, 0);\n  float rde = toward0f(1.0f / de);\n\n  for(int i=0;i<8;i++) { // ceil(log2(FLT_MAX) / 22)+1\n    q = ptruncf(toward0f(r.x) * rde);\n    q = (3*de > r.x && r.x >= de) ? 2 : q;\n    q = (2*de > r.x && r.x >= de) ? 1 : q;\n    r = dfnormalize_f2_f2(dfadd2_f2_f2_f2(r, dfmul_f2_f_f(q, -de)));\n    if (r.x < de) break;\n  }\n  \n  float ret = (r.x + r.y) * s;\n  if (r.x + r.y == de) ret = 0;\n  ret = mulsignf(ret, x);\n  if (nu < de) ret = x;\n  if (de == 0) ret = SLEEF_NANf;\n\n  return ret;\n}\n\nstatic INLINE CONST float rintfk2(float d) {\n  float x = d + 0.5f;\n  int32_t isodd = (1 & (int32_t)x) != 0;\n  float fr = x - (int32_t)x;\n  fr = (fr < 0 || (fr == 0 && isodd)) ? fr+1.0f : fr;\n  return (fabsfk(d) >= (float)(INT64_C(1) << 23)) ? 
d : copysignfk(x - fr, d);\n}\n\nEXPORT CONST float xremainderf(float x, float y) {\n  float n = fabsfk(x), d = fabsfk(y), s = 1, q;\n  if (d < FLT_MIN*2) { n *= UINT64_C(1) << 25; d *= UINT64_C(1) << 25; s = 1.0f / (UINT64_C(1) << 25); }\n  float rd = 1.0f / d;\n  Sleef_float2 r = df(n, 0);\n  int qisodd = 0;\n\n  for(int i=0;i<8;i++) { // ceil(log2(FLT_MAX) / 22)+1\n    q = rintfk2(r.x * rd);\n    if (fabsfk(r.x) < 1.5f * d) q = r.x < 0 ? -1 : 1;\n    if (fabsfk(r.x) < 0.5f * d || (fabsfk(r.x) == 0.5f * d && !qisodd)) q = 0;\n    if (q == 0) break;\n    if (xisinff(q * -d)) q = q + mulsignf(-1, r.x);\n    qisodd ^= (1 & (int)q) != 0 && fabsfk(q) < (float)(INT64_C(1) << 24);\n    r = dfnormalize_f2_f2(dfadd2_f2_f2_f2(r, dfmul_f2_f_f(q, -d)));\n  }\n  \n  float ret = r.x * s;\n  ret = mulsignf(ret, x);\n  if (xisinff(y)) ret = xisinff(x) ? SLEEF_NANf : x;\n  if (d == 0) ret = SLEEF_NANf;\n\n  return ret;\n}\n\nEXPORT CONST float xsqrtf_u05(float d) {\n  float q = 0.5f;\n\n  d = d < 0 ? SLEEF_NANf : d;\n\n  if (d < 5.2939559203393770e-23f) {\n    d *= 1.8889465931478580e+22f;\n    q = 7.2759576141834260e-12f * 0.5f;\n  }\n\n  if (d > 1.8446744073709552e+19f) {\n    d *= 5.4210108624275220e-20f;\n    q = 4294967296.0f * 0.5f;\n  }\n  \n  // http://en.wikipedia.org/wiki/Fast_inverse_square_root\n  float x = intBitsToFloat(0x5f375a86 - (floatToRawIntBits(d + 1e-45f) >> 1));\n\n  x = x * (1.5f - 0.5f * d * x * x);\n  x = x * (1.5f - 0.5f * d * x * x);\n  x = x * (1.5f - 0.5f * d * x * x) * d;\n\n  Sleef_float2 d2 = dfmul_f2_f2_f2(dfadd2_f2_f_f2(d, dfmul_f2_f_f(x, x)), dfrec_f2_f(x));\n\n  float ret = (d2.x + d2.y) * q;\n\n  ret = d == SLEEF_INFINITYf ? SLEEF_INFINITYf : ret;\n  ret = d == 0 ? d : ret;\n\n  return ret;\n}\n\nEXPORT CONST float xsqrtf_u35(float d) {\n  float q = 1.0f;\n\n  d = d < 0 ? 
SLEEF_NANf : d;\n\n  if (d < 5.2939559203393770e-23f) {\n    d *= 1.8889465931478580e+22f;\n    q = 7.2759576141834260e-12f;\n  }\n\n  if (d > 1.8446744073709552e+19f) {\n    d *= 5.4210108624275220e-20f;\n    q = 4294967296.0f;\n  }\n  \n  // http://en.wikipedia.org/wiki/Fast_inverse_square_root\n  float x = intBitsToFloat(0x5f375a86 - (floatToRawIntBits(d + 1e-45) >> 1));\n\n  x = x * (1.5f - 0.5f * d * x * x);\n  x = x * (1.5f - 0.5f * d * x * x);\n  x = x * (1.5f - 0.5f * d * x * x);\n  x = x * (1.5f - 0.5f * d * x * x);\n\n  return d == SLEEF_INFINITYf ? SLEEF_INFINITYf : (x * d * q);\n}\n\nEXPORT CONST float xsqrtf(float d) { return SQRTF(d); }\n\nEXPORT CONST float xfmaf(float x, float y, float z) {\n  float h2 = x * y + z, q = 1;\n  if (fabsfk(h2) < 1e-38f) {\n    const float c0 = 1 << 25, c1 = c0 * c0, c2 = c1 * c1;\n    x *= c1;\n    y *= c1;\n    z *= c2;\n    q = 1.0f / c2;\n  }\n  if (fabsfk(h2) > 1e+38f) {\n    const float c0 = 1 << 25, c1 = c0 * c0, c2 = c1 * c1;\n    x *= 1.0 / c1;\n    y *= 1.0 / c1;\n    z *= 1.0 / c2;\n    q = c2;\n  }\n  Sleef_float2 d = dfmul_f2_f_f(x, y);\n  d = dfadd2_f2_f2_f(d, z);\n  float ret = (x == 0 || y == 0) ? z : (d.x + d.y);\n  if (xisinff(z) && !xisinff(x) && !xisnanf(x) && !xisinff(y) && !xisnanf(y)) h2 = z;\n  return (xisinff(h2) || xisnanf(h2)) ? h2 : ret*q;\n}\n\n//\n\nstatic INLINE CONST Sleef_float2 sinpifk(float d) {\n  float u, s, t;\n  Sleef_float2 x, s2;\n\n  u = d * 4;\n  int q = ceilfk(u) & ~1;\n  int o = (q & 2) != 0;\n  \n  s = u - (float)q;\n  t = s;\n  s = s * s;\n  s2 = dfmul_f2_f_f(t, t);\n  \n  //\n  \n  u = o ? -0.2430611801e-7f : +0.3093842054e-6f;\n  u = mlaf(u, s, o ? +0.3590577080e-5f : -0.3657307388e-4f);\n  u = mlaf(u, s, o ? -0.3259917721e-3f : +0.2490393585e-2f);\n  x = dfadd2_f2_f_f2(u * s, o ? df(0.015854343771934509277, 4.4940051354032242811e-10) :\n\t\t     df(-0.080745510756969451904, -1.3373665339076936258e-09));\n  x = dfadd2_f2_f2_f2(dfmul_f2_f2_f2(s2, x), o ? 
df(-0.30842512845993041992, -9.0728339030733922277e-09) :\n\t\t      df(0.78539818525314331055, -2.1857338617566484855e-08));\n\n  x = dfmul_f2_f2_f2(x, o ? s2 : df(t, 0));\n  x = o ? dfadd2_f2_f2_f(x, 1) : x;\n  \n  //\n\n  if ((q & 4) != 0) { x.x = -x.x; x.y = -x.y; }\n\n  return x;\n}\n\nEXPORT CONST float xsinpif_u05(float d) {\n  Sleef_float2 x = sinpifk(d);\n  float r = x.x + x.y;\n\n  if (xisnegzerof(d)) r = -0.0;\n  if (fabsfk(d) > TRIGRANGEMAX4f) r = 0; \n  if (xisinff(d)) r = SLEEF_NANf;\n\n  return r;\n}\n\nstatic INLINE CONST Sleef_float2 cospifk(float d) {\n  float u, s, t;\n  Sleef_float2 x, s2;\n\n  u = d * 4;\n  int q = ceilfk(u) & ~1;\n  int o = (q & 2) == 0;\n  \n  s = u - (float)q;\n  t = s;\n  s = s * s;\n  s2 = dfmul_f2_f_f(t, t);\n  \n  //\n  \n  u = o ? -0.2430611801e-7f : +0.3093842054e-6f;\n  u = mlaf(u, s, o ? +0.3590577080e-5f : -0.3657307388e-4f);\n  u = mlaf(u, s, o ? -0.3259917721e-3f : +0.2490393585e-2f);\n  x = dfadd2_f2_f_f2(u * s, o ? df(0.015854343771934509277, 4.4940051354032242811e-10) :\n\t\t     df(-0.080745510756969451904, -1.3373665339076936258e-09));\n  x = dfadd2_f2_f2_f2(dfmul_f2_f2_f2(s2, x), o ? df(-0.30842512845993041992, -9.0728339030733922277e-09) :\n\t\t      df(0.78539818525314331055, -2.1857338617566484855e-08));\n\n  x = dfmul_f2_f2_f2(x, o ? s2 : df(t, 0));\n  x = o ? dfadd2_f2_f2_f(x, 1) : x;\n  \n  //\n\n  if (((q+2) & 4) != 0) { x.x = -x.x; x.y = -x.y; }\n\n  return x;\n}\n\nEXPORT CONST float xcospif_u05(float d) {\n  Sleef_float2 x = cospifk(d);\n  float r = x.x + x.y;\n\n  if (fabsfk(d) > TRIGRANGEMAX4f) r = 1;\n  if (xisinff(d)) r = SLEEF_NANf;\n\n  return r;\n}\n\ntypedef struct {\n  Sleef_float2 a, b;\n} df2;\n\nstatic CONST df2 gammafk(float a) {\n  Sleef_float2 clc = df(0, 0), clln = df(1, 0), clld = df(1, 0), v = df(1, 0), x, y, z;\n  float t, u;\n\n  int otiny = fabsfk(a) < 1e-30f, oref = a < 0.5f;\n\n  x = otiny ? df(0, 0) : (oref ? 
dfadd2_f2_f_f(1, -a) : df(a, 0));\n\n  int o0 = (0.5f <= x.x && x.x <= 1.2), o2 = 2.3 < x.x;\n\n  y = dfnormalize_f2_f2(dfmul_f2_f2_f2(dfadd2_f2_f2_f(x, 1), x));\n  y = dfnormalize_f2_f2(dfmul_f2_f2_f2(dfadd2_f2_f2_f(x, 2), y));\n\n  clln = (o2 && x.x <= 7) ? y : clln;\n\n  x = (o2 && x.x <= 7) ? dfadd2_f2_f2_f(x, 3) : x;\n  t = o2 ? (1.0 / x.x) : dfnormalize_f2_f2(dfadd2_f2_f2_f(x, o0 ? -1 : -2)).x;\n  \n  u = o2 ? +0.000839498720672087279971000786 : (o0 ? +0.9435157776e+0f : +0.1102489550e-3f);\n  u = mlaf(u, t, o2 ? -5.17179090826059219329394422e-05 : (o0 ? +0.8670063615e+0f : +0.8160019934e-4f));\n  u = mlaf(u, t, o2 ? -0.000592166437353693882857342347 : (o0 ? +0.4826702476e+0f : +0.1528468856e-3f));\n  u = mlaf(u, t, o2 ? +6.97281375836585777403743539e-05 : (o0 ? -0.8855129778e-1f : -0.2355068718e-3f));\n  u = mlaf(u, t, o2 ? +0.000784039221720066627493314301 : (o0 ? +0.1013825238e+0f : +0.4962242092e-3f));\n  u = mlaf(u, t, o2 ? -0.000229472093621399176949318732 : (o0 ? -0.1493408978e+0f : -0.1193488017e-2f));\n  u = mlaf(u, t, o2 ? -0.002681327160493827160473958490 : (o0 ? +0.1697509140e+0f : +0.2891599433e-2f));\n  u = mlaf(u, t, o2 ? +0.003472222222222222222175164840 : (o0 ? -0.2072454542e+0f : -0.7385451812e-2f));\n  u = mlaf(u, t, o2 ? +0.083333333333333333335592087900 : (o0 ? +0.2705872357e+0f : +0.2058077045e-1f));\n\n  y = dfmul_f2_f2_f2(dfadd2_f2_f2_f(x, -0.5), logk2f(x));\n  y = dfadd2_f2_f2_f2(y, dfneg_f2_f2(x));\n  y = dfadd2_f2_f2_f2(y, dfx(0.91893853320467278056)); // 0.5*log(2*M_PI)\n\n  z = dfadd2_f2_f2_f(dfmul_f2_f_f (u, t), o0 ? -0.400686534596170958447352690395e+0f : -0.673523028297382446749257758235e-1f);\n  z = dfadd2_f2_f2_f(dfmul_f2_f2_f(z, t), o0 ? +0.822466960142643054450325495997e+0f : +0.322467033928981157743538726901e+0f);\n  z = dfadd2_f2_f2_f(dfmul_f2_f2_f(z, t), o0 ? -0.577215665946766039837398973297e+0f : +0.422784335087484338986941629852e+0f);\n  z = dfmul_f2_f2_f(z, t);\n\n  clc = o2 ? y : z;\n  \n  clld = o2 ? 
dfadd2_f2_f2_f(dfmul_f2_f_f(u, t), 1) : clld;\n  \n  y = clln;\n\n  clc = otiny ? dfx(41.58883083359671856503) : // log(2^60)\n    (oref ? dfadd2_f2_f2_f2(dfx(1.1447298858494001639), dfneg_f2_f2(clc)) : clc); // log(M_PI)\n  clln = otiny ? df(1, 0) : (oref ? clln : clld);\n\n  if (oref) x = dfmul_f2_f2_f2(clld, sinpifk(a - (float)(INT64_C(1) << 12) * (int32_t)(a * (1.0 / (INT64_C(1) << 12)))));\n\n  clld = otiny ? df(a*((INT64_C(1) << 30)*(float)(INT64_C(1) << 30)), 0) : (oref ? x : y);\n\n  df2 ret = { clc, dfdiv_f2_f2_f2(clln, clld) };\n\n  return ret;\n}\n\nEXPORT CONST float xtgammaf_u1(float a) {\n  df2 d = gammafk(a);\n  Sleef_float2 y = dfmul_f2_f2_f2(expk2f(d.a), d.b);\n  float r = y.x + y.y;\n  r = (a == -SLEEF_INFINITYf || (a < 0 && xisintf(a)) || (xisnumberf(a) && a < 0 && xisnanf(r))) ? SLEEF_NANf : r;\n  r = ((a == SLEEF_INFINITYf || xisnumberf(a)) && a >= -FLT_MIN && (a == 0 || a > 36 || xisnanf(r))) ? mulsignf(SLEEF_INFINITYf, a) : r;\n  return r;\n}\n\nEXPORT CONST float xlgammaf_u1(float a) {\n  df2 d = gammafk(a);\n  Sleef_float2 y = dfadd2_f2_f2_f2(d.a, logk2f(dfabs_f2_f2(d.b)));\n  float r = y.x + y.y;\n  r = (xisinff(a) || (a <= 0 && xisintf(a)) || (xisnumberf(a) && xisnanf(r))) ? SLEEF_INFINITYf : r;\n  return r;\n}\n\nEXPORT CONST float xerff_u1(float a) {\n  float s = a, t, u;\n  Sleef_float2 d;\n\n  a = fabsfk(a);\n  int o0 = a < 1.1f, o1 = a < 2.4f, o2 = a < 4.0f;\n  u = o0 ? (a*a) : a;\n  \n  t = o0 ? +0.7089292194e-4f : o1 ? -0.1792667899e-4f : -0.9495757695e-5f;\n  t = mlaf(t, u, o0 ? -0.7768311189e-3f : o1 ? +0.3937633010e-3f : +0.2481465926e-3f);\n  t = mlaf(t, u, o0 ? +0.5159463733e-2f : o1 ? -0.3949181177e-2f : -0.2918176819e-2f);\n  t = mlaf(t, u, o0 ? -0.2683781274e-1f : o1 ? +0.2445474640e-1f : +0.2059706673e-1f);\n  t = mlaf(t, u, o0 ? +0.1128318012e+0f : o1 ? -0.1070996150e+0f : -0.9901899844e-1f);\n  d = dfmul_f2_f_f(t, u);\n  d = dfadd2_f2_f2_f2(d, o0 ? dfx(-0.376125876000657465175213237214e+0) :\n\t\t      o1 ? 
dfx(-0.634588905908410389971210809210e+0) :\n\t\t      dfx(-0.643598050547891613081201721633e+0));\n  d = dfmul_f2_f2_f(d, u);\n  d = dfadd2_f2_f2_f2(d, o0 ? dfx(+0.112837916021059138255978217023e+1) :\n\t\t      o1 ? dfx(-0.112879855826694507209862753992e+1) :\n\t\t      dfx(-0.112461487742845562801052956293e+1));\n  d = dfmul_f2_f2_f(d, a);\n  d = o0 ? d : dfadd_f2_f_f2(1.0, dfneg_f2_f2(expk2f(d)));\n  u = mulsignf(o2 ? (d.x + d.y) : 1, s);\n  u = xisnanf(a) ? SLEEF_NANf : u;\n  return u;\n}\n\nEXPORT CONST float xerfcf_u15(float a) {\n  float s = a, r = 0, t;\n  Sleef_float2 u, d, x;\n  a = fabsfk(a);\n  int o0 = a < 1.0f, o1 = a < 2.2f, o2 = a < 4.3f, o3 = a < 10.1f;\n  u = o1 ? df(a, 0) : dfdiv_f2_f2_f2(df(1, 0), df(a, 0));\n\n  t = o0 ? -0.8638041618e-4f : o1 ? -0.6236977242e-5f : o2 ? -0.3869504035e+0f : +0.1115344167e+1f;\n  t = mlaf(t, u.x, o0 ? +0.6000166177e-3f : o1 ? +0.5749821503e-4f : o2 ? +0.1288077235e+1f : -0.9454904199e+0f);\n  t = mlaf(t, u.x, o0 ? -0.1665703603e-2f : o1 ? +0.6002851478e-5f : o2 ? -0.1816803217e+1f : -0.3667259514e+0f);\n  t = mlaf(t, u.x, o0 ? +0.1795156277e-3f : o1 ? -0.2851036377e-2f : o2 ? +0.1249150872e+1f : +0.7155663371e+0f);\n  t = mlaf(t, u.x, o0 ? +0.1914106123e-1f : o1 ? +0.2260518074e-1f : o2 ? -0.1328857988e+0f : -0.1262947265e-1f);\n  \n  d = dfmul_f2_f2_f(u, t);\n  d = dfadd2_f2_f2_f2(d, o0 ? dfx(-0.102775359343930288081655368891e+0) :\n\t\t      o1 ? dfx(-0.105247583459338632253369014063e+0) :\n\t\t      o2 ? dfx(-0.482365310333045318680618892669e+0) :\n\t\t      dfx(-0.498961546254537647970305302739e+0));\n  d = dfmul_f2_f2_f2(d, u);\n  d = dfadd2_f2_f2_f2(d, o0 ? dfx(-0.636619483208481931303752546439e+0) :\n\t\t      o1 ? dfx(-0.635609463574589034216723775292e+0) :\n\t\t      o2 ? dfx(-0.134450203224533979217859332703e-2) :\n\t\t      dfx(-0.471199543422848492080722832666e-4));\n  d = dfmul_f2_f2_f2(d, u);\n  d = dfadd2_f2_f2_f2(d, o0 ? dfx(-0.112837917790537404939545770596e+1) :\n\t\t      o1 ? 
dfx(-0.112855987376668622084547028949e+1) :\n\t\t      o2 ? dfx(-0.572319781150472949561786101080e+0) :\n\t\t      dfx(-0.572364030327966044425932623525e+0));\n\n  x = dfmul_f2_f2_f(o1 ? d : df(-a, 0), a);\n  x = o1 ? x : dfadd2_f2_f2_f2(x, d);\n\n  x = expk2f(x);\n  x = o1 ? x : dfmul_f2_f2_f2(x, u);\n\n  r = o3 ? (x.x + x.y) : 0;\n  if (s < 0) r = 2 - r;\n  r = xisnanf(s) ? SLEEF_NANf : r;\n  return r;\n}\n\n//\n\n#ifdef ENABLE_MAIN\n// gcc -w -DENABLE_MAIN -I../common sleefsp.c rempitab.c -lm\n#include <stdlib.h>\nint main(int argc, char **argv) {\n  float d1 = atof(argv[1]);\n  //float d2 = atof(argv[2]);\n  //float d3 = atof(argv[3]);\n  //printf(\"%.20g, %.20g\\n\", (double)d1, (double)d2);\n  //float i2 = atoi(argv[2]);\n  //float c = xatan2f_u1(d1, d2);\n  //printf(\"round %.20g\\n\", (double)d1);\n  printf(\"test    = %.20g\\n\", (double)xsqrtf_u05(d1));\n  //printf(\"correct = %.20g\\n\", (double)roundf(d1));\n  //printf(\"rint %.20g\\n\", (double)d1);\n  //printf(\"test    = %.20g\\n\", (double)xrintf(d1));\n  //printf(\"correct = %.20g\\n\", (double)rintf(d1));\n  //Sleef_float2 r = xsincospif_u35(d);\n  //printf(\"%g, %g\\n\", (double)r.x, (double)r.y);\n}\n#endif\n"
  },
  {
    "path": "src/ufp.cpp",
    "content": "/*\n\nCopyright (c) 2021 Agenium Scale\n\nPermission is hereby granted, free of charge, to any person obtaining a copy\nof this software and associated documentation files (the \"Software\"), to deal\nin the Software without restriction, including without limitation the rights\nto use, copy, modify, merge, publish, distribute, sublicense, and/or sell\ncopies of the Software, and to permit persons to whom the Software is\nfurnished to do so, subject to the following conditions:\n\nThe above copyright notice and this permission notice shall be included in all\ncopies or substantial portions of the Software.\n\nTHE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\nIMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\nFITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\nAUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\nLIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\nOUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\nSOFTWARE.\n\n*/\n\n#include <nsimd/nsimd.h>\n\n// ----------------------------------------------------------------------------\n// Actual implementation\n\nnamespace nsimd {\n\ntemplate <int ExponentSize, int MantissaSize, typename UnsignedType,\n          typename T>\nint ufp(T a_, T b_) {\n  UnsignedType a = nsimd::scalar_reinterpret(UnsignedType(), a_);\n  UnsignedType b = nsimd::scalar_reinterpret(UnsignedType(), b_);\n  UnsignedType exp_mask = ((UnsignedType)1 << ExponentSize) - 1;\n  i64 ea = (i64)((a >> MantissaSize) & exp_mask);\n  i64 eb = (i64)((b >> MantissaSize) & exp_mask);\n  if (ea - eb > 1 || ea - eb < -1) {\n    return 0;\n  }\n  UnsignedType man_mask = ((UnsignedType)1 << MantissaSize) - 1;\n  i64 ma = (i64)(a & man_mask) | ((i64)1 << MantissaSize);\n  i64 mb = (i64)(b & man_mask) | ((i64)1 << MantissaSize);\n  i64 d = 0;\n\n  if (ea == eb) {\n    d = ma - mb;\n  } else if (ea 
> eb) {\n    d = 2 * ma - mb;\n  } else {\n    d = 2 * mb - ma;\n  }\n  d = (d >= 0 ? d : -d);\n  int i = 0;\n  for (; i <= MantissaSize + 1 && d >= ((i64)1 << i); i++)\n    ;\n  return (int)(MantissaSize + 1 - i);\n}\n\n} // namespace nsimd\n\n// ----------------------------------------------------------------------------\n// C ABI\n\nextern \"C\" {\n\nNSIMD_DLLSPEC int nsimd_ufp_f16(f16 a, f16 b) {\n  return nsimd::ufp<5, 10, u16>(a, b);\n}\n\nNSIMD_DLLSPEC int nsimd_ufp_f32(f32 a, f32 b) {\n  return nsimd::ufp<8, 23, u32>(a, b);\n}\n\nNSIMD_DLLSPEC int nsimd_ufp_f64(f64 a, f64 b) {\n  return nsimd::ufp<11, 52, u64>(a, b);\n}\n\n} // extern \"C\"\n"
  },
  {
    "path": "tests/CMakeLists.txt.sh",
    "content": "# MIT License\n#\n# Copyright (c) 2020 Agenium Scale\n#\n# Permission is hereby granted, free of charge, to any person obtaining a copy\n# of this software and associated documentation files (the \"Software\"), to deal\n# in the Software without restriction, including without limitation the rights\n# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell\n# copies of the Software, and to permit persons to whom the Software is\n# furnished to do so, subject to the following conditions:\n#\n# The above copyright notice and this permission notice shall be included in all\n# copies or substantial portions of the Software.\n#\n# THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\n# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\n# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\n# SOFTWARE.\n\nset -e\nset -x\n\nBUF=\"`dirname $0`/..\"\nNSIMD_CMAKE=`realpath ${BUF}`\n\nfor simd_ext in \"$@\"; do\n\n  # Take care of cross compilation here\n  case ${simd_ext} in\n    aarch64 | sve | sve128 | sve256 | sve512 | sve1024 | sve2048)\n      C_COMP=\"aarch64-linux-gnu-gcc\"\n      CXX_COMP=\"aarch64-linux-gnu-g++\"\n      ;;\n    neon128)\n      C_COMP=\"arm-linux-gnueabi-gcc\"\n      CXX_COMP=\"arm-linux-gnueabi-g++\"\n      ;;\n    vmx | vsx)\n      C_COMP=\"${NSIMD_CMAKE}/scripts/powerpc64le-linux-gnu-clang.sh\"\n      CXX_COMP=\"${NSIMD_CMAKE}/scripts/powerpc64le-linux-gnu-clang++.sh\"\n      ;;\n    oneapi)\n      C_COMP=\"gcc\"\n      CXX_COMP=\"dpcpp\"\n      ;;\n    rocm)\n      C_COMP=\"gcc\"\n      CXX_COMP=\"${NSIMD_CMAKE}/scripts/hipcc.sh\"\n      ;;\n    cuda)\n      C_COMP=\"gcc\"\n      CXX_COMP=\"nvcc\"\n      ;;\n    
*)\n      C_COMP=\"gcc\"\n      CXX_COMP=\"g++\"\n      ;;\n  esac\n\n  # First case: find a specific component\n  ROOT_DIR=\"${PWD}/nsimd_cmake_tests/${simd_ext}\"\n  rm -rf ${ROOT_DIR}\n  mkdir -p ${ROOT_DIR}\n  (cd ${ROOT_DIR} && \\\n   cmake ${NSIMD_CMAKE} \\\n         -Dsimd=${simd_ext} \\\n         -DCMAKE_INSTALL_PREFIX=${ROOT_DIR}/root \\\n         -DCMAKE_C_COMPILER=\"${C_COMP}\" \\\n         -DCMAKE_CXX_COMPILER=\"${CXX_COMP}\" && \\\n   make VERBOSE=1 && \\\n   make install)\n\ndone\n"
  },
  {
    "path": "tests/FindNSIMD.cmake.sh",
    "content": "#!/bin/bash\n#\n# Copyright (c) 2020 Agenium Scale\n#\n# Permission is hereby granted, free of charge, to any person obtaining a copy\n# of this software and associated documentation files (the \"Software\"), to deal\n# in the Software without restriction, including without limitation the rights\n# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell\n# copies of the Software, and to permit persons to whom the Software is\n# furnished to do so, subject to the following conditions:\n#\n# The above copyright notice and this permission notice shall be included in all\n# copies or substantial portions of the Software.\n#\n# THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\n# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\n# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\n# SOFTWARE.\n\nset -e\nset -x\n\nFIND_NSIMD_CMAKE=\"`dirname $0`/../scripts/FindNSIMD.cmake\"\nSIMD_EXTS=\"sse2 sse42 avx avx2 avx512_knl avx512_skylake neon128 aarch64 \\\n           sve sve128 sve256 sve512 sve1024 sve2048 cuda rocm\"\n\nfor simd_ext in ${SIMD_EXTS}; do\n\n  # First case: find a specific component\n  ROOT_DIR=\"${PWD}/find_nsimd_cmake_tests/${simd_ext}\"\n  rm -rf ${ROOT_DIR}\n  mkdir -p \"${ROOT_DIR}/cmake\"\n  cp \"${FIND_NSIMD_CMAKE}\" \"${ROOT_DIR}/cmake\"\n  mkdir -p \"${ROOT_DIR}/root/include/nsimd\"\n  touch \"${ROOT_DIR}/root/include/nsimd/nsimd.h\"\n  mkdir -p \"${ROOT_DIR}/root/lib\"\n  touch \"${ROOT_DIR}/root/lib/libnsimd_${simd_ext}.so\"\n\n  cat >\"${ROOT_DIR}/CMakeLists.txt\" <<-EOF\n\tcmake_minimum_required(VERSION 3.0.0)\n\tproject(FIND_NSIMD_CMAKE_TESTS)\n\tset(CMAKE_MODULE_PATH 
\"${ROOT_DIR}/cmake\")\n\tset(CMAKE_PREFIX_PATH \"${ROOT_DIR}/root\")\n\tfind_package(NSIMD COMPONENTS ${simd_ext})\n\tmessage(STATUS \"FindNSIMD.cmake test : specific for ${simd_ext}\")\n\tmessage(STATUS \"NSIMD_FOUND = \\${NSIMD_FOUND}\")\n\tif (\\${NSIMD_FOUND})\n\t  message(STATUS \"NSIMD_INCLUDE_DIRS = \\${NSIMD_INCLUDE_DIRS}\")\n\t  message(STATUS \"NSIMD_LIBRARY_DIRS = \\${NSIMD_LIBRARY_DIRS}\")\n\t  message(STATUS \"NSIMD_LIBRARIES = \\${NSIMD_LIBRARIES}\")\n\telse()\n\t  message(FATAL_ERROR \"error NSIMD_FOUND should be TRUE\")\n\tendif()\n\tEOF\n  (cd \"${ROOT_DIR}\" && mkdir -p build && cd build && cmake ..)\n\n  # Second case: find a automatically a component\n  ROOT_DIR=\"${PWD}/find_nsimd_cmake_tests/${simd_ext}-auto\"\n  rm -rf ${ROOT_DIR}\n  mkdir -p \"${ROOT_DIR}/cmake\"\n  cp \"${FIND_NSIMD_CMAKE}\" \"${ROOT_DIR}/cmake\"\n  mkdir -p \"${ROOT_DIR}/root/include/nsimd\"\n  touch \"${ROOT_DIR}/root/include/nsimd/nsimd.h\"\n  mkdir -p \"${ROOT_DIR}/root/lib\"\n  touch \"${ROOT_DIR}/root/lib/libnsimd_${simd_ext}.so\"\n\n  cat >\"${ROOT_DIR}/CMakeLists.txt\" <<-EOF\n\tcmake_minimum_required(VERSION 3.0.0)\n\tproject(FIND_NSIMD_CMAKE_TESTS)\n\tset(CMAKE_MODULE_PATH \"${ROOT_DIR}/cmake\")\n\tset(CMAKE_PREFIX_PATH \"${ROOT_DIR}/root\")\n\tfind_package(NSIMD)\n\tmessage(STATUS \"FindNSIMD.cmake test : automatic for ${simd_ext}\")\n\tmessage(STATUS \"NSIMD_FOUND = \\${NSIMD_FOUND}\")\n\tif (\\${NSIMD_FOUND})\n\t  message(STATUS \"NSIMD_INCLUDE_DIRS = \\${NSIMD_INCLUDE_DIRS}\")\n\t  message(STATUS \"NSIMD_LIBRARY_DIRS = \\${NSIMD_LIBRARY_DIRS}\")\n\t  message(STATUS \"NSIMD_LIBRARIES = \\${NSIMD_LIBRARIES}\")\n\telse()\n\t  message(FATAL_ERROR \"error NSIMD_FOUND should be TRUE\")\n\tendif()\n\tEOF\n  (cd \"${ROOT_DIR}\" && mkdir -p build && cd build && cmake ..)\n\n  # Third case: find a specific component\n  ROOT_DIR=\"${PWD}/find_nsimd_cmake_tests/${simd_ext}-notfound\"\n  rm -rf ${ROOT_DIR}\n  mkdir -p \"${ROOT_DIR}/cmake\"\n  cp \"${FIND_NSIMD_CMAKE}\" 
\"${ROOT_DIR}/cmake\"\n  mkdir -p \"${ROOT_DIR}/root/include/nsimd\"\n  touch \"${ROOT_DIR}/root/include/nsimd/nsimd.h\"\n  mkdir -p \"${ROOT_DIR}/root/lib\"\n  touch \"${ROOT_DIR}/root/lib/libnsimd_cpu.so\"\n\n  cat >\"${ROOT_DIR}/CMakeLists.txt\" <<-EOF\n\tcmake_minimum_required(VERSION 3.0.0)\n\tproject(FIND_NSIMD_CMAKE_TESTS)\n\tset(CMAKE_MODULE_PATH \"${ROOT_DIR}/cmake\")\n\tset(CMAKE_PREFIX_PATH \"${ROOT_DIR}/root\")\n\tfind_package(NSIMD COMPONENTS ${simd_ext})\n\tmessage(STATUS \"FindNSIMD.cmake test : \"\n\t               \"notfound specific for ${simd_ext}\")\n\tmessage(STATUS \"NSIMD_FOUND = \\${NSIMD_FOUND}\")\n\tif (\\${NSIMD_FOUND})\n\t  message(STATUS \"NSIMD_INCLUDE_DIRS = \\${NSIMD_INCLUDE_DIRS}\")\n\t  message(STATUS \"NSIMD_LIBRARY_DIRS = \\${NSIMD_LIBRARY_DIRS}\")\n\t  message(STATUS \"NSIMD_LIBRARIES = \\${NSIMD_LIBRARIES}\")\n\t  message(FATAL_ERROR \"error NSIMD_FOUND should be FALSE\")\n\telse()\n\t  message(STATUS \"NSIMD not found\")\n\tendif()\n\tEOF\n  (cd \"${ROOT_DIR}\" && mkdir -p build && cd build && cmake ..)\n\n  # Fourth case: find a automatically a component\n  ROOT_DIR=\"${PWD}/find_nsimd_cmake_tests/${simd_ext}-auto-notfound\"\n  rm -rf ${ROOT_DIR}\n  mkdir -p \"${ROOT_DIR}/cmake\"\n  cp \"${FIND_NSIMD_CMAKE}\" \"${ROOT_DIR}/cmake\"\n  mkdir -p \"${ROOT_DIR}/root/include/nsimd\"\n  touch \"${ROOT_DIR}/root/include/nsimd/nsimd.h\"\n  mkdir -p \"${ROOT_DIR}/root/lib\"\n\n  cat >\"${ROOT_DIR}/CMakeLists.txt\" <<-EOF\n\tcmake_minimum_required(VERSION 3.0.0)\n\tproject(FIND_NSIMD_CMAKE_TESTS)\n\tset(CMAKE_MODULE_PATH \"${ROOT_DIR}/cmake\")\n\tset(CMAKE_PREFIX_PATH \"${ROOT_DIR}/root\")\n\tfind_package(NSIMD)\n\tmessage(STATUS \"FindNSIMD.cmake test : \"\n\t               \"notfound automatic for ${simd_ext}\")\n\tmessage(STATUS \"NSIMD_FOUND = \\${NSIMD_FOUND}\")\n\tif (\\${NSIMD_FOUND})\n\t  message(STATUS \"NSIMD_INCLUDE_DIRS = \\${NSIMD_INCLUDE_DIRS}\")\n\t  message(STATUS \"NSIMD_LIBRARY_DIRS = \\${NSIMD_LIBRARY_DIRS}\")\n\t  
message(STATUS \"NSIMD_LIBRARIES = \\${NSIMD_LIBRARIES}\")\n\t  message(FATAL_ERROR \"error NSIMD_FOUND should be FALSE\")\n\telse()\n\t  message(STATUS \"NSIMD not found\")\n\tendif()\n\tEOF\n  (cd \"${ROOT_DIR}\" && mkdir -p build && cd build && cmake ..)\n\ndone\n"
  },
  {
    "path": "tests/allocator.cpp",
    "content": "/*\n\nCopyright (c) 2019 Agenium Scale\n\nPermission is hereby granted, free of charge, to any person obtaining a copy\nof this software and associated documentation files (the \"Software\"), to deal\nin the Software without restriction, including without limitation the rights\nto use, copy, modify, merge, publish, distribute, sublicense, and/or sell\ncopies of the Software, and to permit persons to whom the Software is\nfurnished to do so, subject to the following conditions:\n\nThe above copyright notice and this permission notice shall be included in all\ncopies or substantial portions of the Software.\n\nTHE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\nIMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\nFITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\nAUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\nLIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\nOUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\nSOFTWARE.\n\n*/\n\n#include <nsimd/nsimd-all.hpp>\n\n#include <cstdlib>\n#include <vector>\n\nint main() {\n  std::vector<float, nsimd::allocator<float> > v;\n\n  v.clear();\n  v.resize(100);\n\n  v.clear();\n  v.resize(100);\n  v.resize(10000);\n\n  v.clear();\n  v.reserve(30);\n\n  for (int i = 0; i < 1000; i++) {\n    v.push_back(float(i));\n  }\n  if (v.size() != 1000) {\n    exit(EXIT_FAILURE);\n  }\n\n  for (int i = 0; i < 500; i++) {\n    v.pop_back();\n  }\n  if (v.size() != 500) {\n    exit(EXIT_FAILURE);\n  }\n\n  return 0;\n}\n"
  },
  {
    "path": "tests/assign_arith.cpp",
    "content": "/*\n\nCopyright (c) 2021 Agenium Scale\n\nPermission is hereby granted, free of charge, to any person obtaining a copy\nof this software and associated documentation files (the \"Software\"), to deal\nin the Software without restriction, including without limitation the rights\nto use, copy, modify, merge, publish, distribute, sublicense, and/or sell\ncopies of the Software, and to permit persons to whom the Software is\nfurnished to do so, subject to the following conditions:\n\nThe above copyright notice and this permission notice shall be included in all\ncopies or substantial portions of the Software.\n\nTHE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\nIMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\nFITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\nAUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\nLIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\nOUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\nSOFTWARE.\n\n*/\n\n#include <nsimd/nsimd-all.hpp>\n#include <iostream>\n\n/* ------------------------------------------------------------------------- */\n/* Random number */\n\ntemplate <typename T> T get_rand() {\n  return (T)((rand() % 10) + 1);\n}\n\ntemplate <> f16 get_rand() {\n  return nsimd_f32_to_f16(get_rand<f32>());\n}\n\n/* ------------------------------------------------------------------------- */\n/* Arithmetic operators */\n\n#define HELPER(op1, op2, name)                                                \\\n  template <typename T> int test_##name##_T(size_t n) {                       \\\n    std::vector<T> a(n), b(n);                                                \\\n    for (size_t i = 0; i < n; i++) {                                          \\\n      a[i] = get_rand<T>();                                                   \\\n      b[i] = get_rand<T>();                          
                         \\\n    }                                                                         \\\n                                                                              \\\n    using namespace nsimd;                                                    \\\n    typedef pack<T> pack;                                                     \\\n    for (size_t i = 0; i < n; i += size_t(len(pack()))) {                     \\\n      pack tmp1 = loadu<pack>(&a[i]);                                         \\\n      tmp1 op1 loadu<pack>(&b[i]);                                            \\\n      pack tmp2 = loadu<pack>(&a[i]) op2 loadu<pack>(&b[i]);                  \\\n      if (any(tmp1 != tmp2)) {                                                \\\n        return -1;                                                            \\\n      }                                                                       \\\n    }                                                                         \\\n    return 0;                                                                 \\\n  }                                                                           \\\n                                                                              \\\n  int test_##name(size_t n) {                                                 \\\n    return test_##name##_T<i8>(n) || test_##name##_T<u8>(n) ||                \\\n           test_##name##_T<i16>(n) || test_##name##_T<u16>(n) ||              \\\n           test_##name##_T<f16>(n) || test_##name##_T<i32>(n) ||              \\\n           test_##name##_T<u32>(n) || test_##name##_T<f32>(n) ||              \\\n           test_##name##_T<i64>(n) || test_##name##_T<u64>(n) ||              \\\n           test_##name##_T<f64>(n);                                           \\\n  }                                                                           \\\n                                                                              \\\n  
int test_##name##_int_only(size_t n) {                                      \\\n    return test_##name##_T<i8>(n) || test_##name##_T<u8>(n) ||                \\\n           test_##name##_T<i16>(n) || test_##name##_T<u16>(n) ||              \\\n           test_##name##_T<i32>(n) || test_##name##_T<u32>(n) ||              \\\n           test_##name##_T<i64>(n) || test_##name##_T<u64>(n);                \\\n  }\n\nHELPER(+=, +, add)\nHELPER(-=, -, sub)\nHELPER(*=, *, mul)\nHELPER(/=, /, div)\nHELPER(|=, |, orb)\nHELPER(&=, &, andb)\nHELPER(^=, ^, xorb)\n\n#undef HELPER\n\n/* ------------------------------------------------------------------------- */\n/* Shift operators */\n\n#define HELPER(op1, op2, name)                                                \\\n  template <typename T> int test_##name##_T(size_t n) {                       \\\n    std::vector<T> a(n);                                                      \\\n    for (size_t i = 0; i < n; i++) {                                          \\\n      a[i] = get_rand<T>();                                                   \\\n    }                                                                         \\\n                                                                              \\\n    using namespace nsimd;                                                    \\\n    typedef pack<T> pack;                                                     \\\n    for (int s = 0; s <= 3; s++) {                                            \\\n      for (size_t i = 0; i < n; i += size_t(len(pack()))) {                   \\\n        pack tmp = loadu<pack>(&a[i]);                                        \\\n        tmp op1 s;                                                            \\\n        if (any(tmp != (loadu<pack>(&a[i]) op2 s))) {                         \\\n          return -1;                                                          \\\n        }                                                                     \\\n      
}                                                                       \\\n    }                                                                         \\\n    return 0;                                                                 \\\n  }                                                                           \\\n                                                                              \\\n  int test_##name(size_t n) {                                                 \\\n    return test_##name##_T<i8>(n) || test_##name##_T<u8>(n) ||                \\\n           test_##name##_T<i16>(n) || test_##name##_T<u16>(n) ||              \\\n           test_##name##_T<i32>(n) || test_##name##_T<u32>(n) ||              \\\n           test_##name##_T<i64>(n) || test_##name##_T<u64>(n);                \\\n  }\n\nHELPER(<<=, <<, shl)\nHELPER(>>=, >>, shr)\n\n#undef HELPER\n\n/* ------------------------------------------------------------------------- */\n\nint main() {\n  const size_t n = 2048;\n  return test_add(n) || test_sub(n) || test_mul(n) || test_div(n) ||\n         test_orb_int_only(n) || test_andb_int_only(n) ||\n         test_xorb_int_only(n) || test_shl(n) || test_shr(n);\n}\n\n"
  },
  {
    "path": "tests/booleans.cpp",
    "content": "/*\n\nCopyright (c) 2020 Agenium Scale\n\nPermission is hereby granted, free of charge, to any person obtaining a copy\nof this software and associated documentation files (the \"Software\"), to deal\nin the Software without restriction, including without limitation the rights\nto use, copy, modify, merge, publish, distribute, sublicense, and/or sell\ncopies of the Software, and to permit persons to whom the Software is\nfurnished to do so, subject to the following conditions:\n\nThe above copyright notice and this permission notice shall be included in all\ncopies or substantial portions of the Software.\n\nTHE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\nIMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\nFITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\nAUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\nLIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\nOUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\nSOFTWARE.\n\n*/\n\n#include <nsimd/nsimd-all.hpp>\n#include <cstdlib>\n\n// ----------------------------------------------------------------------------\n\nint main() {\n  using namespace nsimd;\n  packl<int> v = packl<int>(true) || packl<float>(false);\n  if (!all(v)) {\n    return -1;\n  }\n  return 0;\n}\n"
  },
  {
    "path": "tests/c11_vec.c",
    "content": "/*\n\nCopyright (c) 2021 Agenium Scale\n\nPermission is hereby granted, free of charge, to any person obtaining a copy\nof this software and associated documentation files (the \"Software\"), to deal\nin the Software without restriction, including without limitation the rights\nto use, copy, modify, merge, publish, distribute, sublicense, and/or sell\ncopies of the Software, and to permit persons to whom the Software is\nfurnished to do so, subject to the following conditions:\n\nThe above copyright notice and this permission notice shall be included in all\ncopies or substantial portions of the Software.\n\nTHE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\nIMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\nFITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\nAUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\nLIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\nOUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\nSOFTWARE.\n\n*/\n\n#include <nsimd/nsimd-all.h>\n\nint main() {\n#if NSIMD_C >= 2011\n  float in[NSIMD_MAX_LEN(f32)];\n  int out[NSIMD_MAX_LEN(i32)];\n\n  nsimd_pack(f32) vin = nsimd_load(unaligned, nsimd_pack(f32), in);\n  nsimd_pack(i32) vout = nsimd_reinterpret(nsimd_pack(i32), vin);\n  nsimd_store(unaligned, out, vout);\n#endif\n\n  return 0;\n}\n"
  },
  {
    "path": "tests/cxx_adv_api_aliases.cpp",
    "content": "/*\n\nCopyright (c) 2021 Agenium Scale\n\nPermission is hereby granted, free of charge, to any person obtaining a copy\nof this software and associated documentation files (the \"Software\"), to deal\nin the Software without restriction, including without limitation the rights\nto use, copy, modify, merge, publish, distribute, sublicense, and/or sell\ncopies of the Software, and to permit persons to whom the Software is\nfurnished to do so, subject to the following conditions:\n\nThe above copyright notice and this permission notice shall be included in all\ncopies or substantial portions of the Software.\n\nTHE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\nIMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\nFITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\nAUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\nLIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\nOUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\nSOFTWARE.\n\n*/\n\n#include <nsimd/cxx_adv_api_aliases.hpp>\n\n/* ------------------------------------------------------------------------- */\n/* Random number */\n\ntemplate <typename T> T get_rand() {\n  return (T)((rand() % 100) - 50);\n}\n\ntemplate <> f16 get_rand() {\n  return nsimd_f32_to_f16(get_rand<f32>());\n}\n\n/* ------------------------------------------------------------------------- */\n\ntemplate <typename T> int test_aliases(size_t n) {\n  std::vector<T> a(n), b(n);\n\n  for (size_t i = 0; i < n; i++) {\n    a[i] = get_rand<T>();\n    b[i] = get_rand<T>();\n  }\n\n  using namespace nsimd;\n  typedef pack<T> pack;\n  size_t step = size_t(len(pack()));\n  for (size_t i = 0; i + step <= n; i += step) {\n    pack tmp1 = loadu<pack>(&a[i]);\n    pack tmp2 = loadu<pack>(&b[i]);\n    if (any(fabs(tmp1) != abs(tmp1))) {\n      return -1;\n    }\n    if (any(fmin(tmp1, tmp2) != 
min(tmp1, tmp2))) {\n      return -1;\n    }\n    if (any(fmax(tmp1, tmp2) != max(tmp1, tmp2))) {\n      return -1;\n    }\n  }\n\n  return 0;\n}\n\n/* ------------------------------------------------------------------------- */\n\nint main() {\n  return test_aliases<i8>(2048) || test_aliases<u8>(2048) ||\n         test_aliases<i16>(2048) || test_aliases<u16>(2048) ||\n         test_aliases<f16>(2048) || test_aliases<i32>(2048) ||\n         test_aliases<u32>(2048) || test_aliases<f32>(2048) ||\n         test_aliases<i64>(2048) || test_aliases<u64>(2048) ||\n         test_aliases<f64>(2048);\n}\n"
  },
  {
    "path": "tests/fp16.prec11.c",
    "content": "/*\n\nCopyright (c) 2019 Agenium Scale\n\nPermission is hereby granted, free of charge, to any person obtaining a copy\nof this software and associated documentation files (the \"Software\"), to deal\nin the Software without restriction, including without limitation the rights\nto use, copy, modify, merge, publish, distribute, sublicense, and/or sell\ncopies of the Software, and to permit persons to whom the Software is\nfurnished to do so, subject to the following conditions:\n\nThe above copyright notice and this permission notice shall be included in all\ncopies or substantial portions of the Software.\n\nTHE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\nIMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\nFITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\nAUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\nLIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\nOUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\nSOFTWARE.\n\n*/\n\n#define _POSIX_C_SOURCE 200112L\n\n#include <math.h>\n#include <nsimd/nsimd.h>\n#include <stdio.h>\n#include <stdlib.h>\n\n/* ------------------------------------------------------------------------- */\n\nfloat via_fp16(float a) { return nsimd_f16_to_f32(nsimd_f32_to_f16(a)); }\n\n/* ------------------------------------------------------------------------- */\n\nfloat mk_fp32(int mantissa, int exponent) {\n  return (float)ldexp((double)mantissa, exponent);\n}\n\n/* ------------------------------------------------------------------------- */\n\nint test_f16_to_f32(u16 val, u32 expected) {\n  f32 fexpected = nsimd_scalar_reinterpret_f32_u32(expected);\n  f32 res = nsimd_u16_to_f32(val);\n  u32 ures = nsimd_scalar_reinterpret_u32_f32(res);\n  if ((nsimd_isnan_f32(fexpected) && !nsimd_isnan_f32(res)) ||\n      (!nsimd_isnan_f32(fexpected) && ures != expected)) {\n    
fprintf(stdout,\n            \"Error, nsimd_f16_to_f32: expected %e(0x%x) but got %e(0x%x) \\n\",\n            (f64)fexpected, expected, (f64)res, ures);\n    fflush(stdout);\n    return 1;\n  }\n\n  return 0;\n}\n\n/* ------------------------------------------------------------------------- */\n\nint test_f32_to_f16(u32 val, u16 expected) {\n  f16 fres = nsimd_f32_to_f16(nsimd_scalar_reinterpret_f32_u32(val));\n  u16 ures = nsimd_scalar_reinterpret_u16_f16(fres);\n  if (ures != expected) {\n    fprintf(stdout, \"Error, nsimd_f32_to_f16: expected 0x%x but got 0x%x \\n\",\n            expected, ures);\n    fflush(stdout);\n    return 1;\n  }\n\n  return 0;\n}\n\n/* ------------------------------------------------------------------------- */\n\nint main(void) {\n#ifndef NSIMD_NO_IEEE754\n  const float infty = nsimd_scalar_reinterpret_f32_u32(0x7F800000);\n  const float m_infty = nsimd_scalar_reinterpret_f32_u32(0xFF800000);\n  const float nan = nsimd_scalar_reinterpret_f32_u32(0x7FC00000);\n#endif\n  int i;\n\n  /* Some corner cases first. 
*/\n  if (test_f16_to_f32(0x0000, 0x0)) {\n    return EXIT_FAILURE;\n  }\n  if (test_f16_to_f32(0x8000, 0x80000000)) {\n    return EXIT_FAILURE;\n  }\n  if (test_f16_to_f32(0x3C00, 0x3f800000)) {\n    return EXIT_FAILURE;\n  }\n  if (test_f16_to_f32(0x13e, 0x379F0000)) { /* 1.8954277E-5 */\n    return EXIT_FAILURE;\n  }\n  if (test_f16_to_f32(0x977e, 0xBAEFC000)) { /* -1.8291473E-3 */\n    return EXIT_FAILURE;\n  }\n\n  if (test_f32_to_f16(0xC7BDC4FC, 0xFC00)) { /* -97161.97 */\n    return EXIT_FAILURE;\n  }\n\n  if (test_f32_to_f16(0x37c3642c, 0x187)) { /* 2.329246e-05 */\n    return EXIT_FAILURE;\n  }\n\n  if (test_f32_to_f16(0xb314e840, 0x8001)) {\n    return EXIT_FAILURE;\n  }\n\n  /* Test rounding when the input f32 is perfectly between 2 f16 */\n  if (test_f32_to_f16(0xC66AD000, 0xf356)) {\n    return EXIT_FAILURE;\n  }\n\n  /* Close to ±Inf */\n  if (test_f32_to_f16(0x477fefff, 0x7bff)) {\n    return EXIT_FAILURE;\n  }\n  if (test_f32_to_f16(0x477ff000, 0x7c00)) {\n    return EXIT_FAILURE;\n  }\n  if (test_f32_to_f16(0xC77fefff, 0xfbff)) {\n    return EXIT_FAILURE;\n  }\n  if (test_f32_to_f16(0xC77ff000, 0xfc00)) {\n    return EXIT_FAILURE;\n  }\n\n  /* Close to ±0 */\n  if (test_f32_to_f16(0x33000001, 0x0001)) {\n    return EXIT_FAILURE;\n  }\n  if (test_f32_to_f16(0x33000000, 0x0000)) {\n    return EXIT_FAILURE;\n  }\n  if (test_f32_to_f16(0xB3000001, 0x8001)) {\n    return EXIT_FAILURE;\n  }\n  if (test_f32_to_f16(0xB3000000, 0x8000)) {\n    return EXIT_FAILURE;\n  }\n\n  /* Close to the denormal limit */\n  if (test_f32_to_f16(0x38800000, 0x0400)) {\n    return EXIT_FAILURE;\n  }\n  if (test_f32_to_f16(0x387fffff, 0x0400)) {\n    return EXIT_FAILURE;\n  }\n\n  /* NaN special value (Copy Intel intrinsics which set the MSB of the mantissa\n   * of NaNs to 1 when converting f16 to f32). 
*/\n  if (test_f16_to_f32(0xfcf8, 0xff9f0000)) {\n    return EXIT_FAILURE;\n  }\n\n#ifndef NSIMD_NO_IEEE754\n  if (via_fp16(mk_fp32(1, 20)) != infty) {\n    fprintf(stdout, \"... Error, %i \\n\", __LINE__);\n    fflush(stdout);\n    return EXIT_FAILURE;\n  }\n  if (via_fp16(mk_fp32(-1, 20)) != m_infty) {\n    fprintf(stdout, \"... Error, %i \\n\", __LINE__);\n    fflush(stdout);\n    return EXIT_FAILURE;\n  }\n  if (!nsimd_isnan_f32(via_fp16(nan))) {\n    fprintf(stdout, \"... Error, %i \\n\", __LINE__);\n    fflush(stdout);\n    return EXIT_FAILURE;\n  }\n#endif\n\n  /* Some random inputs */\n  for (i = 0; i < 100; i++) {\n    float a = (float)rand() / (float)RAND_MAX;\n    if (fabsf(a - via_fp16(a)) > ldexpf(1.0, -9)) {\n      return EXIT_FAILURE;\n    }\n  }\n\n  fprintf(stdout, \"... OK\\n\");\n  fflush(stdout);\n  return EXIT_SUCCESS;\n}\n"
  },
  {
    "path": "tests/get_pack.cpp",
    "content": "/*\n\nCopyright (c) 2020 Agenium Scale\n\nPermission is hereby granted, free of charge, to any person obtaining a copy\nof this software and associated documentation files (the \"Software\"), to deal\nin the Software without restriction, including without limitation the rights\nto use, copy, modify, merge, publish, distribute, sublicense, and/or sell\ncopies of the Software, and to permit persons to whom the Software is\nfurnished to do so, subject to the following conditions:\n\nThe above copyright notice and this permission notice shall be included in all\ncopies or substantial portions of the Software.\n\nTHE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\nIMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\nFITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\nAUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\nLIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\nOUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\nSOFTWARE.\n\n*/\n\n#define STATUS \"test of get_pack over all types\"\n\n#include \"tests_helpers.hpp\"\n\n// ----------------------------------------------------------------------------\n// Little helper for scope memory\n\n// ----------------------------------------------------------------------------\n\ntemplate <typename T> bool get_pack_from_pack_N_1() {\n\n  LOG_TEST_DEBUG(\"get_pack_from_pack_N_1\", T);\n\n  nsimd::pack<T, 1> pack_1(42);\n  nsimd::pack<T, 1> v0_get = nsimd::get_pack<0>(pack_1);\n\n  nsimd::scoped_aligned_mem_for<T> expected(NSIMD_MAX_LEN_BIT / 8);\n  nsimd::scoped_aligned_mem_for<T> computed(NSIMD_MAX_LEN_BIT / 8);\n\n  return nsimd_tests::check_pack_expected_vs_computed(\n      pack_1, v0_get, \"nsimd::pack<T, 1>\", \"nsimd::pack<T, 1>\", expected.get(),\n      computed.get());\n}\n\n// 
----------------------------------------------------------------------------\n\ntemplate <typename T> bool get_pack_from_packx2_N_3() {\n\n  LOG_TEST_DEBUG(\"get_pack_from_packx2_N_3\", T);\n\n  nsimd::pack<T, 3> v0(42);\n  nsimd::pack<T, 3> v1(24);\n\n  nsimd::packx2<T, 3> packx2_3;\n  packx2_3.v0 = v0;\n  packx2_3.v1 = v1;\n\n  nsimd::scoped_aligned_mem_for<T> expected(3 * NSIMD_MAX_REGISTER_SIZE_BYTES);\n  nsimd::scoped_aligned_mem_for<T> computed(3 * NSIMD_MAX_REGISTER_SIZE_BYTES);\n\n  nsimd::pack<T, 3> v0_get = nsimd::get_pack<0>(packx2_3);\n  if (!nsimd_tests::check_pack_expected_vs_computed(\n          v0, v0_get, \"nsimd::packx2<T, 3>.v0\", \"nsimd::pack<T, 3>\",\n          expected.get(), computed.get())) {\n    return false;\n  }\n\n  nsimd::pack<T, 3> v1_get = nsimd::get_pack<1>(packx2_3);\n  return nsimd_tests::check_pack_expected_vs_computed(\n      v1, v1_get, \"nsimd::packx2<T, 3>.v1\", \"nsimd::pack<T, 3>\",\n      expected.get(), computed.get());\n}\n\n// ----------------------------------------------------------------------------\n\ntemplate <typename T> bool get_pack_from_packx3_N_3() {\n\n  LOG_TEST_DEBUG(\"get_pack_from_packx3_N_3\", T);\n\n  nsimd::pack<T, 3> v0(42);\n  nsimd::pack<T, 3> v1(24);\n  nsimd::pack<T, 3> v2(66);\n\n  nsimd::packx3<T, 3> packx3_3;\n\n  packx3_3.v0 = v0;\n  packx3_3.v1 = v1;\n  packx3_3.v2 = v2;\n\n  nsimd::scoped_aligned_mem_for<T> expected(3 * NSIMD_MAX_REGISTER_SIZE_BYTES);\n  nsimd::scoped_aligned_mem_for<T> computed(3 * NSIMD_MAX_REGISTER_SIZE_BYTES);\n\n  nsimd::pack<T, 3> v0_get = nsimd::get_pack<0>(packx3_3);\n  if (!nsimd_tests::check_pack_expected_vs_computed(\n          v0, v0_get, \"nsimd::packx3<T, 3>.v0\", \"nsimd::pack<T, 3>\",\n          expected.get(), computed.get())) {\n    return false;\n  }\n\n  nsimd::pack<T, 3> v1_get = nsimd::get_pack<1>(packx3_3);\n  if (!nsimd_tests::check_pack_expected_vs_computed(\n          v1, v1_get, \"nsimd::packx3<T, 3>.v1\", \"nsimd::pack<T, 3>\",\n          
expected.get(), computed.get())) {\n    return false;\n  }\n\n  nsimd::pack<T, 3> v2_get = nsimd::get_pack<2>(packx3_3);\n  return nsimd_tests::check_pack_expected_vs_computed(\n      v2, v2_get, \"nsimd::packx3<T, 3>.v2\", \"nsimd::pack<T, 3>\",\n      expected.get(), computed.get());\n}\n\n// ----------------------------------------------------------------------------\n\ntemplate <typename T> bool get_pack_from_packx4_N_3() {\n\n  LOG_TEST_DEBUG(\"get_pack_from_packx4_N_3\", T);\n\n  nsimd::pack<T, 3> v0(42);\n  nsimd::pack<T, 3> v1(24);\n  nsimd::pack<T, 3> v2(66);\n  nsimd::pack<T, 3> v3(90);\n\n  nsimd::packx4<T, 3> packx4_3;\n\n  packx4_3.v0 = v0;\n  packx4_3.v1 = v1;\n  packx4_3.v2 = v2;\n  packx4_3.v3 = v3;\n\n  nsimd::scoped_aligned_mem_for<T> expected(3 * NSIMD_MAX_REGISTER_SIZE_BYTES);\n  nsimd::scoped_aligned_mem_for<T> computed(3 * NSIMD_MAX_REGISTER_SIZE_BYTES);\n\n  nsimd::pack<T, 3> v0_get = nsimd::get_pack<0>(packx4_3);\n  if (!nsimd_tests::check_pack_expected_vs_computed(\n          v0, v0_get, \"nsimd::packx4<T, 3>.v0\", \"nsimd::pack<T, 3>\",\n          expected.get(), computed.get())) {\n    return false;\n  }\n\n  nsimd::pack<T, 3> v1_get = nsimd::get_pack<1>(packx4_3);\n  if (!nsimd_tests::check_pack_expected_vs_computed(\n          v1, v1_get, \"nsimd::packx4<T, 3>.v1\", \"nsimd::pack<T, 3>\",\n          expected.get(), computed.get())) {\n    return false;\n  }\n\n  nsimd::pack<T, 3> v2_get = nsimd::get_pack<2>(packx4_3);\n  if (!nsimd_tests::check_pack_expected_vs_computed(\n          v2, v2_get, \"nsimd::packx4<T, 3>.v2\", \"nsimd::pack<T, 3>\",\n          expected.get(), computed.get())) {\n    return false;\n  }\n\n  nsimd::pack<T, 3> v3_get = nsimd::get_pack<3>(packx4_3);\n  return nsimd_tests::check_pack_expected_vs_computed(\n      v3, v3_get, \"nsimd::packx4<T, 3>.v3\", \"nsimd::pack<T, 3>\",\n      expected.get(), computed.get());\n}\n\n// ----------------------------------------------------------------------------\n\ntemplate 
<typename T> bool test_all() {\n  if (!get_pack_from_pack_N_1<T>()) {\n    return 0;\n  }\n  if (!get_pack_from_packx2_N_3<T>()) {\n    return 0;\n  }\n  if (!get_pack_from_packx3_N_3<T>()) {\n    return 0;\n  }\n  if (!get_pack_from_packx4_N_3<T>()) {\n    return 0;\n  }\n  return 1;\n}\n\n// ----------------------------------------------------------------------------\n\nint main(void) {\n\n  if (!test_all<i8>() || !test_all<u8>() || !test_all<i16>() ||\n      !test_all<u16>() || !test_all<i32>() || !test_all<u32>() ||\n      !test_all<i64>() || !test_all<u64>() || !test_all<f32>() ||\n      !test_all<f64>()) {\n    return -1;\n  }\n\n  fprintf(stdout, STATUS \"... OK\\n\");\n  fflush(stdout);\n  return 0;\n}\n"
  },
  {
    "path": "tests/memory.cpp",
    "content": "/*\n\nCopyright (c) 2019 Agenium Scale\n\nPermission is hereby granted, free of charge, to any person obtaining a copy\nof this software and associated documentation files (the \"Software\"), to deal\nin the Software without restriction, including without limitation the rights\nto use, copy, modify, merge, publish, distribute, sublicense, and/or sell\ncopies of the Software, and to permit persons to whom the Software is\nfurnished to do so, subject to the following conditions:\n\nThe above copyright notice and this permission notice shall be included in all\ncopies or substantial portions of the Software.\n\nTHE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\nIMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\nFITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\nAUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\nLIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\nOUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\nSOFTWARE.\n\n*/\n\n#include <nsimd/nsimd.h>\n\n#include <vector>\n#include <cstdlib>\n\nint test_aligned_alloc() {\n  void *ptr = nsimd_aligned_alloc(17);\n  if (ptr == NULL || ((size_t)ptr % NSIMD_MAX_ALIGNMENT) != 0) {\n    return EXIT_FAILURE;\n  }\n  nsimd_aligned_free(ptr);\n  return EXIT_SUCCESS;\n}\n\ntemplate <typename T>\nint test_aligned_alloc_for() {\n  void *ptr = nsimd::aligned_alloc(17);\n  if (ptr == NULL || ((size_t)ptr % NSIMD_MAX_ALIGNMENT) != 0) {\n    return EXIT_FAILURE;\n  }\n  nsimd::aligned_free(ptr);\n  return EXIT_SUCCESS;\n}\n\ntemplate <typename T>\nint test_allocator_for() {\n  std::vector< T, nsimd::allocator<T> > v(17);\n  if (v.size() != 17 || ((size_t)v.data() % NSIMD_MAX_ALIGNMENT) != 0) {\n    return EXIT_FAILURE;\n  }\n\n  v.resize(17017);\n  if (v.size() != 17017 || ((size_t)v.data() % NSIMD_MAX_ALIGNMENT) != 0) {\n    return EXIT_FAILURE;\n  }\n\n  
v.clear();\n  if (v.size() != 0) {\n    return EXIT_FAILURE;\n  }\n\n  return EXIT_SUCCESS;\n}\n\nint main() {\n  return test_aligned_alloc() ||\n         test_aligned_alloc_for<i8>() ||\n         test_aligned_alloc_for<u8>() ||\n         test_aligned_alloc_for<i16>() ||\n         test_aligned_alloc_for<u16>() ||\n         test_aligned_alloc_for<i32>() ||\n         test_aligned_alloc_for<u32>() ||\n         test_aligned_alloc_for<i64>() ||\n         test_aligned_alloc_for<u64>() ||\n         test_aligned_alloc_for<f32>() ||\n         test_aligned_alloc_for<f64>() ||\n         test_allocator_for<i8>() ||\n         test_allocator_for<u8>() ||\n         test_allocator_for<i16>() ||\n         test_allocator_for<u16>() ||\n         test_allocator_for<i32>() ||\n         test_allocator_for<u32>() ||\n         test_allocator_for<i64>() ||\n         test_allocator_for<u64>() ||\n         test_allocator_for<f32>() ||\n         test_allocator_for<f64>();\n}\n"
  },
  {
    "path": "tests/memory.prec11.c",
    "content": "/*\n\nCopyright (c) 2019 Agenium Scale\n\nPermission is hereby granted, free of charge, to any person obtaining a copy\nof this software and associated documentation files (the \"Software\"), to deal\nin the Software without restriction, including without limitation the rights\nto use, copy, modify, merge, publish, distribute, sublicense, and/or sell\ncopies of the Software, and to permit persons to whom the Software is\nfurnished to do so, subject to the following conditions:\n\nThe above copyright notice and this permission notice shall be included in all\ncopies or substantial portions of the Software.\n\nTHE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\nIMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\nFITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\nAUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\nLIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\nOUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\nSOFTWARE.\n\n*/\n\n#include <nsimd/nsimd.h>\n#include <stdlib.h>\n\nint main(void) {\n  void *ptr = nsimd_aligned_alloc(17);\n  if (ptr == NULL || ((size_t)ptr % NSIMD_MAX_ALIGNMENT) != 0) {\n    return EXIT_FAILURE;\n  }\n  nsimd_aligned_free(ptr);\n  return EXIT_SUCCESS;\n}\n"
  },
  {
    "path": "tests/modules/common.hpp",
    "content": "/*\n\nCopyright (c) 2021 Agenium Scale\n\nPermission is hereby granted, free of charge, to any person obtaining a copy\nof this software and associated documentation files (the \"Software\"), to deal\nin the Software without restriction, including without limitation the rights\nto use, copy, modify, merge, publish, distribute, sublicense, and/or sell\ncopies of the Software, and to permit persons to whom the Software is\nfurnished to do so, subject to the following conditions:\n\nThe above copyright notice and this permission notice shall be included in all\ncopies or substantial portions of the Software.\n\nTHE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\nIMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\nFITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\nAUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\nLIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\nOUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\nSOFTWARE.\n\n*/\n\n#ifndef NSIMD_MODULES_SPMD_COMMON_HPP\n#define NSIMD_MODULES_SPMD_COMMON_HPP\n\n#include <nsimd/nsimd.h>\n#include <nsimd/scalar_utilities.h>\n#include <iostream>\n#include <cstring>\n#include <cerrno>\n#include <cstdlib>\n\n// ----------------------------------------------------------------------------\n// Common code for devices\n\n#if defined(NSIMD_CUDA) || defined(NSIMD_ROCM)\n\ntemplate <typename T> __device__ bool cmp_Ts(T a, T b) { return a == b; }\n\n__device__ bool cmp_Ts(__half a, __half b) {\n  return __half_as_short(a) == __half_as_short(b);\n}\n\n__device__ bool cmp_Ts(float a, float b) {\n  return __float_as_int(a) == __float_as_int(b);\n}\n\n__device__ bool cmp_Ts(double a, double b) {\n  return __double_as_longlong(a) == __double_as_longlong(b);\n}\n\n#elif defined(NSIMD_ONEAPI)\n\ntemplate <typename T> bool cmp_Ts(const T a, const T b) { return a == b; 
}\n\nbool cmp_Ts(sycl::half a, const sycl::half b) {\n  return nsimd::gpu_reinterpret(u16(), a) ==\n         nsimd::gpu_reinterpret(u16(), b);\n}\n\nbool cmp_Ts(sycl::cl_float a, sycl::cl_float b) {\n  return nsimd::gpu_reinterpret(u32(), a) ==\n         nsimd::gpu_reinterpret(u32(), b);\n}\n\nbool cmp_Ts(sycl::cl_double a, sycl::cl_double b) {\n  return nsimd::gpu_reinterpret(u64(), a) ==\n         nsimd::gpu_reinterpret(u64(), b);\n}\n\n#endif\n\n// ----------------------------------------------------------------------------\n// CUDA\n\n#if defined(NSIMD_CUDA)\n\n// perform reduction on blocks first, note that this could be optimized\n// but to check correctness we don't need it now\ntemplate <typename T>\n__global__ void device_cmp_blocks(T *src1, T *src2, int n) {\n  extern __shared__ char buf_[]; // size of a block\n  T *buf = (T*)buf_;\n  int tid = threadIdx.x;\n  int i = tid + blockIdx.x * blockDim.x;\n  if (i < n) {\n    buf[tid] = T(cmp_Ts(src1[i], src2[i]) ? 1 : 0);\n  }\n\n  const int block_start = blockIdx.x * blockDim.x;\n  const int block_end = block_start + blockDim.x;\n\n  int size;\n  if (block_end < n) {\n    size = blockDim.x;\n  } else {\n    size = n - block_start;\n  }\n\n  __syncthreads();\n  for (int s = size / 2; s != 0; s /= 2) {\n    if (tid < s && i < n) {\n      buf[tid] = nsimd::gpu_mul(buf[tid], buf[tid + s]);\n      __syncthreads();\n    }\n  }\n  if (tid == 0) {\n    src1[i] = buf[0];\n  }\n}\n\ntemplate <typename T>\n__global__ void device_cmp_array(int *dst, T *src1, int n) {\n  // reduction on the whole vector\n  T buf = T(1);\n  for (int i = 0; i < n; i += blockDim.x) {\n    buf = nsimd::gpu_mul(buf, src1[i]);\n  }\n  int i = threadIdx.x + blockIdx.x * blockDim.x;\n  if (i == 0) {\n    dst[0] = int(buf);\n  }\n}\n\ntemplate <typename T> bool cmp(T *src1, T *src2, unsigned int n) {\n  int host_ret;\n  int *device_ret;\n  if (cudaMalloc((void **)&device_ret, sizeof(int)) != cudaSuccess) {\n    std::cerr << \"ERROR: cannot 
cudaMalloc \" << sizeof(int) << \" bytes\\n\";\n    exit(EXIT_FAILURE);\n  }\n  device_cmp_blocks<<<(n + 127) / 128, 128, 128 * sizeof(T)>>>(src1, src2,\n                                                               int(n));\n  device_cmp_array<<<(n + 127) / 128, 128>>>(device_ret, src1, int(n));\n  cudaMemcpy((void *)&host_ret, (void *)device_ret, sizeof(int),\n             cudaMemcpyDeviceToHost);\n  cudaFree((void *)device_ret);\n  return bool(host_ret);\n}\n\ntemplate <typename T> bool cmp(T *src1, T *src2, unsigned int n, int) {\n  return cmp(src1, src2, n);\n}\n\ntemplate <typename T> void del(T *ptr) { cudaFree(ptr); }\n\n#elif defined(NSIMD_ROCM)\n\n// ----------------------------------------------------------------------------\n// ROCm\n\n// perform reduction on blocks first, note that this could be optimized\n// but to check correctness we don't need it now\ntemplate <typename T>\n__global__ void device_cmp_blocks(T *src1, T *src2, size_t n) {\n  extern __shared__ char buf_[]; // size of a block\n  T *buf = (T*)buf_;\n  size_t tid = hipThreadIdx_x;\n  size_t i = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n  if (i < n) {\n    buf[tid] = T(cmp_Ts(src1[i], src2[i]) ? 
1 : 0);\n  }\n\n  const size_t block_start = hipBlockIdx_x * hipBlockDim_x;\n  const size_t block_end = block_start + hipBlockDim_x;\n\n  size_t size;\n  if (block_end < n) {\n    size = hipBlockDim_x;\n  } else {\n    size = n - block_start;\n  }\n\n  __syncthreads();\n  for (size_t s = size / 2; s != 0; s /= 2) {\n    if (tid < s && i < n) {\n      buf[tid] = nsimd::gpu_mul(buf[tid], buf[tid + s]);\n      __syncthreads();\n    }\n  }\n  if (tid == 0) {\n    src1[i] = buf[0];\n  }\n}\n\ntemplate <typename T>\n__global__ void device_cmp_array(int *dst, T *src1, size_t n) {\n  // reduction on the whole vector\n  T buf = T(1);\n  for (size_t i = 0; i < n; i += blockDim.x) {\n    buf = nsimd::gpu_mul(buf, src1[i]);\n  }\n  size_t i = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;\n  if (i == 0) {\n    dst[0] = int(buf);\n  }\n}\n\ntemplate <typename T> bool cmp(T *src1, T *src2, size_t n) {\n  int host_ret;\n  int *device_ret;\n  if (hipMalloc((void **)&device_ret, sizeof(int)) != hipSuccess) {\n    return false;\n  }\n  hipLaunchKernelGGL(device_cmp_blocks, (n + 127) / 128, 128, 128 * sizeof(T),\n                     0, src1, src2, n);\n  hipLaunchKernelGGL(device_cmp_array, (n + 127) / 128, 128, 0, 0, device_ret,\n                     src1, n);\n  hipMemcpy((void *)&host_ret, (void *)device_ret, sizeof(int),\n            hipMemcpyDeviceToHost);\n  hipFree((void *)device_ret);\n  return bool(host_ret);\n}\n\ntemplate <typename T> bool cmp(T *src1, T *src2, size_t n, int) {\n  return cmp(src1, src2, n);\n}\n\ntemplate <typename T> void del(T *ptr) { hipFree(ptr); }\n\n#elif defined(NSIMD_ONEAPI)\n\n// ----------------------------------------------------------------------------\n// oneAPI\n\n// perform reduction on blocks first, note that this could be optimized\n// but to check correctness we don't need it now\ntemplate <typename T>\nvoid device_cmp_blocks(T *const src1, const T *const src2, const size_t n,\n                       sycl::accessor<T, 1, 
sycl::access::mode::read_write,\n                                      sycl::access::target::local>\n                           local_buffer,\n                       sycl::nd_item<1> item) {\n  size_t tid = item.get_local_id().get(0);\n  size_t i = item.get_global_id().get(0);\n\n  if (i < n) {\n    local_buffer[tid] = T(cmp_Ts(src1[i], src2[i]) ? 1 : 0);\n  }\n\n  item.barrier(sycl::access::fence_space::local_space);\n\n  // other approach: see book p 345\n  if (tid == 0) {\n    sycl::ext::oneapi::sub_group sg = item.get_sub_group();\n    src1[i] = sycl::ext::oneapi::reduce_over_group(\n        sg, local_buffer[0], sycl::ext::oneapi::multiplies<T>());\n  }\n}\n\ntemplate <typename T>\nvoid device_cmp_array(int *const dst, const T *const src1, const size_t n,\n                      sycl::nd_item<1> item) {\n  // reduction mul on the whole vector\n  T buf = T(1);\n  sycl::nd_range<1> nd_range = item.get_nd_range();\n  sycl::range<1> range = nd_range.get_local_range();\n  for (size_t i = 0; i < n; i += range.size()) {\n    buf = nsimd::gpu_mul(buf, src1[i]);\n  }\n  size_t i = item.get_global_id().get(0);\n  if (i == 0) {\n    dst[0] = int(buf);\n  }\n}\n\ntemplate <typename T>\nbool cmp(T *const src1, const T *const src2, unsigned int n) {\n\n  const size_t total_num_threads = (size_t)nsimd_kernel_param(n, 128);\n  sycl::queue q = nsimd::oneapi::default_queue();\n\n  sycl::event e1 = q.submit([=](sycl::handler &h) {\n    sycl::accessor<T, 1, sycl::access::mode::read_write,\n                   sycl::access::target::local>\n        local_buffer(128, h);\n\n    h.parallel_for(sycl::nd_range<1>(sycl::range<1>(total_num_threads),\n                                     sycl::range<1>(128)),\n                   [=](sycl::nd_item<1> item_) {\n                     device_cmp_blocks(src1, src2, size_t(n), local_buffer,\n                                       item_);\n                   });\n  });\n  e1.wait_and_throw();\n\n  int *device_ret = nsimd::device_calloc<int>(n);\n  
if (device_ret == NULL) {\n    std::cerr << \"ERROR: cannot sycl::malloc_device \" << sizeof(int)\n              << \" bytes\\n\";\n    exit(EXIT_FAILURE);\n  }\n  sycl::event e2 =\n      q.parallel_for(sycl::nd_range<1>(sycl::range<1>(total_num_threads),\n                                       sycl::range<1>(128)),\n                     [=](sycl::nd_item<1> item_) {\n                       device_cmp_array(device_ret, src1, size_t(n), item_);\n                     });\n  e2.wait_and_throw();\n\n  int host_ret;\n  q.memcpy((void *)&host_ret, (void *)device_ret, sizeof(int)).wait();\n  nsimd::device_free(device_ret);\n\n  return bool(host_ret);\n}\n\ntemplate <typename T> bool cmp(T *src1, T *src2, unsigned int n, double) {\n  return cmp(src1, src2, n);\n}\n\ntemplate <typename T> void del(T *ptr) {\n  sycl::queue q = nsimd::oneapi::default_queue();\n  sycl::free(ptr, q);\n}\n\n#else\n\n// ----------------------------------------------------------------------------\n// SIMD\n\ntemplate <typename T> bool cmp(T *src1, T *src2, unsigned int n) {\n  return memcmp(src1, src2, n * sizeof(T)) == 0;\n}\n\ntemplate <typename T> bool cmp(T *src1, T *src2, unsigned int n, int ufp) {\n  for (unsigned int i = 0; i < n; i++) {\n    if (nsimd::ufp(src1[i], src2[i]) < ufp) {\n      return false;\n    }\n  }\n  return true;\n}\n\n#endif\n\n// ----------------------------------------------------------------------------\n\n#endif\n"
  },
  {
    "path": "tests/nsimd-all.cpp",
    "content": "/*\n\nCopyright (c) 2019 Agenium Scale\n\nPermission is hereby granted, free of charge, to any person obtaining a copy\nof this software and associated documentation files (the \"Software\"), to deal\nin the Software without restriction, including without limitation the rights\nto use, copy, modify, merge, publish, distribute, sublicense, and/or sell\ncopies of the Software, and to permit persons to whom the Software is\nfurnished to do so, subject to the following conditions:\n\nThe above copyright notice and this permission notice shall be included in all\ncopies or substantial portions of the Software.\n\nTHE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\nIMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\nFITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\nAUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\nLIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\nOUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\nSOFTWARE.\n\n*/\n\n#include <nsimd/nsimd-all.hpp>\n#include <iostream>\n#include <cstdlib>\n\n// ----------------------------------------------------------------------------\n\nint main() {\n  using namespace nsimd;\n  const int unroll = 3;\n  typedef pack<float, unroll> upack;\n\n  const int n_max = unroll * NSIMD_MAX_LEN(f32);\n  const int n = len(upack());\n  float buf[n_max];\n\n  for(int i = 0; i < n; i++) {\n    buf[i] = float(i);\n  }\n\n  upack p = loadu<upack>(buf);\n  p = -(p * p) + 1.0f;\n  storeu(buf, p);\n\n  for (int i = 0; i < n; i++) {\n    fprintf(stdout, \"%f vs %f\\n\", double(buf[i]), double(-(i * i) + 1));\n  }\n\n  for (int i = 0; i < n; i++) {\n    if (buf[i] != float(-(i * i) + 1)) {\n      exit(EXIT_FAILURE);\n    }\n  }\n\n  return 0;\n}\n"
  },
  {
    "path": "tests/nsimd.cpp",
    "content": "/*\n\nCopyright (c) 2019 Agenium Scale\n\nPermission is hereby granted, free of charge, to any person obtaining a copy\nof this software and associated documentation files (the \"Software\"), to deal\nin the Software without restriction, including without limitation the rights\nto use, copy, modify, merge, publish, distribute, sublicense, and/or sell\ncopies of the Software, and to permit persons to whom the Software is\nfurnished to do so, subject to the following conditions:\n\nThe above copyright notice and this permission notice shall be included in all\ncopies or substantial portions of the Software.\n\nTHE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\nIMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\nFITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\nAUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\nLIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\nOUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\nSOFTWARE.\n\n*/\n\n#include <nsimd/nsimd.h>\n#include <nsimd/cxx_adv_api.hpp>\n#include <iostream>\n#include <cstdlib>\n\n// ----------------------------------------------------------------------------\n\nvoid test_native_register() {\n  nsimd_cpu_vf32 a = nsimd_set1_cpu_f32(1.0f);\n\n  nsimd::pack<f32, 1, nsimd::cpu> p1(a);\n  nsimd::pack<f32, 1, nsimd::cpu> p2(1.0f);\n\n  if (nsimd_any_cpu_f32(nsimd_ne_cpu_f32(a, p1.native_register()))) {\n    exit(EXIT_FAILURE);\n  }\n\n  if (nsimd_any_cpu_f32(nsimd_ne_cpu_f32(a, nsimd::native_register(p1)))) {\n    exit(EXIT_FAILURE);\n  }\n\n  if (nsimd_any_cpu_f32(nsimd_ne_cpu_f32(nsimd::native_register(a),\n                                         nsimd::native_register(p1)))) {\n    exit(EXIT_FAILURE);\n  }\n\n  if (nsimd_any_cpu_f32(\n          nsimd_ne_cpu_f32(p2.native_register(), p1.native_register()))) {\n    exit(EXIT_FAILURE);\n  }\n}\n\n// 
----------------------------------------------------------------------------\n\nvoid test_output() {\n  nsimd_cpu_vf32 a = nsimd_set1_cpu_f32(1.0f);\n\n  if (nsimd_put_cpu_f32(stdout, NULL, a) == -1) {\n    exit(EXIT_FAILURE);\n  }\n\n  if (nsimd_put_cpu_f32(stdout, \"%f\", a) == -1) {\n    exit(EXIT_FAILURE);\n  }\n\n  fflush(stdout);\n\n  nsimd::pack<f32, 1, nsimd::cpu> p1(a);\n  nsimd::pack<f32, 1, nsimd::cpu> p2(1.0f);\n\n  std::cout << p1 << std::endl << p2 << std::endl;\n}\n\n// ----------------------------------------------------------------------------\n\nvoid test_unroll() {\n  using namespace nsimd;\n  const int unroll = 3;\n  typedef pack<float, unroll> upack;\n\n  const int n_max = unroll * NSIMD_MAX_LEN(f32);\n  const int n = len(upack());\n  float buf[n_max];\n\n  for(int i = 0; i < n; i++) {\n    buf[i] = float(i);\n  }\n\n  upack p = loadu<upack>(buf);\n  p = -(p * p);\n  storeu(buf, p);\n\n  for (int i = 0; i < n; i++) {\n    fprintf(stdout, \"%f vs %f\\n\", double(buf[i]), double(-i * i));\n  }\n\n  for (int i = 0; i < n; i++) {\n    if (buf[i] != float(-(i * i))) {\n      exit(EXIT_FAILURE);\n    }\n  }\n}\n\n// ----------------------------------------------------------------------------\n\nint main(void) {\n  test_native_register();\n  test_output();\n  test_unroll();\n  return 0;\n}\n"
  },
  {
    "path": "tests/nsimd.prec11.c",
    "content": "/*\n\nCopyright (c) 2019 Agenium Scale\n\nPermission is hereby granted, free of charge, to any person obtaining a copy\nof this software and associated documentation files (the \"Software\"), to deal\nin the Software without restriction, including without limitation the rights\nto use, copy, modify, merge, publish, distribute, sublicense, and/or sell\ncopies of the Software, and to permit persons to whom the Software is\nfurnished to do so, subject to the following conditions:\n\nThe above copyright notice and this permission notice shall be included in all\ncopies or substantial portions of the Software.\n\nTHE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\nIMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\nFITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\nAUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\nLIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\nOUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\nSOFTWARE.\n\n*/\n\n#include <nsimd/nsimd.h>\n\nint main(void) {\n  return 0;\n}\n"
  },
  {
    "path": "tests/operator_vector_scalar.cpp",
    "content": "/*\n\nCopyright (c) 2019 Agenium Scale\n\nPermission is hereby granted, free of charge, to any person obtaining a copy\nof this software and associated documentation files (the \"Software\"), to deal\nin the Software without restriction, including without limitation the rights\nto use, copy, modify, merge, publish, distribute, sublicense, and/or sell\ncopies of the Software, and to permit persons to whom the Software is\nfurnished to do so, subject to the following conditions:\n\nThe above copyright notice and this permission notice shall be included in all\ncopies or substantial portions of the Software.\n\nTHE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\nIMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\nFITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\nAUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\nLIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\nOUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\nSOFTWARE.\n\n*/\n\n#include <nsimd/nsimd-all.hpp>\n#include <iostream>\n\nint main() {\n  nsimd::pack<float> a(1.0f);\n  return (nsimd::any(a != 0) != 0 ? 0 : 1);\n}\n"
  },
  {
    "path": "tests/shifts.cpp",
    "content": "/*\n\nCopyright (c) 2019 Agenium Scale\n\nPermission is hereby granted, free of charge, to any person obtaining a copy\nof this software and associated documentation files (the \"Software\"), to deal\nin the Software without restriction, including without limitation the rights\nto use, copy, modify, merge, publish, distribute, sublicense, and/or sell\ncopies of the Software, and to permit persons to whom the Software is\nfurnished to do so, subject to the following conditions:\n\nThe above copyright notice and this permission notice shall be included in all\ncopies or substantial portions of the Software.\n\nTHE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\nIMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\nFITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\nAUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\nLIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\nOUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\nSOFTWARE.\n\n*/\n\n#include <nsimd/nsimd-all.hpp>\n#include <cstdlib>\n\n// ----------------------------------------------------------------------------\n\nint main() {\n  using namespace nsimd;\n  const int unroll = 3;\n  typedef pack<unsigned int, unroll> upack;\n\n  const int n_max = unroll * NSIMD_MAX_LEN(f32);\n  const int n = len(upack());\n  unsigned int buf[n_max];\n\n  for(int i = 0; i < n; i++) {\n    buf[i] = (unsigned int)i;\n  }\n  upack v = loadu<upack>(buf);\n\n  if (any(((v << 4) >> 4) != v)) {\n    exit(EXIT_FAILURE);\n  }\n\n  return 0;\n}\n"
  },
  {
    "path": "tests/templated_loads_stores.cpp",
    "content": "/*\n\nCopyright (c) 2019 Agenium Scale\n\nPermission is hereby granted, free of charge, to any person obtaining a copy\nof this software and associated documentation files (the \"Software\"), to deal\nin the Software without restriction, including without limitation the rights\nto use, copy, modify, merge, publish, distribute, sublicense, and/or sell\ncopies of the Software, and to permit persons to whom the Software is\nfurnished to do so, subject to the following conditions:\n\nThe above copyright notice and this permission notice shall be included in all\ncopies or substantial portions of the Software.\n\nTHE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\nIMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\nFITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\nAUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\nLIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\nOUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\nSOFTWARE.\n\n*/\n\n#include <nsimd/nsimd-all.hpp>\n#include <iostream>\n#include <cstdlib>\n\n// ----------------------------------------------------------------------------\n\nfloat *getmem(nsimd::aligned, int sz) {\n  float *ret = (float *)nsimd::aligned_alloc(sz);\n  if (ret == NULL) {\n    std::cerr << \"ERROR: cannot malloc aligned memory\" << std::endl;\n  }\n  return ret;\n}\n\nfloat *getmem(nsimd::unaligned, int sz) {\n  return getmem(nsimd::aligned(), 2 * sz) + 1;\n}\n\n// ----------------------------------------------------------------------------\n\ntemplate <typename Alignment> int test() {\n  using namespace nsimd;\n\n  f32 *buf = getmem(Alignment(), NSIMD_MAX_LEN(f32));\n  memset((void *)buf, 0, NSIMD_MAX_LEN(f32));\n\n  pack<f32> v =\n      masko_load<Alignment>(packl<f32>(false), buf, set1<pack<f32> >(1.0f));\n\n  if (any(v != 1.0f)) {\n    std::cerr << \"[1]: v != [ 1.0f ... 
1.0f ]\" << std::endl;\n    return -1;\n  }\n\n  v = load<pack<f32>, Alignment>(buf);\n  if (any(v != 0.0f)) {\n    std::cerr << \"[2]: v != [ 0.0f ... 0.0f ]\" << std::endl;\n    return -1;\n  }\n\n  v = set1<pack<f32> >(1.0f);\n  store<Alignment>(buf, v);\n  for (int i = 0; i < len(pack<f32>()); i++) {\n    if (buf[i] != 1.0f) {\n      std::cerr << \"[3]: buf != [ 1.0f ... 1.0f ]\" << std::endl;\n      return -1;\n    }\n  }\n\n  v = set1<pack<f32> >(2.0f);\n  mask_store<Alignment>(packl<f32>(false), buf, v);\n  for (int i = 0; i < len(pack<f32>()); i++) {\n    if (buf[i] != 1.0f) {\n      std::cerr << \"[4]: buf != [ 1.0f ... 1.0f ]\" << std::endl;\n      return -1;\n    }\n  }\n\n  v = maskz_load<Alignment>(packl<f32>(false), buf);\n  if (any(v != 0.0f)) {\n    std::cerr << \"[5]: v != [ 0.0f ... 0.0f ]\" << std::endl;\n    return -1;\n  }\n\n  return 0;\n}\n\n// ----------------------------------------------------------------------------\n\nint main() { return test<nsimd::aligned>() || test<nsimd::unaligned>(); }\n"
  },
  {
    "path": "tests/tests_helpers.hpp",
    "content": "#ifndef TESTS_HELPERS_HPP\n#define TESTS_HELPERS_HPP\n\n#include <cerrno>\n#include <cstdio>\n#include <cstdlib>\n#include <cstring>\n#include <nsimd/cxx_adv_api.hpp>\n#include <nsimd/nsimd.h>\n\n#define NSIMD_LOG_DEBUG 0\n\n#define NSIMD_MAX_REGISTER_SIZE_BYTES NSIMD_MAX_LEN_BIT / 8\n\n#define LOG_TEST_DEBUG(test_name, T)                                          \\\n  do {                                                                        \\\n    if (NSIMD_LOG_DEBUG) {                                                    \\\n      fprintf(stdout, \"%s%s%s%s%s\", \"\\n--------- \",                           \\\n              nsimd_tests::get_type_str(T()), \": \", test_name,                \\\n              \"---------------\\n\\n\");                                         \\\n    }                                                                         \\\n  } while (0)\n\n#define LOG_MEMORY_CONTENT_DEBUG(vout, len_, memory_type)                     \\\n  do {                                                                        \\\n    if (NSIMD_LOG_DEBUG) {                                                    \\\n      nsimd_tests::print(vout, len_, memory_type);                            \\\n    }                                                                         \\\n  } while (0)\n\n#define CHECK(a)                                                              \\\n  {                                                                           \\\n    errno = 0;                                                                \\\n    if (!(a)) {                                                               \\\n      fprintf(stderr, \"ERROR: \" #a \":%d: %s\\n\", __LINE__, strerror(errno));   \\\n      fflush(stderr);                                                         \\\n      exit(EXIT_FAILURE);                                                     \\\n    }                                                                         \\\n  
}\n\n#define TEST_NSIMD_FALSE 0\n#define TEST_NSIMD_TRUE 1\n#define TEST_NSIMD_ERROR -1\n\n/* ----------------------------------------------------------------------- */\n\nnamespace nsimd_tests {\n\ntemplate <typename T>\nint expected_not_equal_computed(const T expected, const T computed) {\n  return expected != computed;\n}\n\nnamespace fprintf_helper {\n// silent the warning for implicit conversion from ‘float’ to ‘double’ when\n// passing argument to fprintf\ntemplate <typename T> struct f64_if_f32_else_T { typedef T value_type; };\ntemplate <> struct f64_if_f32_else_T<f32> { typedef f64 value_type; };\n\nconst char *specifier(i8) { return \"%hhu\"; }\nconst char *specifier(u8) { return \"%hhu\"; }\nconst char *specifier(i16) { return \"%hd\"; }\nconst char *specifier(u16) { return \"%hu\"; }\nconst char *specifier(i32) { return \"%d\"; }\nconst char *specifier(u32) { return \"%u\"; }\nconst char *specifier(i64) { return \"%ld\"; }\nconst char *specifier(u64) { return \"%lu\"; }\nconst char *specifier(f32) { return \"%f\"; }\nconst char *specifier(f64) { return \"%f\"; }\n\n} // namespace fprintf_helper\n\nconst char *get_type_str(i8) { return \"i8\"; }\nconst char *get_type_str(u8) { return \"u8\"; }\nconst char *get_type_str(i16) { return \"i16\"; }\nconst char *get_type_str(u16) { return \"u16\"; }\nconst char *get_type_str(i32) { return \"i32\"; }\nconst char *get_type_str(u32) { return \"u32\"; }\nconst char *get_type_str(i64) { return \"i64\"; }\nconst char *get_type_str(u64) { return \"u64\"; }\nconst char *get_type_str(f32) { return \"f32\"; }\nconst char *get_type_str(f64) { return \"f64\"; }\n\ntemplate <typename T>\nvoid print(T *const arr, const nsimd_nat len_, const char *msg) {\n  fprintf(stdout, \"%-24s: \", msg);\n  char formatter[12];\n  strcpy(formatter, \"%s\");\n  strcat(formatter, fprintf_helper::specifier(T()));\n  for (nsimd_nat ii = 0; ii < len_; ++ii) {\n    fprintf(\n        stdout, formatter, 0 == ii ? 
\"{\" : \", \",\n        (typename fprintf_helper::f64_if_f32_else_T<T>::value_type)arr[ii]);\n  }\n  fprintf(stdout, \"%s\", \"}\\n\");\n  fflush(stdout);\n}\n\ntemplate <typename T>\nvoid init_arrays(T *const vout_expected, T *const vout_computed,\n                 const nsimd_nat len_) {\n  for (nsimd_nat ii = 0; ii < len_; ++ii) {\n    vout_expected[ii] = (T)-1;\n    vout_computed[ii] = (T)1;\n  }\n}\n\n/* ----------------------------- storea ---------------------------- */\n\n// storea for all packx[Y]<1 .. N> Y in {1, 2, 3, 4}\n\n// struct storea_recurs_helper for packx[Y]<1 .. N> y in {2, 3, 4}\n\n// General definition\ntemplate <typename T, int N, typename SimdExt,\n          template <typename, int, typename> class pack_t, int VIx,\n          bool EndRecurs>\nstruct storea_recurs_helper {};\n\n// Recursive case\ntemplate <typename T, int N, typename SimdExt,\n          template <typename, int, typename> class pack_t, int VIx>\nstruct storea_recurs_helper<T, N, SimdExt, pack_t, VIx, false> {\n  void operator()(T *const begin, const pack_t<T, N, SimdExt> &pack_) const {\n    nsimd::storea(begin, nsimd::get_pack<VIx>(pack_));\n    storea_recurs_helper<T, N, SimdExt, pack_t, VIx + 1,\n                         VIx + 1 == pack_t<T, N, SimdExt>::soa_num_packs>()(\n        begin + nsimd::len(nsimd::pack<T, N, SimdExt>()), pack_);\n  }\n};\n\n// Base case\ntemplate <typename T, int N, typename SimdExt,\n          template <typename, int, typename> class pack_t, int VIx>\nstruct storea_recurs_helper<T, N, SimdExt, pack_t, VIx, true> {\n  void operator()(T *const begin, const pack_t<T, N, SimdExt> &pack_) const {\n    (void)begin;\n    (void)pack_;\n  }\n};\n\n// storea function for packx[Y]<1 .. 
N> y in {2, 3, 4}\ntemplate <typename T, int N, typename SimdExt,\n          template <typename, int, typename> class pack_t>\nvoid storea__(T *const begin, const pack_t<T, N, SimdExt> &pack_) {\n  storea_recurs_helper<T, N, SimdExt, pack_t, 0,\n                       0 == pack_t<T, N, SimdExt>::soa_num_packs>()(begin,\n                                                                    pack_);\n}\n\n// storea for pack<1 .. N>\ntemplate <typename T, int N, typename SimdExt>\nvoid storea__(T *const begin, const nsimd::pack<T, N, SimdExt> &pack_) {\n  nsimd::storea(begin, pack_);\n}\n\n/* ---------------------- check_arrays ------------------------------- */\n\ntemplate <typename T>\nbool check_arrays(const T *const vout_expected, const T *const vout_computed,\n                  const nsimd_nat len_) {\n  for (nsimd_nat ii = 0; ii < len_; ++ii) {\n    if (expected_not_equal_computed(vout_expected[ii], vout_computed[ii])) {\n      fprintf(stdout, STATUS \"... FAIL\\n\");\n      fflush(stdout);\n      return 0;\n    }\n  }\n  return 1;\n}\n\n/* ---------------------- check_packs_content ------------------------ */\n\ntemplate <typename T, int N_From, int N_To, typename SimdExt,\n          template <typename, int, typename> class PackFrom,\n          template <typename, int, typename> class PackTo>\nbool check_pack_expected_vs_computed(\n    const PackFrom<T, N_From, SimdExt> &pack_from,\n    const PackTo<T, N_To, SimdExt> &pack_to, const char *from_type,\n    const char *to_type, T *const vout_expected, T *const vout_computed) {\n\n  if (nsimd::len(pack_from) != nsimd::len(pack_to)) {\n    return 0;\n  }\n  const nsimd_nat len_ = (nsimd_nat)(nsimd::len(pack_to));\n  init_arrays(vout_expected, vout_computed, len_);\n\n  storea__(vout_expected, pack_from);\n  LOG_MEMORY_CONTENT_DEBUG(vout_expected, nsimd::len(pack_from), from_type);\n\n  nsimd::storea(vout_computed, pack_to);\n  LOG_MEMORY_CONTENT_DEBUG(vout_computed, nsimd::len(pack_to), to_type);\n\n  if 
(!check_arrays(vout_expected, vout_computed, len_)) {\n    return 0;\n  }\n\n  return 1;\n}\n\n} // namespace nsimd_tests\n\n#endif\n"
  },
  {
    "path": "tests/to_pack.cpp",
    "content": "#define STATUS \"test of to_pack over all types\"\n\n#include \"tests_helpers.hpp\"\n\ntemplate <typename T> bool to_pack_from_pack_1_N_1() {\n\n  LOG_TEST_DEBUG(\"to_pack_from_pack_1_N_1\", T);\n\n  nsimd::pack<T, 1> pack_from(42);\n  nsimd::pack<T, 1> pack_to = nsimd::to_pack(pack_from);\n\n  nsimd::scoped_aligned_mem_for<T>\n    expected(NSIMD_MAX_REGISTER_SIZE_BYTES);\n  nsimd::scoped_aligned_mem_for<T>\n    computed(NSIMD_MAX_REGISTER_SIZE_BYTES);\n\n  return nsimd_tests::check_pack_expected_vs_computed(\n      pack_from, pack_to, \"nsimd::pack<T, 1>\", \"nsimd::pack<T, 1>\",\n      expected.get(), computed.get());\n}\n\ntemplate <typename T> bool to_pack_from_packx2_N_1() {\n\n  LOG_TEST_DEBUG(\"to_pack_from_packx2_N_1\", T);\n\n  nsimd::pack<T, 1> v0(42);\n  nsimd::pack<T, 1> v1(24);\n\n  nsimd::packx2<T, 1> pack_from;\n  pack_from.v0 = v0;\n  pack_from.v1 = v1;\n\n  nsimd::pack<T, 2> pack_to = nsimd::to_pack(pack_from);\n\n  nsimd::scoped_aligned_mem_for<T>\n    expected(2 * NSIMD_MAX_REGISTER_SIZE_BYTES);\n  nsimd::scoped_aligned_mem_for<T>\n    computed(2 * NSIMD_MAX_REGISTER_SIZE_BYTES);\n\n  return nsimd_tests::check_pack_expected_vs_computed(\n      pack_from, pack_to, \"nsimd::packx2<T, 1>\", \"nsimd::pack<T, 2>\",\n      expected.get(), computed.get());\n}\n\ntemplate <typename T> bool to_pack_from_packx3_N_1() {\n\n  LOG_TEST_DEBUG(\"to_pack_from_packx3_N_1\", T);\n\n  nsimd::pack<T, 1> v0(42);\n  nsimd::pack<T, 1> v1(24);\n  nsimd::pack<T, 1> v2(66);\n\n  nsimd::packx3<T, 1> pack_from;\n  pack_from.v0 = v0;\n  pack_from.v1 = v1;\n  pack_from.v2 = v2;\n\n  nsimd::pack<T, 3> pack_to = nsimd::to_pack(pack_from);\n\n  nsimd::scoped_aligned_mem_for<T>\n    expected(3 * NSIMD_MAX_REGISTER_SIZE_BYTES);\n  nsimd::scoped_aligned_mem_for<T>\n    computed(3 * NSIMD_MAX_REGISTER_SIZE_BYTES);\n\n  return nsimd_tests::check_pack_expected_vs_computed(\n      pack_from, pack_to, \"nsimd::packx3<T, 1>\", \"nsimd::pack<T, 3>\",\n      expected.get(), 
computed.get());\n}\n\ntemplate <typename T> bool to_pack_from_packx2_N_2() {\n\n  LOG_TEST_DEBUG(\"to_pack_from_packx2_N_2\", T);\n\n  nsimd::pack<T, 2> v0(42);\n  nsimd::pack<T, 2> v1(24);\n\n  nsimd::packx2<T, 2> pack_from;\n  pack_from.v0 = v0;\n  pack_from.v1 = v1;\n\n  nsimd::pack<T, 4> pack_to = nsimd::to_pack(pack_from);\n\n  nsimd::scoped_aligned_mem_for<T>\n    expected(4 * NSIMD_MAX_REGISTER_SIZE_BYTES);\n  nsimd::scoped_aligned_mem_for<T>\n    computed(4 * NSIMD_MAX_REGISTER_SIZE_BYTES);\n\n  return nsimd_tests::check_pack_expected_vs_computed(\n      pack_from, pack_to, \"nsimd::packx2<T, 2>\", \"nsimd::pack<T, 4>\",\n      expected.get(), computed.get());\n}\n\ntemplate <typename T> bool to_pack_from_packx2_N_3() {\n\n  LOG_TEST_DEBUG(\"to_pack_from_packx2_N_3\", T);\n\n  nsimd::pack<T, 3> v0(42);\n  nsimd::pack<T, 3> v1(24);\n\n  nsimd::packx2<T, 3> pack_from;\n\n  pack_from.v0 = v0;\n  pack_from.v1 = v1;\n\n  nsimd::pack<T, 6> pack_to = nsimd::to_pack(pack_from);\n\n  nsimd::scoped_aligned_mem_for<T>\n    expected(6 * NSIMD_MAX_REGISTER_SIZE_BYTES);\n  nsimd::scoped_aligned_mem_for<T>\n    computed(6 * NSIMD_MAX_REGISTER_SIZE_BYTES);\n\n  return nsimd_tests::check_pack_expected_vs_computed(\n      pack_from, pack_to, \"nsimd::packx2<T, 3>\", \"nsimd::pack<T, 6>\",\n      expected.get(), computed.get());\n}\n\ntemplate <typename T> bool to_pack_from_packx3_N_2() {\n\n  LOG_TEST_DEBUG(\"to_pack_from_packx3_N_2\", T);\n\n  nsimd::pack<T, 2> v0(42);\n  nsimd::pack<T, 2> v1(24);\n  nsimd::pack<T, 2> v2(66);\n\n  nsimd::packx3<T, 2> pack_from;\n\n  pack_from.v0 = v0;\n  pack_from.v1 = v1;\n  pack_from.v2 = v2;\n\n  nsimd::pack<T, 6> pack_to = nsimd::to_pack(pack_from);\n\n  nsimd::scoped_aligned_mem_for<T>\n    expected(6 * NSIMD_MAX_REGISTER_SIZE_BYTES);\n  nsimd::scoped_aligned_mem_for<T>\n    computed(6 * NSIMD_MAX_REGISTER_SIZE_BYTES);\n\n  return nsimd_tests::check_pack_expected_vs_computed(\n      pack_from, pack_to, \"nsimd::packx3<T, 2>\", 
\"nsimd::pack<T, 6>\",\n      expected.get(), computed.get());\n}\n\ntemplate <typename T> bool to_pack_from_packx3_N_3() {\n\n  LOG_TEST_DEBUG(\"to_pack_from_packx3_N_3\", T);\n\n  nsimd::pack<T, 3> v0(42);\n  nsimd::pack<T, 3> v1(24);\n  nsimd::pack<T, 3> v2(66);\n\n  nsimd::packx3<T, 3> pack_from;\n\n  pack_from.v0 = v0;\n  pack_from.v1 = v1;\n  pack_from.v2 = v2;\n\n  nsimd::pack<T, 9> pack_to = nsimd::to_pack(pack_from);\n\n  nsimd::scoped_aligned_mem_for<T>\n    expected(9 * NSIMD_MAX_REGISTER_SIZE_BYTES);\n  nsimd::scoped_aligned_mem_for<T>\n    computed(9 * NSIMD_MAX_REGISTER_SIZE_BYTES);\n\n  return nsimd_tests::check_pack_expected_vs_computed(\n      pack_from, pack_to, \"nsimd::packx3<T, 3>\", \"nsimd::pack<T, 9>\",\n      expected.get(), computed.get());\n}\n\ntemplate <typename T> bool to_pack_from_packx4_N_1() {\n\n  LOG_TEST_DEBUG(\"to_pack_from_packx4_N_1\", T);\n\n  nsimd::pack<T, 1> v0(42);\n  nsimd::pack<T, 1> v1(24);\n  nsimd::pack<T, 1> v2(66);\n  nsimd::pack<T, 1> v3(132);\n\n  nsimd::packx4<T, 1> pack_from;\n\n  pack_from.v0 = v0;\n  pack_from.v1 = v1;\n  pack_from.v2 = v2;\n  pack_from.v3 = v3;\n\n  nsimd::pack<T, 4> pack_to = nsimd::to_pack(pack_from);\n\n  nsimd::scoped_aligned_mem_for<T>\n    expected(4 * NSIMD_MAX_REGISTER_SIZE_BYTES);\n  nsimd::scoped_aligned_mem_for<T>\n    computed(4 * NSIMD_MAX_REGISTER_SIZE_BYTES);\n\n  return nsimd_tests::check_pack_expected_vs_computed(\n      pack_from, pack_to, \"nsimd::packx4<T, 1>\", \"nsimd::pack<T, 4>\",\n      expected.get(), computed.get());\n}\n\ntemplate <typename T> bool to_pack_from_packx4_N_2() {\n\n  LOG_TEST_DEBUG(\"to_pack_from_packx4_N_2\", T);\n\n  nsimd::pack<T, 2> v0(42);\n  nsimd::pack<T, 2> v1(24);\n  nsimd::pack<T, 2> v2(66);\n  nsimd::pack<T, 2> v3(132);\n\n  nsimd::packx4<T, 2> pack_from;\n\n  pack_from.v0 = v0;\n  pack_from.v1 = v1;\n  pack_from.v2 = v2;\n  pack_from.v3 = v3;\n\n  nsimd::pack<T, 8> pack_to = nsimd::to_pack(pack_from);\n\n  
nsimd::scoped_aligned_mem_for<T>\n    expected(8 * NSIMD_MAX_REGISTER_SIZE_BYTES);\n  nsimd::scoped_aligned_mem_for<T>\n    computed(8 * NSIMD_MAX_REGISTER_SIZE_BYTES);\n\n  return nsimd_tests::check_pack_expected_vs_computed(\n      pack_from, pack_to, \"nsimd::packx4<T, 2>\", \"nsimd::pack<T, 8>\",\n      expected.get(), computed.get());\n}\n\ntemplate <typename T> bool to_pack_from_packx4_N_3() {\n\n  LOG_TEST_DEBUG(\"to_pack_from_packx4_N_3\", T);\n\n  nsimd::pack<T, 3> v0(42);\n  nsimd::pack<T, 3> v1(24);\n  nsimd::pack<T, 3> v2(66);\n  nsimd::pack<T, 3> v3(132);\n\n  nsimd::packx4<T, 3> pack_from;\n\n  pack_from.v0 = v0;\n  pack_from.v1 = v1;\n  pack_from.v2 = v2;\n  pack_from.v3 = v3;\n\n  nsimd::pack<T, 12> pack_to = nsimd::to_pack(pack_from);\n\n  nsimd::scoped_aligned_mem_for<T>\n    expected(12 * NSIMD_MAX_REGISTER_SIZE_BYTES);\n  nsimd::scoped_aligned_mem_for<T>\n    computed(12 * NSIMD_MAX_REGISTER_SIZE_BYTES);\n\n  return nsimd_tests::check_pack_expected_vs_computed(\n      pack_from, pack_to, \"nsimd::packx4<T, 3>\", \"nsimd::pack<T, 12>\",\n      expected.get(), computed.get());\n}\n\ntemplate <typename T> bool test_all() {\n\n  if (!to_pack_from_pack_1_N_1<T>()) {\n    return 0;\n  }\n  if (!to_pack_from_packx2_N_1<T>()) {\n    return 0;\n  }\n  if (!to_pack_from_packx2_N_2<T>()) {\n    return 0;\n  }\n  if (!to_pack_from_packx2_N_3<T>()) {\n    return 0;\n  }\n  if (!to_pack_from_packx3_N_1<T>()) {\n    return 0;\n  }\n  if (!to_pack_from_packx3_N_2<T>()) {\n    return 0;\n  }\n  if (!to_pack_from_packx3_N_3<T>()) {\n    return 0;\n  }\n  if (!to_pack_from_packx4_N_1<T>()) {\n    return 0;\n  }\n  if (!to_pack_from_packx4_N_2<T>()) {\n    return 0;\n  }\n  if (!to_pack_from_packx4_N_3<T>()) {\n    return 0;\n  }\n  return 1;\n}\n\nint main(void) {\n\n  if (!test_all<i8>() || !test_all<u8>() || !test_all<i16>() ||\n      !test_all<u16>() || !test_all<i32>() || !test_all<u32>() ||\n      !test_all<i64>() || !test_all<u64>() || !test_all<f32>() ||\n  
    !test_all<f64>()) {\n    return -1;\n  }\n\n  fprintf(stdout, STATUS \"... OK\\n\");\n  fflush(stdout);\n  return 0;\n}\n"
  },
  {
    "path": "tests/to_pack_interleave.cpp",
    "content": "#define STATUS \"test of to_pack_interleave over all types\"\n\n#include \"tests_helpers.hpp\"\n\ntemplate <typename T> bool to_pack_interleave_from_pack_1_N_1() {\n\n  LOG_TEST_DEBUG(\"to_pack_interleave_from_pack_1_N_1\", T);\n\n  nsimd::pack<T, 1> pack_from(42);\n  nsimd::pack<T, 1> pack_to = nsimd::to_pack_interleave(pack_from);\n\n  nsimd::scoped_aligned_mem_for<T>\n    expected(NSIMD_MAX_REGISTER_SIZE_BYTES);\n  nsimd::scoped_aligned_mem_for<T>\n    computed(NSIMD_MAX_REGISTER_SIZE_BYTES);\n\n  return nsimd_tests::check_pack_expected_vs_computed(\n      pack_from, pack_to, \"nsimd::pack<T, 1>\", \"nsimd::pack<T, 1>\",\n      expected.get(), computed.get());\n}\n\ntemplate <typename T> bool to_pack_interleave_from_packx2_N_1() {\n\n  LOG_TEST_DEBUG(\"to_pack_interleave_from_packx2_N_1\", T);\n\n  nsimd::pack<T, 1> v0(42);\n  nsimd::pack<T, 1> v1(24);\n\n  nsimd::packx2<T, 1> pack_from;\n  pack_from.v0 = v0;\n  pack_from.v1 = v1;\n\n  nsimd::scoped_aligned_mem_for<T>\n    expected(2 * NSIMD_MAX_REGISTER_SIZE_BYTES);\n  nsimd::scoped_aligned_mem_for<T>\n    computed(2 * NSIMD_MAX_REGISTER_SIZE_BYTES);\n\n  const int len_ = nsimd::len(nsimd::packx2<T, 1>());\n  nsimd_tests::init_arrays(expected.get(), computed.get(), len_);\n\n  T *begin = expected.get();\n  nsimd::storea(begin, nsimd::pack<T, 1>(pack_from.v0.car));\n  begin += nsimd::len(nsimd::pack<T, 1>());\n  nsimd::storea(begin, nsimd::pack<T, 1>(pack_from.v1.car));\n\n  LOG_MEMORY_CONTENT_DEBUG(expected.get(), len_, \"nsimd::packx2<T, 1>\");\n\n  nsimd::pack<T, 2> pack_to = nsimd::to_pack_interleave(pack_from);\n  nsimd::storea(computed.get(), pack_to);\n\n  LOG_MEMORY_CONTENT_DEBUG(computed.get(), len_, \"nsimd::pack<T, 2>\");\n\n  return nsimd_tests::check_arrays(expected.get(), computed.get(), len_);\n}\n\ntemplate <typename T> bool to_pack_interleave_from_packx2_N_2() {\n\n  LOG_TEST_DEBUG(\"to_pack_interleave_from_packx2_N_2\", T);\n\n  nsimd::pack<T, 2> v0(42);\n  nsimd::pack<T, 2> 
v1(24);\n\n  nsimd::packx2<T, 2> pack_from;\n  pack_from.v0 = v0;\n  pack_from.v1 = v1;\n\n  nsimd::scoped_aligned_mem_for<T>\n    expected(4 * NSIMD_MAX_REGISTER_SIZE_BYTES);\n  nsimd::scoped_aligned_mem_for<T>\n    computed(4 * NSIMD_MAX_REGISTER_SIZE_BYTES);\n\n  const int len_ = nsimd::len(nsimd::packx2<T, 2>());\n  nsimd_tests::init_arrays(expected.get(), computed.get(), len_);\n\n  T *begin = expected.get();\n  nsimd::storea(begin, nsimd::pack<T, 1>(pack_from.v0.car));\n\n  begin += nsimd::len(nsimd::pack<T, 1>());\n  nsimd::storea(begin, nsimd::pack<T, 1>(pack_from.v1.car));\n\n  begin += nsimd::len(nsimd::pack<T, 1>());\n  nsimd::storea(begin, nsimd::pack<T, 1>(pack_from.v0.cdr.car));\n\n  begin += nsimd::len(nsimd::pack<T, 1>());\n  nsimd::storea(begin, nsimd::pack<T, 1>(pack_from.v1.cdr.car));\n\n  LOG_MEMORY_CONTENT_DEBUG(expected.get(), len_, \"nsimd::packx2<T, 2>\");\n\n  nsimd::pack<T, 4> pack_to = nsimd::to_pack_interleave(pack_from);\n  nsimd::storea(computed.get(), pack_to);\n\n  LOG_MEMORY_CONTENT_DEBUG(computed.get(), len_, \"nsimd::pack<T, 4>\");\n\n  return nsimd_tests::check_arrays(expected.get(), computed.get(), len_);\n}\n\ntemplate <typename T> bool to_pack_interleave_from_packx3_N_2() {\n\n  LOG_TEST_DEBUG(\"to_pack_interleave_from_packx3_N_2\", T);\n\n  nsimd::pack<T, 2> v0(42);\n  nsimd::pack<T, 2> v1(24);\n  nsimd::pack<T, 2> v2(66);\n\n  nsimd::packx3<T, 2> pack_from;\n  pack_from.v0 = v0;\n  pack_from.v1 = v1;\n  pack_from.v2 = v2;\n\n  nsimd::scoped_aligned_mem_for<T>\n    expected(6 * NSIMD_MAX_REGISTER_SIZE_BYTES);\n  nsimd::scoped_aligned_mem_for<T>\n    computed(6 * NSIMD_MAX_REGISTER_SIZE_BYTES);\n\n  const int len_ = nsimd::len(nsimd::packx3<T, 2>());\n  nsimd_tests::init_arrays(expected.get(), computed.get(), len_);\n\n  T *begin = expected.get();\n  nsimd::storea(begin, nsimd::pack<T, 1>(pack_from.v0.car));\n\n  begin += nsimd::len(nsimd::pack<T, 1>());\n  nsimd::storea(begin, nsimd::pack<T, 1>(pack_from.v1.car));\n\n  begin 
+= nsimd::len(nsimd::pack<T, 1>());\n  nsimd::storea(begin, nsimd::pack<T, 1>(pack_from.v2.car));\n\n  begin += nsimd::len(nsimd::pack<T, 1>());\n  nsimd::storea(begin, nsimd::pack<T, 1>(pack_from.v0.cdr.car));\n\n  begin += nsimd::len(nsimd::pack<T, 1>());\n  nsimd::storea(begin, nsimd::pack<T, 1>(pack_from.v1.cdr.car));\n\n  begin += nsimd::len(nsimd::pack<T, 1>());\n  nsimd::storea(begin, nsimd::pack<T, 1>(pack_from.v2.cdr.car));\n\n  LOG_MEMORY_CONTENT_DEBUG(expected.get(), len_, \"nsimd::packx3<T, 2>\");\n\n  nsimd::pack<T, 6> pack_to = nsimd::to_pack_interleave(pack_from);\n  nsimd::storea(computed.get(), pack_to);\n\n  LOG_MEMORY_CONTENT_DEBUG(computed.get(), len_, \"nsimd::pack<T, 6>\");\n\n  return nsimd_tests::check_arrays(expected.get(), computed.get(), len_);\n}\n\ntemplate <typename T> bool to_pack_interleave_from_packx3_N_3() {\n\n  LOG_TEST_DEBUG(\"to_pack_interleave_from_packx3_N_3\", T);\n\n  nsimd::pack<T, 3> v0(42);\n  nsimd::pack<T, 3> v1(24);\n  nsimd::pack<T, 3> v2(66);\n\n  nsimd::packx3<T, 3> pack_from;\n  pack_from.v0 = v0;\n  pack_from.v1 = v1;\n  pack_from.v2 = v2;\n\n  nsimd::scoped_aligned_mem_for<T>\n    expected(9 * NSIMD_MAX_REGISTER_SIZE_BYTES);\n  nsimd::scoped_aligned_mem_for<T>\n    computed(9 * NSIMD_MAX_REGISTER_SIZE_BYTES);\n\n  const int len_ = nsimd::len(nsimd::packx3<T, 3>());\n  nsimd_tests::init_arrays(expected.get(), computed.get(), len_);\n\n  T *begin = expected.get();\n  nsimd::storea(begin, nsimd::pack<T, 1>(pack_from.v0.car));\n\n  begin += nsimd::len(nsimd::pack<T, 1>());\n  nsimd::storea(begin, nsimd::pack<T, 1>(pack_from.v1.car));\n\n  begin += nsimd::len(nsimd::pack<T, 1>());\n  nsimd::storea(begin, nsimd::pack<T, 1>(pack_from.v2.car));\n\n  begin += nsimd::len(nsimd::pack<T, 1>());\n  nsimd::storea(begin, nsimd::pack<T, 1>(pack_from.v0.cdr.car));\n\n  begin += nsimd::len(nsimd::pack<T, 1>());\n  nsimd::storea(begin, nsimd::pack<T, 1>(pack_from.v1.cdr.car));\n\n  begin += nsimd::len(nsimd::pack<T, 1>());\n  
nsimd::storea(begin, nsimd::pack<T, 1>(pack_from.v2.cdr.car));\n\n  begin += nsimd::len(nsimd::pack<T, 1>());\n  nsimd::storea(begin, nsimd::pack<T, 1>(pack_from.v0.cdr.cdr.car));\n\n  begin += nsimd::len(nsimd::pack<T, 1>());\n  nsimd::storea(begin, nsimd::pack<T, 1>(pack_from.v1.cdr.cdr.car));\n\n  begin += nsimd::len(nsimd::pack<T, 1>());\n  nsimd::storea(begin, nsimd::pack<T, 1>(pack_from.v2.cdr.cdr.car));\n\n  LOG_MEMORY_CONTENT_DEBUG(expected.get(), len_, \"nsimd::packx3<T, 3>\");\n\n  nsimd::pack<T, 9> pack_to = nsimd::to_pack_interleave(pack_from);\n  nsimd::storea(computed.get(), pack_to);\n\n  LOG_MEMORY_CONTENT_DEBUG(computed.get(), len_, \"nsimd::pack<T, 9>\");\n\n  return nsimd_tests::check_arrays(expected.get(), computed.get(), len_);\n}\n\ntemplate <typename T> bool to_pack_interleave_from_packx4_N_1() {\n\n  LOG_TEST_DEBUG(\"to_pack_interleave_from_packx4_N_1\", T);\n\n  nsimd::pack<T, 1> v0(42);\n  nsimd::pack<T, 1> v1(24);\n  nsimd::pack<T, 1> v2(66);\n  nsimd::pack<T, 1> v3(132);\n\n  nsimd::packx4<T, 1> pack_from;\n  pack_from.v0 = v0;\n  pack_from.v1 = v1;\n  pack_from.v2 = v2;\n  pack_from.v3 = v3;\n\n  nsimd::scoped_aligned_mem_for<T>\n    expected(4 * NSIMD_MAX_REGISTER_SIZE_BYTES);\n  nsimd::scoped_aligned_mem_for<T>\n    computed(4 * NSIMD_MAX_REGISTER_SIZE_BYTES);\n\n  const int len_ = nsimd::len(nsimd::packx4<T, 1>());\n  nsimd_tests::init_arrays(expected.get(), computed.get(), len_);\n\n  T *begin = expected.get();\n  nsimd::storea(begin, nsimd::pack<T, 1>(pack_from.v0.car));\n\n  begin += nsimd::len(nsimd::pack<T, 1>());\n  nsimd::storea(begin, nsimd::pack<T, 1>(pack_from.v1.car));\n\n  begin += nsimd::len(nsimd::pack<T, 1>());\n  nsimd::storea(begin, nsimd::pack<T, 1>(pack_from.v2.car));\n\n  begin += nsimd::len(nsimd::pack<T, 1>());\n  nsimd::storea(begin, nsimd::pack<T, 1>(pack_from.v3.car));\n\n  LOG_MEMORY_CONTENT_DEBUG(expected.get(), len_, \"nsimd::packx4<T, 1>\");\n\n  nsimd::pack<T, 4> pack_to = 
nsimd::to_pack_interleave(pack_from);\n  nsimd::storea(computed.get(), pack_to);\n\n  LOG_MEMORY_CONTENT_DEBUG(computed.get(), len_, \"nsimd::pack<T, 4>\");\n\n  return nsimd_tests::check_arrays(expected.get(), computed.get(), len_);\n}\n\ntemplate <typename T> bool to_pack_interleave_from_packx4_N_2() {\n\n  LOG_TEST_DEBUG(\"to_pack_interleave_from_packx4_N_2\", T);\n\n  nsimd::pack<T, 2> v0(42);\n  nsimd::pack<T, 2> v1(24);\n  nsimd::pack<T, 2> v2(66);\n  nsimd::pack<T, 2> v3(132);\n\n  nsimd::packx4<T, 2> pack_from;\n  pack_from.v0 = v0;\n  pack_from.v1 = v1;\n  pack_from.v2 = v2;\n  pack_from.v3 = v3;\n\n  nsimd::scoped_aligned_mem_for<T>\n    expected(8 * NSIMD_MAX_REGISTER_SIZE_BYTES);\n  nsimd::scoped_aligned_mem_for<T>\n    computed(8 * NSIMD_MAX_REGISTER_SIZE_BYTES);\n\n  const int len_ = nsimd::len(nsimd::packx4<T, 2>());\n  nsimd_tests::init_arrays(expected.get(), computed.get(), len_);\n\n  T *begin = expected.get();\n  nsimd::storea(begin, nsimd::pack<T, 1>(pack_from.v0.car));\n\n  begin += nsimd::len(nsimd::pack<T, 1>());\n  nsimd::storea(begin, nsimd::pack<T, 1>(pack_from.v1.car));\n\n  begin += nsimd::len(nsimd::pack<T, 1>());\n  nsimd::storea(begin, nsimd::pack<T, 1>(pack_from.v2.car));\n\n  begin += nsimd::len(nsimd::pack<T, 1>());\n  nsimd::storea(begin, nsimd::pack<T, 1>(pack_from.v3.car));\n\n  begin += nsimd::len(nsimd::pack<T, 1>());\n  nsimd::storea(begin, nsimd::pack<T, 1>(pack_from.v0.cdr.car));\n\n  begin += nsimd::len(nsimd::pack<T, 1>());\n  nsimd::storea(begin, nsimd::pack<T, 1>(pack_from.v1.cdr.car));\n\n  begin += nsimd::len(nsimd::pack<T, 1>());\n  nsimd::storea(begin, nsimd::pack<T, 1>(pack_from.v2.cdr.car));\n\n  begin += nsimd::len(nsimd::pack<T, 1>());\n  nsimd::storea(begin, nsimd::pack<T, 1>(pack_from.v3.cdr.car));\n\n  LOG_MEMORY_CONTENT_DEBUG(expected.get(), len_, \"nsimd::packx4<T, 2>\");\n\n  nsimd::pack<T, 8> pack_to = nsimd::to_pack_interleave(pack_from);\n  nsimd::storea(computed.get(), pack_to);\n\n  
LOG_MEMORY_CONTENT_DEBUG(computed.get(), len_, \"nsimd::pack<T, 8>\");\n\n  return nsimd_tests::check_arrays(expected.get(), computed.get(), len_);\n}\n\ntemplate <typename T> bool to_pack_interleave_from_packx4_N_3() {\n\n  LOG_TEST_DEBUG(\"to_pack_interleave_from_packx4_N_3\", T);\n\n  nsimd::pack<T, 3> v0(42);\n  nsimd::pack<T, 3> v1(24);\n  nsimd::pack<T, 3> v2(66);\n  nsimd::pack<T, 3> v3(132);\n\n  nsimd::packx4<T, 3> pack_from;\n  pack_from.v0 = v0;\n  pack_from.v1 = v1;\n  pack_from.v2 = v2;\n  pack_from.v3 = v3;\n\n  nsimd::scoped_aligned_mem_for<T>\n    expected(12 * NSIMD_MAX_REGISTER_SIZE_BYTES);\n  nsimd::scoped_aligned_mem_for<T>\n    computed(12 * NSIMD_MAX_REGISTER_SIZE_BYTES);\n\n  const int len_ = nsimd::len(nsimd::packx4<T, 3>());\n  nsimd_tests::init_arrays(expected.get(), computed.get(), len_);\n\n  T *begin = expected.get();\n  nsimd::storea(begin, nsimd::pack<T, 1>(pack_from.v0.car));\n\n  begin += nsimd::len(nsimd::pack<T, 1>());\n  nsimd::storea(begin, nsimd::pack<T, 1>(pack_from.v1.car));\n\n  begin += nsimd::len(nsimd::pack<T, 1>());\n  nsimd::storea(begin, nsimd::pack<T, 1>(pack_from.v2.car));\n\n  begin += nsimd::len(nsimd::pack<T, 1>());\n  nsimd::storea(begin, nsimd::pack<T, 1>(pack_from.v3.car));\n\n  begin += nsimd::len(nsimd::pack<T, 1>());\n  nsimd::storea(begin, nsimd::pack<T, 1>(pack_from.v0.cdr.car));\n\n  begin += nsimd::len(nsimd::pack<T, 1>());\n  nsimd::storea(begin, nsimd::pack<T, 1>(pack_from.v1.cdr.car));\n\n  begin += nsimd::len(nsimd::pack<T, 1>());\n  nsimd::storea(begin, nsimd::pack<T, 1>(pack_from.v2.cdr.car));\n\n  begin += nsimd::len(nsimd::pack<T, 1>());\n  nsimd::storea(begin, nsimd::pack<T, 1>(pack_from.v3.cdr.car));\n\n  begin += nsimd::len(nsimd::pack<T, 1>());\n  nsimd::storea(begin, nsimd::pack<T, 1>(pack_from.v0.cdr.cdr.car));\n\n  begin += nsimd::len(nsimd::pack<T, 1>());\n  nsimd::storea(begin, nsimd::pack<T, 1>(pack_from.v1.cdr.cdr.car));\n\n  begin += nsimd::len(nsimd::pack<T, 1>());\n  
nsimd::storea(begin, nsimd::pack<T, 1>(pack_from.v2.cdr.cdr.car));\n\n  begin += nsimd::len(nsimd::pack<T, 1>());\n  nsimd::storea(begin, nsimd::pack<T, 1>(pack_from.v3.cdr.cdr.car));\n\n  LOG_MEMORY_CONTENT_DEBUG(expected.get(), len_, \"nsimd::packx4<T, 3>\");\n\n  nsimd::pack<T, 12> pack_to = nsimd::to_pack_interleave(pack_from);\n  nsimd::storea(computed.get(), pack_to);\n\n  LOG_MEMORY_CONTENT_DEBUG(computed.get(), len_, \"nsimd::pack<T, 12>\");\n\n  return nsimd_tests::check_arrays(expected.get(), computed.get(), len_);\n}\n\ntemplate <typename T> bool test_all() {\n  if (!to_pack_interleave_from_pack_1_N_1<T>()) {\n    return 0;\n  }\n  if (!to_pack_interleave_from_packx2_N_1<T>()) {\n    return 0;\n  }\n  if (!to_pack_interleave_from_packx2_N_2<T>()) {\n    return 0;\n  }\n  if (!to_pack_interleave_from_packx3_N_2<T>()) {\n    return 0;\n  }\n  if (!to_pack_interleave_from_packx3_N_3<T>()) {\n    return 0;\n  }\n  if (!to_pack_interleave_from_packx4_N_1<T>()) {\n    return 0;\n  }\n  if (!to_pack_interleave_from_packx4_N_2<T>()) {\n    return 0;\n  }\n  if (!to_pack_interleave_from_packx4_N_3<T>()) {\n    return 0;\n  }\n  return 1;\n}\n\nint main(void) {\n\n  if (!test_all<i8>() || !test_all<u8>() || !test_all<i16>() ||\n      !test_all<u16>() || !test_all<i32>() || !test_all<u32>() ||\n      !test_all<i64>() || !test_all<u64>() || !test_all<f32>() ||\n      !test_all<f64>()) {\n    return -1;\n  }\n\n  fprintf(stdout, STATUS \"... OK\\n\");\n  fflush(stdout);\n  return 0;\n}\n"
  },
  {
    "path": "tests/ufp.cpp",
    "content": "/*\n\nCopyright (c) 2019 Agenium Scale\n\nPermission is hereby granted, free of charge, to any person obtaining a copy\nof this software and associated documentation files (the \"Software\"), to deal\nin the Software without restriction, including without limitation the rights\nto use, copy, modify, merge, publish, distribute, sublicense, and/or sell\ncopies of the Software, and to permit persons to whom the Software is\nfurnished to do so, subject to the following conditions:\n\nThe above copyright notice and this permission notice shall be included in all\ncopies or substantial portions of the Software.\n\nTHE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\nIMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\nFITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\nAUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\nLIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\nOUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\nSOFTWARE.\n\n*/\n\n#include <nsimd/nsimd.h>\n#include <cstdlib>\n\n// ----------------------------------------------------------------------------\n\ntemplate <typename U> U randbits() {\n  U ret = 0;\n  U mask = ((U)1 << CHAR_BIT) - 1;\n  for (int i = 0; i < (int)sizeof(U); i++) {\n    ret = (U)(ret | (U)((((U)rand()) & mask) << (CHAR_BIT * i)));\n  }\n  return ret;\n}\n\n// ----------------------------------------------------------------------------\n\ntemplate <typename U> int log_std_ulp(U a, U b) {\n  U d = (U)(a < b ? 
b - a : a - b);\n  int i = 0;\n  for (; i < 63 && d >= (U)1 << i; i++)\n    ;\n  return i;\n}\n\n// ----------------------------------------------------------------------------\n\ntemplate <typename T> struct mantissa{};\ntemplate <> struct mantissa<f64> { static const int size = 53; };\ntemplate <> struct mantissa<f32> { static const int size = 24; };\ntemplate <> struct mantissa<f16> { static const int size = 11; };\n\n// ----------------------------------------------------------------------------\n\ntemplate <typename T, typename U>\nint test_ufp(int n) {\n  T a = nsimd::scalar_cvt(T(), (U)1);\n  U ua = nsimd::scalar_reinterpret(U(), a);\n  T ap1 = nsimd::scalar_reinterpret(T(), (U)(ua + 1));\n  if (nsimd::ufp(a, ap1) != mantissa<T>::size - 1) {\n    return -1;\n  }\n\n  T am1 = nsimd::scalar_reinterpret(T(), (U)(ua - 1));\n  if (nsimd::ufp(a, am1) != mantissa<T>::size - 1) {\n    return -1;\n  }\n\n  if (nsimd::ufp(a, a) != mantissa<T>::size) {\n    return -1;\n  }\n  if (nsimd::ufp(a, a) != mantissa<T>::size) {\n    return -1;\n  }\n  if (nsimd::ufp(a, a) != mantissa<T>::size) {\n    return -1;\n  }\n\n  T ax4 = nsimd::scalar_cvt(T(), (U)4);\n  if (nsimd::ufp(a, ax4) != 0) {\n    return -1;\n  }\n\n  U mask = (U)1 << (mantissa<T>::size - 1);\n  U exponent = (U)((~mask) & ua);\n  for (int i = 0; i < n; i++) {\n    U ub = exponent | (randbits<U>() & mask);\n    T b = nsimd::scalar_reinterpret(T(), ub);\n    U uc = exponent | (randbits<U>() & mask);\n    T c = nsimd::scalar_reinterpret(T(), uc);\n    if (nsimd::ufp(b, c) != mantissa<T>::size - log_std_ulp(ub, uc)) {\n      return -1;\n    }\n  }\n\n  return 0;\n}\n\n// ----------------------------------------------------------------------------\n\nint main(void) {\n  int n = 10000;\n  return test_ufp<f64, u64>(n) || test_ufp<f32, u32>(n) ||\n         test_ufp<f16, u16>(n);\n}\n"
  }
]